fix media download with longer timeout

This commit is contained in:
Nick Sweeting 2019-01-11 06:33:35 -05:00
parent c33f7ba91c
commit a15a331798
2 changed files with 18 additions and 12 deletions

View file

@ -29,6 +29,7 @@ from config import (
CHROME_USER_DATA_DIR, CHROME_USER_DATA_DIR,
CHROME_SANDBOX, CHROME_SANDBOX,
TIMEOUT, TIMEOUT,
MEDIA_TIMEOUT,
ANSI, ANSI,
ARCHIVE_DIR, ARCHIVE_DIR,
GIT_DOMAINS, GIT_DOMAINS,
@ -441,28 +442,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT):
} }
@attach_result_to_link('media') @attach_result_to_link('media')
def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False): def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False):
"""Download playlists or individual video, audio, and subtitles using youtube-dl""" """Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = os.path.join(link_dir, 'media')
# import ipdb; ipdb.set_trace()
output = os.path.join(link_dir, 'media')
if os.path.exists(output) and not overwrite: if os.path.exists(output) and not overwrite:
return {'output': 'media', 'status': 'skipped'} return {'output': 'media', 'status': 'skipped'}
os.mkdir(output) os.makedirs(output, exist_ok=True)
print(' - Downloading media')
CMD = [ CMD = [
'youtube-dl', 'youtube-dl',
'--write-description', '--write-description',
'--write-info-json', '--write-info-json',
'--write-annotations', '--write-annotations',
'--yes-playlist', '--yes-playlist',
'--write-thumbnail ', '--write-thumbnail',
'--no-call-home', '--no-call-home',
'--no-check-certificate', '--no-check-certificate',
'--user-agent ', '--user-agent',
'--all-subs', '--all-subs',
'-x', '-x',
'-k',
'--audio-format', 'mp3', '--audio-format', 'mp3',
'--audio-quality', '320K', '--audio-quality', '320K',
'--embed-thumbnail', '--embed-thumbnail',
@ -472,17 +474,20 @@ def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False):
end = progress(timeout, prefix=' ') end = progress(timeout, prefix=' ')
try: try:
result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=output, timeout=timeout + 1) # audio/audio.mp3 result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1) # audio/audio.mp3
end() end()
if result.returncode: if result.returncode:
print(' got youtubedl response code {}:'.format(result.returncode)) if b'ERROR: Unsupported URL' in result.stderr:
raise Exception('Failed to download media') # print(' none found')
chmod_file('media', cwd=link_dir) pass
return 'media' else:
print(' got youtubedl response code {}:'.format(result.returncode))
raise Exception('Failed to download media')
except Exception as e: except Exception as e:
end() end()
print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD))) print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD)))
print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset']))
output = e
return { return {
'cmd': CMD, 'cmd': CMD,

View file

@ -22,7 +22,7 @@ FETCH_PDF = os.getenv('FETCH_PDF', 'True'
FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true' FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true'
FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true' FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true'
FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true' FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true'
FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true' FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'False' ).lower() == 'true'
FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true' FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true' SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,1200' ) RESOLUTION = os.getenv('RESOLUTION', '1440,1200' )
@ -33,6 +33,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget'
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox') WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox')
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
TIMEOUT = int(os.getenv('TIMEOUT', '60')) TIMEOUT = int(os.getenv('TIMEOUT', '60'))
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',') GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')