From a15a331798febda954509b41d0abb4d78b9493ef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Fri, 11 Jan 2019 06:33:35 -0500 Subject: [PATCH] fix media download with longer timeout --- archivebox/archive_methods.py | 27 ++++++++++++++++----------- archivebox/config.py | 3 ++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/archivebox/archive_methods.py b/archivebox/archive_methods.py index 52431f59..5a283304 100644 --- a/archivebox/archive_methods.py +++ b/archivebox/archive_methods.py @@ -29,6 +29,7 @@ from config import ( CHROME_USER_DATA_DIR, CHROME_SANDBOX, TIMEOUT, + MEDIA_TIMEOUT, ANSI, ARCHIVE_DIR, GIT_DOMAINS, @@ -441,28 +442,29 @@ def fetch_favicon(link_dir, link, timeout=TIMEOUT): } @attach_result_to_link('media') -def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False): +def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT, overwrite=False): """Download playlists or individual video, audio, and subtitles using youtube-dl""" - output = os.path.join(link_dir, 'media') + # import ipdb; ipdb.set_trace() + output = os.path.join(link_dir, 'media') if os.path.exists(output) and not overwrite: return {'output': 'media', 'status': 'skipped'} - os.mkdir(output) - print(' - Downloading media') + os.makedirs(output, exist_ok=True) CMD = [ 'youtube-dl', '--write-description', '--write-info-json', '--write-annotations', '--yes-playlist', - '--write-thumbnail ', + '--write-thumbnail', '--no-call-home', '--no-check-certificate', - '--user-agent ', + '--user-agent', '--all-subs', '-x', + '-k', '--audio-format', 'mp3', '--audio-quality', '320K', '--embed-thumbnail', @@ -472,17 +474,20 @@ def fetch_media(link_dir, link, timeout=TIMEOUT, overwrite=False): end = progress(timeout, prefix=' ') try: - result = run(CMD, stdout=DEVNULL, stderr=DEVNULL, cwd=output, timeout=timeout + 1) # audio/audio.mp3 + result = run(CMD, stdout=PIPE, stderr=PIPE, cwd=output, timeout=timeout + 1) # audio/audio.mp3 end() if result.returncode: - print(' got youtubedl response code {}:'.format(result.returncode)) - raise Exception('Failed to download media') - chmod_file('media', cwd=link_dir) - return 'media' + if b'ERROR: Unsupported URL' in result.stderr: + # print(' none found') + pass + else: + print(' got youtubedl response code {}:'.format(result.returncode)) + raise Exception('Failed to download media') except Exception as e: end() print(' Run to see full output:', 'cd {}; {}'.format(link_dir, ' '.join(CMD))) print(' {}Failed: {} {}{}'.format(ANSI['red'], e.__class__.__name__, e, ANSI['reset'])) + output = e return { 'cmd': CMD, diff --git a/archivebox/config.py b/archivebox/config.py index e436313c..ec634039 100644 --- a/archivebox/config.py +++ b/archivebox/config.py @@ -22,7 +22,7 @@ FETCH_PDF = os.getenv('FETCH_PDF', 'True' FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true' FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true' FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true' -FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true' +FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'False' ).lower() == 'true' FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true' SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true' RESOLUTION = os.getenv('RESOLUTION', '1440,1200' ) @@ -33,6 +33,7 @@ WGET_BINARY = os.getenv('WGET_BINARY', 'wget' WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox') CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None) TIMEOUT = int(os.getenv('TIMEOUT', '60')) +MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600')) FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',) GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')