From ac9e0e356d23cb283cae0e4b2add2c6041012f34 Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sat, 31 Oct 2020 07:55:27 -0400
Subject: [PATCH] config fixes

---
 archivebox/config.py                 |  7 ++--
 archivebox/core/admin.py             | 14 +++++--
 archivebox/core/models.py            | 10 ++---
 archivebox/core/settings.py          |  1 +
 archivebox/core/utils.py             |  6 +--
 archivebox/extractors/archive_org.py |  2 +-
 archivebox/extractors/headers.py     |  4 +-
 archivebox/extractors/mercury.py     | 55 +++++++++++++++++-----------
 8 files changed, 57 insertions(+), 42 deletions(-)

diff --git a/archivebox/config.py b/archivebox/config.py
index d79d0fa8..1ede0b07 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('')
 
     if config['TIMEOUT'] < 5:
-        stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
-        stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
+        stderr()
 
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
-        stderr()
         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
         stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr()
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
-
+        stderr()
 
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']

diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 55c68e16..a061cd9d 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms
 
-from core.models import Snapshot
+from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 from core.utils import get_icons
 
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html(' {} ', tag.id, tag)
+            format_html('{} ', tag.id, tag)
             for tag in obj.tags.all()
+            if str(tag).strip()
         )
         return format_html(
             ''
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
-        ) + mark_safe(f'{tags}')
+        ) + mark_safe(f' {tags}')
 
     def files(self, obj):
         return get_icons(obj)
 
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
     title_str.admin_order_field = 'title'
     url_str.admin_order_field = 'url'
 
+class TagAdmin(admin.ModelAdmin):
+    list_display = ('slug', 'name', 'id')
+    sort_fields = ('id', 'name', 'slug')
+    readonly_fields = ('id',)
+    search_fields = ('id', 'name', 'slug')
+    fields = (*readonly_fields, 'name', 'slug')
 
 class ArchiveBoxAdmin(admin.AdminSite):
 
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
 admin.site = ArchiveBoxAdmin()
 admin.site.register(get_user_model())
 admin.site.register(Snapshot, SnapshotAdmin)
+admin.site.register(Tag, TagAdmin)
 admin.site.disable_action('delete_selected')

diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 7d0c799f..f43fc631 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -82,7 +82,7 @@ class Snapshot(models.Model):
         args = args or self.keys
         return {
             key: getattr(self, key)
-                if key != 'tags' else self.get_tags_str()
+                if key != 'tags' else self.tags_str()
             for key in args
         }
 
@@ -93,12 +93,8 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         return load_link_details(self.as_link())
 
-    def get_tags_str(self) -> str:
-        tags = ','.join(
-            tag.name
-            for tag in self.tags.all()
-        ) if self.tags.all() else ''
-        return tags
+    def tags_str(self) -> str:
+        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
     @cached_property
     def bookmarked(self):

diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 28a3e1fe..3417beb2 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 
 ### Django Core Settings
 ################################################################################
 
+DEBUG = True
 WSGI_APPLICATION = 'core.wsgi.application'
 ROOT_URLCONF = 'core.urls'

diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 58376843..1496ab74 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
     # slow version: highlights icons based on whether files exist or not for that output
     # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
     # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
+    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
 
     return format_html(
         ''
         '🌐 '
+        '🗜 '
         '📄 '
         '🖥 '
         '🅷 '
         '🆆 '
-        '🗜 '
         '📼 '
         '📦 '
         '🏛 '
         '',
         *link_tuple(link, 'wget_path'),
+        *link_tuple(link, 'singlefile_path'),
         *link_tuple(link, 'pdf_path'),
         *link_tuple(link, 'screenshot_path'),
         *link_tuple(link, 'dom_path'),
         *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-        *link_tuple(link, 'singlefile_path'),
         *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
         *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
         canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),

diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py
index db9e2517..f5598d6f 100644
--- a/archivebox/extractors/archive_org.py
+++ b/archivebox/extractors/archive_org.py
@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     result = run(cmd, cwd=str(out_dir), timeout=timeout)
     content_location, errors = parse_archive_dot_org_response(result.stdout)
     if content_location:
-        archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        archive_org_url = content_location[0]
     elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
         archive_org_url = None
         # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))

diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py
index 2ddae8d0..4e69dec1 100644
--- a/archivebox/extractors/headers.py
+++ b/archivebox/extractors/headers.py
@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
         link.url,
     ]
     try:
-        json_headers = get_headers(link.url)
-
+        json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
         atomic_write(str(output_folder / "headers.json"), json_headers)
-
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py
index 2d2711ca..5fec1961 100644
--- a/archivebox/extractors/mercury.py
+++ b/archivebox/extractors/mercury.py
@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
 
 from pathlib import Path
-from typing import Optional
+from subprocess import CompletedProcess
+from typing import Optional, Tuple, List
 import json
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
@@ -20,6 +21,21 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
+
+
+@enforce_types
+def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
+    # parse out last line of stderr
+    return ArchiveError(
+        f'Got {cmd[0]} response code: {result.returncode}.',
+        *(
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
+            if line.strip()
+        ),
+    )
+
+
 @enforce_types
 def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
 
 
 @enforce_types
-def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @postlight/mercury-parser"""
 
     out_dir = Path(out_dir or link.link_dir)
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
+        # Get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url,
             "--format=text"
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        txtresult_json = json.loads(result.stdout)
-
+        try:
+            article_text = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
+
+        # Get HTML version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            article_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
 
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
-        atomic_write(str(output_folder / "article.json"), result_json)
-
-        # parse out last line of stderr
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
-            if line.strip()
-        ]
-        hints = (
-            'Got mercury response code: {}.'.format(result.returncode),
-            *output_tail,
-        )
+        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+        atomic_write(str(output_folder / "article.json"), article_json)
 
         # Check for common failure cases
         if (result.returncode > 0):
-            raise ArchiveError('Mercury parser was not able to archive the page', hints)
-    except (Exception, OSError) as err:
+            raise ShellError(cmd, result)
+    except (ArchiveError, Exception, OSError) as err:
         status = 'failed'
         output = err
     finally:
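
Note on the ShellError() helper introduced in archivebox/extractors/mercury.py above: it is a
factory function, not an exception subclass. It returns an ArchiveError pre-filled with the
command name, exit code, and the last ~20 non-empty lines of combined stdout+stderr, replacing
the inline output_tail/hints bookkeeping that save_mercury() previously did by hand. A minimal
self-contained sketch of the same pattern follows; the ArchiveError stand-in (with an assumed
variadic *hints constructor, to match the unpacking in the patch) and the mercury-parser
invocation are illustrative assumptions, not part of this patch:

    from subprocess import run, PIPE, CompletedProcess
    from typing import List

    class ArchiveError(Exception):
        # Stand-in for archivebox.index.schema.ArchiveError; the variadic *hints
        # constructor is an assumption made here so the unpacking below works.
        def __init__(self, message: str, *hints: str):
            super().__init__(message)
            self.hints = hints

    def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
        # Same logic as the patch: keep the last `lines` non-empty lines of the
        # combined stdout+stderr as hints on the returned (not raised) error.
        return ArchiveError(
            f'Got {cmd[0]} response code: {result.returncode}.',
            *(
                line.strip()
                for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
                if line.strip()
            ),
        )

    # Usage mirrors the patched save_mercury(): run the command, then raise the
    # ArchiveError that ShellError() builds if the exit code signals failure.
    cmd = ['mercury-parser', 'https://example.com', '--format=text']  # hypothetical invocation
    result = run(cmd, stdout=PIPE, stderr=PIPE, timeout=60)
    if result.returncode > 0:
        raise ShellError(cmd, result)

Returning the error instead of raising it keeps the helper usable both for `raise ShellError(...)`
at the two mercury call sites and for attaching the error as an ArchiveResult output elsewhere.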