Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-26 06:00:22 +00:00)
commit ac9e0e356d (parent aa71a231f6)
config fixes

8 changed files with 57 additions and 42 deletions
@@ -885,32 +885,31 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('')
 
     if config['TIMEOUT'] < 5:
         stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
         stderr('    (Setting it to somewhere between 30 and 3000 seconds is recommended)')
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr()
 
     elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
         stderr()
         stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
         stderr('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
         stderr('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
         stderr()
         stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
         stderr()
 
     if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
         stderr()
         stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
         stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
         stderr('    (Setting it somewhere over 60 seconds is recommended)')
         stderr()
         stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
         stderr('        https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
         stderr()
 
 
 def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']
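Note: taken together, the reshuffled checks enforce three thresholds: TIMEOUT of at least 5 seconds overall, roughly 15 seconds when USE_CHROME is enabled, and MEDIA_TIMEOUT of roughly 20 seconds when USE_YOUTUBEDL is enabled. A condensed, runnable sketch of that logic (a simplified stand-in, not the real check_dependencies):

def timeout_warnings(config: dict) -> list:
    """Mirror of the threshold checks above, returning warning strings instead of printing."""
    warnings = []
    if config['TIMEOUT'] < 5:
        warnings.append('TIMEOUT too low: < 5s breaks indexing and archive methods')
    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
        warnings.append('TIMEOUT too low for Chrome: < ~15s fails on many sites')
    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
        warnings.append('MEDIA_TIMEOUT too low for youtube-dl: < ~20s fails on most media')
    return warnings

# 10s passes the bare minimum but still trips the Chrome check:
assert timeout_warnings({'TIMEOUT': 10, 'USE_CHROME': True, 'USE_YOUTUBEDL': True, 'MEDIA_TIMEOUT': 60}) == \
    ['TIMEOUT too low for Chrome: < ~15s fails on many sites']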
@@ -11,7 +11,7 @@ from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms
 
-from core.models import Snapshot
+from core.models import Snapshot, Tag
 from core.forms import AddLinkForm, TagField
 from core.utils import get_icons
@@ -109,8 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
     def title_str(self, obj):
         canon = obj.as_link().canonical_outputs()
         tags = ''.join(
-            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
+            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
             for tag in obj.tags.all()
+            if str(tag).strip()
         )
         return format_html(
             '<a href="/{}">'
@@ -124,7 +125,7 @@ class SnapshotAdmin(admin.ModelAdmin):
             obj.archive_path,
             'fetched' if obj.latest_title or obj.title else 'pending',
             urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
-        ) + mark_safe(f'<span class="tags">{tags}</span>')
+        ) + mark_safe(f' <span class="tags">{tags}</span>')
 
     def files(self, obj):
         return get_icons(obj)
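Note: the mark_safe call is needed because format_html escapes every interpolated argument, so the pre-rendered tags markup would otherwise be double-escaped; the f-string change just adds a separating space before the span. An illustrative snippet (assumes Django is installed; these utils work without a configured settings module):

from django.utils.html import format_html
from django.utils.safestring import mark_safe

escaped = format_html('<b>{}</b>', '<i>x</i>')          # arguments get escaped: <b>&lt;i&gt;x&lt;/i&gt;</b>
trusted = mark_safe(' <span class="tags">news</span>')  # trusted markup, rendered as-is
print(escaped + trusted)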
@@ -151,6 +152,12 @@ class SnapshotAdmin(admin.ModelAdmin):
     title_str.admin_order_field = 'title'
     url_str.admin_order_field = 'url'
 
+class TagAdmin(admin.ModelAdmin):
+    list_display = ('slug', 'name', 'id')
+    sort_fields = ('id', 'name', 'slug')
+    readonly_fields = ('id',)
+    search_fields = ('id', 'name', 'slug')
+    fields = (*readonly_fields, 'name', 'slug')
 
 
 class ArchiveBoxAdmin(admin.AdminSite):
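Note: one small idiom in the new TagAdmin: unpacking readonly_fields into fields keeps the read-only id column first without repeating the literal. Stand-alone check:

readonly_fields = ('id',)
fields = (*readonly_fields, 'name', 'slug')
assert fields == ('id', 'name', 'slug')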
@@ -206,4 +213,5 @@ class ArchiveBoxAdmin(admin.AdminSite):
 admin.site = ArchiveBoxAdmin()
 admin.site.register(get_user_model())
 admin.site.register(Snapshot, SnapshotAdmin)
+admin.site.register(Tag, TagAdmin)
 admin.site.disable_action('delete_selected')
@@ -82,7 +82,7 @@ class Snapshot(models.Model):
         args = args or self.keys
         return {
             key: getattr(self, key)
-            if key != 'tags' else self.get_tags_str()
+            if key != 'tags' else self.tags_str()
             for key in args
         }
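Note: the conditional in that comprehension binds to the value expression rather than the key, which is easy to misread. The same shape in plain Python (hypothetical data, not ArchiveBox code):

row = {'url': 'https://example.com', 'tags': None}
tags_str = lambda: 'archive,news'   # stand-in for self.tags_str()
as_dict = {
    key: row[key]
    if key != 'tags' else tags_str()
    for key in ('url', 'tags')
}
assert as_dict == {'url': 'https://example.com', 'tags': 'archive,news'}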
@@ -93,12 +93,8 @@ class Snapshot(models.Model):
         from ..index import load_link_details
         return load_link_details(self.as_link())
 
-    def get_tags_str(self) -> str:
-        tags = ','.join(
-            tag.name
-            for tag in self.tags.all()
-        ) if self.tags.all() else ''
-        return tags
+    def tags_str(self) -> str:
+        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
 
     @cached_property
     def bookmarked(self):
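Note: besides being shorter, the rewrite improves behavior: the old get_tags_str() evaluated self.tags.all() twice, while values_list('name', flat=True) fetches the names in one query and order_by('name') makes the output deterministic. A plain-Python analogue, with a list standing in for the queryset:

tag_names = ['zines', 'archive', 'news']   # hypothetical tag names
tags_str = ','.join(sorted(tag_names))     # order_by('name') behaves like sorted() here
assert tags_str == 'archive,news,zines'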
@@ -25,6 +25,7 @@ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
 ### Django Core Settings
 ################################################################################
 
+DEBUG = True
 WSGI_APPLICATION = 'core.wsgi.application'
 ROOT_URLCONF = 'core.urls'
@@ -13,26 +13,26 @@ def get_icons(snapshot: Snapshot) -> str:
     # slow version: highlights icons based on whether files exist or not for that output
     # link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
     # fast version: all icons are highlighted without checking for outputs in filesystem
-    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method])
+    link_tuple = lambda link, method: (link.archive_path, canon[method] or '', canon[method] and (out_dir / (canon[method] or 'notdone')).exists())
 
     return format_html(
         '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
         '<a href="/{}/{}" class="exists-{}" title="Wget clone">🌐 </a> '
+        '<a href="/{}/{}" class="exists-{}" title="SingleFile">🗜 </a>'
         '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
         '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
         '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
         '<a href="/{}/{}" class="exists-{}" title="WARC">🆆 </a> '
-        '<a href="/{}/{}" class="exists-{}" title="SingleFile">🗜 </a>'
         '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
         '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
         '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
         '</span>',
         *link_tuple(link, 'wget_path'),
+        *link_tuple(link, 'singlefile_path'),
         *link_tuple(link, 'pdf_path'),
         *link_tuple(link, 'screenshot_path'),
         *link_tuple(link, 'dom_path'),
         *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
-        *link_tuple(link, 'singlefile_path'),
         *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
         *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
         canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
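Note: switching back to the "slow" link_tuple means each icon's exists-True/exists-False CSS class now reflects whether the output actually exists on disk, at the cost of one filesystem check per method. The check in isolation (hypothetical paths, not ArchiveBox internals):

from pathlib import Path

out_dir = Path('/tmp/archivebox-demo/archive/1606370422')   # hypothetical snapshot dir
canon = {'pdf_path': 'output.pdf', 'screenshot_path': ''}   # canonical outputs; '' = none

link_tuple = lambda method: (canon[method] or '', bool(canon[method] and (out_dir / (canon[method] or 'notdone')).exists()))
print(link_tuple('pdf_path'))         # ('output.pdf', False) until the extractor writes the file
print(link_tuple('screenshot_path'))  # ('', False): an empty path is never highlighted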
@@ -59,7 +59,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     result = run(cmd, cwd=str(out_dir), timeout=timeout)
     content_location, errors = parse_archive_dot_org_response(result.stdout)
     if content_location:
-        archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
+        archive_org_url = content_location[0]
     elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
         archive_org_url = None
         # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
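Note: dropping the 'https://web.archive.org' prefix implies the parsed Content-Location value now already arrives as an absolute URL (an inference from the diff, not stated in it); concatenating the host onto an absolute URL would produce a doubled prefix. Sketch under that assumption, with a hypothetical header value:

content_location = ['https://web.archive.org/web/20201126/https://example.com/']
# old behavior: 'https://web.archive.org{}'.format(...) -> 'https://web.archive.orghttps://...'
archive_org_url = content_location[0]
assert archive_org_url.startswith('https://web.archive.org/web/')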
@@ -50,11 +50,9 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
         link.url,
     ]
     try:
-        json_headers = get_headers(link.url)
-
+        json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
         atomic_write(str(output_folder / "headers.json"), json_headers)
-
     except (Exception, OSError) as err:
         status = 'failed'
         output = err
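Note: the fix threads the extractor's timeout through to the header fetch instead of relying on the helper's default. A minimal sketch of a get_headers that accepts it (an assumed requests-based implementation; the real helper lives elsewhere in ArchiveBox):

import json
import requests  # assumption: the helper wraps an HTTP client like requests

def get_headers(url: str, timeout: int = 60) -> str:
    """Fetch the response headers for url, bounded by timeout seconds, as pretty-printed JSON."""
    response = requests.head(url, timeout=timeout, allow_redirects=True)
    return json.dumps(dict(response.headers), indent=4)

print(get_headers('https://example.com', timeout=10))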
@@ -2,7 +2,8 @@ __package__ = 'archivebox.extractors'
 
 from pathlib import Path
 
-from typing import Optional
+from subprocess import CompletedProcess
+from typing import Optional, Tuple, List
 import json
 
 from ..index.schema import Link, ArchiveResult, ArchiveError
@@ -20,6 +21,21 @@ from ..config import (
 )
 from ..logging_util import TimedProgress
 
 
+@enforce_types
+def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
+    # parse out last line of stderr
+    return ArchiveError(
+        f'Got {cmd[0]} response code: {result.returncode}).',
+        *(
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
+            if line.strip()
+        ),
+    )
+
+
 @enforce_types
 def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
     out_dir = out_dir or link.link_dir
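Note: ShellError centralizes the tail-of-output parsing that save_mercury previously did inline (removed further down in this commit), so other extractors can reuse it. A usage sketch with a simulated failing command (hypothetical, outside ArchiveBox; assumes the ShellError above is in scope):

from subprocess import run, PIPE

cmd = ['python3', '-c', 'import sys; sys.stderr.write("boom\\n"); sys.exit(2)']
result = run(cmd, stdout=PIPE, stderr=PIPE)   # CompletedProcess with captured bytes output
if result.returncode > 0:
    err = ShellError(cmd, result)   # an ArchiveError whose hints are the last non-empty output lines
    print(err)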
@@ -31,7 +47,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None) -> bool:
 
 
 @enforce_types
-def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download reader friendly version using @postlight/mercury-parser"""
 
     out_dir = Path(out_dir or link.link_dir)
@@ -41,41 +57,38 @@ def save_mercury(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
         # Get plain text version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url,
             "--format=text"
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        txtresult_json = json.loads(result.stdout)
+        try:
+            article_text = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
 
         # Get HTML version of article
         cmd = [
             DEPENDENCIES['MERCURY_BINARY']['path'],
             link.url
         ]
         result = run(cmd, cwd=out_dir, timeout=timeout)
-        result_json = json.loads(result.stdout)
+        try:
+            article_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
 
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
-        atomic_write(str(output_folder / "content.txt"), txtresult_json["content"])
-        atomic_write(str(output_folder / "article.json"), result_json)
-
-        # parse out last line of stderr
-        output_tail = [
-            line.strip()
-            for line in (result.stdout + result.stderr).decode().rsplit('\n', 20)[-20:]
-            if line.strip()
-        ]
-        hints = (
-            'Got mercury response code: {}.'.format(result.returncode),
-            *output_tail,
-        )
+        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+        atomic_write(str(output_folder / "article.json"), article_json)
 
         # Check for common failure cases
         if (result.returncode > 0):
-            raise ArchiveError('Mercury parser was not able to archive the page', hints)
-    except (Exception, OSError) as err:
+            raise ShellError(cmd, result)
+    except (ArchiveError, Exception, OSError) as err:
         status = 'failed'
         output = err
     finally:
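Note: the heart of this change: mercury-parser sometimes emits non-JSON output (usage text, error messages), and a bare json.loads turned that into an opaque traceback; wrapping the parse and re-raising as ShellError surfaces the CLI's own output as the error hints. The guard pattern in isolation:

import json

raw = b'command not found: mercury-parser'   # hypothetical non-JSON CLI output
try:
    article = json.loads(raw)
except json.JSONDecodeError:
    article = None   # in the diff this raises ShellError(cmd, result) instead
print(article)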