Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2025-02-23 00:38:27 +00:00)

load EXTRACTORS dynamically using importlib.import_module
parent c7f55fc3ba
commit 457c42bf84
18 changed files with 198 additions and 40 deletions
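
The heart of the change: rather than building the extractor registry from a hardcoded list of tuples, the extractors package now discovers and imports its own submodules at import time. A minimal standalone sketch of that pattern (a simplification for illustration, not the exact ArchiveBox code):

from importlib import import_module
from pathlib import Path

def load_submodules(package: str, package_dir: Path) -> dict:
    """Import every non-dunder *.py file in package_dir as a submodule of package."""
    modules = {}
    for path in sorted(package_dir.glob('*.py')):
        if path.name.startswith('__'):
            continue  # skip __init__.py and friends
        name = path.stem  # 'wget.py' -> 'wget'
        modules[name] = import_module(f'.{name}', package=package)
    return modules

Dropping a new module into the package directory is then enough to register it; no central list needs editing.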

@@ -17,8 +17,6 @@ except AttributeError:

 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
-
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")

archivebox/core/models.py

@@ -6,6 +6,7 @@ import json

 from pathlib import Path
 from typing import Optional, List
+from importlib import import_module

 from django.db import models
 from django.utils.functional import cached_property

@@ -20,9 +21,9 @@ from ..system import get_dir_size
 from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),

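Before, the choices were derived from the (name, ...) tuples returned by get_default_archive_methods(); now they come from the keys of the dynamically built EXTRACTORS dict. A quick illustration with stand-in values (the real tuples also carry the should_save/save functions):

# old shape: list of tuples, name first
old_methods = [('wget', None, None), ('git', None, None)]
old_choices = [(m[0], m[0]) for m in old_methods]

# new shape: {name: module} dict, choices from the keys
EXTRACTORS = {'wget': object(), 'git': object()}
EXTRACTOR_CHOICES = [(name, name) for name in EXTRACTORS.keys()]

assert old_choices == EXTRACTOR_CHOICES == [('wget', 'wget'), ('git', 'git')]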

@@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager):


 class ArchiveResult(models.Model):
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
     id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
     uuid = models.UUIDField(default=uuid.uuid4, editable=False)

     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)

@@ -284,3 +287,34 @@
     def __str__(self):
         return self.extractor

+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()

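The new embed_path() prefers an extractor module's own get_embed_path() and falls back to its canonical output path. A self-contained sketch of that dispatch, with SimpleNamespace objects standing in for real extractor modules:

from types import SimpleNamespace

plain = SimpleNamespace(get_output_path=lambda: 'screenshot.png')  # no custom embed path
custom = SimpleNamespace(
    get_output_path=lambda: 'media/',
    get_embed_path=lambda result: 'media/clip.mp4',  # hypothetical resolved path
)

def embed_path(extractor_module, result=None):
    # same shape as ArchiveResult.embed_path() above
    if hasattr(extractor_module, 'get_embed_path'):
        return extractor_module.get_embed_path(result)
    return extractor_module.get_output_path()

assert embed_path(plain) == 'screenshot.png'
assert embed_path(custom) == 'media/clip.mp4'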

archivebox/extractors/__init__.py

@@ -1,11 +1,13 @@
 __package__ = 'archivebox.extractors'

+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
+
 import os
 import sys
 from pathlib import Path

-from typing import Callable, Optional, List, Iterable, Union
+from importlib import import_module
 from datetime import datetime, timezone

 from django.db.models import QuerySet

 from ..config import (

@@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

     log_archiving_finished(num_links)
     return all_links
+
+
+EXTRACTORS_DIR = Path(__file__).parent
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+
+    get_output_path: Callable
+
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)

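import_module() returns a plain ModuleType, so the cast() to ExtractorModuleProtocol is what gives type checkers a typed view of each loaded extractor, and the assert backs that promise up at runtime. A minimal sketch of the same idea (illustrative names, not part of the diff):

from importlib import import_module
from typing import Callable, Protocol, cast

class HasOutputPath(Protocol):
    get_output_path: Callable[[], str]

def load_typed(name: str, package: str) -> HasOutputPath:
    module = cast(HasOutputPath, import_module(f'.{name}', package=package))
    # cast() is only a promise to the type checker, so verify at runtime too
    assert hasattr(module, 'get_output_path'), f'{name} must define get_output_path()'
    return module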

archivebox/extractors/archive_org.py

@@ -24,6 +24,8 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'archive.org.txt'


 @enforce_types

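The remaining hunks repeat this same mechanical substitution in each extractor: the output filename gains a single source of truth, so callers outside the module (like ArchiveResult.output_path() above) can resolve it generically. A hedged sketch of that lookup, with a hand-built module standing in for the real registry:

import types

archive_org = types.ModuleType('archive_org')
archive_org.get_output_path = lambda: 'archive.org.txt'

EXTRACTORS = {'archive_org': archive_org}  # stand-in for the real registry

def output_path_for(extractor_name: str) -> str:
    return EXTRACTORS[extractor_name].get_output_path()

assert output_path_for('archive_org') == 'archive.org.txt'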

@@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False

@@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     """submit site to archive.org for archiving via their service, save returned archive url"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence

@@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
         archive_org_url = archive_org_url or submit_url
         with open(str(out_dir / output), 'w', encoding='utf-8') as f:
             f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=str(out_dir))
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
         output = archive_org_url

     return ArchiveResult(

archivebox/extractors/dom.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.html'
+
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
             return False

     return SAVE_DOM

@@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print HTML of site to file using chrome --dump-html"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
         *chrome_args(),

archivebox/extractors/favicon.py

@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
-    domain,
-    dedupe,
+    domain,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,

@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti

     return SAVE_FAVICON

+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
+
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

archivebox/extractors/git.py

@@ -26,6 +26,19 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
+
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

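git clones land in a subdirectory of git/ named after the repository, so get_embed_path() points embeds one level deeper when a clone exists and falls back to the bare folder otherwise. A runnable sketch of that glob-and-fall-back logic against a throwaway directory:

import tempfile
from pathlib import Path

def git_embed_path(snapshot_dir: Path) -> str:
    try:
        return 'git/' + list((snapshot_dir / 'git').glob('*'))[0].name + '/'
    except IndexError:
        return 'git/'  # nothing cloned yet: embed the folder itself

snapshot_dir = Path(tempfile.mkdtemp())
(snapshot_dir / 'git' / 'example-repo').mkdir(parents=True)
assert git_embed_path(snapshot_dir) == 'git/example-repo/'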

@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     is_clonable_url = (

@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [

archivebox/extractors/headers.py

@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'headers.json'
+
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HEADERS

@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')

@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

archivebox/extractors/htmltotext.py

@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html


+def get_output_path():
+    return "htmltotext.txt"
+
+
+
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",

@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HTMLTOTEXT

@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""

     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix='    ')

archivebox/extractors/media.py

@@ -22,13 +22,27 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
+
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MEDIA

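media/ follows the same shape as git/, but the embed target is the first *.mp4 in the folder rather than the first subdirectory, degrading to the folder listing when no video was downloaded. A runnable sketch:

import tempfile
from pathlib import Path

def media_embed_path(media_dir: Path) -> str:
    try:
        return 'media/' + list(media_dir.glob('*.mp4'))[0].name
    except IndexError:
        return 'media/'

media_dir = Path(tempfile.mkdtemp()) / 'media'
media_dir.mkdir()
(media_dir / 'talk.mp4').touch()
assert media_embed_path(media_dir) == 'media/talk.mp4'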

@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence

archivebox/extractors/mercury.py

@@ -24,6 +24,12 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:

@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MERCURY

@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')

archivebox/extractors/pdf.py

@@ -19,13 +19,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.pdf'
+
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_PDF

@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',

@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             hints = (result.stderr or result.stdout).decode()
             raise ArchiveError('Failed to save PDF', hints)

-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err

archivebox/extractors/readability.py

@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html

+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_READABILITY

@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     # Readability Docs: https://github.com/mozilla/readability

archivebox/extractors/screenshot.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'screenshot.png'
+
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SCREENSHOT

@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',

archivebox/extractors/singlefile.py

@@ -26,13 +26,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'singlefile.html'
+
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SINGLEFILE

@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""

     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()

     browser_args = chrome_args(CHROME_TIMEOUT=0)

archivebox/extractors/title.py

@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+
 @enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """

@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document


+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
+
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it

archivebox/extractors/wget.py

@@ -35,6 +35,18 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
+
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)

archivebox/index/html.py

@@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
     cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)

         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)

@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
         # Missing specific entry for WARC

         extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             for result in archive_results:
                 if result.extractor == extractor and result:
                     extractor_outputs[extractor] = result

-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             if extractor not in exclude:
                 existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                 # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)

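The icon-rendering loop is unchanged apart from the renamed import: it maps every known extractor name to a matching result, with the defaultdict supplying None for extractors that produced nothing. A stand-in illustration, using plain dicts in place of ArchiveResult rows:

from collections import defaultdict

EXTRACTOR_CHOICES = [('wget', 'wget'), ('git', 'git'), ('media', 'media')]
archive_results = [{'extractor': 'wget', 'status': 'succeeded', 'output': 'wget/'}]

extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTOR_CHOICES:
    for result in archive_results:
        if result['extractor'] == extractor and result:
            extractor_outputs[extractor] = result

assert extractor_outputs['wget'] is not None
assert extractor_outputs['git'] is None  # defaultdict fills in the misses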