Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2025-02-23 00:38:27 +00:00)

load EXTRACTORS dynamically using importlib.import_module
parent c7f55fc3ba
commit 457c42bf84
18 changed files with 198 additions and 40 deletions
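
The heart of the change: rather than building the extractor registry from a hardcoded list of tuples, the extractors package now discovers and imports its own submodules at import time. A minimal standalone sketch of that pattern (a simplification for illustration, not the exact ArchiveBox code):

from importlib import import_module
from pathlib import Path

def load_submodules(package: str, package_dir: Path) -> dict:
    """Import every non-dunder *.py file in package_dir as a submodule of package."""
    modules = {}
    for path in sorted(package_dir.glob('*.py')):
        if path.name.startswith('__'):
            continue  # skip __init__.py and friends
        name = path.stem  # 'wget.py' -> 'wget'
        modules[name] = import_module(f'.{name}', package=package)
    return modules

Dropping a new module into the package directory is then enough to register it; no central list needs editing.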

@@ -17,8 +17,6 @@ except AttributeError:

 def forwards_func(apps, schema_editor):
-    from core.models import EXTRACTORS
-
     Snapshot = apps.get_model("core", "Snapshot")
     ArchiveResult = apps.get_model("core", "ArchiveResult")

archivebox/core/models.py

@@ -6,6 +6,7 @@ import json

 from pathlib import Path
 from typing import Optional, List
+from importlib import import_module

 from django.db import models
 from django.utils.functional import cached_property

@@ -20,9 +21,9 @@ from ..system import get_dir_size
 from ..util import parse_date, base_url, hashurl
 from ..index.schema import Link
 from ..index.html import snapshot_icons
-from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
+from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS

-EXTRACTORS = [(extractor[0], extractor[0]) for extractor in get_default_archive_methods()]
+EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
 STATUS_CHOICES = [
     ("succeeded", "succeeded"),
     ("failed", "failed"),

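Before, the choices were derived from the (name, ...) tuples returned by get_default_archive_methods(); now they come from the keys of the dynamically built EXTRACTORS dict. A quick illustration with stand-in values (the real tuples also carry the should_save/save functions):

# old shape: list of tuples, name first
old_methods = [('wget', None, None), ('git', None, None)]
old_choices = [(m[0], m[0]) for m in old_methods]

# new shape: {name: module} dict, choices from the keys
EXTRACTORS = {'wget': object(), 'git': object()}
EXTRACTOR_CHOICES = [(name, name) for name in EXTRACTORS.keys()]

assert old_choices == EXTRACTOR_CHOICES == [('wget', 'wget'), ('git', 'git')]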

@@ -267,11 +268,13 @@ class ArchiveResultManager(models.Manager):


 class ArchiveResult(models.Model):
+    EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
+
     id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
     uuid = models.UUIDField(default=uuid.uuid4, editable=False)

     snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
-    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
+    extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
     cmd = JSONField()
     pwd = models.CharField(max_length=256)
     cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)

@@ -284,3 +287,34 @@
     def __str__(self):
         return self.extractor

+    @cached_property
+    def snapshot_dir(self):
+        return Path(self.snapshot.link_dir)
+
+
+    @property
+    def extractor_module(self):
+        return EXTRACTORS[self.extractor]
+
+    def output_path(self) -> str:
+        """return the canonical output filename or directory name within the snapshot dir"""
+        return self.extractor_module.get_output_path()
+
+    def embed_path(self) -> str:
+        """
+        return the actual runtime-calculated path to the file on-disk that
+        should be used for user-facing iframe embeds of this result
+        """
+
+        if hasattr(self.extractor_module, 'get_embed_path'):
+            return self.extractor_module.get_embed_path(self)
+
+        return self.extractor_module.get_output_path()
+
+    def legacy_output_path(self):
+        link = self.snapshot.as_link()
+        return link.canonical_outputs().get(f'{self.extractor}_path')
+
+    def output_exists(self) -> bool:
+        return Path(self.output_path()).exists()

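The new embed_path() prefers an extractor module's own get_embed_path() and falls back to its canonical output path. A self-contained sketch of that dispatch, with SimpleNamespace objects standing in for real extractor modules:

from types import SimpleNamespace

plain = SimpleNamespace(get_output_path=lambda: 'screenshot.png')  # no custom embed path
custom = SimpleNamespace(
    get_output_path=lambda: 'media/',
    get_embed_path=lambda result: 'media/clip.mp4',  # hypothetical resolved path
)

def embed_path(extractor_module, result=None):
    # same shape as ArchiveResult.embed_path() above
    if hasattr(extractor_module, 'get_embed_path'):
        return extractor_module.get_embed_path(result)
    return extractor_module.get_output_path()

assert embed_path(plain) == 'screenshot.png'
assert embed_path(custom) == 'media/clip.mp4'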

archivebox/extractors/__init__.py

@@ -1,11 +1,13 @@
 __package__ = 'archivebox.extractors'

+from typing import Callable, Optional, Dict, List, Iterable, Union, Protocol, cast
+
 import os
 import sys
 from pathlib import Path

-from typing import Callable, Optional, List, Iterable, Union
+from importlib import import_module
 from datetime import datetime, timezone

 from django.db.models import QuerySet

 from ..config import (

@@ -240,3 +242,37 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa

     log_archiving_finished(num_links)
     return all_links
+
+
+EXTRACTORS_DIR = Path(__file__).parent
+
+class ExtractorModuleProtocol(Protocol):
+    """Type interface for an Extractor Module (WIP)"""
+
+    get_output_path: Callable
+
+    # TODO:
+    # get_embed_path: Callable | None
+    # should_extract(Snapshot)
+    # extract(Snapshot)
+
+
+def get_extractors(dir: Path=EXTRACTORS_DIR) -> Dict[str, ExtractorModuleProtocol]:
+    """iterate through archivebox/extractors/*.py and load extractor modules"""
+    EXTRACTORS = {}
+
+    for filename in EXTRACTORS_DIR.glob('*.py'):
+        if filename.name.startswith('__'):
+            continue
+
+        extractor_name = filename.name.replace('.py', '')
+
+        extractor_module = cast(ExtractorModuleProtocol, import_module(f'.{extractor_name}', package=__package__))
+
+        assert getattr(extractor_module, 'get_output_path')
+        EXTRACTORS[extractor_name] = extractor_module
+
+    return EXTRACTORS
+
+EXTRACTORS = get_extractors(EXTRACTORS_DIR)

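import_module() returns a plain ModuleType, so the cast() to ExtractorModuleProtocol is what gives type checkers a typed view of each loaded extractor, and the assert backs that promise up at runtime. A minimal sketch of the same idea (illustrative names, not part of the diff):

from importlib import import_module
from typing import Callable, Protocol, cast

class HasOutputPath(Protocol):
    get_output_path: Callable[[], str]

def load_typed(name: str, package: str) -> HasOutputPath:
    module = cast(HasOutputPath, import_module(f'.{name}', package=package))
    # cast() is only a promise to the type checker, so verify at runtime too
    assert hasattr(module, 'get_output_path'), f'{name} must define get_output_path()'
    return module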

archivebox/extractors/archive_org.py

@@ -24,6 +24,8 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'archive.org.txt'


 @enforce_types

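The remaining hunks repeat this same mechanical substitution in each extractor: the output filename gains a single source of truth, so callers outside the module (like ArchiveResult.output_path() above) can resolve it generically. A hedged sketch of that lookup, with a hand-built module standing in for the real registry:

import types

archive_org = types.ModuleType('archive_org')
archive_org.get_output_path = lambda: 'archive.org.txt'

EXTRACTORS = {'archive_org': archive_org}  # stand-in for the real registry

def output_path_for(extractor_name: str) -> str:
    return EXTRACTORS[extractor_name].get_output_path()

assert output_path_for('archive_org') == 'archive.org.txt'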

@@ -32,7 +34,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'archive.org.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
         return False

@@ -43,7 +45,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
     """submit site to archive.org for archiving via their service, save returned archive url"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'archive.org.txt'
+    output: ArchiveOutput = get_output_path()
     archive_org_url = None
     submit_url = 'https://web.archive.org/save/{}'.format(link.url)
     # later options take precedence

@@ -88,7 +90,7 @@ def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=
         archive_org_url = archive_org_url or submit_url
         with open(str(out_dir / output), 'w', encoding='utf-8') as f:
             f.write(archive_org_url)
-        chmod_file('archive.org.txt', cwd=str(out_dir))
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
         output = archive_org_url

     return ArchiveResult(

archivebox/extractors/dom.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.html'
+
 @enforce_types
 def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,8 +29,8 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.html').exists():
-        if (out_dir / 'output.html').stat().st_size > 1:
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
             return False

     return SAVE_DOM

@@ -37,7 +40,7 @@ def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print HTML of site to file using chrome --dump-html"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.html'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     cmd = [
         *chrome_args(),

archivebox/extractors/favicon.py

@@ -8,8 +8,8 @@ from ..index.schema import Link, ArchiveResult, ArchiveOutput
 from ..system import chmod_file, run
 from ..util import (
     enforce_types,
-    domain,
-    dedupe,
+    domain,
+    dedupe,
 )
 from ..config import (
     TIMEOUT,

@@ -33,6 +33,11 @@ def should_save_favicon(link: Link, out_dir: Optional[str]=None, overwrite: Opti

     return SAVE_FAVICON

+@enforce_types
+def get_output_path():
+    return 'favicon.ico'
+
+
 @enforce_types
 def save_favicon(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
     """download site favicon from google's favicon api"""

archivebox/extractors/git.py

@@ -26,6 +26,19 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'git/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    try:
+        return get_output_path() + list((archiveresult.snapshot_dir / get_output_path()).glob('*'))[0].name + '/'
+    except IndexError:
+        pass
+
+    return get_output_path()
+
 @enforce_types
 def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

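git clones land in a subdirectory of git/ named after the repository, so get_embed_path() points embeds one level deeper when a clone exists and falls back to the bare folder otherwise. A runnable sketch of that glob-and-fall-back logic against a throwaway directory:

import tempfile
from pathlib import Path

def git_embed_path(snapshot_dir: Path) -> str:
    try:
        return 'git/' + list((snapshot_dir / 'git').glob('*'))[0].name + '/'
    except IndexError:
        return 'git/'  # nothing cloned yet: embed the folder itself

snapshot_dir = Path(tempfile.mkdtemp())
(snapshot_dir / 'git' / 'example-repo').mkdir(parents=True)
assert git_embed_path(snapshot_dir) == 'git/example-repo/'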

@@ -33,7 +46,7 @@ def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'git').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     is_clonable_url = (

@@ -51,7 +64,7 @@ def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """download full site using git"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'git'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     cmd = [

archivebox/extractors/headers.py

@@ -23,10 +23,14 @@ from ..config import (
 )
 from ..logging_util import TimedProgress

+def get_output_path():
+    return 'headers.json'
+
+
 @enforce_types
 def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'headers.json').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HEADERS

@@ -38,7 +42,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)

     out_dir = Path(out_dir or link.link_dir)
     output_folder = out_dir.absolute()
-    output: ArchiveOutput = 'headers.json'
+    output: ArchiveOutput = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')

@@ -59,7 +63,7 @@ def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT)
     try:
         json_headers = get_headers(link.url, timeout=timeout)
         output_folder.mkdir(exist_ok=True)
-        atomic_write(str(output_folder / "headers.json"), json_headers)
+        atomic_write(str(output_folder / get_output_path()), json_headers)
     except (Exception, OSError) as err:
         status = 'failed'
         output = err

archivebox/extractors/htmltotext.py

@@ -19,6 +19,12 @@ from ..util import (
 )
 from .title import get_html


+def get_output_path():
+    return "htmltotext.txt"
+
+
+
 class HTMLTextExtractor(HTMLParser):
     TEXT_ATTRS = [
         "alt", "cite", "href", "label",

@@ -109,7 +115,7 @@ def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_HTMLTOTEXT

@@ -120,7 +126,7 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """extract search-indexing-friendly text from an HTML document"""

     out_dir = Path(out_dir or link.link_dir)
-    output = "htmltotext.txt"
+    output = get_output_path()
     cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']

     timer = TimedProgress(timeout, prefix='    ')

archivebox/extractors/media.py

@@ -22,13 +22,27 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'media/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    out_dir = archiveresult.snapshot_dir / get_output_path()
+    try:
+        return get_output_path() + list(out_dir.glob('*.mp4'))[0].name
+    except IndexError:
+        return get_output_path()
+
+
 @enforce_types
 def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'media').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MEDIA

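media/ follows the same shape as git/, but the embed target is the first *.mp4 in the folder rather than the first subdirectory, degrading to the folder listing when no video was downloaded. A runnable sketch:

import tempfile
from pathlib import Path

def media_embed_path(media_dir: Path) -> str:
    try:
        return 'media/' + list(media_dir.glob('*.mp4'))[0].name
    except IndexError:
        return 'media/'

media_dir = Path(tempfile.mkdtemp()) / 'media'
media_dir.mkdir()
(media_dir / 'talk.mp4').touch()
assert media_embed_path(media_dir) == 'media/talk.mp4'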

@@ -38,7 +52,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
     """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'media'
+    output: ArchiveOutput = get_output_path()
     output_path = out_dir / output
     output_path.mkdir(exist_ok=True)
     # later options take precedence

archivebox/extractors/mercury.py

@@ -24,6 +24,12 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:

@@ -44,7 +50,7 @@ def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Opti
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'mercury').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_MERCURY

@@ -55,8 +61,8 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
     """download reader friendly version using @postlight/mercury-parser"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "mercury"
-    output = "mercury"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='    ')

archivebox/extractors/pdf.py

@@ -19,13 +19,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'output.pdf'
+
+
 @enforce_types
 def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'output.pdf').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_PDF

@@ -36,7 +40,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
     """print PDF of site to file using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'output.pdf'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--print-to-pdf',

@@ -51,7 +55,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
             hints = (result.stderr or result.stdout).decode()
             raise ArchiveError('Failed to save PDF', hints)

-        chmod_file('output.pdf', cwd=str(out_dir))
+        chmod_file(get_output_path(), cwd=str(out_dir))
     except Exception as err:
         status = 'failed'
         output = err

archivebox/extractors/readability.py

@@ -22,6 +22,12 @@ from ..config import (
 from ..logging_util import TimedProgress
 from .title import get_html

+def get_output_path():
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

@@ -29,7 +35,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'readability').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_READABILITY

@@ -40,8 +46,8 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
     """download reader friendly version using @mozilla/readability"""

     out_dir = Path(out_dir or link.link_dir)
-    output_folder = out_dir.absolute() / "readability"
-    output = "readability"
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()

     # Readability Docs: https://github.com/mozilla/readability

archivebox/extractors/screenshot.py

@@ -19,6 +19,9 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'screenshot.png'
+
 @enforce_types
 def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:

@@ -26,7 +29,7 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'screenshot.png').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SCREENSHOT

@@ -36,7 +39,7 @@ def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """take screenshot of site using chrome --headless"""

     out_dir = out_dir or Path(link.link_dir)
-    output: ArchiveOutput = 'screenshot.png'
+    output: ArchiveOutput = get_output_path()
     cmd = [
         *chrome_args(),
         '--screenshot',

archivebox/extractors/singlefile.py

@@ -26,13 +26,17 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    return 'singlefile.html'
+
+
 @enforce_types
 def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     if is_static_file(link.url):
         return False

     out_dir = out_dir or Path(link.link_dir)
-    if not overwrite and (out_dir / 'singlefile.html').exists():
+    if not overwrite and (out_dir / get_output_path()).exists():
         return False

     return SAVE_SINGLEFILE

@@ -43,7 +47,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     """download full site using single-file"""

     out_dir = out_dir or Path(link.link_dir)
-    output = "singlefile.html"
+    output = get_output_path()

     browser_args = chrome_args(CHROME_TIMEOUT=0)

archivebox/extractors/title.py

@@ -60,6 +60,7 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+
 @enforce_types
 def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     """

@@ -84,6 +85,13 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
     else:
         return document


+def get_output_path():
+    # TODO: actually save title to this file
+    # (currently only saved in ArchiveResult.output as charfield value, not saved to filesystem)
+    return 'title.json'
+
+
+
 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
     # if link already has valid title, skip it

archivebox/extractors/wget.py

@@ -35,6 +35,18 @@ from ..config import (
 from ..logging_util import TimedProgress


+def get_output_path():
+    # TODO: actually save output into this folder, instead of do {domain}/**/index.html
+    return 'wget/'
+
+def get_embed_path(archiveresult=None):
+    if not archiveresult:
+        return get_output_path()
+
+    link = archiveresult.snapshot.as_link()
+    return wget_output_path(link)
+
+
 @enforce_types
 def should_save_wget(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
     output_path = wget_output_path(link)

archivebox/index/html.py

@@ -121,7 +121,7 @@ def snapshot_icons(snapshot) -> str:
     cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

     def calc_snapshot_icons():
-        from core.models import EXTRACTORS
+        from core.models import EXTRACTOR_CHOICES
         # start = datetime.now(timezone.utc)

         archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)

@@ -147,12 +147,12 @@ def snapshot_icons(snapshot) -> str:
         # Missing specific entry for WARC

         extractor_outputs = defaultdict(lambda: None)
-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             for result in archive_results:
                 if result.extractor == extractor and result:
                     extractor_outputs[extractor] = result

-        for extractor, _ in EXTRACTORS:
+        for extractor, _ in EXTRACTOR_CHOICES:
             if extractor not in exclude:
                 existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                 # Check filesystsem to see if anything is actually present (too slow, needs optimization/caching)

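The icon-rendering loop is unchanged apart from the renamed import: it maps every known extractor name to a matching result, with the defaultdict supplying None for extractors that produced nothing. A stand-in illustration, using plain dicts in place of ArchiveResult rows:

from collections import defaultdict

EXTRACTOR_CHOICES = [('wget', 'wget'), ('git', 'git'), ('media', 'media')]
archive_results = [{'extractor': 'wget', 'status': 'succeeded', 'output': 'wget/'}]

extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTOR_CHOICES:
    for result in archive_results:
        if result['extractor'] == extractor and result:
            extractor_outputs[extractor] = result

assert extractor_outputs['wget'] is not None
assert extractor_outputs['git'] is None  # defaultdict fills in the misses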