Add extractor files to the chrome, favicon, and title plugins

This commit is contained in:
Nick Sweeting 2024-11-17 20:11:43 -08:00
parent 1b8bafdb56
commit 2f30a35d2b
No known key found for this signature in database
5 changed files with 101 additions and 0 deletions

View file

@ -0,0 +1,26 @@
__package__ = 'abx_plugin_chrome'
from abx_pkg import BinName
from abx_spec_extractor import BaseExtractor, ExtractorName
from .binaries import CHROME_BINARY
class PDFExtractor(BaseExtractor):
    """Extractor declared under the name 'pdf', using the Chrome binary's name."""

    name: ExtractorName = 'pdf'
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the 'pdf' extractor.
PDF_EXTRACTOR = PDFExtractor()
class ScreenshotExtractor(BaseExtractor):
    """Extractor declared under the name 'screenshot', using the Chrome binary's name."""

    name: ExtractorName = 'screenshot'
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the 'screenshot' extractor.
SCREENSHOT_EXTRACTOR = ScreenshotExtractor()
class DOMExtractor(BaseExtractor):
    """Extractor declared under the name 'dom', using the Chrome binary's name."""

    name: ExtractorName = 'dom'
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the 'dom' extractor.
DOM_EXTRACTOR = DOMExtractor()

View file

@ -0,0 +1,32 @@
__package__ = 'abx_plugin_favicon'
from typing import ClassVar
from core.actors import ActorType
from core.statemachines import ArchiveResultMachine
from statemachine import State
from .models import FaviconResult
class FaviconResultActor(ActorType[FaviconResult]):
    """
    Actor specialised for FaviconResult objects.

    It declares FaviconResult as its model and ArchiveResultMachine as the
    state machine class, and takes its active state, final states and state
    field name directly from that machine.
    """

    # NOTE(review): FaviconResult comes from .models; confirm that module
    # really defines it before this actor is imported anywhere.
    Model = FaviconResult
    StateMachineClass = ArchiveResultMachine

    # State wiring, all read from ArchiveResultMachine.
    # Original author's notes on the expected values are kept on each line.
    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started                    # 'started'
    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states         # ['succeeded', 'failed', 'skipped']
    STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name         # status

    # Tuning constants; CLAIM_FROM_TOP_N is derived from MAX_CONCURRENT_ACTORS.
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
    MAX_TICK_TIME: ClassVar[int] = 60
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

    # Disabled queryset override, kept for reference:
    # @classproperty
    # def qs(cls) -> QuerySet[ModelType]:
    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
    #     return cls.Model.objects.filter(extractor='favicon')

View file

@ -0,0 +1,19 @@
__package__ = 'abx_plugin_favicon'
from pathlib import Path
from abx_pkg import BinName
from abx_spec_extractor import BaseExtractor, ExtractorName
from abx_plugin_curl.binaries import CURL_BINARY
class FaviconExtractor(BaseExtractor):
    """Extractor declared under the name 'favicon', using the curl binary's name."""

    name: ExtractorName = 'favicon'
    binary: BinName = CURL_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the path of 'favicon.png' inside the snapshot's link_dir."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / 'favicon.png'


# Module-level instance of the 'favicon' extractor.
FAVICON_EXTRACTOR = FaviconExtractor()

View file

@ -0,0 +1,14 @@
# from django.db import models
# from core.models import ArchiveResult
# class FaviconResultManager(models.Manager):
# def get_queryset(self):
# return super().get_queryset().filter(extractor='favicon')
# class FaviconResult(ArchiveResult):
# objects = FaviconResultManager()
# class Meta:
# proxy = True

View file

@ -0,0 +1,10 @@
__package__ = 'abx_plugin_title'
from abx_spec_extractor import BaseExtractor, ExtractorName
class TitleExtractor(BaseExtractor):
    """Extractor declared under the name 'title'; it sets no binary of its own."""

    name: ExtractorName = 'title'


# Module-level instance of the 'title' extractor.
TITLE_EXTRACTOR = TitleExtractor()