mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-24 21:23:22 +00:00
add extractors files to favicon and title plugins
This commit is contained in:
parent
1b8bafdb56
commit
2f30a35d2b
5 changed files with 101 additions and 0 deletions
|
@ -0,0 +1,26 @@
|
|||
__package__ = 'abx_plugin_chrome'
|
||||
|
||||
from abx_pkg import BinName
|
||||
|
||||
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import CHROME_BINARY
|
||||
|
||||
|
||||
class PDFExtractor(BaseExtractor):
    """Extractor registered under the name 'pdf'; its binary is the Chrome binary's name."""

    name: ExtractorName = 'pdf'
    # Reuse the name declared on CHROME_BINARY instead of repeating it here.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor.
PDF_EXTRACTOR = PDFExtractor()
|
||||
|
||||
|
||||
class ScreenshotExtractor(BaseExtractor):
    """Extractor registered under the name 'screenshot'; its binary is the Chrome binary's name."""

    name: ExtractorName = 'screenshot'
    # Reuse the name declared on CHROME_BINARY instead of repeating it here.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor.
SCREENSHOT_EXTRACTOR = ScreenshotExtractor()
|
||||
|
||||
class DOMExtractor(BaseExtractor):
    """Extractor registered under the name 'dom'; its binary is the Chrome binary's name."""

    name: ExtractorName = 'dom'
    # Reuse the name declared on CHROME_BINARY instead of repeating it here.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor.
DOM_EXTRACTOR = DOMExtractor()
|
|
@ -0,0 +1,32 @@
|
|||
__package__ = 'abx_plugin_favicon'
|
||||
|
||||
from typing import ClassVar
|
||||
|
||||
from core.actors import ActorType
|
||||
from core.statemachines import ArchiveResultMachine
|
||||
|
||||
from statemachine import State
|
||||
|
||||
from .models import FaviconResult
|
||||
|
||||
|
||||
class FaviconResultActor(ActorType[FaviconResult]):
    """
    Actor bound to the FaviconResult model and the ArchiveResultMachine
    state machine.

    NOTE(review): FaviconResult is imported from .models -- confirm that
    module actually defines it, otherwise importing this module will fail.
    """

    # Model / state-machine pairing handled by this actor.
    Model = FaviconResult
    StateMachineClass = ArchiveResultMachine

    # State bookkeeping is read off the state machine rather than re-declared.
    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started                # 'started'
    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states    # ['succeeded', 'failed', 'skipped']
    STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name    # status

    # Tuning constants; CLAIM_FROM_TOP_N is derived from MAX_CONCURRENT_ACTORS.
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
    MAX_TICK_TIME: ClassVar[int] = 60
    CLAIM_FROM_TOP_N: ClassVar[int] = 10 * MAX_CONCURRENT_ACTORS

    # Disabled queryset override, kept for reference:
    # @classproperty
    # def qs(cls) -> QuerySet[ModelType]:
    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
    #     return cls.Model.objects.filter(extractor='favicon')
|
|
@ -0,0 +1,19 @@
|
|||
__package__ = 'abx_plugin_favicon'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from abx_pkg import BinName
|
||||
|
||||
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from abx_plugin_curl.binaries import CURL_BINARY
|
||||
|
||||
|
||||
class FaviconExtractor(BaseExtractor):
    """Extractor registered under the name 'favicon'; its binary is the curl binary's name."""

    name: ExtractorName = 'favicon'
    # Reuse the name declared on CURL_BINARY instead of repeating it here.
    binary: BinName = CURL_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the path 'favicon.png' inside the snapshot's link_dir.

        The annotation allows None, but this implementation always returns a Path.
        """
        # NOTE(review): confirm 'favicon.png' is the intended output filename.
        link_dir = Path(snapshot.link_dir)
        return link_dir / 'favicon.png'


# Module-level instance of the extractor.
FAVICON_EXTRACTOR = FaviconExtractor()
|
|
@ -0,0 +1,14 @@
|
|||
# from django.db import models
|
||||
|
||||
# from core.models import ArchiveResult
|
||||
|
||||
# class FaviconResultManager(models.Manager):
|
||||
# def get_queryset(self):
|
||||
# return super().get_queryset().filter(extractor='favicon')
|
||||
|
||||
|
||||
# class FaviconResult(ArchiveResult):
|
||||
# objects = FaviconResultManager()
|
||||
|
||||
# class Meta:
|
||||
# proxy = True
|
|
@ -0,0 +1,10 @@
|
|||
__package__ = 'abx_plugin_title'
|
||||
|
||||
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
|
||||
|
||||
class TitleExtractor(BaseExtractor):
    """Extractor registered under the name 'title'; it declares no binary of its own."""

    name: ExtractorName = 'title'


# Module-level instance of the extractor.
TITLE_EXTRACTOR = TitleExtractor()
|
Loading…
Reference in a new issue