mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
add extractors files to favicon and title plugins
This commit is contained in:
parent
1b8bafdb56
commit
2f30a35d2b
5 changed files with 101 additions and 0 deletions
|
@ -0,0 +1,26 @@
|
||||||
|
__package__ = 'abx_plugin_chrome'
|
||||||
|
|
||||||
|
from abx_pkg import BinName
|
||||||
|
|
||||||
|
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from .binaries import CHROME_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
class PDFExtractor(BaseExtractor):
    """Extractor declaration named 'pdf' that points at the Chrome binary."""

    # Identifier of this extractor.
    name: ExtractorName = 'pdf'
    # Name of the binary this extractor declares, taken from CHROME_BINARY.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor, created at import time.
PDF_EXTRACTOR = PDFExtractor()
|
||||||
|
|
||||||
|
|
||||||
|
class ScreenshotExtractor(BaseExtractor):
    """Extractor declaration named 'screenshot' that points at the Chrome binary."""

    # Identifier of this extractor.
    name: ExtractorName = 'screenshot'
    # Name of the binary this extractor declares, taken from CHROME_BINARY.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor, created at import time.
SCREENSHOT_EXTRACTOR = ScreenshotExtractor()
|
||||||
|
|
||||||
|
class DOMExtractor(BaseExtractor):
    """Extractor declaration named 'dom' that points at the Chrome binary."""

    # Identifier of this extractor.
    name: ExtractorName = 'dom'
    # Name of the binary this extractor declares, taken from CHROME_BINARY.
    binary: BinName = CHROME_BINARY.name


# Module-level instance of the extractor, created at import time.
DOM_EXTRACTOR = DOMExtractor()
|
|
@ -0,0 +1,32 @@
|
||||||
|
__package__ = 'abx_plugin_favicon'
|
||||||
|
|
||||||
|
from typing import ClassVar
|
||||||
|
|
||||||
|
from core.actors import ActorType
|
||||||
|
from core.statemachines import ArchiveResultMachine
|
||||||
|
|
||||||
|
from statemachine import State
|
||||||
|
|
||||||
|
from .models import FaviconResult
|
||||||
|
|
||||||
|
|
||||||
|
class FaviconResultActor(ActorType[FaviconResult]):
    """
    Actor bound to the FaviconResult model and driven by ArchiveResultMachine.

    As described by the original author: the primary actor for progressing
    ArchiveResult objects through their lifecycle using the
    ArchiveResultMachine.
    """
    # NOTE(review): FaviconResult is imported from .models -- confirm that
    # module actually defines it, otherwise importing this file will fail.

    # Model / state machine this actor is wired to.
    Model = FaviconResult
    StateMachineClass = ArchiveResultMachine

    # State constants are read directly off ArchiveResultMachine.
    ACTIVE_STATE: ClassVar[State] = ArchiveResultMachine.started                    # 'started'
    FINAL_STATES: ClassVar[list[State]] = ArchiveResultMachine.final_states         # ['succeeded', 'failed', 'skipped']
    STATE_FIELD_NAME: ClassVar[str] = ArchiveResultMachine.state_field_name         # status

    # Tuning constants; CLAIM_FROM_TOP_N is derived from MAX_CONCURRENT_ACTORS.
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 6
    MAX_TICK_TIME: ClassVar[int] = 60
    CLAIM_FROM_TOP_N: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10

    # Disabled override kept from the original change:
    # @classproperty
    # def qs(cls) -> QuerySet[ModelType]:
    #     """Get the unfiltered and unsorted QuerySet of all objects that this Actor might care about."""
    #     return cls.Model.objects.filter(extractor='favicon')
|
|
@ -0,0 +1,19 @@
|
||||||
|
__package__ = 'abx_plugin_favicon'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from abx_pkg import BinName
|
||||||
|
|
||||||
|
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
from abx_plugin_curl.binaries import CURL_BINARY
|
||||||
|
|
||||||
|
|
||||||
|
class FaviconExtractor(BaseExtractor):
    """Extractor declaration named 'favicon' that points at the curl binary."""

    # Identifier of this extractor.
    name: ExtractorName = 'favicon'
    # Name of the binary this extractor declares, taken from CURL_BINARY.
    binary: BinName = CURL_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the path of 'favicon.png' inside the snapshot's link_dir.

        `snapshot` must expose a `link_dir` attribute that `Path()` accepts.
        """
        # NOTE(review): confirm 'favicon.png' matches the filename the
        # favicon download step actually writes.
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / 'favicon.png'


# Module-level instance of the extractor, created at import time.
FAVICON_EXTRACTOR = FaviconExtractor()
|
|
@ -0,0 +1,14 @@
|
||||||
|
# from django.db import models
|
||||||
|
|
||||||
|
# from core.models import ArchiveResult
|
||||||
|
|
||||||
|
# class FaviconResultManager(models.Manager):
|
||||||
|
# def get_queryset(self):
|
||||||
|
# return super().get_queryset().filter(extractor='favicon')
|
||||||
|
|
||||||
|
|
||||||
|
# class FaviconResult(ArchiveResult):
|
||||||
|
# objects = FaviconResultManager()
|
||||||
|
|
||||||
|
# class Meta:
|
||||||
|
# proxy = True
|
|
@ -0,0 +1,10 @@
|
||||||
|
__package__ = 'abx_plugin_title'
|
||||||
|
|
||||||
|
from abx_spec_extractor import BaseExtractor, ExtractorName
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
class TitleExtractor(BaseExtractor):
    """Extractor declaration named 'title'; it sets no binary of its own."""

    # Identifier of this extractor.
    name: ExtractorName = 'title'


# Module-level instance of the extractor, created at import time.
TITLE_EXTRACTOR = TitleExtractor()
|
Loading…
Reference in a new issue