# ArchiveBox/archivebox/plugins_extractor/wget/extractors.py

# Pin the package name explicitly; the relative imports further down
# (`.binaries`, `.wget_util`) are resolved against this value.
# NOTE(review): presumably needed because of how plugin modules are loaded --
# confirm before removing.
__package__ = 'plugins_extractor.wget'

from pathlib import Path

from pydantic_pkgr import BinName

from abx.archivebox.base_extractor import BaseExtractor, ExtractorName

from .binaries import WGET_BINARY
from .wget_util import wget_output_path

class WgetExtractor(BaseExtractor):
    """Extractor registered under the name 'wget', backed by the wget binary."""

    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the wget output location for *snapshot* as a ``Path``.

        The location is whatever ``wget_output_path`` reports for the
        snapshot's link; a falsy result (e.g. ``None`` or an empty string)
        is reported to the caller as ``None``.
        """
        located = wget_output_path(snapshot.as_link())
        return Path(located) if located else None

# Module-level instance of the extractor.
WGET_EXTRACTOR = WgetExtractor()


class WarcExtractor(BaseExtractor):
    """Extractor registered under the name 'warc', backed by the wget binary."""

    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` file in the snapshot's ``warc`` dir.

        Looks directly inside ``<snapshot.link_dir>/warc`` (non-recursive) and
        picks the file with the greatest on-disk size. Returns ``None`` when
        no matching file is found.
        """
        warc_dir = Path(snapshot.link_dir) / 'warc'
        # max() is a single O(n) pass and, like sorted(..., reverse=True)[0],
        # returns the first maximal item on ties, so no full sort is needed.
        return max(
            warc_dir.glob('*.warc.gz'),
            key=lambda warc_file: warc_file.stat().st_size,
            default=None,
        )


# Module-level instance of the extractor.
WARC_EXTRACTOR = WarcExtractor()