ArchiveBox/archivebox/plugins_extractor/wget/extractors.py
2024-10-14 21:50:47 -07:00

37 lines
1,012 B
Python

__package__ = 'plugins_extractor.wget'
from pathlib import Path
from pydantic_pkgr import BinName
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from .binaries import WGET_BINARY
from .wget_util import wget_output_path
class WgetExtractor(BaseExtractor):
name: ExtractorName = 'wget'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
wget_index_path = wget_output_path(snapshot.as_link())
if wget_index_path:
return Path(wget_index_path)
return None
WGET_EXTRACTOR = WgetExtractor()
class WarcExtractor(BaseExtractor):
name: ExtractorName = 'warc'
binary: BinName = WGET_BINARY.name
def get_output_path(self, snapshot) -> Path | None:
warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz'))
if warc_files:
return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0]
return None
WARC_EXTRACTOR = WarcExtractor()