mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-18 14:28:25 +00:00
37 lines
1,012 B
Python
37 lines
1,012 B
Python
# Pin the package name explicitly; Python resolves the relative imports
# below (``.binaries``, ``.wget_util``) against ``__package__``.
__package__ = 'plugins_extractor.wget'
|
|
|
|
from pathlib import Path
|
|
|
|
from pydantic_pkgr import BinName
|
|
|
|
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
|
|
|
from .binaries import WGET_BINARY
|
|
from .wget_util import wget_output_path
|
|
|
|
class WgetExtractor(BaseExtractor):
    """Extractor named ``'wget'`` that uses the wget binary."""

    name: ExtractorName = 'wget'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the wget output path for *snapshot* as a ``Path``.

        The snapshot is converted with ``snapshot.as_link()`` and handed to
        ``wget_output_path``; a falsy result from that helper is reported
        as ``None``.
        """
        index_path = wget_output_path(snapshot.as_link())
        return Path(index_path) if index_path else None
|
|
|
|
# Module-level WgetExtractor instance, created at import time.
WGET_EXTRACTOR = WgetExtractor()
|
|
|
|
|
|
class WarcExtractor(BaseExtractor):
    """Extractor named ``'warc'`` that uses the wget binary."""

    name: ExtractorName = 'warc'
    binary: BinName = WGET_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the largest ``*.warc.gz`` file in the snapshot's ``warc`` dir.

        Args:
            snapshot: Object exposing ``link_dir``, the directory that
                contains the ``warc`` subdirectory to search.

        Returns:
            The matching file with the greatest ``st_size``, or ``None``
            when no file matches the pattern.
        """
        warc_dir = Path(snapshot.link_dir) / 'warc'
        # max() picks the largest file in one pass instead of sorting the
        # whole list; on equal sizes it returns the first file encountered,
        # the same element sorted(..., reverse=True)[0] would select.
        return max(
            warc_dir.glob('*.warc.gz'),
            key=lambda warc_file: warc_file.stat().st_size,
            default=None,
        )
|
|
|
|
|
|
# Module-level WarcExtractor instance, created at import time.
WARC_EXTRACTOR = WarcExtractor()
|
|
|