"""
WARNING: THIS FILE IS ALL LEGACY CODE TO BE REMOVED.

DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py

These are the old types we used to use before ArchiveBox v0.4 (before we switched to Django).
"""

__package__ = 'archivebox.index'

from datetime import datetime, timezone, timedelta
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields

from django.utils.functional import cached_property

from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from plugins_extractor.favicon.apps import FAVICON_CONFIG

from ..system import get_dir_size
from ..util import ts_to_date_str, parse_date


class ArchiveError(Exception):
    def __init__(self, message, hints=None):
        super().__init__(message)
        self.hints = hints
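
# A minimal usage sketch (hypothetical error message and hints, for illustration only):
#
#   raise ArchiveError('Failed to save page', hints=['Check your internet connection'])
#
# Callers can catch this and surface the optional hints list alongside the message.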

LinkDict = Dict[str, Any]

ArchiveOutput = Union[str, Exception, None]


@dataclass(frozen=True)
class ArchiveResult:
    cmd: List[str]
    pwd: Optional[str]
    cmd_version: Optional[str]
    output: ArchiveOutput
    status: str
    start_ts: datetime
    end_ts: datetime
    index_texts: Union[List[str], None] = None
    schema: str = 'ArchiveResult'

    def __post_init__(self):
        self.typecheck()

    def _asdict(self):
        return asdict(self)

    def typecheck(self) -> None:
        assert self.schema == self.__class__.__name__
        assert isinstance(self.status, str) and self.status
        assert isinstance(self.start_ts, datetime)
        assert isinstance(self.end_ts, datetime)
        assert isinstance(self.cmd, list)
        assert all(isinstance(arg, str) and arg for arg in self.cmd)

        # TODO: replace empty strings in these three with None / remove them from the DB
        assert self.pwd is None or isinstance(self.pwd, str)
        assert self.cmd_version is None or isinstance(self.cmd_version, str)
        assert self.output is None or isinstance(self.output, (str, Exception))

    @classmethod
    def guess_ts(_cls, dict_info):
        # reconstruct start_ts/end_ts for legacy records that only recorded a
        # bookmark timestamp and a duration in seconds
        parsed_timestamp = parse_date(dict_info["timestamp"])
        start_ts = parsed_timestamp
        end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
        return start_ts, end_ts

    @classmethod
    def from_json(cls, json_info, guess=False):
        info = {
            key: val
            for key, val in json_info.items()
            if key in cls.field_names()
        }
        if guess:
            keys = info.keys()
            if "start_ts" not in keys:
                info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
            else:
                info['start_ts'] = parse_date(info['start_ts'])
                info['end_ts'] = parse_date(info['end_ts'])
            if "pwd" not in keys:
                info["pwd"] = str(ARCHIVE_DIR / json_info["timestamp"])
            if "cmd_version" not in keys:
                info["cmd_version"] = "Undefined"
            if "cmd" not in keys:
                info["cmd"] = []
        else:
            info['start_ts'] = parse_date(info['start_ts'])
            info['end_ts'] = parse_date(info['end_ts'])
            info['cmd_version'] = info.get('cmd_version')
        if isinstance(info["cmd"], str):
            info["cmd"] = [info["cmd"]]
        return cls(**info)
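
    # Usage sketch (hypothetical legacy record; values for illustration only):
    #
    #   result = ArchiveResult.from_json({
    #       'timestamp': '1544077804',
    #       'duration': 3,
    #       'status': 'succeeded',
    #       'output': 'output.pdf',
    #   }, guess=True)
    #
    # With guess=True, a missing start_ts/end_ts pair is reconstructed by
    # guess_ts() from timestamp + duration, pwd defaults to ARCHIVE_DIR/<timestamp>,
    # cmd_version defaults to "Undefined", and cmd defaults to [].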

    def to_dict(self, *keys) -> dict:
        if keys:
            return {k: v for k, v in asdict(self).items() if k in keys}
        return asdict(self)

    def to_json(self, indent=4, sort_keys=True) -> str:
        from .json import to_json
        return to_json(self, indent=indent, sort_keys=sort_keys)

    def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
        from .csv import to_csv
        return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

    @classmethod
    def field_names(cls):
        return [f.name for f in fields(cls)]

    @property
    def duration(self) -> int:
        # note: timedelta.seconds only counts the time-of-day component;
        # archive durations are expected to be well under a day here
        return (self.end_ts - self.start_ts).seconds


@dataclass(frozen=True)
class Link:
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    history: Dict[str, List[ArchiveResult]] = field(default_factory=dict)
    downloaded_at: Optional[datetime] = None
    schema: str = 'Link'

    def __str__(self) -> str:
        return f'[{self.timestamp}] {self.url} "{self.title}"'

    def __post_init__(self):
        self.typecheck()

    def overwrite(self, **kwargs):
        """pure functional version of dict.update that returns a new instance"""
        return Link(**{**self._asdict(), **kwargs})
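
    # Sketch: Link is a frozen dataclass, so fields can't be assigned in place;
    # overwrite() returns a modified copy instead (hypothetical values):
    #
    #   updated = link.overwrite(title='New Title')
    #   assert updated.title == 'New Title' and updated.url == link.url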

    def __eq__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        return self.url == other.url

    def __gt__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        if not self.timestamp or not other.timestamp:
            return None
        return float(self.timestamp) > float(other.timestamp)
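
    # Sorting sketch: links order chronologically by float(timestamp). Only
    # __gt__ is defined, but sorted() still works because Python falls back to
    # the reflected __gt__ when __lt__ is missing:
    #
    #   oldest_first = sorted(links)   # assuming every link has a numeric timestamp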

    def typecheck(self) -> None:
        from ..config.legacy import stderr, ANSI
        try:
            assert self.schema == self.__class__.__name__
            assert isinstance(self.timestamp, str) and self.timestamp
            assert self.timestamp.replace('.', '').isdigit()
            assert isinstance(self.url, str) and '://' in self.url
            assert self.downloaded_at is None or isinstance(self.downloaded_at, datetime)
            assert self.title is None or (isinstance(self.title, str) and self.title)
            assert self.tags is None or isinstance(self.tags, str)
            assert isinstance(self.sources, list)
            assert all(isinstance(source, str) and source for source in self.sources)
            assert isinstance(self.history, dict)
            for method, results in self.history.items():
                assert isinstance(method, str) and method
                assert isinstance(results, list)
                assert all(isinstance(result, ArchiveResult) for result in results)
        except Exception:
            stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
            raise

    def _asdict(self, extended=False):
        info = {
            'schema': 'Link',
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'downloaded_at': self.downloaded_at or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if extended:
            info.update({
                'snapshot_id': self.snapshot_id,
                'snapshot_abid': self.snapshot_abid,

                'link_dir': self.link_dir,
                'archive_path': self.archive_path,

                'hash': self.url_hash,
                'base_url': self.base_url,
                'scheme': self.scheme,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
                'is_static': self.is_static,

                'tags_str': (self.tags or '').strip(','),  # only used to render static index in index/html.py, remove if no longer needed there
                'icons': None,  # only used to render static index in index/html.py, remove if no longer needed there

                'bookmarked_date': self.bookmarked_date,
                'downloaded_datestr': self.downloaded_datestr,
                'oldest_archive_date': self.oldest_archive_date,
                'newest_archive_date': self.newest_archive_date,

                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
                'num_failures': self.num_failures,

                'latest': self.latest_outputs(),
                'canonical': self.canonical_outputs(),
            })
        return info

    def as_snapshot(self):
        from core.models import Snapshot
        return Snapshot.objects.get(url=self.url)

    @classmethod
    def from_json(cls, json_info, guess=False):
        info = {
            key: val
            for key, val in json_info.items()
            if key in cls.field_names()
        }
        info['downloaded_at'] = parse_date(info.get('updated') or info.get('downloaded_at'))
        info['sources'] = info.get('sources') or []

        json_history = info.get('history') or {}
        cast_history = {}

        for method, method_history in json_history.items():
            cast_history[method] = []
            for json_result in method_history:
                assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
                cast_result = ArchiveResult.from_json(json_result, guess)
                cast_history[method].append(cast_result)

        info['history'] = cast_history
        return cls(**info)
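
    # Round-trip sketch (hypothetical legacy index entry, for illustration only):
    #
    #   link = Link.from_json({
    #       'url': 'https://example.com',
    #       'timestamp': '1544077804',
    #       'title': 'Example Domain',
    #       'tags': 'docs,example',
    #       'sources': ['imports/bookmarks.html'],
    #       'history': {'wget': [{...}, {...}]},
    #   }, guess=True)
    #
    # Every dict in history is cast to an ArchiveResult (with guess passed
    # through), so legacy JSON indexes load into fully-typed objects.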

    def to_json(self, indent=4, sort_keys=True) -> str:
        from .json import to_json
        return to_json(self, indent=indent, sort_keys=sort_keys)

    def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
        from .csv import to_csv
        return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

    @cached_property
    def snapshot(self):
        from core.models import Snapshot
        return Snapshot.objects.only('id', 'abid').get(url=self.url)

    @cached_property
    def snapshot_id(self):
        return str(self.snapshot.pk)

    @cached_property
    def snapshot_abid(self):
        return str(self.snapshot.ABID)

    @classmethod
    def field_names(cls):
        return [f.name for f in fields(cls)]

    @property
    def link_dir(self) -> str:
        return str(ARCHIVE_DIR / self.timestamp)

    @property
    def archive_path(self) -> str:
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    @property
    def archive_size(self) -> float:
        try:
            return get_dir_size(self.archive_path)[0]
        except Exception:
            return 0

    ### URL Helpers

    @property
    def url_hash(self):
        from ..util import hashurl
        return hashurl(self.url)

    @property
    def scheme(self) -> str:
        from ..util import scheme
        return scheme(self.url)

    @property
    def extension(self) -> str:
        from ..util import extension
        return extension(self.url)

    @property
    def domain(self) -> str:
        from ..util import domain
        return domain(self.url)

    @property
    def path(self) -> str:
        from ..util import path
        return path(self.url)

    @property
    def basename(self) -> str:
        from ..util import basename
        return basename(self.url)

    @property
    def base_url(self) -> str:
        from ..util import base_url
        return base_url(self.url)

    ### Pretty Printing Helpers

    @property
    def bookmarked_date(self) -> Optional[str]:
        # sanity cap: treat timestamps more than 30 days in the future as invalid
        max_ts = (datetime.now(timezone.utc) + timedelta(days=30)).timestamp()

        if self.timestamp and self.timestamp.replace('.', '').isdigit():
            if 0 < float(self.timestamp) < max_ts:
                return ts_to_date_str(datetime.fromtimestamp(float(self.timestamp)))
            else:
                return str(self.timestamp)
        return None

    @property
    def downloaded_datestr(self) -> Optional[str]:
        return ts_to_date_str(self.downloaded_at) if self.downloaded_at else None

    @property
    def archive_dates(self) -> List[datetime]:
        return [
            parse_date(result.start_ts)
            for method in self.history.keys()
            for result in self.history[method]
        ]

    @property
    def oldest_archive_date(self) -> Optional[datetime]:
        return min(self.archive_dates, default=None)

    @property
    def newest_archive_date(self) -> Optional[datetime]:
        return max(self.archive_dates, default=None)

    ### Archive Status Helpers

    @property
    def num_outputs(self) -> int:
        return self.as_snapshot().num_outputs

    @property
    def num_failures(self) -> int:
        return sum(1
                   for method in self.history.keys()
                   for result in self.history[method]
                   if result.status == 'failed')

    @property
    def is_static(self) -> bool:
        from ..util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        from ..util import domain

        output_paths = (
            domain(self.url),
            'output.html',
            'output.pdf',
            'screenshot.png',
            'singlefile.html',
            'readability/content.html',
            'mercury/content.html',
            'htmltotext.txt',
            'media',
            'git',
        )

        return any(
            (ARCHIVE_DIR / self.timestamp / path).exists()
            for path in output_paths
        )

    def latest_outputs(self, status: Optional[str]=None) -> Dict[str, ArchiveOutput]:
        """get the latest output that each archive method produced for link"""

        ARCHIVE_METHODS = (
            'title', 'favicon', 'wget', 'warc', 'singlefile', 'pdf',
            'screenshot', 'dom', 'git', 'media', 'archive_org',
        )
        latest: Dict[str, ArchiveOutput] = {}
        for archive_method in ARCHIVE_METHODS:
            # get the most recent successful result in history for each archive method
            history = self.history.get(archive_method) or []
            history = list(filter(lambda result: result.output, reversed(history)))
            if status is not None:
                history = list(filter(lambda result: result.status == status, history))

            if history:
                latest[archive_method] = history[0].output
            else:
                latest[archive_method] = None
        return latest
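
    # Shape sketch of the return value (hypothetical outputs, for illustration only):
    #
    #   link.latest_outputs(status='succeeded')
    #   # -> {'title': 'Example Domain', 'wget': 'example.com/index.html',
    #   #     ..., 'media': None, 'archive_org': None}
    #
    # Methods with no non-empty, status-matching result map to None.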

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """predict the expected output paths that should be present after archiving"""

        from ..extractors.wget import wget_output_path

        # TODO: banish this awful duplication from the codebase and import these
        # from their respective extractor files
        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain),
            'wget_path': wget_output_path(self),
            'warc_path': 'warc/',
            'singlefile_path': 'singlefile.html',
            'readability_path': 'readability/content.html',
            'mercury_path': 'mercury/content.html',
            'htmltotext_path': 'htmltotext.txt',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_path': 'git/',
            'media_path': 'media/',
            'headers_path': 'headers.json',
        }
        if self.is_static:
            # static binary files like PDFs and images are handled slightly differently:
            # they're just downloaded once and aren't archived separately multiple times,
            # so the wget, screenshot, & pdf paths should all point to the same file
            static_path = wget_output_path(self)
            canonical.update({
                'title': self.basename,
                'wget_path': static_path,
                'pdf_path': static_path,
                'screenshot_path': static_path,
                'dom_path': static_path,
                'singlefile_path': static_path,
                'readability_path': static_path,
                'mercury_path': static_path,
                'htmltotext_path': static_path,
            })
        return canonical