# ArchiveBox/archivebox/schema.py
# (file-viewer metadata captured with this listing: 246 lines, 6.9 KiB, Python)

import os
from datetime import datetime
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field
class ArchiveError(Exception):
    """Exception that carries an optional ``hints`` value next to its message.

    ``message`` is passed to ``Exception`` unchanged; ``hints`` is stored on
    the instance as given (``None`` when omitted).
    """

    def __init__(self, message, hints=None):
        # Keep the hints on the instance so a handler can read them back.
        self.hints = hints
        super().__init__(message)
# Alias for a string-keyed dict with arbitrary values.
# NOTE(review): presumably the plain-dict form of a Link (see Link._asdict) — confirm against callers.
LinkDict = Dict[str, Any]
@dataclass(frozen=True)
class ArchiveResult:
    """Immutable record describing one archive-method run.

    The record stores the command and its context (``cmd``, ``pwd``,
    ``cmd_version``), what it produced (``output``), a ``status`` string and
    the start/end datetimes of the run.
    """

    cmd: List[str]
    pwd: Optional[str]
    cmd_version: Optional[str]
    # Either an output string, the exception instance, or None.
    output: Union[str, Exception, None]
    status: str
    start_ts: datetime
    end_ts: datetime

    def _asdict(self):
        """Return the fields of this result as a plain dict (``dataclasses.asdict``)."""
        return asdict(self)

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between ``start_ts`` and ``end_ts``.

        Uses ``total_seconds()`` rather than ``timedelta.seconds``: the latter
        is only the seconds *component* (0..86399), so it silently drops whole
        days and turns a negative span into a large positive number.
        """
        elapsed = self.end_ts - self.start_ts
        return int(elapsed.total_seconds())
@dataclass(frozen=True)
class Link:
    """Immutable record for one URL plus the archive results stored for it.

    Equality and hashing are keyed on ``urlhash`` (derived from ``url``), not
    on the dataclass fields, so two links compare equal whenever their URL
    hashes match. Ordering (``>``) compares the numeric value of ``timestamp``.

    Helpers from the project's ``config`` and ``util`` modules are imported
    inside the properties that use them rather than at module level.
    """

    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    # Archive method name -> results for that method. latest_outputs() scans
    # each list from the end, i.e. it treats the last entry as the newest.
    history: Dict[str, List[ArchiveResult]] = field(default_factory=dict)
    updated: Optional[str] = None

    def __hash__(self):
        # __hash__ must return an int. Passing urlhash through hash() keeps
        # links usable in sets/dicts whatever type util.hashurl returns
        # (a non-int digest would otherwise raise TypeError).
        return hash(self.urlhash)

    def __eq__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        return self.urlhash == other.urlhash

    def __gt__(self, other):
        """Return True when this link's timestamp is numerically later.

        Returns ``False`` when either timestamp is empty, and
        ``NotImplemented`` when ``other`` is not a ``Link``.
        """
        if not isinstance(other, Link):
            return NotImplemented
        if not self.timestamp or not other.timestamp:
            # No ordering without both timestamps; answer with an explicit
            # bool instead of falling through to an implicit None.
            return False
        return float(self.timestamp) > float(other.timestamp)

    def _asdict(self, extended=False):
        """Return this link as a plain dict.

        The base dict holds the stored fields, with empty values normalised
        to ``None`` / ``[]`` / ``{}``; ``history`` is passed through as stored.
        With ``extended=True`` the derived properties (paths, URL parts,
        dates and archive status) are added as well.
        """
        info = {
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'updated': self.updated or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if extended:
            info.update({
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,
                'bookmarked_date': self.bookmarked_date,
                'updated_date': self.updated_date,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
                'base_url': self.base_url,
                'is_static': self.is_static,
                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
            })
        return info

    @property
    def link_dir(self) -> str:
        """``ARCHIVE_DIR`` joined with this link's timestamp."""
        from config import ARCHIVE_DIR
        return os.path.join(ARCHIVE_DIR, self.timestamp)

    @property
    def archive_path(self) -> str:
        """The string ``'<ARCHIVE_DIR_NAME>/<timestamp>'``."""
        from config import ARCHIVE_DIR_NAME
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    ### URL Helpers

    @property
    def urlhash(self):
        """Result of ``util.hashurl`` for this link's URL."""
        from util import hashurl
        return hashurl(self.url)

    @property
    def extension(self) -> str:
        """Result of ``util.extension`` for this link's URL."""
        from util import extension
        return extension(self.url)

    @property
    def domain(self) -> str:
        """Result of ``util.domain`` for this link's URL."""
        from util import domain
        return domain(self.url)

    @property
    def path(self) -> str:
        """Result of ``util.path`` for this link's URL."""
        from util import path
        return path(self.url)

    @property
    def basename(self) -> str:
        """Result of ``util.basename`` for this link's URL."""
        from util import basename
        return basename(self.url)

    @property
    def base_url(self) -> str:
        """Result of ``util.base_url`` for this link's URL."""
        from util import base_url
        return base_url(self.url)

    ### Pretty Printing Helpers

    @property
    def bookmarked_date(self) -> Optional[str]:
        """``timestamp`` passed through ``util.ts_to_date``, or None when empty."""
        from util import ts_to_date
        return ts_to_date(self.timestamp) if self.timestamp else None

    @property
    def updated_date(self) -> Optional[str]:
        """``updated`` passed through ``util.ts_to_date``, or None when unset."""
        from util import ts_to_date
        return ts_to_date(self.updated) if self.updated else None

    ### Archive Status Helpers

    @property
    def num_outputs(self) -> int:
        """Number of archive methods that have a truthy latest output."""
        return sum(1 for output in self.latest_outputs().values() if output)

    @property
    def is_static(self) -> bool:
        """Result of ``util.is_static_file`` for this link's URL."""
        from util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        """True when ``ARCHIVE_DIR/<timestamp>/<domain>`` exists on disk."""
        from config import ARCHIVE_DIR
        from util import domain
        return os.path.exists(os.path.join(
            ARCHIVE_DIR,
            self.timestamp,
            domain(self.url),
        ))

    def latest_outputs(self, status: Optional[str]=None) -> Dict[str, Optional[str]]:
        """get the latest output that each archive method produced for link

        For every known archive method, return the ``output`` of the last
        history entry whose output is truthy. When ``status`` is given, only
        entries with exactly that status are considered. Methods without a
        matching entry map to ``None``.
        """
        latest = {
            'title': None,
            'favicon': None,
            'wget': None,
            'warc': None,
            'pdf': None,
            'screenshot': None,
            'dom': None,
            'git': None,
            'media': None,
            'archive_org': None,
        }
        for archive_method in latest:
            # get most recent successful result in history for each archive method
            # (walk newest-first and stop at the first entry that qualifies)
            for result in reversed(self.history.get(archive_method) or []):
                if not result.output:
                    continue
                if status is not None and result.status != status:
                    continue
                latest[archive_method] = result.output
                break
        return latest

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Map each output name to the file name or URL it is expected at.

        Most values are fixed names; ``archive_url`` comes from
        ``util.wget_output_path`` and the Google favicon / archive.org entries
        are absolute URLs built from ``domain`` and ``base_url``.
        NOTE(review): the bare names are presumably relative to ``link_dir`` —
        confirm against the templates that consume them.
        """
        from util import wget_output_path
        # Computed once: the same path also serves every entry of a static file.
        wget_path = wget_output_path(self)
        canonical = {
            'index_url': 'index.html',
            'favicon_url': 'favicon.ico',
            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'archive_url': wget_path,
            'warc_url': 'warc',
            'pdf_url': 'output.pdf',
            'screenshot_url': 'screenshot.png',
            'dom_url': 'output.html',
            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_url': 'git',
            'media_url': 'media',
        }
        if self.is_static:
            # static binary files like PDF and images are handled slightly differently.
            # they're just downloaded once and aren't archived separately multiple times,
            # so the wget, screenshot, & pdf urls should all point to the same file
            canonical.update({
                'title': self.basename,
                'archive_url': wget_path,
                'pdf_url': wget_path,
                'screenshot_url': wget_path,
                'dom_url': wget_path,
            })
        return canonical
@dataclass(frozen=True)
class ArchiveIndex:
    """Immutable top-level index record: descriptive header fields plus links."""

    # Header fields, all plain strings.
    info: str
    version: str
    source: str
    docs: str

    # Link count and update marker stored alongside the link list itself.
    num_links: int
    updated: str
    links: List[Link]

    def _asdict(self):
        """Return the index as a nested plain dict via ``dataclasses.asdict``."""
        return asdict(self)
@dataclass
class RuntimeStats:
    """Mutable bag of run counters and per-phase start/end datetimes."""

    # Outcome counters.
    skipped: int
    succeeded: int
    failed: int

    # Parse phase window.
    parse_start_ts: datetime
    parse_end_ts: datetime

    # Index phase window.
    index_start_ts: datetime
    index_end_ts: datetime

    # Archiving phase window.
    archiving_start_ts: datetime
    archiving_end_ts: datetime