2019-03-26 23:21:34 +00:00
|
|
|
import os
|
|
|
|
|
2019-03-26 09:33:34 +00:00
|
|
|
from datetime import datetime
|
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
from typing import List, Dict, Any, Optional, Union
|
2019-03-26 09:33:34 +00:00
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
from dataclasses import dataclass, asdict, field
|
2019-03-26 09:33:34 +00:00
|
|
|
|
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
class ArchiveError(Exception):
    """Raised when archiving fails; optionally carries hints about how to fix it."""

    def __init__(self, message, hints=None):
        # hints: optional extra context (e.g. suggested remedies) for the caller
        self.hints = hints
        super().__init__(message)
|
|
|
|
|
|
|
|
# Loosely-typed dict standing in for a Link (e.g. as parsed straight from JSON).
LinkDict = Dict[str, Any]
|
|
|
|
|
|
|
|
@dataclass(frozen=True)
class ArchiveResult:
    """Outcome of running one archive method once for a single link."""

    cmd: List[str]                       # argv that was executed
    pwd: Optional[str]                   # working directory the cmd ran in
    cmd_version: Optional[str]           # version string of the tool, if known
    output: Union[str, Exception, None]  # produced output (path/str), or the failure
    status: str                          # outcome label, e.g. 'succeeded'/'failed' (presumed; see RuntimeStats counters)
    start_ts: datetime
    end_ts: datetime

    def _asdict(self):
        """Return a plain-dict representation (recursive, via dataclasses.asdict)."""
        return asdict(self)

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between start_ts and end_ts.

        BUGFIX: previously used `.seconds`, which is only the seconds
        *component* of the timedelta (wrong for durations >= 1 day and for
        negative deltas); total_seconds() is the true elapsed time.
        """
        return int((self.end_ts - self.start_ts).total_seconds())
|
2019-03-26 09:33:34 +00:00
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
@dataclass(frozen=True)
class Link:
    """A single bookmarked URL plus the history of attempts to archive it.

    Identity (hashing/equality) is derived from a hash of the url, so two
    Links for the same url compare equal even if other fields differ.
    """

    # bookmarked-at timestamp as a string; also used as this link's archive
    # folder name (see link_dir / archive_path below)
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    # files/feeds this link was imported from; may be empty
    sources: List[str]
    # archive-method name -> list of ArchiveResults; latest_outputs() treats
    # the end of each list as most recent
    history: Dict[str, List['ArchiveResult']] = field(default_factory=dict)
    # last-updated timestamp string, if any
    updated: Optional[str] = None

    def __hash__(self):
        # BUGFIX: __hash__ must return an int. util.hashurl may return a
        # non-int (e.g. a str digest), which would make hash(link) raise
        # TypeError, so run it through hash().
        return hash(self.urlhash)

    def __eq__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        return self.urlhash == other.urlhash

    def __gt__(self, other):
        """Order links by numeric timestamp (newer > older)."""
        if not isinstance(other, Link):
            return NotImplemented
        if not self.timestamp or not other.timestamp:
            # NOTE(review): deliberately returns None (falsy) rather than
            # NotImplemented when a timestamp is missing; kept as-is in case
            # callers rely on the falsy result.
            return
        return float(self.timestamp) > float(other.timestamp)

    def _asdict(self, extended=False):
        """Return a JSON-serializable dict of this link.

        With extended=True, also include the derived/computed properties
        (paths, formatted dates, archive status).
        """
        info = {
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'updated': self.updated or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if extended:
            info.update({
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,
                'bookmarked_date': self.bookmarked_date,
                'updated_date': self.updated_date,
                'domain': self.domain,
                'path': self.path,
                'basename': self.basename,
                'extension': self.extension,
                'base_url': self.base_url,
                'is_static': self.is_static,
                'is_archived': self.is_archived,
                'num_outputs': self.num_outputs,
            })
        return info

    @property
    def link_dir(self) -> str:
        """Absolute path of this link's archive output folder."""
        from config import ARCHIVE_DIR
        return os.path.join(ARCHIVE_DIR, self.timestamp)

    @property
    def archive_path(self) -> str:
        """Path of this link's archive folder relative to the collection root."""
        from config import ARCHIVE_DIR_NAME
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    ### URL Helpers

    @property
    def urlhash(self):
        # stable identity hash of the url (see util.hashurl for the scheme)
        from util import hashurl
        return hashurl(self.url)

    @property
    def extension(self) -> str:
        from util import extension
        return extension(self.url)

    @property
    def domain(self) -> str:
        from util import domain
        return domain(self.url)

    @property
    def path(self) -> str:
        from util import path
        return path(self.url)

    @property
    def basename(self) -> str:
        from util import basename
        return basename(self.url)

    @property
    def base_url(self) -> str:
        from util import base_url
        return base_url(self.url)

    ### Pretty Printing Helpers

    @property
    def bookmarked_date(self) -> Optional[str]:
        from util import ts_to_date
        return ts_to_date(self.timestamp) if self.timestamp else None

    @property
    def updated_date(self) -> Optional[str]:
        from util import ts_to_date
        return ts_to_date(self.updated) if self.updated else None

    ### Archive Status Helpers

    @property
    def num_outputs(self) -> int:
        """Number of archive methods that have produced at least one output."""
        return len(tuple(filter(None, self.latest_outputs().values())))

    @property
    def is_static(self) -> bool:
        from util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        # considered archived once the output folder for this link's domain
        # exists inside its timestamped archive dir
        from config import ARCHIVE_DIR
        from util import domain

        return os.path.exists(os.path.join(
            ARCHIVE_DIR,
            self.timestamp,
            domain(self.url),
        ))

    def latest_outputs(self, status: Optional[str]=None) -> Dict[str, Optional[str]]:
        """get the latest output that each archive method produced for link"""

        latest: Dict[str, Optional[str]] = {
            'title': None,
            'favicon': None,
            'wget': None,
            'warc': None,
            'pdf': None,
            'screenshot': None,
            'dom': None,
            'git': None,
            'media': None,
            'archive_org': None,
        }
        for archive_method in latest.keys():
            # get most recent successful result in history for each archive method
            history = self.history.get(archive_method) or []
            history = filter(lambda result: result.output, reversed(history))
            if status is not None:
                history = filter(lambda result: result.status == status, history)

            history = list(history)
            if history:
                latest[archive_method] = history[0].output

        return latest

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Map of output names -> canonical paths/urls where each would live."""
        from util import wget_output_path
        canonical = {
            'index_url': 'index.html',
            'favicon_url': 'favicon.ico',
            'google_favicon_url': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'archive_url': wget_output_path(self),
            'warc_url': 'warc',
            'pdf_url': 'output.pdf',
            'screenshot_url': 'screenshot.png',
            'dom_url': 'output.html',
            'archive_org_url': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_url': 'git',
            'media_url': 'media',
        }
        if self.is_static:
            # static binary files like PDF and images are handled slightly differently.
            # they're just downloaded once and aren't archived separately multiple times,
            # so the wget, screenshot, & pdf urls should all point to the same file

            static_url = wget_output_path(self)
            canonical.update({
                'title': self.basename,
                'archive_url': static_url,
                'pdf_url': static_url,
                'screenshot_url': static_url,
                'dom_url': static_url,
            })
        return canonical
|
|
|
|
|
|
|
|
|
|
|
|
@dataclass(frozen=True)
class ArchiveIndex:
    """Schema of the top-level archive index: collection metadata plus every Link."""

    info: str          # human-readable description of the index (presumed; verify against writer)
    version: str       # version of the tool that wrote this index
    source: str        # origin of the index contents (presumed; verify against writer)
    docs: str          # docs link/reference (presumed from name; verify against writer)
    num_links: int     # number of entries in `links`
    updated: str       # when the index was last written (format set by the writer)
    links: List[Link]  # the full list of archived links

    def _asdict(self):
        """Return a recursive plain-dict form (via dataclasses.asdict), e.g. for JSON output."""
        return asdict(self)
|
2019-03-26 09:33:34 +00:00
|
|
|
|
2019-03-26 23:21:34 +00:00
|
|
|
@dataclass
class RuntimeStats:
    """Mutable counters and phase timestamps collected over one run."""

    # per-link outcome counters
    skipped: int
    succeeded: int
    failed: int

    # start/end of the parse phase
    parse_start_ts: datetime
    parse_end_ts: datetime

    # start/end of the index-writing phase
    index_start_ts: datetime
    index_end_ts: datetime

    # start/end of the archiving phase
    archiving_start_ts: datetime
    archiving_end_ts: datetime
|