__package__ = 'archivebox.index'
import os
import sys
import json as pyjson
from pathlib import Path
from datetime import datetime
from typing import List, Optional, Iterator, Any, Union
from .schema import Link, ArchiveResult
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,
    ANSI,
)


MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
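
# Note: MAIN_INDEX_HEADER gets merged into the main index.json written by
# write_json_main_index() below, so the on-disk file ends up shaped roughly
# like this (illustrative sketch, values elided):
#
#   {
#       "info": "...",
#       "schema": "archivebox.index.json",
#       "copyright_info": "...",
#       "meta": {"project": "ArchiveBox", "version": "...", ...},
#       "num_links": 123,
#       "updated": "...",
#       "last_run_cmd": ["archivebox", "add", "..."],
#       "links": [...]
#   }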


### Main Links Index

@enforce_types
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and return the list of links"""

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(index_path):
        with open(index_path, 'r', encoding='utf-8') as f:
            links = pyjson.load(f)['links']
            for link_json in links:
                try:
                    yield Link.from_json(link_json)
                except KeyError:
                    # the main index entry is incomplete, so fall back to the
                    # link's own details index under the archive/ folder
                    try:
                        detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
                        link = parse_json_link_details(str(detail_index_path))
                        if link is not None:
                            yield link
                            continue
                    except KeyError:
                        pass
                    # as a last effort, try to guess the missing values out of existing ones
                    try:
                        yield Link.from_json(link_json, guess=True)
                    except KeyError:
                        print('    {lightyellow}! Failed to load a link from the index.json at {}{reset}'.format(index_path, **ANSI))
                        continue
    return ()
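
# Illustrative usage (a sketch, not called from this module): stream Links out
# of an existing collection's main index without materializing the whole list.
#
#   for link in parse_json_main_index(Path(OUTPUT_DIR)):
#       print(link.timestamp, link.url)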


@enforce_types
def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
    """write the json link index to a given path"""

    assert isinstance(links, list), 'Links must be a list, not a generator.'
    assert not links or isinstance(links[0].history, dict)
    assert not links or isinstance(links[0].sources, list)

    if links and links[0].history.get('title'):
        assert isinstance(links[0].history['title'][0], ArchiveResult)

    if links and links[0].sources:
        assert isinstance(links[0].sources[0], str)

    main_index_json = {
        **MAIN_INDEX_HEADER,
        'num_links': len(links),
        'updated': datetime.now(),
        'last_run_cmd': sys.argv,
        'links': links,
    }
    atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
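
# Illustrative round-trip sketch (not called from this module):
# write_json_main_index() requires a fully materialized list (see the asserts
# above), so the generator returned by parse_json_main_index() must be
# consumed first.
#
#   links = list(parse_json_main_index(Path(OUTPUT_DIR)))
#   write_json_main_index(links, out_dir=Path(OUTPUT_DIR))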


### Link Details Index

@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """write a json file with some info about the link"""

    out_dir = out_dir or link.link_dir
    path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    atomic_write(path, link._asdict(extended=True))


@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
    """load the json link index from a given directory"""

    existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if os.path.exists(existing_index):
        with open(existing_index, 'r', encoding='utf-8') as f:
            try:
                link_json = pyjson.load(f)
                return Link.from_json(link_json, guess)
            except pyjson.JSONDecodeError:
                pass
    return None
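
# Illustrative round-trip sketch for a single snapshot folder (the
# '1556227569' timestamp below is a made-up example value):
#
#   snapshot_dir = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME, '1556227569')
#   link = parse_json_link_details(snapshot_dir)
#   if link:
#       write_json_link_details(link, out_dir=snapshot_dir)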


@enforce_types
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
    """read through all the archive data folders and return the parsed links"""

    for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
        if entry.is_dir(follow_symlinks=True):
            if os.path.exists(os.path.join(entry.path, JSON_INDEX_FILENAME)):
                try:
                    link = parse_json_link_details(entry.path)
                except KeyError:
                    link = None
                if link:
                    yield link
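
# Illustrative usage: count how many snapshot folders under archive/ currently
# have a parseable details index.
#
#   num_ok = sum(1 for _ in parse_json_links_details(OUTPUT_DIR))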


### Helpers

class ExtendedEncoder(pyjson.JSONEncoder):
    """
    Extended json serializer that supports serializing several model
    fields and objects
    """

    def default(self, obj):
        cls_name = obj.__class__.__name__

        if hasattr(obj, '_asdict'):
            return obj._asdict()

        elif isinstance(obj, bytes):
            return obj.decode()

        elif isinstance(obj, datetime):
            return obj.isoformat()

        elif isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        return pyjson.JSONEncoder.default(self, obj)


@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
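
# Illustrative usage: ExtendedEncoder is what lets to_json() accept values the
# stock JSONEncoder rejects, e.g. datetimes, bytes, exceptions, and anything
# with an _asdict() method (such as Link):
#
#   to_json({'updated': datetime.now(), 'body': b'<html>'})
#   # datetimes come out ISO-8601 formatted, bytes are decoded to str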