fix: Guess timestamps and add placeholders to support older indices

This commit is contained in:
Cristian 2020-07-24 09:24:52 -05:00
parent 523c384e62
commit 100fa5d1f5
3 changed files with 49 additions and 16 deletions

View file

@ -529,8 +529,16 @@ def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
except KeyError:
# Try to fix index
if index_exists:
try:
# Last attempt to repair the detail index
link_guessed = parse_json_link_details(entry.path, guess=True)
write_json_link_details(link_guessed, out_dir=entry.path)
link = parse_json_link_details(entry.path)
except Exception as e:
pass
if index_exists and link is None:
# index exists but it's corrupted or unparseable
@ -555,9 +563,9 @@ def is_valid(link: Link) -> bool:
return False
if dir_exists and index_exists:
try:
parsed_link = parse_json_link_details(link.link_dir)
parsed_link = parse_json_link_details(link.link_dir, guess=True)
return link.url == parsed_link.url
except Exception:
except Exception as e:
pass
return False

View file

@ -39,7 +39,6 @@ MAIN_INDEX_HEADER = {
},
}
### Main Links Index
@enforce_types
@ -58,8 +57,12 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
yield parse_json_link_details(str(detail_index_path))
except KeyError:
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
continue
# as a last resort, try to infer the missing values from the existing ones
try:
yield Link.from_json(link_json, guess=True)
except KeyError:
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
continue
return ()
@enforce_types
@ -94,19 +97,18 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
atomic_write(path, link._asdict(extended=True))
@enforce_types
def parse_json_link_details(out_dir: str) -> Optional[Link]:
def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = pyjson.load(f)
return Link.from_json(link_json)
return Link.from_json(link_json, guess)
except pyjson.JSONDecodeError:
pass
return None

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.index'
import os
from pathlib import Path
from datetime import datetime, timedelta
@ -51,7 +52,15 @@ class ArchiveResult:
assert self.output
@classmethod
def from_json(cls, json_info):
def guess_ts(_cls, dict_info):
from ..util import parse_date
parsed_timestamp = parse_date(dict_info["timestamp"])
start_ts = parsed_timestamp
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
return start_ts, end_ts
@classmethod
def from_json(cls, json_info, guess=False):
from ..util import parse_date
info = {
@ -59,9 +68,23 @@ class ArchiveResult:
for key, val in json_info.items()
if key in cls.field_names()
}
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
info['cmd_version'] = info.get('cmd_version')
if guess:
keys = info.keys()
if "start_ts" not in keys:
info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
else:
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
if "pwd" not in keys:
info["pwd"] = str(os.getcwd() / Path(f"archive/{json_info['timestamp']}"))
if "cmd_version" not in keys:
info["cmd_version"] = "Undefined"
if "cmd" not in keys:
info["cmd"] = []
else:
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
info['cmd_version'] = info.get('cmd_version')
return cls(**info)
def to_dict(self, *keys) -> dict:
@ -182,7 +205,7 @@ class Link:
return info
@classmethod
def from_json(cls, json_info):
def from_json(cls, json_info, guess=False):
from ..util import parse_date
info = {
@ -200,7 +223,7 @@ class Link:
cast_history[method] = []
for json_result in method_history:
assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
cast_result = ArchiveResult.from_json(json_result)
cast_result = ArchiveResult.from_json(json_result, guess)
cast_history[method].append(cast_result)
info['history'] = cast_history