mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-26 06:00:22 +00:00
fix: Guess timestamps and add placeholders to support older indices
This commit is contained in:
parent
523c384e62
commit
100fa5d1f5
3 changed files with 49 additions and 16 deletions
|
@ -529,8 +529,16 @@ def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Option
|
|||
link = None
|
||||
try:
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception:
|
||||
pass
|
||||
except KeyError:
|
||||
# Try to fix index
|
||||
if index_exists:
|
||||
try:
|
||||
# Last attempt to repair the detail index
|
||||
link_guessed = parse_json_link_details(entry.path, guess=True)
|
||||
write_json_link_details(link_guessed, out_dir=entry.path)
|
||||
link = parse_json_link_details(entry.path)
|
||||
except Exception as e:
|
||||
pass
|
||||
|
||||
if index_exists and link is None:
|
||||
# index exists but it's corrupted or unparseable
|
||||
|
@ -555,9 +563,9 @@ def is_valid(link: Link) -> bool:
|
|||
return False
|
||||
if dir_exists and index_exists:
|
||||
try:
|
||||
parsed_link = parse_json_link_details(link.link_dir)
|
||||
parsed_link = parse_json_link_details(link.link_dir, guess=True)
|
||||
return link.url == parsed_link.url
|
||||
except Exception:
|
||||
except Exception as e:
|
||||
pass
|
||||
return False
|
||||
|
||||
|
|
|
@ -39,7 +39,6 @@ MAIN_INDEX_HEADER = {
|
|||
},
|
||||
}
|
||||
|
||||
|
||||
### Main Links Index
|
||||
|
||||
@enforce_types
|
||||
|
@ -58,8 +57,12 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
|
|||
detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
|
||||
yield parse_json_link_details(str(detail_index_path))
|
||||
except KeyError:
|
||||
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
|
||||
continue
|
||||
# as a last effort, try to guess the missing values out of existing ones
|
||||
try:
|
||||
yield Link.from_json(link_json, guess=True)
|
||||
except KeyError:
|
||||
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
|
||||
continue
|
||||
return ()
|
||||
|
||||
@enforce_types
|
||||
|
@ -94,19 +97,18 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
|
|||
|
||||
out_dir = out_dir or link.link_dir
|
||||
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
|
||||
|
||||
atomic_write(path, link._asdict(extended=True))
|
||||
|
||||
|
||||
@enforce_types
|
||||
def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
    """Load the per-link JSON index found in `out_dir` and build a Link from it.

    `guess` is handed straight to `Link.from_json`.  The result is None when
    the index file does not exist or when its contents are not valid JSON;
    any other error raised while building the Link propagates to the caller.
    """
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)

    # Nothing to parse if the directory has no index file.
    if not os.path.exists(index_path):
        return None

    with open(index_path, 'r', encoding='utf-8') as index_file:
        try:
            return Link.from_json(pyjson.load(index_file), guess)
        except pyjson.JSONDecodeError:
            # Unparseable JSON is reported the same way as a missing index.
            return None
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
|
@ -51,7 +52,15 @@ class ArchiveResult:
|
|||
assert self.output
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_info):
|
||||
def guess_ts(_cls, dict_info):
    """Derive a (start_ts, end_ts) pair from a result's timestamp and duration.

    The start is `dict_info["timestamp"]` run through `parse_date`; the end is
    that start plus `dict_info["duration"]` seconds, with the duration
    converted via `int()`.  `_cls` is not used.  A KeyError is raised if
    either key is missing from `dict_info`.
    """
    from ..util import parse_date

    started_at = parse_date(dict_info["timestamp"])
    elapsed = timedelta(seconds=int(dict_info["duration"]))
    return started_at, started_at + elapsed
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_info, guess=False):
|
||||
from ..util import parse_date
|
||||
|
||||
info = {
|
||||
|
@ -59,9 +68,23 @@ class ArchiveResult:
|
|||
for key, val in json_info.items()
|
||||
if key in cls.field_names()
|
||||
}
|
||||
info['start_ts'] = parse_date(info['start_ts'])
|
||||
info['end_ts'] = parse_date(info['end_ts'])
|
||||
info['cmd_version'] = info.get('cmd_version')
|
||||
if guess:
|
||||
keys = info.keys()
|
||||
if "start_ts" not in keys:
|
||||
info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
|
||||
else:
|
||||
info['start_ts'] = parse_date(info['start_ts'])
|
||||
info['end_ts'] = parse_date(info['end_ts'])
|
||||
if "pwd" not in keys:
|
||||
info["pwd"] = str(os.getcwd() / Path(f"archive/{json_info['timestamp']}"))
|
||||
if "cmd_version" not in keys:
|
||||
info["cmd_version"] = "Undefined"
|
||||
if "cmd" not in keys:
|
||||
info["cmd"] = []
|
||||
else:
|
||||
info['start_ts'] = parse_date(info['start_ts'])
|
||||
info['end_ts'] = parse_date(info['end_ts'])
|
||||
info['cmd_version'] = info.get('cmd_version')
|
||||
return cls(**info)
|
||||
|
||||
def to_dict(self, *keys) -> dict:
|
||||
|
@ -182,7 +205,7 @@ class Link:
|
|||
return info
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, json_info):
|
||||
def from_json(cls, json_info, guess=False):
|
||||
from ..util import parse_date
|
||||
|
||||
info = {
|
||||
|
@ -200,7 +223,7 @@ class Link:
|
|||
cast_history[method] = []
|
||||
for json_result in method_history:
|
||||
assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
|
||||
cast_result = ArchiveResult.from_json(json_result)
|
||||
cast_result = ArchiveResult.from_json(json_result, guess)
|
||||
cast_history[method].append(cast_result)
|
||||
|
||||
info['history'] = cast_history
|
||||
|
|
Loading…
Reference in a new issue