Mirror of https://github.com/ArchiveBox/ArchiveBox, synced 2024-11-26 14:10:20 +00:00
Commit 100fa5d1f5 (parent 523c384e62)

fix: Guess timestamps and add placeholders to support older indices

3 changed files with 49 additions and 16 deletions
File: archivebox/index/__init__.py (path inferred from the hunk context)

@@ -529,8 +529,16 @@ def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
             link = None
             try:
                 link = parse_json_link_details(entry.path)
-            except Exception:
-                pass
+            except KeyError:
+                # Try to fix index
+                if index_exists:
+                    try:
+                        # Last attempt to repair the detail index
+                        link_guessed = parse_json_link_details(entry.path, guess=True)
+                        write_json_link_details(link_guessed, out_dir=entry.path)
+                        link = parse_json_link_details(entry.path)
+                    except Exception as e:
+                        pass
 
             if index_exists and link is None:
                 # index exists but it's corrupted or unparseable
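This hunk turns a silent failure into a self-repair attempt: when the strict parse of a detail index raises KeyError, the folder scan re-parses it with guess=True, writes the guessed index back to disk, then re-runs the strict parse to confirm the repair took. A standalone sketch of that flow follows, using only the stdlib; load_strict and load_guessing are hypothetical stand-ins for parse_json_link_details with and without guess=True, and the index fields are illustrative.

import json
import os
import tempfile

def load_strict(path):
    """Raises KeyError if a required field is missing, like the strict parse."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return {'url': data['url'], 'start_ts': data['start_ts']}

def load_guessing(path):
    """Tolerates missing fields by deriving them, like parsing with guess=True."""
    with open(path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    data.setdefault('start_ts', data['timestamp'])  # guess start_ts from the legacy timestamp
    return data

def repair(path):
    link = None
    try:
        link = load_strict(path)
    except KeyError:
        # Last attempt to repair: guess the missing values, persist them,
        # then confirm the rewritten index now parses strictly.
        guessed = load_guessing(path)
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(guessed, f)
        link = load_strict(path)
    return link

with tempfile.TemporaryDirectory() as tmp:
    index = os.path.join(tmp, 'index.json')
    with open(index, 'w', encoding='utf-8') as f:
        json.dump({'url': 'https://example.com', 'timestamp': '1575000000.0'}, f)
    print(repair(index))  # {'url': 'https://example.com', 'start_ts': '1575000000.0'}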
@@ -555,9 +563,9 @@ def is_valid(link: Link) -> bool:
         return False
     if dir_exists and index_exists:
         try:
-            parsed_link = parse_json_link_details(link.link_dir)
+            parsed_link = parse_json_link_details(link.link_dir, guess=True)
             return link.url == parsed_link.url
-        except Exception:
+        except Exception as e:
             pass
     return False
 
File: archivebox/index/json.py (path inferred from the hunk context)

@@ -39,7 +39,6 @@ MAIN_INDEX_HEADER = {
     },
 }
-
 
 ### Main Links Index
 
 @enforce_types
@@ -58,8 +57,12 @@ def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
                         detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
                         yield parse_json_link_details(str(detail_index_path))
                     except KeyError:
-                        print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
-                        continue
+                        # as a last effort, try to guess the missing values out of existing ones
+                        try:
+                            yield Link.from_json(link_json, guess=True)
+                        except KeyError:
+                            print("    {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
+                            continue
     return ()
 
 @enforce_types
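The extra fallback matters because older indices recorded archive results without the newer bookkeeping fields; instead of skipping such entries with a warning, the loader now tries to guess the missing values from what is there. Roughly the difference in shape (field sets are illustrative; the exact legacy schema varies by version, see the schema.py hunks below for the fields actually guessed):

# Illustrative only: a result as an older index might record it,
# versus the fields the current schema expects.
old_style_result = {
    'output': 'output.html',
    'status': 'succeeded',
    'timestamp': '1575000000.0',  # guess_ts() derives start_ts/end_ts from these two
    'duration': 12,
}
current_result = {
    'cmd': ['wget', 'https://example.com'],  # placeholder [] if absent
    'pwd': '/data/archive/1575000000.0',     # guessed from cwd if absent
    'cmd_version': '1.20.3',                 # placeholder "Undefined" if absent
    'output': 'output.html',
    'status': 'succeeded',
    'start_ts': '2019-11-29 03:20:00',
    'end_ts': '2019-11-29 03:20:12',
}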
@@ -94,19 +97,18 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
 
     out_dir = out_dir or link.link_dir
     path = os.path.join(out_dir, JSON_INDEX_FILENAME)
 
     atomic_write(path, link._asdict(extended=True))
 
-
 
 @enforce_types
-def parse_json_link_details(out_dir: str) -> Optional[Link]:
+def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
     """load the json link index from a given directory"""
     existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
     if os.path.exists(existing_index):
         with open(existing_index, 'r', encoding='utf-8') as f:
             try:
                 link_json = pyjson.load(f)
-                return Link.from_json(link_json)
+                return Link.from_json(link_json, guess)
             except pyjson.JSONDecodeError:
                 pass
     return None
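A hypothetical call site for the new keyword (the import path assumes an ArchiveBox checkout of this vintage on the import path; the snapshot directory is made up). Note that only JSONDecodeError is swallowed here, so a legacy index with missing fields can still raise KeyError out of Link.from_json() in a strict parse, which is exactly what the callers above catch:

from archivebox.index.json import parse_json_link_details, write_json_link_details

out_dir = 'archive/1575000000.0'  # hypothetical snapshot directory

# Lenient parse: guess=True fills guessed timestamps and placeholder fields.
link = parse_json_link_details(out_dir, guess=True)
if link is not None:
    write_json_link_details(link, out_dir=out_dir)  # persist the repaired index.json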
File: archivebox/index/schema.py (path inferred from the __package__ line and the ArchiveResult/Link classes)

@@ -1,6 +1,7 @@
 __package__ = 'archivebox.index'
 
 import os
+from pathlib import Path
 
 from datetime import datetime, timedelta
 
@@ -51,7 +52,15 @@ class ArchiveResult:
         assert self.output
 
     @classmethod
-    def from_json(cls, json_info):
+    def guess_ts(_cls, dict_info):
+        from ..util import parse_date
+        parsed_timestamp = parse_date(dict_info["timestamp"])
+        start_ts = parsed_timestamp
+        end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
+        return start_ts, end_ts
+
+    @classmethod
+    def from_json(cls, json_info, guess=False):
         from ..util import parse_date
 
         info = {
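guess_ts() reconstructs the missing start_ts/end_ts pair from two fields legacy results do carry: the bookmark timestamp and the duration in seconds. The same arithmetic, runnable with the stdlib alone (datetime.fromtimestamp stands in for archivebox's parse_date, under the assumption that the legacy timestamp is a unix-epoch string):

from datetime import datetime, timedelta

def guess_ts(dict_info):
    parsed_timestamp = datetime.fromtimestamp(float(dict_info['timestamp']))
    start_ts = parsed_timestamp
    end_ts = parsed_timestamp + timedelta(seconds=int(dict_info['duration']))
    return start_ts, end_ts

start, end = guess_ts({'timestamp': '1575000000.0', 'duration': 12})
print(end - start)  # 0:00:12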
@@ -59,9 +68,23 @@ class ArchiveResult:
             for key, val in json_info.items()
             if key in cls.field_names()
         }
-        info['start_ts'] = parse_date(info['start_ts'])
-        info['end_ts'] = parse_date(info['end_ts'])
-        info['cmd_version'] = info.get('cmd_version')
+        if guess:
+            keys = info.keys()
+            if "start_ts" not in keys:
+                info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
+            else:
+                info['start_ts'] = parse_date(info['start_ts'])
+                info['end_ts'] = parse_date(info['end_ts'])
+            if "pwd" not in keys:
+                info["pwd"] = str(os.getcwd() / Path(f"archive/{json_info['timestamp']}"))
+            if "cmd_version" not in keys:
+                info["cmd_version"] = "Undefined"
+            if "cmd" not in keys:
+                info["cmd"] = []
+        else:
+            info['start_ts'] = parse_date(info['start_ts'])
+            info['end_ts'] = parse_date(info['end_ts'])
+            info['cmd_version'] = info.get('cmd_version')
        return cls(**info)
 
     def to_dict(self, *keys) -> dict:
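The guess branch only fills fields that are absent and parses the rest normally. Note the pwd line: os.getcwd() / Path(...) is valid Python because str has no __truediv__, so the expression falls back to PurePath.__rtruediv__. A minimal sketch of just the placeholder rules in isolation (the surrounding dataclass machinery is omitted, and the field values are hypothetical):

import os
from pathlib import Path

def fill_placeholders(info, json_info):
    keys = info.keys()
    if 'pwd' not in keys:
        # str / Path works via PurePath.__rtruediv__
        info['pwd'] = str(os.getcwd() / Path(f"archive/{json_info['timestamp']}"))
    if 'cmd_version' not in keys:
        info['cmd_version'] = 'Undefined'
    if 'cmd' not in keys:
        info['cmd'] = []
    return info

print(fill_placeholders({}, {'timestamp': '1575000000.0'}))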
@@ -182,7 +205,7 @@ class Link:
         return info
 
     @classmethod
-    def from_json(cls, json_info):
+    def from_json(cls, json_info, guess=False):
         from ..util import parse_date
 
         info = {
@@ -200,7 +223,7 @@ class Link:
             cast_history[method] = []
             for json_result in method_history:
                 assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
-                cast_result = ArchiveResult.from_json(json_result)
+                cast_result = ArchiveResult.from_json(json_result, guess)
                 cast_history[method].append(cast_result)
 
         info['history'] = cast_history
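Link.from_json() and ArchiveResult.from_json() now both take guess and thread it downward, so one flag at the call site controls leniency for the whole history tree. A self-contained illustration of that threading pattern with simplified stand-in classes (not the real schema):

from dataclasses import dataclass

@dataclass
class Result:
    status: str

    @classmethod
    def from_json(cls, info, guess=False):
        # lenient mode substitutes a placeholder instead of raising KeyError
        status = info.get('status', 'unknown') if guess else info['status']
        return cls(status=status)

@dataclass
class Entry:
    history: list

    @classmethod
    def from_json(cls, info, guess=False):
        # the outer parser forwards its guess flag to every nested parse
        results = [Result.from_json(r, guess) for r in info.get('history', [])]
        return cls(history=results)

print(Entry.from_json({'history': [{}]}, guess=True))  # Entry(history=[Result(status='unknown')])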