mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
prefer dom dump to singlefile for generating readability output
This commit is contained in:
parent
78d942ac22
commit
db2984e47b
1 changed file with 3 additions and 1 deletion
|
@@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
|
|||
"""
|
||||
canonical = link.canonical_outputs()
|
||||
abs_path = path.absolute()
|
||||
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
|
||||
|
||||
# prefer chrome-generated DOM dump to singlefile as singlefile output often includes HUGE url(data:image/...base64) strings that crash parsers
|
||||
sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
|
||||
document = None
|
||||
for source in sources:
|
||||
try:
|
||||
|
|
Loading…
Reference in a new issue