Prefer the Chrome DOM dump over SingleFile output as the HTML source for generating readability output

This commit is contained in:
Nick Sweeting 2024-01-03 20:11:06 -08:00
parent 78d942ac22
commit db2984e47b

View file

@@ -66,7 +66,9 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
# Prefer the Chrome-generated DOM dump over SingleFile: SingleFile output often includes HUGE url(data:image/...base64) strings that crash parsers.
sources = [canonical["dom_path"], canonical["singlefile_path"], canonical["wget_path"]]
document = None
for source in sources:
try: