mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-29 23:50:22 +00:00
Improve error handling in the Readability and Mercury extractors, and make their output paths relative
This commit is contained in:
parent
c95698e608
commit
acb932ba12
2 changed files with 18 additions and 5 deletions
|
@ -54,7 +54,7 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
|
|
||||||
out_dir = Path(out_dir or link.link_dir)
|
out_dir = Path(out_dir or link.link_dir)
|
||||||
output_folder = out_dir.absolute() / "mercury"
|
output_folder = out_dir.absolute() / "mercury"
|
||||||
output = str(output_folder)
|
output = "mercury"
|
||||||
|
|
||||||
status = 'succeeded'
|
status = 'succeeded'
|
||||||
timer = TimedProgress(timeout, prefix=' ')
|
timer = TimedProgress(timeout, prefix=' ')
|
||||||
|
@ -73,6 +73,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
raise ShellError(cmd, result)
|
raise ShellError(cmd, result)
|
||||||
|
|
||||||
|
if article_text.get('failed'):
|
||||||
|
raise ArchiveError('Mercury was not able to get article text from the URL')
|
||||||
|
|
||||||
atomic_write(str(output_folder / "content.txt"), article_text["content"])
|
atomic_write(str(output_folder / "content.txt"), article_text["content"])
|
||||||
|
|
||||||
# Get HTML version of article
|
# Get HTML version of article
|
||||||
|
@ -86,6 +89,9 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
|
||||||
except json.JSONDecodeError:
|
except json.JSONDecodeError:
|
||||||
raise ShellError(cmd, result)
|
raise ShellError(cmd, result)
|
||||||
|
|
||||||
|
if article_text.get('failed'):
|
||||||
|
raise ArchiveError('Mercury was not able to get article HTML from the URL')
|
||||||
|
|
||||||
atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
|
atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
|
||||||
atomic_write(str(output_folder / "article.json"), article_json)
|
atomic_write(str(output_folder / "article.json"), article_json)
|
||||||
|
|
||||||
|
|
|
@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||||
|
|
||||||
out_dir = Path(out_dir or link.link_dir)
|
out_dir = Path(out_dir or link.link_dir)
|
||||||
output_folder = out_dir.absolute() / "readability"
|
output_folder = out_dir.absolute() / "readability"
|
||||||
output = str(output_folder)
|
output = "readability"
|
||||||
|
|
||||||
# Readability Docs: https://github.com/mozilla/readability
|
# Readability Docs: https://github.com/mozilla/readability
|
||||||
|
|
||||||
|
@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||||
temp_doc.write(document.encode("utf-8"))
|
temp_doc.write(document.encode("utf-8"))
|
||||||
temp_doc.close()
|
temp_doc.close()
|
||||||
|
|
||||||
|
if not document or len(document) < 10:
|
||||||
|
raise ArchiveError('Readability could not find HTML to parse for article text')
|
||||||
|
|
||||||
cmd = [
|
cmd = [
|
||||||
DEPENDENCIES['READABILITY_BINARY']['path'],
|
DEPENDENCIES['READABILITY_BINARY']['path'],
|
||||||
temp_doc.name
|
temp_doc.name,
|
||||||
]
|
]
|
||||||
|
|
||||||
result = run(cmd, cwd=out_dir, timeout=timeout)
|
result = run(cmd, cwd=out_dir, timeout=timeout)
|
||||||
result_json = json.loads(result.stdout)
|
try:
|
||||||
|
result_json = json.loads(result.stdout)
|
||||||
|
except json.JSONDecodeError:
|
||||||
|
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
|
||||||
|
|
||||||
output_folder.mkdir(exist_ok=True)
|
output_folder.mkdir(exist_ok=True)
|
||||||
readability_content = result_json.pop("textContent")
|
readability_content = result_json.pop("textContent")
|
||||||
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
|
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
|
||||||
|
@ -122,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||||
cmd_version=READABILITY_VERSION,
|
cmd_version=READABILITY_VERSION,
|
||||||
output=output,
|
output=output,
|
||||||
status=status,
|
status=status,
|
||||||
index_texts= [readability_content] if readability_content else [],
|
index_texts=[readability_content] if readability_content else [],
|
||||||
**timer.stats,
|
**timer.stats,
|
||||||
)
|
)
|
||||||
|
|
Loading…
Reference in a new issue