From bf3ea4214191af5e59d49c49ac683a19905f742d Mon Sep 17 00:00:00 2001 From: Cristian Date: Thu, 27 Aug 2020 09:51:33 -0500 Subject: [PATCH 1/3] fix: Add a default cmd value to handle case where the html cannot be retrieved --- archivebox/extractors/readability.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index cf3d1e4c..52d3b6ae 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -65,6 +65,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO # Readability Docs: https://github.com/mozilla/readability status = 'succeeded' + cmd = [] timer = TimedProgress(timeout, prefix=' ') try: document = get_html(link, out_dir) From 66037535fd5297ffcec7036caa3e7f8f01b0f429 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 1 Sep 2020 10:16:24 -0500 Subject: [PATCH 2/3] feat: Add curl command on readability as default command to debug --- archivebox/extractors/readability.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 52d3b6ae..5508e096 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -16,6 +16,7 @@ from ..util import ( ) from ..config import ( TIMEOUT, + CURL_BINARY, SAVE_READABILITY, DEPENDENCIES, READABILITY_VERSION, @@ -65,7 +66,10 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO # Readability Docs: https://github.com/mozilla/readability status = 'succeeded' - cmd = [] + cmd = [ + CURL_BINARY, + link.url + ] timer = TimedProgress(timeout, prefix=' ') try: document = get_html(link, out_dir) From a645f36b87c228c63c609e91a2ac7228dd0b06fb Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 1 Sep 2020 19:42:22 -0400 Subject: [PATCH 3/3] add comment about fake cmd --- archivebox/extractors/readability.py | 1 + 1 file changed, 1 insertion(+) diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 5508e096..bd45e9d5 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -66,6 +66,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO # Readability Docs: https://github.com/mozilla/readability status = 'succeeded' + # fake command to show the user so they have something to try debugging if get_html fails cmd = [ CURL_BINARY, link.url