From 5dc7e63792286c31988a964e5d5ef3a89a70ced8 Mon Sep 17 00:00:00 2001 From: Cristian Date: Tue, 11 Aug 2020 11:52:43 -0500 Subject: [PATCH] feat: Update dockerfile to support readability --- Dockerfile | 8 +++++++- archivebox/extractors/readability.py | 3 +-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/Dockerfile b/Dockerfile index c6b898e7..f7b64d75 100644 --- a/Dockerfile +++ b/Dockerfile @@ -57,6 +57,11 @@ RUN wget -qO - https://github.com/gildas-lormeau/SingleFile/archive/master.zip > && npm install --prefix SingleFile-master/cli --production > /dev/null 2>&1 \ && chmod +x SingleFile-master/cli/single-file +RUN wget -qO - https://github.com/pirate/readability-extractor/archive/master.zip > readability.zip \ + && unzip -q readability.zip \ + && npm install --prefix readability-extractor-master --production > /dev/null 2>&1 \ + && chmod +x readability-extractor-master/readability-extractor + # Run everything from here on out as non-privileged user RUN groupadd --system archivebox \ && useradd --system --create-home --gid archivebox --groups audio,video archivebox @@ -74,7 +79,8 @@ EXPOSE 8000 ENV IN_DOCKER=True \ CHROME_BINARY=google-chrome \ CHROME_SANDBOX=False \ - SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" + SINGLEFILE_BINARY="$EXTRA_PATH/SingleFile-master/cli/single-file" \ + READABILITY_BINARY="$EXTRA_PATH/readability-extractor-master/readability-extractor" RUN env ALLOW_ROOT=True archivebox version diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 91e85468..8b573720 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -50,8 +50,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool: if is_static_file(link.url): return False - output = Path(out_dir or link.link_dir) / 'readability.json' - print(output, SAVE_READABILITY) + output = Path(out_dir or link.link_dir) / 'readability' return SAVE_READABILITY and (not output.exists())