tests: Add readability tests

This commit is contained in:
Cristian 2020-08-11 11:15:15 -05:00
parent 8aa7b34de7
commit 2a68af1b94
2 changed files with 36 additions and 3 deletions

View file

@ -37,7 +37,7 @@ def get_html(link: Link, path: Path) -> str:
with open(abs_path / source, "r") as f:
document = f.read()
break
except FileNotFoundError:
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url)
@ -51,6 +51,7 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None) -> bool:
return False
output = Path(out_dir or link.link_dir) / 'readability.json'
print(output, SAVE_READABILITY)
return SAVE_READABILITY and (not output.exists())
@ -63,8 +64,9 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
output = str(output_folder)
document = get_html(link, out_dir)
temp_doc = NamedTemporaryFile()
temp_doc = NamedTemporaryFile(delete=False)
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
# Readability Docs: https://github.com/mozilla/readability
cmd = [
READABILITY_BINARY,
@ -101,7 +103,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
output = err
finally:
timer.end()
temp_doc.close()
return ArchiveResult(
cmd=cmd,

View file

@ -21,3 +21,35 @@ def test_singlefile_works(tmp_path, process, disable_extractors_dict):
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
output_file = archived_item_path / "singlefile.html"
assert output_file.exists()
def test_readability_works(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with USE_READABILITY set and expect content.html.

    Sets USE_READABILITY to "true" in the environment mapping handed to the
    subprocess, then checks that readability/content.html exists under the
    first path matched by the recursive glob on tmp_path's archive folder.
    """
    # NOTE(review): the fixtures are defined outside this view; presumably
    # `process` prepares the collection inside tmp_path -- confirm.
    disable_extractors_dict["USE_READABILITY"] = "true"
    target_url = 'http://127.0.0.1:8080/static/example.com.html'

    # Output is captured but not inspected; only the files on disk are checked.
    subprocess.run(
        ['archivebox', 'add', target_url],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archive_matches = list(tmp_path.glob("archive/**/*"))
    readability_html = archive_matches[0] / "readability" / "content.html"
    assert readability_html.exists()
def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with USE_READABILITY and USE_WGET both set.

    Both keys are set to "true" in the environment mapping handed to the
    subprocess; the test then checks that readability/content.html exists
    under the first path matched by the recursive glob on the archive folder.
    """
    # NOTE(review): the fixtures are defined outside this view; presumably
    # `process` prepares the collection inside tmp_path -- confirm.
    disable_extractors_dict.update(USE_READABILITY="true", USE_WGET="true")
    target_url = 'http://127.0.0.1:8080/static/example.com.html'

    # Output is captured but not inspected; only the files on disk are checked.
    subprocess.run(
        ['archivebox', 'add', target_url],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archive_matches = list(tmp_path.glob("archive/**/*"))
    readability_html = archive_matches[0] / "readability" / "content.html"
    assert readability_html.exists()
def test_readability_works_with_singlefile(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with USE_READABILITY and USE_SINGLEFILE both set.

    Both keys are set to "true" in the environment mapping handed to the
    subprocess; the test then checks that readability/content.html exists
    under the first path matched by the recursive glob on the archive folder.
    """
    # NOTE(review): the fixtures are defined outside this view; presumably
    # `process` prepares the collection inside tmp_path -- confirm.
    disable_extractors_dict.update(USE_READABILITY="true", USE_SINGLEFILE="true")
    target_url = 'http://127.0.0.1:8080/static/example.com.html'

    # Output is captured but not inspected; only the files on disk are checked.
    subprocess.run(
        ['archivebox', 'add', target_url],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archive_matches = list(tmp_path.glob("archive/**/*"))
    readability_html = archive_matches[0] / "readability" / "content.html"
    assert readability_html.exists()
def test_readability_works_with_dom(tmp_path, process, disable_extractors_dict):
    """Run `archivebox add` with USE_READABILITY and SAVE_DOM both set.

    Both keys are set to "true" in the environment mapping handed to the
    subprocess; the test then checks that readability/content.html exists
    under the first path matched by the recursive glob on the archive folder.
    """
    # NOTE(review): the fixtures are defined outside this view; presumably
    # `process` prepares the collection inside tmp_path -- confirm.
    disable_extractors_dict.update(USE_READABILITY="true", SAVE_DOM="true")
    target_url = 'http://127.0.0.1:8080/static/example.com.html'

    # Output is captured but not inspected; only the files on disk are checked.
    subprocess.run(
        ['archivebox', 'add', target_url],
        capture_output=True,
        env=disable_extractors_dict,
    )

    archive_matches = list(tmp_path.glob("archive/**/*"))
    readability_html = archive_matches[0] / "readability" / "content.html"
    assert readability_html.exists()