From 310b4d124259f1593a8cb497cc5c7a2aba658504 Mon Sep 17 00:00:00 2001
From: Ross Williams
Date: Mon, 23 Oct 2023 21:42:25 -0400
Subject: [PATCH] Add htmltotext extractor

Saves HTML text nodes and selected element attributes in
`htmltotext.txt` for each Snapshot. Primarily intended to be used for
search indexing.
---
Review notes (text between the `---` line and the diffstat is not part
of the commit message):
- save_htmltotext(): `cmd` and `status` are now assigned before the try
  block. They were previously assigned only in the except branch, so a
  successful extraction raised UnboundLocalError while building the
  ArchiveResult (after htmltotext.txt had already been written).
- NOTE(review): get_file_result_content() no longer passes
  errors='replace' to open(); confirm strict UTF-8 decoding is intended
  for singlefile/dom/wget output.
- Leading indentation of every hunk line was reconstructed from a
  whitespace-collapsed copy; apply with `git apply --ignore-whitespace`
  if context does not match. The blob id on the htmltotext.py `index`
  line was not recomputed after the hunk grew.

 archivebox/config.py                          |   1 +
 .../migrations/0022_auto_20231023_2008.py     |  18 ++
 archivebox/extractors/__init__.py             |  14 +-
 archivebox/extractors/htmltotext.py           | 174 ++++++++++++++++++
 archivebox/index/html.py                      |   2 +-
 archivebox/index/schema.py                    |   2 +
 archivebox/search/utils.py                    | 107 +----------
 tests/fixtures.py                             |   1 +
 tests/test_extractors.py                      |   8 +
 9 files changed, 223 insertions(+), 104 deletions(-)
 create mode 100644 archivebox/core/migrations/0022_auto_20231023_2008.py
 create mode 100644 archivebox/extractors/htmltotext.py

diff --git a/archivebox/config.py b/archivebox/config.py
index 4286ce58..37bebfc1 100644
--- a/archivebox/config.py
+++ b/archivebox/config.py
@@ -134,6 +134,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
         'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
         'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
+        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
         'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
         'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
         'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
diff --git a/archivebox/core/migrations/0022_auto_20231023_2008.py b/archivebox/core/migrations/0022_auto_20231023_2008.py
new file mode 100644
index 00000000..1b0becef
--- /dev/null
+++ b/archivebox/core/migrations/0022_auto_20231023_2008.py
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2023-10-23 20:08
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0021_auto_20220914_0934'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 3ca9cfa7..183f9824 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -33,6 +33,7 @@ from .wget import should_save_wget, save_wget
 from .singlefile import should_save_singlefile, save_singlefile
 from .readability import should_save_readability, save_readability
 from .mercury import should_save_mercury, save_mercury
+from .htmltotext import should_save_htmltotext, save_htmltotext
 from .pdf import should_save_pdf, save_pdf
 from .screenshot import should_save_screenshot, save_screenshot
 from .dom import should_save_dom, save_dom
@@ -51,15 +52,24 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them
+        # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
+        ('title', should_save_title, save_title),
         ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
+        ('htmltotext', should_save_htmltotext, save_htmltotext),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
         ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
     ]
 
-ARCHIVE_METHODS_INDEXING_PRECEDENCE = [('readability', 1), ('singlefile', 2), ('dom', 3), ('wget', 4)]
+ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
+    ('readability', 1),
+    ('mercury', 2),
+    ('htmltotext', 3),
+    ('singlefile', 4),
+    ('dom', 5),
+    ('wget', 6)
+]
 
 @enforce_types
 def ignore_methods(to_ignore: List[str]):
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
new file mode 100644
index 00000000..18722f13
--- /dev/null
+++ b/archivebox/extractors/htmltotext.py
@@ -0,0 +1,174 @@
+__package__ = 'archivebox.extractors'
+
+from html.parser import HTMLParser
+import io
+from pathlib import Path
+from typing import Optional
+
+from ..config import (
+    SAVE_HTMLTOTEXT,
+    TIMEOUT,
+    VERSION,
+)
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..logging_util import TimedProgress
+from ..system import atomic_write
+from ..util import (
+    enforce_types,
+    is_static_file,
+)
+from .title import get_html
+
+class HTMLTextExtractor(HTMLParser):
+    """Collect text nodes and selected attribute values from HTML.
+
+    Text accumulates in self.output; str(extractor) returns it. Content
+    nested inside any NOTEXT_TAGS element is skipped.
+    """
+
+    TEXT_ATTRS = [
+        "alt", "cite", "href", "label",
+        "list", "placeholder", "title", "value"
+    ]
+    NOTEXT_TAGS = ["script", "style", "template"]
+    NOTEXT_HREF = ["data:", "javascript:", "#"]
+
+    def __init__(self):
+        super().__init__()
+
+        self.output = io.StringIO()
+        self._tag_stack = []
+
+    def _is_text_attr(self, name, value):
+        if not isinstance(value, str):
+            return False
+        if name == "href" and value.startswith(tuple(self.NOTEXT_HREF)):
+            return False
+
+        if name in self.TEXT_ATTRS:
+            return True
+
+        return False
+
+    def _parent_tag(self):
+        try:
+            return self._tag_stack[-1]
+        except IndexError:
+            return None
+
+    def _in_notext_tag(self):
+        return any(t in self._tag_stack for t in self.NOTEXT_TAGS)
+
+    def handle_starttag(self, tag, attrs):
+        self._tag_stack.append(tag)
+
+        # Don't write out attribute values if any ancestor
+        # is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        for name, value in attrs:
+            if self._is_text_attr(name, value):
+                self.output.write(f"({value.strip()}) ")
+
+    def handle_endtag(self, tag):
+        orig_stack = self._tag_stack.copy()
+        try:
+            # Keep popping tags until we find the nearest
+            # ancestor matching this end tag
+            while tag != self._tag_stack.pop():
+                pass
+            # Write a space after every tag, to ensure that tokens
+            # in tag text aren't concatenated. This may result in
+            # excess spaces, which should be ignored by search tokenizers.
+            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
+                self.output.write(" ")
+        except IndexError:
+            # Got to the top of the stack, but somehow missed
+            # this end tag -- maybe malformed markup -- restore the
+            # stack
+            self._tag_stack = orig_stack
+
+    def handle_data(self, data):
+        # Don't output text data if any ancestor is in NOTEXT_TAGS
+        if self._in_notext_tag():
+            return
+
+        data = data.lstrip()
+        len_before_rstrip = len(data)
+        data = data.rstrip()
+        spaces_rstripped = len_before_rstrip - len(data)
+        if data:
+            self.output.write(data)
+        if spaces_rstripped:
+            # Add back a single space if 1 or more
+            # whitespace characters were stripped
+            self.output.write(' ')
+
+    def __str__(self):
+        return self.output.getvalue()
+
+
+@enforce_types
+def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    """Return whether the htmltotext extractor should run for this link.
+
+    Static-file URLs are skipped, as are links that already have an
+    htmltotext.txt in out_dir (unless overwrite is set).
+    """
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / 'htmltotext.txt').exists():
+        return False
+
+    return SAVE_HTMLTOTEXT
+
+
+@enforce_types
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
+    """extract search-indexing-friendly text from an HTML document
+
+    Feeds the HTML returned by get_html() through HTMLTextExtractor and
+    writes the text to htmltotext.txt in out_dir. Errors are not raised;
+    they are reported through the returned ArchiveResult (status 'failed',
+    output set to the exception).
+    """
+
+    out_dir = Path(out_dir or link.link_dir)
+    output = "htmltotext.txt"
+    # Assigned up front so the success path can build the ArchiveResult;
+    # the except branch only overrides status and output.
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
+    status = 'succeeded'
+
+    timer = TimedProgress(timeout, prefix=' ')
+    extracted_text = None
+    try:
+        extractor = HTMLTextExtractor()
+        document = get_html(link, out_dir)
+
+        if not document:
+            raise ArchiveError('htmltotext could not find HTML to parse for article text')
+
+        extractor.feed(document)
+        extractor.close()
+        extracted_text = str(extractor)
+
+        atomic_write(str(out_dir / output), extracted_text)
+    except (Exception, OSError) as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=VERSION,
+        output=output,
+        status=status,
+        index_texts=[extracted_text] if extracted_text else [],
+        **timer.stats,
+    )
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index c0229674..6b914446 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -143,7 +143,7 @@ def snapshot_icons(snapshot) -> str:
             "mercury": "🅼",
             "warc": "📦"
         }
-        exclude = ["favicon", "title", "headers", "archive_org"]
+        exclude = ["favicon", "title", "headers", "htmltotext", "archive_org"]
         # Missing specific entry for WARC
 
         extractor_outputs = defaultdict(lambda: None)
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index c44165a9..85972993 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -429,6 +429,7 @@ class Link:
             'singlefile_path': 'singlefile.html',
             'readability_path': 'readability/content.html',
             'mercury_path': 'mercury/content.html',
+            'htmltotext_path': 'htmltotext.txt',
             'pdf_path': 'output.pdf',
             'screenshot_path': 'screenshot.png',
             'dom_path': 'output.html',
@@ -452,6 +453,7 @@ class Link:
                 'singlefile_path': static_path,
                 'readability_path': static_path,
                 'mercury_path': static_path,
+                'htmltotext_path': static_path,
             })
         return canonical
 
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index 348b5603..723c7fb5 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -1,117 +1,23 @@
-from html.parser import HTMLParser
-import io
-
 from django.db.models import QuerySet
 
 from archivebox.util import enforce_types
-from archivebox.config import ANSI, SEARCH_PROCESS_HTML
-
-BLOCK_SIZE = 32768
+from archivebox.config import ANSI
 
 def log_index_started(url):
     print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
     print( )
 
-
-class HTMLTextExtractor(HTMLParser):
-
-    TEXT_ATTRS = ["alt", "cite", "href", "label", "list", "placeholder", "title", "value"]
-    NOTEXT_TAGS = ["script", "style", "template"]
-    NOTEXT_HREF = ["data:", "javascript:", "#"]
-
-    def __init__(self):
-        super().__init__()
-
-        self.output = io.StringIO()
-        self._tag_stack = []
-
-    def _is_text_attr(self, name, value):
-        if not isinstance(value, str):
-            return False
-        if name == "href" and any(map(lambda p: value.startswith(p), self.NOTEXT_HREF)):
-            return False
-
-        if name in self.TEXT_ATTRS:
-            return True
-
-        return False
-
-    def _parent_tag(self):
-        try:
-            return self._tag_stack[-1]
-        except IndexError:
-            return None
-
-    def _in_notext_tag(self):
-        return any([t in self._tag_stack for t in self.NOTEXT_TAGS])
-
-    def handle_starttag(self, tag, attrs):
-        self._tag_stack.append(tag)
-
-        # Don't write out attribute values if any ancestor
-        # is in NOTEXT_TAGS
-        if self._in_notext_tag():
-            return
-
-        for name, value in attrs:
-            if self._is_text_attr(name, value):
-                self.output.write(value.strip())
-                self.output.write(" ")
-
-    def handle_endtag(self, tag):
-        orig_stack = self._tag_stack.copy()
-        try:
-            # Keep popping tags until we find the nearest
-            # ancestor matching this end tag
-            while tag != self._tag_stack.pop():
-                pass
-            # Write a space after every tag, to ensure that tokens
-            # in tag text aren't concatenated. This may result in
-            # excess spaces, which should be ignored by search tokenizers.
-            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
-                self.output.write(" ")
-        except IndexError:
-            # Got to the top of the stack, but somehow missed
-            # this end tag -- maybe malformed markup -- restore the
-            # stack
-            self._tag_stack = orig_stack
-
-    def handle_data(self, data):
-        # Don't output text data if any ancestor is in NOTEXT_TAGS
-        if self._in_notext_tag():
-            return
-
-        self.output.write(data)
-
-    def __str__(self):
-        return self.output.getvalue()
-
-
-def _read_all(file: io.TextIOBase) -> str:
-    return file.read()
-
-
-def _extract_html_text(file: io.TextIOBase) -> str:
-    extractor = HTMLTextExtractor()
-    while (block := file.read(BLOCK_SIZE)):
-        extractor.feed(block)
-    else:
-        extractor.close()
-
-    return str(extractor)
-
-
-def get_file_result_content(res, extra_path, use_pwd=False, *, filter=_read_all):
+def get_file_result_content(res, extra_path, use_pwd=False):
     if use_pwd:
         fpath = f'{res.pwd}/{res.output}'
     else:
         fpath = f'{res.output}'
-    
+
     if extra_path:
         fpath = f'{fpath}/{extra_path}'
 
-    with open(fpath, 'r', encoding='utf-8', errors='replace') as file:
-        data = filter(file)
+    with open(fpath, 'r', encoding='utf-8') as file:
+        data = file.read()
     if data:
         return [data]
     return []
@@ -132,8 +38,7 @@ def get_indexable_content(results: QuerySet):
     if method == 'readability':
         return get_file_result_content(res, 'content.txt', use_pwd=True)
     elif method == 'singlefile':
-        filter = _extract_html_text if SEARCH_PROCESS_HTML else _read_all
-        return get_file_result_content(res, '', use_pwd=True, filter=filter)
+        return get_file_result_content(res, '', use_pwd=True)
     elif method == 'dom':
         return get_file_result_content(res, '', use_pwd=True)
     elif method == 'wget':
diff --git a/tests/fixtures.py b/tests/fixtures.py
index cca722f3..e9c0bc48 100644
--- a/tests/fixtures.py
+++ b/tests/fixtures.py
@@ -17,6 +17,7 @@ def disable_extractors_dict():
         "USE_SINGLEFILE": "false",
         "USE_READABILITY": "false",
         "USE_MERCURY": "false",
+        "SAVE_HTMLTOTEXT": "false",
         "SAVE_PDF": "false",
         "SAVE_SCREENSHOT": "false",
         "SAVE_DOM": "false",
diff --git a/tests/test_extractors.py b/tests/test_extractors.py
index 86b50d51..bf67b853 100644
--- a/tests/test_extractors.py
+++ b/tests/test_extractors.py
@@ -39,6 +39,14 @@ def test_mercury_works(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "mercury" / "content.html"
     assert output_file.exists()
 
+def test_htmltotext_works(tmp_path, process, disable_extractors_dict):
+    disable_extractors_dict.update({"SAVE_HTMLTOTEXT": "true"})
+    add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
+                                 capture_output=True, env=disable_extractors_dict)
+    archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
+    output_file = archived_item_path / "htmltotext.txt"
+    assert output_file.exists()
+
 def test_readability_works_with_wget(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_READABILITY": "true", "USE_WGET": "true"})
     add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],