From 6555719489dde081ad01ed89d5aa657993534f3e Mon Sep 17 00:00:00 2001 From: Ross Williams Date: Fri, 13 Oct 2023 18:01:32 -0400 Subject: [PATCH] Add space after tags when extracting text Add space after any close tag to ensure that tokens that would be rendered separate in HTML get extracted as separate tokens in text. Example: `

<p>First</p><p>Second</p>

` --> `First Second` NOT `FirstSecond` --- archivebox/search/utils.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py index f734908c..348b5603 100644 --- a/archivebox/search/utils.py +++ b/archivebox/search/utils.py @@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser): # ancestor matching this end tag while tag != self._tag_stack.pop(): pass + # Write a space after every tag, to ensure that tokens + # in tag text aren't concatenated. This may result in + # excess spaces, which should be ignored by search tokenizers. + if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS: + self.output.write(" ") except IndexError: # Got to the top of the stack, but somehow missed # this end tag -- maybe malformed markup -- restore the @@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser): # Don't output text data if any ancestor is in NOTEXT_TAGS if self._in_notext_tag(): return - if stripped := data.strip(): - self.output.write(stripped) - self.output.write(" ") + + self.output.write(data) def __str__(self): return self.output.getvalue()