From 6555719489dde081ad01ed89d5aa657993534f3e Mon Sep 17 00:00:00 2001
From: Ross Williams <ross@ross-williams.net>
Date: Fri, 13 Oct 2023 18:01:32 -0400
Subject: [PATCH] Add space after tags when extracting text

Add a space after every closing tag to ensure that
tokens that would be rendered separately in HTML
are also extracted as separate tokens in the text.

Example:

`<p>First</p><p>Second</p>` --> `First Second`
NOT `FirstSecond`
---
 archivebox/search/utils.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py
index f734908c..348b5603 100644
--- a/archivebox/search/utils.py
+++ b/archivebox/search/utils.py
@@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser):
             # ancestor matching this end tag
             while tag != self._tag_stack.pop():
                 pass
+            # Write a space after every closing tag, to ensure that tokens
+            # in adjacent elements aren't concatenated. This may result in
+            # excess spaces, which should be ignored by search tokenizers.
+            if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
+                self.output.write(" ")
         except IndexError:
             # Got to the top of the stack, but somehow missed
             # this end tag -- maybe malformed markup -- restore the
@@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser):
         # Don't output text data if any ancestor is in NOTEXT_TAGS
         if self._in_notext_tag():
             return
-        if stripped := data.strip():
-            self.output.write(stripped)
-            self.output.write(" ")
+        
+        self.output.write(data)
 
     def __str__(self):
         return self.output.getvalue()