Add space after tags when extracting text

Add a space after every closing tag to ensure that
tokens that would be rendered separately in HTML
are also extracted as separate tokens in the text.

Example:

`<p>First</p><p>Second</p>` --> `First Second`
NOT `FirstSecond`
This commit is contained in:
Ross Williams 2023-10-13 18:01:32 -04:00
parent d8aa84ac98
commit 6555719489

View file

@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser):
# ancestor matching this end tag # ancestor matching this end tag
while tag != self._tag_stack.pop(): while tag != self._tag_stack.pop():
pass pass
# Write a space after every closing tag (outside NOTEXT_TAGS) so that
# tokens in adjacent elements aren't concatenated. This may result in
# excess spaces, which should be ignored by search tokenizers.
if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
self.output.write(" ")
except IndexError: except IndexError:
# Got to the top of the stack, but somehow missed # Got to the top of the stack, but somehow missed
# this end tag -- maybe malformed markup -- restore the # this end tag -- maybe malformed markup -- restore the
@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser):
# Don't output text data if any ancestor is in NOTEXT_TAGS # Don't output text data if any ancestor is in NOTEXT_TAGS
if self._in_notext_tag(): if self._in_notext_tag():
return return
if stripped := data.strip():
self.output.write(stripped) self.output.write(data)
self.output.write(" ")
def __str__(self): def __str__(self):
return self.output.getvalue() return self.output.getvalue()