mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
Add space after tags when extracting text
Add a space after every closing tag so that tokens which would be rendered separately in HTML are also extracted as separate tokens in the text. Example: `<p>First</p><p>Second</p>` --> `First Second`, NOT `FirstSecond`.
This commit is contained in:
parent
d8aa84ac98
commit
6555719489
1 changed file with 7 additions and 3 deletions
|
@ -65,6 +65,11 @@ class HTMLTextExtractor(HTMLParser):
|
||||||
# ancestor matching this end tag
|
# ancestor matching this end tag
|
||||||
while tag != self._tag_stack.pop():
|
while tag != self._tag_stack.pop():
|
||||||
pass
|
pass
|
||||||
|
# Write a space after every tag, to ensure that tokens
|
||||||
|
# in tag text aren't concatenated. This may result in
|
||||||
|
# excess spaces, which should be ignored by search tokenizers.
|
||||||
|
if not self._in_notext_tag() and tag not in self.NOTEXT_TAGS:
|
||||||
|
self.output.write(" ")
|
||||||
except IndexError:
|
except IndexError:
|
||||||
# Got to the top of the stack, but somehow missed
|
# Got to the top of the stack, but somehow missed
|
||||||
# this end tag -- maybe malformed markup -- restore the
|
# this end tag -- maybe malformed markup -- restore the
|
||||||
|
@ -75,9 +80,8 @@ class HTMLTextExtractor(HTMLParser):
|
||||||
# Don't output text data if any ancestor is in NOTEXT_TAGS
|
# Don't output text data if any ancestor is in NOTEXT_TAGS
|
||||||
if self._in_notext_tag():
|
if self._in_notext_tag():
|
||||||
return
|
return
|
||||||
if stripped := data.strip():
|
|
||||||
self.output.write(stripped)
|
self.output.write(data)
|
||||||
self.output.write(" ")
|
|
||||||
|
|
||||||
def __str__(self):
|
def __str__(self):
|
||||||
return self.output.getvalue()
|
return self.output.getvalue()
|
||||||
|
|
Loading…
Reference in a new issue