Fix website loader content encoding detection (#482)

This commit is contained in:
Sascha Ißbrücker 2023-05-30 22:04:54 +02:00 committed by GitHub
parent 5d48c64b2b
commit 4220ea0b4c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 16 additions and 2 deletions

View file

@ -71,8 +71,10 @@ def load_page(url: str):
logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
# Stop reading if we have parsed end of head tag
if '</head>'.encode('utf-8') in content:
end_of_head = '</head>'.encode('utf-8')
if end_of_head in content:
logger.debug(f'Found closing head tag after {size} bytes')
content = content.split(end_of_head)[0] + end_of_head
break
# Stop reading if we exceed limit
if size > MAX_CONTENT_LIMIT:

View file

@ -59,7 +59,7 @@ class WebsiteLoaderTestCase(TestCase):
expected_content_size = 6 * 1024 * 1000
self.assertEqual(expected_content_size, len(content))
def test_load_page_stops_reading_at_closing_head_tag(self):
def test_load_page_stops_reading_at_end_of_head(self):
with mock.patch('requests.get') as mock_get:
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000,
insert_head_after_chunk=0)
@ -69,6 +69,18 @@ class WebsiteLoaderTestCase(TestCase):
expected_content_size = 1 * 1024 * 1000 + len('</head>')
self.assertEqual(expected_content_size, len(content))
def test_load_page_removes_bytes_after_end_of_head(self):
    """Check that load_page drops bytes that follow the closing head tag.

    The mocked response holds a single chunk: a complete head section
    containing a multi-byte character, followed by one stray 0xff byte.
    The test expects load_page to return exactly the head section as text.
    """
    head_html = '<head>人</head>'
    # A lone 0xff byte cannot be decoded as UTF-8; it stands in for
    # trailing data located after the end of the head section.
    undecodable_byte = b'\xff'

    with mock.patch('requests.get') as mock_get:
        response = MockStreamingResponse(num_chunks=1, chunk_size=0)
        response.chunks[0] = head_html.encode('utf-8') + undecodable_byte
        mock_get.return_value = response

        content = website_loader.load_page('https://example.com')

    # The trailing byte must be gone and the remainder parsed as UTF-8.
    self.assertEqual(content, head_html)
def test_load_website_metadata(self):
with mock.patch('bookmarks.services.website_loader.load_page') as mock_load_page:
mock_load_page.return_value = self.render_html_document('test title', 'test description')