mirror of
https://github.com/sissbruecker/linkding
synced 2024-11-21 19:03:02 +00:00
Fix website loader content encoding detection (#482)
This commit is contained in:
parent
5d48c64b2b
commit
4220ea0b4c
2 changed files with 16 additions and 2 deletions
|
@@ -71,8 +71,10 @@ def load_page(url: str):
|
|||
logger.debug(f'Loaded chunk (iteration={iteration}, total={size / 1024})')
|
||||
|
||||
# Stop reading if we have parsed end of head tag
|
||||
if '</head>'.encode('utf-8') in content:
|
||||
end_of_head = '</head>'.encode('utf-8')
|
||||
if end_of_head in content:
|
||||
logger.debug(f'Found closing head tag after {size} bytes')
|
||||
content = content.split(end_of_head)[0] + end_of_head
|
||||
break
|
||||
# Stop reading if we exceed limit
|
||||
if size > MAX_CONTENT_LIMIT:
|
||||
|
|
|
@@ -59,7 +59,7 @@ class WebsiteLoaderTestCase(TestCase):
|
|||
expected_content_size = 6 * 1024 * 1000
|
||||
self.assertEqual(expected_content_size, len(content))
|
||||
|
||||
def test_load_page_stops_reading_at_closing_head_tag(self):
|
||||
def test_load_page_stops_reading_at_end_of_head(self):
|
||||
with mock.patch('requests.get') as mock_get:
|
||||
mock_get.return_value = MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000,
|
||||
insert_head_after_chunk=0)
|
||||
|
@@ -69,6 +69,18 @@ class WebsiteLoaderTestCase(TestCase):
|
|||
expected_content_size = 1 * 1024 * 1000 + len('</head>')
|
||||
self.assertEqual(expected_content_size, len(content))
|
||||
|
||||
def test_load_page_removes_bytes_after_end_of_head(self):
|
||||
with mock.patch('requests.get') as mock_get:
|
||||
mock_response = MockStreamingResponse(num_chunks=1, chunk_size=0)
|
||||
mock_response.chunks[0] = '<head>人</head>'.encode('utf-8')
|
||||
# add a single byte that can't be decoded to utf-8
|
||||
mock_response.chunks[0] += 0xff.to_bytes(1, 'big')
|
||||
mock_get.return_value = mock_response
|
||||
content = website_loader.load_page('https://example.com')
|
||||
|
||||
# verify that byte after head was removed, content parsed as utf-8
|
||||
self.assertEqual(content, '<head>人</head>')
|
||||
|
||||
def test_load_website_metadata(self):
|
||||
with mock.patch('bookmarks.services.website_loader.load_page') as mock_load_page:
|
||||
mock_load_page.return_value = self.render_html_document('test title', 'test description')
|
||||
|
|
Loading…
Reference in a new issue