2022-10-07 19:18:18 +00:00
|
|
|
from unittest import mock
|
|
|
|
from bookmarks.services import website_loader
|
|
|
|
|
|
|
|
from django.test import TestCase
|
|
|
|
|
|
|
|
|
|
|
|
class MockStreamingResponse:
    """Minimal stand-in for a streamed HTTP response, used as a context manager.

    The instance pre-builds a list of byte chunks (``self.chunks``) that
    ``iter_content`` hands back unchanged, so tests control exactly how many
    bytes a consumer sees per chunk.
    """

    def __init__(self, num_chunks, chunk_size, insert_head_after_chunk=None):
        """Build the chunk list.

        Args:
            num_chunks: Number of filler chunks to generate.
            chunk_size: Size in bytes of each filler chunk; every byte is an
                ASCII ``0``.
            insert_head_after_chunk: Zero-based index of the filler chunk
                after which an extra ``</head>`` chunk is inserted. ``None``
                (the default) inserts nothing.
        """
        # Filler is pure ASCII, so repeating the byte is identical to
        # building the text first and encoding it as UTF-8.
        filler = b"0" * chunk_size

        self.chunks = []
        for index in range(num_chunks):
            self.chunks.append(filler)

            if index == insert_head_after_chunk:
                self.chunks.append(b"</head>")

    def iter_content(self, **kwargs):
        """Return the prepared chunk list; keyword arguments are ignored."""
        return self.chunks

    def __enter__(self):
        """Return the response itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release nothing and do not suppress exceptions (returns ``None``)."""
        return None
|
|
|
|
|
|
|
|
|
|
|
|
class WebsiteLoaderTestCase(TestCase):
    """Tests for ``website_loader.load_page`` and ``load_website_metadata``."""

    def setUp(self):
        # clear cached metadata before test run
        website_loader.load_website_metadata.cache_clear()

    def render_html_document(
        self, title, description="", og_description="", og_image=""
    ):
        """Render a small HTML document carrying the given metadata.

        The ``description``, ``og:description`` and ``og:image`` meta tags are
        only emitted when the corresponding argument is non-empty.
        """
        meta_description = (
            f'<meta name="description" content="{description}">' if description else ""
        )
        meta_og_description = (
            f'<meta property="og:description" content="{og_description}">'
            if og_description
            else ""
        )
        meta_og_image = (
            f'<meta property="og:image" content="{og_image}">' if og_image else ""
        )
        return f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>{title}</title>
            {meta_description}
            {meta_og_description}
            {meta_og_image}
        </head>
        <body></body>
        </html>
        """

    def _load_page(self, response):
        """Call ``load_page`` with ``requests.get`` patched to return ``response``."""
        with mock.patch("requests.get") as mock_get:
            mock_get.return_value = response
            return website_loader.load_page("https://example.com")

    def _load_metadata(self, html, url="https://example.com"):
        """Call ``load_website_metadata(url)`` with ``load_page`` patched to return ``html``."""
        with mock.patch(
            "bookmarks.services.website_loader.load_page"
        ) as mock_load_page:
            mock_load_page.return_value = html
            return website_loader.load_website_metadata(url)

    def test_load_page_returns_content(self):
        content = self._load_page(
            MockStreamingResponse(num_chunks=10, chunk_size=1024)
        )

        expected_content_size = 10 * 1024
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_limits_large_documents(self):
        content = self._load_page(
            MockStreamingResponse(num_chunks=10, chunk_size=1024 * 1000)
        )

        # Should have read six chunks, after which content exceeds the max of 5MB
        expected_content_size = 6 * 1024 * 1000
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_stops_reading_at_end_of_head(self):
        content = self._load_page(
            MockStreamingResponse(
                num_chunks=10, chunk_size=1024 * 1000, insert_head_after_chunk=0
            )
        )

        # Should have read first chunk, and second chunk containing closing head tag
        expected_content_size = 1 * 1024 * 1000 + len("</head>")
        self.assertEqual(expected_content_size, len(content))

    def test_load_page_removes_bytes_after_end_of_head(self):
        mock_response = MockStreamingResponse(num_chunks=1, chunk_size=0)
        mock_response.chunks[0] = "<head>人</head>".encode("utf-8")
        # add a single byte that can't be decoded to utf-8
        mock_response.chunks[0] += b"\xff"

        content = self._load_page(mock_response)

        # verify that byte after head was removed, content parsed as utf-8
        self.assertEqual("<head>人</head>", content)

    def test_load_website_metadata(self):
        metadata = self._load_metadata(
            self.render_html_document("test title", "test description")
        )

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)
        self.assertIsNone(metadata.preview_image)

    def test_load_website_metadata_trims_title_and_description(self):
        metadata = self._load_metadata(
            self.render_html_document("  test title  ", "  test description  ")
        )

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)

    def test_load_website_metadata_using_og_description(self):
        metadata = self._load_metadata(
            self.render_html_document(
                "test title", "", og_description="test og description"
            )
        )

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test og description", metadata.description)

    def test_load_website_metadata_using_og_image(self):
        metadata = self._load_metadata(
            self.render_html_document(
                "test title", og_image="http://example.com/image.jpg"
            )
        )

        self.assertEqual("http://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_dots(
        self,
    ):
        metadata = self._load_metadata(
            self.render_html_document("test title", og_image="../image.jpg"),
            url="https://example.com/a/b/page.html",
        )

        self.assertEqual("https://example.com/a/image.jpg", metadata.preview_image)

    def test_load_website_metadata_gets_absolute_og_image_path_when_path_starts_with_slash(
        self,
    ):
        metadata = self._load_metadata(
            self.render_html_document("test title", og_image="/image.jpg"),
            url="https://example.com/a/b/page.html",
        )

        self.assertEqual("https://example.com/image.jpg", metadata.preview_image)

    def test_load_website_metadata_prefers_description_over_og_description(self):
        metadata = self._load_metadata(
            self.render_html_document(
                "test title", "test description", og_description="test og description"
            )
        )

        self.assertEqual("test title", metadata.title)
        self.assertEqual("test description", metadata.description)
|