Load bookmark thumbnails after import (#724)

* Update thumbnails after import

* Safer way to download thumbnails

* small test improvements

* add missing tests

---------

Co-authored-by: Sascha Ißbrücker <sascha.issbruecker@gmail.com>
This commit is contained in:
Viacheslav Slinko 2024-05-10 10:19:00 +03:00 committed by GitHub
parent 87cd4061cb
commit b4376a9ff1
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
9 changed files with 263 additions and 20 deletions

View file

@ -1,4 +1,4 @@
# Generated by Django 5.0.3 on 2024-05-07 07:27
# Generated by Django 5.0.3 on 2024-05-10 07:01
from django.db import migrations, models
@ -13,7 +13,7 @@ class Migration(migrations.Migration):
migrations.AddField(
model_name="bookmark",
name="preview_image_file",
field=models.CharField(blank=True, max_length=512, null=True),
field=models.CharField(blank=True, max_length=512),
),
migrations.AddField(
model_name="userprofile",

View file

@ -59,7 +59,7 @@ class Bookmark(models.Model):
website_description = models.TextField(blank=True, null=True)
web_archive_snapshot_url = models.CharField(max_length=2048, blank=True)
favicon_file = models.CharField(max_length=512, blank=True)
preview_image_file = models.CharField(max_length=512, blank=True, null=True)
preview_image_file = models.CharField(max_length=512, blank=True)
unread = models.BooleanField(default=False)
is_archived = models.BooleanField(default=False)
shared = models.BooleanField(default=False)

View file

@ -83,6 +83,8 @@ def import_netscape_html(
tasks.schedule_bookmarks_without_snapshots(user)
# Load favicons for newly imported bookmarks
tasks.schedule_bookmarks_without_favicons(user)
# Load previews for newly imported bookmarks
tasks.schedule_bookmarks_without_previews(user)
end = timezone.now()
logger.debug(f"Import duration: {end - import_start}")

View file

@ -31,16 +31,58 @@ def load_preview_image(url: str) -> str | None:
logger.debug(f"Could not find preview image in metadata: {url}")
return None
logger.debug(f"Loading preview image: {metadata.preview_image}")
with requests.get(metadata.preview_image, stream=True) as response:
content_type = response.headers["Content-Type"]
preview_image_hash = _url_to_filename(url)
image_url = metadata.preview_image
logger.debug(f"Loading preview image: {image_url}")
with requests.get(image_url, stream=True) as response:
if response.status_code < 200 or response.status_code >= 300:
logger.debug(
f"Bad response status code for preview image: {image_url} status_code={response.status_code}"
)
return None
if "Content-Length" not in response.headers:
logger.debug(f"Empty Content-Length for preview image: {image_url}")
return None
content_length = int(response.headers["Content-Length"])
if content_length > settings.LD_PREVIEW_MAX_SIZE:
logger.debug(
f"Content-Length exceeds LD_PREVIEW_MAX_SIZE: {image_url} length={content_length}"
)
return None
if "Content-Type" not in response.headers:
logger.debug(f"Empty Content-Type for preview image: {image_url}")
return None
content_type = response.headers["Content-Type"].split(";", 1)[0]
file_extension = mimetypes.guess_extension(content_type)
if file_extension not in settings.LD_PREVIEW_ALLOWED_EXTENSIONS:
logger.debug(
f"Unsupported Content-Type for preview image: {image_url} content_type={content_type}"
)
return None
preview_image_hash = _url_to_filename(url)
preview_image_file = f"{preview_image_hash}{file_extension}"
preview_image_path = _get_image_path(preview_image_file)
with open(preview_image_path, "wb") as file:
downloaded = 0
for chunk in response.iter_content(chunk_size=8192):
downloaded += len(chunk)
if downloaded > content_length:
logger.debug(
f"Content-Length mismatch for preview image: {image_url} length={content_length} downloaded={downloaded}"
)
file.close()
preview_image_path.unlink()
return None
file.write(chunk)
logger.debug(f"Saved preview image as: {preview_image_path}")
return preview_image_file

View file

@ -7,6 +7,7 @@ import waybackpy
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
from django.db.models import Q
from django.utils import timezone, formats
from huey import crontab
from huey.contrib.djhuey import HUEY as huey
@ -166,6 +167,12 @@ def is_favicon_feature_active(user: User) -> bool:
return background_tasks_enabled and user.profile.enable_favicons
def is_preview_feature_active(user: User) -> bool:
return (
user.profile.enable_preview_images and not settings.LD_DISABLE_BACKGROUND_TASKS
)
def load_favicon(user: User, bookmark: Bookmark):
if is_favicon_feature_active(user):
_load_favicon_task(bookmark.id)
@ -222,7 +229,7 @@ def _schedule_refresh_favicons_task(user_id: int):
def load_preview_image(user: User, bookmark: Bookmark):
if user.profile.enable_preview_images and not settings.LD_DISABLE_BACKGROUND_TASKS:
if is_preview_feature_active(user):
_load_preview_image_task(bookmark.id)
@ -245,6 +252,27 @@ def _load_preview_image_task(bookmark_id: int):
)
def schedule_bookmarks_without_previews(user: User):
if is_preview_feature_active(user):
_schedule_bookmarks_without_previews_task(user.id)
@task()
def _schedule_bookmarks_without_previews_task(user_id: int):
user = get_user_model().objects.get(id=user_id)
bookmarks = Bookmark.objects.filter(
Q(preview_image_file__exact=""),
owner=user,
)
# TODO: Implement bulk task creation
for bookmark in bookmarks:
try:
_load_preview_image_task(bookmark.id)
except Exception as exc:
logging.exception(exc)
def is_html_snapshot_feature_active() -> bool:
return settings.LD_ENABLE_SNAPSHOTS and not settings.LD_DISABLE_BACKGROUND_TASKS

View file

@ -578,6 +578,60 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
self.assertEqual(self.executed_count(), 0)
def test_schedule_bookmarks_without_previews_should_load_preview_for_all_bookmarks_without_preview(
self,
):
user = self.get_or_create_test_user()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark(preview_image_file="test.png")
self.setup_bookmark(preview_image_file="test.png")
self.setup_bookmark(preview_image_file="test.png")
tasks.schedule_bookmarks_without_previews(user)
self.assertEqual(self.executed_count(), 4)
self.assertEqual(self.mock_load_preview_image.call_count, 3)
def test_schedule_bookmarks_without_previews_should_only_update_user_owned_bookmarks(
self,
):
user = self.get_or_create_test_user()
other_user = User.objects.create_user(
"otheruser", "otheruser@example.com", "password123"
)
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
tasks.schedule_bookmarks_without_previews(user)
self.assertEqual(self.mock_load_preview_image.call_count, 3)
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_schedule_bookmarks_without_previews_should_not_run_when_background_tasks_are_disabled(
self,
):
self.setup_bookmark()
tasks.schedule_bookmarks_without_previews(self.get_or_create_test_user())
self.assertEqual(self.executed_count(), 0)
def test_schedule_bookmarks_without_previews_should_not_run_when_preview_feature_is_disabled(
self,
):
self.user.profile.enable_preview_images = False
self.user.profile.save()
self.setup_bookmark()
tasks.schedule_bookmarks_without_previews(self.get_or_create_test_user())
self.assertEqual(self.executed_count(), 0)
@override_settings(LD_ENABLE_SNAPSHOTS=True)
def test_create_html_snapshot_should_create_pending_asset(self):
bookmark = self.setup_bookmark()

View file

@ -465,3 +465,14 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin, ImportTestMixin):
import_netscape_html(test_html, user)
mock_schedule_bookmarks_without_favicons.assert_called_once_with(user)
def test_schedule_preview_loading(self):
user = self.get_or_create_test_user()
test_html = self.render_html(tags_html="")
with patch.object(
tasks, "schedule_bookmarks_without_previews"
) as mock_schedule_bookmarks_without_previews:
import_netscape_html(test_html, user)
mock_schedule_bookmarks_without_previews.assert_called_once_with(user)

View file

@ -13,9 +13,20 @@ mock_image_data = b"mock_image"
class MockStreamingResponse:
def __init__(self, data=mock_image_data, content_type="image/png"):
def __init__(
self,
url,
data=mock_image_data,
content_type="image/png",
content_length=None,
status_code=200,
):
self.url = url
self.chunks = [data]
self.headers = {"Content-Type": content_type}
self.status_code = status_code
if not content_length:
content_length = len(data)
self.headers = {"Content-Type": content_type, "Content-Length": content_length}
def iter_content(self, **kwargs):
return self.chunks
@ -47,19 +58,29 @@ class PreviewImageLoaderTestCase(TestCase):
self.settings_override.disable()
self.mock_load_website_metadata_patcher.stop()
def create_mock_response(self, icon_data=mock_image_data, content_type="image/png"):
def create_mock_response(
self,
url="https://example.com/image.png",
icon_data=mock_image_data,
content_type="image/png",
content_length=len(mock_image_data),
status_code=200,
):
mock_response = mock.Mock()
mock_response.raw = io.BytesIO(icon_data)
return MockStreamingResponse(icon_data, content_type)
return MockStreamingResponse(
url, icon_data, content_type, content_length, status_code
)
def get_image_path(self, filename):
return Path(os.path.join(settings.LD_PREVIEW_FOLDER, filename))
def image_exists(self, filename):
return self.get_image_path(filename).exists()
def assertImageExists(self, filename, data):
self.assertTrue(self.get_image_path(filename).exists())
self.assertEqual(self.get_image_path(filename).read_bytes(), data)
def get_image_data(self, filename):
return self.get_image_path(filename).read_bytes()
def assertNoImageExists(self):
self.assertFalse(os.listdir(settings.LD_PREVIEW_FOLDER))
def test_load_preview_image(self):
with mock.patch("requests.get") as mock_get:
@ -67,8 +88,8 @@ class PreviewImageLoaderTestCase(TestCase):
file = preview_image_loader.load_preview_image("https://example.com")
self.assertTrue(self.image_exists(file))
self.assertEqual(mock_image_data, self.get_image_data(file))
self.assertIsNotNone(file)
self.assertImageExists(file, mock_image_data)
def test_load_preview_image_returns_none_if_no_preview_image_detected(self):
with mock.patch("requests.get") as mock_get:
@ -78,6 +99,80 @@ class PreviewImageLoaderTestCase(TestCase):
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNone(file)
self.assertNoImageExists()
def test_load_preview_image_returns_none_for_invalid_status_code(self):
invalid_status_codes = [199, 300, 400, 500]
for status_code in invalid_status_codes:
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(
status_code=status_code
)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNone(file)
self.assertNoImageExists()
def test_load_preview_image_returns_none_if_content_length_exceeds_limit(self):
# exceeds max size
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(
content_length=settings.LD_PREVIEW_MAX_SIZE + 1
)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNone(file)
self.assertNoImageExists()
# equals max size
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(
content_length=settings.LD_PREVIEW_MAX_SIZE
)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNotNone(file)
self.assertImageExists(file, mock_image_data)
def test_load_preview_image_returns_none_for_invalid_content_type(self):
invalid_content_types = ["text/html", "application/json"]
for content_type in invalid_content_types:
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(
content_type=content_type
)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNone(file)
self.assertNoImageExists()
valid_content_types = ["image/png", "image/jpeg", "image/gif"]
for content_type in valid_content_types:
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(
content_type=content_type
)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNotNone(file)
self.assertImageExists(file, mock_image_data)
def test_load_preview_image_returns_none_if_download_exceeds_content_length(self):
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(content_length=1)
file = preview_image_loader.load_preview_image("https://example.com")
self.assertIsNone(file)
self.assertNoImageExists()
def test_load_preview_image_creates_folder_if_not_exists(self):
with mock.patch("requests.get") as mock_get:
@ -95,14 +190,16 @@ class PreviewImageLoaderTestCase(TestCase):
def test_guess_file_extension(self):
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(content_type="image/png")
file = preview_image_loader.load_preview_image("https://example.com")
self.assertTrue(self.image_exists(file))
self.assertImageExists(file, mock_image_data)
self.assertEqual("png", file.split(".")[-1])
with mock.patch("requests.get") as mock_get:
mock_get.return_value = self.create_mock_response(content_type="image/jpeg")
file = preview_image_loader.load_preview_image("https://example.com")
self.assertTrue(self.image_exists(file))
self.assertImageExists(file, mock_image_data)
self.assertEqual("jpg", file.split(".")[-1])

View file

@ -289,6 +289,15 @@ LD_ENABLE_REFRESH_FAVICONS = os.getenv("LD_ENABLE_REFRESH_FAVICONS", True) in (
# Previews settings
LD_PREVIEW_FOLDER = os.path.join(BASE_DIR, "data", "previews")
LD_PREVIEW_MAX_SIZE = int(os.getenv("LD_PREVIEW_MAX_SIZE", 5242880))
LD_PREVIEW_ALLOWED_EXTENSIONS = [
".jpg",
".jpeg",
".png",
".gif",
".svg",
".webp",
]
# Asset / snapshot settings
LD_ASSET_FOLDER = os.path.join(BASE_DIR, "data", "assets")