Generate fallback URLs for web archive links (#804)

* generate fallback web archive URL if none exists

* remove fallback web archive snapshot creation

* fix test
This commit is contained in:
Sascha Ißbrücker 2024-08-29 22:45:10 +02:00 committed by GitHub
parent 36a84276a2
commit 749bc1ef63
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 53 additions and 336 deletions

View file

@ -79,8 +79,6 @@ def import_netscape_html(
for batch in batches:
_import_batch(batch, user, options, tag_cache, result)
# Create snapshots for newly imported bookmarks
tasks.schedule_bookmarks_without_snapshots(user)
# Load favicons for newly imported bookmarks
tasks.schedule_bookmarks_without_favicons(user)
# Load previews for newly imported bookmarks

View file

@ -12,9 +12,8 @@ from django.utils import timezone, formats
from huey import crontab
from huey.contrib.djhuey import HUEY as huey
from huey.exceptions import TaskLockedException
from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
from waybackpy.exceptions import WaybackError, TooManyRequestsError
import bookmarks.services.wayback
from bookmarks.models import Bookmark, BookmarkAsset, UserProfile
from bookmarks.services import favicon_loader, singlefile, preview_image_loader
from bookmarks.services.website_loader import DEFAULT_USER_AGENT
@ -66,29 +65,6 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
_create_web_archive_snapshot_task(bookmark.id, force_update)
def _load_newest_snapshot(bookmark: Bookmark):
try:
logger.info(f"Load existing snapshot for bookmark. url={bookmark.url}")
cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(
bookmark.url
)
existing_snapshot = cdx_api.newest()
if existing_snapshot:
bookmark.web_archive_snapshot_url = existing_snapshot.archive_url
bookmark.save(update_fields=["web_archive_snapshot_url"])
logger.info(
f"Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}"
)
except NoCDXRecordFound:
logger.info(f"Could not find any snapshots for bookmark. url={bookmark.url}")
except WaybackError as error:
logger.error(
f"Failed to load existing snapshot. url={bookmark.url}", exc_info=error
)
def _create_snapshot(bookmark: Bookmark):
logger.info(f"Create new snapshot for bookmark. url={bookmark.url}...")
archive = waybackpy.WaybackMachineSaveAPI(
@ -117,48 +93,27 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
return
except TooManyRequestsError:
logger.error(
f"Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}"
f"Failed to create snapshot due to rate limiting. url={bookmark.url}"
)
except WaybackError as error:
logger.error(
f"Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}",
f"Failed to create snapshot. url={bookmark.url}",
exc_info=error,
)
# Load the newest snapshot as fallback
_load_newest_snapshot(bookmark)
@task()
def _load_web_archive_snapshot_task(bookmark_id: int):
try:
bookmark = Bookmark.objects.get(id=bookmark_id)
except Bookmark.DoesNotExist:
return
# Skip if snapshot exists
if bookmark.web_archive_snapshot_url:
return
# Load the newest snapshot
_load_newest_snapshot(bookmark)
def schedule_bookmarks_without_snapshots(user: User):
if is_web_archive_integration_active(user):
_schedule_bookmarks_without_snapshots_task(user.id)
# Loading snapshots from CDX API has been removed, keeping the task function
# for now to prevent errors when huey tries to run the task
pass
@task()
def _schedule_bookmarks_without_snapshots_task(user_id: int):
user = get_user_model().objects.get(id=user_id)
bookmarks_without_snapshots = Bookmark.objects.filter(
web_archive_snapshot_url__exact="", owner=user
)
# TODO: Implement bulk task creation
for bookmark in bookmarks_without_snapshots:
# To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
# new ones when processing bookmarks in bulk
_load_web_archive_snapshot_task(bookmark.id)
# Loading snapshots from CDX API has been removed, keeping the task function
# for now to prevent errors when huey tries to run the task
pass
def is_favicon_feature_active(user: User) -> bool:

View file

@ -1,42 +1,20 @@
import time
from typing import Dict
import datetime
import waybackpy
import waybackpy.utils
from waybackpy.exceptions import NoCDXRecordFound
from django.utils import timezone
class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
def generate_fallback_webarchive_url(
url: str, timestamp: datetime.datetime
) -> str | None:
"""
Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot.
See https://github.com/akamhy/waybackpy/issues/176
Generate a URL to the web archive for the given URL and timestamp.
A snapshot for the specific timestamp might not exist, in which case the
web archive will show the closest snapshot to the given timestamp.
If there is no snapshot at all the URL will be invalid.
"""
if not url:
return None
if not timestamp:
timestamp = timezone.now()
def newest(self):
unix_timestamp = int(time.time())
self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(
unix_timestamp
)
self.sort = "closest"
self.limit = -5
newest_snapshot = None
for snapshot in self.snapshots():
newest_snapshot = snapshot
break
if not newest_snapshot:
raise NoCDXRecordFound(
"Wayback Machine's CDX server did not return any records "
+ "for the query. The URL may not have any archives "
+ " on the Wayback Machine or the URL may have been recently "
+ "archived and is still not available on the CDX server."
)
return newest_snapshot
def add_payload(self, payload: Dict[str, str]) -> None:
super().add_payload(payload)
# Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest
# makes searching for latest snapshots faster
payload["fastLatest"] = "true"
return f"https://web.archive.org/web/{timestamp.strftime('%Y%m%d%H%M%S')}/{url}"

View file

@ -1,15 +1,7 @@
from django.conf import settings
from django.contrib.auth import user_logged_in
from django.db.backends.signals import connection_created
from django.dispatch import receiver
from bookmarks.services import tasks
@receiver(user_logged_in)
def user_logged_in(sender, request, user, **kwargs):
tasks.schedule_bookmarks_without_snapshots(user)
@receiver(connection_created)
def extend_sqlite(connection=None, **kwargs):

View file

@ -6,11 +6,7 @@ from django.test import TestCase
from django.urls import reverse
from bookmarks.models import Bookmark, BookmarkSearch, Tag, UserProfile
from bookmarks.tests.helpers import (
BookmarkFactoryMixin,
HtmlTestMixin,
collapse_whitespace,
)
from bookmarks.tests.helpers import BookmarkFactoryMixin, HtmlTestMixin
class BookmarkIndexViewTestCase(TestCase, BookmarkFactoryMixin, HtmlTestMixin):

View file

@ -1,3 +1,4 @@
import datetime
from typing import Type
from dateutil.relativedelta import relativedelta
@ -36,15 +37,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
html,
)
def assertDateLabel(self, html: str, label_content: str):
self.assertInHTML(
f"""
<span>{label_content}</span>
<span>|</span>
""",
html,
)
def assertWebArchiveLink(
self, html: str, label_content: str, url: str, link_target: str = "_blank"
):
@ -465,15 +457,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
style = bookmark_list["style"]
self.assertIn("--ld-bookmark-description-max-lines:3;", style)
def test_should_respect_absolute_date_setting(self):
bookmark = self.setup_date_format_test(
UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE
)
html = self.render_template()
formatted_date = formats.date_format(bookmark.date_added, "SHORT_DATE_FORMAT")
self.assertDateLabel(html, formatted_date)
def test_should_render_web_archive_link_with_absolute_date_setting(self):
bookmark = self.setup_date_format_test(
UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE,
@ -486,12 +469,6 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
html, formatted_date, bookmark.web_archive_snapshot_url
)
def test_should_respect_relative_date_setting(self):
self.setup_date_format_test(UserProfile.BOOKMARK_DATE_DISPLAY_RELATIVE)
html = self.render_template()
self.assertDateLabel(html, "1 week ago")
def test_should_render_web_archive_link_with_relative_date_setting(self):
bookmark = self.setup_date_format_test(
UserProfile.BOOKMARK_DATE_DISPLAY_RELATIVE,
@ -501,6 +478,27 @@ class BookmarkListTemplateTest(TestCase, BookmarkFactoryMixin, HtmlTestMixin):
self.assertWebArchiveLink(html, "1 week ago", bookmark.web_archive_snapshot_url)
def test_should_render_generated_web_archive_link_without_saved_snapshot_url(self):
user = self.get_or_create_test_user()
user.profile.bookmark_date_display = UserProfile.BOOKMARK_DATE_DISPLAY_ABSOLUTE
user.profile.save()
date_added = timezone.datetime(
2023, 8, 11, 21, 45, 11, tzinfo=datetime.timezone.utc
)
bookmark = self.setup_bookmark(
url="https://example.com/article", added=date_added
)
html = self.render_template()
formatted_date = formats.date_format(bookmark.date_added, "SHORT_DATE_FORMAT")
self.assertWebArchiveLink(
html,
formatted_date,
"https://web.archive.org/web/20230811214511/https://example.com/article",
)
def test_bookmark_link_target_should_be_blank_by_default(self):
bookmark = self.setup_bookmark()
html = self.render_template()

View file

@ -1,6 +1,4 @@
import datetime
import os.path
from dataclasses import dataclass
from unittest import mock
import waybackpy
@ -10,8 +8,6 @@ from django.test import TestCase, override_settings
from huey.contrib.djhuey import HUEY as huey
from waybackpy.exceptions import WaybackError
import bookmarks.services.favicon_loader
import bookmarks.services.wayback
from bookmarks.models import BookmarkAsset, UserProfile
from bookmarks.services import tasks, singlefile
from bookmarks.tests.helpers import BookmarkFactoryMixin
@ -29,12 +25,6 @@ def create_wayback_machine_save_api_mock(
return mock_api
@dataclass
class MockCdxSnapshot:
archive_url: str
datetime_timestamp: datetime.datetime
class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
def setUp(self):
@ -50,17 +40,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
)
self.mock_save_api_patcher.start()
self.mock_cdx_api = mock.Mock()
self.mock_cdx_api.newest.return_value = MockCdxSnapshot(
"https://example.com/newest_snapshot", datetime.datetime.now()
)
self.mock_cdx_api_patcher = mock.patch.object(
bookmarks.services.wayback,
"CustomWaybackMachineCDXServerAPI",
return_value=self.mock_cdx_api,
)
self.mock_cdx_api_patcher.start()
self.mock_load_favicon_patcher = mock.patch(
"bookmarks.services.favicon_loader.load_favicon"
)
@ -90,7 +69,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
def tearDown(self):
self.mock_save_api_patcher.stop()
self.mock_cdx_api_patcher.stop()
self.mock_load_favicon_patcher.stop()
self.mock_singlefile_create_snapshot_patcher.stop()
self.mock_load_preview_image_patcher.stop()
@ -142,45 +120,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
self.assertEqual(bookmark.web_archive_snapshot_url, "https://other.com")
def test_create_web_archive_snapshot_should_use_newest_snapshot_as_fallback(self):
bookmark = self.setup_bookmark()
self.mock_save_api.save.side_effect = WaybackError
tasks.create_web_archive_snapshot(
self.get_or_create_test_user(), bookmark, False
)
bookmark.refresh_from_db()
self.mock_cdx_api.newest.assert_called_once()
self.assertEqual(
"https://example.com/newest_snapshot",
bookmark.web_archive_snapshot_url,
)
def test_create_web_archive_snapshot_should_ignore_missing_newest_snapshot(self):
bookmark = self.setup_bookmark()
self.mock_save_api.save.side_effect = WaybackError
self.mock_cdx_api.newest.return_value = None
tasks.create_web_archive_snapshot(
self.get_or_create_test_user(), bookmark, False
)
bookmark.refresh_from_db()
self.assertEqual("", bookmark.web_archive_snapshot_url)
def test_create_web_archive_snapshot_should_ignore_newest_snapshot_errors(self):
bookmark = self.setup_bookmark()
self.mock_save_api.save.side_effect = WaybackError
self.mock_cdx_api.newest.side_effect = WaybackError
tasks.create_web_archive_snapshot(
self.get_or_create_test_user(), bookmark, False
)
bookmark.refresh_from_db()
self.assertEqual("", bookmark.web_archive_snapshot_url)
def test_create_web_archive_snapshot_should_not_save_stale_bookmark_data(self):
bookmark = self.setup_bookmark()
@ -203,69 +142,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
bookmark.web_archive_snapshot_url,
)
def test_load_web_archive_snapshot_should_update_snapshot_url(self):
bookmark = self.setup_bookmark()
tasks._load_web_archive_snapshot_task(bookmark.id)
bookmark.refresh_from_db()
self.assertEqual(self.executed_count(), 1)
self.mock_cdx_api.newest.assert_called_once()
self.assertEqual(
"https://example.com/newest_snapshot", bookmark.web_archive_snapshot_url
)
def test_load_web_archive_snapshot_should_handle_missing_bookmark_id(self):
tasks._load_web_archive_snapshot_task(123)
self.assertEqual(self.executed_count(), 1)
self.mock_cdx_api.newest.assert_not_called()
def test_load_web_archive_snapshot_should_skip_if_snapshot_exists(self):
bookmark = self.setup_bookmark(web_archive_snapshot_url="https://example.com")
tasks._load_web_archive_snapshot_task(bookmark.id)
self.assertEqual(self.executed_count(), 1)
self.mock_cdx_api.newest.assert_not_called()
def test_load_web_archive_snapshot_should_handle_missing_snapshot(self):
bookmark = self.setup_bookmark()
self.mock_cdx_api.newest.return_value = None
tasks._load_web_archive_snapshot_task(bookmark.id)
self.assertEqual("", bookmark.web_archive_snapshot_url)
def test_load_web_archive_snapshot_should_handle_wayback_errors(self):
bookmark = self.setup_bookmark()
self.mock_cdx_api.newest.side_effect = WaybackError
tasks._load_web_archive_snapshot_task(bookmark.id)
self.assertEqual("", bookmark.web_archive_snapshot_url)
def test_load_web_archive_snapshot_should_not_save_stale_bookmark_data(self):
bookmark = self.setup_bookmark()
# update bookmark during API call to check that saving
# the snapshot does not overwrite updated bookmark data
def mock_newest_impl():
bookmark.title = "Updated title"
bookmark.save()
return mock.DEFAULT
self.mock_cdx_api.newest.side_effect = mock_newest_impl
tasks._load_web_archive_snapshot_task(bookmark.id)
bookmark.refresh_from_db()
self.assertEqual("Updated title", bookmark.title)
self.assertEqual(
"https://example.com/newest_snapshot", bookmark.web_archive_snapshot_url
)
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_create_web_archive_snapshot_should_not_run_when_background_tasks_are_disabled(
self,
@ -292,59 +168,6 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
self.assertEqual(self.executed_count(), 0)
def test_schedule_bookmarks_without_snapshots_should_load_snapshot_for_all_bookmarks_without_snapshot(
self,
):
user = self.get_or_create_test_user()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark(web_archive_snapshot_url="https://example.com")
self.setup_bookmark(web_archive_snapshot_url="https://example.com")
self.setup_bookmark(web_archive_snapshot_url="https://example.com")
tasks.schedule_bookmarks_without_snapshots(user)
self.assertEqual(self.executed_count(), 4)
self.assertEqual(self.mock_cdx_api.newest.call_count, 3)
def test_schedule_bookmarks_without_snapshots_should_only_update_user_owned_bookmarks(
self,
):
user = self.get_or_create_test_user()
other_user = User.objects.create_user(
"otheruser", "otheruser@example.com", "password123"
)
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
tasks.schedule_bookmarks_without_snapshots(user)
self.assertEqual(self.mock_cdx_api.newest.call_count, 3)
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_schedule_bookmarks_without_snapshots_should_not_run_when_background_tasks_are_disabled(
self,
):
tasks.schedule_bookmarks_without_snapshots(self.user)
self.assertEqual(self.executed_count(), 0)
def test_schedule_bookmarks_without_snapshots_should_not_run_when_web_archive_integration_is_disabled(
self,
):
self.user.profile.web_archive_integration = (
UserProfile.WEB_ARCHIVE_INTEGRATION_DISABLED
)
self.user.profile.save()
tasks.schedule_bookmarks_without_snapshots(self.user)
self.assertEqual(self.executed_count(), 0)
def test_load_favicon_should_create_favicon_file(self):
bookmark = self.setup_bookmark()

View file

@ -444,17 +444,6 @@ class ImporterTestCase(TestCase, BookmarkFactoryMixin, ImportTestMixin):
self.assertEqual(Bookmark.objects.all()[0].description, "Example description")
self.assertEqual(Bookmark.objects.all()[0].notes, "Updated notes")
def test_schedule_snapshot_creation(self):
user = self.get_or_create_test_user()
test_html = self.render_html(tags_html="")
with patch.object(
tasks, "schedule_bookmarks_without_snapshots"
) as mock_schedule_bookmarks_without_snapshots:
import_netscape_html(test_html, user)
mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user)
def test_schedule_favicon_loading(self):
user = self.get_or_create_test_user()
test_html = self.render_html(tags_html="")

View file

@ -1,17 +0,0 @@
from unittest.mock import patch
from django.test import TestCase
from bookmarks.services import tasks
from bookmarks.tests.helpers import BookmarkFactoryMixin
class SignalsTestCase(TestCase, BookmarkFactoryMixin):
def test_login_should_schedule_snapshot_creation(self):
user = self.get_or_create_test_user()
with patch.object(
tasks, "schedule_bookmarks_without_snapshots"
) as mock_schedule_bookmarks_without_snapshots:
self.client.force_login(user)
mock_schedule_bookmarks_without_snapshots.assert_called_once_with(user)

View file

@ -1,12 +1,12 @@
import re
import urllib.parse
from typing import Set, List
import re
from django.conf import settings
from django.core.handlers.wsgi import WSGIRequest
from django.core.paginator import Paginator
from django.db import models
from django.urls import reverse
from django.conf import settings
from bookmarks import queries
from bookmarks import utils
@ -18,6 +18,7 @@ from bookmarks.models import (
UserProfile,
Tag,
)
from bookmarks.services.wayback import generate_fallback_webarchive_url
DEFAULT_PAGE_SIZE = 30
CJK_RE = re.compile(r"[\u4e00-\u9fff]+")
@ -144,6 +145,10 @@ class BookmarkItem:
self.notes = bookmark.notes
self.tag_names = bookmark.tag_names
self.web_archive_snapshot_url = bookmark.web_archive_snapshot_url
if not self.web_archive_snapshot_url:
self.web_archive_snapshot_url = generate_fallback_webarchive_url(
bookmark.url, bookmark.date_added
)
self.favicon_file = bookmark.favicon_file
self.preview_image_file = bookmark.preview_image_file
self.is_archived = bookmark.is_archived