diff --git a/bookmarks/services/tasks.py b/bookmarks/services/tasks.py index 8587343..99a79da 100644 --- a/bookmarks/services/tasks.py +++ b/bookmarks/services/tasks.py @@ -5,8 +5,9 @@ from background_task import background from django.conf import settings from django.contrib.auth import get_user_model from django.contrib.auth.models import User -from waybackpy.exceptions import WaybackError +from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound +import bookmarks.services.wayback from bookmarks.models import Bookmark, UserProfile from bookmarks.services.website_loader import DEFAULT_USER_AGENT @@ -26,6 +27,32 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo _create_web_archive_snapshot_task(bookmark.id, force_update) +def _load_newest_snapshot(bookmark: Bookmark): + try: + logger.debug(f'Load existing snapshot for bookmark. url={bookmark.url}') + cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(bookmark.url) + existing_snapshot = cdx_api.newest() + + if existing_snapshot: + bookmark.web_archive_snapshot_url = existing_snapshot.archive_url + bookmark.save() + logger.debug(f'Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}') + + except NoCDXRecordFound: + logger.error(f'Could not find any snapshots for bookmark. url={bookmark.url}') + except WaybackError as error: + logger.error(f'Failed to load existing snapshot. url={bookmark.url}', exc_info=error) + + +def _create_snapshot(bookmark: Bookmark): + logger.debug(f'Create new snapshot for bookmark. url={bookmark.url}...') + archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT, max_tries=1) + archive.save() + bookmark.web_archive_snapshot_url = archive.archive_url + bookmark.save() + logger.debug(f'Successfully created new snapshot for bookmark:. 
url={bookmark.url}') + + @background() def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool): try: @@ -37,19 +64,31 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool): if bookmark.web_archive_snapshot_url and not force_update: return - logger.debug(f'Create web archive link for bookmark: {bookmark}...') - - archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT) - + # Create new snapshot try: - archive.save() - except WaybackError as error: - logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error) - raise + _create_snapshot(bookmark) + return + except TooManyRequestsError: + logger.error( + f'Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}') + except WaybackError: + logger.error(f'Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}') - bookmark.web_archive_snapshot_url = archive.archive_url - bookmark.save() - logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...') + # Load the newest snapshot as fallback + _load_newest_snapshot(bookmark) + + +@background() +def _load_web_archive_snapshot_task(bookmark_id: int): + try: + bookmark = Bookmark.objects.get(id=bookmark_id) + except Bookmark.DoesNotExist: + return + # Skip if snapshot exists + if bookmark.web_archive_snapshot_url: + return + # Load the newest snapshot + _load_newest_snapshot(bookmark) def schedule_bookmarks_without_snapshots(user: User): @@ -63,4 +102,6 @@ def _schedule_bookmarks_without_snapshots_task(user_id: int): bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user) for bookmark in bookmarks_without_snapshots: - _create_web_archive_snapshot_task(bookmark.id, False) + # To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating + # new ones when processing bookmarks in 
bulk + _load_web_archive_snapshot_task(bookmark.id) diff --git a/bookmarks/services/wayback.py b/bookmarks/services/wayback.py new file mode 100644 index 0000000..d830403 --- /dev/null +++ b/bookmarks/services/wayback.py @@ -0,0 +1,40 @@ +import time +from typing import Dict + +import waybackpy +import waybackpy.utils +from waybackpy.exceptions import NoCDXRecordFound + + +class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI): + """ + Customized WaybackMachineCDXServerAPI to work around some issues with retrieving the newest snapshot. + See https://github.com/akamhy/waybackpy/issues/176 + """ + + def newest(self): + unix_timestamp = int(time.time()) + self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(unix_timestamp) + self.sort = 'closest' + self.limit = -5 + + newest_snapshot = None + for snapshot in self.snapshots(): + newest_snapshot = snapshot + break + + if not newest_snapshot: + raise NoCDXRecordFound( + "Wayback Machine's CDX server did not return any records " + + "for the query. The URL may not have any archives " + + " on the Wayback Machine or the URL may have been recently " + + "archived and is still not available on the CDX server." + ) + + return newest_snapshot + + def add_payload(self, payload: Dict[str, str]) -> None: + super().add_payload(payload) + # Set fastLatest query param, as we are only using this API to get the latest snapshot and using fastLatest + # makes searching for latest snapshots faster + payload['fastLatest'] = 'true' diff --git a/bookmarks/templates/settings/general.html b/bookmarks/templates/settings/general.html index 9728b1c..6c4fbcc 100644 --- a/bookmarks/templates/settings/general.html +++ b/bookmarks/templates/settings/general.html @@ -2,136 +2,139 @@ {% load widget_tweaks %} {% block content %} -
+
- {% include 'settings/nav.html' %} + {% include 'settings/nav.html' %} - {# Profile section #} -
-

Profile

-

- Change password -

-
- {% csrf_token %} -
- - {{ form.theme|add_class:"form-select col-2 col-sm-12" }} -
- Whether to use a light or dark theme, or automatically adjust the theme based on your system's settings. -
-
-
- - {{ form.bookmark_date_display|add_class:"form-select col-2 col-sm-12" }} -
- Whether to show bookmark dates as relative (how long ago), or as absolute dates. Alternatively the date can be hidden. -
-
-
- - {{ form.bookmark_link_target|add_class:"form-select col-2 col-sm-12" }} -
- Whether to open bookmarks a new page or in the same page. -
-
-
- - {{ form.web_archive_integration|add_class:"form-select col-2 col-sm-12" }} -
- Enabling this feature will automatically create snapshots of bookmarked websites on the Internet Archive Wayback - Machine. This allows - to preserve, and later access, the website as it was at the point in time it was bookmarked, in - case it goes offline or its content is modified. -
-
-
- -
- Allows to share bookmarks with other users, and to view shared bookmarks. - Disabling this feature will hide all previously shared bookmarks from other users. -
-
-
- -
-
-
+ {# Profile section #} +
+

Profile

+

+ Change password +

+
+ {% csrf_token %} +
+ + {{ form.theme|add_class:"form-select col-2 col-sm-12" }} +
+ Whether to use a light or dark theme, or automatically adjust the theme based on your system's settings. +
+
+
+ + {{ form.bookmark_date_display|add_class:"form-select col-2 col-sm-12" }} +
+ Whether to show bookmark dates as relative (how long ago), or as absolute dates. Alternatively the date can + be hidden. +
+
+
+ + {{ form.bookmark_link_target|add_class:"form-select col-2 col-sm-12" }} +
+ Whether to open bookmarks in a new page or in the same page. +
+
+
+ + {{ form.web_archive_integration|add_class:"form-select col-2 col-sm-12" }} +
+ Enabling this feature will automatically create snapshots of bookmarked websites on the Internet Archive Wayback + Machine. + This allows you to preserve, and later access, the website as it was at the point in time it was bookmarked, in + case it goes offline or its content is modified. + Please consider donating to the Internet Archive if you make use of this feature. +
+
+
+ +
+ Allows you to share bookmarks with other users, and to view shared bookmarks. + Disabling this feature will hide all previously shared bookmarks from other users. +
+
+
+ +
+
+
- {# Import section #} -
-

Import

-

Import bookmarks and tags in the Netscape HTML format. This will execute a sync where new bookmarks are - added and existing ones are updated.

-
- {% csrf_token %} -
-
- - -
- {% if import_success_message %} -
-

- {{ import_success_message }} -

-
- {% endif %} - {% if import_errors_message %} -
-

- {{ import_errors_message }} -

-
- {% endif %} -
-
-
+ {# Import section #} +
+

Import

+

Import bookmarks and tags in the Netscape HTML format. This will execute a sync where new bookmarks are + added and existing ones are updated.

+
+ {% csrf_token %} +
+
+ + +
+ {% if import_success_message %} +
+

+ {{ import_success_message }} +

+
+ {% endif %} + {% if import_errors_message %} +
+

+ {{ import_errors_message }} +

+
+ {% endif %} +
+
+
- {# Export section #} -
-

Export

-

Export all bookmarks in Netscape HTML format.

- Download (.html) - {% if export_error %} -
-

- {{ export_error }} -

-
- {% endif %} -
+ {# Export section #} +
+

Export

+

Export all bookmarks in Netscape HTML format.

+ Download (.html) + {% if export_error %} +
+

+ {{ export_error }} +

+
+ {% endif %} +
- {# About section #} -
-

About

- - - - - - - - - - - - - - - - - -
Version{{ version_info }}
LinksGitHub
Documentation
Changelog
-
-
+ {# About section #} +
+

About

+ + + + + + + + + + + + + + + + + +
Version{{ version_info }}
LinksGitHub
Documentation
Changelog
+
+
{% endblock %} diff --git a/bookmarks/tests/test_bookmarks_tasks.py b/bookmarks/tests/test_bookmarks_tasks.py index d4c7e31..e0a3c73 100644 --- a/bookmarks/tests/test_bookmarks_tasks.py +++ b/bookmarks/tests/test_bookmarks_tasks.py @@ -1,25 +1,51 @@ +import datetime +from dataclasses import dataclass from unittest.mock import patch import waybackpy from background_task.models import Task from django.contrib.auth.models import User from django.test import TestCase, override_settings +from waybackpy.exceptions import WaybackError -from bookmarks.models import Bookmark, UserProfile +import bookmarks.services.wayback +from bookmarks.models import UserProfile from bookmarks.services import tasks from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging class MockWaybackMachineSaveAPI: - def __init__(self, archive_url: str): + def __init__(self, archive_url: str = 'https://example.com/created_snapshot', fail_on_save: bool = False): self.archive_url = archive_url + self.fail_on_save = fail_on_save def save(self): + if self.fail_on_save: + raise WaybackError return self -class MockWaybackUrlWithSaveError: - def save(self): - raise NotImplementedError + +@dataclass +class MockCdxSnapshot: + archive_url: str + datetime_timestamp: datetime.datetime + + +class MockWaybackMachineCDXServerAPI: + def __init__(self, + archive_url: str = 'https://example.com/newest_snapshot', + has_no_snapshot=False, + fail_loading_snapshot=False): + self.archive_url = archive_url + self.has_no_snapshot = has_no_snapshot + self.fail_loading_snapshot = fail_loading_snapshot + + def newest(self): + if self.has_no_snapshot: + return None + if self.fail_loading_snapshot: + raise WaybackError + return MockCdxSnapshot(self.archive_url, datetime.datetime.now()) class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): @@ -50,49 +76,130 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): def test_create_web_archive_snapshot_should_update_snapshot_url(self): bookmark = 
self.setup_bookmark() - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')): + with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI()): tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False) self.run_pending_task(tasks._create_web_archive_snapshot_task) bookmark.refresh_from_db() - self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com') + self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com/created_snapshot') def test_create_web_archive_snapshot_should_handle_missing_bookmark_id(self): - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')) as mock_wayback_url: + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI()) as mock_save_api: tasks._create_web_archive_snapshot_task(123, False) self.run_pending_task(tasks._create_web_archive_snapshot_task) - mock_wayback_url.assert_not_called() - - def test_create_web_archive_snapshot_should_handle_wayback_save_error(self): - bookmark = self.setup_bookmark() - - with patch.object(waybackpy, 'WaybackMachineSaveAPI', - return_value=MockWaybackUrlWithSaveError()): - with self.assertRaises(NotImplementedError): - tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False) - self.run_pending_task(tasks._create_web_archive_snapshot_task) + mock_save_api.assert_not_called() def test_create_web_archive_snapshot_should_skip_if_snapshot_exists(self): bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com') - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')): + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI()) as mock_save_api: tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, 
False) self.run_pending_task(tasks._create_web_archive_snapshot_task) - bookmark.refresh_from_db() - self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com') + mock_save_api.assert_not_called() def test_create_web_archive_snapshot_should_force_update_snapshot(self): bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com') - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')): + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI('https://other.com')): tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, True) self.run_pending_task(tasks._create_web_archive_snapshot_task) bookmark.refresh_from_db() self.assertEqual(bookmark.web_archive_snapshot_url, 'https://other.com') + def test_create_web_archive_snapshot_should_use_newest_snapshot_as_fallback(self): + bookmark = self.setup_bookmark() + + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI(fail_on_save=True)): + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI()): + tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False) + self.run_pending_task(tasks._create_web_archive_snapshot_task) + + bookmark.refresh_from_db() + self.assertEqual('https://example.com/newest_snapshot', bookmark.web_archive_snapshot_url) + + def test_create_web_archive_snapshot_should_ignore_missing_newest_snapshot(self): + bookmark = self.setup_bookmark() + + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI(fail_on_save=True)): + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI(has_no_snapshot=True)): + tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False) + 
self.run_pending_task(tasks._create_web_archive_snapshot_task) + + bookmark.refresh_from_db() + self.assertEqual('', bookmark.web_archive_snapshot_url) + + def test_create_web_archive_snapshot_should_ignore_newest_snapshot_errors(self): + bookmark = self.setup_bookmark() + + with patch.object(waybackpy, 'WaybackMachineSaveAPI', + return_value=MockWaybackMachineSaveAPI(fail_on_save=True)): + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI(fail_loading_snapshot=True)): + tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False) + self.run_pending_task(tasks._create_web_archive_snapshot_task) + + bookmark.refresh_from_db() + self.assertEqual('', bookmark.web_archive_snapshot_url) + + def test_load_web_archive_snapshot_should_update_snapshot_url(self): + bookmark = self.setup_bookmark() + + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI()): + tasks._load_web_archive_snapshot_task(bookmark.id) + self.run_pending_task(tasks._load_web_archive_snapshot_task) + + bookmark.refresh_from_db() + self.assertEqual('https://example.com/newest_snapshot', bookmark.web_archive_snapshot_url) + + def test_load_web_archive_snapshot_should_handle_missing_bookmark_id(self): + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI()) as mock_cdx_api: + tasks._load_web_archive_snapshot_task(123) + self.run_pending_task(tasks._load_web_archive_snapshot_task) + + mock_cdx_api.assert_not_called() + + def test_load_web_archive_snapshot_should_skip_if_snapshot_exists(self): + bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com') + + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI()) as mock_cdx_api: + 
tasks._load_web_archive_snapshot_task(bookmark.id) + self.run_pending_task(tasks._load_web_archive_snapshot_task) + + mock_cdx_api.assert_not_called() + + def test_load_web_archive_snapshot_should_handle_missing_snapshot(self): + bookmark = self.setup_bookmark() + + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI(has_no_snapshot=True)): + tasks._load_web_archive_snapshot_task(bookmark.id) + self.run_pending_task(tasks._load_web_archive_snapshot_task) + + self.assertEqual('', bookmark.web_archive_snapshot_url) + + def test_load_web_archive_snapshot_should_handle_wayback_errors(self): + bookmark = self.setup_bookmark() + + with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI', + return_value=MockWaybackMachineCDXServerAPI(fail_loading_snapshot=True)): + tasks._load_web_archive_snapshot_task(bookmark.id) + self.run_pending_task(tasks._load_web_archive_snapshot_task) + + self.assertEqual('', bookmark.web_archive_snapshot_url) + @override_settings(LD_DISABLE_BACKGROUND_TASKS=True) def test_create_web_archive_snapshot_should_not_run_when_background_tasks_are_disabled(self): bookmark = self.setup_bookmark() @@ -109,33 +216,23 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): self.assertEqual(Task.objects.count(), 0) - def test_schedule_bookmarks_without_snapshots_should_create_snapshot_task_for_all_bookmarks_without_snapshot(self): + def test_schedule_bookmarks_without_snapshots_should_load_snapshot_for_all_bookmarks_without_snapshot(self): user = self.get_or_create_test_user() self.setup_bookmark() self.setup_bookmark() self.setup_bookmark() - - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')): - tasks.schedule_bookmarks_without_snapshots(user) - self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task) - self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task) 
- - for bookmark in Bookmark.objects.all(): - self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com') - - def test_schedule_bookmarks_without_snapshots_should_not_update_bookmarks_with_existing_snapshot(self): - user = self.get_or_create_test_user() self.setup_bookmark(web_archive_snapshot_url='https://example.com') self.setup_bookmark(web_archive_snapshot_url='https://example.com') self.setup_bookmark(web_archive_snapshot_url='https://example.com') - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')): - tasks.schedule_bookmarks_without_snapshots(user) - self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task) - self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task) + tasks.schedule_bookmarks_without_snapshots(user) + self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task) - for bookmark in Bookmark.objects.all(): - self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com') + task_list = Task.objects.all() + self.assertEqual(task_list.count(), 3) + + for task in task_list: + self.assertEqual(task.task_name, 'bookmarks.services.tasks._load_web_archive_snapshot_task') def test_schedule_bookmarks_without_snapshots_should_only_update_user_owned_bookmarks(self): user = self.get_or_create_test_user() @@ -147,16 +244,11 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin): self.setup_bookmark(user=other_user) self.setup_bookmark(user=other_user) - with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')): - tasks.schedule_bookmarks_without_snapshots(user) - self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task) - self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task) + tasks.schedule_bookmarks_without_snapshots(user) + self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task) - for bookmark in 
Bookmark.objects.all().filter(owner=user): - self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com') - - for bookmark in Bookmark.objects.all().filter(owner=other_user): - self.assertEqual(bookmark.web_archive_snapshot_url, '') + task_list = Task.objects.all() + self.assertEqual(task_list.count(), 3) @override_settings(LD_DISABLE_BACKGROUND_TASKS=True) def test_schedule_bookmarks_without_snapshots_should_not_run_when_background_tasks_are_disabled(self):