diff --git a/bookmarks/services/tasks.py b/bookmarks/services/tasks.py
index 8587343..99a79da 100644
--- a/bookmarks/services/tasks.py
+++ b/bookmarks/services/tasks.py
@@ -5,8 +5,9 @@ from background_task import background
from django.conf import settings
from django.contrib.auth import get_user_model
from django.contrib.auth.models import User
-from waybackpy.exceptions import WaybackError
+from waybackpy.exceptions import WaybackError, TooManyRequestsError, NoCDXRecordFound
+import bookmarks.services.wayback
from bookmarks.models import Bookmark, UserProfile
from bookmarks.services.website_loader import DEFAULT_USER_AGENT
@@ -26,6 +27,32 @@ def create_web_archive_snapshot(user: User, bookmark: Bookmark, force_update: bo
_create_web_archive_snapshot_task(bookmark.id, force_update)
+def _load_newest_snapshot(bookmark: Bookmark):
+    """
+    Look up the newest existing Wayback Machine snapshot for the bookmark's URL and, if one is found,
+    store its archive URL on the bookmark.
+
+    Lookup failures (no CDX record found, or any other ``WaybackError``) are logged and swallowed, in
+    which case the bookmark is left unchanged.
+
+    :param bookmark: an already persisted bookmark whose ``web_archive_snapshot_url`` should be updated
+    """
+    try:
+        logger.debug(f'Load existing snapshot for bookmark. url={bookmark.url}')
+        cdx_api = bookmarks.services.wayback.CustomWaybackMachineCDXServerAPI(bookmark.url)
+        existing_snapshot = cdx_api.newest()
+
+        if existing_snapshot:
+            bookmark.web_archive_snapshot_url = existing_snapshot.archive_url
+            # Persist only the snapshot URL. A plain save() would write every column from this instance,
+            # which was loaded before the remote lookup and may be stale by now.
+            bookmark.save(update_fields=['web_archive_snapshot_url'])
+            logger.debug(f'Using newest snapshot. url={bookmark.url} from={existing_snapshot.datetime_timestamp}')
+
+    except NoCDXRecordFound:
+        logger.error(f'Could not find any snapshots for bookmark. url={bookmark.url}')
+    except WaybackError as error:
+        logger.error(f'Failed to load existing snapshot. url={bookmark.url}', exc_info=error)
+
+
+def _create_snapshot(bookmark: Bookmark):
+    """
+    Ask the Wayback Machine to create a new snapshot of the bookmark's URL and store the resulting
+    archive URL on the bookmark.
+
+    Exceptions raised by the save API are not handled here and propagate to the caller.
+
+    :param bookmark: an already persisted bookmark whose ``web_archive_snapshot_url`` should be updated
+    """
+    logger.debug(f'Create new snapshot for bookmark. url={bookmark.url}...')
+    # max_tries=1 limits the save API to a single try
+    archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT, max_tries=1)
+    archive.save()
+    bookmark.web_archive_snapshot_url = archive.archive_url
+    # Persist only the snapshot URL. A plain save() would write every column from this instance,
+    # which was loaded before the remote call and may be stale by now.
+    bookmark.save(update_fields=['web_archive_snapshot_url'])
+    logger.debug(f'Successfully created new snapshot for bookmark. url={bookmark.url}')
+
+
@background()
def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
try:
@@ -37,19 +64,31 @@ def _create_web_archive_snapshot_task(bookmark_id: int, force_update: bool):
if bookmark.web_archive_snapshot_url and not force_update:
return
- logger.debug(f'Create web archive link for bookmark: {bookmark}...')
-
- archive = waybackpy.WaybackMachineSaveAPI(bookmark.url, DEFAULT_USER_AGENT)
-
+ # Create new snapshot
try:
- archive.save()
- except WaybackError as error:
- logger.exception(f'Error creating web archive link for bookmark: {bookmark}...', exc_info=error)
- raise
+ _create_snapshot(bookmark)
+ return
+ except TooManyRequestsError:
+ logger.error(
+ f'Failed to create snapshot due to rate limiting, trying to load newest snapshot as fallback. url={bookmark.url}')
+ except WaybackError:
+ logger.error(f'Failed to create snapshot, trying to load newest snapshot as fallback. url={bookmark.url}')
- bookmark.web_archive_snapshot_url = archive.archive_url
- bookmark.save()
- logger.debug(f'Successfully created web archive link for bookmark: {bookmark}...')
+ # Load the newest snapshot as fallback
+ _load_newest_snapshot(bookmark)
+
+
+@background()
+def _load_web_archive_snapshot_task(bookmark_id: int):
+    """
+    Background task that fills in the snapshot URL of a bookmark from the newest existing snapshot.
+
+    Does nothing if the bookmark no longer exists or already has a snapshot URL.
+
+    :param bookmark_id: ID of the bookmark to process
+    """
+    try:
+        bookmark = Bookmark.objects.get(id=bookmark_id)
+    except Bookmark.DoesNotExist:
+        bookmark = None
+
+    # Only bookmarks that still exist and have no snapshot URL yet need a lookup
+    if bookmark is not None and not bookmark.web_archive_snapshot_url:
+        _load_newest_snapshot(bookmark)
def schedule_bookmarks_without_snapshots(user: User):
@@ -63,4 +102,6 @@ def _schedule_bookmarks_without_snapshots_task(user_id: int):
bookmarks_without_snapshots = Bookmark.objects.filter(web_archive_snapshot_url__exact='', owner=user)
for bookmark in bookmarks_without_snapshots:
- _create_web_archive_snapshot_task(bookmark.id, False)
+ # To prevent rate limit errors from the Wayback API only try to load the latest snapshots instead of creating
+ # new ones when processing bookmarks in bulk
+ _load_web_archive_snapshot_task(bookmark.id)
diff --git a/bookmarks/services/wayback.py b/bookmarks/services/wayback.py
new file mode 100644
index 0000000..d830403
--- /dev/null
+++ b/bookmarks/services/wayback.py
@@ -0,0 +1,40 @@
+import time
+from typing import Dict
+
+import waybackpy
+import waybackpy.utils
+from waybackpy.exceptions import NoCDXRecordFound
+
+
+class CustomWaybackMachineCDXServerAPI(waybackpy.WaybackMachineCDXServerAPI):
+    """
+    CDX server API client adjusted for fetching the newest snapshot of a URL.
+
+    Works around some issues with retrieving the newest snapshot through waybackpy.
+    See https://github.com/akamhy/waybackpy/issues/176
+    """
+
+    def newest(self):
+        """
+        Return the first snapshot the CDX server yields when sorting by closeness to the current time.
+
+        :raises NoCDXRecordFound: if the query does not yield any snapshot
+        """
+        # Configure the query: sort by closeness to "now", with a limit of -5
+        now = int(time.time())
+        self.closest = waybackpy.utils.unix_timestamp_to_wayback_timestamp(now)
+        self.sort = 'closest'
+        self.limit = -5
+
+        # Only the first record produced by the query is of interest
+        first_record = next(iter(self.snapshots()), None)
+        if first_record:
+            return first_record
+
+        raise NoCDXRecordFound(
+            "Wayback Machine's CDX server did not return any records "
+            "for the query. The URL may not have any archives "
+            " on the Wayback Machine or the URL may have been recently "
+            "archived and is still not available on the CDX server."
+        )
+
+    def add_payload(self, payload: Dict[str, str]) -> None:
+        """
+        Let the base class populate the request payload, then add the ``fastLatest`` query param.
+
+        :param payload: query parameters of the CDX request, modified in place
+        """
+        super().add_payload(payload)
+        # This API is only used to get the latest snapshot, and fastLatest makes searching for latest
+        # snapshots faster
+        payload['fastLatest'] = 'true'
diff --git a/bookmarks/templates/settings/general.html b/bookmarks/templates/settings/general.html
index 9728b1c..6c4fbcc 100644
--- a/bookmarks/templates/settings/general.html
+++ b/bookmarks/templates/settings/general.html
@@ -2,136 +2,139 @@
{% load widget_tweaks %}
{% block content %}
-
+
- {% include 'settings/nav.html' %}
+ {% include 'settings/nav.html' %}
- {# Profile section #}
-
- Profile
-
- Change password
-
-
-
+ {# Profile section #}
+
+ Profile
+
+ Change password
+
+
+
- {# Import section #}
-
- Import
- Import bookmarks and tags in the Netscape HTML format. This will execute a sync where new bookmarks are
- added and existing ones are updated.
-
-
+ {# Import section #}
+
+ Import
+ Import bookmarks and tags in the Netscape HTML format. This will execute a sync where new bookmarks are
+ added and existing ones are updated.
+
+
- {# Export section #}
-
- Export
- Export all bookmarks in Netscape HTML format.
- Download (.html)
- {% if export_error %}
-
-
- {{ export_error }}
-
-
- {% endif %}
-
+ {# Export section #}
+
+ Export
+ Export all bookmarks in Netscape HTML format.
+ Download (.html)
+ {% if export_error %}
+
+
+ {{ export_error }}
+
+
+ {% endif %}
+
- {# About section #}
-
-
+ {# About section #}
+
+
{% endblock %}
diff --git a/bookmarks/tests/test_bookmarks_tasks.py b/bookmarks/tests/test_bookmarks_tasks.py
index d4c7e31..e0a3c73 100644
--- a/bookmarks/tests/test_bookmarks_tasks.py
+++ b/bookmarks/tests/test_bookmarks_tasks.py
@@ -1,25 +1,51 @@
+import datetime
+from dataclasses import dataclass
from unittest.mock import patch
import waybackpy
from background_task.models import Task
from django.contrib.auth.models import User
from django.test import TestCase, override_settings
+from waybackpy.exceptions import WaybackError
-from bookmarks.models import Bookmark, UserProfile
+import bookmarks.services.wayback
+from bookmarks.models import UserProfile
from bookmarks.services import tasks
from bookmarks.tests.helpers import BookmarkFactoryMixin, disable_logging
class MockWaybackMachineSaveAPI:
- def __init__(self, archive_url: str):
+ def __init__(self, archive_url: str = 'https://example.com/created_snapshot', fail_on_save: bool = False):
self.archive_url = archive_url
+ self.fail_on_save = fail_on_save
def save(self):
+ if self.fail_on_save:
+ raise WaybackError
return self
-class MockWaybackUrlWithSaveError:
- def save(self):
- raise NotImplementedError
+
+# Minimal stand-in for a snapshot record returned by the CDX API mock below. It carries only the two
+# attributes that tasks._load_newest_snapshot reads from a snapshot.
+@dataclass
+class MockCdxSnapshot:
+ # URL of the archived page; this is what gets stored as the bookmark's snapshot URL
+ archive_url: str
+ # Time of the snapshot; only used in a debug log message
+ datetime_timestamp: datetime.datetime
+
+
+class MockWaybackMachineCDXServerAPI:
+    """
+    Test double for the CDX API client that is patched in for ``CustomWaybackMachineCDXServerAPI``.
+
+    Depending on the flags passed to the constructor, ``newest()`` returns no snapshot, raises a
+    ``WaybackError``, or returns a ``MockCdxSnapshot`` pointing at ``archive_url``.
+    """
+
+    def __init__(self,
+                 archive_url: str = 'https://example.com/newest_snapshot',
+                 has_no_snapshot=False,
+                 fail_loading_snapshot=False):
+        self.fail_loading_snapshot = fail_loading_snapshot
+        self.has_no_snapshot = has_no_snapshot
+        self.archive_url = archive_url
+
+    def newest(self):
+        # The "no snapshot" flag takes precedence over the failure flag
+        if not self.has_no_snapshot:
+            if self.fail_loading_snapshot:
+                raise WaybackError
+            return MockCdxSnapshot(archive_url=self.archive_url, datetime_timestamp=datetime.datetime.now())
+        return None
class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
@@ -50,49 +76,130 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
def test_create_web_archive_snapshot_should_update_snapshot_url(self):
bookmark = self.setup_bookmark()
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')):
+ with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI()):
tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
self.run_pending_task(tasks._create_web_archive_snapshot_task)
bookmark.refresh_from_db()
- self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
+ self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com/created_snapshot')
def test_create_web_archive_snapshot_should_handle_missing_bookmark_id(self):
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')) as mock_wayback_url:
+ with patch.object(waybackpy, 'WaybackMachineSaveAPI',
+ return_value=MockWaybackMachineSaveAPI()) as mock_save_api:
tasks._create_web_archive_snapshot_task(123, False)
self.run_pending_task(tasks._create_web_archive_snapshot_task)
- mock_wayback_url.assert_not_called()
-
- def test_create_web_archive_snapshot_should_handle_wayback_save_error(self):
- bookmark = self.setup_bookmark()
-
- with patch.object(waybackpy, 'WaybackMachineSaveAPI',
- return_value=MockWaybackUrlWithSaveError()):
- with self.assertRaises(NotImplementedError):
- tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
- self.run_pending_task(tasks._create_web_archive_snapshot_task)
+ mock_save_api.assert_not_called()
def test_create_web_archive_snapshot_should_skip_if_snapshot_exists(self):
bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com')
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')):
+ with patch.object(waybackpy, 'WaybackMachineSaveAPI',
+ return_value=MockWaybackMachineSaveAPI()) as mock_save_api:
tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
self.run_pending_task(tasks._create_web_archive_snapshot_task)
- bookmark.refresh_from_db()
- self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
+ mock_save_api.assert_not_called()
def test_create_web_archive_snapshot_should_force_update_snapshot(self):
bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com')
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')):
+ with patch.object(waybackpy, 'WaybackMachineSaveAPI',
+ return_value=MockWaybackMachineSaveAPI('https://other.com')):
tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, True)
self.run_pending_task(tasks._create_web_archive_snapshot_task)
bookmark.refresh_from_db()
self.assertEqual(bookmark.web_archive_snapshot_url, 'https://other.com')
+    def test_create_web_archive_snapshot_should_use_newest_snapshot_as_fallback(self):
+        # Creating a snapshot fails, so the URL of the newest existing snapshot should be stored instead
+        bookmark = self.setup_bookmark()
+        save_api_patch = patch.object(waybackpy, 'WaybackMachineSaveAPI',
+                                      return_value=MockWaybackMachineSaveAPI(fail_on_save=True))
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI())
+
+        with save_api_patch, cdx_api_patch:
+            tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
+            self.run_pending_task(tasks._create_web_archive_snapshot_task)
+
+        bookmark.refresh_from_db()
+        self.assertEqual('https://example.com/newest_snapshot', bookmark.web_archive_snapshot_url)
+
+    def test_create_web_archive_snapshot_should_ignore_missing_newest_snapshot(self):
+        # Creating a snapshot fails and the fallback finds no snapshot: the URL must stay empty
+        bookmark = self.setup_bookmark()
+        save_api_patch = patch.object(waybackpy, 'WaybackMachineSaveAPI',
+                                      return_value=MockWaybackMachineSaveAPI(fail_on_save=True))
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI(has_no_snapshot=True))
+
+        with save_api_patch, cdx_api_patch:
+            tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
+            self.run_pending_task(tasks._create_web_archive_snapshot_task)
+
+        bookmark.refresh_from_db()
+        self.assertEqual('', bookmark.web_archive_snapshot_url)
+
+    def test_create_web_archive_snapshot_should_ignore_newest_snapshot_errors(self):
+        # Creating a snapshot fails and the fallback lookup raises as well: the URL must stay empty
+        bookmark = self.setup_bookmark()
+        save_api_patch = patch.object(waybackpy, 'WaybackMachineSaveAPI',
+                                      return_value=MockWaybackMachineSaveAPI(fail_on_save=True))
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI(fail_loading_snapshot=True))
+
+        with save_api_patch, cdx_api_patch:
+            tasks.create_web_archive_snapshot(self.get_or_create_test_user(), bookmark, False)
+            self.run_pending_task(tasks._create_web_archive_snapshot_task)
+
+        bookmark.refresh_from_db()
+        self.assertEqual('', bookmark.web_archive_snapshot_url)
+
+    def test_load_web_archive_snapshot_should_update_snapshot_url(self):
+        # The URL of the newest snapshot reported by the CDX API should end up on the bookmark
+        bookmark = self.setup_bookmark()
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI())
+
+        with cdx_api_patch:
+            tasks._load_web_archive_snapshot_task(bookmark.id)
+            self.run_pending_task(tasks._load_web_archive_snapshot_task)
+
+        bookmark.refresh_from_db()
+        self.assertEqual('https://example.com/newest_snapshot', bookmark.web_archive_snapshot_url)
+
+    def test_load_web_archive_snapshot_should_handle_missing_bookmark_id(self):
+        # 123 is not the ID of a bookmark created by this test, so the CDX API must never be instantiated
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI())
+
+        with cdx_api_patch as mock_cdx_api:
+            tasks._load_web_archive_snapshot_task(123)
+            self.run_pending_task(tasks._load_web_archive_snapshot_task)
+
+        mock_cdx_api.assert_not_called()
+
+    def test_load_web_archive_snapshot_should_skip_if_snapshot_exists(self):
+        # A bookmark that already has a snapshot URL must not trigger a CDX API lookup
+        bookmark = self.setup_bookmark(web_archive_snapshot_url='https://example.com')
+        cdx_api_patch = patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                                     return_value=MockWaybackMachineCDXServerAPI())
+
+        with cdx_api_patch as mock_cdx_api:
+            tasks._load_web_archive_snapshot_task(bookmark.id)
+            self.run_pending_task(tasks._load_web_archive_snapshot_task)
+
+        mock_cdx_api.assert_not_called()
+
+    def test_load_web_archive_snapshot_should_handle_missing_snapshot(self):
+        # When the CDX API reports no snapshot, the bookmark's snapshot URL must stay empty
+        bookmark = self.setup_bookmark()
+
+        with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                          return_value=MockWaybackMachineCDXServerAPI(has_no_snapshot=True)):
+            tasks._load_web_archive_snapshot_task(bookmark.id)
+            self.run_pending_task(tasks._load_web_archive_snapshot_task)
+
+        # The task loads its own Bookmark instance by ID, so the local object never changes.
+        # Reload it, otherwise the assertion passes even if the task stored a URL.
+        bookmark.refresh_from_db()
+        self.assertEqual('', bookmark.web_archive_snapshot_url)
+
+    def test_load_web_archive_snapshot_should_handle_wayback_errors(self):
+        # When the CDX API lookup raises a WaybackError, the bookmark's snapshot URL must stay empty
+        bookmark = self.setup_bookmark()
+
+        with patch.object(bookmarks.services.wayback, 'CustomWaybackMachineCDXServerAPI',
+                          return_value=MockWaybackMachineCDXServerAPI(fail_loading_snapshot=True)):
+            tasks._load_web_archive_snapshot_task(bookmark.id)
+            self.run_pending_task(tasks._load_web_archive_snapshot_task)
+
+        # The task loads its own Bookmark instance by ID, so the local object never changes.
+        # Reload it, otherwise the assertion passes even if the task stored a URL.
+        bookmark.refresh_from_db()
+        self.assertEqual('', bookmark.web_archive_snapshot_url)
+
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_create_web_archive_snapshot_should_not_run_when_background_tasks_are_disabled(self):
bookmark = self.setup_bookmark()
@@ -109,33 +216,23 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
self.assertEqual(Task.objects.count(), 0)
- def test_schedule_bookmarks_without_snapshots_should_create_snapshot_task_for_all_bookmarks_without_snapshot(self):
+ def test_schedule_bookmarks_without_snapshots_should_load_snapshot_for_all_bookmarks_without_snapshot(self):
user = self.get_or_create_test_user()
self.setup_bookmark()
self.setup_bookmark()
self.setup_bookmark()
-
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')):
- tasks.schedule_bookmarks_without_snapshots(user)
- self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task)
- self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task)
-
- for bookmark in Bookmark.objects.all():
- self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
-
- def test_schedule_bookmarks_without_snapshots_should_not_update_bookmarks_with_existing_snapshot(self):
- user = self.get_or_create_test_user()
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
self.setup_bookmark(web_archive_snapshot_url='https://example.com')
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://other.com')):
- tasks.schedule_bookmarks_without_snapshots(user)
- self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task)
- self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task)
+ tasks.schedule_bookmarks_without_snapshots(user)
+ self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task)
- for bookmark in Bookmark.objects.all():
- self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
+ task_list = Task.objects.all()
+ self.assertEqual(task_list.count(), 3)
+
+ for task in task_list:
+ self.assertEqual(task.task_name, 'bookmarks.services.tasks._load_web_archive_snapshot_task')
def test_schedule_bookmarks_without_snapshots_should_only_update_user_owned_bookmarks(self):
user = self.get_or_create_test_user()
@@ -147,16 +244,11 @@ class BookmarkTasksTestCase(TestCase, BookmarkFactoryMixin):
self.setup_bookmark(user=other_user)
self.setup_bookmark(user=other_user)
- with patch.object(waybackpy, 'WaybackMachineSaveAPI', return_value=MockWaybackMachineSaveAPI('https://example.com')):
- tasks.schedule_bookmarks_without_snapshots(user)
- self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task)
- self.run_all_pending_tasks(tasks._create_web_archive_snapshot_task)
+ tasks.schedule_bookmarks_without_snapshots(user)
+ self.run_pending_task(tasks._schedule_bookmarks_without_snapshots_task)
- for bookmark in Bookmark.objects.all().filter(owner=user):
- self.assertEqual(bookmark.web_archive_snapshot_url, 'https://example.com')
-
- for bookmark in Bookmark.objects.all().filter(owner=other_user):
- self.assertEqual(bookmark.web_archive_snapshot_url, '')
+ task_list = Task.objects.all()
+ self.assertEqual(task_list.count(), 3)
@override_settings(LD_DISABLE_BACKGROUND_TASKS=True)
def test_schedule_bookmarks_without_snapshots_should_not_run_when_background_tasks_are_disabled(self):