linkding/bookmarks/services/importer.py
Sascha Ißbrücker 749bc1ef63
Generate fallback URLs for web archive links (#804)
* generate fallback web archive URL if none exists

* remove fallback web archive snapshot creation

* fix test
2024-08-29 22:45:10 +02:00

245 lines
7.8 KiB
Python

import logging
from dataclasses import dataclass
from typing import List
from django.contrib.auth.models import User
from django.utils import timezone
from bookmarks.models import Bookmark, Tag
from bookmarks.services import tasks
from bookmarks.services.parser import parse, NetscapeBookmark
from bookmarks.utils import parse_timestamp
logger = logging.getLogger(__name__)
@dataclass
class ImportResult:
total: int = 0
success: int = 0
failed: int = 0
@dataclass
class ImportOptions:
map_private_flag: bool = False
class TagCache:
def __init__(self, user: User):
self.user = user
self.cache = dict()
# Init cache with all existing tags for that user
tags = Tag.objects.filter(owner=user)
for tag in tags:
self.put(tag)
def get(self, tag_name: str):
tag_name_lowercase = tag_name.lower()
if tag_name_lowercase in self.cache:
return self.cache[tag_name_lowercase]
else:
return None
def get_all(self, tag_names: List[str]):
result = []
for tag_name in tag_names:
tag = self.get(tag_name)
# Prevent returning duplicates
if not (tag in result):
result.append(tag)
return result
def put(self, tag: Tag):
self.cache[tag.name.lower()] = tag
def import_netscape_html(
html: str, user: User, options: ImportOptions = ImportOptions()
) -> ImportResult:
result = ImportResult()
import_start = timezone.now()
try:
netscape_bookmarks = parse(html)
except:
logging.exception("Could not read bookmarks file.")
raise
parse_end = timezone.now()
logger.debug(f"Parse duration: {parse_end - import_start}")
# Create and cache all tags beforehand
_create_missing_tags(netscape_bookmarks, user)
tag_cache = TagCache(user)
# Split bookmarks to import into batches, to keep memory usage for bulk operations manageable
batches = _get_batches(netscape_bookmarks, 200)
for batch in batches:
_import_batch(batch, user, options, tag_cache, result)
# Load favicons for newly imported bookmarks
tasks.schedule_bookmarks_without_favicons(user)
# Load previews for newly imported bookmarks
tasks.schedule_bookmarks_without_previews(user)
end = timezone.now()
logger.debug(f"Import duration: {end - import_start}")
return result
def _create_missing_tags(netscape_bookmarks: List[NetscapeBookmark], user: User):
tag_cache = TagCache(user)
tags_to_create = []
for netscape_bookmark in netscape_bookmarks:
for tag_name in netscape_bookmark.tag_names:
tag = tag_cache.get(tag_name)
if not tag:
tag = Tag(name=tag_name, owner=user)
tag.date_added = timezone.now()
tags_to_create.append(tag)
tag_cache.put(tag)
Tag.objects.bulk_create(tags_to_create)
def _get_batches(items: List, batch_size: int):
batches = []
offset = 0
num_items = len(items)
while offset < num_items:
batch = items[offset : min(offset + batch_size, num_items)]
if len(batch) > 0:
batches.append(batch)
offset = offset + batch_size
return batches
def _import_batch(
netscape_bookmarks: List[NetscapeBookmark],
user: User,
options: ImportOptions,
tag_cache: TagCache,
result: ImportResult,
):
# Query existing bookmarks
batch_urls = [bookmark.href for bookmark in netscape_bookmarks]
existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)
# Create or update bookmarks from parsed Netscape bookmarks
bookmarks_to_create = []
bookmarks_to_update = []
for netscape_bookmark in netscape_bookmarks:
result.total = result.total + 1
try:
# Lookup existing bookmark by URL, or create new bookmark if there is no bookmark for that URL yet
bookmark = next(
(
bookmark
for bookmark in existing_bookmarks
if bookmark.url == netscape_bookmark.href
),
None,
)
if not bookmark:
bookmark = Bookmark(owner=user)
is_update = False
else:
is_update = True
# Copy data from parsed bookmark
_copy_bookmark_data(netscape_bookmark, bookmark, options)
# Validate bookmark fields, exclude owner to prevent n+1 database query,
# also there is no specific validation on owner
bookmark.clean_fields(exclude=["owner"])
# Schedule for update or insert
if is_update:
bookmarks_to_update.append(bookmark)
else:
bookmarks_to_create.append(bookmark)
result.success = result.success + 1
except:
shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + "..."
logging.exception("Error importing bookmark: " + shortened_bookmark_tag_str)
result.failed = result.failed + 1
# Bulk update bookmarks in DB
Bookmark.objects.bulk_update(
bookmarks_to_update,
[
"url",
"date_added",
"date_modified",
"unread",
"shared",
"title",
"description",
"notes",
"owner",
],
)
# Bulk insert new bookmarks into DB
Bookmark.objects.bulk_create(bookmarks_to_create)
# Bulk assign tags
# In Django 3, bulk_create does not return the auto-generated IDs when bulk inserting,
# so we have to reload the inserted bookmarks, and match them to the parsed bookmarks by URL
existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)
BookmarkToTagRelationShip = Bookmark.tags.through
relationships = []
for netscape_bookmark in netscape_bookmarks:
# Lookup bookmark by URL again
bookmark = next(
(
bookmark
for bookmark in existing_bookmarks
if bookmark.url == netscape_bookmark.href
),
None,
)
if not bookmark:
# Something is wrong, we should have just created this bookmark
shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + "..."
logging.warning(
f"Failed to assign tags to the bookmark: {shortened_bookmark_tag_str}. Could not find bookmark by URL."
)
continue
# Get tag models by string, schedule inserts for bookmark -> tag associations
tags = tag_cache.get_all(netscape_bookmark.tag_names)
for tag in tags:
relationships.append(BookmarkToTagRelationShip(bookmark=bookmark, tag=tag))
# Insert all bookmark -> tag associations at once, should ignore errors if association already exists
BookmarkToTagRelationShip.objects.bulk_create(relationships, ignore_conflicts=True)
def _copy_bookmark_data(
netscape_bookmark: NetscapeBookmark, bookmark: Bookmark, options: ImportOptions
):
bookmark.url = netscape_bookmark.href
if netscape_bookmark.date_added:
bookmark.date_added = parse_timestamp(netscape_bookmark.date_added)
else:
bookmark.date_added = timezone.now()
bookmark.date_modified = bookmark.date_added
bookmark.unread = netscape_bookmark.to_read
if netscape_bookmark.title:
bookmark.title = netscape_bookmark.title
if netscape_bookmark.description:
bookmark.description = netscape_bookmark.description
if netscape_bookmark.notes:
bookmark.notes = netscape_bookmark.notes
if options.map_private_flag and not netscape_bookmark.private:
bookmark.shared = True
if netscape_bookmark.archived:
bookmark.is_archived = True