linkding/bookmarks/services/importer.py


import logging
from dataclasses import dataclass
from typing import List

from django.contrib.auth.models import User
from django.utils import timezone

from bookmarks.models import Bookmark, Tag
from bookmarks.services import tasks
from bookmarks.services.parser import parse, NetscapeBookmark
from bookmarks.utils import parse_timestamp

logger = logging.getLogger(__name__)

@dataclass
class ImportResult:
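    """Aggregated counters for an import run: bookmarks processed, imported successfully, and failed."""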
    total: int = 0
    success: int = 0
    failed: int = 0

@dataclass
class ImportOptions:
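    """Options for the import. When map_private_flag is set, bookmarks that are not flagged as private in the import file are imported as shared."""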
    map_private_flag: bool = False

class TagCache:
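    """Per-user cache of Tag instances, keyed by lowercased tag name, to avoid repeated tag lookups during import."""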
    def __init__(self, user: User):
        self.user = user
        self.cache = dict()
        # Init cache with all existing tags for that user
        tags = Tag.objects.filter(owner=user)
        for tag in tags:
            self.put(tag)

    def get(self, tag_name: str):
        tag_name_lowercase = tag_name.lower()
        if tag_name_lowercase in self.cache:
            return self.cache[tag_name_lowercase]
        else:
            return None

    def get_all(self, tag_names: List[str]):
        result = []
        for tag_name in tag_names:
            tag = self.get(tag_name)
            # Prevent returning duplicates
            if not (tag in result):
                result.append(tag)
        return result

    def put(self, tag: Tag):
        self.cache[tag.name.lower()] = tag

def import_netscape_html(
    html: str, user: User, options: ImportOptions = ImportOptions()
) -> ImportResult:
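    """
    Import bookmarks from a Netscape bookmark HTML file for the given user.

    Parses the HTML, creates any missing tags up front, then creates or updates
    bookmarks in batches of 200 and schedules favicon and preview loading.
    Returns an ImportResult with total, success, and failed counts.

    Illustrative usage only (the upload handling and `request` object are
    assumptions, not part of this module):

        result = import_netscape_html(
            upload_file.read().decode(), request.user, ImportOptions(map_private_flag=True)
        )
    """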
    result = ImportResult()
    import_start = timezone.now()

    try:
        netscape_bookmarks = parse(html)
    except:
        logging.exception("Could not read bookmarks file.")
        raise

    parse_end = timezone.now()
    logger.debug(f"Parse duration: {parse_end - import_start}")

    # Create and cache all tags beforehand
    _create_missing_tags(netscape_bookmarks, user)
    tag_cache = TagCache(user)

    # Split bookmarks to import into batches, to keep memory usage for bulk operations manageable
    batches = _get_batches(netscape_bookmarks, 200)
    for batch in batches:
        _import_batch(batch, user, options, tag_cache, result)

    # Load favicons for newly imported bookmarks
    tasks.schedule_bookmarks_without_favicons(user)
    # Load previews for newly imported bookmarks
    tasks.schedule_bookmarks_without_previews(user)

    end = timezone.now()
    logger.debug(f"Import duration: {end - import_start}")

    return result

def _create_missing_tags(netscape_bookmarks: List[NetscapeBookmark], user: User):
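    """Bulk-create any tags referenced in the parsed bookmarks that the user does not have yet."""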
    tag_cache = TagCache(user)
    tags_to_create = []

    for netscape_bookmark in netscape_bookmarks:
        for tag_name in netscape_bookmark.tag_names:
            tag = tag_cache.get(tag_name)
            if not tag:
                tag = Tag(name=tag_name, owner=user)
                tag.date_added = timezone.now()
                tags_to_create.append(tag)
                tag_cache.put(tag)

    Tag.objects.bulk_create(tags_to_create)

def _get_batches(items: List, batch_size: int):
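    """Split items into consecutive batches of at most batch_size items."""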
    batches = []
    offset = 0
    num_items = len(items)

    while offset < num_items:
        batch = items[offset : min(offset + batch_size, num_items)]
        if len(batch) > 0:
            batches.append(batch)
        offset = offset + batch_size

    return batches

def _import_batch(
    netscape_bookmarks: List[NetscapeBookmark],
    user: User,
    options: ImportOptions,
    tag_cache: TagCache,
    result: ImportResult,
):
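    """
    Create or update the bookmarks of a single batch using bulk operations,
    then assign tags through the Bookmark-Tag through model. Updates the
    shared ImportResult counters in place.
    """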
    # Query existing bookmarks
    batch_urls = [bookmark.href for bookmark in netscape_bookmarks]
    existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)

    # Create or update bookmarks from parsed Netscape bookmarks
    bookmarks_to_create = []
    bookmarks_to_update = []

    for netscape_bookmark in netscape_bookmarks:
        result.total = result.total + 1
        try:
            # Lookup existing bookmark by URL, or create a new bookmark if there is no bookmark for that URL yet
            bookmark = next(
                (
                    bookmark
                    for bookmark in existing_bookmarks
                    if bookmark.url == netscape_bookmark.href
                ),
                None,
            )
            if not bookmark:
                bookmark = Bookmark(owner=user)
                is_update = False
            else:
                is_update = True
            # Copy data from parsed bookmark
            _copy_bookmark_data(netscape_bookmark, bookmark, options)
            # Validate bookmark fields, exclude owner to prevent an n+1 database query,
            # also there is no specific validation on owner
            bookmark.clean_fields(exclude=["owner"])
            # Schedule for update or insert
            if is_update:
                bookmarks_to_update.append(bookmark)
            else:
                bookmarks_to_create.append(bookmark)

            result.success = result.success + 1
        except:
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + "..."
            logging.exception("Error importing bookmark: " + shortened_bookmark_tag_str)
            result.failed = result.failed + 1

    # Bulk update existing bookmarks in DB
    Bookmark.objects.bulk_update(
        bookmarks_to_update,
        [
            "url",
            "date_added",
            "date_modified",
            "unread",
            "shared",
            "title",
            "description",
            "notes",
            "owner",
        ],
    )
    # Bulk insert new bookmarks into DB
    Bookmark.objects.bulk_create(bookmarks_to_create)

    # Bulk assign tags
    # In Django 3, bulk_create does not return the auto-generated IDs when bulk inserting,
    # so we have to reload the inserted bookmarks and match them to the parsed bookmarks by URL
    existing_bookmarks = Bookmark.objects.filter(owner=user, url__in=batch_urls)

    BookmarkToTagRelationShip = Bookmark.tags.through
    relationships = []

    for netscape_bookmark in netscape_bookmarks:
        # Lookup bookmark by URL again
        bookmark = next(
            (
                bookmark
                for bookmark in existing_bookmarks
                if bookmark.url == netscape_bookmark.href
            ),
            None,
        )
        if not bookmark:
            # Something is wrong, we should have just created this bookmark
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + "..."
            logging.warning(
                f"Failed to assign tags to the bookmark: {shortened_bookmark_tag_str}. Could not find bookmark by URL."
            )
            continue

        # Get tag models by string, schedule inserts for bookmark -> tag associations
        tags = tag_cache.get_all(netscape_bookmark.tag_names)
        for tag in tags:
            relationships.append(BookmarkToTagRelationShip(bookmark=bookmark, tag=tag))

    # Insert all bookmark -> tag associations at once, ignoring conflicts where an association already exists
    BookmarkToTagRelationShip.objects.bulk_create(relationships, ignore_conflicts=True)

def _copy_bookmark_data(
    netscape_bookmark: NetscapeBookmark, bookmark: Bookmark, options: ImportOptions
):
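    """Copy fields from a parsed NetscapeBookmark onto a Bookmark model instance, applying the given ImportOptions."""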
    bookmark.url = netscape_bookmark.href
    if netscape_bookmark.date_added:
        bookmark.date_added = parse_timestamp(netscape_bookmark.date_added)
    else:
        bookmark.date_added = timezone.now()
    bookmark.date_modified = bookmark.date_added
    bookmark.unread = netscape_bookmark.to_read
    if netscape_bookmark.title:
        bookmark.title = netscape_bookmark.title
    if netscape_bookmark.description:
        bookmark.description = netscape_bookmark.description
    if netscape_bookmark.notes:
        bookmark.notes = netscape_bookmark.notes
    if options.map_private_flag and not netscape_bookmark.private:
        bookmark.shared = True
    if netscape_bookmark.archived:
        bookmark.is_archived = True