linkding/bookmarks/services/importer.py

import logging
from dataclasses import dataclass
from typing import List

from django.contrib.auth.models import User
from django.utils import timezone

from bookmarks.models import Bookmark, Tag, parse_tag_string
from bookmarks.services import tasks
from bookmarks.services.parser import parse, NetscapeBookmark
from bookmarks.utils import parse_timestamp

logger = logging.getLogger(__name__)


@dataclass
class ImportResult:
    """Counters describing the outcome of one bookmark import run.

    An instance is created by ``import_netscape_html`` and updated in place
    by ``_import_batch`` while the parsed bookmarks are processed.
    """
    # Number of parsed bookmarks that were processed (incremented once per bookmark)
    total: int = 0
    # Number of bookmarks that passed validation and were scheduled for insert or update
    success: int = 0
    # Number of bookmarks for which copying or validating the data raised an error
    failed: int = 0


class TagCache:
    """In-memory lookup of a user's tags, keyed by lower-cased tag name.

    On construction the cache is filled with all tags that
    ``Tag.objects.filter(owner=user)`` returns. Lookups are case-insensitive
    because both ``put`` and ``get`` lower-case the tag name.
    """

    def __init__(self, user: User):
        self.user = user
        self.cache = dict()
        # Init cache with all existing tags for that user
        for tag in Tag.objects.filter(owner=user):
            self.put(tag)

    def get(self, tag_name: str):
        """Return the cached tag for ``tag_name`` (case-insensitive), or ``None`` if unknown."""
        return self.cache.get(tag_name.lower())

    def get_all(self, tag_names: List[str]):
        """Return the cached tags for ``tag_names``, in order of first occurrence.

        Names that are not in the cache are skipped, so the result never
        contains ``None``. Each tag is returned at most once, even if several
        names resolve to it.
        """
        result = []
        for tag_name in tag_names:
            tag = self.get(tag_name)
            # Skip unknown names: callers iterate the result and use every entry as a tag.
            # Also prevent returning duplicates.
            if tag is not None and tag not in result:
                result.append(tag)

        return result

    def put(self, tag: Tag):
        """Store ``tag`` under its lower-cased name, replacing any previous entry."""
        self.cache[tag.name.lower()] = tag


def import_netscape_html(html: str, user: User):
    """Import the bookmarks contained in a Netscape bookmark HTML document for ``user``.

    The document is parsed, missing tags are created, and the bookmarks are
    then imported in batches. Afterwards snapshot creation is scheduled via
    ``tasks.schedule_bookmarks_without_snapshots``.

    :param html: content of the Netscape bookmarks file
    :param user: owner of the imported bookmarks and tags
    :return: an ``ImportResult`` with the total / success / failed counters
    :raises Exception: any error raised by ``parse`` is logged and re-raised
    """
    result = ImportResult()
    import_start = timezone.now()

    try:
        netscape_bookmarks = parse(html)
    except Exception:
        # Log through the module logger (like the debug output below), then let the caller handle it
        logger.exception('Could not read bookmarks file.')
        raise

    parse_end = timezone.now()
    logger.debug(f'Parse duration: {parse_end - import_start}')

    # Create and cache all tags beforehand
    _create_missing_tags(netscape_bookmarks, user)
    tag_cache = TagCache(user)

    # Split bookmarks to import into batches, to keep memory usage for bulk operations manageable
    batches = _get_batches(netscape_bookmarks, 200)
    for batch in batches:
        _import_batch(batch, user, tag_cache, result)

    # Create snapshots for newly imported bookmarks
    tasks.schedule_bookmarks_without_snapshots(user)

    end = timezone.now()
    logger.debug(f'Import duration: {end - import_start}')

    return result


def _create_missing_tags(netscape_bookmarks: List[NetscapeBookmark], user: User):
    """Bulk-create all tags referenced by the parsed bookmarks that ``user`` does not have yet.

    Tag names are looked up through a ``TagCache``, which compares names
    case-insensitively. Every missing tag is created exactly once, even when
    several bookmarks in the import reference it.

    :param netscape_bookmarks: parsed bookmarks whose ``tag_string`` is inspected
    :param user: owner of the tags to create
    """
    tag_cache = TagCache(user)
    tags_to_create = []

    for netscape_bookmark in netscape_bookmarks:
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        for tag_name in tag_names:
            tag = tag_cache.get(tag_name)
            if not tag:
                tag = Tag(name=tag_name, owner=user)
                tag.date_added = timezone.now()
                tags_to_create.append(tag)
                # Remember the pending tag, otherwise the next bookmark using the same
                # tag name would schedule a duplicate tag for creation
                tag_cache.put(tag)

    Tag.objects.bulk_create(tags_to_create)


def _get_batches(items: List, batch_size: int):
    batches = []
    offset = 0
    num_items = len(items)

    while offset < num_items:
        batch = items[offset:min(offset + batch_size, num_items)]
        if len(batch) > 0:
            batches.append(batch)
        offset = offset + batch_size

    return batches


def _import_batch(netscape_bookmarks: List[NetscapeBookmark], user: User, tag_cache: TagCache, result: ImportResult):
    """Import one batch of parsed bookmarks for ``user`` using bulk database operations.

    Bookmarks whose URL already exists for the user are updated, all others
    are inserted. Afterwards the bookmark -> tag associations for the batch
    are inserted. A bookmark that fails while copying or validating its data
    is logged, counted as failed and left out of the batch.

    :param netscape_bookmarks: the parsed bookmarks of this batch
    :param user: owner of the bookmarks
    :param tag_cache: cache used to resolve tag names to tag models
    :param result: import counters, updated in place
    """
    # Query existing bookmarks and index them by URL, keeping the first bookmark per URL
    batch_urls = [bookmark.href for bookmark in netscape_bookmarks]
    existing_by_url = {}
    for existing_bookmark in Bookmark.objects.filter(owner=user, url__in=batch_urls):
        existing_by_url.setdefault(existing_bookmark.url, existing_bookmark)

    # Create or update bookmarks from parsed Netscape bookmarks
    bookmarks_to_create = []
    bookmarks_to_update = []

    for netscape_bookmark in netscape_bookmarks:
        result.total += 1
        try:
            # Lookup existing bookmark by URL, or create new bookmark if there is no bookmark for that URL yet
            bookmark = existing_by_url.get(netscape_bookmark.href)
            is_update = bool(bookmark)
            if not is_update:
                bookmark = Bookmark(owner=user)
            # Copy data from parsed bookmark
            _copy_bookmark_data(netscape_bookmark, bookmark)
            # Validate bookmark fields, exclude owner to prevent n+1 database query,
            # also there is no specific validation on owner
            bookmark.clean_fields(exclude=['owner'])
            # Schedule for update or insert
            if is_update:
                bookmarks_to_update.append(bookmark)
            else:
                bookmarks_to_create.append(bookmark)

            result.success += 1
        except Exception:
            # A single broken bookmark must not abort the import. Only regular errors are
            # handled here, so KeyboardInterrupt / SystemExit still propagate.
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logger.exception('Error importing bookmark: ' + shortened_bookmark_tag_str)
            result.failed += 1

    # Bulk update bookmarks in DB
    Bookmark.objects.bulk_update(bookmarks_to_update,
                                 ['url', 'date_added', 'date_modified', 'unread', 'title', 'description', 'owner'])
    # Bulk insert new bookmarks into DB
    Bookmark.objects.bulk_create(bookmarks_to_create)

    # Bulk assign tags
    # In Django 3, bulk_create does not return the auto-generated IDs when bulk inserting,
    # so we have to reload the inserted bookmarks, and match them to the parsed bookmarks by URL
    saved_by_url = {}
    for saved_bookmark in Bookmark.objects.filter(owner=user, url__in=batch_urls):
        saved_by_url.setdefault(saved_bookmark.url, saved_bookmark)

    BookmarkToTagRelationShip = Bookmark.tags.through
    relationships = []

    for netscape_bookmark in netscape_bookmarks:
        # Lookup bookmark by URL again
        bookmark = saved_by_url.get(netscape_bookmark.href)

        if not bookmark:
            # No bookmark was stored for this URL, e.g. because it failed validation above
            shortened_bookmark_tag_str = str(netscape_bookmark)[:100] + '...'
            logger.warning(
                f'Failed to assign tags to the bookmark: {shortened_bookmark_tag_str}. Could not find bookmark by URL.')
            # Without a bookmark there is nothing to associate the tags with
            continue

        # Get tag models by string, schedule inserts for bookmark -> tag associations
        tag_names = parse_tag_string(netscape_bookmark.tag_string)
        tags = tag_cache.get_all(tag_names)
        for tag in tags:
            relationships.append(BookmarkToTagRelationShip(bookmark=bookmark, tag=tag))

    # Insert all bookmark -> tag associations at once, should ignore errors if association already exists
    BookmarkToTagRelationShip.objects.bulk_create(relationships, ignore_conflicts=True)


def _copy_bookmark_data(netscape_bookmark: NetscapeBookmark, bookmark: Bookmark):
    """Transfer the data of a parsed Netscape bookmark onto a bookmark model.

    The URL is always copied. The added date is parsed from the Netscape
    bookmark when it has one and falls back to the current time otherwise;
    the modified date is set to the same value. The bookmark is marked as
    read. Title and description are only overwritten when the parsed
    bookmark provides a non-empty value.

    :param netscape_bookmark: parsed bookmark to read from
    :param bookmark: bookmark model that is modified in place
    """
    bookmark.url = netscape_bookmark.href

    raw_date_added = netscape_bookmark.date_added
    bookmark.date_added = parse_timestamp(raw_date_added) if raw_date_added else timezone.now()
    bookmark.date_modified = bookmark.date_added
    bookmark.unread = False

    # Optional text fields: keep the current value on the model if the import has none
    for field_name in ('title', 'description'):
        imported_value = getattr(netscape_bookmark, field_name)
        if imported_value:
            setattr(bookmark, field_name, imported_value)