Merge pull request #479 from afreydev/tags

Tags migration
2025-02-17 13:58:25 +00:00 · 2020-10-20 08:24:16 -05:00 · 2020-10-20 08:24:16 -05:00 · 71e111e13f
commit 71e111e13f
parent 4e3e5d67c1 a850b4a9d9
11 changed files with 392 additions and 18 deletions
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@ -3,6 +3,7 @@ on: [push]
 env:
  MAX_LINE_LENGTH: 110
  DOCKER_IMAGE: archivebox-ci
 jobs:
  lint:
@ -118,12 +119,12 @@ jobs:
      - name: Build image
        run: |
-          docker build . -t archivebox
+          docker build . -t "$DOCKER_IMAGE"
      - name: Init data dir
        run: |
          mkdir data
-          docker run -v "$PWD"/data:/data archivebox init
+          docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init
      - name: Run test server
        run: |
@ -132,7 +133,7 @@ jobs:
      - name: Add link
        run: |
-          docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local
+          docker run -v "$PWD"/data:/data --network host "$DOCKER_IMAGE" add http://www.test-nginx-1.local
      - name: Add stdin link
        run: |
@ -140,8 +141,8 @@ jobs:
      - name: List links
        run: |
-          docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
+          docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
-          docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
+          docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
      - name: Start docker-compose stack
        run: |
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@ -9,9 +9,10 @@ from django.utils.html import format_html
 from django.utils.safestring import mark_safe
 from django.shortcuts import render, redirect
 from django.contrib.auth import get_user_model
 from django import forms
 from core.models import Snapshot
-from core.forms import AddLinkForm
+from core.forms import AddLinkForm, TagField
 from core.utils import get_icons
 from util import htmldecode, urldecode, ansi_to_html
@ -55,6 +56,32 @@ def delete_snapshots(modeladmin, request, queryset):
 delete_snapshots.short_description = "Delete"
 class SnapshotAdminForm(forms.ModelForm):
    tags = TagField(required=False)
    class Meta:
        model = Snapshot
        fields = "__all__"
    def save(self, commit=True):
        # Based on: https://stackoverflow.com/a/49933068/3509554
        # Get the unsave instance
        instance = forms.ModelForm.save(self, False)
        tags = self.cleaned_data.pop("tags")
        #update save_m2m
        def new_save_m2m():
            instance.save_tags(tags)
        # Do we need to save all changes now?
        self.save_m2m = new_save_m2m
        if commit:
            instance.save()
        return instance
 class SnapshotAdmin(admin.ModelAdmin):
    list_display = ('added', 'title_str', 'url_str', 'files', 'size')
    sort_fields = ('title_str', 'url_str', 'added')
@ -65,6 +92,13 @@ class SnapshotAdmin(admin.ModelAdmin):
    ordering = ['-added']
    actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
    actions_template = 'admin/actions_as_select.html'
    form = SnapshotAdminForm
    def get_queryset(self, request):
        return super().get_queryset(request).prefetch_related('tags')
    def tag_list(self, obj):
        return ', '.join(obj.tags.values_list('name', flat=True))
    def id_str(self, obj):
        return format_html(
@ -75,9 +109,9 @@ class SnapshotAdmin(admin.ModelAdmin):
    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(
-            format_html('<span>{}</span>', tag.strip())
+            format_html(' <a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
-            for tag in obj.tags.split(',')
+            for tag in obj.tags.all()
-        ) if obj.tags else ''
+        )
        return format_html(
            '<a href="/{}">'
                '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
--- a/archivebox/core/forms.py
+++ b/archivebox/core/forms.py
@ -3,6 +3,7 @@ __package__ = 'archivebox.core'
 from django import forms
 from ..util import URL_REGEX
 from .utils_taggit import edit_string_for_tags, parse_tags
 CHOICES = (
    ('0', 'depth = 0 (archive just these URLs)'),
@ -12,3 +13,44 @@ CHOICES = (
 class AddLinkForm(forms.Form):
    url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
 class TagWidgetMixin:
    def format_value(self, value):
        if value is not None and not isinstance(value, str):
            value = edit_string_for_tags(value)
        return super().format_value(value)
 class TagWidget(TagWidgetMixin, forms.TextInput):
    pass
 class TagField(forms.CharField):
    widget = TagWidget
    def clean(self, value):
        value = super().clean(value)
        try:
            return parse_tags(value)
        except ValueError:
            raise forms.ValidationError(
                "Please provide a comma-separated list of tags."
            )
    def has_changed(self, initial_value, data_value):
        # Always return False if the field is disabled since self.bound_data
        # always uses the initial value in this case.
        if self.disabled:
            return False
        try:
            data_value = self.clean(data_value)
        except forms.ValidationError:
            pass
        if initial_value is None:
            initial_value = []
        initial_value = [tag.name for tag in initial_value]
        initial_value.sort()
        return initial_value != data_value
--- a/archivebox/core/migrations/0006_auto_20201012_1520.py
+++ b/archivebox/core/migrations/0006_auto_20201012_1520.py
@ -0,0 +1,70 @@
 # Generated by Django 3.0.8 on 2020-10-12 15:20
 from django.db import migrations, models
 from django.utils.text import slugify
 def forwards_func(apps, schema_editor):
    SnapshotModel = apps.get_model("core", "Snapshot")
    TagModel = apps.get_model("core", "Tag")
    db_alias = schema_editor.connection.alias
    snapshots = SnapshotModel.objects.all()
    for snapshot in snapshots:
        tags = snapshot.tags
        tag_set = (
            set(tag.strip() for tag in (snapshot.tags_old or '').split(','))
        )
        tag_set.discard("")
        for tag in tag_set:
            to_add, _ = TagModel.objects.get_or_create(name=tag, slug=slugify(tag))
            snapshot.tags.add(to_add)
 def reverse_func(apps, schema_editor):
    SnapshotModel = apps.get_model("core", "Snapshot")
    TagModel = apps.get_model("core", "Tag")
    db_alias = schema_editor.connection.alias
    snapshots = SnapshotModel.objects.all()
    for snapshot in snapshots:
        tags = snapshot.tags.values_list("name", flat=True)
        snapshot.tags_old = ",".join([tag for tag in tags])
        snapshot.save()
 class Migration(migrations.Migration):
    dependencies = [
        ('core', '0005_auto_20200728_0326'),
    ]
    operations = [
        migrations.RenameField(
            model_name='snapshot',
            old_name='tags',
            new_name='tags_old',
        ),
        migrations.CreateModel(
            name='Tag',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('name', models.CharField(max_length=100, unique=True, verbose_name='name')),
                ('slug', models.SlugField(max_length=100, unique=True, verbose_name='slug')),
            ],
            options={
                'verbose_name': 'Tag',
                'verbose_name_plural': 'Tags',
            },
        ),
        migrations.AddField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(to='core.Tag'),
        ),
        migrations.RunPython(forwards_func, reverse_func),
        migrations.RemoveField(
            model_name='snapshot',
            name='tags_old',
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@ -2,13 +2,55 @@ __package__ = 'archivebox.core'
 import uuid
-from django.db import models
+from django.db import models, transaction
 from django.utils.functional import cached_property
 from django.utils.text import slugify
 from ..util import parse_date
 from ..index.schema import Link
 class Tag(models.Model):
    """
    Based on django-taggit model
    """
    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
    class Meta:
        verbose_name = "Tag"
        verbose_name_plural = "Tags"
    def __str__(self):
        return self.name
    def slugify(self, tag, i=None):
        slug = slugify(tag)
        if i is not None:
            slug += "_%d" % i
        return slug
    def save(self, *args, **kwargs):
        if self._state.adding and not self.slug:
            self.slug = self.slugify(self.name)
            with transaction.atomic():
                slugs = set(
                    type(self)
                    ._default_manager.filter(slug__startswith=self.slug)
                    .values_list("slug", flat=True)
                )
                i = None
                while True:
                    slug = self.slugify(self.name, i)
                    if slug not in slugs:
                        self.slug = slug
                        return super().save(*args, **kwargs)
                    i = 1 if i is None else i+1
        else:
            return super().save(*args, **kwargs)
 class Snapshot(models.Model):
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@ -16,11 +58,10 @@ class Snapshot(models.Model):
    timestamp = models.CharField(max_length=32, unique=True, db_index=True)
    title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
    tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
    added = models.DateTimeField(auto_now_add=True, db_index=True)
    updated = models.DateTimeField(null=True, blank=True, db_index=True)
-    # bookmarked = models.DateTimeField()
+    tags = models.ManyToManyField(Tag)
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@ -41,7 +82,8 @@ class Snapshot(models.Model):
        args = args or self.keys
        return {
            key: getattr(self, key)
-            for key in args
+            if key != 'tags' else self.get_tags_str()
            for key in args 
        }
    def as_link(self) -> Link:
@ -50,6 +92,13 @@ class Snapshot(models.Model):
    def as_link_with_details(self) -> Link:
        from ..index import load_link_details
        return load_link_details(self.as_link())
    def get_tags_str(self) -> str:
        tags = ','.join(
            tag.name
            for tag in self.tags.all()
        ) if self.tags.all() else ''
        return tags
    @cached_property
    def bookmarked(self):
@ -96,3 +145,10 @@ class Snapshot(models.Model):
            and self.history['title'][-1].output.strip()):
            return self.history['title'][-1].output.strip()
        return None
    def save_tags(self, tags=[]):
        tags_id = []
        for tag in tags:
            tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
        self.tags.clear()
        self.tags.add(*tags_id)
--- a/archivebox/core/utils_taggit.py
+++ b/archivebox/core/utils_taggit.py
@ -0,0 +1,113 @@
 # Taken from https://github.com/jazzband/django-taggit/blob/3b56adb637ab95aca5036c37a358402c825a367c/taggit/utils.py
 def parse_tags(tagstring):
    """
    Parses tag input, with multiple word input being activated and
    delineated by commas and double quotes. Quotes take precedence, so
    they may contain commas.
    Returns a sorted list of unique tag names.
    Ported from Jonathan Buchanan's `django-tagging
    <http://django-tagging.googlecode.com/>`_
    """
    if not tagstring:
        return []
    # Special case - if there are no commas or double quotes in the
    # input, we don't *do* a recall... I mean, we know we only need to
    # split on spaces.
    if "," not in tagstring and '"' not in tagstring:
        words = list(set(split_strip(tagstring, " ")))
        words.sort()
        return words
    words = []
    buffer = []
    # Defer splitting of non-quoted sections until we know if there are
    # any unquoted commas.
    to_be_split = []
    saw_loose_comma = False
    open_quote = False
    i = iter(tagstring)
    try:
        while True:
            c = next(i)
            if c == '"':
                if buffer:
                    to_be_split.append("".join(buffer))
                    buffer = []
                # Find the matching quote
                open_quote = True
                c = next(i)
                while c != '"':
                    buffer.append(c)
                    c = next(i)
                if buffer:
                    word = "".join(buffer).strip()
                    if word:
                        words.append(word)
                    buffer = []
                open_quote = False
            else:
                if not saw_loose_comma and c == ",":
                    saw_loose_comma = True
                buffer.append(c)
    except StopIteration:
        # If we were parsing an open quote which was never closed treat
        # the buffer as unquoted.
        if buffer:
            if open_quote and "," in buffer:
                saw_loose_comma = True
            to_be_split.append("".join(buffer))
    if to_be_split:
        if saw_loose_comma:
            delimiter = ","
        else:
            delimiter = " "
        for chunk in to_be_split:
            words.extend(split_strip(chunk, delimiter))
    words = list(set(words))
    words.sort()
    return words
 def split_strip(string, delimiter=","):
    """
    Splits ``string`` on ``delimiter``, stripping each resulting string
    and returning a list of non-empty strings.
    Ported from Jonathan Buchanan's `django-tagging
    <http://django-tagging.googlecode.com/>`_
    """
    if not string:
        return []
    words = [w.strip() for w in string.split(delimiter)]
    return [w for w in words if w]
 def edit_string_for_tags(tags):
    """
    Given list of ``Tag`` instances, creates a string representation of
    the list suitable for editing by the user, such that submitting the
    given string representation back without changing it will give the
    same list of tags.
    Tag names which contain commas will be double quoted.
    If any tag name which isn't being quoted contains whitespace, the
    resulting string of tag names will be comma-delimited, otherwise
    it will be space-delimited.
    Ported from Jonathan Buchanan's `django-tagging
    <http://django-tagging.googlecode.com/>`_
    """
    names = []
    for tag in tags:
        name = tag.name
        if "," in name or " " in name:
            names.append('"%s"' % name)
        else:
            names.append(name)
    return ", ".join(sorted(names))
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@ -34,13 +34,19 @@ def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) ->
 def write_link_to_sql_index(link: Link):
    from core.models import Snapshot
    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
    tags = info.pop("tags")
    if tags is None:
        tags = []
    try:
        info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
    except Snapshot.DoesNotExist:
        while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
            info["timestamp"] = str(float(info["timestamp"]) + 1.0)
-    return Snapshot.objects.update_or_create(url=link.url, defaults=info)[0]
+    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
    snapshot.save_tags(tags)
    return snapshot
@enforce_types
@ -65,8 +71,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
        except Snapshot.DoesNotExist:
            snap = write_link_to_sql_index(link)
        snap.title = link.title
-        snap.tags = link.tags
+
        tag_set = (
            set(tag.strip() for tag in (link.tags or '').split(','))
        )
        tag_list = list(tag_set) or []
        snap.save()
        snap.save_tags(tag_list)
--- a/archivebox/themes/default/static/admin.css
+++ b/archivebox/themes/default/static/admin.css
@ -222,3 +222,11 @@ body.model-snapshot.change-list #content .object-tools {
  0% { transform: rotate(0deg); }
  100% { transform: rotate(360deg); }
 }
 .tags > a > .tag {
  border: 1px solid;
  border-radius: 10px;
  background-color: #f3f3f3;
  padding: 3px;
 }
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -12,7 +12,7 @@ version: '3.7'
 services:
    archivebox:
        # build: .
-        image: nikisweeting/archivebox:latest
+        image: ${DOCKER_IMAGE:-nikisweeting/archivebox:latest} 
        command: server 0.0.0.0:8000
        stdin_open: true
        tty: true
--- a/tests/tags_migration/index.sqlite3
+++ b/tests/tags_migration/index.sqlite3
--- a/tests/test_init.py
+++ b/tests/test_init.py
@ -4,7 +4,7 @@
 import os
 import subprocess
 from pathlib import Path
-import json
+import json, shutil
 import sqlite3
 from archivebox.config import OUTPUT_PERMISSIONS
@ -131,4 +131,42 @@ def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
    assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
-    assert init_process.returncode == 0
+    assert init_process.returncode == 0
 def test_tags_migration(tmp_path, disable_extractors_dict):
    base_sqlite_path = Path(__file__).parent / 'tags_migration'
    if os.path.exists(tmp_path):
        shutil.rmtree(tmp_path)
    shutil.copytree(str(base_sqlite_path), tmp_path)
    os.chdir(tmp_path)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    c.execute("SELECT id, tags from core_snapshot")
    snapshots = c.fetchall()
    snapshots_dict = { sn['id']: sn['tags'] for sn in snapshots}
    conn.commit()
    conn.close()
    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
    conn = sqlite3.connect("index.sqlite3")
    conn.row_factory = sqlite3.Row
    c = conn.cursor()
    c.execute("""
        SELECT core_snapshot.id, core_tag.name from core_snapshot
        JOIN core_snapshot_tags on core_snapshot_tags.snapshot_id=core_snapshot.id
        JOIN core_tag on core_tag.id=core_snapshot_tags.tag_id
    """)
    tags = c.fetchall()
    conn.commit()
    conn.close()
    for tag in tags:
        snapshot_id = tag["id"]
        tag_name = tag["name"]
        # Check each tag migrated is in the previous field
        assert tag_name in snapshots_dict[snapshot_id]