diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt
index 71dc253d..ca279875 100644
--- a/archivebox.egg-info/requires.txt
+++ b/archivebox.egg-info/requires.txt
@@ -4,6 +4,7 @@ mypy-extensions==0.4.3
base32-crockford==0.3.0
django==3.0.8
django-extensions==3.0.3
+django-taggit==1.3.0
dateparser
ipython
youtube-dl
diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py
index 4337e4a3..a35d589b 100644
--- a/archivebox/core/admin.py
+++ b/archivebox/core/admin.py
@@ -66,6 +66,12 @@ class SnapshotAdmin(admin.ModelAdmin):
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
+ def get_queryset(self, request):
+     """Return the admin queryset with every snapshot's tags prefetched."""
+     # Prefetching lets tag_list()/title_str() read obj.tags.all() without
+     # issuing one extra query per row of the changelist.
+     base_queryset = super().get_queryset(request)
+     return base_queryset.prefetch_related('tags')
+
+ def tag_list(self, obj):
+     """Return the names of ``obj``'s tags joined into one ", "-separated string."""
+     tag_names = [tag.name for tag in obj.tags.all()]
+     return u", ".join(tag_names)
+
def id_str(self, obj):
return format_html(
'{}
',
@@ -75,9 +81,9 @@ class SnapshotAdmin(admin.ModelAdmin):
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
- format_html('{}', tag.strip())
- for tag in obj.tags.split(',')
- ) if obj.tags else ''
+ format_html(' {} ', tag)
+ for tag in obj.tags.all()
+ ) if obj.tags.all() else ''
return format_html(
''
''
diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py
new file mode 100644
index 00000000..59bb111e
--- /dev/null
+++ b/archivebox/core/migrations/0006_auto_20200915_2006.py
@@ -0,0 +1,89 @@
+# Generated by Django 3.0.8 on 2020-09-15 20:06
+
+from django.db import migrations, models
+from django.contrib.contenttypes.models import ContentType
+from django.utils.text import slugify
+import django.db.models.deletion
+import taggit.managers
+
+def forwards_func(apps, schema_editor):
+    """Copy every snapshot's legacy ``tags_old`` string into taggit rows.
+
+    ``tags_old`` holds a comma-separated list of tag names. Each non-empty,
+    whitespace-stripped name is fetched or created as a ``taggit.Tag`` and
+    linked to the snapshot through ``core.TaggedItem``.
+
+    Args:
+        apps: historical app registry passed in by ``migrations.RunPython``.
+        schema_editor: schema editor of the database being migrated; only its
+            connection alias is used, to route all queries to that database.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TaggedItemModel = apps.get_model("core", "TaggedItem")
+    TagModel = apps.get_model("taggit", "Tag")
+    # Use the historical ContentType model (guaranteed by this migration's
+    # 'contenttypes' dependency) rather than the module-level import, so the
+    # queries match the schema as it exists at this point of the migration.
+    ContentTypeModel = apps.get_model("contenttypes", "ContentType")
+    db_alias = schema_editor.connection.alias
+
+    # get_or_create instead of filter()[0]: filter() never raises, and
+    # indexing an empty result would fail with IndexError if the row for
+    # core.snapshot has not been created yet.
+    snapshot_ct, _ = ContentTypeModel.objects.using(db_alias).get_or_create(
+        app_label="core",
+        model="snapshot",
+    )
+
+    tags = TagModel.objects.using(db_alias)
+    tagged_items = TaggedItemModel.objects.using(db_alias)
+
+    for snapshot in SnapshotModel.objects.using(db_alias).all():
+        names = {name.strip() for name in (snapshot.tags_old or "").split(",")}
+        # ''.split(',') yields [''], so NULL/empty columns (and stray commas)
+        # would otherwise produce a tag whose name is the empty string.
+        names.discard("")
+
+        for name in sorted(names):
+            tag = tags.filter(name=name).first()
+            if tag is None:
+                # Historical models do not run taggit's Tag.save(), so slug
+                # uniqueness has to be enforced here; distinct names such as
+                # "News" and "news" slugify to the same value.
+                # NOTE(review): the "_<n>" suffix mirrors what taggit appears
+                # to do for colliding slugs -- confirm against taggit 1.3.0.
+                base_slug = slugify(name)
+                slug, suffix = base_slug, 0
+                while tags.filter(slug=slug).exists():
+                    suffix += 1
+                    slug = f"{base_slug}_{suffix}"
+                tag = tags.create(name=name, slug=slug)
+
+            tagged_items.get_or_create(
+                content_type_id=snapshot_ct.id,
+                object_id=snapshot.id,
+                tag=tag,
+            )
+
+
+def reverse_func(apps, schema_editor):
+    """Fold every snapshot's taggit rows back into ``tags_old`` and delete them.
+
+    This undoes ``forwards_func``: the names of the tags attached to a
+    snapshot are written to its ``tags_old`` column as a comma-separated
+    string (``None`` when the snapshot has no tags), then the snapshot's
+    ``core.TaggedItem`` rows are removed. ``taggit.Tag`` rows are left alone.
+
+    Args:
+        apps: historical app registry passed in by ``migrations.RunPython``.
+        schema_editor: schema editor of the database being migrated; only its
+            connection alias is used, to route all queries to that database.
+    """
+    SnapshotModel = apps.get_model("core", "Snapshot")
+    TaggedItemModel = apps.get_model("core", "TaggedItem")
+    db_alias = schema_editor.connection.alias
+
+    snapshots = SnapshotModel.objects.using(db_alias)
+    for snapshot in snapshots.all():
+        tagged_items = TaggedItemModel.objects.using(db_alias).filter(
+            object_id=snapshot.id,
+        )
+        names = sorted(set(tagged_items.values_list("tag__name", flat=True)))
+        # NOTE(review): the legacy column was declared with max_length=256;
+        # the joined string is not length-checked here.
+        snapshots.filter(pk=snapshot.pk).update(tags_old=",".join(names) or None)
+        tagged_items.delete()
+
+
+# Replaces the Snapshot.tags text column with a taggit TaggableManager backed
+# by a new core.TaggedItem through table, copying the existing tag strings
+# across. The order of the operations below matters.
+class Migration(migrations.Migration):
+
+ # contenttypes and taggit must be migrated first: TaggedItem has foreign
+ # keys to contenttypes.ContentType and taggit.Tag.
+ dependencies = [
+ ('contenttypes', '0002_remove_content_type_name'),
+ ('taggit', '0003_taggeditem_add_unique_index'),
+ ('core', '0005_auto_20200728_0326'),
+ ]
+
+ operations = [
+ # 1. Move the old column out of the way so the name 'tags' is free for
+ # the new manager while the old values stay readable as 'tags_old'.
+ migrations.RenameField(
+ model_name='snapshot',
+ old_name='tags',
+ new_name='tags_old',
+ ),
+ # 2. Create the through table; object_id is a UUIDField.
+ migrations.CreateModel(
+ name='TaggedItem',
+ fields=[
+ ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+ ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')),
+ ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')),
+ ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')),
+ ],
+ options={
+ 'verbose_name': 'Tag',
+ 'verbose_name_plural': 'Tags',
+ },
+ ),
+ # 3. Add the new taggit-backed 'tags' field to Snapshot.
+ migrations.AddField(
+ model_name='snapshot',
+ name='tags',
+ field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'),
+ ),
+ # 4. Copy data while both 'tags_old' and 'tags' exist; reverse_func is
+ # run instead when the migration is unapplied.
+ migrations.RunPython(forwards_func, reverse_func),
+ # 5. Drop the legacy column once its contents have been copied.
+ migrations.RemoveField(
+ model_name='snapshot',
+ name='tags_old',
+ ),
+ ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 313dd67d..b7719b2e 100644
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -5,10 +5,19 @@ import uuid
from django.db import models
from django.utils.functional import cached_property
+from taggit.managers import TaggableManager
+from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase
+
from ..util import parse_date
from ..index.schema import Link
+
+# Through model joining taggit tags to tagged objects; Snapshot.tags uses it
+# via TaggableManager(through=TaggedItem). The UUID-flavoured taggit base is
+# used because Snapshot's primary key is a UUIDField.
+class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase):
+ class Meta:
+ # Human-readable names for this model.
+ verbose_name = "Tag"
+ verbose_name_plural = "Tags"
+
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
@@ -16,7 +25,7 @@ class Snapshot(models.Model):
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
- tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
+ tags = TaggableManager(through=TaggedItem)
added = models.DateTimeField(auto_now_add=True, db_index=True)
updated = models.DateTimeField(null=True, blank=True, db_index=True)
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 14b3b369..6ae2b6af 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -31,6 +31,7 @@ INSTALLED_APPS = [
'core',
'django_extensions',
+ 'taggit',
]
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 06832dbc..f93a4ab8 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -86,9 +86,16 @@ def merge_links(a: Link, b: Link) -> Link:
)
# all unique, truthy tags
+ tags_a = []
+ if a.tags:
+ tags_a = a.tags.all()
+ tags_b = []
+ if b.tags:
+ tags_b = b.tags.all()
+
tags_set = (
- set(tag.strip() for tag in (a.tags or '').split(','))
- | set(tag.strip() for tag in (b.tags or '').split(','))
+ set(tag.name.strip() for tag in tags_a)
+ | set(tag.name.strip() for tag in tags_b)
)
tags = ','.join(tags_set) or None
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 7508890d..7ed44e74 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -157,7 +157,8 @@ class Link:
assert isinstance(self.url, str) and '://' in self.url
assert self.updated is None or isinstance(self.updated, datetime)
assert self.title is None or (isinstance(self.title, str) and self.title)
- assert self.tags is None or isinstance(self.tags, str)
+ #for tag in self.tags.all():
+ # assert tag is None or isinstance(tag, TaggedItem)
assert isinstance(self.sources, list)
assert all(isinstance(source, str) and source for source in self.sources)
assert isinstance(self.history, dict)
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index b3ca7231..bd3664da 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -65,7 +65,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
- snap.tags = link.tags
+
+ tag_set = (
+ set(tag.strip() for tag in (link.tags or '').split(','))
+ )
+ tag_list = list(tag_set) or []
+
+ for tag in tag_list:
+ snap.tags.add(tag)
snap.save()
diff --git a/setup.py b/setup.py
index db83e9bf..0272f565 100755
--- a/setup.py
+++ b/setup.py
@@ -80,6 +80,7 @@ setuptools.setup(
"base32-crockford==0.3.0",
"django==3.0.8",
"django-extensions==3.0.3",
+ "django-taggit==1.3.0",
"dateparser",
"ipython",
diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3
new file mode 100755
index 00000000..04d35a71
Binary files /dev/null and b/tests/tags_migration/index.sqlite3 differ
diff --git a/tests/test_init.py b/tests/test_init.py
index d162fa80..72caa6d0 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -4,7 +4,7 @@
import os
import subprocess
from pathlib import Path
-import json
+import json, shutil
import sqlite3
from archivebox.config import OUTPUT_PERMISSIONS
@@ -131,4 +131,44 @@ def test_unrecognized_folders(tmp_path, process, disable_extractors_dict):
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
assert "Skipped adding 1 invalid link data directories" in init_process.stdout.decode("utf-8")
- assert init_process.returncode == 0
\ No newline at end of file
+ assert init_process.returncode == 0
+
+def test_tags_migration(tmp_path, disable_extractors_dict):
+    """Check that ``archivebox init`` migrates legacy tag strings to taggit rows.
+
+    A fixture index that still has the comma-separated ``core_snapshot.tags``
+    column is copied into ``tmp_path``, ``archivebox init`` is run on it, and
+    the rows joined through ``core_taggeditem`` are compared with the tag
+    names parsed from the old column.
+    """
+    base_sqlite_path = Path(__file__).parent / 'tags_migration'
+
+    # copytree() needs the destination to not exist yet.
+    if os.path.exists(tmp_path):
+        shutil.rmtree(tmp_path)
+    shutil.copytree(str(base_sqlite_path), tmp_path)
+    os.chdir(tmp_path)
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        conn.row_factory = sqlite3.Row
+        old_rows = conn.execute("SELECT id, tags from core_snapshot").fetchall()
+    finally:
+        conn.close()
+
+    # Parse the legacy column into a set of names per snapshot so tags are
+    # compared exactly; a NULL column is treated like an empty string.
+    tags_before = {
+        row['id']: {name.strip() for name in (row['tags'] or '').split(',')}
+        for row in old_rows
+    }
+
+    init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
+    assert init_process.returncode == 0, init_process.stderr.decode("utf-8")
+
+    conn = sqlite3.connect("index.sqlite3")
+    try:
+        conn.row_factory = sqlite3.Row
+        new_rows = conn.execute("""
+            SELECT snapshot.id snapshot, tags.name tag
+            FROM core_snapshot snapshot, core_taggeditem snapshot_tagged, taggit_tag tags
+            WHERE
+            snapshot.id = snapshot_tagged.object_id
+            AND tags.id = snapshot_tagged.tag_id
+        """).fetchall()
+    finally:
+        conn.close()
+
+    tags_after = {}
+    for row in new_rows:
+        tags_after.setdefault(row['snapshot'], set()).add(row['tag'])
+
+    # Every migrated tag must be one of the snapshot's previous tags
+    # (set membership, not a substring match against the raw column).
+    for snapshot_id, migrated in tags_after.items():
+        assert migrated <= tags_before[snapshot_id]
+
+    # Every non-empty previous tag must have been migrated.
+    for snapshot_id, expected in tags_before.items():
+        assert expected - {''} <= tags_after.get(snapshot_id, set())
\ No newline at end of file