mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
fix abid generation migrations to be historically consistent
This commit is contained in:
parent
506b3d28d4
commit
9273db528e
6 changed files with 99 additions and 31 deletions
|
@ -2,7 +2,7 @@
|
|||
|
||||
from django.db import migrations
|
||||
from datetime import datetime
|
||||
from abid_utils.abid import abid_from_values
|
||||
from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT
|
||||
|
||||
|
||||
def calculate_abid(self):
|
||||
|
@ -41,6 +41,7 @@ def calculate_abid(self):
|
|||
uri=uri,
|
||||
subtype=subtype,
|
||||
rand=rand,
|
||||
salt=DEFAULT_ABID_URI_SALT,
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
|
||||
return abid
|
||||
|
@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor):
|
|||
|
||||
snapshot.abid = calculate_abid(snapshot)
|
||||
snapshot.uuid = snapshot.abid.uuid
|
||||
snapshot.id = snapshot.abid.uuid
|
||||
snapshot.save(update_fields=["abid", "uuid", "id"])
|
||||
snapshot.save(update_fields=["abid", "uuid"])
|
||||
|
||||
def generate_archiveresult_abids(apps, schema_editor):
|
||||
print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
|
||||
|
|
|
@ -4,29 +4,89 @@ from django.db import migrations
|
|||
|
||||
from django.db import migrations
|
||||
from datetime import datetime
|
||||
from abid_utils.abid import ABID
|
||||
from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
|
||||
|
||||
|
||||
def calculate_abid(self):
    """
    Return a freshly derived ABID for ``self``, built from the expressions in its ``abid_*_src`` attributes.

    ``self.abid_ts_src``, ``self.abid_uri_src``, ``self.abid_subtype_src`` and
    ``self.abid_rand_src`` are Python expression strings; each one is evaluated
    here with ``self`` in scope. Any component that evaluates to a falsy value
    is replaced by a placeholder and a warning is printed.

    This is a copy kept alongside the migration code, so its behaviour should
    not be changed: previously generated ABIDs must stay reproducible.

    Returns:
        The value returned by ``abid_from_values(...)``, called with
        ``salt=DEFAULT_ABID_URI_SALT``.

    Raises:
        Exception: if ``self.abid_prefix`` is empty or still ``'obj_'``.
        AssertionError: if the result lacks a ``ulid``, ``uuid`` or ``typeid``.
    """
    # NOTE(review): eval() runs in this function's local scope, which is why the
    # parameter must stay named `self`. The expression strings are assigned by
    # project code; never feed externally supplied strings in here.
    prefix = self.abid_prefix
    ts = eval(self.abid_ts_src)
    uri = eval(self.abid_uri_src)
    subtype = eval(self.abid_subtype_src)
    rand = eval(self.abid_rand_src)

    # Refuse to derive an ABID for a class that never declared its own prefix.
    if not prefix or prefix == 'obj_':
        suggested_abid = self.__class__.__name__[:3].lower()
        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

    # Placeholder timestamp: the Unix epoch (naive datetime).
    if not ts:
        ts = datetime.utcfromtimestamp(0)
        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

    # Placeholder uri: the object's string representation.
    if not uri:
        uri = str(self)
        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

    # Placeholder subtype: the class name.
    if not subtype:
        subtype = self.__class__.__name__
        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

    # Placeholder rand: first truthy of uuid, id, then pk (pk has no default,
    # so a missing attribute raises AttributeError).
    if not rand:
        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

    components = {
        'prefix': prefix,
        'ts': ts,
        'uri': uri,
        'subtype': subtype,
        'rand': rand,
        'salt': DEFAULT_ABID_URI_SALT,
    }
    abid = abid_from_values(**components)

    # Sanity check: every representation of the new ABID must be populated.
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid
|
||||
|
||||
def update_snapshot_ids(apps, schema_editor):
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
num_total = Snapshot.objects.all().count()
|
||||
print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
|
||||
for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
|
||||
assert snapshot.abid
|
||||
snapshot.uuid = ABID.parse(snapshot.abid).uuid
|
||||
snapshot.save(update_fields=["uuid"])
|
||||
snapshot.abid_prefix = 'snp_'
|
||||
snapshot.abid_ts_src = 'self.added'
|
||||
snapshot.abid_uri_src = 'self.url'
|
||||
snapshot.abid_subtype_src = '"01"'
|
||||
snapshot.abid_rand_src = 'self.uuid'
|
||||
|
||||
snapshot.abid = calculate_abid(snapshot)
|
||||
snapshot.uuid = snapshot.abid.uuid
|
||||
snapshot.save(update_fields=["abid", "uuid"])
|
||||
assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
|
||||
if idx % 1000 == 0:
|
||||
print(f'Migrated {idx}/{num_total} Snapshot objects...')
|
||||
|
||||
def update_archiveresult_ids(apps, schema_editor):
|
||||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
ArchiveResult = apps.get_model("core", "ArchiveResult")
|
||||
num_total = ArchiveResult.objects.all().count()
|
||||
print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
|
||||
assert result.abid
|
||||
result.abid_prefix = 'res_'
|
||||
result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
|
||||
result.snapshot_added = result.snapshot.added
|
||||
result.snapshot_url = result.snapshot.url
|
||||
result.abid_ts_src = 'self.snapshot_added'
|
||||
result.abid_uri_src = 'self.snapshot_url'
|
||||
result.abid_subtype_src = 'self.extractor'
|
||||
result.abid_rand_src = 'self.id'
|
||||
|
||||
result.abid = calculate_abid(result)
|
||||
result.uuid = result.abid.uuid
|
||||
result.uuid = ABID.parse(result.abid).uuid
|
||||
result.save(update_fields=["uuid"])
|
||||
result.save(update_fields=["abid", "uuid"])
|
||||
assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
|
||||
if idx % 5000 == 0:
|
||||
print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
|
||||
|
|
|
@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor):
|
|||
Snapshot = apps.get_model("core", "Snapshot")
|
||||
num_total = ArchiveResult.objects.all().count()
|
||||
print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()):
|
||||
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
|
||||
assert result.snapshot_old_id
|
||||
snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id)
|
||||
snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
|
||||
result.snapshot_id = snapshot.id
|
||||
result.save(update_fields=["snapshot_id"])
|
||||
assert str(result.snapshot_id) == str(snapshot.id)
|
||||
|
|
|
@ -17,7 +17,6 @@ from django.utils.text import slugify
|
|||
from django.core.cache import cache
|
||||
from django.urls import reverse, reverse_lazy
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
from django.contrib.auth.models import User # noqa
|
||||
|
||||
from abid_utils.models import ABIDModel, ABIDField
|
||||
|
||||
|
@ -36,6 +35,8 @@ STATUS_CHOICES = [
|
|||
("skipped", "skipped")
|
||||
]
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
|
||||
# class BaseModel(models.Model):
|
||||
|
@ -49,24 +50,26 @@ STATUS_CHOICES = [
|
|||
# abstract = True
|
||||
|
||||
|
||||
|
||||
|
||||
class Tag(ABIDModel):
|
||||
"""
|
||||
Based on django-taggit model + ABID base.
|
||||
"""
|
||||
abid_prefix = 'tag_'
|
||||
abid_ts_src = 'self.created' # TODO: add created/modified time
|
||||
abid_uri_src = 'self.name'
|
||||
abid_uri_src = 'self.slug'
|
||||
abid_subtype_src = '"03"'
|
||||
abid_rand_src = 'self.id'
|
||||
abid_rand_src = 'self.old_id'
|
||||
|
||||
# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
uuid = models.UUIDField(default=uuid.uuid4, null=True, unique=True)
|
||||
old_id = models.BigIntegerField(unique=True, default=rand_int_id, serialize=False, verbose_name='Old ID') # legacy PK
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False, unique=True)
|
||||
abid = ABIDField(prefix=abid_prefix)
|
||||
|
||||
|
||||
name = models.CharField(unique=True, blank=False, max_length=100)
|
||||
slug = models.SlugField(unique=True, blank=True, max_length=100)
|
||||
slug = models.SlugField(unique=True, blank=False, max_length=100, editable=False)
|
||||
# slug is autoset on save from name, never set it manually
|
||||
|
||||
|
||||
|
@ -77,9 +80,9 @@ class Tag(ABIDModel):
|
|||
def __str__(self):
|
||||
return self.name
|
||||
|
||||
@property
|
||||
def old_id(self):
|
||||
return self.id
|
||||
# @property
|
||||
# def old_id(self):
|
||||
# return self.id
|
||||
|
||||
def slugify(self, tag, i=None):
|
||||
slug = slugify(tag)
|
||||
|
@ -156,16 +159,19 @@ class Snapshot(ABIDModel):
|
|||
return self.id
|
||||
|
||||
def __repr__(self) -> str:
|
||||
title = self.title or '-'
|
||||
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
|
||||
title = (self.title_stripped or '-')[:64]
|
||||
return f'[{self.timestamp}] {self.url[:64]} ({title})'
|
||||
|
||||
def __str__(self) -> str:
|
||||
title = self.title or '-'
|
||||
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
|
||||
title = (self.title_stripped or '-')[:64]
|
||||
return f'[{self.timestamp}] {self.url[:64]} ({title})'
|
||||
|
||||
def save(self, *args, **kwargs):
|
||||
super().save(*args, **kwargs)
|
||||
assert str(self.id) == str(self.abid.uuid) == str(self.uuid)
|
||||
try:
|
||||
assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'Snapshot.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
|
||||
except AssertionError as e:
|
||||
print(e)
|
||||
|
||||
@classmethod
|
||||
def from_json(cls, info: dict):
|
||||
|
@ -357,9 +363,6 @@ class ArchiveResultManager(models.Manager):
|
|||
qs = qs.annotate(indexing_precedence=Case(*precedence, default=Value(1000),output_field=IntegerField())).order_by('indexing_precedence')
|
||||
return qs
|
||||
|
||||
def rand_int_id():
|
||||
return random.getrandbits(32)
|
||||
|
||||
class ArchiveResult(ABIDModel):
|
||||
abid_prefix = 'res_'
|
||||
abid_ts_src = 'self.snapshot.added'
|
||||
|
@ -387,7 +390,8 @@ class ArchiveResult(ABIDModel):
|
|||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = 'Result'
|
||||
verbose_name = 'Archive Result'
|
||||
verbose_name_plural = 'Archive Results Log'
|
||||
|
||||
|
||||
def __str__(self):
|
||||
|
@ -395,7 +399,10 @@ class ArchiveResult(ABIDModel):
|
|||
|
||||
def save(self, *args, **kwargs):
|
||||
super().save(*args, **kwargs)
|
||||
assert str(self.id) == str(self.abid.uuid) == str(self.uuid)
|
||||
try:
|
||||
assert str(self.id) == str(self.ABID.uuid) == str(self.uuid), f'ArchiveResult.id ({self.id}) does not match .ABID.uuid ({self.ABID.uuid})'
|
||||
except AssertionError as e:
|
||||
print(e)
|
||||
|
||||
@property
|
||||
def uuid(self):
|
||||
|
|
|
@ -83,7 +83,7 @@ INSTALLED_APPS = [
|
|||
'django.contrib.staticfiles',
|
||||
'django.contrib.admin',
|
||||
'django_jsonform',
|
||||
|
||||
|
||||
'signal_webhooks',
|
||||
'abid_utils',
|
||||
'plugantic',
|
||||
|
|
|
@ -181,6 +181,7 @@ class SnapshotView(View):
|
|||
except (IndexError, ValueError):
|
||||
slug, archivefile = path.split('/', 1)[0], 'index.html'
|
||||
|
||||
|
||||
# slug is a timestamp
|
||||
if slug.replace('.','').isdigit():
|
||||
|
||||
|
|
Loading…
Reference in a new issue