From 4ae186dfca02cdd4ad29b1edb46d2c13eb900ecf Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 20 Aug 2024 05:56:19 -0700 Subject: [PATCH] fix ABID generation consistency when self._state.adding is True --- archivebox/abid_utils/models.py | 48 ++++++++++++++++++------ archivebox/api/models.py | 8 ++-- archivebox/core/admin.py | 66 +++++++++++++++++++++++++-------- archivebox/core/models.py | 2 +- 4 files changed, 91 insertions(+), 33 deletions(-) diff --git a/archivebox/abid_utils/models.py b/archivebox/abid_utils/models.py index 9d0ab1d5..c27f85ec 100644 --- a/archivebox/abid_utils/models.py +++ b/archivebox/abid_utils/models.py @@ -1,7 +1,7 @@ """ This file provides the Django ABIDField and ABIDModel base model to inherit from. -It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id. +It implements the ArchiveBox ID (ABID) interfaces including abid_values, generate_abid, .abid, .uuid, .id. """ from typing import Any, Dict, Union, List, Set, NamedTuple, cast @@ -82,14 +82,17 @@ class ABIDModel(models.Model): abstract = True def save(self, *args: Any, **kwargs: Any) -> None: - if hasattr(self, 'abid'): - # self.abid = ABID.parse(self.abid) if self.abid else self.get_abid() - self.abid = self.get_abid() - else: - print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!') - self.abid = self.get_abid() - + # when first creating a row, self.ABID is the source of truth + # overwrite default prefilled self.id & self.abid with generated self.ABID value + if self._state.adding or not self.id: + self.id = self.ABID.uuid + if self._state.adding or not self.abid: + self.abid = str(self.ABID) + super().save(*args, **kwargs) + assert str(self.id) == str(self.ABID.uuid), f'self.id {self.id} does not match self.ABID {self.ABID.uuid}' + assert str(self.abid) == str(self.ABID), f'self.abid {self.id} does not match self.ABID {self.ABID.uuid}' + @property def abid_values(self) -> Dict[str, Any]: @@ -101,7 +104,7 @@ class ABIDModel(models.Model): 'rand': eval(self.abid_rand_src), } - def get_abid(self) -> ABID: + def generate_abid(self) -> ABID: """ Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src). """ @@ -143,7 +146,30 @@ class ABIDModel(models.Model): """ ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE') """ - return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid() + abid = None + try: + abid = abid or ABID.parse(self.pk) + except Exception: + pass + + try: + abid = abid or ABID.parse(self.id) + except Exception: + pass + + try: + abid = abid or ABID.parse(self.uuid) + except Exception: + pass + + try: + abid = abid or ABID.parse(self.abid) + except Exception: + pass + + abid = abid or self.generate_abid() + + return abid @property def ULID(self) -> ULID: @@ -276,7 +302,7 @@ def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDMode ) for obj in qs: - if obj.get_abid() == abid: + if obj.generate_abid() == abid: # found exact match, no need to keep iterating return [obj] partial_matches.append(obj) diff --git a/archivebox/api/models.py b/archivebox/api/models.py index 0be6c322..b3861000 100644 --- a/archivebox/api/models.py +++ b/archivebox/api/models.py @@ -55,11 +55,9 @@ class APIToken(ABIDModel): def __json__(self) -> dict: return { "TYPE": "APIToken", - "uuid": str(self.id), - "ulid": str(self.ulid), - "abid": str(self.get_abid()), - "user_id": str(self.user.id), - "user_username": self.user.username, + "id": str(self.pk), + "abid": str(self.ABID), + "created_by_id": str(self.created_by_id), "token": self.token, "created": self.created.isoformat(), "expires": self.expires_as_iso8601, diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 78b6bdf8..36ed74df 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,6 +1,8 @@ __package__ = 'archivebox.core' +import os import json + from io import StringIO from pathlib import Path from contextlib import redirect_stdout @@ -197,28 +199,29 @@ def get_abid_info(self, obj): {}     📖 API DOCS

-     TS:                  {}        ({})
-     URI:                 {}           ({})
-     SUBTYPE:       {} ({})     -   RAND:   {} ({})     -   SALT:   {} -

    .abid:                   {}
    .abid.uuid:           {}
    .id:                       {}
+
+     TS:                  {}   {}        {}: {}
+     URI:                 {}     {}           {}: {} +   SALT:   {}
+     SUBTYPE:       {}           {}                           {}: {}
+     RAND:             {}       {}                 {}: {} +

    .old_id:                {}
''', obj.api_url, obj.api_url, obj.api_docs_url, - obj.ABID.ts, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], - obj.ABID.uri, str(obj.abid_values['uri']), - obj.ABID.subtype, str(obj.abid_values['subtype']), - obj.ABID.rand, str(obj.abid_values['rand'])[-7:], - obj.ABID.uri_salt, str(obj.abid), str(obj.ABID.uuid), - obj.id, - getattr(obj, 'old_id', ''), + str(obj.id), + obj.ABID.ts, str(obj.ABID.uuid)[0:14], obj.abid_ts_src, obj.abid_values['ts'].isoformat() if isinstance(obj.abid_values['ts'], datetime) else obj.abid_values['ts'], + obj.ABID.uri, str(obj.ABID.uuid)[14:26], obj.abid_uri_src, str(obj.abid_values['uri']), + obj.ABID.uri_salt, + obj.ABID.subtype, str(obj.ABID.uuid)[26:28], obj.abid_subtype_src, str(obj.abid_values['subtype']), + obj.ABID.rand, str(obj.ABID.uuid)[28:36], obj.abid_rand_src, str(obj.abid_values['rand'])[-7:], + str(getattr(obj, 'old_id', '')), ) @@ -568,9 +571,9 @@ class TagAdmin(admin.ModelAdmin): class ArchiveResultAdmin(admin.ModelAdmin): list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('snapshot_info', 'tags_str', 'created', 'modified', 'API') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created', 'modified', 'API', 'output_summary') search_fields = ('id', 'old_id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', *readonly_fields) + fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields) autocomplete_fields = ['snapshot'] list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') @@ -593,6 +596,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): try: return get_abid_info(self, obj) except Exception as e: + raise e return str(e) @admin.display( @@ -606,7 +610,7 @@ class ArchiveResultAdmin(admin.ModelAdmin): '
{}
', ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), ) - + def output_str(self, result): return format_html( '↗️
{}
', @@ -614,3 +618,33 @@ class ArchiveResultAdmin(admin.ModelAdmin): result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', result.output, ) + + def output_summary(self, result): + snapshot_dir = Path(OUTPUT_DIR) / str(result.pwd).split('data/', 1)[-1] + output_str = format_html( + '
{}

', + result.output, + ) + output_str += format_html('See result files ...
', str(result.snapshot.timestamp))
+        path_from_output_str = (snapshot_dir / result.output)
+        output_str += format_html('{}/{}

', str(snapshot_dir), str(result.output)) + if path_from_output_str.exists(): + root_dir = str(path_from_output_str) + else: + root_dir = str(snapshot_dir) + + + # print(root_dir, str(list(os.walk(root_dir)))) + + for root, dirs, files in os.walk(root_dir): + depth = root.replace(root_dir, '').count(os.sep) + 1 + if depth > 2: + continue + indent = ' ' * 4 * (depth) + output_str += format_html('{}{}/
', indent, os.path.basename(root)) + indentation_str = ' ' * 4 * (depth + 1) + for filename in sorted(files): + is_hidden = filename.startswith('.') + output_str += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) + + return output_str + format_html('
') diff --git a/archivebox/core/models.py b/archivebox/core/models.py index c2b6d4e6..372e68a0 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -372,7 +372,7 @@ class ArchiveResult(ABIDModel): abid_ts_src = 'self.snapshot.added' abid_uri_src = 'self.snapshot.url' abid_subtype_src = 'self.extractor' - abid_rand_src = 'self.id' + abid_rand_src = 'self.old_id' EXTRACTOR_CHOICES = EXTRACTOR_CHOICES old_id = models.BigIntegerField(default=rand_int_id, serialize=False, verbose_name='Old ID')