mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-22 04:03:06 +00:00
create abid_utils with new ABID type for ArchiveBox IDs
This commit is contained in:
parent
f896e5dbeb
commit
4f9f22e024
11 changed files with 572 additions and 146 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -29,6 +29,7 @@ dist/
|
|||
data/
|
||||
data*/
|
||||
output/
|
||||
index.sqlite3
|
||||
|
||||
# vim
|
||||
*.sw?
|
||||
|
|
1
archivebox/abid_utils/__init__.py
Normal file
1
archivebox/abid_utils/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
__package__ = 'abid_utils'
|
174
archivebox/abid_utils/abid.py
Normal file
174
archivebox/abid_utils/abid.py
Normal file
|
@ -0,0 +1,174 @@
|
|||
from typing import NamedTuple, Any, Union, Optional
|
||||
|
||||
import ulid
|
||||
import uuid6
|
||||
import hashlib
|
||||
|
||||
from uuid import UUID
|
||||
from typeid import TypeID # type: ignore[import-untyped]
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
|
||||
ABID_PREFIX_LEN = 4
|
||||
ABID_SUFFIX_LEN = 26
|
||||
ABID_LEN = 30
|
||||
ABID_TS_LEN = 10
|
||||
ABID_URI_LEN = 8
|
||||
ABID_SUBTYPE_LEN = 2
|
||||
ABID_RAND_LEN = 6
|
||||
|
||||
DEFAULT_ABID_PREFIX = 'obj_'
|
||||
|
||||
|
||||
class ABID(NamedTuple):
|
||||
"""
|
||||
e.g. ABID('obj_01HX9FPYTRE4A5CCD901ZYEBQE')
|
||||
"""
|
||||
prefix: str # e.g. obj_
|
||||
ts: str # e.g. 01HX9FPYTR
|
||||
uri: str # e.g. E4A5CCD9
|
||||
subtype: str # e.g. 01
|
||||
rand: str # e.g. ZYEBQE
|
||||
|
||||
def __getattr__(self, attr: str) -> Any:
|
||||
return getattr(self.ulid, attr)
|
||||
|
||||
def __eq__(self, other: Any) -> bool:
|
||||
try:
|
||||
return self.ulid == other.ulid
|
||||
except AttributeError:
|
||||
return NotImplemented
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.prefix + self.suffix
|
||||
|
||||
def __len__(self) -> int:
|
||||
return len(self.prefix + self.suffix)
|
||||
|
||||
@classmethod
|
||||
def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
|
||||
buffer = str(buffer)
|
||||
if '_' in buffer:
|
||||
prefix, suffix = buffer.split('_')
|
||||
else:
|
||||
prefix, suffix = prefix.strip('_'), buffer
|
||||
|
||||
assert len(prefix) == ABID_PREFIX_LEN - 1 # length without trailing _
|
||||
assert len(suffix) == ABID_SUFFIX_LEN
|
||||
|
||||
return cls(
|
||||
prefix=abid_part_from_prefix(prefix),
|
||||
ts=suffix[0:10].upper(),
|
||||
uri=suffix[10:18].upper(),
|
||||
subtype=suffix[18:20].upper(),
|
||||
rand=suffix[20:26].upper(),
|
||||
)
|
||||
|
||||
@property
|
||||
def suffix(self):
|
||||
return ''.join((self.ts, self.uri, self.subtype, self.rand))
|
||||
|
||||
@property
|
||||
def ulid(self) -> ulid.ULID:
|
||||
return ulid.parse(self.suffix)
|
||||
|
||||
@property
|
||||
def uuid(self) -> UUID:
|
||||
return self.ulid.uuid
|
||||
|
||||
@property
|
||||
def uuid6(self) -> uuid6.UUID:
|
||||
return uuid6.UUID(hex=self.uuid.hex)
|
||||
|
||||
@property
|
||||
def typeid(self) -> TypeID:
|
||||
return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)
|
||||
|
||||
@property
|
||||
def datetime(self) -> datetime:
|
||||
return self.ulid.timestamp().datetime
|
||||
|
||||
|
||||
|
||||
####################################################
|
||||
|
||||
|
||||
def uri_hash(uri: Union[str, bytes]) -> str:
|
||||
"""
|
||||
'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
|
||||
"""
|
||||
if isinstance(uri, str):
|
||||
uri = uri.encode('utf-8')
|
||||
|
||||
return hashlib.sha256(uri).hexdigest().upper()
|
||||
|
||||
def abid_part_from_prefix(prefix: Optional[str]) -> str:
|
||||
"""
|
||||
'snp_'
|
||||
"""
|
||||
if prefix is None:
|
||||
return 'obj_'
|
||||
|
||||
prefix = prefix.strip('_').lower()
|
||||
assert len(prefix) == 3
|
||||
return prefix + '_'
|
||||
|
||||
def abid_part_from_uri(uri: str) -> str:
|
||||
"""
|
||||
'E4A5CCD9' # takes first 8 characters of sha256(url)
|
||||
"""
|
||||
return uri_hash(uri)[:ABID_URI_LEN]
|
||||
|
||||
def abid_part_from_ts(ts: Optional[datetime]) -> str:
|
||||
"""
|
||||
'01HX9FPYTR' # produces 10 character Timestamp section of ulid based on added date
|
||||
"""
|
||||
return str(ulid.from_timestamp(ts) if ts else ulid.new())[:ABID_TS_LEN]
|
||||
|
||||
def abid_part_from_subtype(subtype: str) -> str:
|
||||
"""
|
||||
Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
|
||||
Also allows us to change the ulid spec later by putting special sigil values here.
|
||||
"""
|
||||
if len(subtype) == ABID_SUBTYPE_LEN:
|
||||
return subtype
|
||||
|
||||
return hashlib.sha256(subtype.encode('utf-8')).hexdigest()[:ABID_SUBTYPE_LEN]
|
||||
|
||||
def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
|
||||
"""
|
||||
'ZYEBQE' # takes last 6 characters of randomness from existing legacy uuid db field
|
||||
"""
|
||||
if rand is None:
|
||||
# if it's None we generate a new random 6 character hex string
|
||||
return str(ulid.new())[-ABID_RAND_LEN:]
|
||||
elif isinstance(rand, UUID):
|
||||
# if it's a uuid we take the last 6 characters of the ULID represation of it
|
||||
return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]
|
||||
elif isinstance(rand, str):
|
||||
# if it's a string we take the last 6 characters of it verbatim
|
||||
return rand[-ABID_RAND_LEN:]
|
||||
elif isinstance(rand, int):
|
||||
# if it's a BigAutoInteger field we convert it from an int to a 0-padded string
|
||||
rand_str = str(rand)[-ABID_RAND_LEN:]
|
||||
padding_needed = ABID_RAND_LEN - len(rand_str)
|
||||
rand_str = ('0'*padding_needed) + rand_str
|
||||
return rand_str
|
||||
raise NotImplementedError('Random component of an ABID can only be computed from a str or UUID')
|
||||
|
||||
|
||||
def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
|
||||
"""
|
||||
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
|
||||
"""
|
||||
|
||||
abid = ABID(
|
||||
prefix=abid_part_from_prefix(prefix),
|
||||
ts=abid_part_from_ts(ts),
|
||||
uri=abid_part_from_uri(uri),
|
||||
subtype=abid_part_from_subtype(subtype),
|
||||
rand=abid_part_from_rand(rand),
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}'
|
||||
return abid
|
7
archivebox/abid_utils/apps.py
Normal file
7
archivebox/abid_utils/apps.py
Normal file
|
@ -0,0 +1,7 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class AbidUtilsConfig(AppConfig):
|
||||
default_auto_field = 'django.db.models.BigAutoField'
|
||||
|
||||
name = 'abid_utils'
|
0
archivebox/abid_utils/migrations/__init__.py
Normal file
0
archivebox/abid_utils/migrations/__init__.py
Normal file
279
archivebox/abid_utils/models.py
Normal file
279
archivebox/abid_utils/models.py
Normal file
|
@ -0,0 +1,279 @@
|
|||
from typing import Any, Dict, Union, List, Set, cast
|
||||
|
||||
import ulid
|
||||
from uuid import UUID
|
||||
from typeid import TypeID # type: ignore[import-untyped]
|
||||
from datetime import datetime
|
||||
from functools import partial
|
||||
from charidfield import CharIDField # type: ignore[import-untyped]
|
||||
|
||||
from django.db import models
|
||||
from django.db.utils import OperationalError
|
||||
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from .abid import (
|
||||
ABID,
|
||||
ABID_LEN,
|
||||
ABID_RAND_LEN,
|
||||
ABID_SUFFIX_LEN,
|
||||
DEFAULT_ABID_PREFIX,
|
||||
abid_part_from_prefix,
|
||||
abid_from_values
|
||||
)
|
||||
|
||||
####################################################
|
||||
|
||||
|
||||
# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
|
||||
ABIDField = partial(
|
||||
CharIDField,
|
||||
default=ulid.new,
|
||||
max_length=ABID_LEN,
|
||||
help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)"
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
class ABIDModel(models.Model):
|
||||
abid_prefix: str = DEFAULT_ABID_PREFIX # e.g. 'tag_'
|
||||
abid_ts_src = 'None' # e.g. 'self.created'
|
||||
abid_uri_src = 'None' # e.g. 'self.uri'
|
||||
abid_subtype_src = 'None' # e.g. 'self.extractor'
|
||||
abid_rand_src = 'None' # e.g. 'self.uuid' or 'self.id'
|
||||
|
||||
# abid = ABIDField(prefix=abid_prefix, db_index=True, unique=True, null=True, blank=True, editable=True)
|
||||
|
||||
# created = models.DateTimeField(auto_now_add=True, blank=True, null=True, db_index=True)
|
||||
# modified = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
|
||||
# created_by = models.ForeignKeyField(get_user_model(), blank=True, null=True, db_index=True)
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
abstract = True
|
||||
|
||||
def save(self, *args: Any, **kwargs: Any) -> None:
|
||||
if hasattr(self, 'abid'):
|
||||
self.abid: ABID = self.abid or self.calculate_abid()
|
||||
else:
|
||||
print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
|
||||
self.abid = self.calculate_abid()
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
|
||||
def calculate_abid(self) -> ABID:
|
||||
"""
|
||||
Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
|
||||
"""
|
||||
prefix = self.abid_prefix
|
||||
ts = eval(self.abid_ts_src)
|
||||
uri = eval(self.abid_uri_src)
|
||||
subtype = eval(self.abid_subtype_src)
|
||||
rand = eval(self.abid_rand_src)
|
||||
|
||||
if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
|
||||
suggested_abid = self.__class__.__name__[:3].lower()
|
||||
raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')
|
||||
|
||||
if not ts:
|
||||
ts = datetime.utcfromtimestamp(0)
|
||||
print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())
|
||||
|
||||
if not uri:
|
||||
uri = str(self)
|
||||
print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)
|
||||
|
||||
if not subtype:
|
||||
subtype = self.__class__.__name__
|
||||
print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)
|
||||
|
||||
if not rand:
|
||||
rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
|
||||
print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)
|
||||
|
||||
abid = abid_from_values(
|
||||
prefix=prefix,
|
||||
ts=ts,
|
||||
uri=uri,
|
||||
subtype=subtype,
|
||||
rand=rand,
|
||||
)
|
||||
assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
|
||||
return abid
|
||||
|
||||
@property
|
||||
def ABID(self) -> ABID:
|
||||
"""
|
||||
ULIDParts(timestamp='01HX9FPYTR', url='E4A5CCD9', subtype='00', randomness='ZYEBQE')
|
||||
"""
|
||||
return ABID.parse(self.abid) if self.abid else self.calculate_abid()
|
||||
|
||||
@property
|
||||
def ULID(self) -> ulid.ULID:
|
||||
"""
|
||||
Get a ulid.ULID representation of the object's ABID.
|
||||
"""
|
||||
return self.ABID.ulid
|
||||
|
||||
@property
|
||||
def UUID(self) -> UUID:
|
||||
"""
|
||||
Get a uuid.UUID (v4) representation of the object's ABID.
|
||||
"""
|
||||
return self.ABID.uuid
|
||||
|
||||
@property
|
||||
def TypeID(self) -> TypeID:
|
||||
"""
|
||||
Get a typeid.TypeID (stripe-style) representation of the object's ABID.
|
||||
"""
|
||||
return self.ABID.typeid
|
||||
|
||||
|
||||
|
||||
####################################################
|
||||
|
||||
# Django helpers
|
||||
def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
|
||||
"""
|
||||
Return the mapping of all ABID prefixes to their models.
|
||||
e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
|
||||
"""
|
||||
import django.apps
|
||||
prefix_map = {}
|
||||
|
||||
for model in django.apps.apps.get_models():
|
||||
abid_prefix = getattr(model, 'abid_prefix', None)
|
||||
if abid_prefix:
|
||||
prefix_map[abid_prefix] = model
|
||||
return prefix_map
|
||||
|
||||
def find_prefix_for_abid(abid: ABID) -> str:
|
||||
"""
|
||||
Find the correct prefix for a given ABID that may have be missing a prefix (slow).
|
||||
e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
|
||||
"""
|
||||
# if existing abid prefix is correct, lookup is easy
|
||||
model = find_model_from_abid(abid)
|
||||
if model:
|
||||
assert issubclass(model, ABIDModel)
|
||||
return model.abid_prefix
|
||||
|
||||
# prefix might be obj_ or missing, fuzzy-search to find any object that matches
|
||||
return find_obj_from_abid_rand(abid)[0].abid_prefix
|
||||
|
||||
def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
|
||||
"""
|
||||
Return the Django Model that corresponds to a given ABID prefix.
|
||||
e.g. 'tag_' -> core.models.Tag
|
||||
"""
|
||||
prefix = abid_part_from_prefix(prefix)
|
||||
|
||||
import django.apps
|
||||
|
||||
for model in django.apps.apps.get_models():
|
||||
if not issubclass(model, ABIDModel): continue # skip non-ABID-enabled models
|
||||
if not hasattr(model, 'objects'): continue # skip abstract models
|
||||
|
||||
if (model.abid_prefix == prefix):
|
||||
return model
|
||||
|
||||
return None
|
||||
|
||||
def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
|
||||
"""
|
||||
Shortcut for find_model_from_abid_prefix(abid.prefix)
|
||||
"""
|
||||
return find_model_from_abid_prefix(abid.prefix)
|
||||
|
||||
def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
|
||||
"""
|
||||
Find an object corresponding to an ABID by exhaustively searching using its random suffix (slow).
|
||||
e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
|
||||
"""
|
||||
|
||||
# convert str to ABID if necessary
|
||||
if isinstance(rand, ABID):
|
||||
abid: ABID = rand
|
||||
else:
|
||||
rand = str(rand)
|
||||
if len(rand) < ABID_SUFFIX_LEN:
|
||||
padding_needed = ABID_SUFFIX_LEN - len(rand)
|
||||
rand = ('0'*padding_needed) + rand
|
||||
abid = ABID.parse(rand)
|
||||
|
||||
import django.apps
|
||||
|
||||
partial_matches: List[ABIDModel] = []
|
||||
|
||||
models_to_try = cast(Set[type[models.Model]], set(filter(bool, (
|
||||
model,
|
||||
find_model_from_abid(abid),
|
||||
*django.apps.apps.get_models(),
|
||||
))))
|
||||
# print(abid, abid.rand, abid.uuid, models_to_try)
|
||||
|
||||
for model in models_to_try:
|
||||
if not issubclass(model, ABIDModel): continue # skip Models that arent ABID-enabled
|
||||
if not hasattr(model, 'objects'): continue # skip abstract Models
|
||||
assert hasattr(model, 'objects') # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684
|
||||
|
||||
# continue on to try fuzzy searching by randomness portion derived from uuid field
|
||||
try:
|
||||
qs = []
|
||||
if hasattr(model, 'abid'):
|
||||
qs = model.objects.filter(abid__endswith=abid.rand)
|
||||
elif hasattr(model, 'uuid'):
|
||||
qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
|
||||
elif hasattr(model, 'id'):
|
||||
# NOTE: this only works on SQLite where every column is a string
|
||||
# other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field
|
||||
|
||||
# try to search for uuid=...-2354352
|
||||
# try to search for id=...2354352
|
||||
# try to search for id=2354352
|
||||
qs = model.objects.filter(
|
||||
models.Q(id__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
|
||||
| models.Q(id__endswith=abid.rand)
|
||||
| models.Q(id__startswith=str(int(abid.rand)) if abid.rand.isdigit() else abid.rand)
|
||||
)
|
||||
|
||||
for obj in qs:
|
||||
if obj.calculate_abid() == abid:
|
||||
# found exact match, no need to keep iterating
|
||||
return [obj]
|
||||
partial_matches.append(obj)
|
||||
except OperationalError as err:
|
||||
print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')
|
||||
|
||||
return partial_matches
|
||||
|
||||
def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
|
||||
"""
|
||||
Find an object with a given ABID by filtering possible models for a matching abid/uuid/id (fast).
|
||||
e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
|
||||
"""
|
||||
|
||||
model = model or find_model_from_abid(abid)
|
||||
assert model, f'Could not find model that could match this ABID type: {abid}'
|
||||
|
||||
try:
|
||||
if hasattr(model, 'abid'):
|
||||
return model.objects.get(abid__endswith=abid.suffix)
|
||||
if hasattr(model, 'uuid'):
|
||||
return model.objects.get(uuid=abid.uuid)
|
||||
return model.objects.get(id=abid.uuid)
|
||||
except model.DoesNotExist:
|
||||
# if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case
|
||||
if hasattr(model, 'abid') or (not fuzzy):
|
||||
raise
|
||||
|
||||
# continue on to try fuzzy searching by randomness portion derived from uuid field
|
||||
match_by_rand = find_obj_from_abid_rand(abid, model=model)
|
||||
if match_by_rand:
|
||||
if match_by_rand[0].abid_prefix != abid.prefix:
|
||||
print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n')
|
||||
return match_by_rand
|
||||
|
||||
raise model.DoesNotExist
|
||||
|
3
archivebox/abid_utils/tests.py
Normal file
3
archivebox/abid_utils/tests.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
|
@ -12,14 +12,16 @@ from signal_webhooks.models import WebhookBase
|
|||
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from abid_utils.models import ABIDModel
|
||||
|
||||
|
||||
def generate_secret_token() -> str:
|
||||
# returns cryptographically secure string with len() == 32
|
||||
return secrets.token_hex(16)
|
||||
|
||||
|
||||
class APIToken(models.Model):
|
||||
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
|
||||
class APIToken(ABIDModel):
|
||||
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
|
||||
|
||||
user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
|
||||
token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
|
||||
|
|
|
@ -1,15 +1,13 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
|
||||
import uuid
|
||||
import ulid
|
||||
import json
|
||||
import hashlib
|
||||
from typeid import TypeID
|
||||
from typing import Optional, List, Dict
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
import json
|
||||
|
||||
from uuid import uuid4
|
||||
from pathlib import Path
|
||||
from typing import Optional, List, NamedTuple
|
||||
from importlib import import_module
|
||||
|
||||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
|
@ -19,12 +17,15 @@ from django.urls import reverse
|
|||
from django.db.models import Case, When, Value, IntegerField
|
||||
from django.contrib.auth.models import User # noqa
|
||||
|
||||
from abid_utils.models import ABIDModel
|
||||
|
||||
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
||||
from ..system import get_dir_size
|
||||
from ..util import parse_date, base_url, hashurl, domain
|
||||
from ..util import parse_date, base_url
|
||||
from ..index.schema import Link
|
||||
from ..index.html import snapshot_icons
|
||||
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||
|
||||
|
||||
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
|
||||
STATUS_CHOICES = [
|
||||
|
@ -33,24 +34,29 @@ STATUS_CHOICES = [
|
|||
("skipped", "skipped")
|
||||
]
|
||||
|
||||
try:
|
||||
JSONField = models.JSONField
|
||||
except AttributeError:
|
||||
import jsonfield
|
||||
JSONField = jsonfield.JSONField
|
||||
|
||||
|
||||
class ULIDParts(NamedTuple):
|
||||
timestamp: str
|
||||
url: str
|
||||
subtype: str
|
||||
randomness: str
|
||||
# class BaseModel(models.Model):
|
||||
# # TODO: migrate all models to a shared base class with all our standard fields and helpers:
|
||||
# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
|
||||
# #
|
||||
# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
# class Meta(TypedModelMeta):
|
||||
# abstract = True
|
||||
|
||||
|
||||
class Tag(models.Model):
|
||||
class Tag(ABIDModel):
|
||||
"""
|
||||
Based on django-taggit model
|
||||
"""
|
||||
abid_prefix = 'tag_'
|
||||
abid_ts_src = 'None' # TODO: add created/modified time
|
||||
abid_uri_src = 'self.name'
|
||||
abid_subtype_src = '"03"'
|
||||
abid_rand_src = 'self.id'
|
||||
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
|
||||
name = models.CharField(unique=True, blank=False, max_length=100)
|
||||
|
@ -59,7 +65,7 @@ class Tag(models.Model):
|
|||
slug = models.SlugField(unique=True, blank=True, max_length=100)
|
||||
|
||||
|
||||
class Meta:
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = "Tag"
|
||||
verbose_name_plural = "Tags"
|
||||
|
||||
|
@ -95,8 +101,16 @@ class Tag(models.Model):
|
|||
return super().save(*args, **kwargs)
|
||||
|
||||
|
||||
class Snapshot(models.Model):
|
||||
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
|
||||
class Snapshot(ABIDModel):
|
||||
abid_prefix = 'snp_'
|
||||
abid_ts_src = 'self.added'
|
||||
abid_uri_src = 'self.url'
|
||||
abid_subtype_src = '"01"'
|
||||
abid_rand_src = 'self.id'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
|
||||
|
||||
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
url = models.URLField(unique=True, db_index=True)
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
|
||||
|
@ -109,37 +123,6 @@ class Snapshot(models.Model):
|
|||
|
||||
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
|
||||
|
||||
@property
|
||||
def ulid_from_timestamp(self):
|
||||
return str(ulid.from_timestamp(self.added))[:10]
|
||||
|
||||
@property
|
||||
def ulid_from_urlhash(self):
|
||||
return str(ulid.from_randomness(self.url_hash))[10:18]
|
||||
|
||||
@property
|
||||
def ulid_from_type(self):
|
||||
return '00'
|
||||
|
||||
@property
|
||||
def ulid_from_randomness(self):
|
||||
return str(ulid.from_uuid(self.id))[20:]
|
||||
|
||||
@property
|
||||
def ulid_tuple(self) -> ULIDParts:
|
||||
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
|
||||
|
||||
@property
|
||||
def ulid(self):
|
||||
return ulid.parse(''.join(self.ulid_tuple))
|
||||
|
||||
@property
|
||||
def uuid(self):
|
||||
return self.ulid.uuid
|
||||
|
||||
@property
|
||||
def typeid(self):
|
||||
return TypeID.from_uuid(prefix='snapshot', suffix=self.ulid.uuid)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
title = self.title or '-'
|
||||
|
@ -169,7 +152,7 @@ class Snapshot(models.Model):
|
|||
from ..index import load_link_details
|
||||
return load_link_details(self.as_link())
|
||||
|
||||
def tags_str(self, nocache=True) -> str:
|
||||
def tags_str(self, nocache=True) -> str | None:
|
||||
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
|
||||
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
|
||||
if nocache:
|
||||
|
@ -200,14 +183,9 @@ class Snapshot(models.Model):
|
|||
return self.as_link().is_archived
|
||||
|
||||
@cached_property
|
||||
def num_outputs(self):
|
||||
def num_outputs(self) -> int:
|
||||
return self.archiveresult_set.filter(status='succeeded').count()
|
||||
|
||||
@cached_property
|
||||
def url_hash(self):
|
||||
# return hashurl(self.url)
|
||||
return hashlib.sha256(self.url.encode('utf-8')).hexdigest()[:16].upper()
|
||||
|
||||
@cached_property
|
||||
def base_url(self):
|
||||
return base_url(self.url)
|
||||
|
@ -243,7 +221,7 @@ class Snapshot(models.Model):
|
|||
return None
|
||||
|
||||
@cached_property
|
||||
def headers(self) -> Optional[dict]:
|
||||
def headers(self) -> Optional[Dict[str, str]]:
|
||||
try:
|
||||
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
|
||||
except Exception:
|
||||
|
@ -299,30 +277,31 @@ class Snapshot(models.Model):
|
|||
self.tags.add(*tags_id)
|
||||
|
||||
|
||||
def get_storage_dir(self, create=True, symlink=True) -> Path:
|
||||
date_str = self.added.strftime('%Y%m%d')
|
||||
domain_str = domain(self.url)
|
||||
abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
|
||||
# def get_storage_dir(self, create=True, symlink=True) -> Path:
|
||||
# date_str = self.added.strftime('%Y%m%d')
|
||||
# domain_str = domain(self.url)
|
||||
# abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
|
||||
|
||||
if create and not abs_storage_dir.is_dir():
|
||||
abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
# if create and not abs_storage_dir.is_dir():
|
||||
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if symlink:
|
||||
LINK_PATHS = [
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
|
||||
]
|
||||
for link_path in LINK_PATHS:
|
||||
link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
except FileExistsError:
|
||||
link_path.unlink()
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
# if symlink:
|
||||
# LINK_PATHS = [
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
|
||||
# ]
|
||||
# for link_path in LINK_PATHS:
|
||||
# link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# try:
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
# except FileExistsError:
|
||||
# link_path.unlink()
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
|
||||
# return abs_storage_dir
|
||||
|
||||
return abs_storage_dir
|
||||
|
||||
class ArchiveResultManager(models.Manager):
|
||||
def indexable(self, sorted: bool = True):
|
||||
|
@ -335,15 +314,21 @@ class ArchiveResultManager(models.Manager):
|
|||
return qs
|
||||
|
||||
|
||||
class ArchiveResult(models.Model):
|
||||
class ArchiveResult(ABIDModel):
|
||||
abid_prefix = 'res_'
|
||||
abid_ts_src = 'self.snapshot.added'
|
||||
abid_uri_src = 'self.snapshot.url'
|
||||
abid_subtype_src = 'self.extractor'
|
||||
abid_rand_src = 'self.uuid'
|
||||
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
|
||||
|
||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||
uuid = models.UUIDField(default=uuid.uuid4, editable=True)
|
||||
uuid = models.UUIDField(default=uuid4, editable=True)
|
||||
# ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
|
||||
|
||||
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
||||
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
|
||||
cmd = JSONField()
|
||||
cmd = models.JSONField()
|
||||
pwd = models.CharField(max_length=256)
|
||||
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
|
||||
output = models.CharField(max_length=1024)
|
||||
|
@ -353,6 +338,9 @@ class ArchiveResult(models.Model):
|
|||
|
||||
objects = ArchiveResultManager()
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
verbose_name = 'Result'
|
||||
|
||||
def __str__(self):
|
||||
return self.extractor
|
||||
|
||||
|
@ -360,40 +348,6 @@ class ArchiveResult(models.Model):
|
|||
def snapshot_dir(self):
|
||||
return Path(self.snapshot.link_dir)
|
||||
|
||||
@property
|
||||
def ulid_from_timestamp(self):
|
||||
return self.snapshot.ulid_from_timestamp
|
||||
|
||||
@property
|
||||
def ulid_from_urlhash(self):
|
||||
return self.snapshot.ulid_from_urlhash
|
||||
|
||||
@property
|
||||
def ulid_from_snapshot(self):
|
||||
return str(self.snapshot.ulid)[:18]
|
||||
|
||||
@property
|
||||
def ulid_from_type(self):
|
||||
return hashlib.sha256(self.extractor.encode('utf-8')).hexdigest()[:2]
|
||||
|
||||
@property
|
||||
def ulid_from_randomness(self):
|
||||
return str(ulid.from_uuid(self.uuid))[20:]
|
||||
|
||||
@property
|
||||
def ulid_tuple(self) -> ULIDParts:
|
||||
return ULIDParts(self.ulid_from_timestamp, self.ulid_from_urlhash, self.ulid_from_type, self.ulid_from_randomness)
|
||||
|
||||
@property
|
||||
def ulid(self):
|
||||
final_ulid = ulid.parse(''.join(self.ulid_tuple))
|
||||
# TODO: migrate self.uuid to match this new uuid
|
||||
# self.uuid = final_ulid.uuid
|
||||
return final_ulid
|
||||
|
||||
@property
|
||||
def typeid(self):
|
||||
return TypeID.from_uuid(prefix='result', suffix=self.ulid.uuid)
|
||||
|
||||
@property
|
||||
def extractor_module(self):
|
||||
|
@ -422,31 +376,31 @@ class ArchiveResult(models.Model):
|
|||
return Path(self.output_path()).exists()
|
||||
|
||||
|
||||
def get_storage_dir(self, create=True, symlink=True):
|
||||
date_str = self.snapshot.added.strftime('%Y%m%d')
|
||||
domain_str = domain(self.snapshot.url)
|
||||
abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / str(self.ulid)
|
||||
# def get_storage_dir(self, create=True, symlink=True):
|
||||
# date_str = self.snapshot.added.strftime('%Y%m%d')
|
||||
# domain_str = domain(self.snapshot.url)
|
||||
# abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
|
||||
|
||||
if create and not abs_storage_dir.is_dir():
|
||||
abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
# if create and not abs_storage_dir.is_dir():
|
||||
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
if symlink:
|
||||
LINK_PATHS = [
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
|
||||
Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
|
||||
]
|
||||
for link_path in LINK_PATHS:
|
||||
link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
except FileExistsError:
|
||||
link_path.unlink()
|
||||
link_path.symlink_to(abs_storage_dir)
|
||||
# if symlink:
|
||||
# LINK_PATHS = [
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
|
||||
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
|
||||
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
|
||||
# ]
|
||||
# for link_path in LINK_PATHS:
|
||||
# link_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
# try:
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
# except FileExistsError:
|
||||
# link_path.unlink()
|
||||
# link_path.symlink_to(abs_storage_dir)
|
||||
|
||||
return abs_storage_dir
|
||||
# return abs_storage_dir
|
||||
|
||||
def symlink_index(self, create=True):
|
||||
abs_result_dir = self.get_storage_dir(create=create)
|
||||
# def symlink_index(self, create=True):
|
||||
# abs_result_dir = self.get_storage_dir(create=create)
|
||||
|
|
|
@ -62,6 +62,7 @@ INSTALLED_APPS = [
|
|||
'django.contrib.staticfiles',
|
||||
'django.contrib.admin',
|
||||
|
||||
'abid_utils',
|
||||
'core',
|
||||
'api',
|
||||
|
||||
|
@ -258,6 +259,9 @@ DATABASES = {
|
|||
},
|
||||
}
|
||||
|
||||
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
|
||||
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
|
||||
|
||||
|
||||
CACHES = {
|
||||
'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
|
||||
|
|
|
@ -39,6 +39,7 @@ dependencies = [
|
|||
"django-admin-data-views>=0.3.1",
|
||||
"ulid-py>=1.1.0",
|
||||
"typeid-python>=0.3.0",
|
||||
"django-charid-field>=0.4",
|
||||
]
|
||||
|
||||
homepage = "https://github.com/ArchiveBox/ArchiveBox"
|
||||
|
|
Loading…
Reference in a new issue