Refactor Snapshot and ArchiveResult to use ulid and typeid instead of uuidv4 (#1430)

Fixes: https://github.com/ArchiveBox/ArchiveBox/issues/74
This commit is contained in:
Nick Sweeting 2024-06-02 17:53:53 -07:00 committed by GitHub
commit 3114980eeb
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
34 changed files with 1349 additions and 180 deletions

1
.gitignore vendored
View file

@ -29,6 +29,7 @@ dist/
data/ data/
data*/ data*/
output/ output/
index.sqlite3
# vim # vim
*.sw? *.sw?

View file

@ -1,7 +1,4 @@
__package__ = 'archivebox' __package__ = 'archivebox'
# monkey patch django timezone to add back utc (it was removed in Django 5.0) from .monkey_patches import *
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc

View file

@ -0,0 +1 @@
__package__ = 'abid_utils'

View file

@ -0,0 +1,191 @@
from typing import NamedTuple, Any, Union, Optional
import ulid
import uuid6
import hashlib
from urllib.parse import urlparse
from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
# Fixed character widths of an ABID and of each of its segments.
ABID_PREFIX_LEN = 4  # 3-letter type prefix plus the trailing underscore, e.g. 'obj_'
ABID_SUFFIX_LEN = 26  # everything after the prefix: ts + uri + subtype + rand (10 + 8 + 2 + 6)
ABID_LEN = 30  # ABID_PREFIX_LEN + ABID_SUFFIX_LEN
ABID_TS_LEN = 10  # timestamp segment, e.g. '01HX9FPYTR'
ABID_URI_LEN = 8  # truncated uri hash segment, e.g. 'E4A5CCD9'
ABID_SUBTYPE_LEN = 2  # subtype segment, e.g. '01'
ABID_RAND_LEN = 6  # randomness segment, e.g. 'ZYEBQE'
DEFAULT_ABID_PREFIX = 'obj_'  # generic prefix used when no specific prefix is supplied
class ABID(NamedTuple):
    """
    ArchiveBox ID: a short type prefix followed by a 26-character suffix made
    of four fixed-width segments (ts + uri + subtype + rand).

    e.g. ABID.parse('obj_01HX9FPYTRE4A5CCD901ZYEBQE')
    """
    prefix: str   # e.g. obj_
    ts: str       # e.g. 01HX9FPYTR
    uri: str      # e.g. E4A5CCD9
    subtype: str  # e.g. 01
    rand: str     # e.g. ZYEBQE

    def __getattr__(self, attr: str) -> Any:
        # Only invoked for names not found through normal lookup:
        # delegate those to the ULID form of this ABID.
        as_ulid = self.ulid
        return getattr(as_ulid, attr)

    def __eq__(self, other: Any) -> bool:
        # Equality is decided by the ULID derived from the suffix only, so the
        # prefix does not take part in the comparison.
        # NOTE(review): no matching __hash__ is defined in this class - confirm
        # hashing stays consistent with this equality before using ABIDs as
        # dict keys or set members.
        try:
            mine = self.ulid
            theirs = other.ulid
            return mine == theirs
        except AttributeError:
            # `other` has no .ulid: let Python try the reflected comparison
            return NotImplemented

    def __str__(self) -> str:
        return self.prefix + self.suffix

    def __len__(self) -> int:
        # length of the rendered string, not the number of tuple fields
        rendered = self.prefix + self.suffix
        return len(rendered)

    @classmethod
    def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
        """
        Build an ABID from the string form of `buffer`.

        If the string contains an underscore it is split into prefix and
        suffix; otherwise the whole string is the suffix and the `prefix`
        argument is used. Every suffix segment is upper-cased.

        Raises AssertionError when `buffer` is falsy, when the prefix is not
        3 characters long, or when the suffix is not ABID_SUFFIX_LEN long.
        """
        assert buffer, f'Attempted to create ABID from null value {buffer}'

        buffer = str(buffer)
        if '_' not in buffer:
            prefix, suffix = prefix.strip('_'), buffer
        else:
            prefix, suffix = buffer.split('_')

        assert len(prefix) == ABID_PREFIX_LEN - 1   # length without trailing _
        assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long'

        # segment boundaries inside the suffix: [0:10] [10:18] [18:20] [20:26]
        ts_end = ABID_TS_LEN
        uri_end = ts_end + ABID_URI_LEN
        subtype_end = uri_end + ABID_SUBTYPE_LEN
        rand_end = subtype_end + ABID_RAND_LEN

        return cls(
            prefix=abid_part_from_prefix(prefix),
            ts=suffix[:ts_end].upper(),
            uri=suffix[ts_end:uri_end].upper(),
            subtype=suffix[uri_end:subtype_end].upper(),
            rand=suffix[subtype_end:rand_end].upper(),
        )

    @property
    def suffix(self):
        """The four segments after the prefix, concatenated in order."""
        segments = [self.ts, self.uri, self.subtype, self.rand]
        return ''.join(segments)

    @property
    def ulid(self) -> ulid.ULID:
        """The suffix parsed as a ulid.ULID."""
        return ulid.parse(self.suffix)

    @property
    def uuid(self) -> UUID:
        """The UUID form of this ABID's ULID."""
        return self.ulid.uuid

    @property
    def uuid6(self) -> uuid6.UUID:
        """The same hex value as .uuid, wrapped in a uuid6.UUID."""
        hex_value = self.uuid.hex
        return uuid6.UUID(hex=hex_value)

    @property
    def typeid(self) -> TypeID:
        """A TypeID built from the prefix (underscores stripped) and .uuid6."""
        bare_prefix = self.prefix.strip('_')
        return TypeID.from_uuid(prefix=bare_prefix, suffix=self.uuid6)

    @property
    def datetime(self) -> datetime:
        """The datetime encoded in the timestamp portion of the ULID."""
        return self.ulid.timestamp().datetime
####################################################
def uri_hash(uri: Union[str, bytes]) -> str:
    """
    Return the upper-case hex SHA-256 digest of a URI.

    For values containing '://' only the network location (domain) part is
    hashed, so every URL on the same domain produces the same digest. If the
    URL cannot be parsed, or has no network location, the whole string is
    hashed instead.

    Args:
        uri: the URI as str, or as bytes (decoded with the default codec).

    Returns:
        64 upper-case hex characters, e.g.
        'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25'
    """
    uri_str: str = uri.decode() if isinstance(uri, bytes) else uri

    # only hash the domain part of URLs
    if '://' in uri_str:
        try:
            domain = urlparse(uri_str).netloc
        except (ValueError, AttributeError):
            # urlparse raises ValueError for malformed netlocs (e.g. an
            # unbalanced '['); fall back to hashing the full string
            domain = ''
        if domain:
            uri_str = domain

    return hashlib.sha256(uri_str.encode('utf-8')).hexdigest().upper()
def abid_part_from_prefix(prefix: Optional[str]) -> str:
    """
    Normalize a type prefix to lower-case with exactly one trailing underscore.

    None yields 'obj_'. Leading/trailing underscores are stripped before the
    length check; anything that is not 3 characters long afterwards fails an
    assertion.

    e.g. 'SNP' -> 'snp_'
    """
    if prefix is None:
        return 'obj_'

    letters = prefix.strip('_').lower()
    assert len(letters) == 3
    return f'{letters}_'
def abid_part_from_uri(uri: str) -> str:
    """
    Return the uri segment of an ABID: the first ABID_URI_LEN characters of
    uri_hash(str(uri)).

    e.g. 'E4A5CCD9'
    """
    digest = uri_hash(str(uri))
    return digest[:ABID_URI_LEN]
def abid_part_from_ts(ts: Optional[datetime]) -> str:
    """
    Return the timestamp segment of an ABID: the first ABID_TS_LEN characters
    of a ULID built from `ts`, or of a freshly generated ULID when `ts` is falsy.

    e.g. '01HX9FPYTR'
    """
    if ts:
        source = ulid.from_timestamp(ts)
    else:
        source = ulid.new()
    return str(source)[:ABID_TS_LEN]
def abid_part_from_subtype(subtype: str) -> str:
    """
    Return the subtype segment of an ABID.

    Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
    Also allows us to change the ulid spec later by putting special sigil values here.

    A value whose string form is already ABID_SUBTYPE_LEN characters long is
    returned verbatim; any other value is replaced by the first
    ABID_SUBTYPE_LEN upper-cased hex characters of its SHA-256 digest.
    """
    subtype = str(subtype)
    if len(subtype) != ABID_SUBTYPE_LEN:
        digest = hashlib.sha256(subtype.encode('utf-8')).hexdigest()
        return digest[:ABID_SUBTYPE_LEN].upper()
    return subtype
def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
    """
    Return the randomness segment of an ABID (ABID_RAND_LEN characters).

    e.g. 'ZYEBQE'

    - None: the tail of a newly generated ULID
    - UUID: the tail of the ULID representation of that UUID
    - int:  the last digits of the number, left-padded with zeros
    - other: the tail of str(rand), upper-cased
    """
    if rand is None:
        # nothing to derive from: take the tail of a brand new ULID
        return str(ulid.new())[-ABID_RAND_LEN:]

    if isinstance(rand, UUID):
        # legacy uuid field: take the tail of its ULID representation
        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]

    if isinstance(rand, int):
        # integer primary key: keep the last digits and zero-pad on the left
        digits = str(rand)[-ABID_RAND_LEN:]
        return digits.rjust(ABID_RAND_LEN, '0')

    # otherwise treat it as a string and take its last characters verbatim
    return str(rand)[-ABID_RAND_LEN:].upper()
def abid_from_values(prefix, ts, uri, subtype, rand) -> ABID:
    """
    Assemble an ABID from raw source values by running each one through its
    abid_part_from_* helper (prefix, ts, uri, subtype, rand - in that order).

    Asserts that the result converts to a ULID, a UUID and a TypeID.
    """
    parts = {
        'prefix': abid_part_from_prefix(prefix),
        'ts': abid_part_from_ts(ts),
        'uri': abid_part_from_uri(uri),
        'subtype': abid_part_from_subtype(subtype),
        'rand': abid_part_from_rand(rand),
    }
    abid = ABID(**parts)
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}'
    return abid

View file

@ -0,0 +1,7 @@
from django.apps import AppConfig
class AbidUtilsConfig(AppConfig):
    """Django application config for the abid_utils app."""

    name = 'abid_utils'
    default_auto_field = 'django.db.models.BigAutoField'

View file

@ -0,0 +1,314 @@
"""
This file provides the Django ABIDField and ABIDModel base model to inherit from.
It implements the ArchiveBox ID (ABID) interfaces including abid_values, get_abid, .abid, .uuid, .id.
"""
from typing import Any, Dict, Union, List, Set, NamedTuple, cast
from ulid import ULID
from uuid import uuid4, UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
from functools import partial
from charidfield import CharIDField # type: ignore[import-untyped]
from django.conf import settings
from django.db import models
from django.db.utils import OperationalError
from django.contrib.auth import get_user_model
from django_stubs_ext.db.models import TypedModelMeta
from .abid import (
ABID,
ABID_LEN,
ABID_RAND_LEN,
ABID_SUFFIX_LEN,
DEFAULT_ABID_PREFIX,
abid_part_from_prefix,
abid_from_values
)
####################################################
# Database Field for typeid/ulid style IDs with a prefix, e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ
# Field factory for ABID columns: CharIDField with the ABID defaults pre-bound
# through functools.partial. Call sites supply the remaining arguments
# (ABIDModel below passes prefix=abid_prefix).
ABIDField = partial(
    CharIDField,
    max_length=ABID_LEN,  # room for the prefix plus the 26-character suffix
    help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
    default=None,  # no value is generated by the field itself
    null=True,
    blank=True,
    db_index=True,
    unique=True,
)
def get_or_create_system_user_pk(username='system'):
    """
    Return the primary key of the user that should own new DB rows by default.

    If exactly one superuser exists, that user's pk is returned. Otherwise a
    dedicated user named `username` is fetched, or created with
    is_staff=True and is_superuser=True if it does not exist yet.

    Args:
        username: name of the dedicated fallback user.

    Returns:
        The pk of the selected user.
    """
    User = get_user_model()

    # if exactly one superuser exists, use it (fetch at most two pks so a
    # single query answers both "is there exactly one?" and "which one?")
    superuser_pks = list(User.objects.filter(is_superuser=True).values_list('pk', flat=True)[:2])
    if len(superuser_pks) == 1:
        return superuser_pks[0]

    # otherwise, get or create a dedicated "system" user. Look up by username
    # only: keeping the flags in the lookup would make get_or_create() attempt
    # to insert a duplicate username whenever that user already exists
    # without them. The flags are applied on creation only.
    user, _created = User.objects.get_or_create(
        username=username,
        defaults={'is_staff': True, 'is_superuser': True, 'email': '', 'password': ''},
    )
    return user.pk
class ABIDModel(models.Model):
    """
    Abstract Base Model for other models to depend on. Provides ArchiveBox ID (ABID) interface.

    Subclasses set `abid_prefix` and the four `abid_*_src` strings. Each
    `abid_*_src` is a Python expression (evaluated with `self` in scope) that
    yields the raw value for the corresponding ABID segment.
    """
    abid_prefix: str = DEFAULT_ABID_PREFIX  # e.g. 'tag_'
    abid_ts_src = 'None'                    # e.g. 'self.created'
    abid_uri_src = 'None'                   # e.g. 'self.uri'
    abid_subtype_src = 'None'               # e.g. 'self.extractor'
    abid_rand_src = 'None'                  # e.g. 'self.uuid' or 'self.id'

    id = models.UUIDField(primary_key=True, default=uuid4, editable=True)
    uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
    abid = ABIDField(prefix=abid_prefix)

    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
    created = models.DateTimeField(auto_now_add=True)
    modified = models.DateTimeField(auto_now=True)

    class Meta(TypedModelMeta):
        abstract = True

    def save(self, *args: Any, **kwargs: Any) -> None:
        """Recalculate `self.abid` from the current field values, then save."""
        if not hasattr(self, 'abid'):
            print(f'[!] WARNING: {self.__class__.__name__}.abid is not a DB field so ABID will not be persisted!')
        # the ABID is always re-derived, never taken from the stored value
        self.abid = self.get_abid()
        super().save(*args, **kwargs)

    @property
    def abid_values(self) -> Dict[str, Any]:
        """
        Raw source values for each ABID segment, keyed by segment name.

        NOTE: the abid_*_src strings are evaluated with eval(); they must only
        ever be developer-defined class attributes, never external input.
        """
        return {
            'prefix': self.abid_prefix,
            'ts': eval(self.abid_ts_src),
            'uri': eval(self.abid_uri_src),
            'subtype': eval(self.abid_subtype_src),
            'rand': eval(self.abid_rand_src),
        }

    def get_abid(self) -> ABID:
        """
        Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).

        Missing ts/uri/subtype/rand values are replaced by placeholders and a
        warning is printed for each one.

        Raises:
            Exception: if abid_prefix is unset or still the default prefix.
        """
        from datetime import timezone   # local import: only needed for the epoch placeholder

        prefix, ts, uri, subtype, rand = self.abid_values.values()

        if (not prefix) or prefix == DEFAULT_ABID_PREFIX:
            suggested_abid = self.__class__.__name__[:3].lower()
            raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

        if not ts:
            # timezone-aware epoch: a naive utcfromtimestamp(0) value is read
            # as *local* time when converted back to an epoch timestamp, so
            # the placeholder would not be zero outside UTC
            ts = datetime.fromtimestamp(0, tz=timezone.utc)
            print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

        if not uri:
            uri = str(self)
            print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

        if not subtype:
            subtype = self.__class__.__name__
            print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

        if not rand:
            # fall back to the first non-empty identifier on the row
            rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
            print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

        abid = abid_from_values(
            prefix=prefix,
            ts=ts,
            uri=uri,
            subtype=subtype,
            rand=rand,
        )
        assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
        return abid

    @property
    def ABID(self) -> ABID:
        """
        The object's ABID: parsed from the stored `abid` value when one is
        set, otherwise freshly derived with get_abid().

        e.g. ABID(prefix='obj_', ts='01HX9FPYTR', uri='E4A5CCD9', subtype='00', rand='ZYEBQE')
        """
        return ABID.parse(self.abid) if getattr(self, 'abid', None) else self.get_abid()

    @property
    def ULID(self) -> ULID:
        """
        Get a ulid.ULID representation of the object's ABID.
        """
        return self.ABID.ulid

    @property
    def UUID(self) -> UUID:
        """
        Get a uuid.UUID representation of the object's ABID (the UUID form of its ULID).
        """
        return self.ABID.uuid

    @property
    def TypeID(self) -> TypeID:
        """
        Get a typeid.TypeID (stripe-style) representation of the object's ABID.
        """
        return self.ABID.typeid
####################################################
# Django helpers
def find_all_abid_prefixes() -> Dict[str, type[models.Model]]:
    """
    Map every ABID prefix to the registered Django model that declares it.
    Models without a truthy `abid_prefix` attribute are left out.

    e.g. {'tag_': core.models.Tag, 'snp_': core.models.Snapshot, ...}
    """
    import django.apps

    return {
        abid_prefix: model
        for model in django.apps.apps.get_models()
        if (abid_prefix := getattr(model, 'abid_prefix', None))
    }
def find_prefix_for_abid(abid: ABID) -> str:
    """
    Find the correct prefix for a given ABID whose prefix may be missing (slow).

    e.g. ABID('obj_01BJQMF54D093DXEAWZ6JYRPAQ') -> 'snp_'
    """
    model = find_model_from_abid(abid)
    if not model:
        # prefix might be obj_ or missing: fuzzy-search for any object that
        # matches and use the prefix of the first hit
        first_match = find_obj_from_abid_rand(abid)[0]
        return first_match.abid_prefix

    # the existing prefix already maps to a model, so the lookup is easy
    assert issubclass(model, ABIDModel)
    return model.abid_prefix
def find_model_from_abid_prefix(prefix: str) -> type[ABIDModel] | None:
    """
    Return the first registered ABIDModel subclass whose abid_prefix equals
    the normalized `prefix`, or None when no model matches.

    e.g. 'tag_' -> core.models.Tag
    """
    prefix = abid_part_from_prefix(prefix)

    import django.apps

    candidates = (
        model
        for model in django.apps.apps.get_models()
        if issubclass(model, ABIDModel)       # skip non-ABID-enabled models
        and hasattr(model, 'objects')         # skip models without a manager
        and model.abid_prefix == prefix
    )
    return next(candidates, None)
def find_model_from_abid(abid: ABID) -> type[models.Model] | None:
    """
    Look up the Django model for an ABID via its prefix
    (delegates to find_model_from_abid_prefix).
    """
    abid_prefix = abid.prefix
    return find_model_from_abid_prefix(abid_prefix)
def find_obj_from_abid_rand(rand: Union[ABID, str], model=None) -> List[ABIDModel]:
    """
    Find objects matching an ABID by searching every ABID-enabled model for
    rows whose identifier ends with the ABID's randomness segment (slow).

    Returns a single-element list as soon as an object's freshly derived ABID
    equals the one searched for; otherwise returns every partial match found.

    e.g. 'obj_....................JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
    """
    if isinstance(rand, ABID):
        abid: ABID = rand
    else:
        # plain strings are left-padded with zeros up to a full-length suffix
        abid = ABID.parse(str(rand).rjust(ABID_SUFFIX_LEN, '0'))

    import django.apps

    partial_matches: List[ABIDModel] = []

    # the explicitly requested model, the model implied by the prefix, and
    # every registered model, minus any None entries
    models_to_try = cast(Set[type[models.Model]], {
        candidate
        for candidate in (
            model,
            find_model_from_abid(abid),
            *django.apps.apps.get_models(),
        )
        if candidate
    })

    for model in models_to_try:
        if not issubclass(model, ABIDModel):
            continue   # skip Models that are not ABID-enabled
        if not hasattr(model, 'objects'):
            continue   # skip Models without a manager
        assert hasattr(model, 'objects')    # force-fix for type hint nit about missing manager https://github.com/typeddjango/django-stubs/issues/1684

        # fuzzy search by the randomness portion, using whichever identifier
        # field the model has
        try:
            if hasattr(model, 'abid'):
                qs = model.objects.filter(abid__endswith=abid.rand)
            elif hasattr(model, 'uuid'):
                qs = model.objects.filter(uuid__endswith=str(abid.uuid)[-ABID_RAND_LEN:])
            elif hasattr(model, 'id'):
                # NOTE: this only works on SQLite where every column is a string
                # other DB backends like postgres dont let you do __endswith if this is a BigAutoInteger field
                uuid_tail = str(abid.uuid)[-ABID_RAND_LEN:]
                id_head = str(int(abid.rand)) if abid.rand.isdigit() else abid.rand
                qs = model.objects.filter(
                    models.Q(id__endswith=uuid_tail)         # id=...-2354352
                    | models.Q(id__endswith=abid.rand)       # id=...2354352
                    | models.Q(id__startswith=id_head)       # id=2354352
                )
            else:
                qs = []

            for obj in qs:
                if obj.get_abid() == abid:
                    # found exact match, no need to keep iterating
                    return [obj]
                partial_matches.append(obj)
        except OperationalError as err:
            print(f'[!] WARNING: Got error while trying to iterate through QuerySet for {model}:', err, '\n')

    return partial_matches
def find_obj_from_abid(abid: ABID, model=None, fuzzy=False) -> Any:
    """
    Find an object with a given ABID by querying the matching model on its
    abid, uuid, or id field - whichever the model has, in that order (fast).

    When nothing matches, the model has no `abid` attribute, and `fuzzy` is
    true, falls back to find_obj_from_abid_rand().
    NOTE(review): the fuzzy fallback returns the *list* of matches rather than
    a single object - confirm that callers expect this.

    Raises model.DoesNotExist when no object can be found.

    e.g. 'snp_01BJQMF54D093DXEAWZ6JYRPAQ' -> Snapshot('snp_01BJQMF54D093DXEAWZ6JYRPAQ')
    """
    model = model or find_model_from_abid(abid)
    assert model, f'Could not find model that could match this ABID type: {abid}'

    # pick the most specific lookup the model supports
    if hasattr(model, 'abid'):
        lookup = {'abid__endswith': abid.suffix}
    elif hasattr(model, 'uuid'):
        lookup = {'uuid': abid.uuid}
    else:
        lookup = {'id': abid.uuid}

    try:
        return model.objects.get(**lookup)
    except model.DoesNotExist:
        # if the model has an abid field then it shouldve matched, pointless to fuzzy search in that case
        if hasattr(model, 'abid') or (not fuzzy):
            raise

    # continue on to try fuzzy searching by randomness portion derived from uuid field
    match_by_rand = find_obj_from_abid_rand(abid, model=model)
    if not match_by_rand:
        raise model.DoesNotExist

    if match_by_rand[0].abid_prefix != abid.prefix:
        print(f'[!] WARNING: fetched object {match_by_rand} even though prefix {abid.prefix} doesnt match!', abid, '\n')
    return match_by_rand

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -3,5 +3,9 @@ __package__ = 'archivebox.api'
from django.apps import AppConfig from django.apps import AppConfig
class APIConfig(AppConfig): class APIConfig(AppConfig):
name = 'api' name = 'api'
def ready(self):
pass

View file

@ -0,0 +1,60 @@
# Generated by Django 5.0.6 on 2024-05-13 10:58
import charidfield.fields
import signal_webhooks.fields
import signal_webhooks.utils
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
    """
    Generated schema migration for the `api` app.

    Creates the OutboundWebhook model, adds `abid` and `uuid` columns to
    APIToken, redefines APIToken.id as a UUID primary key, and adds a
    uniqueness constraint on OutboundWebhook (ref, endpoint).
    """

    dependencies = [
        ('api', '0002_alter_apitoken_options'),
    ]

    operations = [
        # New model: `uuid` is the primary key here
        migrations.CreateModel(
            name='OutboundWebhook',
            fields=[
                ('name', models.CharField(db_index=True, help_text='Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).', max_length=255, unique=True, verbose_name='name')),
                ('signal', models.CharField(choices=[('CREATE', 'Create'), ('UPDATE', 'Update'), ('DELETE', 'Delete'), ('M2M', 'M2M changed'), ('CREATE_OR_UPDATE', 'Create or Update'), ('CREATE_OR_DELETE', 'Create or Delete'), ('CREATE_OR_M2M', 'Create or M2M changed'), ('UPDATE_OR_DELETE', 'Update or Delete'), ('UPDATE_OR_M2M', 'Update or M2M changed'), ('DELETE_OR_M2M', 'Delete or M2M changed'), ('CREATE_UPDATE_OR_DELETE', 'Create, Update or Delete'), ('CREATE_UPDATE_OR_M2M', 'Create, Update or M2M changed'), ('CREATE_DELETE_OR_M2M', 'Create, Delete or M2M changed'), ('UPDATE_DELETE_OR_M2M', 'Update, Delete or M2M changed'), ('CREATE_UPDATE_DELETE_OR_M2M', 'Create, Update or Delete, or M2M changed')], help_text='The type of event the webhook should fire for (e.g. Create, Update, Delete).', max_length=255, verbose_name='signal')),
                ('ref', models.CharField(db_index=True, help_text='Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).', max_length=1023, validators=[signal_webhooks.utils.model_from_reference], verbose_name='referenced model')),
                ('endpoint', models.URLField(help_text='External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).', max_length=2047, verbose_name='endpoint')),
                ('headers', models.JSONField(blank=True, default=dict, help_text='Headers to send with the webhook request.', validators=[signal_webhooks.utils.is_dict], verbose_name='headers')),
                ('auth_token', signal_webhooks.fields.TokenField(blank=True, default='', help_text='Authentication token to use in an Authorization header.', max_length=8000, validators=[signal_webhooks.utils.decode_cipher_key], verbose_name='authentication token')),
                ('enabled', models.BooleanField(default=True, help_text='Is this webhook enabled?', verbose_name='enabled')),
                ('keep_last_response', models.BooleanField(default=False, help_text='Should the webhook keep a log of the latest response it got?', verbose_name='keep last response')),
                ('created', models.DateTimeField(auto_now_add=True, help_text='When the webhook was created.', verbose_name='created')),
                ('updated', models.DateTimeField(auto_now=True, help_text='When the webhook was last updated.', verbose_name='updated')),
                ('last_response', models.CharField(blank=True, default='', help_text='Latest response to this webhook.', max_length=8000, verbose_name='last response')),
                ('last_success', models.DateTimeField(default=None, help_text='When the webhook last succeeded.', null=True, verbose_name='last success')),
                ('last_failure', models.DateTimeField(default=None, help_text='When the webhook last failed.', null=True, verbose_name='last failure')),
                ('uuid', models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False)),
                # prefix is recorded without a trailing underscore in this migration
                ('abid', charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk', unique=True)),
            ],
            options={
                'verbose_name': 'API Outbound Webhook',
                'abstract': False,
            },
        ),
        # APIToken gains a nullable, unique ABID column (prefix 'apt')
        migrations.AddField(
            model_name='apitoken',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt', unique=True),
        ),
        # APIToken gains a nullable, unique secondary uuid column
        migrations.AddField(
            model_name='apitoken',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        # APIToken.id is (re)declared as a UUID primary key defaulting to uuid4
        migrations.AlterField(
            model_name='apitoken',
            name='id',
            field=models.UUIDField(default=uuid.uuid4, primary_key=True, serialize=False),
        ),
        # at most one webhook per (ref, endpoint) pair
        migrations.AddConstraint(
            model_name='outboundwebhook',
            constraint=models.UniqueConstraint(fields=('ref', 'endpoint'), name='prevent_duplicate_hooks_api_outboundwebhook'),
        ),
    ]

View file

@ -0,0 +1,58 @@
# Generated by Django 5.0.6 on 2024-05-13 14:36
import abid_utils.models
import charidfield.fields
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    """
    Generated schema migration for the `api` app.

    Renames APIToken.user to created_by, adds `modified` timestamps, adds
    `created_by` and a nullable `id` to OutboundWebhook, and re-declares both
    `abid` columns with a trailing underscore in their prefix.
    """

    dependencies = [
        ('api', '0003_outboundwebhook_apitoken_abid_apitoken_uuid_and_more'),
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # APIToken.user -> APIToken.created_by
        migrations.RenameField(
            model_name='apitoken',
            old_name='user',
            new_name='created_by',
        ),
        migrations.AddField(
            model_name='apitoken',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        # the owner defaults to whatever get_or_create_system_user_pk() returns
        migrations.AddField(
            model_name='outboundwebhook',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        # OutboundWebhook.id is a nullable, unique UUID column (not the primary key)
        migrations.AddField(
            model_name='outboundwebhook',
            name='id',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        migrations.AddField(
            model_name='outboundwebhook',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        # prefix changes from 'apt' to 'apt_'
        migrations.AlterField(
            model_name='apitoken',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='apt_', unique=True),
        ),
        # prefix changes from 'whk' to 'whk_'
        migrations.AlterField(
            model_name='outboundwebhook',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='whk_', unique=True),
        ),
        # `created` is re-declared without the help_text/verbose_name it was created with
        migrations.AlterField(
            model_name='outboundwebhook',
            name='created',
            field=models.DateTimeField(auto_now_add=True),
        ),
    ]

View file

@ -8,23 +8,40 @@ from django.conf import settings
from django.db import models from django.db import models
from django.utils import timezone from django.utils import timezone
from signal_webhooks.models import WebhookBase
from django_stubs_ext.db.models import TypedModelMeta from django_stubs_ext.db.models import TypedModelMeta
from abid_utils.models import ABIDModel, ABIDField
def generate_secret_token() -> str: def generate_secret_token() -> str:
# returns cryptographically secure string with len() == 32 # returns cryptographically secure string with len() == 32
return secrets.token_hex(16) return secrets.token_hex(16)
class APIToken(models.Model): class APIToken(ABIDModel):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) """
A secret key generated by a User that's used to authenticate REST API requests to ArchiveBox.
"""
# ABID: apt_<created_ts>_<token_hash>_<user_id_hash>_<uuid_rand>
abid_prefix = 'apt_'
abid_ts_src = 'self.created'
abid_uri_src = 'self.token'
abid_subtype_src = 'self.user_id'
abid_rand_src = 'self.id'
user = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE) id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE)
token = models.CharField(max_length=32, default=generate_secret_token, unique=True) token = models.CharField(max_length=32, default=generate_secret_token, unique=True)
created = models.DateTimeField(auto_now_add=True) created = models.DateTimeField(auto_now_add=True)
expires = models.DateTimeField(null=True, blank=True) expires = models.DateTimeField(null=True, blank=True)
class Meta(TypedModelMeta): class Meta(TypedModelMeta):
verbose_name = "API Key" verbose_name = "API Key"
verbose_name_plural = "API Keys" verbose_name_plural = "API Keys"
@ -38,7 +55,8 @@ class APIToken(models.Model):
def __json__(self) -> dict: def __json__(self) -> dict:
return { return {
"TYPE": "APIToken", "TYPE": "APIToken",
"id": str(self.id), "uuid": str(self.id),
"abid": str(self.get_abid()),
"user_id": str(self.user.id), "user_id": str(self.user.id),
"user_username": self.user.username, "user_username": self.user.username,
"token": self.token, "token": self.token,
@ -61,3 +79,37 @@ class APIToken(models.Model):
return True return True
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
class OutboundWebhook(ABIDModel, WebhookBase):
    """
    Model used in place of (extending) signals_webhooks.models.WebhookModel. Swapped using:
    settings.SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'

    ABID segments: ts from `created`, uri from `endpoint`, subtype from `ref`,
    rand from `id`.
    """
    abid_prefix = 'whk_'
    abid_ts_src = 'self.created'
    abid_uri_src = 'self.endpoint'
    abid_subtype_src = 'self.ref'
    # NOTE(review): `id` is nullable on this model (see below) - confirm what
    # the rand segment should be derived from when it is empty.
    abid_rand_src = 'self.id'

    # Here `uuid` is the primary key and `id` is the nullable secondary column.
    id = models.UUIDField(blank=True, null=True, unique=True, editable=True)
    uuid = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
    abid = ABIDField(prefix=abid_prefix)

    # Override the help_text of fields declared on WebhookBase. These
    # assignments run once, while this class body is executed, and mutate the
    # field objects obtained from WebhookBase._meta.
    WebhookBase._meta.get_field('name').help_text = (
        'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).')
    WebhookBase._meta.get_field('signal').help_text = (
        'The type of event the webhook should fire for (e.g. Create, Update, Delete).')
    WebhookBase._meta.get_field('ref').help_text = (
        'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).')
    WebhookBase._meta.get_field('endpoint').help_text = (
        'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).')

    class Meta(WebhookBase.Meta):
        verbose_name = 'API Outbound Webhook'

View file

@ -47,6 +47,6 @@ def check_api_token(request, token_data: TokenAuthSchema):
request=request, request=request,
) )
if user: if user:
return {"success": True, "user_id": str(user.id)} return {"success": True, "user_id": str(user.pk)}
return {"success": False, "user_id": None} return {"success": False, "user_id": None}

View file

@ -4,13 +4,14 @@ from uuid import UUID
from typing import List, Optional from typing import List, Optional
from datetime import datetime from datetime import datetime
from django.db.models import Q
from django.shortcuts import get_object_or_404 from django.shortcuts import get_object_or_404
from ninja import Router, Schema, FilterSchema, Field, Query from ninja import Router, Schema, FilterSchema, Field, Query
from ninja.pagination import paginate from ninja.pagination import paginate
from core.models import Snapshot, ArchiveResult, Tag from core.models import Snapshot, ArchiveResult, Tag
from abid_utils.abid import ABID
router = Router(tags=['Core Models']) router = Router(tags=['Core Models'])
@ -20,24 +21,39 @@ router = Router(tags=['Core Models'])
### ArchiveResult ######################################################################### ### ArchiveResult #########################################################################
class ArchiveResultSchema(Schema): class ArchiveResultSchema(Schema):
id: UUID abid: str
uuid: UUID
pk: str
modified: datetime
created: datetime
created_by_id: str
snapshot_id: UUID snapshot_abid: str
snapshot_url: str snapshot_url: str
snapshot_tags: str snapshot_tags: str
extractor: str extractor: str
cmd_version: str
cmd: List[str] cmd: List[str]
pwd: str pwd: str
cmd_version: str
output: str
status: str status: str
output: str
created: datetime
@staticmethod @staticmethod
def resolve_id(obj): def resolve_created_by_id(obj):
return obj.uuid return str(obj.created_by_id)
@staticmethod
def resolve_pk(obj):
return str(obj.pk)
@staticmethod
def resolve_uuid(obj):
return str(obj.uuid)
@staticmethod
def resolve_abid(obj):
return str(obj.ABID)
@staticmethod @staticmethod
def resolve_created(obj): def resolve_created(obj):
@ -47,18 +63,23 @@ class ArchiveResultSchema(Schema):
def resolve_snapshot_url(obj): def resolve_snapshot_url(obj):
return obj.snapshot.url return obj.snapshot.url
@staticmethod
def resolve_snapshot_abid(obj):
return str(obj.snapshot.ABID)
@staticmethod @staticmethod
def resolve_snapshot_tags(obj): def resolve_snapshot_tags(obj):
return obj.snapshot.tags_str() return obj.snapshot.tags_str()
class ArchiveResultFilterSchema(FilterSchema): class ArchiveResultFilterSchema(FilterSchema):
id: Optional[UUID] = Field(None, q='uuid') uuid: Optional[UUID] = Field(None, q='uuid')
# abid: Optional[str] = Field(None, q='abid')
search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains']) search: Optional[str] = Field(None, q=['snapshot__url__icontains', 'snapshot__title__icontains', 'snapshot__tags__name__icontains', 'extractor', 'output__icontains'])
snapshot_id: Optional[UUID] = Field(None, q='snapshot_id') snapshot_uuid: Optional[UUID] = Field(None, q='snapshot_uuid__icontains')
snapshot_url: Optional[str] = Field(None, q='snapshot__url') snapshot_url: Optional[str] = Field(None, q='snapshot__url__icontains')
snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name') snapshot_tag: Optional[str] = Field(None, q='snapshot__tags__name__icontains')
status: Optional[str] = Field(None, q='status') status: Optional[str] = Field(None, q='status')
output: Optional[str] = Field(None, q='output__icontains') output: Optional[str] = Field(None, q='output__icontains')
@ -75,6 +96,7 @@ class ArchiveResultFilterSchema(FilterSchema):
@router.get("/archiveresults", response=List[ArchiveResultSchema]) @router.get("/archiveresults", response=List[ArchiveResultSchema])
@paginate @paginate
def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)): def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)):
"""List all ArchiveResult entries matching these filters."""
qs = ArchiveResult.objects.all() qs = ArchiveResult.objects.all()
results = filters.filter(qs) results = filters.filter(qs)
return results return results
@ -82,8 +104,8 @@ def list_archiveresults(request, filters: ArchiveResultFilterSchema = Query(...)
@router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema) @router.get("/archiveresult/{archiveresult_id}", response=ArchiveResultSchema)
def get_archiveresult(request, archiveresult_id: str): def get_archiveresult(request, archiveresult_id: str):
archiveresult = get_object_or_404(ArchiveResult, id=archiveresult_id) """Get a specific ArchiveResult by abid, uuid, or pk."""
return archiveresult return ArchiveResult.objects.get(Q(pk__icontains=archiveresult_id) | Q(abid__icontains=archiveresult_id) | Q(uuid__icontains=archiveresult_id))
# @router.post("/archiveresult", response=ArchiveResultSchema) # @router.post("/archiveresult", response=ArchiveResultSchema)
@ -115,27 +137,50 @@ def get_archiveresult(request, archiveresult_id: str):
class SnapshotSchema(Schema): class SnapshotSchema(Schema):
id: UUID abid: str
uuid: UUID
pk: str
modified: datetime
created: datetime
created_by_id: str
url: str url: str
tags: str tags: str
title: Optional[str] title: Optional[str]
timestamp: str timestamp: str
bookmarked: datetime
added: datetime
updated: datetime
archive_path: str archive_path: str
bookmarked: datetime
added: datetime
updated: Optional[datetime]
num_archiveresults: int
archiveresults: List[ArchiveResultSchema] archiveresults: List[ArchiveResultSchema]
# @staticmethod @staticmethod
# def resolve_id(obj): def resolve_created_by_id(obj):
# return str(obj.id) return str(obj.created_by_id)
@staticmethod
def resolve_pk(obj):
return str(obj.pk)
@staticmethod
def resolve_uuid(obj):
return str(obj.uuid)
@staticmethod
def resolve_abid(obj):
return str(obj.ABID)
@staticmethod @staticmethod
def resolve_tags(obj): def resolve_tags(obj):
return obj.tags_str() return obj.tags_str()
@staticmethod
def resolve_num_archiveresults(obj, context):
return obj.archiveresult_set.all().distinct().count()
@staticmethod @staticmethod
def resolve_archiveresults(obj, context): def resolve_archiveresults(obj, context):
if context['request'].with_archiveresults: if context['request'].with_archiveresults:
@ -144,23 +189,32 @@ class SnapshotSchema(Schema):
class SnapshotFilterSchema(FilterSchema):
    """
    Query-string filters accepted by the Snapshot list endpoint.

    Every filter is optional and defaults to None; each field's ``q`` names
    the ORM lookup (or list of lookups) the supplied value is applied to.
    """
    # identifier filters (substring match)
    abid: Optional[str] = Field(None, q='abid__icontains')
    uuid: Optional[str] = Field(None, q='uuid__icontains')
    pk: Optional[str] = Field(None, q='pk__icontains')

    # ownership / bookkeeping filters
    created_by_id: Optional[str] = Field(None, q='created_by_id__icontains')
    created__gte: Optional[datetime] = Field(None, q='created__gte')
    created__lt: Optional[datetime] = Field(None, q='created__lt')
    created: Optional[datetime] = Field(None, q='created')
    modified: Optional[datetime] = Field(None, q='modified')
    modified__gte: Optional[datetime] = Field(None, q='modified__gte')
    modified__lt: Optional[datetime] = Field(None, q='modified__lt')

    # free-text search across several columns at once
    search: Optional[str] = Field(None, q=['url__icontains', 'title__icontains', 'tags__name__icontains', 'abid__icontains', 'uuid__icontains'])

    # snapshot content filters
    url: Optional[str] = Field(None, q='url')
    tag: Optional[str] = Field(None, q='tags__name')
    title: Optional[str] = Field(None, q='title__icontains')
    timestamp: Optional[str] = Field(None, q='timestamp__startswith')
    added: Optional[datetime] = Field(None, q='added')
    added__gte: Optional[datetime] = Field(None, q='added__gte')
    added__lt: Optional[datetime] = Field(None, q='added__lt')
@router.get("/snapshots", response=List[SnapshotSchema]) @router.get("/snapshots", response=List[SnapshotSchema])
@paginate @paginate
def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True): def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_archiveresults: bool=True):
"""List all Snapshot entries matching these filters."""
request.with_archiveresults = with_archiveresults request.with_archiveresults = with_archiveresults
qs = Snapshot.objects.all() qs = Snapshot.objects.all()
@ -169,8 +223,24 @@ def list_snapshots(request, filters: SnapshotFilterSchema = Query(...), with_arc
@router.get("/snapshot/{snapshot_id}", response=SnapshotSchema)
def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
    """
    Get a specific Snapshot by abid, uuid, or pk.

    Lookups are attempted from most to least specific:
      1. uuid / abid / pk starting with ``snapshot_id``
      2. uuid / abid containing ``snapshot_id`` (case-insensitive)

    ``with_archiveresults`` is stashed on the request so the response schema
    can read it when serializing.

    Raises:
        Http404: if no Snapshot matches any lookup, or if a lookup is
            ambiguous and matches more than one Snapshot.
    """
    from django.http import Http404

    request.with_archiveresults = with_archiveresults

    lookups = (
        Q(uuid__startswith=snapshot_id) | Q(abid__startswith=snapshot_id) | Q(pk__startswith=snapshot_id),
        Q(uuid__icontains=snapshot_id) | Q(abid__icontains=snapshot_id),
    )
    for lookup in lookups:
        try:
            return Snapshot.objects.get(lookup)
        except Snapshot.DoesNotExist:
            # fall through to the next, looser lookup
            continue
        except Snapshot.MultipleObjectsReturned as err:
            # never guess between several snapshots
            raise Http404(f'Multiple Snapshots match "{snapshot_id}", use a longer identifier') from err

    # Previously this fell through to an unfiltered .get() and then returned None.
    raise Http404(f'No Snapshot matches "{snapshot_id}"')
@ -179,9 +249,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
# snapshot = Snapshot.objects.create(**payload.dict()) # snapshot = Snapshot.objects.create(**payload.dict())
# return snapshot # return snapshot
# #
# @router.put("/snapshot/{snapshot_id}", response=SnapshotSchema) # @router.put("/snapshot/{snapshot_uuid}", response=SnapshotSchema)
# def update_snapshot(request, snapshot_id: str, payload: SnapshotSchema): # def update_snapshot(request, snapshot_uuid: str, payload: SnapshotSchema):
# snapshot = get_object_or_404(Snapshot, id=snapshot_id) # snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
# #
# for attr, value in payload.dict().items(): # for attr, value in payload.dict().items():
# setattr(snapshot, attr, value) # setattr(snapshot, attr, value)
@ -189,9 +259,9 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
# #
# return snapshot # return snapshot
# #
# @router.delete("/snapshot/{snapshot_id}") # @router.delete("/snapshot/{snapshot_uuid}")
# def delete_snapshot(request, snapshot_id: str): # def delete_snapshot(request, snapshot_uuid: str):
# snapshot = get_object_or_404(Snapshot, id=snapshot_id) # snapshot = get_object_or_404(Snapshot, uuid=snapshot_uuid)
# snapshot.delete() # snapshot.delete()
# return {"success": True} # return {"success": True}
@ -201,10 +271,21 @@ def get_snapshot(request, snapshot_id: str, with_archiveresults: bool=True):
class TagSchema(Schema):
    """
    Response schema describing a single Tag.

    ``abid`` and ``pk`` are serialized as strings: the ABID is a prefixed
    text identifier and the Tag primary key is not a UUID, so typing them as
    UUID makes response validation fail.
    """
    abid: Optional[str] = None
    uuid: Optional[UUID] = None
    pk: Optional[str] = None

    modified: datetime
    created: datetime
    created_by_id: str

    name: str
    slug: str

    @staticmethod
    def resolve_pk(obj):
        # pk may be an integer; the schema exposes it as a string
        return str(obj.pk)

    @staticmethod
    def resolve_created_by_id(obj):
        # user ids are exposed as strings regardless of the underlying pk type
        return str(obj.created_by_id)
@router.get("/tags", response=List[TagSchema])
def list_tags(request):
    """Return the queryset of all Tag entries (no filters are applied here)."""
    all_tags = Tag.objects.all()
    return all_tags

View file

@ -37,7 +37,10 @@ is_valid_cli_module = lambda module, subcommand: (
) )
def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=('MainThread', 'ThreadPoolExecutor'), timeout: int=60) -> int: IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread') # threads we dont have to wait for before exiting
def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: Iterable[str]=IGNORED_BG_THREADS, timeout: int=60) -> int:
""" """
Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks. Block until the specified threads exit. e.g. pass thread_names=('default_hook_handler',) to wait for webhooks.
Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes. Useful for waiting for signal handlers, webhooks, etc. to finish running after a mgmt command completes.

View file

@ -37,7 +37,7 @@ from sqlite3 import dbapi2 as sqlite3
from hashlib import md5 from hashlib import md5
from pathlib import Path from pathlib import Path
from datetime import datetime, timezone from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict, Union, List from typing import Optional, Type, Tuple, Dict, Union, List, Any
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser from configparser import ConfigParser
from collections import defaultdict from collections import defaultdict

View file

@ -15,8 +15,7 @@ from django.contrib.auth import get_user_model
from django import forms from django import forms
from signal_webhooks.apps import DjangoSignalWebhooksConfig from signal_webhooks.admin import WebhookAdmin, get_webhook_model
from signal_webhooks.admin import WebhookAdmin, WebhookModel
from ..util import htmldecode, urldecode, ansi_to_html from ..util import htmldecode, urldecode, ansi_to_html
@ -104,23 +103,14 @@ class ArchiveBoxAdmin(admin.AdminSite):
return render(template_name='add.html', request=request, context=context) return render(template_name='add.html', request=request, context=context)
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
DjangoSignalWebhooksConfig.verbose_name = 'API'
WebhookModel._meta.get_field('name').help_text = 'Give your webhook a descriptive name (e.g. Notify ACME Slack channel of any new ArchiveResults).'
WebhookModel._meta.get_field('signal').help_text = 'The type of event the webhook should fire for (e.g. Create, Update, Delete).'
WebhookModel._meta.get_field('ref').help_text = 'Dot import notation of the model the webhook should fire for (e.g. core.models.Snapshot or core.models.ArchiveResult).'
WebhookModel._meta.get_field('endpoint').help_text = 'External URL to POST the webhook notification to (e.g. https://someapp.example.com/webhook/some-webhook-receiver).'
WebhookModel._meta.app_label = 'api'
archivebox_admin = ArchiveBoxAdmin() archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model()) archivebox_admin.register(get_user_model())
archivebox_admin.register(APIToken) archivebox_admin.register(APIToken)
archivebox_admin.register(WebhookModel, WebhookAdmin) archivebox_admin.register(get_webhook_model(), WebhookAdmin)
archivebox_admin.disable_action('delete_selected') archivebox_admin.disable_action('delete_selected')
# patch admin with methods to add data views # patch admin with methods to add data views (implemented by admin_data_views package)
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
@ -170,14 +160,41 @@ class SnapshotActionForm(ActionForm):
# ) # )
def get_abid_info(self, obj):
    """
    Render an HTML summary of an object's identifiers for the admin UI.

    Shows the full ABID, each ABID component (ts, uri, subtype, rand) next to
    the matching entry of ``obj.abid_values``, the ABID expressed as a UUID,
    and the object's ``uuid`` / ``id`` / ``pk``.

    ``self`` is not used; it is accepted so ModelAdmin methods can delegate
    here with ``get_abid_info(self, obj)``.
    """
    parts = obj.ABID
    sources = obj.abid_values

    # datetimes are shown in ISO format, anything else is passed through as-is
    raw_ts = sources['ts']
    ts_display = raw_ts.isoformat() if isinstance(raw_ts, datetime) else raw_ts

    template = '''
&nbsp; &nbsp; ABID:&nbsp; <code style="font-size: 16px; user-select: all"><b>{}</b></code><br/>
&nbsp; &nbsp; TS: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;<code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; URI: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; SUBTYPE: &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/>
&nbsp; &nbsp; RAND: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all"><b>{}</b></code> ({})<br/><br/>
&nbsp; &nbsp; ABID AS UUID:&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
&nbsp; &nbsp; .uuid: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
&nbsp; &nbsp; .id: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp;&nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/>
&nbsp; &nbsp; .pk: &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;<br/><br/>
'''
    return format_html(
        template,
        obj.abid,
        parts.ts, ts_display,
        parts.uri, str(sources['uri']),
        parts.subtype, str(sources['subtype']),
        parts.rand, str(sources['rand'])[-7:],   # only the trailing 7 chars of the rand source are shown
        parts.uuid,
        obj.uuid,
        obj.id,
        obj.pk,
    )
@admin.register(Snapshot, site=archivebox_admin) @admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin): class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'files', 'size', 'url_str') list_display = ('added', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'added', 'files') sort_fields = ('title_str', 'url_str', 'added', 'files')
readonly_fields = ('info', 'bookmarked', 'added', 'updated') readonly_fields = ('admin_actions', 'status_info', 'bookmarked', 'added', 'updated', 'created', 'modified', 'identifiers')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name') search_fields = ('id', 'url', 'abid', 'uuid', 'timestamp', 'title', 'tags__name')
fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields) fields = ('url', 'timestamp', 'created_by', 'tags', 'title', *readonly_fields)
list_filter = ('added', 'updated', 'tags', 'archiveresult__status') list_filter = ('added', 'updated', 'tags', 'archiveresult__status', 'created_by')
ordering = ['-added'] ordering = ['-added']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
autocomplete_fields = ['tags'] autocomplete_fields = ['tags']
@ -223,40 +240,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
# </form> # </form>
# ''', # ''',
# csrf.get_token(self.request), # csrf.get_token(self.request),
# obj.id, # obj.pk,
# ) # )
def info(self, obj): def admin_actions(self, obj):
return format_html( return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page </a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions </a>
''',
obj.timestamp,
obj.timestamp,
obj.pk,
)
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
''' '''
UUID: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
Timestamp: <code style="font-size: 10px; user-select: all">{}</code> &nbsp; &nbsp;
URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
Archived: {} ({} files {}) &nbsp; &nbsp; Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp; Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp; Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp; Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp; Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp; Extension: {} &nbsp; &nbsp;
<br/><br/>
<a href="/archive/{}">View Snapshot index </a> &nbsp; &nbsp;
<a href="/admin/core/snapshot/?id__exact={}">View actions </a>
''', ''',
obj.id,
obj.timestamp,
obj.url_hash,
'' if obj.is_archived else '', '' if obj.is_archived else '',
obj.num_outputs, obj.num_outputs,
self.size(obj), self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico', f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '?', obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '?', obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '?', obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '?', obj.extension or '-',
obj.timestamp,
obj.id,
) )
def identifiers(self, obj):
    """Read-only admin field: the HTML identifier summary produced by get_abid_info()."""
    identifier_html = get_abid_info(self, obj)
    return identifier_html
@admin.display( @admin.display(
description='Title', description='Title',
ordering='title', ordering='title',
@ -316,7 +339,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
return format_html( return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>', '<a href="{}"><code style="user-select: all;">{}</code></a>',
obj.url, obj.url,
obj.url, obj.url[:128],
) )
def grid_view(self, request, extra_context=None): def grid_view(self, request, extra_context=None):
@ -419,42 +442,45 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(admin.ModelAdmin):
    """Admin configuration for Tag, including ABID identifiers and a preview of tagged Snapshots."""
    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'abid')
    sort_fields = ('id', 'name', 'slug', 'abid')
    readonly_fields = ('created', 'modified', 'identifiers', 'num_snapshots', 'snapshots')
    search_fields = ('id', 'abid', 'uuid', 'name', 'slug')
    fields = ('name', 'slug', 'created_by', *readonly_fields, )
    actions = ['delete_selected']
    ordering = ['-id']

    def identifiers(self, obj):
        """Render the ABID / uuid / id / pk summary block for this tag."""
        return get_abid_info(self, obj)

    def num_snapshots(self, tag):
        """Link to the Snapshot changelist filtered by this tag, labelled with the snapshot count."""
        return format_html(
            '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
            tag.id,
            tag.snapshot_set.count(),
        )

    def snapshots(self, tag):
        """
        List the 10 most recently updated Snapshots carrying this tag.

        When the tag has more than 10 snapshots, a trailing link to the full
        filtered changelist is appended.
        """
        total_count = tag.snapshot_set.count()

        recent_rows = '<br/>'.join(
            format_html(
                '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
                snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
                snap.pk,
                snap.abid,
                snap.url,
            )
            for snap in tag.snapshot_set.order_by('-updated')[:10]
        )

        more_link = ''
        if total_count > 10:   # reuse the count fetched above instead of querying again
            more_link = format_html(
                '<br/><a href="/admin/core/snapshot/?tags__id__exact={}">and {} more...</a>',
                tag.id,
                total_count - 10,
            )

        # every fragment was built with format_html, so the joined result is safe to mark
        return mark_safe(recent_rows + more_link)
@admin.register(ArchiveResult, site=archivebox_admin) @admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(admin.ModelAdmin): class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'tags_str', 'cmd_str', 'status', 'output_str') list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status') sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('id', 'uuid', 'snapshot_str', 'tags_str') readonly_fields = ('snapshot_info', 'tags_str', 'created_by', 'created', 'modified', 'identifiers')
search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') search_fields = ('id', 'uuid', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = (*readonly_fields, 'snapshot', 'extractor', 'status', 'start_ts', 'end_ts', 'output', 'pwd', 'cmd', 'cmd_version') fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'cmd', 'start_ts', 'end_ts', 'cmd_version', *readonly_fields)
autocomplete_fields = ['snapshot'] autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
@ -462,33 +488,36 @@ class ArchiveResultAdmin(admin.ModelAdmin):
list_per_page = SNAPSHOTS_PER_PAGE list_per_page = SNAPSHOTS_PER_PAGE
@admin.display(
    description='Snapshot Info'
)
def snapshot_info(self, result):
    """
    Render a link to the parent Snapshot's archive index page.

    The link text shows the snapshot's abid, its ``added`` time formatted as
    ``YYYY-MM-DD HH:MM``, and the first 128 characters of its URL.
    """
    snap = result.snapshot
    added_label = snap.added.strftime('%Y-%m-%d %H:%M')
    short_url = snap.url[:128]
    return format_html(
        '<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
        snap.timestamp,
        snap.abid,
        added_label,
        short_url,
    )
def identifiers(self, obj):
    """Read-only admin field: the HTML identifier summary produced by get_abid_info()."""
    identifier_html = get_abid_info(self, obj)
    return identifier_html
@admin.display(
    description='Snapshot Tags'
)
def tags_str(self, result):
    """Return the tags string of the Snapshot this result belongs to."""
    parent_snapshot = result.snapshot
    return parent_snapshot.tags_str()
def cmd_str(self, obj): def cmd_str(self, result):
return format_html( return format_html(
'<pre>{}</pre>', '<pre>{}</pre>',
' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd), ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
) )
def output_str(self, obj): def output_str(self, result):
return format_html( return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>', '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
obj.snapshot.timestamp, result.snapshot.timestamp,
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html', result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
obj.output, result.output,
) )

View file

@ -0,0 +1,43 @@
# Generated by Django 5.0.6 on 2024-05-13 10:56
import charidfield.fields
from django.db import migrations, models
class Migration(migrations.Migration):
    # Schema-only migration: adds a nullable, unique, indexed `abid` column to
    # ArchiveResult, Snapshot and Tag, adds a nullable unique `uuid` column to
    # Snapshot, renames ArchiveResult's verbose name, and refreshes the
    # ArchiveResult.extractor choices list.

    dependencies = [
        ('core', '0022_auto_20231023_2008'),
    ]

    operations = [
        migrations.AlterModelOptions(
            name='archiveresult',
            options={'verbose_name': 'Result'},
        ),
        # abid columns start out NULL (default=None, null=True)
        # NOTE(review): the help_text example uses the snp_ prefix for all three models
        migrations.AddField(
            model_name='archiveresult',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='res_', unique=True),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='snp_', unique=True),
        ),
        # nullable uuid column on Snapshot
        migrations.AddField(
            model_name='snapshot',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        migrations.AddField(
            model_name='tag',
            name='abid',
            field=charidfield.fields.CharIDField(blank=True, db_index=True, default=None, help_text='ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)', max_length=30, null=True, prefix='tag_', unique=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
            field=models.CharField(choices=[('htmltotext', 'htmltotext'), ('git', 'git'), ('singlefile', 'singlefile'), ('media', 'media'), ('archive_org', 'archive_org'), ('readability', 'readability'), ('mercury', 'mercury'), ('favicon', 'favicon'), ('pdf', 'pdf'), ('headers', 'headers'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('title', 'title'), ('wget', 'wget')], max_length=32),
        ),
    ]

View file

@ -0,0 +1,95 @@
# Generated by Django 5.0.6 on 2024-05-13 11:43
from django.db import migrations
from datetime import datetime
from abid_utils.abid import abid_from_values
def calculate_abid(self):
    """
    Build a fresh ABID for ``self`` from its ``abid_*_src`` expressions.

    Each of ``abid_ts_src`` / ``abid_uri_src`` / ``abid_subtype_src`` /
    ``abid_rand_src`` is a Python expression string evaluated with ``self``
    in scope. Any component that evaluates to a falsy value is replaced by a
    placeholder and a warning is printed.

    Raises:
        Exception: if ``abid_prefix`` is empty or still the default 'obj_'.
        AssertionError: if the resulting ABID has no ulid, uuid or typeid.
    """
    model_name = self.__class__.__name__

    prefix = self.abid_prefix
    # NOTE: eval() runs the abid_*_src strings as code; they must only ever be
    # hard-coded by the caller and never come from external input.
    # They are kept at function level so `self` resolves inside the expressions.
    ts = eval(self.abid_ts_src)
    uri = eval(self.abid_uri_src)
    subtype = eval(self.abid_subtype_src)
    rand = eval(self.abid_rand_src)

    if prefix == 'obj_' or not prefix:
        suggested_abid = model_name[:3].lower()
        raise Exception(f'{model_name}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

    if not ts:
        # no timestamp source: fall back to the unix epoch
        ts = datetime.utcfromtimestamp(0)
        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {model_name}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

    if not uri:
        uri = str(self)
        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {model_name}.abid_uri_src={self.abid_uri_src} is unset!', uri)

    if not subtype:
        subtype = model_name
        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {model_name}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

    if not rand:
        # first truthy of uuid / id, else pk
        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or self.pk
        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {model_name}.abid_rand_src={self.abid_rand_src} is unset!', rand)

    new_abid = abid_from_values(prefix=prefix, ts=ts, uri=uri, subtype=subtype, rand=rand)
    assert new_abid.ulid and new_abid.uuid and new_abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return new_abid
def copy_snapshot_uuids(apps, schema_editor):
    """Data migration step: copy each Snapshot's ``id`` into its ``uuid`` column."""
    snapshot_model = apps.get_model("core", "Snapshot")
    for row in snapshot_model.objects.all():
        row.uuid = row.id
        # only write the column that changed
        row.save(update_fields=["uuid"])
def generate_snapshot_abids(apps, schema_editor):
    """
    Data migration step: compute and store an ABID for every Snapshot.

    The abid_* attributes are set on each instance before calling
    calculate_abid(), which reads them from the instance.
    """
    abid_config = {
        'abid_prefix': 'snp_',
        'abid_ts_src': 'self.added',
        'abid_uri_src': 'self.url',
        'abid_subtype_src': '"01"',
        'abid_rand_src': 'self.uuid',
    }
    snapshot_model = apps.get_model("core", "Snapshot")
    for snap in snapshot_model.objects.all():
        for attr_name, attr_value in abid_config.items():
            setattr(snap, attr_name, attr_value)
        snap.abid = calculate_abid(snap)
        snap.save(update_fields=["abid"])
def generate_archiveresult_abids(apps, schema_editor):
    """
    Data migration step: compute and store an ABID (and matching uuid) for every ArchiveResult.

    The parent Snapshot's ``added`` and ``url`` are copied onto the result as
    ``snapshot_added`` / ``snapshot_url`` so the ABID source expressions can
    read them directly from the result.
    """
    result_model = apps.get_model("core", "ArchiveResult")
    snapshot_model = apps.get_model("core", "Snapshot")

    for res in result_model.objects.all():
        parent = snapshot_model.objects.get(pk=res.snapshot_id)

        res.abid_prefix = 'res_'
        res.snapshot = parent
        res.snapshot_added = parent.added
        res.snapshot_url = parent.url
        res.abid_ts_src = 'self.snapshot_added'
        res.abid_uri_src = 'self.snapshot_url'
        res.abid_subtype_src = 'self.extractor'
        res.abid_rand_src = 'self.id'

        new_abid = calculate_abid(res)
        res.abid = new_abid
        res.uuid = new_abid.uuid   # the result's uuid is taken from its ABID
        res.save(update_fields=["abid", "uuid"])
class Migration(migrations.Migration):
    # Data-only migration: backfills Snapshot.uuid, then Snapshot.abid, then
    # ArchiveResult.abid/uuid. Order matters: snapshot ABIDs read self.uuid,
    # which the first step populates. Reversing is a no-op for every step.

    dependencies = [
        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
    ]

    operations = [
        migrations.RunPython(copy_snapshot_uuids, reverse_code=migrations.RunPython.noop),
        migrations.RunPython(generate_snapshot_abids, reverse_code=migrations.RunPython.noop),
        migrations.RunPython(generate_archiveresult_abids, reverse_code=migrations.RunPython.noop),
    ]

View file

@ -0,0 +1,19 @@
# Generated by Django 5.0.6 on 2024-05-13 12:08
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
    # Alters ArchiveResult.uuid to a unique, non-editable UUIDField that
    # defaults to uuid.uuid4.

    dependencies = [
        ('core', '0024_auto_20240513_1143'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False, unique=True),
        ),
    ]

View file

@ -0,0 +1,76 @@
# Generated by Django 5.0.6 on 2024-05-13 13:01
import abid_utils.models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
    # Adds created / created_by / modified bookkeeping columns to
    # ArchiveResult, Snapshot and Tag, adds a nullable unique uuid to Tag, and
    # makes ArchiveResult.uuid nullable.

    dependencies = [
        ('core', '0025_alter_archiveresult_uuid'),
        # created_by is a foreign key to the configured user model
        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
    ]

    operations = [
        # --- ArchiveResult ---
        # timezone.now is a one-off default used only to fill existing rows
        # (preserve_default=False); the field itself uses auto_now_add
        migrations.AddField(
            model_name='archiveresult',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        # default owner comes from abid_utils.models.get_or_create_system_user_pk
        migrations.AddField(
            model_name='archiveresult',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='archiveresult',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        # --- Snapshot ---
        migrations.AddField(
            model_name='snapshot',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='snapshot',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='snapshot',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        # --- Tag ---
        migrations.AddField(
            model_name='tag',
            name='created',
            field=models.DateTimeField(auto_now_add=True, default=django.utils.timezone.now),
            preserve_default=False,
        ),
        migrations.AddField(
            model_name='tag',
            name='created_by',
            field=models.ForeignKey(default=abid_utils.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='tag',
            name='modified',
            field=models.DateTimeField(auto_now=True),
        ),
        migrations.AddField(
            model_name='tag',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
        # ArchiveResult.uuid becomes nullable with no default
        migrations.AlterField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(blank=True, null=True, unique=True),
        ),
    ]

View file

@ -1,12 +1,14 @@
__package__ = 'archivebox.core' __package__ = 'archivebox.core'
import uuid from typing import Optional, List, Dict
from django_stubs_ext.db.models import TypedModelMeta
import json import json
import uuid
from uuid import uuid4
from pathlib import Path from pathlib import Path
from typing import Optional, List
from importlib import import_module
from django.db import models from django.db import models
from django.utils.functional import cached_property from django.utils.functional import cached_property
@ -16,12 +18,15 @@ from django.urls import reverse
from django.db.models import Case, When, Value, IntegerField from django.db.models import Case, When, Value, IntegerField
from django.contrib.auth.models import User # noqa from django.contrib.auth.models import User # noqa
from abid_utils.models import ABIDModel, ABIDField
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl from ..util import parse_date, base_url
from ..index.schema import Link from ..index.schema import Link
from ..index.html import snapshot_icons from ..index.html import snapshot_icons
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()] EXTRACTOR_CHOICES = [(extractor_name, extractor_name) for extractor_name in EXTRACTORS.keys()]
STATUS_CHOICES = [ STATUS_CHOICES = [
@ -30,26 +35,41 @@ STATUS_CHOICES = [
("skipped", "skipped") ("skipped", "skipped")
] ]
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
class Tag(models.Model): # class BaseModel(models.Model):
# # TODO: migrate all models to a shared base class with all our standard fields and helpers:
# # ulid/created/modified/owner/is_deleted/as_json/from_json/etc.
# #
# # id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
# # ulid = models.CharField(max_length=26, null=True, blank=True, db_index=True, unique=True)
# class Meta(TypedModelMeta):
# abstract = True
class Tag(ABIDModel):
""" """
Based on django-taggit model Based on django-taggit model + ABID base.
""" """
abid_prefix = 'tag_'
abid_ts_src = 'self.created' # TODO: add created/modified time
abid_uri_src = 'self.name'
abid_subtype_src = '"03"'
abid_rand_src = 'self.id'
# id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=True)
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
name = models.CharField(unique=True, blank=False, max_length=100) name = models.CharField(unique=True, blank=False, max_length=100)
# slug is autoset on save from name, never set it manually
slug = models.SlugField(unique=True, blank=True, max_length=100) slug = models.SlugField(unique=True, blank=True, max_length=100)
# slug is autoset on save from name, never set it manually
class Meta: class Meta(TypedModelMeta):
verbose_name = "Tag" verbose_name = "Tag"
verbose_name_plural = "Tags" verbose_name_plural = "Tags"
@ -85,8 +105,16 @@ class Tag(models.Model):
return super().save(*args, **kwargs) return super().save(*args, **kwargs)
class Snapshot(models.Model): class Snapshot(ABIDModel):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) abid_prefix = 'snp_'
abid_ts_src = 'self.added'
abid_uri_src = 'self.url'
abid_subtype_src = '"01"'
abid_rand_src = 'self.id'
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) # legacy pk
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
url = models.URLField(unique=True, db_index=True) url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True)
@ -99,6 +127,7 @@ class Snapshot(models.Model):
keys = ('url', 'timestamp', 'title', 'tags', 'updated') keys = ('url', 'timestamp', 'title', 'tags', 'updated')
def __repr__(self) -> str: def __repr__(self) -> str:
title = self.title or '-' title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})' return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@ -127,8 +156,8 @@ class Snapshot(models.Model):
from ..index import load_link_details from ..index import load_link_details
return load_link_details(self.as_link()) return load_link_details(self.as_link())
def tags_str(self, nocache=True) -> str: def tags_str(self, nocache=True) -> str | None:
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags' cache_key = f'{self.pk}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True)) calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if nocache: if nocache:
tags_str = calc_tags_str() tags_str = calc_tags_str()
@ -158,13 +187,9 @@ class Snapshot(models.Model):
return self.as_link().is_archived return self.as_link().is_archived
@cached_property @cached_property
def num_outputs(self): def num_outputs(self) -> int:
return self.archiveresult_set.filter(status='succeeded').count() return self.archiveresult_set.filter(status='succeeded').count()
@cached_property
def url_hash(self):
return hashurl(self.url)
@cached_property @cached_property
def base_url(self): def base_url(self):
return base_url(self.url) return base_url(self.url)
@ -179,7 +204,7 @@ class Snapshot(models.Model):
@cached_property @cached_property
def archive_size(self): def archive_size(self):
cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size' cache_key = f'{str(self.pk)[:12]}-{(self.updated or self.added).timestamp()}-size'
def calc_dir_size(): def calc_dir_size():
try: try:
@ -200,7 +225,7 @@ class Snapshot(models.Model):
return None return None
@cached_property @cached_property
def headers(self) -> Optional[dict]: def headers(self) -> Optional[Dict[str, str]]:
try: try:
return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip()) return json.loads((Path(self.link_dir) / 'headers.json').read_text(encoding='utf-8').strip())
except Exception: except Exception:
@ -251,11 +276,37 @@ class Snapshot(models.Model):
tags_id = [] tags_id = []
for tag in tags: for tag in tags:
if tag.strip(): if tag.strip():
tags_id.append(Tag.objects.get_or_create(name=tag)[0].id) tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
self.tags.clear() self.tags.clear()
self.tags.add(*tags_id) self.tags.add(*tags_id)
# def get_storage_dir(self, create=True, symlink=True) -> Path:
# date_str = self.added.strftime('%Y%m%d')
# domain_str = domain(self.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'snapshots' / date_str / domain_str / str(self.ulid)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_id' / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_date' / date_str / domain_str / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'snapshots_by_domain' / domain_str / date_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
class ArchiveResultManager(models.Manager): class ArchiveResultManager(models.Manager):
def indexable(self, sorted: bool = True): def indexable(self, sorted: bool = True):
INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ] INDEXABLE_METHODS = [ r[0] for r in ARCHIVE_METHODS_INDEXING_PRECEDENCE ]
@ -267,15 +318,22 @@ class ArchiveResultManager(models.Manager):
return qs return qs
class ArchiveResult(models.Model): class ArchiveResult(ABIDModel):
abid_prefix = 'res_'
abid_ts_src = 'self.snapshot.added'
abid_uri_src = 'self.snapshot.url'
abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.uuid'
EXTRACTOR_CHOICES = EXTRACTOR_CHOICES EXTRACTOR_CHOICES = EXTRACTOR_CHOICES
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
uuid = models.UUIDField(default=uuid.uuid4, editable=False) id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') # legacy pk
uuid = models.UUIDField(blank=True, null=True, editable=True, unique=True)
abid = ABIDField(prefix=abid_prefix)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32) extractor = models.CharField(choices=EXTRACTOR_CHOICES, max_length=32)
cmd = JSONField() cmd = models.JSONField()
pwd = models.CharField(max_length=256) pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True) cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024) output = models.CharField(max_length=1024)
@ -285,6 +343,9 @@ class ArchiveResult(models.Model):
objects = ArchiveResultManager() objects = ArchiveResultManager()
class Meta(TypedModelMeta):
verbose_name = 'Result'
def __str__(self): def __str__(self):
return self.extractor return self.extractor
@ -318,3 +379,33 @@ class ArchiveResult(models.Model):
def output_exists(self) -> bool: def output_exists(self) -> bool:
return Path(self.output_path()).exists() return Path(self.output_path()).exists()
# def get_storage_dir(self, create=True, symlink=True):
# date_str = self.snapshot.added.strftime('%Y%m%d')
# domain_str = domain(self.snapshot.url)
# abs_storage_dir = Path(ARCHIVE_DIR) / 'results' / date_str / domain_str / self.extractor / str(self.ulid)
# if create and not abs_storage_dir.is_dir():
# abs_storage_dir.mkdir(parents=True, exist_ok=True)
# if symlink:
# LINK_PATHS = [
# Path(ARCHIVE_DIR).parent / 'index' / 'all_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_id' / str(self.ulid),
# # Path(ARCHIVE_DIR).parent / 'index' / 'results_by_date' / date_str / domain_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_domain' / domain_str / date_str / self.extractor / str(self.ulid),
# Path(ARCHIVE_DIR).parent / 'index' / 'results_by_type' / self.extractor / date_str / domain_str / str(self.ulid),
# ]
# for link_path in LINK_PATHS:
# link_path.parent.mkdir(parents=True, exist_ok=True)
# try:
# link_path.symlink_to(abs_storage_dir)
# except FileExistsError:
# link_path.unlink()
# link_path.symlink_to(abs_storage_dir)
# return abs_storage_dir
# def symlink_index(self, create=True):
# abs_result_dir = self.get_storage_dir(create=create)

View file

@ -10,6 +10,7 @@ from pathlib import Path
from django.utils.crypto import get_random_string from django.utils.crypto import get_random_string
from ..config import ( from ..config import (
CONFIG,
DEBUG, DEBUG,
SECRET_KEY, SECRET_KEY,
ALLOWED_HOSTS, ALLOWED_HOSTS,
@ -62,12 +63,13 @@ INSTALLED_APPS = [
'django.contrib.staticfiles', 'django.contrib.staticfiles',
'django.contrib.admin', 'django.contrib.admin',
'signal_webhooks',
'abid_utils',
'core', 'core',
'api', 'api',
'admin_data_views', 'admin_data_views',
'signal_webhooks',
'django_extensions', 'django_extensions',
] ]
@ -247,22 +249,26 @@ DATABASES = {
'TIME_ZONE': TIMEZONE, 'TIME_ZONE': TIMEZONE,
# DB setup is sometimes modified at runtime by setup_django() in config.py # DB setup is sometimes modified at runtime by setup_django() in config.py
}, },
'cache': { # 'cache': {
'ENGINE': 'django.db.backends.sqlite3', # 'ENGINE': 'django.db.backends.sqlite3',
'NAME': CACHE_DB_PATH, # 'NAME': CACHE_DB_PATH,
'OPTIONS': { # 'OPTIONS': {
'timeout': 60, # 'timeout': 60,
'check_same_thread': False, # 'check_same_thread': False,
}, # },
'TIME_ZONE': TIMEZONE, # 'TIME_ZONE': TIMEZONE,
}, # },
} }
MIGRATION_MODULES = {'signal_webhooks': None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
CACHES = { CACHES = {
'default': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'}, 'default': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'},
'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'}, # 'sqlite': {'BACKEND': 'django.core.cache.backends.db.DatabaseCache', 'LOCATION': 'cache'},
'locmem': {'BACKEND': 'django.core.cache.backends.locmem.LocMemCache'}, # 'dummy': {'BACKEND': 'django.core.cache.backends.dummy.DummyCache'},
# 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'}, # 'filebased': {"BACKEND": "django.core.cache.backends.filebased.FileBasedCache", "LOCATION": CACHE_DIR / 'cache_filebased'},
} }
@ -421,9 +427,11 @@ LOGGING = {
# Add default webhook configuration to the User model # Add default webhook configuration to the User model
SIGNAL_WEBHOOKS_CUSTOM_MODEL = 'api.models.OutboundWebhook'
SIGNAL_WEBHOOKS = { SIGNAL_WEBHOOKS = {
"HOOKS": { "HOOKS": {
"django.contrib.auth.models.User": ..., # ... is a special value that means "use the default autogenerated hooks" # ... is a special sigil value that means "use the default autogenerated hooks"
"django.contrib.auth.models.User": ...,
"core.models.Snapshot": ..., "core.models.Snapshot": ...,
"core.models.ArchiveResult": ..., "core.models.ArchiveResult": ...,
"core.models.Tag": ..., "core.models.Tag": ...,

View file

@ -226,8 +226,8 @@ class SnapshotView(View):
'<i><b>Next steps:</i></b><br/>' '<i><b>Next steps:</i></b><br/>'
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>' f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>' f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>' f'- go to the <a href="/admin/core/snapshot/{snapshot.pk}/change/" target="_top">Snapshot admin</a> to edit<br/>'
f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>' f'- go to the <a href="/admin/core/snapshot/?uuid__startswith={snapshot.uuid}" target="_top">Snapshot actions</a> to re-archive<br/>'
'- or return to <a href="/" target="_top">the main index...</a></div>' '- or return to <a href="/" target="_top">the main index...</a></div>'
'</center>' '</center>'
), ),
@ -455,7 +455,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
for section in CONFIG_SCHEMA.keys(): for section in CONFIG_SCHEMA.keys():
for key in CONFIG_SCHEMA[section].keys(): for key in CONFIG_SCHEMA[section].keys():
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', '')) rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key)) rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>')) rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)') rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')
@ -465,7 +465,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
section = 'DYNAMIC' section = 'DYNAMIC'
for key in DYNAMIC_CONFIG_SCHEMA.keys(): for key in DYNAMIC_CONFIG_SCHEMA.keys():
rows['Section'].append(section.replace('_', ' ').title().replace(' Config', '')) rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key)) rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>')) rows['Type'].append(mark_safe(f'<code>{find_config_type(key)}</code>'))
rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)') rows['Value'].append(mark_safe(f'<code>{CONFIG[key]}</code>') if key_is_safe(key) else '******** (redacted)')

View file

@ -160,7 +160,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# bump the updated time on the main Snapshot here, this is critical # bump the updated time on the main Snapshot here, this is critical
# to be able to cache summaries of the ArchiveResults for a given # to be able to cache summaries of the ArchiveResults for a given
# snapshot without having to load all the results from the DB each time. # snapshot without having to load all the results from the DB each time.
# (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume # (we use {Snapshot.pk}-{Snapshot.updated} as the cache key and assume
# ArchiveResults are unchanged as long as the updated timestamp is unchanged) # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
snapshot.save() snapshot.save()
else: else:

View file

@ -94,6 +94,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
status = 'failed' status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes). # TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"") cmd[2] = browser_args.replace('"', "\\\"")
if result:
err.hints = (result.stdout + result.stderr).decode().split('\n') err.hints = (result.stdout + result.stderr).decode().split('\n')
output = err output = err
finally: finally:

View file

@ -118,7 +118,7 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str: def snapshot_icons(snapshot) -> str:
cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons' cache_key = f'{snapshot.pk}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
def calc_snapshot_icons(): def calc_snapshot_icons():
from core.models import EXTRACTOR_CHOICES from core.models import EXTRACTOR_CHOICES

View file

@ -192,6 +192,9 @@ class Link:
if extended: if extended:
info.update({ info.update({
'snapshot_id': self.snapshot_id, 'snapshot_id': self.snapshot_id,
'snapshot_uuid': self.snapshot_uuid,
'snapshot_abid': self.snapshot_abid,
'link_dir': self.link_dir, 'link_dir': self.link_dir,
'archive_path': self.archive_path, 'archive_path': self.archive_path,
@ -261,9 +264,21 @@ class Link:
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust) return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@cached_property @cached_property
def snapshot_id(self): def snapshot(self):
from core.models import Snapshot from core.models import Snapshot
return str(Snapshot.objects.only('id').get(url=self.url).id) return Snapshot.objects.only('uuid').get(url=self.url)
@cached_property
def snapshot_id(self):
return str(self.snapshot.pk)
@cached_property
def snapshot_uuid(self):
return str(self.snapshot.uuid)
@cached_property
def snapshot_abid(self):
return str(self.snapshot.ABID)
@classmethod @classmethod
def field_names(cls): def field_names(cls):

View file

@ -45,7 +45,8 @@ def write_link_to_sql_index(link: Link):
info.pop('tags') info.pop('tags')
try: try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp snapshot = Snapshot.objects.get(url=link.url)
info["timestamp"] = snapshot.timestamp
except Snapshot.DoesNotExist: except Snapshot.DoesNotExist:
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists(): while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0) info["timestamp"] = str(float(info["timestamp"]) + 1.0)
@ -57,7 +58,7 @@ def write_link_to_sql_index(link: Link):
for entry in entries: for entry in entries:
if isinstance(entry, dict): if isinstance(entry, dict):
result, _ = ArchiveResult.objects.get_or_create( result, _ = ArchiveResult.objects.get_or_create(
snapshot_id=snapshot.id, snapshot_id=snapshot.pk,
extractor=extractor, extractor=extractor,
start_ts=parse_date(entry['start_ts']), start_ts=parse_date(entry['start_ts']),
defaults={ defaults={
@ -71,7 +72,7 @@ def write_link_to_sql_index(link: Link):
) )
else: else:
result, _ = ArchiveResult.objects.update_or_create( result, _ = ArchiveResult.objects.update_or_create(
snapshot_id=snapshot.id, snapshot_id=snapshot.pk,
extractor=extractor, extractor=extractor,
start_ts=parse_date(entry.start_ts), start_ts=parse_date(entry.start_ts),
defaults={ defaults={

View file

@ -0,0 +1,16 @@
__package__ = 'archivebox'
import django_stubs_ext
django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
import datetime
from django.utils import timezone
timezone.utc = datetime.timezone.utc
# monkey patch django-signal-webhooks to change how it shows up in Admin UI
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
# DjangoSignalWebhooksConfig.verbose_name = 'API'

View file

@ -39,7 +39,7 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
backend = import_backend() backend = import_backend()
if snap: if snap:
try: try:
backend.index(snapshot_id=str(snap.id), texts=texts) backend.index(snapshot_id=str(snap.pk), texts=texts)
except Exception as err: except Exception as err:
stderr() stderr()
stderr( stderr(
@ -54,7 +54,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
if search_backend_enabled(): if search_backend_enabled():
backend = import_backend() backend = import_backend()
try: try:
snapshot_ids = backend.search(query) snapshot_pks = backend.search(query)
except Exception as err: except Exception as err:
stderr() stderr()
stderr( stderr(
@ -64,7 +64,7 @@ def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
raise raise
else: else:
# TODO preserve ordering from backend # TODO preserve ordering from backend
qsearch = Snapshot.objects.filter(pk__in=snapshot_ids) qsearch = Snapshot.objects.filter(pk__in=snapshot_pks)
return qsearch return qsearch
return Snapshot.objects.none() return Snapshot.objects.none()
@ -74,9 +74,9 @@ def flush_search_index(snapshots: QuerySet):
if not indexing_enabled() or not snapshots: if not indexing_enabled() or not snapshots:
return return
backend = import_backend() backend = import_backend()
snapshot_ids=(str(pk) for pk in snapshots.values_list('pk',flat=True)) snapshot_pks = (str(pk) for pk in snapshots.values_list('pk', flat=True))
try: try:
backend.flush(snapshot_ids) backend.flush(snapshot_pks)
except Exception as err: except Exception as err:
stderr() stderr()
stderr( stderr(

View file

@ -147,7 +147,7 @@
{% for obj in results %} {% for obj in results %}
<div class="card"> <div class="card">
<div class="card-info"> <div class="card-info">
<a href="{% url 'admin:core_snapshot_change' obj.id %}"> <a href="{% url 'admin:core_snapshot_change' obj.pk %}">
<span class="timestamp">{{obj.added}}</span> <span class="timestamp">{{obj.added}}</span>
</a> </a>
<label> <label>

View file

@ -405,7 +405,7 @@
</a> </a>
<!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>--> <!--<a href="{{result.path}}" target="preview"><h4 class="card-title">{{result.name}}</h4></a>-->
</div> </div>
<iframe class="card-img-top" src="{{result.path}}" sandbox="allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe> <iframe class="card-img-top" src="{{result.path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
</div> </div>
</div> </div>
{% endfor %} {% endfor %}
@ -463,7 +463,7 @@
if (target.endsWith('.pdf')) { if (target.endsWith('.pdf')) {
jQuery('#main-frame')[0].removeAttribute('sandbox') jQuery('#main-frame')[0].removeAttribute('sandbox')
} else { } else {
jQuery('#main-frame')[0].sandbox = "allow-scripts allow-forms allow-top-navigation-by-user-activation" jQuery('#main-frame')[0].sandbox = "allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms"
} }
window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a')) window.location.hash = getPreviewTypeFromPath(event.currentTarget.querySelector('a'))

View file

@ -37,6 +37,9 @@ dependencies = [
# - See Github issues for more... # - See Github issues for more...
"django-signal-webhooks>=0.3.0", "django-signal-webhooks>=0.3.0",
"django-admin-data-views>=0.3.1", "django-admin-data-views>=0.3.1",
"ulid-py>=1.1.0",
"typeid-python>=0.3.0",
"django-charid-field>=0.4",
] ]
homepage = "https://github.com/ArchiveBox/ArchiveBox" homepage = "https://github.com/ArchiveBox/ArchiveBox"