2024-09-04 07:08:14 +00:00
|
|
|
__package__ = 'archivebox.abid_utils'
|
|
|
|
|
2024-09-06 10:48:52 +00:00
|
|
|
from typing import NamedTuple, Any, Union, Dict
|
2024-05-13 09:37:48 +00:00
|
|
|
|
|
|
|
import ulid
|
|
|
|
import uuid6
|
|
|
|
import hashlib
|
2024-05-13 14:49:36 +00:00
|
|
|
from urllib.parse import urlparse
|
2024-05-13 09:37:48 +00:00
|
|
|
|
|
|
|
from uuid import UUID
|
|
|
|
from typeid import TypeID # type: ignore[import-untyped]
|
|
|
|
from datetime import datetime
|
|
|
|
|
2024-10-01 00:25:15 +00:00
|
|
|
from archivebox.misc.util import enforce_types
|
2024-05-13 09:37:48 +00:00
|
|
|
|
|
|
|
|
|
|
|
# Overall layout of an ABID string: <prefix><suffix>, e.g. 'obj_' + 26 more characters.
ABID_PREFIX_LEN = 4                                 # includes the trailing '_'
ABID_SUFFIX_LEN = 26
ABID_LEN = ABID_PREFIX_LEN + ABID_SUFFIX_LEN        # 30

# Widths of the four segments that are concatenated to form the suffix
# (10 + 8 + 2 + 6 == ABID_SUFFIX_LEN).
ABID_TS_LEN = 10
ABID_URI_LEN = 8
ABID_SUBTYPE_LEN = 2
ABID_RAND_LEN = 6

DEFAULT_ABID_PREFIX = 'obj_'

# The salt allows people to keep their URIs secret on a per-instance basis by changing it.
# The default means everyone shares the same namespace for URI hashes,
# meaning anyone who has a URI and wants to check if you have it can guess the ABID.
DEFAULT_ABID_URI_SALT = '687c2fff14e3a7780faa5a40c237b19b5b51b089'
|
|
|
|
|
2024-05-13 09:37:48 +00:00
|
|
|
|
|
|
|
class ABID(NamedTuple):
    """
    An ABID split into its five string segments (prefix + ts + uri + subtype + rand).

    e.g. ABID.parse('obj_01HX9FPYTRE4A5CCD901ZYEBQE')

    The four non-prefix segments concatenated form the 26-char suffix, which is
    parsed as a ULID by the `ulid` property; the uuid/uuid6/typeid/datetime
    properties are all derived from that ULID.
    """
    prefix: str            # e.g. obj_
    ts: str                # e.g. 01HX9FPYTR
    uri: str               # e.g. E4A5CCD9
    subtype: str           # e.g. 01
    rand: str              # e.g. ZYEBQE

    # salt: str = DEFAULT_ABID_URI_SALT

    def __getattr__(self, attr: str) -> Any:
        """Delegate any attribute not found on the ABID itself to its ULID."""
        return getattr(self.ulid, attr)

    def __eq__(self, other: Any) -> bool:
        """
        Compare by ULID value only (the prefix is ignored).

        Returns NotImplemented when `other` exposes no `.ulid` attribute.
        """
        try:
            return self.ulid == other.ulid
        except AttributeError:
            return NotImplemented

    def __hash__(self) -> int:
        """
        Hash on the same value that __eq__ compares.

        Without this override the inherited tuple hash would include the prefix,
        so two ABIDs that compare equal could land in different hash buckets.
        """
        return hash(self.uuid)

    def __str__(self) -> str:
        """Return the full string form, e.g. 'obj_01HX9FPYTRE4A5CCD901ZYEBQE'."""
        return self.prefix + self.suffix

    def __len__(self) -> int:
        """Return the length of the string form (not the number of tuple fields)."""
        return len(self.prefix + self.suffix)

    @classmethod
    def parse(cls, buffer: Union[str, UUID, ulid.ULID, TypeID, 'ABID'], prefix=DEFAULT_ABID_PREFIX) -> 'ABID':
        """
        Build an ABID from the str() form of `buffer`.

        `buffer` is stringified and must look like '<3-char prefix>_<26-char suffix>'
        or a bare 26-char suffix; in the latter case `prefix` is used instead.
        The suffix segments are upper-cased.

        Raises AssertionError when `buffer` is falsy or when the prefix/suffix
        lengths do not match (this includes inputs with extra underscores).
        """
        assert buffer, f'Attempted to create ABID from null value {buffer}'

        buffer = str(buffer)
        if '_' in buffer:
            # split on the first underscore only (same as ts_from_abid) so that a
            # stray extra '_' reaches the length assertions below instead of
            # failing with an opaque tuple-unpacking ValueError
            prefix, suffix = buffer.split('_', 1)
        else:
            prefix, suffix = prefix.strip('_'), buffer

        assert len(prefix) == ABID_PREFIX_LEN - 1   # length without trailing _
        assert len(suffix) == ABID_SUFFIX_LEN, f'Suffix {suffix} from {buffer} was not {ABID_SUFFIX_LEN} chars long'

        # segment boundaries inside the suffix: ts | uri | subtype | rand
        ts_end = ABID_TS_LEN
        uri_end = ts_end + ABID_URI_LEN
        subtype_end = uri_end + ABID_SUBTYPE_LEN
        rand_end = subtype_end + ABID_RAND_LEN

        return cls(
            prefix=abid_part_from_prefix(prefix),
            ts=suffix[:ts_end].upper(),
            uri=suffix[ts_end:uri_end].upper(),
            subtype=suffix[uri_end:subtype_end].upper(),
            rand=suffix[subtype_end:rand_end].upper(),
        )

    @property
    def uri_salt(self) -> str:
        """Salt used for URI hashing (currently always the module default)."""
        return DEFAULT_ABID_URI_SALT

    @property
    def suffix(self):
        """The 26-char portion after the prefix: ts + uri + subtype + rand."""
        return ''.join((self.ts, self.uri, self.subtype, self.rand))

    # NOTE: the properties below shadow the `ulid`, `uuid6` and `datetime` names
    # inside the class body once defined, so anything annotated with those
    # modules/types must be declared above the property of the same name.

    @property
    def ulid(self) -> ulid.ULID:
        """The suffix parsed as a ULID."""
        return ulid.parse(self.suffix)

    @property
    def uuid(self) -> UUID:
        """The ULID converted to a stdlib UUID."""
        return self.ulid.uuid

    @property
    def uuid6(self) -> uuid6.UUID:
        """The same value wrapped in a uuid6.UUID."""
        return uuid6.UUID(hex=self.uuid.hex)

    @property
    def typeid(self) -> TypeID:
        """A TypeID built from the prefix (without '_') and the uuid6 value."""
        return TypeID.from_uuid(prefix=self.prefix.strip('_'), suffix=self.uuid6)

    @property
    def datetime(self) -> datetime:
        """The timestamp encoded in the ULID, as a datetime."""
        return self.ulid.timestamp().datetime
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
####################################################
|
|
|
|
|
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def uri_hash(uri: Union[str, bytes], salt: str=DEFAULT_ABID_URI_SALT) -> str:
    """
    Return the upper-cased hex sha256 of a URI combined with the salt.

    https://example.com -> 'E4A5CCD9AF4ED2A6E0954DF19FD274E9CDDB4853051F033FD518BFC90AA1AC25' (example.com)

    For inputs containing '://' only the netloc (domain) part is hashed; any
    other input, or a URL whose netloc is empty or cannot be parsed, is hashed
    in full. Bytes input is decoded before hashing.
    """
    uri_str: str = uri.decode() if isinstance(uri, bytes) else str(uri)

    # only hash the domain part of URLs
    if '://' in uri_str:
        try:
            domain = urlparse(uri_str).netloc
        except (AttributeError, ValueError):
            # urlparse raises ValueError on malformed netlocs (e.g. unbalanced
            # IPv6 brackets); fall back to hashing the whole string
            domain = ''
        if domain:
            uri_str = domain

    # the uri hash is the sha256 of the domain + salt
    uri_bytes = uri_str.encode('utf-8') + salt.encode('utf-8')

    return hashlib.sha256(uri_bytes).hexdigest().upper()
|
2024-05-13 09:37:48 +00:00
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def abid_part_from_prefix(prefix: str) -> str:
    """
    'snp_'

    Normalize a prefix to three lowercase characters followed by a single '_'.
    Surrounding underscores on the input are ignored.
    """
    # if prefix is None:
    #     return 'obj_'

    bare_prefix = prefix.strip('_').lower()
    assert len(bare_prefix) == 3
    return f'{bare_prefix}_'
|
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def abid_part_from_uri(uri: Any, salt: str=DEFAULT_ABID_URI_SALT) -> str:
    """
    'E4A5CCD9'     # takes first 8 characters of sha256(url)

    The value is stringified and stripped first; an empty result or the
    literal text 'None' is rejected by assertion.
    """
    uri_text = str(uri).strip()
    assert uri_text not in ('None', '')

    full_digest = uri_hash(uri_text, salt=salt)
    return full_digest[:ABID_URI_LEN]
|
2024-05-13 09:37:48 +00:00
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def abid_part_from_ts(ts: datetime) -> str:
    """
    '01HX9FPYTR'   # produces 10 character Timestamp section of ulid based on added date
    """
    ulid_text = str(ulid.from_timestamp(ts))
    # the leading characters of a ULID string are its timestamp section
    return ulid_text[:ABID_TS_LEN]
|
2024-05-13 09:37:48 +00:00
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def ts_from_abid(abid: str) -> datetime:
    """
    Return the datetime encoded in an ABID string.

    Everything after the first '_' (or the whole string when there is no '_')
    is parsed as a ULID and its timestamp is returned.
    """
    head, separator, tail = abid.partition('_')
    ulid_text = tail if separator else head
    return ulid.parse(ulid_text).timestamp().datetime
|
|
|
|
|
|
|
|
@enforce_types
def abid_part_from_subtype(subtype: str | int) -> str:
    """
    Snapshots have 01 type, other objects have other subtypes like wget/media/etc.
    Also allows us to change the ulid spec later by putting special sigil values here.

    A value whose string form is already 2 characters long is returned verbatim;
    anything else is replaced by the first 2 hex characters (upper-cased) of its sha256.
    """
    subtype_text = str(subtype)

    if len(subtype_text) != ABID_SUBTYPE_LEN:
        digest = hashlib.sha256(subtype_text.encode('utf-8')).hexdigest()
        return digest[:ABID_SUBTYPE_LEN].upper()

    return subtype_text
|
2024-05-13 09:37:48 +00:00
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def abid_part_from_rand(rand: Union[str, UUID, None, int]) -> str:
    """
    'ZYEBQE'       # takes last 6 characters of randomness from existing legacy uuid db field
    """
    if rand is None:
        # nothing to derive from: use the last 6 characters of a freshly generated ULID
        return str(ulid.new())[-ABID_RAND_LEN:]

    if isinstance(rand, UUID):
        # a uuid is converted to its ULID representation and the last 6 characters are kept
        return str(ulid.from_uuid(rand))[-ABID_RAND_LEN:]

    if isinstance(rand, int):
        # e.g. a BigAutoInteger field: keep the last 6 digits, left-padded with zeros
        return str(rand)[-ABID_RAND_LEN:].rjust(ABID_RAND_LEN, '0')

    # otherwise treat it as a string, take the last 6 characters of it verbatim
    return str(rand)[-ABID_RAND_LEN:].upper()
|
2024-05-13 09:37:48 +00:00
|
|
|
|
|
|
|
|
2024-09-04 07:08:14 +00:00
|
|
|
@enforce_types
def abid_hashes_from_values(prefix: str, ts: datetime, uri: Any, subtype: str | int, rand: Union[str, UUID, None, int], salt: str=DEFAULT_ABID_URI_SALT) -> Dict[str, str]:
    """
    Compute each ABID segment from its source value.

    Returns a dict with the keys 'prefix', 'ts', 'uri', 'subtype' and 'rand'.
    """
    prefix_part = abid_part_from_prefix(prefix)
    ts_part = abid_part_from_ts(ts)
    uri_part = abid_part_from_uri(uri, salt=salt)
    subtype_part = abid_part_from_subtype(subtype)
    rand_part = abid_part_from_rand(rand)

    # 'salt': don't add this, salt combined with uri above to form a single hash
    return {
        'prefix': prefix_part,
        'ts': ts_part,
        'uri': uri_part,
        'subtype': subtype_part,
        'rand': rand_part,
    }
|
|
|
|
|
|
|
|
@enforce_types
def abid_from_values(prefix: str, ts: datetime, uri: Any, subtype: str | int, rand: Union[str, UUID, None, int], salt: str=DEFAULT_ABID_URI_SALT) -> ABID:
    """
    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).

    The arguments are forwarded unchanged to abid_hashes_from_values, so the
    `uri` and `subtype` annotations mirror that function's (wider) signature.

    Raises AssertionError if the assembled ABID yields a falsy ulid/uuid/typeid;
    errors raised while deriving those values propagate unchanged.
    """
    abid = ABID(**abid_hashes_from_values(prefix, ts, uri, subtype, rand, salt=salt))

    # sanity check: touching each property forces the derived representations to be computed
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for ts={ts} uri={uri} subtyp={subtype} rand={rand}'
    return abid
|