2024-10-02 04:46:35 +00:00
__package__ = ' archivebox.machine '
import socket
2024-10-03 10:11:04 +00:00
from datetime import timedelta
from pathlib import Path
2024-10-02 04:46:35 +00:00
from django . db import models
2024-10-03 10:11:04 +00:00
from django . utils import timezone
from django . utils . functional import cached_property
2024-10-28 11:07:35 +00:00
import abx
import archivebox
2024-10-15 08:03:01 +00:00
2024-10-28 11:07:35 +00:00
from pydantic_pkgr import Binary , BinProvider
2024-10-14 22:38:32 +00:00
from archivebox . abid_utils . models import ABIDModel , ABIDField , AutoDateTimeField , ModelWithHealthStats
2024-10-02 04:46:35 +00:00
from . detect import get_host_guid , get_os_info , get_vm_info , get_host_network , get_host_stats
2024-10-14 22:38:32 +00:00
# Process-wide memoization slots, filled lazily by the manager classes below.
# Each holds the most recently resolved DB record(s) so repeated lookups can
# skip re-probing the host until the matching *_RECHECK_INTERVAL elapses.

# the Machine record for the host this process runs on (None until first lookup)
_CURRENT_MACHINE = None
# the NetworkInterface record currently in use (None until first lookup)
_CURRENT_INTERFACE = None
# InstalledBinary records for locally installed binaries, keyed by binary name
_CURRENT_BINARIES = {}
2024-10-03 10:11:04 +00:00
# How long (in whole seconds) a cached record is trusted before the host is probed again.

# 1 week: how often should we check for OS/hardware changes?
MACHINE_RECHECK_INTERVAL = int(timedelta(weeks=1).total_seconds())
# 1 hour: how often should we check for public IP/private IP/DNS changes?
NETWORK_INTERFACE_RECHECK_INTERVAL = int(timedelta(hours=1).total_seconds())
# 30 min: how often should we check for changes to locally installed binaries?
INSTALLED_BINARY_RECHECK_INTERVAL = int(timedelta(minutes=30).total_seconds())
2024-10-02 04:46:35 +00:00
class MachineManager(models.Manager):
    """Manager that resolves, and memoizes in-process, the Machine row for this host."""

    def current(self) -> 'Machine':
        """Return the Machine record for the host ArchiveBox is running on.

        The last result is kept in the module-level ``_CURRENT_MACHINE`` and
        reused until ``MACHINE_RECHECK_INTERVAL`` seconds have passed since the
        record's ``modified_at``; after that the host is probed again and the
        row is updated (or created) keyed on the host GUID.
        """
        global _CURRENT_MACHINE

        cached = _CURRENT_MACHINE
        if cached:
            fresh_until = cached.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                # Assume the machine can't change *while archivebox is actively
                # running on it*. Swapping hardware under running code is not
                # strictly impossible (e.g. VMWare can live-migrate a VM to a new
                # host), but it is rare enough that we only re-check once a week.
                return cached
            # cached record is too old: drop it before re-probing the host
            _CURRENT_MACHINE = None

        host_guid = get_host_guid()
        host_facts = {
            'hostname': socket.gethostname(),
            **get_os_info(),
            **get_vm_info(),
            'stats': get_host_stats(),
        }
        machine, _created = self.update_or_create(guid=host_guid, defaults=host_facts)
        _CURRENT_MACHINE = machine
        machine.save()  # populate ABID

        return machine
2024-10-02 04:46:35 +00:00
2024-10-03 10:11:04 +00:00
class Machine(ABIDModel, ModelWithHealthStats):
    """Audit log entry for a physical machine that was used to do archiving."""

    # ABID configuration: class-level settings read by ABIDModel (defined outside this file)
    abid_prefix = 'mxn_'
    abid_ts_src = 'self.created_at'
    abid_uri_src = 'self.guid'
    abid_subtype_src = '"01"'
    abid_rand_src = 'self.id'
    abid_drift_allowed = False

    id = models.UUIDField(
        primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID',
    )
    abid = ABIDField(prefix=abid_prefix)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # IMMUTABLE PROPERTIES
    # 64char sha256 hash of machine's unique hardware ID
    guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False)

    # MUTABLE PROPERTIES
    # e.g. somehost.subdomain.example.com
    hostname = models.CharField(max_length=63, default=None, null=False)
    # e.g. False
    hw_in_docker = models.BooleanField(default=False, null=False)
    # e.g. False
    hw_in_vm = models.BooleanField(default=False, null=False)
    # e.g. Apple
    hw_manufacturer = models.CharField(max_length=63, default=None, null=False)
    # e.g. Mac Studio Mac13,1
    hw_product = models.CharField(max_length=63, default=None, null=False)
    # e.g. 39A12B50-...-...-...-...
    hw_uuid = models.CharField(max_length=255, default=None, null=False)
    # e.g. arm64
    os_arch = models.CharField(max_length=15, default=None, null=False)
    # e.g. darwin
    os_family = models.CharField(max_length=15, default=None, null=False)
    # e.g. macOS-14.6.1-arm64-arm-64bit
    os_platform = models.CharField(max_length=63, default=None, null=False)
    # e.g. macOS 14.6.1
    os_release = models.CharField(max_length=63, default=None, null=False)
    # e.g. Darwin Kernel Version 23.6.0: Mon Jul 29 21:14:30 PDT 2024; root:xnu-10063.141.2~1/RELEASE_ARM64_T6000
    os_kernel = models.CharField(max_length=255, default=None, null=False)

    # STATS COUNTERS
    # e.g. {"cpu_load": [1.25, 2.4, 1.4], "mem_swap_used_pct": 56, ...}
    stats = models.JSONField(default=dict, null=False)

    # num_uses_failed = models.PositiveIntegerField(default=0)       # from ModelWithHealthStats
    # num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: MachineManager = MachineManager()

    # annotation only: reverse accessor for the NetworkInterface.machine foreign key
    networkinterface_set: models.Manager['NetworkInterface']
2024-10-03 10:11:04 +00:00
2024-10-02 04:46:35 +00:00
class NetworkInterfaceManager(models.Manager):
    """Manager that resolves, and memoizes in-process, the NetworkInterface row in use."""

    def current(self) -> 'NetworkInterface':
        """Return the NetworkInterface record for the current machine.

        The last result is kept in the module-level ``_CURRENT_INTERFACE`` and
        reused until ``NETWORK_INTERFACE_RECHECK_INTERVAL`` seconds have passed
        since the record's ``modified_at``. After that the host network is
        probed again and a row is updated (or created) keyed on the machine plus
        public IP, local IP, MAC address and DNS server.

        Raises:
            KeyError: if the probed network info lacks one of the four identity keys.
        """
        global _CURRENT_INTERFACE

        cached = _CURRENT_INTERFACE
        if cached:
            # assume the current network interface (public IP, DNS servers, etc.)
            # won't change more than once per hour
            fresh_until = cached.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                return cached
            # cached record is too old: drop it before re-probing the network
            _CURRENT_INTERFACE = None

        machine = Machine.objects.current()
        net_info = get_host_network()

        # Pull the identity fields out of net_info; whatever remains becomes the
        # mutable defaults written onto the matched/created row.
        identity = {
            key: net_info.pop(key)
            for key in ('ip_public', 'ip_local', 'mac_address', 'dns_server')
        }
        interface, _created = self.update_or_create(machine=machine, **identity, defaults=net_info)
        _CURRENT_INTERFACE = interface
        interface.save()  # populate ABID

        return interface
2024-10-03 10:11:04 +00:00
2024-10-02 04:46:35 +00:00
2024-10-03 10:11:04 +00:00
class NetworkInterface(ABIDModel, ModelWithHealthStats):
    """Audit log entry for a physical network interface / internet connection that was used to do archiving."""

    # ABID configuration: class-level settings read by ABIDModel (defined outside this file)
    abid_prefix = 'ixf_'
    abid_ts_src = 'self.machine.created_at'
    abid_uri_src = 'self.machine.guid'
    abid_subtype_src = 'self.iface'
    abid_rand_src = 'self.id'
    abid_drift_allowed = False

    id = models.UUIDField(
        primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID',
    )
    abid = ABIDField(prefix=abid_prefix)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # e.g. Machine(id=...)
    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False)

    # IMMUTABLE PROPERTIES
    # e.g. ab:cd:ef:12:34:56
    mac_address = models.CharField(max_length=17, default=None, null=False, editable=False)
    # e.g. 123.123.123.123 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
    ip_public = models.GenericIPAddressField(default=None, null=False, editable=False)
    # e.g. 192.168.2.18 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
    ip_local = models.GenericIPAddressField(default=None, null=False, editable=False)
    # e.g. 8.8.8.8 or 2001:0db8:85a3:0000:0000:8a2e:0370:7334
    dns_server = models.GenericIPAddressField(default=None, null=False, editable=False)

    # MUTABLE PROPERTIES
    # e.g. somehost.sub.example.com
    hostname = models.CharField(max_length=63, default=None, null=False)
    # e.g. en0
    iface = models.CharField(max_length=15, default=None, null=False)
    # e.g. AS-SONICTELECOM
    isp = models.CharField(max_length=63, default=None, null=False)
    # e.g. Berkeley
    city = models.CharField(max_length=63, default=None, null=False)
    # e.g. California
    region = models.CharField(max_length=63, default=None, null=False)
    # e.g. United States
    country = models.CharField(max_length=63, default=None, null=False)

    # STATS COUNTERS (inherited from ModelWithHealthStats)
    # num_uses_failed = models.PositiveIntegerField(default=0)
    # num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: NetworkInterfaceManager = NetworkInterfaceManager()

    class Meta:
        unique_together = (
            # If *any* of these change, it's considered a different interface:
            # we might get different downloaded content as a result, so this
            # forces us to store an audit trail whenever these things change.
            ('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),
        )
2024-10-03 10:10:22 +00:00
class InstalledBinaryManager(models.Manager):
    """Manager that maps a Binary to its InstalledBinary row, with an in-process cache."""

    def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary':
        """Get or create an InstalledBinary record for a Binary on the local machine.

        Results are memoized per binary name in the module-level
        ``_CURRENT_BINARIES`` dict and reused for up to
        ``INSTALLED_BINARY_RECHECK_INTERVAL`` seconds after the record's
        ``modified_at``.

        Args:
            binary: the Binary to look up. If it has not been loaded yet (no
                abspath/version/sha256) and no fresh cache entry exists, it is
                loaded via ``archivebox.pm.hook.binary_load(..., fresh=True)``.

        Returns:
            The cached or freshly updated/created InstalledBinary record.

        Raises:
            AssertionError: if the binary could not be loaded, or if the saved
                record does not match the loaded binary.
        """
        global _CURRENT_BINARIES

        cached_binary = _CURRENT_BINARIES.get(binary.name)
        if cached_binary:
            expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
            if timezone.now() < expires_at:
                is_loaded = binary.abspath and binary.version and binary.sha256
                if is_loaded:
                    # if the caller already did the (expensive) job of loading the binary from the filesystem
                    # then their in-memory version is certainly more up-to-date than any potential cached version
                    # use this opportunity to invalidate the cache in case anything has changed
                    #
                    # Compare string forms: the record stores str(...) of the loaded values
                    # (see update_or_create below) and the post-save checks also compare via
                    # str(...). Comparing the raw in-memory values against the stored strings
                    # would report "different" whenever the types differ, defeating the cache.
                    is_different_from_cache = (
                        str(binary.abspath) != str(cached_binary.abspath)
                        or str(binary.version) != str(cached_binary.version)
                        or str(binary.sha256) != str(cached_binary.sha256)
                    )
                    if is_different_from_cache:
                        _CURRENT_BINARIES.pop(binary.name)
                    else:
                        return cached_binary
                else:
                    # if they have not yet loaded the binary
                    # but our cache is recent enough and not expired, assume cached version is good enough
                    # it will automatically reload when the cache expires
                    # cached_binary will be stale/bad for up to 30min if binary was updated/removed on host system
                    return cached_binary
            else:
                # cached binary is too old, reload it from scratch
                _CURRENT_BINARIES.pop(binary.name)

        if not binary.abspath or not binary.version or not binary.sha256:
            # if binary was not yet loaded from filesystem, do it now
            # this is expensive, we have to find its abspath, version, and sha256, but it's necessary
            # to make sure we have a good, up-to-date record of it in the DB & in-memory cache
            binary = archivebox.pm.hook.binary_load(binary=binary, fresh=True)

        assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'

        _CURRENT_BINARIES[binary.name], _created = self.update_or_create(
            machine=Machine.objects.current(),
            name=binary.name,
            binprovider=binary.loaded_binprovider.name,
            version=str(binary.loaded_version),
            abspath=str(binary.loaded_abspath),
            sha256=str(binary.loaded_sha256),
        )
        cached_binary = _CURRENT_BINARIES[binary.name]
        cached_binary.save()  # populate ABID

        # if we get this far make sure DB record matches in-memory cache
        assert str(cached_binary.binprovider) == str(binary.loaded_binprovider.name)
        assert str(cached_binary.abspath) == str(binary.loaded_abspath)
        assert str(cached_binary.version) == str(binary.loaded_version)
        assert str(cached_binary.sha256) == str(binary.loaded_sha256)

        return cached_binary
2024-10-02 04:46:35 +00:00
2024-10-03 10:10:22 +00:00
class InstalledBinary(ABIDModel, ModelWithHealthStats):
    """Record of one binary (name, provider, abspath, version, sha256) found on a Machine."""

    # ABID configuration: class-level settings read by ABIDModel (defined outside this file)
    abid_prefix = 'bin_'
    abid_ts_src = 'self.machine.created_at'
    abid_uri_src = 'self.machine.guid'
    abid_subtype_src = 'self.binprovider'
    abid_rand_src = 'self.id'
    abid_drift_allowed = False

    id = models.UUIDField(
        primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID',
    )
    abid = ABIDField(prefix=abid_prefix)
    created_at = AutoDateTimeField(default=None, null=False, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # IMMUTABLE PROPERTIES
    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
    name = models.CharField(max_length=63, default=None, null=False, blank=True)
    binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
    abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
    version = models.CharField(max_length=32, default=None, null=False, blank=True)
    sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)

    # MUTABLE PROPERTIES (TODO)
    # is_pinned = models.BooleanField(default=False)    # i.e. should this binary supersede other binaries with the same name on the host?
    # is_valid = models.BooleanField(default=True)      # i.e. is this binary still available on the host?

    # STATS COUNTERS (inherited from ModelWithHealthStats)
    # num_uses_failed = models.PositiveIntegerField(default=0)
    # num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: InstalledBinaryManager = InstalledBinaryManager()

    class Meta:
        verbose_name = 'Installed Binary'
        verbose_name_plural = 'Installed Binaries'
        unique_together = (
            ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256'),
        )

    def __str__(self) -> str:
        """Render as ``name@binprovider+abspath@version``."""
        return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}'

    def clean(self, *args, **kwargs) -> None:
        """Fill in any missing fields from the host before validation.

        Requires at least one of ``name`` / ``abspath``. Missing ``machine``
        defaults to the current machine; a missing ``binprovider`` is found by
        loading the binary against every known provider; missing ``abspath``,
        ``version`` and ``sha256`` are then queried from that provider.
        """
        assert self.name or self.abspath
        self.name = str(self.name or self.abspath)
        assert self.name

        if not hasattr(self, 'machine'):
            self.machine = Machine.objects.current()

        if not self.binprovider:
            known_providers = list(abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values())
            probe = Binary(name=self.name, binproviders=known_providers)
            loaded = archivebox.pm.hook.binary_load(binary=probe, fresh=True)
            if loaded.loaded_binprovider:
                self.binprovider = loaded.loaded_binprovider.name
            else:
                self.binprovider = None

        # self.BINPROVIDER is only touched when a field is actually missing,
        # because resolving it raises if no plugin defines the provider.
        if not self.abspath:
            self.abspath = self.BINPROVIDER.get_abspath(self.name)
        if not self.version:
            self.version = self.BINPROVIDER.get_version(self.name, abspath=self.abspath)
        if not self.sha256:
            self.sha256 = self.BINPROVIDER.get_sha256(self.name, abspath=self.abspath)

        super().clean(*args, **kwargs)

    @cached_property
    def BINARY(self) -> Binary:
        """The plugin-defined Binary whose name matches this record (first match wins)."""
        candidates = abx.as_dict(archivebox.pm.hook.get_BINARIES()).values()
        match = next((candidate for candidate in candidates if candidate.name == self.name), None)
        if match is None:
            raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
            # TODO: we could technically reconstruct it from scratch, but why would we ever want to do that?
        return match

    @cached_property
    def BINPROVIDER(self) -> BinProvider:
        """The plugin-defined BinProvider whose name matches ``self.binprovider`` (first match wins)."""
        candidates = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values()
        match = next((candidate for candidate in candidates if candidate.name == self.binprovider), None)
        if match is None:
            raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
        return match

    # maybe not a good idea to provide this? Binary in DB is a record of the binary's config
    # whereas a loaded binary is a not-yet saved instance that may not have the same config
    # why would we want to load a binary record from the db when it could be freshly loaded?
    def load_from_db(self) -> Binary:
        """Build a Binary from the plugin definition overlaid with this record's stored values."""
        # TODO: implement defaults arg in pydantic_pkgr
        # return self.BINARY.load(defaults={
        #     'binprovider': self.BINPROVIDER,
        #     'abspath': Path(self.abspath),
        #     'version': self.version,
        #     'sha256': self.sha256,
        # })
        stored_state = {
            **self.BINARY.model_dump(),
            'abspath': self.abspath and Path(self.abspath),
            'version': self.version,
            'sha256': self.sha256,
            'loaded_binprovider': self.BINPROVIDER,
            'binproviders_supported': self.BINARY.binproviders_supported,
            'overrides': self.BINARY.overrides,
        }
        return Binary.model_validate(stored_state)

    def load_fresh(self) -> Binary:
        """Reload the matching plugin Binary from the host via the ``binary_load`` hook (``fresh=True``)."""
        return archivebox.pm.hook.binary_load(binary=self.BINARY, fresh=True)