add new core and crawls statemachine manager

This commit is contained in:
Nick Sweeting 2024-11-03 00:41:11 -07:00
parent 41efd010f0
commit 48f8416762
No known key found for this signature in database
18 changed files with 798 additions and 374 deletions

View file

@ -44,16 +44,17 @@ class ActorType(ABC, Generic[ModelType]):
launch_kwargs: LaunchKwargs = {} launch_kwargs: LaunchKwargs = {}
mode: Literal['thread', 'process'] = 'process' mode: Literal['thread', 'process'] = 'process'
MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8) # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
QUERYSET: ClassVar[QuerySet] # the QuerySet to claim objects from QUERYSET: ClassVar[QuerySet] # the QuerySet to claim objects from
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
CLAIM_FROM_TOP: ClassVar[int] = 50 # the number of objects to consider when atomically getting the next object from the queue CLAIM_FROM_TOP: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10 # the number of objects to consider when atomically getting the next object from the queue
ATOMIC: ClassVar[bool] = True # whether to atomically fetch+claim the nextobject in one step, or fetch and lock it in two steps ATOMIC: ClassVar[bool] = True # whether to atomically fetch+claim the nextobject in one step, or fetch and lock it in two steps
# model_type: Type[ModelType] # model_type: Type[ModelType]
MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8) # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
_SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = [] # record all the pids of Actors spawned by this class _SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = [] # record all the pids of Actors spawned by this class
@ -89,18 +90,19 @@ class ActorType(ABC, Generic[ModelType]):
@classmethod @classmethod
def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]: def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
"""Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors""" """Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
queue_length = queue.count()
if not queue_length: # queue is empty, spawn 0 actors
return []
actors_to_spawn: list[LaunchKwargs] = [] actors_to_spawn: list[LaunchKwargs] = []
max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors) max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
queue_length = queue.count()
# spawning new actors is expensive, avoid spawning all the actors at once. To stagger them, # spawning new actors is expensive, avoid spawning all the actors at once. To stagger them,
# let the next orchestrator tick handle starting another 2 on the next tick() # let the next orchestrator tick handle starting another 2 on the next tick()
# if queue_length > 10: # queue is long, spawn as many as possible # if queue_length > 10: # queue is long, spawn as many as possible
# actors_to_spawn += max_spawnable * [{}] # actors_to_spawn += max_spawnable * [{}]
if not queue_length: # queue is empty, spawn 0 actors if queue_length > 4: # queue is medium, spawn 1 or 2 actors
return actors_to_spawn
elif queue_length > 4: # queue is medium, spawn 1 or 2 actors
actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}] actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}]
else: # queue is short, spawn 1 actor else: # queue is short, spawn 1 actor
actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}] actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}]
@ -144,7 +146,6 @@ class ActorType(ABC, Generic[ModelType]):
# return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot')) # return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot'))
return cls.QUERYSET return cls.QUERYSET
### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process) ### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process)
def runloop(self): def runloop(self):

View file

@ -1,286 +0,0 @@
__package__ = 'archivebox.actors'
import os
import time
from typing import ClassVar, Generic, cast, Literal, Type
from django.utils.functional import classproperty
from rich import print
import psutil
from django import db
from django.db.models import QuerySet
from multiprocessing import Process, cpu_count
from threading import Thread, get_native_id
from crawls.models import Crawl
from .actor import ActorType, LaunchKwargs
class CrawlActor(ActorType[Crawl]):
    """Background actor that claims queued Crawl objects and processes them.

    NOTE(review): this class duplicates the generic ActorType implementation
    almost verbatim. `ModelType` and `abstractmethod` are referenced below but
    do not appear in this file's imports -- confirm they resolve.
    """

    QUERYSET: ClassVar[QuerySet] = Crawl.objects.filter(status='queued')
    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
    CLAIM_SET: ClassVar[str] = 'status = "started"'  # the SET clause to claim the object when atomically getting the next object from the queue
    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'  # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
    CLAIM_FROM_TOP: ClassVar[int] = 50  # the number of objects to consider when atomically getting the next object from the queue

    # model_type: Type[ModelType]
    MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8)  # min 2, max 8, up to 60% of available cpu cores
    MAX_TICK_TIME: ClassVar[int] = 60  # maximum duration in seconds to process a single object

    _SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = []  # record all the pids of Actors spawned by this class

    def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
        # NOTE(review): self.idle_count is read by runloop() but never
        # initialized here -- the first empty-queue iteration would raise
        # AttributeError unless the ActorType base sets it; confirm.
        self.mode = mode or self.mode
        self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)

    @classproperty
    def name(cls) -> str:
        # human-readable actor name, e.g. "CrawlActor"
        return cls.__name__  # type: ignore

    def __str__(self) -> str:
        return self.__repr__()

    def __repr__(self) -> str:
        """CrawlActor[pid=1234]"""
        # rich markup: the escaped bracket renders as a literal "[" in output
        label = 'pid' if self.mode == 'process' else 'tid'
        return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'

    ### Class Methods: Called by Orchestrator on ActorType class before it has been spawned

    @classmethod
    def get_running_actors(cls) -> list[int]:
        """returns a list of pids of all running actors of this type"""
        # WARNING: only works for process actors, not thread actors
        if cls.mode == 'thread':
            raise NotImplementedError('get_running_actors() is not implemented for thread actors')
        return [
            proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
            if proc.is_running() and proc.status() != 'zombie'
        ]

    @classmethod
    def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
        """Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
        actors_to_spawn: list[LaunchKwargs] = []
        max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
        queue_length = queue.count()
        # spawning new actors is expensive, avoid spawning all the actors at once. To stagger them,
        # let the next orchestrator tick handle starting another 2 on the next tick()
        # if queue_length > 10:  # queue is long, spawn as many as possible
        #     actors_to_spawn += max_spawnable * [{}]
        if not queue_length:  # queue is empty, spawn 0 actors
            return actors_to_spawn
        elif queue_length > 4:  # queue is medium, spawn 1 or 2 actors
            actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}]
        else:  # queue is short, spawn 1 actor
            actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}]
        return actors_to_spawn

    @classmethod
    def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
        """Spawn a new actor of this type and return its pid (or native thread id)."""
        if mode == 'thread':
            return cls.fork_actor_as_thread(**launch_kwargs)
        elif mode == 'process':
            return cls.fork_actor_as_process(**launch_kwargs)
        raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')

    @classmethod
    def fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
        """Spawn a new background thread running the actor's runloop"""
        actor = cls(mode='thread', **launch_kwargs)
        bg_actor_thread = Thread(target=actor.runloop)
        bg_actor_thread.start()
        # native_id is only available after the thread has actually started
        assert bg_actor_thread.native_id is not None
        return bg_actor_thread.native_id

    @classmethod
    def fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
        """Spawn a new background process running the actor's runloop"""
        actor = cls(mode='process', **launch_kwargs)
        bg_actor_process = Process(target=actor.runloop)
        bg_actor_process.start()
        assert bg_actor_process.pid is not None
        # track the child so get_running_actors() can report on it later
        cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
        return bg_actor_process.pid

    @classmethod
    def get_model(cls) -> Type[ModelType]:
        # wish this was a @classproperty but Generic[ModelType] return type cant be statically inferred for @classproperty
        # NOTE(review): ModelType is not imported in this file -- the return
        # annotation is evaluated at definition time and would raise NameError
        # unless it resolves from elsewhere; confirm.
        return cls.QUERYSET.model

    @classmethod
    def get_queue(cls) -> QuerySet:
        """override this to provide your queryset as the queue"""
        # return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot'))
        return cls.QUERYSET

    ### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process)

    def runloop(self):
        """The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
        self.on_startup()
        try:
            while True:
                obj_to_process: ModelType | None = None
                try:
                    # NOTE(review): self.atomic is lowercase but only ATOMIC is
                    # defined on this class; if the base class doesn't set
                    # self.atomic, the AttributeError is silently swallowed by
                    # the except below and no work is ever fetched -- confirm.
                    obj_to_process = cast(ModelType, self.get_next(atomic=self.atomic))
                except Exception:
                    pass

                if obj_to_process:
                    self.idle_count = 0  # reset idle count if we got an object
                else:
                    if self.idle_count >= 30:
                        break  # stop looping and exit if queue is empty and we have idled for 30sec
                    else:
                        # print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
                        self.idle_count += 1
                        time.sleep(1)
                        continue

                self.on_tick_start(obj_to_process)

                # Process the object
                try:
                    self.tick(obj_to_process)
                except Exception as err:
                    print(f'[red]🏃‍♂️ ERROR: {self}.tick()[/red]', err)
                    db.connections.close_all()  # always reset the db connection after an exception to clear any pending transactions
                    self.on_tick_exception(obj_to_process, err)
                finally:
                    self.on_tick_end(obj_to_process)

            self.on_shutdown(err=None)
        except BaseException as err:
            if isinstance(err, KeyboardInterrupt):
                print()
            else:
                print(f'\n[red]🏃‍♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
            self.on_shutdown(err=err)

    def get_next(self, atomic: bool | None=None) -> ModelType | None:
        """get the next object from the queue, atomically locking it if self.atomic=True"""
        if atomic is None:
            atomic = self.ATOMIC
        if atomic:
            # fetch and claim the next object from in the queue in one go atomically
            obj = self.get_next_atomic()
        else:
            # two-step claim: fetch the next object and lock it in a separate query
            # NOTE(review): "ojbect" typo in the assertion message below
            # (runtime string, intentionally left unchanged here)
            obj = self.get_queue().last()
            assert obj and self.lock_next(obj), f'Unable to fetch+lock the next {self.get_model().__name__} ojbect from {self}.QUEUE'
        return obj

    def lock_next(self, obj: ModelType) -> bool:
        """override this to implement a custom two-step (non-atomic)lock mechanism"""
        # For example:
        # assert obj._model.objects.filter(pk=obj.pk, status='queued').update(status='started', locked_by=self.pid)
        # Not needed if using get_next_and_lock() to claim the object atomically
        # print(f'[blue]🏃‍♂️ {self}.lock()[/blue]', obj.abid or obj.id)
        return True

    def claim_sql_where(self) -> str:
        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
        return self.CLAIM_WHERE

    def claim_sql_set(self) -> str:
        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
        return self.CLAIM_SET

    def claim_sql_order(self) -> str:
        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
        return self.CLAIM_ORDER

    def claim_from_top(self) -> int:
        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
        return self.CLAIM_FROM_TOP

    def get_next_atomic(self, shallow: bool=True) -> ModelType | None:
        """
        claim a random object from the top n=50 objects in the queue (atomically updates status=queued->started for claimed object)
        optimized for minimizing contention on the queue with other actors selecting from the same list
        slightly faster than claim_any_obj() which selects randomly from the entire queue but needs to know the total count
        """
        Model = self.get_model()  # e.g. ArchiveResult
        table = f'{Model._meta.app_label}_{Model._meta.model_name}'  # e.g. core_archiveresult

        where_sql = self.claim_sql_where()
        set_sql = self.claim_sql_set()
        order_by_sql = self.claim_sql_order()
        choose_from_top = self.claim_from_top()

        with db.connection.cursor() as cursor:
            # subquery gets the pool of the top 50 candidates sorted by sort and order
            # main query selects a random one from that pool
            cursor.execute(f"""
                UPDATE {table}
                SET {set_sql}
                WHERE {where_sql} and id = (
                    SELECT id FROM (
                        SELECT id FROM {table}
                        WHERE {where_sql}
                        ORDER BY {order_by_sql}
                        LIMIT {choose_from_top}
                    ) candidates
                    ORDER BY RANDOM()
                    LIMIT 1
                )
                RETURNING id;
            """)
            result = cursor.fetchone()

            if result is None:
                return None  # If no rows were claimed, return None

            if shallow:
                # shallow: faster, returns potentially incomplete object instance missing some django auto-populated fields:
                columns = [col[0] for col in cursor.description or ['id']]
                return Model(**dict(zip(columns, result)))

            # if not shallow do one extra query to get a more complete object instance (load it fully from scratch)
            return Model.objects.get(id=result[0])

    @abstractmethod
    def tick(self, obj: ModelType) -> None:
        """override this to process the object"""
        # NOTE(review): abstractmethod is not imported here, and marking
        # tick() abstract leaves CrawlActor itself un-instantiable -- confirm.
        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
        # For example:
        # do_some_task(obj)
        # do_something_else(obj)
        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
        raise NotImplementedError('tick() must be implemented by the Actor subclass')

    def on_startup(self) -> None:
        # record our own pid/tid so __repr__ and logs can identify this actor
        if self.mode == 'thread':
            self.pid = get_native_id()  # thread id
            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
        else:
            self.pid = os.getpid()  # process id
            print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
        # abx.pm.hook.on_actor_startup(self)

    def on_shutdown(self, err: BaseException | None=None) -> None:
        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
        # abx.pm.hook.on_actor_shutdown(self)

    def on_tick_start(self, obj: ModelType) -> None:
        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
        pass

    def on_tick_end(self, obj: ModelType) -> None:
        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
        # self.timer.end()
        pass

    def on_tick_exception(self, obj: ModelType, err: BaseException) -> None:
        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)

View file

@ -3,8 +3,7 @@ __package__ = 'archivebox.actors'
import os import os
import time import time
import itertools import itertools
import uuid from typing import Dict, Type, Literal, ClassVar
from typing import Dict, Type, Literal
from django.utils.functional import classproperty from django.utils.functional import classproperty
from multiprocessing import Process, cpu_count from multiprocessing import Process, cpu_count
@ -173,54 +172,36 @@ from django import db
from django.db import connection from django.db import connection
from crawls.actors import CrawlActor
from .actor_snapshot import SnapshotActor
from abx_plugin_singlefile.actors import SinglefileActor
class FaviconActor(ActorType[ArchiveResult]): class FaviconActor(ActorType[ArchiveResult]):
@classmethod CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
def get_queue(cls) -> QuerySet[ArchiveResult]: CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
CLAIM_SET: ClassVar[str] = 'status = "started"'
@classproperty
def QUERYSET(cls) -> QuerySet:
return ArchiveResult.objects.filter(status='failed', extractor='favicon') return ArchiveResult.objects.filter(status='failed', extractor='favicon')
@classmethod
def get_next(cls) -> ArchiveResult | None:
# return cls.get_next_atomic(
# model=ArchiveResult,
# where='status = "failed"',
# set='status = "started"',
# order_by='created_at DESC',
# choose_from_top=cpu_count() * 10,
# )
return cls.get_random(
model=ArchiveResult,
where='status = "failed" AND extractor = "favicon"',
set='status = "queued"',
choose_from_top=50,
)
def tick(self, obj: ArchiveResult): def tick(self, obj: ArchiveResult):
print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count()) print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
updated = ArchiveResult.objects.filter(id=obj.id, status='started').update(status='success') == 1 updated = ArchiveResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
if not updated: if not updated:
raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object') raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
# obj.refresh_from_db() obj.refresh_from_db()
obj.status = 'success' obj.save()
def lock(self, obj: ArchiveResult) -> bool:
"""As an alternative to self.get_next_atomic(), we can use select_for_update() or manually update a semaphore field here"""
locked = ArchiveResult.objects.filter(id=obj.id, status='queued').update(status='started') == 1
if locked:
# obj.refresh_from_db()
obj.status = 'started'
# print(f'FaviconActor[{self.pid}] lock({obj.id}) 🔒')
pass
else:
print(f'FaviconActor[{self.pid}] lock({obj.id}) X')
return locked
class ExtractorsOrchestrator(Orchestrator): class ExtractorsOrchestrator(Orchestrator):
actor_types = { actor_types = {
'CrawlActor': CrawlActor,
'SnapshotActor': SnapshotActor,
'FaviconActor': FaviconActor, 'FaviconActor': FaviconActor,
'SinglefileActor': SinglefileActor,
} }

View file

@ -0,0 +1,286 @@
import os
import time
from datetime import timedelta
from multiprocessing import Process

import psutil
from django.db import models, transaction
from django.db.models import Q
from django.utils import timezone
from statemachine import State, StateMachine
# State Machine Definitions
#################################################
class SnapshotMachine(StateMachine):
    """State machine for managing Snapshot lifecycle.

    Lifecycle: queued -> started -> sealed. Wraps a Snapshot model instance
    and mutates it via the transition hooks below.
    """

    # States
    queued = State(initial=True)
    started = State()
    sealed = State(final=True)

    # Transitions
    start = queued.to(started, cond='can_start')
    seal = started.to(sealed, cond='is_finished')

    # Events
    # tick is the idempotent "advance if possible" event: it self-loops until
    # a guard allows progress, so it is safe to fire repeatedly from a poller
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished') |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, snapshot):
        # the Snapshot model instance this machine drives
        self.snapshot = snapshot
        super().__init__()

    def can_start(self):
        # always eligible for now; placeholder for future preconditions
        return True

    def is_finished(self):
        # finished once every child ArchiveResult has reached a terminal state
        return not self.snapshot.has_pending_archiveresults()

    def before_start(self):
        """Pre-start validation and setup."""
        # NOTE(review): before_start/after_start are hooks on the `start`
        # *event*; confirm they also fire when `tick` moves queued->started --
        # if not, state-entry hooks (on_enter_started) may be needed.
        self.snapshot.cleanup_dir()

    def after_start(self):
        """Post-start side effects."""
        self.snapshot.create_pending_archiveresults()
        self.snapshot.update_indices()
        self.snapshot.bump_retry_at(seconds=10)

    def before_seal(self):
        """Pre-seal validation and cleanup."""
        self.snapshot.cleanup_dir()

    def after_seal(self):
        """Post-seal actions."""
        self.snapshot.update_indices()
        self.snapshot.seal_dir()
        self.snapshot.upload_dir()
        # clear retry_at so the orchestrator stops ticking this snapshot
        self.snapshot.retry_at = None
        self.snapshot.save()
class ArchiveResultMachine(StateMachine):
    """State machine for managing ArchiveResult lifecycle.

    Lifecycle: queued -> started -> succeeded, or started -> backoff ->
    queued (retry) until max_retries is exhausted, then backoff -> failed.
    """

    # States
    queued = State(initial=True)
    started = State()
    succeeded = State(final=True)
    backoff = State()
    failed = State(final=True)

    # Transitions
    # NOTE(fix): the standalone backoff transition is named mark_backoff so it
    # does not rebind (and clobber) the `backoff` State attribute above; the
    # original `backoff = started.to(backoff, ...)` destroyed the state
    # binding that the retry/fail declarations below depend on.
    start = queued.to(started, cond='can_start')
    succeed = started.to(succeeded, cond='extractor_succeeded')
    mark_backoff = started.to(backoff, unless='extractor_succeeded')
    retry = backoff.to(queued, cond='can_retry')
    fail = backoff.to(failed, unless='can_retry')

    # Events
    # tick is the idempotent "advance if possible" event fired by the poller
    tick = (
        queued.to.itself(unless='can_start') |
        queued.to(started, cond='can_start') |
        started.to.itself(cond='extractor_still_running') |
        started.to(succeeded, cond='extractor_succeeded') |
        started.to(backoff, unless='extractor_succeeded') |
        backoff.to.itself(cond='still_waiting_to_retry') |
        backoff.to(queued, cond='can_retry') |
        backoff.to(failed, unless='can_retry')
    )

    def __init__(self, archiveresult):
        # the ArchiveResult model instance this machine drives
        self.archiveresult = archiveresult
        super().__init__()

    def can_start(self):
        # always eligible for now; placeholder for future preconditions
        return True

    def extractor_still_running(self):
        # placeholder heuristic: treat the extractor as running for the first
        # 5 seconds after start_ts (fix: time.now() does not exist; use
        # timezone-aware now())
        return self.archiveresult.start_ts > timezone.now() - timedelta(seconds=5)

    def extractor_succeeded(self):
        # placeholder heuristic until real success detection is wired up:
        # return check_if_extractor_succeeded(self.archiveresult)
        return self.archiveresult.start_ts < timezone.now() - timedelta(seconds=5)

    def still_waiting_to_retry(self):
        # NOTE(fix): this guard is referenced by the tick event but was never
        # defined; hold in backoff until the scheduled retry_at has passed
        return self.archiveresult.retry_at is not None and self.archiveresult.retry_at > timezone.now()

    def can_retry(self):
        return self.archiveresult.retries < self.archiveresult.max_retries

    def before_start(self):
        """Pre-start initialization."""
        self.archiveresult.retries += 1
        self.archiveresult.start_ts = timezone.now()
        self.archiveresult.output = None
        self.archiveresult.error = None

    def after_start(self):
        """Post-start execution."""
        self.archiveresult.bump_retry_at(seconds=self.archiveresult.timeout + 5)
        execute_extractor(self.archiveresult)
        self.archiveresult.snapshot.bump_retry_at(seconds=5)

    def before_succeed(self):
        """Pre-success validation."""
        self.archiveresult.output = get_archiveresult_output(self.archiveresult)

    def after_succeed(self):
        """Post-success cleanup."""
        self.archiveresult.end_ts = timezone.now()
        self.archiveresult.retry_at = None
        self.archiveresult.update_indices()

    def on_enter_backoff(self):
        """Capture the error and schedule the retry whenever backoff is entered.

        NOTE(fix): declared as a state-entry hook rather than before_backoff/
        after_backoff event hooks, so it also fires when the `tick` event moves
        started -> backoff (the path the orchestrator actually drives).
        """
        self.archiveresult.error = get_archiveresult_error(self.archiveresult)
        self.archiveresult.end_ts = timezone.now()
        # exponential-ish backoff: wait longer after each failed attempt
        self.archiveresult.bump_retry_at(
            seconds=self.archiveresult.timeout * self.archiveresult.retries
        )
        self.archiveresult.update_indices()

    def before_fail(self):
        """Pre-failure finalization."""
        self.archiveresult.retry_at = None

    def after_fail(self):
        """Post-failure cleanup."""
        self.archiveresult.update_indices()
# Models
#################################################
class Snapshot(models.Model):
    """Minimal Snapshot model whose lifecycle is driven by SnapshotMachine."""

    status = models.CharField(max_length=32, default='queued')
    retry_at = models.DateTimeField(null=True)  # when the orchestrator should next tick this object

    @property
    def sm(self):
        """Get the state machine for this snapshot."""
        return SnapshotMachine(self)

    def get_machine(self):
        """Build a fresh state machine bound to this row.

        Added for consistency with ArchiveResult.get_machine(), so callers
        (e.g. BaseActor.tick) can treat both models uniformly.
        """
        return SnapshotMachine(self)

    def has_pending_archiveresults(self):
        # pending = any child ArchiveResult not yet in a terminal state
        return self.archiveresult_set.exclude(
            status__in=['succeeded', 'failed']
        ).exists()

    def bump_retry_at(self, seconds):
        """Schedule the next tick attempt `seconds` from now.

        Fix: the original used time.now(), which does not exist in the time
        module; use django's timezone-aware now() instead.
        """
        self.retry_at = timezone.now() + timedelta(seconds=seconds)
        self.save()

    def cleanup_dir(self):
        cleanup_snapshot_dir(self)

    def create_pending_archiveresults(self):
        create_snapshot_pending_archiveresults(self)

    def update_indices(self):
        update_snapshot_index_json(self)
        update_snapshot_index_html(self)

    def seal_dir(self):
        seal_snapshot_dir(self)

    def upload_dir(self):
        upload_snapshot_dir(self)
class ArchiveResult(models.Model):
    """Minimal ArchiveResult model driven by ArchiveResultMachine."""

    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
    status = models.CharField(max_length=32, default='queued')
    retry_at = models.DateTimeField(null=True)  # when the orchestrator should next tick this object
    retries = models.IntegerField(default=0)    # attempts made so far
    max_retries = models.IntegerField(default=3)
    timeout = models.IntegerField(default=60)   # extractor timeout in seconds
    start_ts = models.DateTimeField(null=True)
    end_ts = models.DateTimeField(null=True)
    output = models.TextField(null=True)
    error = models.TextField(null=True)

    def get_machine(self):
        """Build a fresh state machine bound to this row."""
        return ArchiveResultMachine(self)

    def bump_retry_at(self, seconds):
        """Schedule the next tick attempt `seconds` from now.

        Fix: the original used time.now(), which does not exist in the time
        module; use django's timezone-aware now() instead.
        """
        self.retry_at = timezone.now() + timedelta(seconds=seconds)
        self.save()

    def update_indices(self):
        update_archiveresult_index_json(self)
        update_archiveresult_index_html(self)
# Actor System
#################################################
class BaseActor:
    """Processes a single object by advancing its state machine one step."""

    MAX_TICK_TIME = 60  # max seconds a single tick is expected to take

    def tick(self, obj):
        """Process a single object through its state machine.

        Fixes vs the original hand-rolled dispatch: Snapshot exposes its
        machine as the `sm` property (not get_machine()), and neither machine
        defines can_seal()/is_queued/etc., so the original if/elif chain
        raised AttributeError. Both machines already encode their per-state
        guards in the `tick` event, so we simply delegate to it.
        """
        machine = obj.get_machine() if hasattr(obj, 'get_machine') else obj.sm
        machine.tick()
class Orchestrator:
    """Main orchestrator that manages all actors."""

    def __init__(self):
        # set once runloop() starts executing inside the child process
        self.pid = None

    @classmethod
    def spawn(cls):
        """Fork a new orchestrator process and return its pid."""
        orchestrator = cls()
        proc = Process(target=orchestrator.runloop)
        proc.start()
        return proc.pid

    def runloop(self):
        """Poll the Snapshot and ArchiveResult queues until interrupted."""
        self.pid = os.getpid()
        # NOTE(review): abx is not imported in this file -- confirm it is
        # available at runtime
        abx.pm.hook.on_orchestrator_startup(self)
        try:
            while True:
                self.process_queue(Snapshot)
                self.process_queue(ArchiveResult)
                time.sleep(0.1)
        except (KeyboardInterrupt, SystemExit):
            abx.pm.hook.on_orchestrator_shutdown(self)

    def process_queue(self, model):
        """Tick every `model` row whose retry_at is unset or already reached."""
        # fix: time.now() does not exist in the time module; use django's
        # timezone-aware now() for comparison against the DateTimeField
        retry_at_reached = Q(retry_at__isnull=True) | Q(retry_at__lte=timezone.now())
        queue = model.objects.filter(retry_at_reached)
        if queue.exists():
            actor = BaseActor()
            for obj in queue:
                try:
                    # one transaction per object so a single failure does not
                    # roll back the rest of the batch
                    with transaction.atomic():
                        actor.tick(obj)
                except Exception as e:
                    abx.pm.hook.on_actor_tick_exception(actor, obj, e)
# Periodic Tasks
#################################################
@djhuey.periodic_task(schedule=djhuey.crontab(minute='*'))
def ensure_orchestrator_running():
    """Ensure orchestrator is running, start if not (checked once a minute).

    Fix: the original called proc.name() directly inside a generator, which
    can raise NoSuchProcess/AccessDenied/ZombieProcess for processes that
    exit or change mid-iteration; process_iter(['name']) pre-fetches the name
    and tolerates those races.
    """
    # NOTE(review): matching on the OS process name is fragile -- a
    # multiprocessing child is typically named after the python binary, not
    # 'Orchestrator'; confirm, or track the orchestrator pid explicitly.
    for proc in psutil.process_iter(['name']):
        if (proc.info.get('name') or '').startswith('Orchestrator'):
            return
    Orchestrator.spawn()

73
archivebox/core/actors.py Normal file
View file

@ -0,0 +1,73 @@
__package__ = 'archivebox.core'
from typing import ClassVar
from rich import print
from django.db.models import QuerySet
from django.utils import timezone
from datetime import timedelta
from core.models import Snapshot
from actors.actor import ActorType
class SnapshotActor(ActorType[Snapshot]):
    """Actor that atomically claims queued Snapshots and processes them."""

    QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
    CLAIM_SET: ClassVar[str] = 'status = "started"'  # the SET clause to claim the object when atomically getting the next object from the queue
    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'  # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
    CLAIM_FROM_TOP: ClassVar[int] = 50  # the number of objects to consider when atomically getting the next object from the queue

    # model_type: Type[ModelType]
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4  # max number of SnapshotActors to run at once
    MAX_TICK_TIME: ClassVar[int] = 60  # maximum duration in seconds to process a single object

    def claim_sql_where(self) -> str:
        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
        return self.CLAIM_WHERE

    def claim_sql_set(self) -> str:
        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
        retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
        # format as 2024-10-31 10:14:33.240903
        retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
        # fix: the datetime must be quoted as a SQL string literal -- without
        # the quotes the generated UPDATE statement is a syntax error
        return f"{self.CLAIM_SET}, retry_at = '{retry_at_str}'"

    def claim_sql_order(self) -> str:
        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
        return self.CLAIM_ORDER

    def claim_from_top(self) -> int:
        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
        return self.CLAIM_FROM_TOP

    def tick(self, obj: Snapshot) -> None:
        """override this to process the object"""
        # currently a stub: logs the claimed object and does no real work yet
        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
        # For example:
        # do_some_task(obj)
        # do_something_else(obj)
        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
        # raise NotImplementedError('tick() must be implemented by the Actor subclass')

    def on_shutdown(self, err: BaseException | None=None) -> None:
        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
        # abx.pm.hook.on_actor_shutdown(self)

    def on_tick_start(self, obj: Snapshot) -> None:
        # print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix='      ')
        pass

    def on_tick_end(self, obj: Snapshot) -> None:
        # print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
        # self.timer.end()
        pass

    def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)

View file

@ -8,21 +8,25 @@ import os
import json import json
from pathlib import Path from pathlib import Path
from datetime import timedelta
from django.db import models from django.db import models
from django.utils.functional import cached_property from django.utils.functional import cached_property
from django.utils.text import slugify from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache from django.core.cache import cache
from django.urls import reverse, reverse_lazy from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField from django.db.models import Case, When, Value, IntegerField
from django.contrib import admin from django.contrib import admin
from django.conf import settings from django.conf import settings
from statemachine.mixins import MachineMixin
from archivebox.config import CONSTANTS from archivebox.config import CONSTANTS
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from queues.tasks import bg_archive_snapshot from queues.tasks import bg_archive_snapshot
# from crawls.models import Crawl from crawls.models import Crawl
# from machine.models import Machine, NetworkInterface # from machine.models import Machine, NetworkInterface
from archivebox.misc.system import get_dir_size from archivebox.misc.system import get_dir_size
@ -152,7 +156,7 @@ class SnapshotManager(models.Manager):
return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct() return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel): class Snapshot(ABIDModel, MachineMixin):
abid_prefix = 'snp_' abid_prefix = 'snp_'
abid_ts_src = 'self.created_at' abid_ts_src = 'self.created_at'
abid_uri_src = 'self.url' abid_uri_src = 'self.url'
@ -160,6 +164,17 @@ class Snapshot(ABIDModel):
abid_rand_src = 'self.id' abid_rand_src = 'self.id'
abid_drift_allowed = True abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_attr = 'sm'
class SnapshotStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SEALED = 'sealed', 'Sealed'
status = models.CharField(max_length=15, default=SnapshotStatus.QUEUED, null=False, blank=False)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix) abid = ABIDField(prefix=abid_prefix)
@ -171,7 +186,7 @@ class Snapshot(ABIDModel):
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True) bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
# crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set') crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
url = models.URLField(unique=True, db_index=True) url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
@ -396,6 +411,25 @@ class Snapshot(ABIDModel):
tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk) tags_id.append(Tag.objects.get_or_create(name=tag)[0].pk)
self.tags.clear() self.tags.clear()
self.tags.add(*tags_id) self.tags.add(*tags_id)
def has_pending_archiveresults(self) -> bool:
    """Return True while any of this snapshot's ArchiveResults are still queued or started."""
    incomplete = (ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED)
    return self.archiveresult_set.filter(status__in=incomplete).exists()
def create_pending_archiveresults(self) -> list['ArchiveResult']:
    """Ensure one queued ArchiveResult exists per extractor for this snapshot.

    Idempotent: keyed on (snapshot, extractor) via get_or_create, so
    re-running never duplicates rows. Returns all matching ArchiveResults,
    whether pre-existing or newly created.
    """
    archiveresults = []
    # NOTE(review): EXTRACTORS is not among this file's visible imports —
    # confirm where it is defined (presumably the extractor plugin registry).
    for extractor in EXTRACTORS:
        archiveresult, _created = ArchiveResult.objects.get_or_create(
            snapshot=self,
            extractor=extractor,
            status=ArchiveResult.ArchiveResultStatus.QUEUED,
        )
        archiveresults.append(archiveresult)
    return archiveresults
def bump_retry_at(self, seconds: int = 10):
    """Push retry_at `seconds` into the future.

    Does NOT persist the change — callers are expected to .save() afterwards
    (unlike Crawl.bump_retry_at, which saves immediately).
    """
    self.retry_at = timezone.now() + timedelta(seconds=seconds)
# def get_storage_dir(self, create=True, symlink=True) -> Path: # def get_storage_dir(self, create=True, symlink=True) -> Path:
@ -452,6 +486,20 @@ class ArchiveResult(ABIDModel):
abid_subtype_src = 'self.extractor' abid_subtype_src = 'self.extractor'
abid_rand_src = 'self.id' abid_rand_src = 'self.id'
abid_drift_allowed = True abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_attr = 'sm'
class ArchiveResultStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
BACKOFF = 'backoff', 'Waiting to retry'
status = models.CharField(max_length=15, choices=ArchiveResultStatus.choices, default=ArchiveResultStatus.QUEUED, null=False, blank=False)
EXTRACTOR_CHOICES = ( EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'), ('htmltotext', 'htmltotext'),
@ -469,11 +517,7 @@ class ArchiveResult(ABIDModel):
('title', 'title'), ('title', 'title'),
('wget', 'wget'), ('wget', 'wget'),
) )
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
("skipped", "skipped")
]
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix) abid = ABIDField(prefix=abid_prefix)
@ -491,7 +535,6 @@ class ArchiveResult(ABIDModel):
output = models.CharField(max_length=1024) output = models.CharField(max_length=1024)
start_ts = models.DateTimeField(db_index=True) start_ts = models.DateTimeField(db_index=True)
end_ts = models.DateTimeField() end_ts = models.DateTimeField()
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
# the network interface that was used to download this result # the network interface that was used to download this result
# uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used') # uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
@ -552,7 +595,15 @@ class ArchiveResult(ABIDModel):
return link.canonical_outputs().get(f'{self.extractor}_path') return link.canonical_outputs().get(f'{self.extractor}_path')
def output_exists(self) -> bool: def output_exists(self) -> bool:
return os.access(self.output_path(), os.R_OK) return os.path.exists(self.output_path())
def bump_retry_at(self, seconds: int = 10):
    """Push retry_at `seconds` into the future.

    Does NOT persist the change — callers are expected to .save() afterwards.
    """
    self.retry_at = timezone.now() + timedelta(seconds=seconds)
def create_output_dir(self):
    """Create the snapshot's output directory (if missing) and return this
    extractor's output path inside it.

    NOTE(review): only the snapshot-level directory is created; the returned
    extractor subpath itself is not mkdir'ed — confirm callers expect that.
    Assumes self.snapshot_dir is a Path defined elsewhere on this class — verify.
    """
    snap_dir = self.snapshot_dir
    snap_dir.mkdir(parents=True, exist_ok=True)
    return snap_dir / self.output_path()
# def get_storage_dir(self, create=True, symlink=True): # def get_storage_dir(self, create=True, symlink=True):

View file

@ -64,7 +64,8 @@ INSTALLED_APPS = [
# 'abid_utils', # handles ABID ID creation, handling, and models # 'abid_utils', # handles ABID ID creation, handling, and models
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes 'actors', # handles starting and managing background workers and processes (orchestrators and actors)
'queues', # handles starting and managing background workers and processes (supervisord)
'seeds', # handles Seed model and URL source management 'seeds', # handles Seed model and URL source management
'crawls', # handles Crawl and CrawlSchedule models and management 'crawls', # handles Crawl and CrawlSchedule models and management
'personas', # handles Persona and session management 'personas', # handles Persona and session management

View file

@ -0,0 +1,115 @@
__package__ = 'archivebox.snapshots'
from django.utils import timezone
from statemachine import State, StateMachine
from core.models import Snapshot, ArchiveResult
# State Machine Definitions
#################################################
class SnapshotMachine(StateMachine, strict_states=True):
    """State machine for managing the Snapshot lifecycle.

    queued -> started -> sealed

    Driven by repeated tick() events fired by an actor/orchestrator; each
    on_<state> hook persists that transition's side effects on the model.
    """

    model: Snapshot

    # States (values mirror the persisted Snapshot.status choices)
    queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
    started = State(value=Snapshot.SnapshotStatus.STARTED)
    sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)

    # Tick Event: internal self-transitions hold the current state until
    # the guard condition for advancing becomes true.
    tick = (
        queued.to.itself(unless='can_start', internal=True) |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished', internal=True) |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, snapshot, *args, **kwargs):
        # Keep a handle on the bound model instance for use in guards/hooks.
        self.snapshot = snapshot
        super().__init__(snapshot, *args, **kwargs)

    def can_start(self) -> bool:
        # BUGFIX: Snapshot has no `seed` attribute (the seed lives on the
        # Crawl), so `self.snapshot.seed` raised AttributeError on every tick.
        # A snapshot is startable once it has a URL to archive.
        return bool(self.snapshot.url)

    def is_finished(self) -> bool:
        # Sealed once no ArchiveResults remain queued/started.
        return not self.snapshot.has_pending_archiveresults()

    def on_started(self):
        # Fan out one queued ArchiveResult per extractor, then give them
        # 60s of headroom before this snapshot is re-checked.
        self.snapshot.create_pending_archiveresults()
        self.snapshot.bump_retry_at(seconds=60)
        self.snapshot.save()

    def on_sealed(self):
        # Terminal state: clear retry_at so actors stop polling this snapshot.
        self.snapshot.retry_at = None
        self.snapshot.save()
class ArchiveResultMachine(StateMachine, strict_states=True):
    """State machine for managing ArchiveResult lifecycle.

    queued -> started -> (succeeded | failed | backoff), where backoff can
    re-enter started or resolve directly to succeeded/failed.
    """

    model: ArchiveResult

    # States (values mirror the persisted ArchiveResult.status choices)
    queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
    started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
    backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
    succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
    failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)

    # Tick Event: guards are evaluated in declaration order.
    # NOTE(review): is_failed is the exact complement of is_succeeded, so a
    # started result always resolves to succeeded or failed on its next tick,
    # which makes the started->backoff arc look unreachable as written —
    # confirm the intended retry/backoff semantics.
    tick = (
        queued.to.itself(unless='can_start', internal=True) |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished', internal=True) |
        started.to(succeeded, cond='is_succeeded') |
        started.to(failed, cond='is_failed') |
        started.to(backoff, cond='is_backoff') |
        backoff.to.itself(unless='can_start', internal=True) |
        backoff.to(started, cond='can_start') |
        backoff.to(succeeded, cond='is_succeeded') |
        backoff.to(failed, cond='is_failed')
    )

    def __init__(self, archiveresult, *args, **kwargs):
        # Keep a handle on the bound model instance for use in guards/hooks.
        self.archiveresult = archiveresult
        super().__init__(archiveresult, *args, **kwargs)

    def can_start(self) -> bool:
        # Only start once the parent snapshot itself has started.
        # assumes Snapshot exposes is_started() (via its own machine/mixin) — verify.
        return self.archiveresult.snapshot and self.archiveresult.snapshot.is_started()

    def is_succeeded(self) -> bool:
        # Success is inferred purely from the extractor's output existing on disk.
        return self.archiveresult.output_exists()

    def is_failed(self) -> bool:
        # Exact complement of is_succeeded (see NOTE on the tick event above).
        return not self.archiveresult.output_exists()

    def is_backoff(self) -> bool:
        # NOTE(review): compares the already-persisted status field, which only
        # equals BACKOFF after on_backoff has run — verify this guard can ever
        # be true while the machine is still in `started`.
        return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF

    def on_started(self):
        # Record start time, prepare the output dir, and schedule a re-check.
        self.archiveresult.start_ts = timezone.now()
        self.archiveresult.create_output_dir()
        self.archiveresult.bump_retry_at(seconds=60)
        self.archiveresult.save()

    def on_backoff(self):
        # Delay the next attempt by pushing retry_at into the future.
        self.archiveresult.bump_retry_at(seconds=60)
        self.archiveresult.save()

    def on_succeeded(self):
        # Stamp completion time; status is written by the state machine itself.
        self.archiveresult.end_ts = timezone.now()
        self.archiveresult.save()

    def on_failed(self):
        # Stamp completion time; status is written by the state machine itself.
        self.archiveresult.end_ts = timezone.now()
        self.archiveresult.save()

    def after_transition(self, event: str, source: State, target: State):
        # Debug trace of every transition; the return value appears unused by
        # the library — presumably informational only, verify.
        print(f"after '{event}' from '{source.id}' to '{target.id}'")
        # self.archiveresult.save_merkle_index()
        # self.archiveresult.save_html_index()
        # self.archiveresult.save_json_index()
        return "after_transition"

View file

@ -0,0 +1,69 @@
__package__ = 'archivebox.crawls'
from typing import ClassVar
from rich import print
from django.db.models import QuerySet
from crawls.models import Crawl
from actors.actor import ActorType
class CrawlActor(ActorType[Crawl]):
    """Long-running worker that claims queued Crawl rows and processes them one at a time."""

    QUERYSET: ClassVar[QuerySet] = Crawl.objects.filter(status='queued')

    # SQL fragments used by the atomic claim step (see ActorType for the defaults)
    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'    # which rows are claimable
    CLAIM_SET: ClassVar[str] = 'status = "started"'     # applied to the row we claim
    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'      # claim priority ordering
    CLAIM_FROM_TOP: ClassVar[int] = 50                  # how deep into the queue to look per claim

    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4   # hard cap on parallel CrawlActor workers
    MAX_TICK_TIME: ClassVar[int] = 60          # seconds allowed to process a single object

    def claim_sql_where(self) -> str:
        """WHERE clause for the atomic claim step; override for custom filtering."""
        return self.CLAIM_WHERE

    def claim_sql_set(self) -> str:
        """SET clause that marks the claimed row as taken; override to customize."""
        return self.CLAIM_SET

    def claim_sql_order(self) -> str:
        """ORDER BY clause deciding which queued row is claimed first; override to customize."""
        return self.CLAIM_ORDER

    def claim_from_top(self) -> int:
        """Number of queue entries considered per atomic claim attempt; override to customize."""
        return self.CLAIM_FROM_TOP

    def tick(self, obj: Crawl) -> None:
        """Process one claimed Crawl — subclasses put the real work here."""
        print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)

    def on_shutdown(self, err: BaseException | None=None) -> None:
        """Called once as the actor exits, with the exception that killed it (if any)."""
        print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')

    def on_tick_start(self, obj: Crawl) -> None:
        """Hook fired just before tick(); no-op by default."""
        pass

    def on_tick_end(self, obj: Crawl) -> None:
        """Hook fired just after tick() completes; no-op by default."""
        pass

    def on_tick_exception(self, obj: Crawl, err: BaseException) -> None:
        """Hook fired when tick() raises; reports the failure and returns."""
        print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)

View file

@ -1,13 +1,20 @@
__package__ = 'archivebox.crawls' __package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
from django_stubs_ext.db.models import TypedModelMeta from django_stubs_ext.db.models import TypedModelMeta
from datetime import timedelta
from django.db import models from django.db import models
from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings from django.conf import settings
from django.utils import timezone
from django.urls import reverse_lazy from django.urls import reverse_lazy
from django.utils import timezone
from statemachine.mixins import MachineMixin
if TYPE_CHECKING:
from core.models import Snapshot
from seeds.models import Seed from seeds.models import Seed
@ -41,8 +48,9 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
"""The base crawl that each new scheduled job should copy as a template""" """The base crawl that each new scheduled job should copy as a template"""
return self.crawl_set.first() return self.crawl_set.first()
class Crawl(ABIDModel, ModelWithHealthStats): class Crawl(ABIDModel, ModelWithHealthStats, MachineMixin):
""" """
A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak. A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
@ -55,16 +63,29 @@ class Crawl(ABIDModel, ModelWithHealthStats):
abid_prefix = 'crl_' abid_prefix = 'crl_'
abid_ts_src = 'self.created_at' abid_ts_src = 'self.created_at'
abid_uri_src = 'self.seed.uri' abid_uri_src = 'self.seed.uri'
abid_subtype_src = 'self.persona_id' abid_subtype_src = 'self.persona'
abid_rand_src = 'self.id' abid_rand_src = 'self.id'
abid_drift_allowed = True abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'crawls.statemachines.CrawlMachine'
state_machine_attr = 'sm'
bind_events_as_methods = True
class CrawlStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SEALED = 'sealed', 'Sealed'
status = models.CharField(choices=CrawlStatus.choices, max_length=15, default=CrawlStatus.QUEUED, null=False, blank=False)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix) abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set') created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
created_at = AutoDateTimeField(default=None, null=False, db_index=True) created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True) modified_at = models.DateTimeField(auto_now=True)
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False) seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
@ -79,7 +100,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
# schedule = models.JSONField() # schedule = models.JSONField()
# config = models.JSONField() # config = models.JSONField()
# snapshot_set: models.Manager['Snapshot'] snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta): class Meta(TypedModelMeta):
@ -102,6 +123,28 @@ class Crawl(ABIDModel, ModelWithHealthStats):
@property @property
def api_docs_url(self) -> str: def api_docs_url(self) -> str:
return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl' return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
def has_pending_archiveresults(self) -> bool:
    """True while any ArchiveResult under this crawl's snapshots is still queued or started."""
    from core.models import ArchiveResult
    incomplete = (ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED)
    return ArchiveResult.objects.filter(
        snapshot_id__in=self.snapshot_set.values_list('id', flat=True),
        status__in=incomplete,
    ).exists()
def create_root_snapshot(self) -> 'Snapshot':
    """Get or create the depth-0 Snapshot for this crawl's seed URI.

    Idempotent: keyed on (crawl, url) via get_or_create, so repeated ticks
    never spawn duplicate root snapshots.
    """
    from core.models import Snapshot  # local import avoids a circular core<->crawls import
    root_snapshot, _ = Snapshot.objects.get_or_create(
        crawl=self,
        url=self.seed.uri,
    )
    return root_snapshot
def bump_retry_at(self, seconds: int = 10):
    """Push retry_at `seconds` into the future and persist immediately.

    NOTE(review): a retry_at field is not visible on Crawl in this view —
    presumably declared on a base model; verify it exists.
    """
    self.retry_at = timezone.now() + timedelta(seconds=seconds)
    self.save()
class Outlink(models.Model): class Outlink(models.Model):

View file

@ -0,0 +1,48 @@
__package__ = 'archivebox.crawls'
from statemachine import State, StateMachine
from crawls.models import Crawl
# State Machine Definitions
#################################################
class CrawlMachine(StateMachine, strict_states=True):
    """State machine driving a Crawl through queued -> started -> sealed."""

    model: Crawl

    # States (values mirror the persisted Crawl.status choices)
    queued = State(value=Crawl.CrawlStatus.QUEUED, initial=True)
    started = State(value=Crawl.CrawlStatus.STARTED)
    sealed = State(value=Crawl.CrawlStatus.SEALED, final=True)

    # Tick Event: internal self-transitions hold the current state until
    # the guard condition for advancing is satisfied.
    tick = (
        queued.to.itself(unless='can_start', internal=True) |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished', internal=True) |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, crawl, *args, **kwargs):
        # Keep a handle on the bound model instance for use in guards/hooks.
        self.crawl = crawl
        super().__init__(crawl, *args, **kwargs)

    def can_start(self) -> bool:
        """A crawl may start once it has a seed with a source URI."""
        seed = self.crawl.seed
        return seed and seed.uri

    def is_finished(self) -> bool:
        """Finished when no ArchiveResults under this crawl remain pending."""
        return not self.crawl.has_pending_archiveresults()

    def on_started(self):
        """Entering started: spawn the root snapshot and schedule a re-check."""
        self.crawl.create_root_snapshot()
        self.crawl.bump_retry_at(seconds=10)
        self.crawl.save()

    def on_sealed(self):
        """Entering sealed: clear retry_at so actors stop polling this crawl."""
        self.crawl.retry_at = None
        self.crawl.save()

View file

@ -29,7 +29,7 @@ def get_EXTRACTORS():
'singlefile': SINGLEFILE_EXTRACTOR, 'singlefile': SINGLEFILE_EXTRACTOR,
} }
# @abx.hookimpl @abx.hookimpl
# def get_INSTALLED_APPS(): def get_INSTALLED_APPS():
# # needed to load ./models.py # needed to load ./models.py
# return [__package__] return [__package__]

View file

@ -0,0 +1,27 @@
__package__ = 'abx_plugin_singlefile'
from typing import ClassVar
from django.db.models import QuerySet
from django.utils.functional import classproperty
from actors.actor import ActorType
from .models import SinglefileResult
class SinglefileActor(ActorType[SinglefileResult]):
    """Actor that claims queued singlefile results and processes them one at a time."""

    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
    # BUGFIX: this actor handles the *singlefile* extractor, but the claim
    # filter was copy-pasted from a favicon actor and matched
    # extractor = "favicon", so it could never claim its own rows.
    CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "singlefile"'
    CLAIM_SET: ClassVar[str] = 'status = "started"'

    @classproperty
    def QUERYSET(cls) -> QuerySet:
        """Queue of claimable objects: all results still in the queued state."""
        return SinglefileResult.objects.filter(status='queued')

    def tick(self, obj: SinglefileResult):
        """Process one claimed SinglefileResult and mark it done.

        The conditional UPDATE (id + status='started') acts as an optimistic
        lock: if another actor already mutated the row, we fail loudly rather
        than silently double-processing.
        """
        print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
        # NOTE(review): writes status='success', but the ArchiveResult status
        # choices elsewhere in this codebase use 'succeeded' — confirm which
        # value SinglefileResult actually declares.
        updated = SinglefileResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
        if not updated:
            raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
        obj.refresh_from_db()
        obj.save()

View file

@ -20,6 +20,17 @@ from django.urls import reverse_lazy
from pathlib import Path from pathlib import Path
# Glossary:
# - startup: when a new process is spawned
# - shutdown: when a process is exiting
# - start: at the beginning of some python code block
# - end: at the end of some python code block
# - queue: a django queryset of objects of a single type that are waiting to be processed
# - actor: a long-running daemon process that wakes up and processes a single object from a queue at a time
# - plugin: a python package that defines some hookimpls based on hookspecs exposed by ABX
# - object: an instance of a django model that represents a single row in the database
# ORCHESTRATOR: # ORCHESTRATOR:
# An orchestrator is a single long-running daemon process that manages spawning and killing actors for different queues of objects. # An orchestrator is a single long-running daemon process that manages spawning and killing actors for different queues of objects.
# The orchestrator first starts when the archivebox starts, and it stops when archivebox is killed. # The orchestrator first starts when the archivebox starts, and it stops when archivebox is killed.
@ -74,8 +85,8 @@ from pathlib import Path
# On startup an actor should fire abx.pm.hook.on_actor_startup(object) and on exit it should fire abx.pm.hook.on_actor_exit(object) (both syncronous hooks that can be used by plugins to register any startup/cleanup code). # On startup an actor should fire abx.pm.hook.on_actor_startup(object) and on exit it should fire abx.pm.hook.on_actor_exit(object) (both syncronous hooks that can be used by plugins to register any startup/cleanup code).
# An ActorType defines the following hookspecs for plugins to hook into its behavior: # An ActorType defines the following hookspecs for plugins to hook into its behavior:
# - abx.pm.hook.on_actor_startup(actor, queue) # - abx.pm.hook.on_actor_startup(actor, queue)
# - abx.pm.hook.on_actor_tick_started(actor, object) # - abx.pm.hook.on_actor_tick_start(actor, object)
# - abx.pm.hook.on_actor_tick_finished(actor, object) # - abx.pm.hook.on_actor_tick_end(actor, object)
# - abx.pm.hook.on_actor_tick_exception(actor, object, exception) # - abx.pm.hook.on_actor_tick_exception(actor, object, exception)
# - abx.pm.hook.on_actor_shutdown(actor) # - abx.pm.hook.on_actor_shutdown(actor)
@ -107,8 +118,8 @@ from pathlib import Path
# - external API calls (e.g. uploading to s3, firing a webhook, writing to a logfile, etc.) # - external API calls (e.g. uploading to s3, firing a webhook, writing to a logfile, etc.)
# - DO NOT use side effects to directly mutate other objects state or trigger other state transitions # - DO NOT use side effects to directly mutate other objects state or trigger other state transitions
# ABX defines the following hookspecs for plugins to hook into transition behavior: # ABX defines the following hookspecs for plugins to hook into transition behavior:
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_started(object) # - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_start(object)
# - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_succeeded(object) # - abx.pm.hook.on_transition_<objecttype>_from_abx_to_xyz_end(object)
# READ: # READ:
# A read() method is a function defined for a given ActorType that performs a single read from the DB and/or other read models like django cache, filesystem, in-memory caches, etc. # A read() method is a function defined for a given ActorType that performs a single read from the DB and/or other read models like django cache, filesystem, in-memory caches, etc.

View file

@ -1,19 +1,8 @@
__package__ = 'archivebox.seeds' __package__ = 'archivebox.seeds'
from datetime import datetime
from django_stubs_ext.db.models import TypedModelMeta
from django.db import models from django.db import models
from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings from django.conf import settings
from django.utils import timezone
from django.utils.functional import cached_property
from django.urls import reverse_lazy
from pathlib import Path
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
@ -47,7 +36,10 @@ class Seed(ABIDModel, ModelWithHealthStats):
abid_rand_src = 'self.id' abid_rand_src = 'self.id'
abid_drift_allowed = True abid_drift_allowed = True
uri = models.URLField(max_length=255, blank=False, null=False, unique=True) # unique source location where URLs will be loaded from id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
uri = models.URLField(max_length=2000, blank=False, null=False) # unique source location where URLs will be loaded from
extractor = models.CharField(default='auto', max_length=32) # suggested extractor to use to load this URL source extractor = models.CharField(default='auto', max_length=32) # suggested extractor to use to load this URL source
tags_str = models.CharField(max_length=255, null=False, blank=True, default='') # tags to attach to any URLs that come from this source tags_str = models.CharField(max_length=255, null=False, blank=True, default='') # tags to attach to any URLs that come from this source
@ -64,4 +56,10 @@ class Seed(ABIDModel, ModelWithHealthStats):
# pocketapi:// # pocketapi://
# s3:// # s3://
# etc.. # etc..
return self.uri.split('://')[0].lower() return self.uri.split('://', 1)[0].lower()
class Meta:
verbose_name = 'Seed'
verbose_name_plural = 'Seeds'
unique_together = (('created_by', 'uri', 'extractor'),)

View file

@ -61,7 +61,7 @@ dependencies = [
"pluggy>=1.5.0", "pluggy>=1.5.0",
"requests>=2.32.3", "requests>=2.32.3",
"dateparser>=1.2.0", "dateparser>=1.2.0",
"tzdata>=2024.2", # needed for dateparser {TZ: UTC} on some systems: https://github.com/ArchiveBox/ArchiveBox/issues/1553 "tzdata>=2024.2", # needed for dateparser {TZ: UTC} on some systems: https://github.com/ArchiveBox/ArchiveBox/issues/1553
"feedparser>=6.0.11", "feedparser>=6.0.11",
"w3lib>=2.2.1", "w3lib>=2.2.1",
"rich>=13.8.0", "rich>=13.8.0",
@ -86,40 +86,35 @@ dependencies = [
"yt-dlp>=2024.8.6", # for: media" "yt-dlp>=2024.8.6", # for: media"
############# Plugin Dependencies ################ ############# Plugin Dependencies ################
"abx>=0.1.0", "abx>=0.1.0",
"abx-spec-pydantic-pkgr>=0.1.0", "abx-spec-pydantic-pkgr>=0.1.0",
"abx-spec-config>=0.1.0", "abx-spec-config>=0.1.0",
"abx-spec-archivebox>=0.1.0", "abx-spec-archivebox>=0.1.0",
"abx-spec-django>=0.1.0", "abx-spec-django>=0.1.0",
"abx-spec-extractor>=0.1.0", "abx-spec-extractor>=0.1.0",
"abx-spec-searchbackend>=0.1.0", "abx-spec-searchbackend>=0.1.0",
"abx-plugin-default-binproviders>=2024.10.24", "abx-plugin-default-binproviders>=2024.10.24",
"abx-plugin-pip>=2024.10.24", "abx-plugin-pip>=2024.10.24",
"abx-plugin-npm>=2024.10.24", "abx-plugin-npm>=2024.10.24",
"abx-plugin-playwright>=2024.10.24", "abx-plugin-playwright>=2024.10.24",
"abx-plugin-puppeteer>=2024.10.28", "abx-plugin-puppeteer>=2024.10.28",
"abx-plugin-ripgrep-search>=2024.10.28", "abx-plugin-ripgrep-search>=2024.10.28",
"abx-plugin-sqlitefts-search>=2024.10.28", "abx-plugin-sqlitefts-search>=2024.10.28",
"abx-plugin-sonic-search>=2024.10.28", "abx-plugin-sonic-search>=2024.10.28",
"abx-plugin-ldap-auth>=2024.10.28", "abx-plugin-ldap-auth>=2024.10.28",
"abx-plugin-curl>=2024.10.27", "abx-plugin-curl>=2024.10.27",
"abx-plugin-wget>=2024.10.28", "abx-plugin-wget>=2024.10.28",
"abx-plugin-git>=2024.10.28", "abx-plugin-git>=2024.10.28",
"abx-plugin-chrome>=2024.10.28", "abx-plugin-chrome>=2024.10.28",
"abx-plugin-ytdlp>=2024.10.28", "abx-plugin-ytdlp>=2024.10.28",
"abx-plugin-title>=2024.10.27", "abx-plugin-title>=2024.10.27",
"abx-plugin-favicon>=2024.10.27", "abx-plugin-favicon>=2024.10.27",
# "abx-plugin-headers>=2024.10.27", # "abx-plugin-headers>=2024.10.27",
"abx-plugin-archivedotorg>=2024.10.28", "abx-plugin-archivedotorg>=2024.10.28",
"abx-plugin-singlefile>=2024.10.28", "abx-plugin-singlefile>=2024.10.28",
"abx-plugin-readability>=2024.10.28", "abx-plugin-readability>=2024.10.28",
"abx-plugin-mercury>=2024.10.28", "abx-plugin-mercury>=2024.10.28",
"abx-plugin-htmltotext>=2024.10.28", "abx-plugin-htmltotext>=2024.10.28",
"python-statemachine>=2.3.6",
] ]
[project.optional-dependencies] [project.optional-dependencies]

11
uv.lock
View file

@ -661,6 +661,7 @@ dependencies = [
{ name = "pydantic-settings" }, { name = "pydantic-settings" },
{ name = "python-benedict", extra = ["io", "parse"] }, { name = "python-benedict", extra = ["io", "parse"] },
{ name = "python-crontab" }, { name = "python-crontab" },
{ name = "python-statemachine" },
{ name = "requests" }, { name = "requests" },
{ name = "rich" }, { name = "rich" },
{ name = "rich-argparse" }, { name = "rich-argparse" },
@ -789,6 +790,7 @@ requires-dist = [
{ name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" }, { name = "python-benedict", extras = ["io", "parse"], specifier = ">=0.33.2" },
{ name = "python-crontab", specifier = ">=3.2.0" }, { name = "python-crontab", specifier = ">=3.2.0" },
{ name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" }, { name = "python-ldap", marker = "extra == 'ldap'", specifier = ">=3.4.3" },
{ name = "python-statemachine", specifier = ">=2.3.6" },
{ name = "requests", specifier = ">=2.32.3" }, { name = "requests", specifier = ">=2.32.3" },
{ name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" }, { name = "requests-tracker", marker = "extra == 'debug'", specifier = ">=0.3.3" },
{ name = "rich", specifier = ">=13.8.0" }, { name = "rich", specifier = ">=13.8.0" },
@ -2729,6 +2731,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051 }, { url = "https://files.pythonhosted.org/packages/a4/62/02da182e544a51a5c3ccf4b03ab79df279f9c60c5e82d5e8bec7ca26ac11/python_slugify-8.0.4-py2.py3-none-any.whl", hash = "sha256:276540b79961052b66b7d116620b36518847f52d5fd9e3a70164fc8c50faa6b8", size = 10051 },
] ]
[[package]]
name = "python-statemachine"
version = "2.3.6"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/20/c9/7034a362ce151f9fa0ead5630727a16122f7a5ed235d42447910dff95b6a/python_statemachine-2.3.6.tar.gz", hash = "sha256:9cb4040ca7f2158d3cd46f36a77b420b6ef95a90223928a7f3cab232a70bd560", size = 36735 }
wheels = [
{ url = "https://files.pythonhosted.org/packages/39/04/19a44b549cbaae1ac6c2acc58afb96b71209da866713877f40aab2f45de6/python_statemachine-2.3.6-py3-none-any.whl", hash = "sha256:0001b02cbe2f5b2420c423b5b3e3a33915447ac6d9735219c929e2378d454f5f", size = 41529 },
]
[[package]] [[package]]
name = "python-stdnum" name = "python-stdnum"
version = "1.20" version = "1.20"