From 210fd935d7a0dfcffbd862bdf27c61884e40f484 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Sat, 16 Nov 2024 06:42:04 -0800 Subject: [PATCH] make orchestrator run as long as any tasks are pending --- README.md | 2 +- archivebox/__init__.py | 6 +++--- archivebox/actors/__init__.py | 2 ++ archivebox/actors/actor.py | 13 +++++++++---- archivebox/actors/orchestrator.py | 25 ++++++++++++++++++------- 5 files changed, 33 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index c331ea10..d22ef95c 100644 --- a/README.md +++ b/README.md @@ -798,7 +798,7 @@ ArchiveBox bundles industry-standard tools like [Google Chrome](https://github.c
  • Database: Django ORM saving to SQLite3 ./data/index.sqlite
  • Job Queue: Huey using ./data/queue.sqlite3 under supervisord
  • Build/test/lint: pdm / mypy+pyright+pytest / ruff
  • -
  • Subdependencies: pydantic-pkgr installs apt/brew/pip/npm pkgs at runtime (e.g. yt-dlp, singlefile, readability, git)
  • +
  • Subdependencies: abx-pkg installs apt/brew/pip/npm pkgs at runtime (e.g. yt-dlp, singlefile, readability, git)
  • diff --git a/archivebox/__init__.py b/archivebox/__init__.py index 511a2fc7..6b301e1f 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -55,15 +55,15 @@ load_vendored_pkgs() import abx # noqa import abx_spec_archivebox # noqa import abx_spec_config # noqa -import abx_spec_pydantic_pkgr # noqa +import abx_spec_abx_pkg # noqa import abx_spec_django # noqa import abx_spec_searchbackend # noqa abx.pm.add_hookspecs(abx_spec_config.PLUGIN_SPEC) abx.pm.register(abx_spec_config.PLUGIN_SPEC()) -abx.pm.add_hookspecs(abx_spec_pydantic_pkgr.PLUGIN_SPEC) -abx.pm.register(abx_spec_pydantic_pkgr.PLUGIN_SPEC()) +abx.pm.add_hookspecs(abx_spec_abx_pkg.PLUGIN_SPEC) +abx.pm.register(abx_spec_abx_pkg.PLUGIN_SPEC()) abx.pm.add_hookspecs(abx_spec_django.PLUGIN_SPEC) abx.pm.register(abx_spec_django.PLUGIN_SPEC()) diff --git a/archivebox/actors/__init__.py b/archivebox/actors/__init__.py index e69de29b..211642ad 100644 --- a/archivebox/actors/__init__.py +++ b/archivebox/actors/__init__.py @@ -0,0 +1,2 @@ +__package__ = 'archivebox.actors' +__order__ = 100 diff --git a/archivebox/actors/actor.py b/archivebox/actors/actor.py index 8dac8e44..d99f4f85 100644 --- a/archivebox/actors/actor.py +++ b/archivebox/actors/actor.py @@ -465,7 +465,7 @@ class ActorType(Generic[ModelType]): def tick(self, obj_to_process: ModelType) -> None: """Call the object.sm.tick() method to process the object""" - print(f'[blue]🏃‍♂️ {self}.tick()[/blue] {obj_to_process}') + print(f'[blue]🏃‍♂️ {self}.tick()[/blue] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}') # get the StateMachine instance from the object obj_statemachine = self._get_state_machine_instance(obj_to_process) @@ -500,17 +500,22 @@ class ActorType(Generic[ModelType]): # abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error) def on_tick_start(self, obj_to_process: ModelType) -> None: - print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process}') + print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}') # abx.pm.hook.on_actor_tick_start(actor=self, obj_to_process=obj) # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ') + pass def on_tick_end(self, obj_to_process: ModelType) -> None: - print(f'🏃‍♂️ {self}.on_tick_end() {obj_to_process}') + print(f'🏃‍♂️ {self}.on_tick_end() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}') # abx.pm.hook.on_actor_tick_end(actor=self, obj_to_process=obj_to_process) # self.timer.end() + pass + + # import ipdb; ipdb.set_trace() + def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None: - print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red] {obj_to_process}: [red]{type(error).__name__}: {error}[/red]') + print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}: [red]{type(error).__name__}: {error}[/red]') # abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error) diff --git a/archivebox/actors/orchestrator.py b/archivebox/actors/orchestrator.py index 0e39c947..c3297c70 100644 --- a/archivebox/actors/orchestrator.py +++ b/archivebox/actors/orchestrator.py @@ -97,6 +97,15 @@ class Orchestrator: orphaned_objects.extend(model.objects.filter(retry_at__lt=timezone.now()).exclude(id__in=all_queued_ids)) return orphaned_objects + @classmethod + def has_future_objects(cls, all_queues) -> bool: + # returns a list of objects that are in the queues of all actor types but not in the queues of any other actor types + + return any( + queue.filter(retry_at__gt=timezone.now()).exists() + for queue in all_queues.values() + ) + def on_startup(self): if self.mode == 'thread': self.pid = get_native_id() @@ -111,8 +120,8 @@ class Orchestrator: # abx.pm.hook.on_orchestrator_shutdown(self) def on_tick_started(self, all_queues): - # total_pending = sum(queue.count() for queue in all_queues.values()) - # print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}') + total_pending = sum(queue.count() for queue in all_queues.values()) + print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}') # abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues) pass @@ -123,15 +132,15 @@ class Orchestrator: # abx.pm.hook.on_orchestrator_tick_finished(self, actor_types, all_queues) def on_idle(self, all_queues): - # print(f'👨‍✈️ {self}.on_idle()') + print(f'👨‍✈️ {self}.on_idle()', f'idle_count={self.idle_count}') # abx.pm.hook.on_orchestrator_idle(self) # check for orphaned objects left behind if self.idle_count == 60: orphaned_objects = self.get_orphaned_objects(all_queues) if orphaned_objects: - print('[red]👨‍✈️ WARNING: some objects may not be processed, no actor has claimed them after 60s:[/red]', orphaned_objects) - if self.idle_count > 5 and self.exit_on_idle: - raise KeyboardInterrupt('Orchestrator has no more tasks to process, exiting') + print('[red]👨‍✈️ WARNING: some objects may not be processed, no actor has claimed them after 30s:[/red]', orphaned_objects) + if self.idle_count > 3 and self.exit_on_idle and not self.has_future_objects(all_queues): + raise KeyboardInterrupt('✅ All tasks completed, exiting') def runloop(self): from archivebox.config.django import setup_django @@ -153,6 +162,8 @@ class Orchestrator: all_spawned_actors = [] for actor_type, queue in all_queues.items(): + next_obj = queue.first() + print(f'🏃‍♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj else "None"}') try: existing_actors = actor_type.get_running_actors() all_existing_actors.extend(existing_actors) @@ -168,7 +179,7 @@ class Orchestrator: if not any(queue.exists() for queue in all_queues.values()): self.on_idle(all_queues) self.idle_count += 1 - time.sleep(1) + time.sleep(0.5) else: self.idle_count = 0