Improve state machine logging and the `archivebox update` CLI command

This commit is contained in:
Nick Sweeting 2024-11-19 03:31:05 -08:00
parent c9a05c9d94
commit 2595139180
No known key found for this signature in database
6 changed files with 134 additions and 167 deletions

View file

@ -1,13 +1,13 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
__package__ = 'archivebox.cli' __package__ = 'archivebox.cli'
__command__ = 'archivebox update'
import sys
import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring import rich_click as click
from typing import Iterable
from archivebox.misc.util import enforce_types, docstring
from archivebox.index import ( from archivebox.index import (
LINK_FILTERS, LINK_FILTERS,
get_indexed_folders, get_indexed_folders,
@ -21,8 +21,66 @@ from archivebox.index import (
get_corrupted_folders, get_corrupted_folders,
get_unrecognized_folders, get_unrecognized_folders,
) )
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# from ..main import update
@enforce_types
def update(filter_patterns: Iterable[str]=(),
           only_new: bool=False,
           index_only: bool=False,
           resume: float | None=None,
           overwrite: bool=False,
           before: float | None=None,
           after: float | None=None,
           status: str='indexed',
           filter_type: str='exact',
           extract: str="") -> None:
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # NOTE(review): every parameter above is currently accepted but unused —
    # the body only boots Django and starts the orchestrator, which processes
    # whatever work is already queued. TODO confirm the filter/status/extract
    # options are meant to be wired through to the orchestrator later.

    # Django must be configured before workers can touch the ORM.
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator
    # exit_on_idle=False: keep running until interrupted, not until the queue drains.
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.start()
# Click-based CLI wrapper for `archivebox update`. Each option name maps 1:1
# onto a keyword parameter of update() above, so the collected kwargs can be
# forwarded directly.
@click.command()
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
@click.option('--status', type=click.Choice([
        'indexed', 'archived', 'unarchived',
        'present', 'valid', 'invalid',
        'duplicate', 'orphaned', 'corrupted', 'unrecognized'
    ]), default='indexed', help=f'''
Update only links or data directories that have the given status:
indexed {get_indexed_folders.__doc__} (the default)
archived {get_archived_folders.__doc__}
unarchived {get_unarchived_folders.__doc__}
present {get_present_folders.__doc__}
valid {get_valid_folders.__doc__}
invalid {get_invalid_folders.__doc__}
duplicate {get_duplicate_folders.__doc__}
orphaned {get_orphaned_folders.__doc__}
corrupted {get_corrupted_folders.__doc__}
unrecognized {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # kwargs keys match update()'s keyword parameters exactly; forward as-is.
    update(**kwargs)


if __name__ == '__main__':
    main()
@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
# # Step 4: Re-write links index with updated titles, icons, and resources # # Step 4: Re-write links index with updated titles, icons, and resources
# all_links = load_main_index(out_dir=out_dir) # all_links = load_main_index(out_dir=out_dir)
# return all_links # return all_links
# ---------------------------------------------------------------------------
# NOTE(review): legacy argparse-based implementation of `archivebox update`.
# The parsed arguments are currently ignored — update() takes no parameters
# and the fully-wired call is commented out at the bottom of main().
# ---------------------------------------------------------------------------

def update():
    """Import any new links from subscriptions and retry any previously failed/skipped links"""
    # Django must be configured before workers can touch the ORM.
    from archivebox.config.django import setup_django
    setup_django()

    from workers.orchestrator import Orchestrator
    # exit_on_idle=False: keep running until interrupted, not until the queue drains.
    orchestrator = Orchestrator(exit_on_idle=False)
    orchestrator.start()


@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Build the argparse CLI for `archivebox update`.
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--only-new', #'-n',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    parser.add_argument(
        '--index-only', #'-o',
        action='store_true',
        help="Update the main index without archiving any content",
    )
    parser.add_argument(
        '--resume', #'-r',
        type=float,
        help='Resume the update process from a given timestamp',
        default=None,
    )
    parser.add_argument(
        '--overwrite', #'-x',
        action='store_true',
        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="Update only links bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="Update only links bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--status',
        type=str,
        choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
        default='indexed',
        help=(
            'Update only links or data directories that have the given status\n'
            f' indexed {get_indexed_folders.__doc__} (the default)\n'
            f' archived {get_archived_folders.__doc__}\n'
            f' unarchived {get_unarchived_folders.__doc__}\n'
            '\n'
            f' present {get_present_folders.__doc__}\n'
            f' valid {get_valid_folders.__doc__}\n'
            f' invalid {get_invalid_folders.__doc__}\n'
            '\n'
            f' duplicate {get_duplicate_folders.__doc__}\n'
            f' orphaned {get_orphaned_folders.__doc__}\n'
            f' corrupted {get_corrupted_folders.__doc__}\n'
            f' unrecognized {get_unrecognized_folders.__doc__}\n'
        )
    )
    parser.add_argument(
        '--filter-type', '-t',
        type=str,
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='Update only URLs matching these filter patterns.'
    )
    parser.add_argument(
        "--extract",
        type=str,
        help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
        default=""
    )
    command = parser.parse_args(args or ())

    filter_patterns_str = None
    if not command.filter_patterns:
        # No patterns given on the CLI -> try to read them from stdin (if piped).
        filter_patterns_str = accept_stdin(stdin)

    # NOTE(review): all parsed options are discarded here; see commented call below.
    update()
    # update(
    #     resume=command.resume,
    #     only_new=command.only_new,
    #     index_only=command.index_only,
    #     overwrite=command.overwrite,
    #     filter_patterns_str=filter_patterns_str,
    #     filter_patterns=command.filter_patterns,
    #     filter_type=command.filter_type,
    #     status=command.status,
    #     after=command.after,
    #     before=command.before,
    #     out_dir=Path(pwd) if pwd else DATA_DIR,
    #     extractors=command.extract,
    # )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -1,11 +1,14 @@
__package__ = 'archivebox.core' __package__ = 'archivebox.core'
import time import time
import os
from datetime import timedelta from datetime import timedelta
from typing import ClassVar from typing import ClassVar
from django.utils import timezone from django.utils import timezone
from rich import print
from statemachine import State, StateMachine from statemachine import State, StateMachine
from workers.actor import ActorType from workers.actor import ActorType
@ -39,10 +42,16 @@ class SnapshotMachine(StateMachine, strict_states=True):
self.snapshot = snapshot self.snapshot = snapshot
super().__init__(snapshot, *args, **kwargs) super().__init__(snapshot, *args, **kwargs)
def __repr__(self) -> str:
    """Rich-markup label for log lines: snapshot ABID, worker pid, and current status."""
    abid = self.snapshot.ABID
    current_status = self.snapshot.status.upper()
    return (
        f'[grey53]Snapshot\\[{abid}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] '
        f'[blue]{current_status}[/blue] ⚙️ [grey37]Machine[/grey37]'
    )

def __str__(self) -> str:
    """Same rich-markup label as __repr__ (used directly in print() f-strings)."""
    return repr(self)
def can_start(self) -> bool: def can_start(self) -> bool:
can_start = bool(self.snapshot.url) can_start = bool(self.snapshot.url)
if not can_start: if not can_start:
print(f'SnapshotMachine[{self.snapshot.ABID}].can_start() False: {self.snapshot.url} {self.snapshot.retry_at} {timezone.now()}') print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
return can_start return can_start
def is_finished(self) -> bool: def is_finished(self) -> bool:
@ -57,12 +66,12 @@ class SnapshotMachine(StateMachine, strict_states=True):
# otherwise archiveresults exist and are all finished, so it's finished # otherwise archiveresults exist and are all finished, so it's finished
return True return True
def on_transition(self, event, state): # def on_transition(self, event, state):
print(f'SnapshotMachine[{self.snapshot.ABID}].on_transition() {event} -> {state}') # print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
@queued.enter @queued.enter
def enter_queued(self): def enter_queued(self):
print(f'SnapshotMachine[{self.snapshot.ABID}].on_queued(): snapshot.retry_at = now()') print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
self.snapshot.update_for_workers( self.snapshot.update_for_workers(
retry_at=timezone.now(), retry_at=timezone.now(),
status=Snapshot.StatusChoices.QUEUED, status=Snapshot.StatusChoices.QUEUED,
@ -70,7 +79,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
@started.enter @started.enter
def enter_started(self): def enter_started(self):
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)') print(f'{self}.on_started() ↳ snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
# lock the snapshot while we create the pending archiveresults # lock the snapshot while we create the pending archiveresults
self.snapshot.update_for_workers( self.snapshot.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
@ -86,7 +95,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
@sealed.enter @sealed.enter
def enter_sealed(self): def enter_sealed(self):
print(f'SnapshotMachine[{self.snapshot.ABID}].on_sealed(): snapshot.retry_at=None') print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
self.snapshot.update_for_workers( self.snapshot.update_for_workers(
retry_at=None, retry_at=None,
status=Snapshot.StatusChoices.SEALED, status=Snapshot.StatusChoices.SEALED,
@ -144,11 +153,17 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
def __init__(self, archiveresult, *args, **kwargs): def __init__(self, archiveresult, *args, **kwargs):
self.archiveresult = archiveresult self.archiveresult = archiveresult
super().__init__(archiveresult, *args, **kwargs) super().__init__(archiveresult, *args, **kwargs)
def __repr__(self) -> str:
    """Rich-markup label for log lines: archiveresult ABID, worker pid, and current status."""
    abid = self.archiveresult.ABID
    current_status = self.archiveresult.status.upper()
    return (
        f'[grey53]ArchiveResult\\[{abid}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] '
        f'[blue]{current_status}[/blue] ⚙️ [grey37]Machine[/grey37]'
    )

def __str__(self) -> str:
    """Same rich-markup label as __repr__ (used directly in print() f-strings)."""
    return repr(self)
def can_start(self) -> bool: def can_start(self) -> bool:
can_start = bool(self.archiveresult.snapshot.url) can_start = bool(self.archiveresult.snapshot.url)
if not can_start: if not can_start:
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].can_start() False: {self.archiveresult.snapshot.url} {self.archiveresult.retry_at} {timezone.now()}') print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
return can_start return can_start
def is_succeeded(self) -> bool: def is_succeeded(self) -> bool:
@ -172,7 +187,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@queued.enter @queued.enter
def enter_queued(self): def enter_queued(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()') print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
self.archiveresult.update_for_workers( self.archiveresult.update_for_workers(
retry_at=timezone.now(), retry_at=timezone.now(),
status=ArchiveResult.StatusChoices.QUEUED, status=ArchiveResult.StatusChoices.QUEUED,
@ -181,7 +196,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@started.enter @started.enter
def enter_started(self): def enter_started(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)') print(f'{self}.on_started() ↳ archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
# lock the object for the next 30sec # lock the object for the next 30sec
self.archiveresult.update_for_workers( self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=30), retry_at=timezone.now() + timedelta(seconds=30),
@ -205,7 +220,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@backoff.enter @backoff.enter
def enter_backoff(self): def enter_backoff(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None') print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
self.archiveresult.update_for_workers( self.archiveresult.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=60), retry_at=timezone.now() + timedelta(seconds=60),
status=ArchiveResult.StatusChoices.BACKOFF, status=ArchiveResult.StatusChoices.BACKOFF,
@ -216,7 +231,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@succeeded.enter @succeeded.enter
def enter_succeeded(self): def enter_succeeded(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()') print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
self.archiveresult.update_for_workers( self.archiveresult.update_for_workers(
retry_at=None, retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED, status=ArchiveResult.StatusChoices.SUCCEEDED,
@ -227,7 +242,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
@failed.enter @failed.enter
def enter_failed(self): def enter_failed(self):
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()') print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
self.archiveresult.update_for_workers( self.archiveresult.update_for_workers(
retry_at=None, retry_at=None,
status=ArchiveResult.StatusChoices.FAILED, status=ArchiveResult.StatusChoices.FAILED,

View file

@ -1,9 +1,12 @@
__package__ = 'archivebox.crawls' __package__ = 'archivebox.crawls'
import os
from typing import ClassVar from typing import ClassVar
from datetime import timedelta from datetime import timedelta
from django.utils import timezone from django.utils import timezone
from rich import print
from statemachine import State, StateMachine from statemachine import State, StateMachine
from workers.actor import ActorType from workers.actor import ActorType
@ -31,6 +34,12 @@ class CrawlMachine(StateMachine, strict_states=True):
def __init__(self, crawl, *args, **kwargs): def __init__(self, crawl, *args, **kwargs):
self.crawl = crawl self.crawl = crawl
super().__init__(crawl, *args, **kwargs) super().__init__(crawl, *args, **kwargs)
def __repr__(self) -> str:
    """Rich-markup label for log lines: crawl ABID, worker pid, and current status."""
    abid = self.crawl.ABID
    current_status = self.crawl.status.upper()
    return (
        f'[grey53]Crawl\\[{abid}] 🏃‍♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] '
        f'[blue]{current_status}[/blue] ⚙️ [grey37]Machine[/grey37]'
    )

def __str__(self) -> str:
    """Same rich-markup label as __repr__ (used directly in print() f-strings)."""
    return repr(self)
def can_start(self) -> bool: def can_start(self) -> bool:
return bool(self.crawl.seed and self.crawl.seed.uri) return bool(self.crawl.seed and self.crawl.seed.uri)
@ -64,7 +73,7 @@ class CrawlMachine(StateMachine, strict_states=True):
@started.enter @started.enter
def enter_started(self): def enter_started(self):
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)') print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
# lock the crawl object for 2s while we create the root snapshot # lock the crawl object for 2s while we create the root snapshot
self.crawl.update_for_workers( self.crawl.update_for_workers(
retry_at=timezone.now() + timedelta(seconds=5), retry_at=timezone.now() + timedelta(seconds=5),
@ -80,7 +89,7 @@ class CrawlMachine(StateMachine, strict_states=True):
@sealed.enter @sealed.enter
def enter_sealed(self): def enter_sealed(self):
print(f'CrawlMachine[{self.crawl.ABID}].on_sealed(): crawl.retry_at=None') print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
self.crawl.update_for_workers( self.crawl.update_for_workers(
retry_at=None, retry_at=None,
status=Crawl.StatusChoices.SEALED, status=Crawl.StatusChoices.SEALED,

View file

@ -468,7 +468,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(' {}'.format(' '.join(filter_patterns or ()))) print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links): def log_list_finished(links):
from ..index.csv import links_to_csv from archivebox.index.csv import links_to_csv
print() print()
print('---------------------------------------------------------------------------------------------------') print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')) print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
@ -492,7 +492,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
if not yes: if not yes:
print() print()
print('[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]') print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
try: try:
assert input(' y/[n]: ').lower() == 'y' assert input(' y/[n]: ').lower() == 'y'
except (KeyboardInterrupt, EOFError, AssertionError): except (KeyboardInterrupt, EOFError, AssertionError):

View file

@ -110,7 +110,8 @@ class ActorType(Generic[ModelType]):
def __repr__(self) -> str: def __repr__(self) -> str:
"""-> FaviconActor[pid=1234]""" """-> FaviconActor[pid=1234]"""
label = 'pid' if self.mode == 'process' else 'tid' label = 'pid' if self.mode == 'process' else 'tid'
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]' # return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
return f'[underline]Worker[/underline]\\[{label}={self.pid}]'
@staticmethod @staticmethod
def _state_to_str(state: ObjectState) -> str: def _state_to_str(state: ObjectState) -> str:
@ -210,6 +211,10 @@ class ActorType(Generic[ModelType]):
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid)) cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
return bg_actor_process.pid return bg_actor_process.pid
@classmethod
def _obj_repr(cls, obj: ModelType | Any) -> str:
    """Get a string representation of the given django Model instance"""
    # Rich markup: dim grey "<ModelName>[<ABID>]" tag, bracket escaped with a backslash.
    model_name = type(obj).__name__
    return '[grey53]{}\\[{}][/grey53]'.format(model_name, obj.ABID)
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned ### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
@ -328,7 +333,7 @@ class ActorType(Generic[ModelType]):
if self.idle_count >= 3: if self.idle_count >= 3:
break # stop looping and exit if queue is empty and we have idled for 30sec break # stop looping and exit if queue is empty and we have idled for 30sec
else: else:
print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...') # print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
self.idle_count += 1 self.idle_count += 1
time.sleep(1) time.sleep(1)
continue continue
@ -339,7 +344,7 @@ class ActorType(Generic[ModelType]):
self.tick(obj_to_process) self.tick(obj_to_process)
except Exception as err: except Exception as err:
last_error = err last_error = err
print(f'[red]🏃‍♂️ {self}.tick()[/red] {obj_to_process} ERROR: [red]{type(err).__name__}: {err}[/red]') print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/red] ERROR: [red]{type(err).__name__}: {err}[/red]')
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
self.on_tick_exception(obj_to_process, err) self.on_tick_exception(obj_to_process, err)
traceback.print_exc() traceback.print_exc()
@ -351,7 +356,7 @@ class ActorType(Generic[ModelType]):
if isinstance(err, KeyboardInterrupt): if isinstance(err, KeyboardInterrupt):
print() print()
else: else:
print(f'\n[red]🏃‍♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}') print(f'\n[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
print(f' Last processed object: {obj_to_process}') print(f' Last processed object: {obj_to_process}')
raise raise
finally: finally:
@ -449,7 +454,7 @@ class ActorType(Generic[ModelType]):
def tick(self, obj_to_process: ModelType) -> None: def tick(self, obj_to_process: ModelType) -> None:
"""Call the object.sm.tick() method to process the object""" """Call the object.sm.tick() method to process the object"""
print(f'[blue]🏃‍♂️ {self}.tick()[/blue] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}') print(f'\n[grey53]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.tick()[/grey53] [blue]{obj_to_process.status.upper()}[/blue] ➡️ ... +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
# get the StateMachine instance from the object # get the StateMachine instance from the object
obj_statemachine = self._get_state_machine_instance(obj_to_process) obj_statemachine = self._get_state_machine_instance(obj_to_process)
@ -477,17 +482,18 @@ class ActorType(Generic[ModelType]):
# abx.pm.hook.on_actor_startup(actor=self) # abx.pm.hook.on_actor_startup(actor=self)
def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None: def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
if isinstance(last_error, KeyboardInterrupt) or last_error is None: # if isinstance(last_error, KeyboardInterrupt) or last_error is None:
last_error_str = '[green](CTRL-C)[/green]' # last_error_str = '[green](CTRL-C)[/green]'
elif isinstance(last_error, ActorQueueIsEmpty): # elif isinstance(last_error, ActorQueueIsEmpty):
last_error_str = '[green](queue empty)[/green]' # last_error_str = '[green](queue empty)[/green]'
elif isinstance(last_error, ActorObjectAlreadyClaimed): # elif isinstance(last_error, ActorObjectAlreadyClaimed):
last_error_str = '[green](queue race)[/green]' # last_error_str = '[green](queue race)[/green]'
else: # else:
last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]' # last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}') # print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
# abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error) # abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
pass
def on_tick_start(self, obj_to_process: ModelType) -> None: def on_tick_start(self, obj_to_process: ModelType) -> None:
# print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}') # print(f'🏃‍♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
@ -505,11 +511,11 @@ class ActorType(Generic[ModelType]):
def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None: def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}: [red]{type(error).__name__}: {error}[/red]') print(f'[red]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_tick_exception()[/red] [blue]{obj_to_process.status}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s: [red]{type(error).__name__}: {error}[/red]')
# abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error) # abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None: def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
print(f'🏃‍♂️ {self}.on_state_change() {obj_to_process.ABID} {starting_state} ➡️ {ending_state}') print(f'[blue]{self._obj_repr(obj_to_process)} 🏃‍♂️ {self}.on_state_change() {starting_state} ➡️ {ending_state}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
# abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state) # abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)

View file

@ -2,6 +2,7 @@ __package__ = 'archivebox.workers'
import os import os
import time import time
import sys
import itertools import itertools
from typing import Dict, Type, Literal, TYPE_CHECKING from typing import Dict, Type, Literal, TYPE_CHECKING
from django.utils.functional import classproperty from django.utils.functional import classproperty
@ -122,9 +123,9 @@ class Orchestrator:
# abx.pm.hook.on_orchestrator_shutdown(self) # abx.pm.hook.on_orchestrator_shutdown(self)
def on_tick_started(self, all_queues): def on_tick_started(self, all_queues):
total_pending = sum(queue.count() for queue in all_queues.values()) # total_pending = sum(queue.count() for queue in all_queues.values())
if total_pending: # if total_pending:
print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}') # print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues) # abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
pass pass
@ -136,7 +137,8 @@ class Orchestrator:
pass pass
def on_idle(self, all_queues): def on_idle(self, all_queues):
print(f'👨‍✈️ {self}.on_idle()', f'idle_count={self.idle_count}') # print(f'👨‍✈️ {self}.on_idle()', f'idle_count={self.idle_count}')
print('.', end='', flush=True, file=sys.stderr)
# abx.pm.hook.on_orchestrator_idle(self) # abx.pm.hook.on_orchestrator_idle(self)
# check for orphaned objects left behind # check for orphaned objects left behind
if self.idle_count == 60: if self.idle_count == 60:
@ -170,6 +172,7 @@ class Orchestrator:
continue continue
next_obj = queue.first() next_obj = queue.first()
print()
print(f'🏃‍♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj and next_obj.retry_at else "None"}') print(f'🏃‍♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj and next_obj.retry_at else "None"}')
self.idle_count = 0 self.idle_count = 0
try: try: