mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-24 21:23:22 +00:00
improve statemachine logging and archivebox update CLI cmd
This commit is contained in:
parent
c9a05c9d94
commit
2595139180
6 changed files with 134 additions and 167 deletions
|
@ -1,13 +1,13 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox update'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from typing import List, Optional, IO
|
||||
|
||||
from archivebox.misc.util import docstring
|
||||
import rich_click as click
|
||||
|
||||
from typing import Iterable
|
||||
|
||||
from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.index import (
|
||||
LINK_FILTERS,
|
||||
get_indexed_folders,
|
||||
|
@ -21,8 +21,66 @@ from archivebox.index import (
|
|||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||
# from ..main import update
|
||||
|
||||
|
||||
@enforce_types
|
||||
def update(filter_patterns: Iterable[str]=(),
|
||||
only_new: bool=False,
|
||||
index_only: bool=False,
|
||||
resume: float | None=None,
|
||||
overwrite: bool=False,
|
||||
before: float | None=None,
|
||||
after: float | None=None,
|
||||
status: str='indexed',
|
||||
filter_type: str='exact',
|
||||
extract: str="") -> None:
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
|
||||
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
|
||||
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
|
||||
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
|
||||
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
|
||||
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
|
||||
@click.option('--status', type=click.Choice([
|
||||
'indexed', 'archived', 'unarchived',
|
||||
'present', 'valid', 'invalid',
|
||||
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
|
||||
]), default='indexed', help=f'''
|
||||
Update only links or data directories that have the given status:
|
||||
indexed {get_indexed_folders.__doc__} (the default)
|
||||
archived {get_archived_folders.__doc__}
|
||||
unarchived {get_unarchived_folders.__doc__}
|
||||
|
||||
present {get_present_folders.__doc__}
|
||||
valid {get_valid_folders.__doc__}
|
||||
invalid {get_invalid_folders.__doc__}
|
||||
|
||||
duplicate {get_duplicate_folders.__doc__}
|
||||
orphaned {get_orphaned_folders.__doc__}
|
||||
corrupted {get_corrupted_folders.__doc__}
|
||||
unrecognized {get_unrecognized_folders.__doc__}
|
||||
''')
|
||||
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||
@click.argument('filter_patterns', nargs=-1)
|
||||
@docstring(update.__doc__)
|
||||
def main(**kwargs):
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
update(**kwargs)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
||||
|
||||
|
||||
|
@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
|||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
||||
# all_links = load_main_index(out_dir=out_dir)
|
||||
# return all_links
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
def update():
|
||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from workers.orchestrator import Orchestrator
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.start()
|
||||
|
||||
|
||||
@docstring(update.__doc__)
|
||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=__command__,
|
||||
description=update.__doc__,
|
||||
add_help=True,
|
||||
formatter_class=SmartFormatter,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--only-new', #'-n',
|
||||
action='store_true',
|
||||
help="Don't attempt to retry previously skipped/failed links when updating",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--index-only', #'-o',
|
||||
action='store_true',
|
||||
help="Update the main index without archiving any content",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--resume', #'-r',
|
||||
type=float,
|
||||
help='Resume the update process from a given timestamp',
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--overwrite', #'-x',
|
||||
action='store_true',
|
||||
help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
|
||||
)
|
||||
parser.add_argument(
|
||||
'--before', #'-b',
|
||||
type=float,
|
||||
help="Update only links bookmarked before the given timestamp.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--after', #'-a',
|
||||
type=float,
|
||||
help="Update only links bookmarked after the given timestamp.",
|
||||
default=None,
|
||||
)
|
||||
parser.add_argument(
|
||||
'--status',
|
||||
type=str,
|
||||
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
|
||||
default='indexed',
|
||||
help=(
|
||||
'Update only links or data directories that have the given status\n'
|
||||
f' indexed {get_indexed_folders.__doc__} (the default)\n'
|
||||
f' archived {get_archived_folders.__doc__}\n'
|
||||
f' unarchived {get_unarchived_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' present {get_present_folders.__doc__}\n'
|
||||
f' valid {get_valid_folders.__doc__}\n'
|
||||
f' invalid {get_invalid_folders.__doc__}\n'
|
||||
'\n'
|
||||
f' duplicate {get_duplicate_folders.__doc__}\n'
|
||||
f' orphaned {get_orphaned_folders.__doc__}\n'
|
||||
f' corrupted {get_corrupted_folders.__doc__}\n'
|
||||
f' unrecognized {get_unrecognized_folders.__doc__}\n'
|
||||
)
|
||||
)
|
||||
parser.add_argument(
|
||||
'--filter-type', '-t',
|
||||
type=str,
|
||||
choices=(*LINK_FILTERS.keys(), 'search'),
|
||||
default='exact',
|
||||
help='Type of pattern matching to use when filtering URLs',
|
||||
)
|
||||
parser.add_argument(
|
||||
'filter_patterns',
|
||||
nargs='*',
|
||||
type=str,
|
||||
default=None,
|
||||
help='Update only URLs matching these filter patterns.'
|
||||
)
|
||||
parser.add_argument(
|
||||
"--extract",
|
||||
type=str,
|
||||
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
||||
This does not take precedence over the configuration",
|
||||
default=""
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
|
||||
filter_patterns_str = None
|
||||
if not command.filter_patterns:
|
||||
filter_patterns_str = accept_stdin(stdin)
|
||||
|
||||
update()
|
||||
|
||||
# update(
|
||||
# resume=command.resume,
|
||||
# only_new=command.only_new,
|
||||
# index_only=command.index_only,
|
||||
# overwrite=command.overwrite,
|
||||
# filter_patterns_str=filter_patterns_str,
|
||||
# filter_patterns=command.filter_patterns,
|
||||
# filter_type=command.filter_type,
|
||||
# status=command.status,
|
||||
# after=command.after,
|
||||
# before=command.before,
|
||||
# out_dir=Path(pwd) if pwd else DATA_DIR,
|
||||
# extractors=command.extract,
|
||||
# )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
|
|
@ -1,11 +1,14 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
import time
|
||||
import os
|
||||
from datetime import timedelta
|
||||
from typing import ClassVar
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
from workers.actor import ActorType
|
||||
|
@ -39,10 +42,16 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
|||
self.snapshot = snapshot
|
||||
super().__init__(snapshot, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]Snapshot\\[{self.snapshot.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
if not can_start:
|
||||
print(f'SnapshotMachine[{self.snapshot.ABID}].can_start() False: {self.snapshot.url} {self.snapshot.retry_at} {timezone.now()}')
|
||||
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
|
@ -57,12 +66,12 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
|||
# otherwise archiveresults exist and are all finished, so it's finished
|
||||
return True
|
||||
|
||||
def on_transition(self, event, state):
|
||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_transition() {event} -> {state}')
|
||||
# def on_transition(self, event, state):
|
||||
# print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_queued(): snapshot.retry_at = now()')
|
||||
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now(),
|
||||
status=Snapshot.StatusChoices.QUEUED,
|
||||
|
@ -70,7 +79,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
|||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||
print(f'{self}.on_started() ↳ snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
|
@ -86,7 +95,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
|||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_sealed(): snapshot.retry_at=None')
|
||||
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
|
||||
self.snapshot.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
|
@ -145,10 +154,16 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
self.archiveresult = archiveresult
|
||||
super().__init__(archiveresult, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]ArchiveResult\\[{self.archiveresult.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.archiveresult.snapshot.url)
|
||||
if not can_start:
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].can_start() False: {self.archiveresult.snapshot.url} {self.archiveresult.retry_at} {timezone.now()}')
|
||||
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
|
||||
return can_start
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
|
@ -172,7 +187,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()')
|
||||
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now(),
|
||||
status=ArchiveResult.StatusChoices.QUEUED,
|
||||
|
@ -181,7 +196,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||
print(f'{self}.on_started() ↳ archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||
# lock the object for the next 30sec
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=30),
|
||||
|
@ -205,7 +220,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
|
||||
@backoff.enter
|
||||
def enter_backoff(self):
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=60),
|
||||
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||
|
@ -216,7 +231,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
|
@ -227,7 +242,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
|||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()')
|
||||
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||
self.archiveresult.update_for_workers(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
|
|
|
@ -1,9 +1,12 @@
|
|||
__package__ = 'archivebox.crawls'
|
||||
|
||||
import os
|
||||
from typing import ClassVar
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
|
||||
from rich import print
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
from workers.actor import ActorType
|
||||
|
@ -32,6 +35,12 @@ class CrawlMachine(StateMachine, strict_states=True):
|
|||
self.crawl = crawl
|
||||
super().__init__(crawl, *args, **kwargs)
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[grey53]Crawl\\[{self.crawl.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def can_start(self) -> bool:
|
||||
return bool(self.crawl.seed and self.crawl.seed.uri)
|
||||
|
||||
|
@ -64,7 +73,7 @@ class CrawlMachine(StateMachine, strict_states=True):
|
|||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
||||
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
||||
# lock the crawl object for 2s while we create the root snapshot
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
|
@ -80,7 +89,7 @@ class CrawlMachine(StateMachine, strict_states=True):
|
|||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
print(f'CrawlMachine[{self.crawl.ABID}].on_sealed(): crawl.retry_at=None')
|
||||
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
|
||||
self.crawl.update_for_workers(
|
||||
retry_at=None,
|
||||
status=Crawl.StatusChoices.SEALED,
|
||||
|
|
|
@ -468,7 +468,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
|||
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||
|
||||
def log_list_finished(links):
|
||||
from ..index.csv import links_to_csv
|
||||
from archivebox.index.csv import links_to_csv
|
||||
print()
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
|
@ -492,7 +492,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
|
|||
|
||||
if not yes:
|
||||
print()
|
||||
print('[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
|
||||
print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
|
||||
try:
|
||||
assert input(' y/[n]: ').lower() == 'y'
|
||||
except (KeyboardInterrupt, EOFError, AssertionError):
|
||||
|
|
|
@ -110,7 +110,8 @@ class ActorType(Generic[ModelType]):
|
|||
def __repr__(self) -> str:
|
||||
"""-> FaviconActor[pid=1234]"""
|
||||
label = 'pid' if self.mode == 'process' else 'tid'
|
||||
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
||||
# return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
||||
return f'[underline]Worker[/underline]\\[{label}={self.pid}]'
|
||||
|
||||
@staticmethod
|
||||
def _state_to_str(state: ObjectState) -> str:
|
||||
|
@ -210,6 +211,10 @@ class ActorType(Generic[ModelType]):
|
|||
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
|
||||
return bg_actor_process.pid
|
||||
|
||||
@classmethod
|
||||
def _obj_repr(cls, obj: ModelType | Any) -> str:
|
||||
"""Get a string representation of the given django Model instance"""
|
||||
return f'[grey53]{type(obj).__name__}\\[{obj.ABID}][/grey53]'
|
||||
|
||||
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
|
||||
|
||||
|
@ -328,7 +333,7 @@ class ActorType(Generic[ModelType]):
|
|||
if self.idle_count >= 3:
|
||||
break # stop looping and exit if queue is empty and we have idled for 30sec
|
||||
else:
|
||||
print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
|
||||
# print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
|
||||
self.idle_count += 1
|
||||
time.sleep(1)
|
||||
continue
|
||||
|
@ -339,7 +344,7 @@ class ActorType(Generic[ModelType]):
|
|||
self.tick(obj_to_process)
|
||||
except Exception as err:
|
||||
last_error = err
|
||||
print(f'[red]🏃♂️ {self}.tick()[/red] {obj_to_process} ERROR: [red]{type(err).__name__}: {err}[/red]')
|
||||
print(f'[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.tick()[/red] ERROR: [red]{type(err).__name__}: {err}[/red]')
|
||||
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
|
||||
self.on_tick_exception(obj_to_process, err)
|
||||
traceback.print_exc()
|
||||
|
@ -351,7 +356,7 @@ class ActorType(Generic[ModelType]):
|
|||
if isinstance(err, KeyboardInterrupt):
|
||||
print()
|
||||
else:
|
||||
print(f'\n[red]🏃♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
|
||||
print(f'\n[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
|
||||
print(f' Last processed object: {obj_to_process}')
|
||||
raise
|
||||
finally:
|
||||
|
@ -449,7 +454,7 @@ class ActorType(Generic[ModelType]):
|
|||
|
||||
def tick(self, obj_to_process: ModelType) -> None:
|
||||
"""Call the object.sm.tick() method to process the object"""
|
||||
print(f'[blue]🏃♂️ {self}.tick()[/blue] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
|
||||
print(f'\n[grey53]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.tick()[/grey53] [blue]{obj_to_process.status.upper()}[/blue] ➡️ ... +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
|
||||
|
||||
# get the StateMachine instance from the object
|
||||
obj_statemachine = self._get_state_machine_instance(obj_to_process)
|
||||
|
@ -477,17 +482,18 @@ class ActorType(Generic[ModelType]):
|
|||
# abx.pm.hook.on_actor_startup(actor=self)
|
||||
|
||||
def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
|
||||
if isinstance(last_error, KeyboardInterrupt) or last_error is None:
|
||||
last_error_str = '[green](CTRL-C)[/green]'
|
||||
elif isinstance(last_error, ActorQueueIsEmpty):
|
||||
last_error_str = '[green](queue empty)[/green]'
|
||||
elif isinstance(last_error, ActorObjectAlreadyClaimed):
|
||||
last_error_str = '[green](queue race)[/green]'
|
||||
else:
|
||||
last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
|
||||
# if isinstance(last_error, KeyboardInterrupt) or last_error is None:
|
||||
# last_error_str = '[green](CTRL-C)[/green]'
|
||||
# elif isinstance(last_error, ActorQueueIsEmpty):
|
||||
# last_error_str = '[green](queue empty)[/green]'
|
||||
# elif isinstance(last_error, ActorObjectAlreadyClaimed):
|
||||
# last_error_str = '[green](queue race)[/green]'
|
||||
# else:
|
||||
# last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
|
||||
|
||||
print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
|
||||
# print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
|
||||
# abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
|
||||
pass
|
||||
|
||||
def on_tick_start(self, obj_to_process: ModelType) -> None:
|
||||
# print(f'🏃♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
|
||||
|
@ -505,11 +511,11 @@ class ActorType(Generic[ModelType]):
|
|||
|
||||
|
||||
def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
|
||||
print(f'[red]🏃♂️ {self}.on_tick_exception()[/red] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}: [red]{type(error).__name__}: {error}[/red]')
|
||||
print(f'[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.on_tick_exception()[/red] [blue]{obj_to_process.status}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s: [red]{type(error).__name__}: {error}[/red]')
|
||||
# abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
|
||||
|
||||
def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
|
||||
print(f'🏃♂️ {self}.on_state_change() {obj_to_process.ABID} {starting_state} ➡️ {ending_state}')
|
||||
print(f'[blue]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.on_state_change() {starting_state} ➡️ {ending_state}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
|
||||
# abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)
|
||||
|
||||
|
||||
|
|
|
@ -2,6 +2,7 @@ __package__ = 'archivebox.workers'
|
|||
|
||||
import os
|
||||
import time
|
||||
import sys
|
||||
import itertools
|
||||
from typing import Dict, Type, Literal, TYPE_CHECKING
|
||||
from django.utils.functional import classproperty
|
||||
|
@ -122,9 +123,9 @@ class Orchestrator:
|
|||
# abx.pm.hook.on_orchestrator_shutdown(self)
|
||||
|
||||
def on_tick_started(self, all_queues):
|
||||
total_pending = sum(queue.count() for queue in all_queues.values())
|
||||
if total_pending:
|
||||
print(f'👨✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
|
||||
# total_pending = sum(queue.count() for queue in all_queues.values())
|
||||
# if total_pending:
|
||||
# print(f'👨✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
|
||||
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
|
||||
pass
|
||||
|
||||
|
@ -136,7 +137,8 @@ class Orchestrator:
|
|||
pass
|
||||
|
||||
def on_idle(self, all_queues):
|
||||
print(f'👨✈️ {self}.on_idle()', f'idle_count={self.idle_count}')
|
||||
# print(f'👨✈️ {self}.on_idle()', f'idle_count={self.idle_count}')
|
||||
print('.', end='', flush=True, file=sys.stderr)
|
||||
# abx.pm.hook.on_orchestrator_idle(self)
|
||||
# check for orphaned objects left behind
|
||||
if self.idle_count == 60:
|
||||
|
@ -170,6 +172,7 @@ class Orchestrator:
|
|||
continue
|
||||
|
||||
next_obj = queue.first()
|
||||
print()
|
||||
print(f'🏃♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj and next_obj.retry_at else "None"}')
|
||||
self.idle_count = 0
|
||||
try:
|
||||
|
|
Loading…
Reference in a new issue