mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-21 19:53:06 +00:00
improve statemachine logging and archivebox update CLI cmd
This commit is contained in:
parent
c9a05c9d94
commit
2595139180
6 changed files with 134 additions and 167 deletions
|
@ -1,13 +1,13 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
__package__ = 'archivebox.cli'
|
__package__ = 'archivebox.cli'
|
||||||
__command__ = 'archivebox update'
|
|
||||||
|
|
||||||
import sys
|
|
||||||
import argparse
|
|
||||||
from typing import List, Optional, IO
|
|
||||||
|
|
||||||
from archivebox.misc.util import docstring
|
import rich_click as click
|
||||||
|
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
from archivebox.misc.util import enforce_types, docstring
|
||||||
from archivebox.index import (
|
from archivebox.index import (
|
||||||
LINK_FILTERS,
|
LINK_FILTERS,
|
||||||
get_indexed_folders,
|
get_indexed_folders,
|
||||||
|
@ -21,8 +21,66 @@ from archivebox.index import (
|
||||||
get_corrupted_folders,
|
get_corrupted_folders,
|
||||||
get_unrecognized_folders,
|
get_unrecognized_folders,
|
||||||
)
|
)
|
||||||
from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
|
||||||
# from ..main import update
|
|
||||||
|
@enforce_types
|
||||||
|
def update(filter_patterns: Iterable[str]=(),
|
||||||
|
only_new: bool=False,
|
||||||
|
index_only: bool=False,
|
||||||
|
resume: float | None=None,
|
||||||
|
overwrite: bool=False,
|
||||||
|
before: float | None=None,
|
||||||
|
after: float | None=None,
|
||||||
|
status: str='indexed',
|
||||||
|
filter_type: str='exact',
|
||||||
|
extract: str="") -> None:
|
||||||
|
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||||
|
|
||||||
|
from archivebox.config.django import setup_django
|
||||||
|
setup_django()
|
||||||
|
|
||||||
|
from workers.orchestrator import Orchestrator
|
||||||
|
orchestrator = Orchestrator(exit_on_idle=False)
|
||||||
|
orchestrator.start()
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.option('--only-new', is_flag=True, help="Don't attempt to retry previously skipped/failed links when updating")
|
||||||
|
@click.option('--index-only', is_flag=True, help="Update the main index without archiving any content")
|
||||||
|
@click.option('--resume', type=float, help='Resume the update process from a given timestamp')
|
||||||
|
@click.option('--overwrite', '-F', is_flag=True, help='Ignore existing archived content and overwrite with new versions (DANGEROUS)')
|
||||||
|
@click.option('--before', type=float, help="Update only links bookmarked before the given timestamp")
|
||||||
|
@click.option('--after', type=float, help="Update only links bookmarked after the given timestamp")
|
||||||
|
@click.option('--status', type=click.Choice([
|
||||||
|
'indexed', 'archived', 'unarchived',
|
||||||
|
'present', 'valid', 'invalid',
|
||||||
|
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
|
||||||
|
]), default='indexed', help=f'''
|
||||||
|
Update only links or data directories that have the given status:
|
||||||
|
indexed {get_indexed_folders.__doc__} (the default)
|
||||||
|
archived {get_archived_folders.__doc__}
|
||||||
|
unarchived {get_unarchived_folders.__doc__}
|
||||||
|
|
||||||
|
present {get_present_folders.__doc__}
|
||||||
|
valid {get_valid_folders.__doc__}
|
||||||
|
invalid {get_invalid_folders.__doc__}
|
||||||
|
|
||||||
|
duplicate {get_duplicate_folders.__doc__}
|
||||||
|
orphaned {get_orphaned_folders.__doc__}
|
||||||
|
corrupted {get_corrupted_folders.__doc__}
|
||||||
|
unrecognized {get_unrecognized_folders.__doc__}
|
||||||
|
''')
|
||||||
|
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
|
||||||
|
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
|
||||||
|
@click.argument('filter_patterns', nargs=-1)
|
||||||
|
@docstring(update.__doc__)
|
||||||
|
def main(**kwargs):
|
||||||
|
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
||||||
|
update(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
@ -103,127 +161,3 @@ from archivebox.misc.logging_util import SmartFormatter, accept_stdin
|
||||||
# # Step 4: Re-write links index with updated titles, icons, and resources
|
# # Step 4: Re-write links index with updated titles, icons, and resources
|
||||||
# all_links = load_main_index(out_dir=out_dir)
|
# all_links = load_main_index(out_dir=out_dir)
|
||||||
# return all_links
|
# return all_links
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def update():
|
|
||||||
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
|
|
||||||
from archivebox.config.django import setup_django
|
|
||||||
setup_django()
|
|
||||||
|
|
||||||
from workers.orchestrator import Orchestrator
|
|
||||||
orchestrator = Orchestrator(exit_on_idle=False)
|
|
||||||
orchestrator.start()
|
|
||||||
|
|
||||||
|
|
||||||
@docstring(update.__doc__)
|
|
||||||
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
|
|
||||||
parser = argparse.ArgumentParser(
|
|
||||||
prog=__command__,
|
|
||||||
description=update.__doc__,
|
|
||||||
add_help=True,
|
|
||||||
formatter_class=SmartFormatter,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--only-new', #'-n',
|
|
||||||
action='store_true',
|
|
||||||
help="Don't attempt to retry previously skipped/failed links when updating",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--index-only', #'-o',
|
|
||||||
action='store_true',
|
|
||||||
help="Update the main index without archiving any content",
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--resume', #'-r',
|
|
||||||
type=float,
|
|
||||||
help='Resume the update process from a given timestamp',
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--overwrite', #'-x',
|
|
||||||
action='store_true',
|
|
||||||
help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--before', #'-b',
|
|
||||||
type=float,
|
|
||||||
help="Update only links bookmarked before the given timestamp.",
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--after', #'-a',
|
|
||||||
type=float,
|
|
||||||
help="Update only links bookmarked after the given timestamp.",
|
|
||||||
default=None,
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--status',
|
|
||||||
type=str,
|
|
||||||
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
|
|
||||||
default='indexed',
|
|
||||||
help=(
|
|
||||||
'Update only links or data directories that have the given status\n'
|
|
||||||
f' indexed {get_indexed_folders.__doc__} (the default)\n'
|
|
||||||
f' archived {get_archived_folders.__doc__}\n'
|
|
||||||
f' unarchived {get_unarchived_folders.__doc__}\n'
|
|
||||||
'\n'
|
|
||||||
f' present {get_present_folders.__doc__}\n'
|
|
||||||
f' valid {get_valid_folders.__doc__}\n'
|
|
||||||
f' invalid {get_invalid_folders.__doc__}\n'
|
|
||||||
'\n'
|
|
||||||
f' duplicate {get_duplicate_folders.__doc__}\n'
|
|
||||||
f' orphaned {get_orphaned_folders.__doc__}\n'
|
|
||||||
f' corrupted {get_corrupted_folders.__doc__}\n'
|
|
||||||
f' unrecognized {get_unrecognized_folders.__doc__}\n'
|
|
||||||
)
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'--filter-type', '-t',
|
|
||||||
type=str,
|
|
||||||
choices=(*LINK_FILTERS.keys(), 'search'),
|
|
||||||
default='exact',
|
|
||||||
help='Type of pattern matching to use when filtering URLs',
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
'filter_patterns',
|
|
||||||
nargs='*',
|
|
||||||
type=str,
|
|
||||||
default=None,
|
|
||||||
help='Update only URLs matching these filter patterns.'
|
|
||||||
)
|
|
||||||
parser.add_argument(
|
|
||||||
"--extract",
|
|
||||||
type=str,
|
|
||||||
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
|
|
||||||
This does not take precedence over the configuration",
|
|
||||||
default=""
|
|
||||||
)
|
|
||||||
command = parser.parse_args(args or ())
|
|
||||||
|
|
||||||
filter_patterns_str = None
|
|
||||||
if not command.filter_patterns:
|
|
||||||
filter_patterns_str = accept_stdin(stdin)
|
|
||||||
|
|
||||||
update()
|
|
||||||
|
|
||||||
# update(
|
|
||||||
# resume=command.resume,
|
|
||||||
# only_new=command.only_new,
|
|
||||||
# index_only=command.index_only,
|
|
||||||
# overwrite=command.overwrite,
|
|
||||||
# filter_patterns_str=filter_patterns_str,
|
|
||||||
# filter_patterns=command.filter_patterns,
|
|
||||||
# filter_type=command.filter_type,
|
|
||||||
# status=command.status,
|
|
||||||
# after=command.after,
|
|
||||||
# before=command.before,
|
|
||||||
# out_dir=Path(pwd) if pwd else DATA_DIR,
|
|
||||||
# extractors=command.extract,
|
|
||||||
# )
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
|
||||||
|
|
|
@ -1,11 +1,14 @@
|
||||||
__package__ = 'archivebox.core'
|
__package__ = 'archivebox.core'
|
||||||
|
|
||||||
import time
|
import time
|
||||||
|
import os
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from typing import ClassVar
|
from typing import ClassVar
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from rich import print
|
||||||
|
|
||||||
from statemachine import State, StateMachine
|
from statemachine import State, StateMachine
|
||||||
|
|
||||||
from workers.actor import ActorType
|
from workers.actor import ActorType
|
||||||
|
@ -39,10 +42,16 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
self.snapshot = snapshot
|
self.snapshot = snapshot
|
||||||
super().__init__(snapshot, *args, **kwargs)
|
super().__init__(snapshot, *args, **kwargs)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f'[grey53]Snapshot\\[{self.snapshot.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.snapshot.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return self.__repr__()
|
||||||
|
|
||||||
def can_start(self) -> bool:
|
def can_start(self) -> bool:
|
||||||
can_start = bool(self.snapshot.url)
|
can_start = bool(self.snapshot.url)
|
||||||
if not can_start:
|
if not can_start:
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].can_start() False: {self.snapshot.url} {self.snapshot.retry_at} {timezone.now()}')
|
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue] cant start yet +{timezone.now() - self.snapshot.retry_at}s')
|
||||||
return can_start
|
return can_start
|
||||||
|
|
||||||
def is_finished(self) -> bool:
|
def is_finished(self) -> bool:
|
||||||
|
@ -57,12 +66,12 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
# otherwise archiveresults exist and are all finished, so it's finished
|
# otherwise archiveresults exist and are all finished, so it's finished
|
||||||
return True
|
return True
|
||||||
|
|
||||||
def on_transition(self, event, state):
|
# def on_transition(self, event, state):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_transition() {event} -> {state}')
|
# print(f'{self}.on_transition() [blue]{str(state).upper()}[/blue] ➡️ ...')
|
||||||
|
|
||||||
@queued.enter
|
@queued.enter
|
||||||
def enter_queued(self):
|
def enter_queued(self):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_queued(): snapshot.retry_at = now()')
|
print(f'{self}.on_queued() ↳ snapshot.retry_at = now()')
|
||||||
self.snapshot.update_for_workers(
|
self.snapshot.update_for_workers(
|
||||||
retry_at=timezone.now(),
|
retry_at=timezone.now(),
|
||||||
status=Snapshot.StatusChoices.QUEUED,
|
status=Snapshot.StatusChoices.QUEUED,
|
||||||
|
@ -70,7 +79,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@started.enter
|
@started.enter
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_started(): snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
print(f'{self}.on_started() ↳ snapshot.create_pending_archiveresults() + snapshot.bump_retry_at(+60s)')
|
||||||
# lock the snapshot while we create the pending archiveresults
|
# lock the snapshot while we create the pending archiveresults
|
||||||
self.snapshot.update_for_workers(
|
self.snapshot.update_for_workers(
|
||||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||||
|
@ -86,7 +95,7 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@sealed.enter
|
@sealed.enter
|
||||||
def enter_sealed(self):
|
def enter_sealed(self):
|
||||||
print(f'SnapshotMachine[{self.snapshot.ABID}].on_sealed(): snapshot.retry_at=None')
|
print(f'{self}.on_sealed() ↳ snapshot.retry_at=None')
|
||||||
self.snapshot.update_for_workers(
|
self.snapshot.update_for_workers(
|
||||||
retry_at=None,
|
retry_at=None,
|
||||||
status=Snapshot.StatusChoices.SEALED,
|
status=Snapshot.StatusChoices.SEALED,
|
||||||
|
@ -144,11 +153,17 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
def __init__(self, archiveresult, *args, **kwargs):
|
def __init__(self, archiveresult, *args, **kwargs):
|
||||||
self.archiveresult = archiveresult
|
self.archiveresult = archiveresult
|
||||||
super().__init__(archiveresult, *args, **kwargs)
|
super().__init__(archiveresult, *args, **kwargs)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f'[grey53]ArchiveResult\\[{self.archiveresult.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.archiveresult.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return self.__repr__()
|
||||||
|
|
||||||
def can_start(self) -> bool:
|
def can_start(self) -> bool:
|
||||||
can_start = bool(self.archiveresult.snapshot.url)
|
can_start = bool(self.archiveresult.snapshot.url)
|
||||||
if not can_start:
|
if not can_start:
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].can_start() False: {self.archiveresult.snapshot.url} {self.archiveresult.retry_at} {timezone.now()}')
|
print(f'{self}.can_start() [blue]QUEUED[/blue] ➡️❌ [blue]STARTED[/blue]: cant start yet +{timezone.now() - self.archiveresult.retry_at}s')
|
||||||
return can_start
|
return can_start
|
||||||
|
|
||||||
def is_succeeded(self) -> bool:
|
def is_succeeded(self) -> bool:
|
||||||
|
@ -172,7 +187,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@queued.enter
|
@queued.enter
|
||||||
def enter_queued(self):
|
def enter_queued(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_queued(): archiveresult.retry_at = now()')
|
print(f'{self}.on_queued() ↳ archiveresult.retry_at = now()')
|
||||||
self.archiveresult.update_for_workers(
|
self.archiveresult.update_for_workers(
|
||||||
retry_at=timezone.now(),
|
retry_at=timezone.now(),
|
||||||
status=ArchiveResult.StatusChoices.QUEUED,
|
status=ArchiveResult.StatusChoices.QUEUED,
|
||||||
|
@ -181,7 +196,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@started.enter
|
@started.enter
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_started(): archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
print(f'{self}.on_started() ↳ archiveresult.start_ts + create_output_dir() + bump_retry_at(+60s)')
|
||||||
# lock the object for the next 30sec
|
# lock the object for the next 30sec
|
||||||
self.archiveresult.update_for_workers(
|
self.archiveresult.update_for_workers(
|
||||||
retry_at=timezone.now() + timedelta(seconds=30),
|
retry_at=timezone.now() + timedelta(seconds=30),
|
||||||
|
@ -205,7 +220,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@backoff.enter
|
@backoff.enter
|
||||||
def enter_backoff(self):
|
def enter_backoff(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_backoff(): archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
print(f'{self}.on_backoff() ↳ archiveresult.retries += 1, archiveresult.bump_retry_at(+60s), archiveresult.end_ts = None')
|
||||||
self.archiveresult.update_for_workers(
|
self.archiveresult.update_for_workers(
|
||||||
retry_at=timezone.now() + timedelta(seconds=60),
|
retry_at=timezone.now() + timedelta(seconds=60),
|
||||||
status=ArchiveResult.StatusChoices.BACKOFF,
|
status=ArchiveResult.StatusChoices.BACKOFF,
|
||||||
|
@ -216,7 +231,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@succeeded.enter
|
@succeeded.enter
|
||||||
def enter_succeeded(self):
|
def enter_succeeded(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_succeeded(): archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
print(f'{self}.on_succeeded() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||||
self.archiveresult.update_for_workers(
|
self.archiveresult.update_for_workers(
|
||||||
retry_at=None,
|
retry_at=None,
|
||||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||||
|
@ -227,7 +242,7 @@ class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@failed.enter
|
@failed.enter
|
||||||
def enter_failed(self):
|
def enter_failed(self):
|
||||||
print(f'ArchiveResultMachine[{self.archiveresult.ABID}].on_failed(): archivebox.retry_at = None, archiveresult.end_ts = now()')
|
print(f'{self}.on_failed() ↳ archiveresult.retry_at = None, archiveresult.end_ts = now()')
|
||||||
self.archiveresult.update_for_workers(
|
self.archiveresult.update_for_workers(
|
||||||
retry_at=None,
|
retry_at=None,
|
||||||
status=ArchiveResult.StatusChoices.FAILED,
|
status=ArchiveResult.StatusChoices.FAILED,
|
||||||
|
|
|
@ -1,9 +1,12 @@
|
||||||
__package__ = 'archivebox.crawls'
|
__package__ = 'archivebox.crawls'
|
||||||
|
|
||||||
|
import os
|
||||||
from typing import ClassVar
|
from typing import ClassVar
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
|
from rich import print
|
||||||
|
|
||||||
from statemachine import State, StateMachine
|
from statemachine import State, StateMachine
|
||||||
|
|
||||||
from workers.actor import ActorType
|
from workers.actor import ActorType
|
||||||
|
@ -31,6 +34,12 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
def __init__(self, crawl, *args, **kwargs):
|
def __init__(self, crawl, *args, **kwargs):
|
||||||
self.crawl = crawl
|
self.crawl = crawl
|
||||||
super().__init__(crawl, *args, **kwargs)
|
super().__init__(crawl, *args, **kwargs)
|
||||||
|
|
||||||
|
def __repr__(self) -> str:
|
||||||
|
return f'[grey53]Crawl\\[{self.crawl.ABID}] 🏃♂️ Worker\\[pid={os.getpid()}].tick()[/grey53] [blue]{self.crawl.status.upper()}[/blue] ⚙️ [grey37]Machine[/grey37]'
|
||||||
|
|
||||||
|
def __str__(self) -> str:
|
||||||
|
return self.__repr__()
|
||||||
|
|
||||||
def can_start(self) -> bool:
|
def can_start(self) -> bool:
|
||||||
return bool(self.crawl.seed and self.crawl.seed.uri)
|
return bool(self.crawl.seed and self.crawl.seed.uri)
|
||||||
|
@ -64,7 +73,7 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@started.enter
|
@started.enter
|
||||||
def enter_started(self):
|
def enter_started(self):
|
||||||
print(f'CrawlMachine[{self.crawl.ABID}].on_started(): crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
print(f'{self}.on_started(): [blue]↳ STARTED[/blue] crawl.create_root_snapshot() + crawl.bump_retry_at(+10s)')
|
||||||
# lock the crawl object for 2s while we create the root snapshot
|
# lock the crawl object for 2s while we create the root snapshot
|
||||||
self.crawl.update_for_workers(
|
self.crawl.update_for_workers(
|
||||||
retry_at=timezone.now() + timedelta(seconds=5),
|
retry_at=timezone.now() + timedelta(seconds=5),
|
||||||
|
@ -80,7 +89,7 @@ class CrawlMachine(StateMachine, strict_states=True):
|
||||||
|
|
||||||
@sealed.enter
|
@sealed.enter
|
||||||
def enter_sealed(self):
|
def enter_sealed(self):
|
||||||
print(f'CrawlMachine[{self.crawl.ABID}].on_sealed(): crawl.retry_at=None')
|
print(f'{self}.on_sealed(): [blue]↳ SEALED[/blue] crawl.retry_at=None')
|
||||||
self.crawl.update_for_workers(
|
self.crawl.update_for_workers(
|
||||||
retry_at=None,
|
retry_at=None,
|
||||||
status=Crawl.StatusChoices.SEALED,
|
status=Crawl.StatusChoices.SEALED,
|
||||||
|
|
|
@ -468,7 +468,7 @@ def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
||||||
print(' {}'.format(' '.join(filter_patterns or ())))
|
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||||
|
|
||||||
def log_list_finished(links):
|
def log_list_finished(links):
|
||||||
from ..index.csv import links_to_csv
|
from archivebox.index.csv import links_to_csv
|
||||||
print()
|
print()
|
||||||
print('---------------------------------------------------------------------------------------------------')
|
print('---------------------------------------------------------------------------------------------------')
|
||||||
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||||
|
@ -492,7 +492,7 @@ def log_removal_started(links: List["Link"], yes: bool, delete: bool):
|
||||||
|
|
||||||
if not yes:
|
if not yes:
|
||||||
print()
|
print()
|
||||||
print('[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
|
print(f'[yellow3][?] Do you want to proceed with removing these {len(links)} links?[/]')
|
||||||
try:
|
try:
|
||||||
assert input(' y/[n]: ').lower() == 'y'
|
assert input(' y/[n]: ').lower() == 'y'
|
||||||
except (KeyboardInterrupt, EOFError, AssertionError):
|
except (KeyboardInterrupt, EOFError, AssertionError):
|
||||||
|
|
|
@ -110,7 +110,8 @@ class ActorType(Generic[ModelType]):
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
"""-> FaviconActor[pid=1234]"""
|
"""-> FaviconActor[pid=1234]"""
|
||||||
label = 'pid' if self.mode == 'process' else 'tid'
|
label = 'pid' if self.mode == 'process' else 'tid'
|
||||||
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
# return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
||||||
|
return f'[underline]Worker[/underline]\\[{label}={self.pid}]'
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _state_to_str(state: ObjectState) -> str:
|
def _state_to_str(state: ObjectState) -> str:
|
||||||
|
@ -210,6 +211,10 @@ class ActorType(Generic[ModelType]):
|
||||||
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
|
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
|
||||||
return bg_actor_process.pid
|
return bg_actor_process.pid
|
||||||
|
|
||||||
|
@classmethod
|
||||||
|
def _obj_repr(cls, obj: ModelType | Any) -> str:
|
||||||
|
"""Get a string representation of the given django Model instance"""
|
||||||
|
return f'[grey53]{type(obj).__name__}\\[{obj.ABID}][/grey53]'
|
||||||
|
|
||||||
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
|
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
|
||||||
|
|
||||||
|
@ -328,7 +333,7 @@ class ActorType(Generic[ModelType]):
|
||||||
if self.idle_count >= 3:
|
if self.idle_count >= 3:
|
||||||
break # stop looping and exit if queue is empty and we have idled for 30sec
|
break # stop looping and exit if queue is empty and we have idled for 30sec
|
||||||
else:
|
else:
|
||||||
print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
|
# print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
|
||||||
self.idle_count += 1
|
self.idle_count += 1
|
||||||
time.sleep(1)
|
time.sleep(1)
|
||||||
continue
|
continue
|
||||||
|
@ -339,7 +344,7 @@ class ActorType(Generic[ModelType]):
|
||||||
self.tick(obj_to_process)
|
self.tick(obj_to_process)
|
||||||
except Exception as err:
|
except Exception as err:
|
||||||
last_error = err
|
last_error = err
|
||||||
print(f'[red]🏃♂️ {self}.tick()[/red] {obj_to_process} ERROR: [red]{type(err).__name__}: {err}[/red]')
|
print(f'[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.tick()[/red] ERROR: [red]{type(err).__name__}: {err}[/red]')
|
||||||
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
|
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
|
||||||
self.on_tick_exception(obj_to_process, err)
|
self.on_tick_exception(obj_to_process, err)
|
||||||
traceback.print_exc()
|
traceback.print_exc()
|
||||||
|
@ -351,7 +356,7 @@ class ActorType(Generic[ModelType]):
|
||||||
if isinstance(err, KeyboardInterrupt):
|
if isinstance(err, KeyboardInterrupt):
|
||||||
print()
|
print()
|
||||||
else:
|
else:
|
||||||
print(f'\n[red]🏃♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
|
print(f'\n[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.runloop() FATAL:[/red] {type(err).__name__}: {err}')
|
||||||
print(f' Last processed object: {obj_to_process}')
|
print(f' Last processed object: {obj_to_process}')
|
||||||
raise
|
raise
|
||||||
finally:
|
finally:
|
||||||
|
@ -449,7 +454,7 @@ class ActorType(Generic[ModelType]):
|
||||||
|
|
||||||
def tick(self, obj_to_process: ModelType) -> None:
|
def tick(self, obj_to_process: ModelType) -> None:
|
||||||
"""Call the object.sm.tick() method to process the object"""
|
"""Call the object.sm.tick() method to process the object"""
|
||||||
print(f'[blue]🏃♂️ {self}.tick()[/blue] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
|
print(f'\n[grey53]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.tick()[/grey53] [blue]{obj_to_process.status.upper()}[/blue] ➡️ ... +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
|
||||||
|
|
||||||
# get the StateMachine instance from the object
|
# get the StateMachine instance from the object
|
||||||
obj_statemachine = self._get_state_machine_instance(obj_to_process)
|
obj_statemachine = self._get_state_machine_instance(obj_to_process)
|
||||||
|
@ -477,17 +482,18 @@ class ActorType(Generic[ModelType]):
|
||||||
# abx.pm.hook.on_actor_startup(actor=self)
|
# abx.pm.hook.on_actor_startup(actor=self)
|
||||||
|
|
||||||
def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
|
def on_shutdown(self, last_obj: ModelType | None=None, last_error: BaseException | None=None) -> None:
|
||||||
if isinstance(last_error, KeyboardInterrupt) or last_error is None:
|
# if isinstance(last_error, KeyboardInterrupt) or last_error is None:
|
||||||
last_error_str = '[green](CTRL-C)[/green]'
|
# last_error_str = '[green](CTRL-C)[/green]'
|
||||||
elif isinstance(last_error, ActorQueueIsEmpty):
|
# elif isinstance(last_error, ActorQueueIsEmpty):
|
||||||
last_error_str = '[green](queue empty)[/green]'
|
# last_error_str = '[green](queue empty)[/green]'
|
||||||
elif isinstance(last_error, ActorObjectAlreadyClaimed):
|
# elif isinstance(last_error, ActorObjectAlreadyClaimed):
|
||||||
last_error_str = '[green](queue race)[/green]'
|
# last_error_str = '[green](queue race)[/green]'
|
||||||
else:
|
# else:
|
||||||
last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
|
# last_error_str = f'[red]{type(last_error).__name__}: {last_error}[/red]'
|
||||||
|
|
||||||
print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
|
# print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53] {last_error_str}')
|
||||||
# abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
|
# abx.pm.hook.on_actor_shutdown(actor=self, last_obj=last_obj, last_error=last_error)
|
||||||
|
pass
|
||||||
|
|
||||||
def on_tick_start(self, obj_to_process: ModelType) -> None:
|
def on_tick_start(self, obj_to_process: ModelType) -> None:
|
||||||
# print(f'🏃♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
|
# print(f'🏃♂️ {self}.on_tick_start() {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}')
|
||||||
|
@ -505,11 +511,11 @@ class ActorType(Generic[ModelType]):
|
||||||
|
|
||||||
|
|
||||||
def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
|
def on_tick_exception(self, obj_to_process: ModelType, error: Exception) -> None:
|
||||||
print(f'[red]🏃♂️ {self}.on_tick_exception()[/red] {obj_to_process.ABID} {obj_to_process.status} {obj_to_process.retry_at}: [red]{type(error).__name__}: {error}[/red]')
|
print(f'[red]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.on_tick_exception()[/red] [blue]{obj_to_process.status}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s: [red]{type(error).__name__}: {error}[/red]')
|
||||||
# abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
|
# abx.pm.hook.on_actor_tick_exception(actor=self, obj_to_process=obj_to_process, error=error)
|
||||||
|
|
||||||
def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
|
def on_state_change(self, obj_to_process: ModelType, starting_state, ending_state) -> None:
|
||||||
print(f'🏃♂️ {self}.on_state_change() {obj_to_process.ABID} {starting_state} ➡️ {ending_state}')
|
print(f'[blue]{self._obj_repr(obj_to_process)} 🏃♂️ {self}.on_state_change() {starting_state} ➡️ {ending_state}[/blue] +{(obj_to_process.retry_at - timezone.now()).total_seconds() if obj_to_process.retry_at else "-"}s')
|
||||||
# abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)
|
# abx.pm.hook.on_actor_state_change(actor=self, obj_to_process=obj_to_process, starting_state=starting_state, ending_state=ending_state)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -2,6 +2,7 @@ __package__ = 'archivebox.workers'
|
||||||
|
|
||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
|
import sys
|
||||||
import itertools
|
import itertools
|
||||||
from typing import Dict, Type, Literal, TYPE_CHECKING
|
from typing import Dict, Type, Literal, TYPE_CHECKING
|
||||||
from django.utils.functional import classproperty
|
from django.utils.functional import classproperty
|
||||||
|
@ -122,9 +123,9 @@ class Orchestrator:
|
||||||
# abx.pm.hook.on_orchestrator_shutdown(self)
|
# abx.pm.hook.on_orchestrator_shutdown(self)
|
||||||
|
|
||||||
def on_tick_started(self, all_queues):
|
def on_tick_started(self, all_queues):
|
||||||
total_pending = sum(queue.count() for queue in all_queues.values())
|
# total_pending = sum(queue.count() for queue in all_queues.values())
|
||||||
if total_pending:
|
# if total_pending:
|
||||||
print(f'👨✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
|
# print(f'👨✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
|
||||||
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
|
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@ -136,7 +137,8 @@ class Orchestrator:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
def on_idle(self, all_queues):
|
def on_idle(self, all_queues):
|
||||||
print(f'👨✈️ {self}.on_idle()', f'idle_count={self.idle_count}')
|
# print(f'👨✈️ {self}.on_idle()', f'idle_count={self.idle_count}')
|
||||||
|
print('.', end='', flush=True, file=sys.stderr)
|
||||||
# abx.pm.hook.on_orchestrator_idle(self)
|
# abx.pm.hook.on_orchestrator_idle(self)
|
||||||
# check for orphaned objects left behind
|
# check for orphaned objects left behind
|
||||||
if self.idle_count == 60:
|
if self.idle_count == 60:
|
||||||
|
@ -170,6 +172,7 @@ class Orchestrator:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
next_obj = queue.first()
|
next_obj = queue.first()
|
||||||
|
print()
|
||||||
print(f'🏃♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj and next_obj.retry_at else "None"}')
|
print(f'🏃♂️ {self}.runloop() {actor_type.__name__.ljust(20)} queue={str(queue.count()).ljust(3)} next={next_obj.abid if next_obj else "None"} {next_obj.status if next_obj else "None"} {(timezone.now() - next_obj.retry_at).total_seconds() if next_obj and next_obj.retry_at else "None"}')
|
||||||
self.idle_count = 0
|
self.idle_count = 0
|
||||||
try:
|
try:
|
||||||
|
|
Loading…
Reference in a new issue