Mirror of https://github.com/ArchiveBox/ArchiveBox, synced 2024-11-21 19:53:06 +00:00
v0.8.6-rc: Moving plugins to independent python packages with finite state machine interfaces (#1576)
Commit: b7b3addbab
261 changed files with 5687 additions and 2995 deletions
.github/workflows/test.yml (vendored): 2 lines changed
@@ -102,7 +102,7 @@ jobs:
         # TODO: remove this exception for windows once we get tests passing on that platform
         if: ${{ !contains(matrix.os, 'windows') }}
         run: |
-          python -m pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
+          python -m pytest -s --basetemp=tests/out --ignore=archivebox/pkgs

   docker_tests:
     runs-on: ubuntu-latest
.gitmodules (vendored): 6 lines changed
@@ -1,9 +1,3 @@
 [submodule "docs"]
 	path = docs
 	url = https://github.com/ArchiveBox/ArchiveBox.wiki.git
-[submodule "archivebox/vendor/pocket"]
-	path = archivebox/vendor/pocket
-	url = https://github.com/tapanpandita/pocket
-[submodule "archivebox/vendor/pydantic-pkgr"]
-	path = archivebox/vendor/pydantic-pkgr
-	url = https://github.com/ArchiveBox/pydantic-pkgr
@@ -3,4 +3,4 @@ ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E
 select = F,E9,W
 max-line-length = 130
 max-complexity = 10
-exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv
+exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data*
@@ -13,8 +13,8 @@ __package__ = 'archivebox'

import os
import sys

from pathlib import Path
from typing import cast

ASCII_LOGO = """
█████╗ ██████╗ ██████╗██╗ ██╗██╗██╗ ██╗███████╗ ██████╗ ██████╗ ██╗ ██╗
@@ -47,11 +47,54 @@ from .monkey_patches import *  # noqa


# print('LOADING VENDORED LIBRARIES')
-from .vendor import load_vendored_libs  # noqa
-load_vendored_libs()
+from .pkgs import load_vendored_pkgs  # noqa
+load_vendored_pkgs()
# print('DONE LOADING VENDORED LIBRARIES')

# Load ABX Plugin Specifications + Default Implementations
import abx  # noqa
import abx_spec_archivebox  # noqa
import abx_spec_config  # noqa
import abx_spec_pydantic_pkgr  # noqa
import abx_spec_django  # noqa
import abx_spec_searchbackend  # noqa

abx.pm.add_hookspecs(abx_spec_config.PLUGIN_SPEC)
abx.pm.register(abx_spec_config.PLUGIN_SPEC())

abx.pm.add_hookspecs(abx_spec_pydantic_pkgr.PLUGIN_SPEC)
abx.pm.register(abx_spec_pydantic_pkgr.PLUGIN_SPEC())

abx.pm.add_hookspecs(abx_spec_django.PLUGIN_SPEC)
abx.pm.register(abx_spec_django.PLUGIN_SPEC())

abx.pm.add_hookspecs(abx_spec_searchbackend.PLUGIN_SPEC)
abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())

# Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
pm = abx.pm


# Load all pip-installed ABX-compatible plugins
ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')

# Load all built-in ArchiveBox plugins
ARCHIVEBOX_BUILTIN_PLUGINS = {
    'config': PACKAGE_DIR / 'config',
    'core': PACKAGE_DIR / 'core',
    # 'search': PACKAGE_DIR / 'search',
    # 'core': PACKAGE_DIR / 'core',
}

# Load all user-defined ArchiveBox plugins
USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')

# Import all plugins and register them with ABX Plugin Manager
ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)

# Setup basic config, constants, paths, and version
from .config.constants import CONSTANTS  # noqa
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR  # noqa
from .config.version import VERSION  # noqa
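For context on how the entry-point discovery above works in practice: any pip-installed package can make itself visible to abx.get_pip_installed_plugins(group='abx') simply by declaring an entry point in the 'abx' group. The sketch below is illustrative only; the package name abx_plugin_example and its hook are hypothetical and not part of this commit.

# pyproject.toml of a hypothetical third-party plugin package:
#
#   [project.entry-points.abx]
#   abx_plugin_example = "abx_plugin_example"
#
# abx_plugin_example/__init__.py:

import abx

@abx.hookimpl
def get_CONFIG():
    # a real plugin would return {'EXAMPLE_CONFIG': <a config-set instance>} here
    return {}

# Once installed with pip, the package shows up in ABX_ECOSYSTEM_PLUGINS above and is
# imported and registered by abx.load_plugins(ALL_PLUGINS) alongside the built-in plugins.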
@@ -1,131 +0,0 @@ (file deleted in this commit)
__package__ = 'abx'

import importlib
from pathlib import Path
from typing import Dict, Callable, List

from . import hookspec as base_spec
from abx.hookspec import hookimpl, hookspec  # noqa
from abx.manager import pm, PluginManager  # noqa


pm.add_hookspecs(base_spec)


###### PLUGIN DISCOVERY AND LOADING ########################################################

def get_plugin_order(plugin_entrypoint: Path):
    order = 999
    try:
        # if .plugin_order file exists, use it to set the load priority
        order = int((plugin_entrypoint.parent / '.plugin_order').read_text())
    except FileNotFoundError:
        pass
    return (order, plugin_entrypoint)

def register_hookspecs(hookspecs: List[str]):
    """
    Register all the hookspecs from a list of module names.
    """
    for hookspec_import_path in hookspecs:
        hookspec_module = importlib.import_module(hookspec_import_path)
        pm.add_hookspecs(hookspec_module)


def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
    """
    Find all the plugins in a given directory. Just looks for an __init__.py file.
    """
    return {
        f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
        for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
        if plugin_entrypoint.parent.name != 'abx'
    }   # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"


def get_pip_installed_plugins(group='abx'):
    """replaces pm.load_setuptools_entrypoints("abx"), finds plugins that registered entrypoints via pip"""
    import importlib.metadata

    DETECTED_PLUGINS = {}   # module_name: module_dir_path
    for dist in list(importlib.metadata.distributions()):
        for entrypoint in dist.entry_points:
            if entrypoint.group != group or pm.is_blocked(entrypoint.name):
                continue
            DETECTED_PLUGINS[entrypoint.name] = Path(entrypoint.load().__file__).parent
            # pm.register(plugin, name=ep.name)
            # pm._plugin_distinfo.append((plugin, DistFacade(dist)))
    return DETECTED_PLUGINS


def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]):
    """
    Get the mapping of dir_name: {plugin_id: plugin_dir} for all plugins in the given directories.
    """
    DETECTED_PLUGINS = {}
    for plugin_prefix, plugin_dir in plugin_dirs.items():
        DETECTED_PLUGINS.update(find_plugins_in_dir(plugin_dir, prefix=plugin_prefix))
    return DETECTED_PLUGINS


# Load all plugins from pip packages, archivebox built-ins, and user plugins

def load_plugins(plugins_dict: Dict[str, Path]):
    """
    Load all the plugins from a dictionary of module names and directory paths.
    """
    LOADED_PLUGINS = {}
    for plugin_module, plugin_dir in plugins_dict.items():
        # print(f'Loading plugin: {plugin_module} from {plugin_dir}')
        plugin_module_loaded = importlib.import_module(plugin_module)
        pm.register(plugin_module_loaded)
        LOADED_PLUGINS[plugin_module] = plugin_module_loaded.PLUGIN
        # print(f'    √ Loaded plugin: {plugin_module}')
    return LOADED_PLUGINS

def get_registered_plugins():
    """
    Get all the plugins registered with Pluggy.
    """
    plugins = {}
    plugin_to_distinfo = dict(pm.list_plugin_distinfo())
    for plugin in pm.get_plugins():
        plugin_info = {
            "name": plugin.__name__,
            "hooks": [h.name for h in pm.get_hookcallers(plugin) or ()],
        }
        distinfo = plugin_to_distinfo.get(plugin)
        if distinfo:
            plugin_info["version"] = distinfo.version
            plugin_info["name"] = (
                getattr(distinfo, "name", None) or distinfo.project_name
            )
        plugins[plugin_info["name"]] = plugin_info
    return plugins


def get_plugin_hooks(plugin_pkg: str | None) -> Dict[str, Callable]:
    """
    Get all the functions marked with @hookimpl on a module.
    """
    if not plugin_pkg:
        return {}

    hooks = {}

    plugin_module = importlib.import_module(plugin_pkg)
    for attr_name in dir(plugin_module):
        if attr_name.startswith('_'):
            continue
        try:
            attr = getattr(plugin_module, attr_name)
            if isinstance(attr, Callable):
                hooks[attr_name] = None
                pm.parse_hookimpl_opts(plugin_module, attr_name)
                hooks[attr_name] = attr
        except Exception as e:
            print(f'Error getting hookimpls for {plugin_pkg}: {e}')

    return hooks
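For reference, the discovery helpers in the deleted module above were composed roughly like this by callers. This is a sketch using a hypothetical user_plugins directory: each discovered module must be importable and expose a PLUGIN attribute, because load_plugins() reads plugin_module_loaded.PLUGIN.

from pathlib import Path
import abx

# map of {module_name: module_dir} for plugins found on disk and via pip entrypoints
user_plugins = abx.find_plugins_in_dir(Path('data/user_plugins'), prefix='user_plugins')
pip_plugins = abx.get_pip_installed_plugins(group='abx')

# import every module and register it with the shared pluggy PluginManager
loaded = abx.load_plugins({**pip_plugins, **user_plugins})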
@@ -1,30 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.archivebox'

import os
import importlib

from typing import Dict
from pathlib import Path


def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
    """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
    LOADED_PLUGINS = {}
    for plugin_module, plugin_dir in reversed(plugins_dict.items()):
        # print(f'Loading plugin: {plugin_module} from {plugin_dir}')

        # 1. register the plugin module directly in case it contains any loose hookimpls (e.g. in __init__.py)
        try:
            plugin_module_loaded = importlib.import_module(plugin_module)
            pm.register(plugin_module_loaded)
        except Exception as e:
            print(f'Error registering plugin: {plugin_module} - {e}')

        # 2. then try to import plugin_module.apps as well
        if os.access(plugin_dir / 'apps.py', os.R_OK):
            plugin_apps = importlib.import_module(plugin_module + '.apps')
            pm.register(plugin_apps)   # register the whole .apps in case it contains loose hookimpls (not in a class)

        # print(f'    √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
    return LOADED_PLUGINS
@@ -1,106 +0,0 @@ (file deleted in this commit)
__package__ = "abx.archivebox"

import os
from typing import Optional, cast
from typing_extensions import Self

from pydantic import validate_call
from pydantic_pkgr import (
    Binary,
    BinProvider,
    BinProviderName,
    AptProvider,
    BrewProvider,
    EnvProvider,
)

from archivebox.config.permissions import ARCHIVEBOX_USER


class BaseBinProvider(BinProvider):

    # TODO: add install/load/load_or_install methods as abx.hookimpl methods

    @property
    def admin_url(self) -> str:
        # e.g. /admin/environment/binproviders/NpmBinProvider/   TODO
        return "/admin/environment/binaries/"

class BaseBinary(Binary):

    @staticmethod
    def symlink_to_lib(binary, bin_dir=None) -> None:
        from archivebox.config.common import STORAGE_CONFIG
        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'

        if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
            return

        try:
            bin_dir.mkdir(parents=True, exist_ok=True)
            symlink = bin_dir / binary.name
            symlink.unlink(missing_ok=True)
            symlink.symlink_to(binary.abspath)
            symlink.chmod(0o777)               # make sure it's executable by everyone
        except Exception as err:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
            # not actually needed, we can just run without it
            pass

    @validate_call
    def load(self, fresh=False, **kwargs) -> Self:
        from archivebox.config.common import STORAGE_CONFIG
        if fresh:
            binary = super().load(**kwargs)
            self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
        else:
            # get cached binary from db
            try:
                from machine.models import InstalledBinary
                installed_binary = InstalledBinary.objects.get_from_db_or_cache(self)   # type: ignore
                binary = InstalledBinary.load_from_db(installed_binary)
            except Exception:
                # maybe we are not in a DATA dir so there is no db, fallback to reading from fs
                # (e.g. when archivebox version is run outside of a DATA dir)
                binary = super().load(**kwargs)
        return cast(Self, binary)

    @validate_call
    def install(self, **kwargs) -> Self:
        from archivebox.config.common import STORAGE_CONFIG
        binary = super().install(**kwargs)
        self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
        return binary

    @validate_call
    def load_or_install(self, fresh=False, **kwargs) -> Self:
        from archivebox.config.common import STORAGE_CONFIG
        try:
            binary = self.load(fresh=fresh)
            if binary and binary.version:
                self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
                return binary
        except Exception:
            pass
        return self.install(**kwargs)

    @property
    def admin_url(self) -> str:
        # e.g. /admin/environment/config/LdapConfig/
        return f"/admin/environment/binaries/{self.name}/"


class AptBinProvider(AptProvider, BaseBinProvider):
    name: BinProviderName = "apt"

class BrewBinProvider(BrewProvider, BaseBinProvider):
    name: BinProviderName = "brew"

class EnvBinProvider(EnvProvider, BaseBinProvider):
    name: BinProviderName = "env"

    euid: Optional[int] = ARCHIVEBOX_USER

apt = AptBinProvider()
brew = BrewBinProvider()
env = EnvBinProvider()
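To illustrate how the BaseBinary class above was used by plugins: a plugin subclasses it, names the binary, and lists which BinProviders can supply it. The sketch below is hypothetical; the WgetBinary class and the binproviders_supported field name are assumptions based on pydantic_pkgr, not code from this commit.

from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinName, BinProvider

class WgetBinary(BaseBinary):
    name: BinName = 'wget'
    # assumed pydantic_pkgr field: providers to try in order (apt/brew/env instances defined above)
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

wget = WgetBinary()
# finds wget on $PATH or installs it, then symlinks it into LIB_DIR/bin as shown above
print(wget.load_or_install().abspath)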
@ -1,219 +0,0 @@
|
|||
__package__ = 'abx.archivebox'
|
||||
|
||||
import json
|
||||
import os
|
||||
|
||||
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
|
||||
from typing_extensions import Self
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import model_validator, AfterValidator
|
||||
from pydantic_pkgr import BinName
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils import timezone
|
||||
|
||||
import abx
|
||||
|
||||
from .base_binary import BaseBinary
|
||||
|
||||
|
||||
def no_empty_args(args: List[str]) -> List[str]:
|
||||
assert all(len(arg) for arg in args)
|
||||
return args
|
||||
|
||||
ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
|
||||
|
||||
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
|
||||
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
|
||||
|
||||
|
||||
class BaseExtractor:
|
||||
|
||||
name: ExtractorName
|
||||
binary: BinName
|
||||
|
||||
output_path_func: HandlerFuncStr = 'self.get_output_path'
|
||||
should_extract_func: HandlerFuncStr = 'self.should_extract'
|
||||
extract_func: HandlerFuncStr = 'self.extract'
|
||||
exec_func: HandlerFuncStr = 'self.exec'
|
||||
|
||||
default_args: CmdArgsList = []
|
||||
extra_args: CmdArgsList = []
|
||||
args: Optional[CmdArgsList] = None
|
||||
|
||||
@model_validator(mode='after')
|
||||
def validate_model(self) -> Self:
|
||||
if self.args is None:
|
||||
self.args = [*self.default_args, *self.extra_args]
|
||||
return self
|
||||
|
||||
|
||||
def get_output_path(self, snapshot) -> Path:
|
||||
return Path(self.__class__.__name__.lower())
|
||||
|
||||
def should_extract(self, uri: str, config: dict | None=None) -> bool:
|
||||
try:
|
||||
assert self.detect_installed_binary().version
|
||||
except Exception:
|
||||
raise
|
||||
# could not load binary
|
||||
return False
|
||||
|
||||
# output_dir = self.get_output_path(snapshot)
|
||||
# if output_dir.glob('*.*'):
|
||||
# return False
|
||||
return True
|
||||
|
||||
@abx.hookimpl
|
||||
def extract(self, snapshot_id: str) -> Dict[str, Any]:
|
||||
from core.models import Snapshot
|
||||
from archivebox import CONSTANTS
|
||||
|
||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
||||
|
||||
if not self.should_extract(snapshot):
|
||||
return {}
|
||||
|
||||
status = 'failed'
|
||||
start_ts = timezone.now()
|
||||
uplink = self.detect_network_interface()
|
||||
installed_binary = self.detect_installed_binary()
|
||||
machine = installed_binary.machine
|
||||
assert uplink.machine == installed_binary.machine # it would be *very* weird if this wasn't true
|
||||
|
||||
output_dir = CONSTANTS.DATA_DIR / '.tmp' / 'extractors' / self.name / str(snapshot.abid)
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# execute the extractor binary with the given args
|
||||
args = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
|
||||
cmd = [str(installed_binary.abspath), *args]
|
||||
proc = self.exec(installed_binary=installed_binary, args=args, cwd=output_dir)
|
||||
|
||||
# collect the output
|
||||
end_ts = timezone.now()
|
||||
output_files = list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*'))
|
||||
stdout = proc.stdout.strip()
|
||||
stderr = proc.stderr.strip()
|
||||
output_json = None
|
||||
output_text = stdout
|
||||
try:
|
||||
output_json = json.loads(stdout.strip())
|
||||
output_text = None
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
errors = []
|
||||
if proc.returncode == 0:
|
||||
status = 'success'
|
||||
else:
|
||||
errors.append(f'{installed_binary.name} returned non-zero exit code: {proc.returncode}')
|
||||
|
||||
# increment health stats counters
|
||||
if status == 'success':
|
||||
machine.record_health_success()
|
||||
uplink.record_health_success()
|
||||
installed_binary.record_health_success()
|
||||
else:
|
||||
machine.record_health_failure()
|
||||
uplink.record_health_failure()
|
||||
installed_binary.record_health_failure()
|
||||
|
||||
return {
|
||||
'extractor': self.name,
|
||||
|
||||
'snapshot': {
|
||||
'id': snapshot.id,
|
||||
'abid': snapshot.abid,
|
||||
'url': snapshot.url,
|
||||
'created_by_id': snapshot.created_by_id,
|
||||
},
|
||||
|
||||
'machine': {
|
||||
'id': machine.id,
|
||||
'abid': machine.abid,
|
||||
'guid': machine.guid,
|
||||
'hostname': machine.hostname,
|
||||
'hw_in_docker': machine.hw_in_docker,
|
||||
'hw_in_vm': machine.hw_in_vm,
|
||||
'hw_manufacturer': machine.hw_manufacturer,
|
||||
'hw_product': machine.hw_product,
|
||||
'hw_uuid': machine.hw_uuid,
|
||||
'os_arch': machine.os_arch,
|
||||
'os_family': machine.os_family,
|
||||
'os_platform': machine.os_platform,
|
||||
'os_release': machine.os_release,
|
||||
'os_kernel': machine.os_kernel,
|
||||
},
|
||||
|
||||
'uplink': {
|
||||
'id': uplink.id,
|
||||
'abid': uplink.abid,
|
||||
'mac_address': uplink.mac_address,
|
||||
'ip_public': uplink.ip_public,
|
||||
'ip_local': uplink.ip_local,
|
||||
'dns_server': uplink.dns_server,
|
||||
'hostname': uplink.hostname,
|
||||
'iface': uplink.iface,
|
||||
'isp': uplink.isp,
|
||||
'city': uplink.city,
|
||||
'region': uplink.region,
|
||||
'country': uplink.country,
|
||||
},
|
||||
|
||||
'binary': {
|
||||
'id': installed_binary.id,
|
||||
'abid': installed_binary.abid,
|
||||
'name': installed_binary.name,
|
||||
'binprovider': installed_binary.binprovider,
|
||||
'abspath': installed_binary.abspath,
|
||||
'version': installed_binary.version,
|
||||
'sha256': installed_binary.sha256,
|
||||
},
|
||||
|
||||
'cmd': cmd,
|
||||
'stdout': stdout,
|
||||
'stderr': stderr,
|
||||
'returncode': proc.returncode,
|
||||
'start_ts': start_ts,
|
||||
'end_ts': end_ts,
|
||||
|
||||
'status': status,
|
||||
'errors': errors,
|
||||
'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
|
||||
'output_files': output_files,
|
||||
'output_json': output_json or {},
|
||||
'output_text': output_text or '',
|
||||
}
|
||||
|
||||
# TODO: move this to a hookimpl
|
||||
def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
|
||||
cwd = cwd or Path(os.getcwd())
|
||||
binary = self.load_binary(installed_binary=installed_binary)
|
||||
|
||||
return binary.exec(cmd=args, cwd=cwd)
|
||||
|
||||
@cached_property
|
||||
def BINARY(self) -> BaseBinary:
|
||||
import abx.archivebox.reads
|
||||
for binary in abx.archivebox.reads.get_BINARIES().values():
|
||||
if binary.name == self.binary:
|
||||
return binary
|
||||
raise ValueError(f'Binary {self.binary} not found')
|
||||
|
||||
def detect_installed_binary(self):
|
||||
from machine.models import InstalledBinary
|
||||
# hydrates binary from DB/cache if record of installed version is recent enough
|
||||
# otherwise it finds it from scratch by detecting installed version/abspath/sha256 on host
|
||||
return InstalledBinary.objects.get_from_db_or_cache(self.BINARY)
|
||||
|
||||
def load_binary(self, installed_binary=None) -> BaseBinary:
|
||||
installed_binary = installed_binary or self.detect_installed_binary()
|
||||
return installed_binary.load_from_db()
|
||||
|
||||
def detect_network_interface(self):
|
||||
from machine.models import NetworkInterface
|
||||
return NetworkInterface.objects.current()
|
||||
|
||||
@abx.hookimpl
|
||||
def get_EXTRACTORS(self):
|
||||
return [self]
|
|
@@ -1,25 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.archivebox'

import abx


class BaseReplayer:
    """Describes how to render an ArchiveResult in several contexts"""

    url_pattern: str = '*'

    row_template: str = 'plugins/generic_replayer/templates/row.html'
    embed_template: str = 'plugins/generic_replayer/templates/embed.html'
    fullpage_template: str = 'plugins/generic_replayer/templates/fullpage.html'

    # row_view: LazyImportStr = 'plugins.generic_replayer.views.row_view'
    # embed_view: LazyImportStr = 'plugins.generic_replayer.views.embed_view'
    # fullpage_view: LazyImportStr = 'plugins.generic_replayer.views.fullpage_view'
    # icon_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
    # thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'

    @abx.hookimpl
    def get_REPLAYERS(self):
        return [self]

    # TODO: add hookimpl methods for get_row_template, get_embed_template, get_fullpage_template, etc...
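A plugin that wanted its own replayer would subclass BaseReplayer and override the class attributes, for example as in this sketch (the PDF example and template paths are hypothetical):

class PDFReplayer(BaseReplayer):
    url_pattern = '*.pdf'                                          # only handle PDF snapshots
    row_template = 'plugins/pdf_replayer/templates/row.html'       # hypothetical template paths
    embed_template = 'plugins/pdf_replayer/templates/embed.html'
    fullpage_template = 'plugins/pdf_replayer/templates/fullpage.html'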
@@ -1,25 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.archivebox'

from typing import Iterable, List
import abc


class BaseSearchBackend(abc.ABC):
    name: str

    @staticmethod
    @abc.abstractmethod
    def index(snapshot_id: str, texts: List[str]):
        return

    @staticmethod
    @abc.abstractmethod
    def flush(snapshot_ids: Iterable[str]):
        return

    @staticmethod
    @abc.abstractmethod
    def search(text: str) -> List[str]:
        raise NotImplementedError("search method must be implemented by subclass")
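As a concrete illustration of this interface, a minimal non-persistent backend might look like the sketch below (demo only, held in memory); ArchiveBox's real backends implement the same three methods against an actual index.

from typing import Iterable, List

class SimpleInMemoryBackend(BaseSearchBackend):
    name = 'simple_inmemory'
    _index: dict = {}   # snapshot_id -> concatenated lowercase text (not persistent, demo only)

    @staticmethod
    def index(snapshot_id: str, texts: List[str]):
        SimpleInMemoryBackend._index[snapshot_id] = ' '.join(texts).lower()

    @staticmethod
    def flush(snapshot_ids: Iterable[str]):
        for snapshot_id in snapshot_ids:
            SimpleInMemoryBackend._index.pop(snapshot_id, None)

    @staticmethod
    def search(text: str) -> List[str]:
        return [sid for sid, body in SimpleInMemoryBackend._index.items() if text.lower() in body]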
@@ -1,52 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.archivebox'

from typing import Dict, Any

from .. import hookspec

from .base_binary import BaseBinary, BaseBinProvider
from .base_configset import BaseConfigSet
from .base_extractor import BaseExtractor
from .base_searchbackend import BaseSearchBackend


@hookspec
def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
    return {}

@hookspec
def get_CONFIG() -> Dict[str, BaseConfigSet]:
    return {}


@hookspec
def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
    return {}

@hookspec
def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
    return {}

# @hookspec
# def get_REPLAYERS() -> Dict[str, BaseReplayer]:
#     return {}

# @hookspec
# def get_ADMINDATAVIEWS():
#     return {}

# @hookspec
# def get_QUEUES():
#     return {}


##############################################################
# provided by abx.pydantic_pkgr.hookspec:
# @hookspec
# def get_BINARIES() -> Dict[str, BaseBinary]:
#     return {}

# @hookspec
# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
#     return {}
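On the plugin side, each of these specs is satisfied by a module-level function of the same name marked with @abx.hookimpl; pluggy collects the return values from every registered plugin, and helpers like get_CONFIGS() then merge them. A minimal hedged sketch (the example ids are hypothetical):

import abx

@abx.hookimpl
def get_PLUGIN():
    # describe this plugin under a unique id
    return {'example_plugin': {'id': 'example_plugin', 'package': __package__, 'label': 'Example Plugin'}}

@abx.hookimpl
def get_CONFIG():
    # a real plugin would return {'example_plugin': <BaseConfigSet instance>}
    return {}

@abx.hookimpl
def get_SEARCHBACKENDS():
    # a real plugin would return {'example': <BaseSearchBackend implementation>}
    return {}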
@ -1,160 +0,0 @@
|
|||
__package__ = 'abx.archivebox'
|
||||
|
||||
import importlib
|
||||
from typing import Dict, Set, Any, TYPE_CHECKING
|
||||
|
||||
from benedict import benedict
|
||||
|
||||
import abx
|
||||
from .. import pm
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .base_configset import BaseConfigSet
|
||||
from .base_binary import BaseBinary, BaseBinProvider
|
||||
from .base_extractor import BaseExtractor
|
||||
from .base_searchbackend import BaseSearchBackend
|
||||
# from .base_replayer import BaseReplayer
|
||||
# from .base_queue import BaseQueue
|
||||
# from .base_admindataview import BaseAdminDataView
|
||||
|
||||
# API exposed to ArchiveBox code
|
||||
|
||||
def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
|
||||
return benedict({
|
||||
plugin_id: plugin
|
||||
for plugin_dict in pm.hook.get_PLUGIN()
|
||||
for plugin_id, plugin in plugin_dict.items()
|
||||
})
|
||||
|
||||
def get_PLUGIN(plugin_id: str) -> Dict[str, Any]:
|
||||
plugin_info = get_PLUGINS().get(plugin_id, {})
|
||||
package = plugin_info.get('package', plugin_info.get('PACKAGE', None))
|
||||
if not package:
|
||||
return {'id': plugin_id, 'hooks': {}}
|
||||
module = importlib.import_module(package)
|
||||
hooks = abx.get_plugin_hooks(module.__package__)
|
||||
assert plugin_info and (plugin_info.get('id') or plugin_info.get('ID') or hooks)
|
||||
|
||||
return benedict({
|
||||
'id': plugin_id,
|
||||
'label': getattr(module, '__label__', plugin_id),
|
||||
'module': module,
|
||||
'package': module.__package__,
|
||||
'hooks': hooks,
|
||||
'version': getattr(module, '__version__', '999.999.999'),
|
||||
'author': getattr(module, '__author__', 'Unknown'),
|
||||
'homepage': getattr(module, '__homepage__', 'https://github.com/ArchiveBox/ArchiveBox'),
|
||||
'dependencies': getattr(module, '__dependencies__', []),
|
||||
'source_code': module.__file__,
|
||||
**plugin_info,
|
||||
})
|
||||
|
||||
|
||||
def get_HOOKS() -> Set[str]:
|
||||
return {
|
||||
hook_name
|
||||
for plugin_id in get_PLUGINS().keys()
|
||||
for hook_name in get_PLUGIN(plugin_id).hooks
|
||||
}
|
||||
|
||||
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
|
||||
return benedict({
|
||||
config_id: configset
|
||||
for plugin_configs in pm.hook.get_CONFIG()
|
||||
for config_id, configset in plugin_configs.items()
|
||||
})
|
||||
|
||||
|
||||
def get_FLAT_CONFIG() -> Dict[str, Any]:
|
||||
return benedict({
|
||||
key: value
|
||||
for configset in get_CONFIGS().values()
|
||||
for key, value in configset.model_dump().items()
|
||||
})
|
||||
|
||||
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
|
||||
# TODO: move these to plugins
|
||||
from abx.archivebox.base_binary import apt, brew, env
|
||||
builtin_binproviders = {
|
||||
'env': env,
|
||||
'apt': apt,
|
||||
'brew': brew,
|
||||
}
|
||||
|
||||
return benedict({
|
||||
binprovider_id: binprovider
|
||||
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
|
||||
for binprovider_id, binprovider in plugin_binproviders.items()
|
||||
})
|
||||
|
||||
def get_BINARIES() -> Dict[str, 'BaseBinary']:
|
||||
return benedict({
|
||||
binary_id: binary
|
||||
for plugin_binaries in pm.hook.get_BINARIES()
|
||||
for binary_id, binary in plugin_binaries.items()
|
||||
})
|
||||
|
||||
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
|
||||
return benedict({
|
||||
extractor_id: extractor
|
||||
for plugin_extractors in pm.hook.get_EXTRACTORS()
|
||||
for extractor_id, extractor in plugin_extractors.items()
|
||||
})
|
||||
|
||||
# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
|
||||
# return benedict({
|
||||
# replayer.id: replayer
|
||||
# for plugin_replayers in pm.hook.get_REPLAYERS()
|
||||
# for replayer in plugin_replayers
|
||||
# })
|
||||
|
||||
# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
|
||||
# return benedict({
|
||||
# admin_dataview.id: admin_dataview
|
||||
# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
|
||||
# for admin_dataview in plugin_admin_dataviews
|
||||
# })
|
||||
|
||||
# def get_QUEUES() -> Dict[str, 'BaseQueue']:
|
||||
# return benedict({
|
||||
# queue.id: queue
|
||||
# for plugin_queues in pm.hook.get_QUEUES()
|
||||
# for queue in plugin_queues
|
||||
# })
|
||||
|
||||
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
|
||||
return benedict({
|
||||
searchbackend_id: searchbackend
|
||||
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
|
||||
for searchbackend_id,searchbackend in plugin_searchbackends.items()
|
||||
})
|
||||
|
||||
|
||||
|
||||
def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
|
||||
"""Get all the relevant config for the given scope, in correct precedence order"""
|
||||
|
||||
from django.conf import settings
|
||||
default_config: benedict = defaults or settings.CONFIG
|
||||
|
||||
snapshot = snapshot or (archiveresult and archiveresult.snapshot)
|
||||
crawl = crawl or (snapshot and snapshot.crawl)
|
||||
seed = seed or (crawl and crawl.seed)
|
||||
persona = persona or (crawl and crawl.persona)
|
||||
|
||||
persona_config = persona.config if persona else {}
|
||||
seed_config = seed.config if seed else {}
|
||||
crawl_config = crawl.config if crawl else {}
|
||||
snapshot_config = snapshot.config if snapshot else {}
|
||||
archiveresult_config = archiveresult.config if archiveresult else {}
|
||||
extra_config = extra_config or {}
|
||||
|
||||
return {
|
||||
**default_config, # defaults / config file / environment variables
|
||||
**persona_config, # lowest precedence
|
||||
**seed_config,
|
||||
**crawl_config,
|
||||
**snapshot_config,
|
||||
**archiveresult_config,
|
||||
**extra_config, # highest precedence
|
||||
}
|
|
@@ -1 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.django'

@@ -1,13 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.django'

from django.apps import AppConfig


class ABXConfig(AppConfig):
    name = 'abx'

    def ready(self):
        import abx
        from django.conf import settings

        abx.pm.hook.ready(settings=settings)
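ABXConfig.ready() above fans Django's app-ready signal out to every plugin through the ready() hookspec, so plugins can run one-time startup logic after all apps are loaded. A sketch of the plugin-side counterpart (hypothetical body):

import abx

@abx.hookimpl
def ready():
    # runs once at Django startup; an implementation may also declare the `settings`
    # parameter to receive the kwarg that ABXConfig passes via abx.pm.hook.ready(settings=settings)
    print('example plugin ready')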
@ -1,125 +0,0 @@
|
|||
__package__ = 'abx.django'
|
||||
|
||||
from ..hookspec import hookspec
|
||||
|
||||
|
||||
###########################################################################################
|
||||
|
||||
@hookspec
|
||||
def get_INSTALLED_APPS():
|
||||
"""Return a list of apps to add to INSTALLED_APPS"""
|
||||
# e.g. ['your_plugin_type.plugin_name']
|
||||
return []
|
||||
|
||||
# @hookspec
|
||||
# def register_INSTALLED_APPS(INSTALLED_APPS):
|
||||
# """Mutate INSTALLED_APPS in place to add your app in a specific position"""
|
||||
# # idx_of_contrib = INSTALLED_APPS.index('django.contrib.auth')
|
||||
# # INSTALLED_APPS.insert(idx_of_contrib + 1, 'your_plugin_type.plugin_name')
|
||||
# pass
|
||||
|
||||
|
||||
@hookspec
|
||||
def get_TEMPLATE_DIRS():
|
||||
return [] # e.g. ['your_plugin_type/plugin_name/templates']
|
||||
|
||||
# @hookspec
|
||||
# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
|
||||
# """Install django settings"""
|
||||
# # e.g. TEMPLATE_DIRS.insert(0, 'your_plugin_type/plugin_name/templates')
|
||||
# pass
|
||||
|
||||
|
||||
@hookspec
|
||||
def get_STATICFILES_DIRS():
|
||||
return [] # e.g. ['your_plugin_type/plugin_name/static']
|
||||
|
||||
# @hookspec
|
||||
# def register_STATICFILES_DIRS(STATICFILES_DIRS):
|
||||
# """Mutate STATICFILES_DIRS in place to add your static dirs in a specific position"""
|
||||
# # e.g. STATICFILES_DIRS.insert(0, 'your_plugin_type/plugin_name/static')
|
||||
# pass
|
||||
|
||||
|
||||
@hookspec
|
||||
def get_MIDDLEWARE():
|
||||
return [] # e.g. ['your_plugin_type.plugin_name.middleware.YourMiddleware']
|
||||
|
||||
# @hookspec
|
||||
# def register_MIDDLEWARE(MIDDLEWARE):
|
||||
# """Mutate MIDDLEWARE in place to add your middleware in a specific position"""
|
||||
# # e.g. MIDDLEWARE.insert(0, 'your_plugin_type.plugin_name.middleware.YourMiddleware')
|
||||
# pass
|
||||
|
||||
|
||||
@hookspec
|
||||
def get_AUTHENTICATION_BACKENDS():
|
||||
return [] # e.g. ['django_auth_ldap.backend.LDAPBackend']
|
||||
|
||||
# @hookspec
|
||||
# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
|
||||
# """Mutate AUTHENTICATION_BACKENDS in place to add your auth backends in a specific position"""
|
||||
# # e.g. AUTHENTICATION_BACKENDS.insert(0, 'your_plugin_type.plugin_name.backend.YourBackend')
|
||||
# pass
|
||||
|
||||
@hookspec
|
||||
def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME):
|
||||
return [] # e.g. [{'name': 'your_plugin_type.plugin_name', 'HUEY': {...}}]
|
||||
|
||||
# @hookspec
|
||||
# def register_DJANGO_HUEY(DJANGO_HUEY):
|
||||
# """Mutate DJANGO_HUEY in place to add your huey queues in a specific position"""
|
||||
# # e.g. DJANGO_HUEY['queues']['some_queue_name']['some_setting'] = 'some_value'
|
||||
# pass
|
||||
|
||||
|
||||
@hookspec
|
||||
def get_ADMIN_DATA_VIEWS_URLS():
|
||||
return []
|
||||
|
||||
# @hookspec
|
||||
# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
|
||||
# """Mutate ADMIN_DATA_VIEWS in place to add your admin data views in a specific position"""
|
||||
# # e.g. ADMIN_DATA_VIEWS['URLS'].insert(0, 'your_plugin_type/plugin_name/admin_data_views.py')
|
||||
# pass
|
||||
|
||||
|
||||
# @hookspec
|
||||
# def register_settings(settings):
|
||||
# """Mutate settings in place to add your settings / modify existing settings"""
|
||||
# # settings.SOME_KEY = 'some_value'
|
||||
# pass
|
||||
|
||||
|
||||
###########################################################################################
|
||||
|
||||
@hookspec
|
||||
def get_urlpatterns():
|
||||
return [] # e.g. [path('your_plugin_type/plugin_name/url.py', your_view)]
|
||||
|
||||
# @hookspec
|
||||
# def register_urlpatterns(urlpatterns):
|
||||
# """Mutate urlpatterns in place to add your urlpatterns in a specific position"""
|
||||
# # e.g. urlpatterns.insert(0, path('your_plugin_type/plugin_name/url.py', your_view))
|
||||
# pass
|
||||
|
||||
###########################################################################################
|
||||
|
||||
@hookspec
|
||||
def register_checks():
|
||||
"""Register django checks with django system checks system"""
|
||||
pass
|
||||
|
||||
@hookspec
|
||||
def register_admin(admin_site):
|
||||
"""Register django admin views/models with the main django admin site instance"""
|
||||
pass
|
||||
|
||||
|
||||
###########################################################################################
|
||||
|
||||
|
||||
@hookspec
|
||||
def ready():
|
||||
"""Called when Django apps app.ready() are triggered"""
|
||||
pass
|
|
@ -1,101 +0,0 @@
|
|||
__package__ = 'abx.django'
|
||||
|
||||
import itertools
|
||||
# from benedict import benedict
|
||||
|
||||
from .. import pm
|
||||
|
||||
|
||||
def get_INSTALLED_APPS():
|
||||
return itertools.chain(*reversed(pm.hook.get_INSTALLED_APPS()))
|
||||
|
||||
# def register_INSTALLLED_APPS(INSTALLED_APPS):
|
||||
# pm.hook.register_INSTALLED_APPS(INSTALLED_APPS=INSTALLED_APPS)
|
||||
|
||||
|
||||
def get_MIDDLEWARES():
|
||||
return itertools.chain(*reversed(pm.hook.get_MIDDLEWARE()))
|
||||
|
||||
# def register_MIDDLEWARES(MIDDLEWARE):
|
||||
# pm.hook.register_MIDDLEWARE(MIDDLEWARE=MIDDLEWARE)
|
||||
|
||||
|
||||
def get_AUTHENTICATION_BACKENDS():
|
||||
return itertools.chain(*reversed(pm.hook.get_AUTHENTICATION_BACKENDS()))
|
||||
|
||||
# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
|
||||
# pm.hook.register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS=AUTHENTICATION_BACKENDS)
|
||||
|
||||
|
||||
def get_STATICFILES_DIRS():
|
||||
return itertools.chain(*reversed(pm.hook.get_STATICFILES_DIRS()))
|
||||
|
||||
# def register_STATICFILES_DIRS(STATICFILES_DIRS):
|
||||
# pm.hook.register_STATICFILES_DIRS(STATICFILES_DIRS=STATICFILES_DIRS)
|
||||
|
||||
|
||||
def get_TEMPLATE_DIRS():
|
||||
return itertools.chain(*reversed(pm.hook.get_TEMPLATE_DIRS()))
|
||||
|
||||
# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
|
||||
# pm.hook.register_TEMPLATE_DIRS(TEMPLATE_DIRS=TEMPLATE_DIRS)
|
||||
|
||||
def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME='queue.sqlite3'):
|
||||
HUEY_QUEUES = {}
|
||||
for plugin_result in pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=QUEUE_DATABASE_NAME):
|
||||
HUEY_QUEUES.update(plugin_result)
|
||||
return HUEY_QUEUES
|
||||
|
||||
# def register_DJANGO_HUEY(DJANGO_HUEY):
|
||||
# pm.hook.register_DJANGO_HUEY(DJANGO_HUEY=DJANGO_HUEY)
|
||||
|
||||
def get_ADMIN_DATA_VIEWS_URLS():
|
||||
return itertools.chain(*reversed(pm.hook.get_ADMIN_DATA_VIEWS_URLS()))
|
||||
|
||||
# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
|
||||
# pm.hook.register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS=ADMIN_DATA_VIEWS)
|
||||
|
||||
|
||||
# def register_settings(settings):
|
||||
# # convert settings dict to an benedict so we can set values using settings.attr = xyz notation
|
||||
# settings_as_obj = benedict(settings, keypath_separator=None)
|
||||
|
||||
# # set default values for settings that are used by plugins
|
||||
# # settings_as_obj.INSTALLED_APPS = settings_as_obj.get('INSTALLED_APPS', [])
|
||||
# # settings_as_obj.MIDDLEWARE = settings_as_obj.get('MIDDLEWARE', [])
|
||||
# # settings_as_obj.AUTHENTICATION_BACKENDS = settings_as_obj.get('AUTHENTICATION_BACKENDS', [])
|
||||
# # settings_as_obj.STATICFILES_DIRS = settings_as_obj.get('STATICFILES_DIRS', [])
|
||||
# # settings_as_obj.TEMPLATE_DIRS = settings_as_obj.get('TEMPLATE_DIRS', [])
|
||||
# # settings_as_obj.DJANGO_HUEY = settings_as_obj.get('DJANGO_HUEY', {'queues': {}})
|
||||
# # settings_as_obj.ADMIN_DATA_VIEWS = settings_as_obj.get('ADMIN_DATA_VIEWS', {'URLS': []})
|
||||
|
||||
# # # call all the hook functions to mutate the settings values in-place
|
||||
# # register_INSTALLLED_APPS(settings_as_obj.INSTALLED_APPS)
|
||||
# # register_MIDDLEWARES(settings_as_obj.MIDDLEWARE)
|
||||
# # register_AUTHENTICATION_BACKENDS(settings_as_obj.AUTHENTICATION_BACKENDS)
|
||||
# # register_STATICFILES_DIRS(settings_as_obj.STATICFILES_DIRS)
|
||||
# # register_TEMPLATE_DIRS(settings_as_obj.TEMPLATE_DIRS)
|
||||
# # register_DJANGO_HUEY(settings_as_obj.DJANGO_HUEY)
|
||||
# # register_ADMIN_DATA_VIEWS(settings_as_obj.ADMIN_DATA_VIEWS)
|
||||
|
||||
# # calls Plugin.settings(settings) on each registered plugin
|
||||
# pm.hook.register_settings(settings=settings_as_obj)
|
||||
|
||||
# # then finally update the settings globals() object will all the new settings
|
||||
# # settings.update(settings_as_obj)
|
||||
|
||||
|
||||
def get_urlpatterns():
|
||||
return list(itertools.chain(*pm.hook.urlpatterns()))
|
||||
|
||||
def register_urlpatterns(urlpatterns):
|
||||
pm.hook.register_urlpatterns(urlpatterns=urlpatterns)
|
||||
|
||||
|
||||
def register_checks():
|
||||
"""register any django system checks"""
|
||||
pm.hook.register_checks()
|
||||
|
||||
def register_admin(admin_site):
|
||||
"""register any django admin models/views with the main django admin site instance"""
|
||||
pm.hook.register_admin(admin_site=admin_site)
|
|
@@ -1,22 +0,0 @@ (file deleted in this commit)
from pathlib import Path

from pluggy import HookimplMarker
from pluggy import HookspecMarker

spec = hookspec = HookspecMarker("abx")
impl = hookimpl = HookimplMarker("abx")


@hookspec
@hookimpl
def get_system_user() -> str:
    # Beware: $HOME may not match the current EUID, UID, PUID, or SUID; there are edge cases:
    # - sudo (EUID != UID != SUID)
    # - running with an autodetected UID based on data dir ownership,
    #   but the UID:username mapping is broken because it was created
    #   by a different host system, e.g. uid 911's $HOME outside of docker
    #   might be /usr/lib/lxd instead of /home/archivebox
    # - running as a user that doesn't have a home directory
    # - home directory is set to a path that doesn't exist, or is inside a dir we can't read
    return Path('~').expanduser().name
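The comments above list cases where $HOME does not reflect the effective user. A more defensive variant (a sketch, not the project's implementation) could fall back to the passwd database for the current effective uid:

import os
import pwd          # POSIX-only
from pathlib import Path

def get_system_user() -> str:
    try:
        # prefer the passwd entry for the *effective* uid, which survives sudo and odd $HOME values
        return pwd.getpwuid(os.geteuid()).pw_name
    except KeyError:
        # uid has no passwd entry (e.g. a container uid created on another host): fall back to $HOME's basename
        return Path('~').expanduser().name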
@@ -1,30 +0,0 @@ (file deleted in this commit)
import inspect

import pluggy


class PluginManager(pluggy.PluginManager):
    """
    Patch to fix pluggy's PluginManager to work with pydantic models.
    See: https://github.com/pytest-dev/pluggy/pull/536
    """
    def parse_hookimpl_opts(self, plugin, name: str) -> pluggy.HookimplOpts | None:
        # IMPORTANT: @property methods can have side effects, and are never hookimpls
        # if attr is a property, skip it in advance
        plugin_class = plugin if inspect.isclass(plugin) else type(plugin)
        if isinstance(getattr(plugin_class, name, None), property):
            return None

        # pydantic model fields are like attrs and also can never be hookimpls
        plugin_is_pydantic_obj = hasattr(plugin, "__pydantic_core_schema__")
        if plugin_is_pydantic_obj and name in getattr(plugin, "model_fields", {}):
            # pydantic models mess with the class and attr __signature__
            # so inspect.isroutine(...) throws exceptions and can't be used
            return None

        try:
            return super().parse_hookimpl_opts(plugin, name)
        except AttributeError:
            return super().parse_hookimpl_opts(type(plugin), name)

pm = PluginManager("abx")
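The override above matters because ABX registers pydantic model instances directly as plugins; stock pluggy would trip over model fields and @property descriptors while scanning for hookimpls. A small sketch of the scenario this patch enables (the Example class is hypothetical):

from pydantic import BaseModel
import abx

class ExamplePlugin(BaseModel):
    # a pydantic field: must NOT be treated as a hookimpl when the instance is registered
    some_setting: str = 'value'

    @property
    def computed(self) -> str:
        # a property with potential side effects: also skipped by parse_hookimpl_opts
        return self.some_setting.upper()

    @abx.hookimpl
    def get_CONFIG(self):
        # only this explicitly marked method is picked up as a hook implementation
        return {}

abx.pm.register(ExamplePlugin())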
@@ -1 +0,0 @@ (file deleted in this commit)
__package__ = 'abx.pydantic_pkgr'

@@ -1,13 +0,0 @@ (file deleted in this commit)

from ..hookspec import hookspec

###########################################################################################

@hookspec
def get_BINPROVIDERS():
    return {}

@hookspec
def get_BINARIES():
    return {}
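These two specs are how plugins advertise the binaries they depend on and the providers that can install them; the results from every plugin are merged into a single registry by the read helpers shown earlier. A minimal hedged sketch of a plugin implementing them (returned objects are placeholders):

import abx

@abx.hookimpl
def get_BINPROVIDERS():
    # a real plugin would return e.g. {'pip': <BaseBinProvider instance>}
    return {}

@abx.hookimpl
def get_BINARIES():
    # a real plugin would return e.g. {'wget': <BaseBinary / pydantic_pkgr Binary instance>}
    return {}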
313
archivebox/actors/actor.py
Normal file
313
archivebox/actors/actor.py
Normal file
|
@ -0,0 +1,313 @@
|
|||
__package__ = 'archivebox.actors'
|
||||
|
||||
import os
|
||||
import time
|
||||
from abc import ABC, abstractmethod
|
||||
from typing import ClassVar, Generic, TypeVar, Any, cast, Literal, Type
|
||||
from django.utils.functional import classproperty
|
||||
|
||||
from rich import print
|
||||
import psutil
|
||||
|
||||
from django import db
|
||||
from django.db import models
|
||||
from django.db.models import QuerySet
|
||||
from multiprocessing import Process, cpu_count
|
||||
from threading import Thread, get_native_id
|
||||
|
||||
# from archivebox.logging_util import TimedProgress
|
||||
|
||||
LaunchKwargs = dict[str, Any]
|
||||
|
||||
ModelType = TypeVar('ModelType', bound=models.Model)
|
||||
|
||||
class ActorType(ABC, Generic[ModelType]):
|
||||
"""
|
||||
Base class for all actors. Usage:
|
||||
class FaviconActor(ActorType[ArchiveResult]):
|
||||
QUERYSET: ClassVar[QuerySet] = ArchiveResult.objects.filter(status='queued', extractor='favicon')
|
||||
CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
|
||||
CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
|
||||
ATOMIC: ClassVar[bool] = True
|
||||
|
||||
def claim_sql_set(self, obj: ArchiveResult) -> str:
|
||||
# SQL fields to update atomically while claiming an object from the queue
|
||||
retry_at = datetime.now() + timedelta(seconds=self.MAX_TICK_TIME)
|
||||
return f"status = 'started', locked_by = {self.pid}, retry_at = {retry_at}"
|
||||
|
||||
def tick(self, obj: ArchiveResult) -> None:
|
||||
run_favicon_extractor(obj)
|
||||
ArchiveResult.objects.filter(pk=obj.pk, status='started').update(status='success')
|
||||
"""
|
||||
pid: int
|
||||
idle_count: int = 0
|
||||
launch_kwargs: LaunchKwargs = {}
|
||||
mode: Literal['thread', 'process'] = 'process'
|
||||
|
||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8) # min 2, max 8, up to 60% of available cpu cores
|
||||
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
|
||||
|
||||
QUERYSET: ClassVar[QuerySet] # the QuerySet to claim objects from
|
||||
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
|
||||
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
|
||||
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
|
||||
CLAIM_FROM_TOP: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10 # the number of objects to consider when atomically getting the next object from the queue
|
||||
ATOMIC: ClassVar[bool] = True # whether to atomically fetch+claim the nextobject in one step, or fetch and lock it in two steps
|
||||
|
||||
# model_type: Type[ModelType]
|
||||
|
||||
_SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = [] # record all the pids of Actors spawned by this class
|
||||
|
||||
def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
|
||||
self.mode = mode or self.mode
|
||||
self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)
|
||||
|
||||
@classproperty
|
||||
def name(cls) -> str:
|
||||
return cls.__name__ # type: ignore
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
def __repr__(self) -> str:
|
||||
"""FaviconActor[pid=1234]"""
|
||||
label = 'pid' if self.mode == 'process' else 'tid'
|
||||
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
||||
|
||||
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
|
||||
|
||||
@classmethod
|
||||
def get_running_actors(cls) -> list[int]:
|
||||
"""returns a list of pids of all running actors of this type"""
|
||||
# WARNING: only works for process actors, not thread actors
|
||||
if cls.mode == 'thread':
|
||||
raise NotImplementedError('get_running_actors() is not implemented for thread actors')
|
||||
return [
|
||||
proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
|
||||
if proc.is_running() and proc.status() != 'zombie'
|
||||
]
|
||||
|
||||
@classmethod
|
||||
def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
|
||||
"""Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
|
||||
queue_length = queue.count()
|
||||
if not queue_length: # queue is empty, spawn 0 actors
|
||||
return []
|
||||
|
||||
actors_to_spawn: list[LaunchKwargs] = []
|
||||
max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
|
||||
|
||||
# spawning new actors is expensive, avoid spawning all the actors at once. To stagger them,
|
||||
# let the next orchestrator tick handle starting another 2 on the next tick()
|
||||
# if queue_length > 10: # queue is long, spawn as many as possible
|
||||
# actors_to_spawn += max_spawnable * [{}]
|
||||
|
||||
if queue_length > 4: # queue is medium, spawn 1 or 2 actors
|
||||
actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}]
|
||||
else: # queue is short, spawn 1 actor
|
||||
actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}]
|
||||
return actors_to_spawn
|
||||
|
||||
@classmethod
|
||||
def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
|
||||
if mode == 'thread':
|
||||
return cls.fork_actor_as_thread(**launch_kwargs)
|
||||
elif mode == 'process':
|
||||
return cls.fork_actor_as_process(**launch_kwargs)
|
||||
raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')
|
||||
|
||||
@classmethod
|
||||
def fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
|
||||
"""Spawn a new background thread running the actor's runloop"""
|
||||
actor = cls(mode='thread', **launch_kwargs)
|
||||
bg_actor_thread = Thread(target=actor.runloop)
|
||||
bg_actor_thread.start()
|
||||
assert bg_actor_thread.native_id is not None
|
||||
return bg_actor_thread.native_id
|
||||
|
||||
@classmethod
|
||||
def fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
|
||||
"""Spawn a new background process running the actor's runloop"""
|
||||
actor = cls(mode='process', **launch_kwargs)
|
||||
bg_actor_process = Process(target=actor.runloop)
|
||||
bg_actor_process.start()
|
||||
assert bg_actor_process.pid is not None
|
||||
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
|
||||
return bg_actor_process.pid
|
||||
|
||||
@classmethod
|
||||
def get_model(cls) -> Type[ModelType]:
|
||||
# wish this was a @classproperty but Generic[ModelType] return type cant be statically inferred for @classproperty
|
||||
return cls.QUERYSET.model
|
||||
|
||||
@classmethod
|
||||
def get_queue(cls) -> QuerySet:
|
||||
"""override this to provide your queryset as the queue"""
|
||||
# return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot'))
|
||||
return cls.QUERYSET
|
||||
|
||||
### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process)
|
||||
|
||||
def runloop(self):
|
||||
"""The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
|
||||
self.on_startup()
|
||||
try:
|
||||
while True:
|
||||
obj_to_process: ModelType | None = None
|
||||
try:
|
||||
obj_to_process = cast(ModelType, self.get_next(atomic=self.atomic))
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if obj_to_process:
|
||||
self.idle_count = 0 # reset idle count if we got an object
|
||||
else:
|
||||
if self.idle_count >= 30:
|
||||
break # stop looping and exit if queue is empty and we have idled for 30sec
|
||||
else:
|
||||
# print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
|
||||
self.idle_count += 1
|
||||
time.sleep(1)
|
||||
continue
|
||||
|
||||
self.on_tick_start(obj_to_process)
|
||||
|
||||
# Process the object
|
||||
try:
|
||||
self.tick(obj_to_process)
|
||||
except Exception as err:
|
||||
print(f'[red]🏃♂️ ERROR: {self}.tick()[/red]', err)
|
||||
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
|
||||
self.on_tick_exception(obj_to_process, err)
|
||||
finally:
|
||||
self.on_tick_end(obj_to_process)
|
||||
|
||||
self.on_shutdown(err=None)
|
||||
except BaseException as err:
|
||||
if isinstance(err, KeyboardInterrupt):
|
||||
print()
|
||||
else:
|
||||
print(f'\n[red]🏃♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
|
||||
self.on_shutdown(err=err)
|
||||
|
||||
def get_next(self, atomic: bool | None=None) -> ModelType | None:
|
||||
"""get the next object from the queue, atomically locking it if self.atomic=True"""
|
||||
if atomic is None:
|
||||
atomic = self.ATOMIC
|
||||
|
||||
if atomic:
|
||||
# fetch and claim the next object from in the queue in one go atomically
|
||||
obj = self.get_next_atomic()
|
||||
else:
|
||||
# two-step claim: fetch the next object and lock it in a separate query
|
||||
obj = self.get_queue().last()
|
||||
assert obj and self.lock_next(obj), f'Unable to fetch+lock the next {self.get_model().__name__} ojbect from {self}.QUEUE'
|
||||
return obj
|
||||
|
||||
def lock_next(self, obj: ModelType) -> bool:
|
||||
"""override this to implement a custom two-step (non-atomic)lock mechanism"""
|
||||
# For example:
|
||||
# assert obj._model.objects.filter(pk=obj.pk, status='queued').update(status='started', locked_by=self.pid)
|
||||
# Not needed if using get_next_and_lock() to claim the object atomically
|
||||
# print(f'[blue]🏃♂️ {self}.lock()[/blue]', obj.abid or obj.id)
|
||||
return True
|
||||
|
||||
def claim_sql_where(self) -> str:
|
||||
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
|
||||
return self.CLAIM_WHERE
|
||||
|
||||
def claim_sql_set(self) -> str:
|
||||
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
|
||||
return self.CLAIM_SET
|
||||
|
||||
def claim_sql_order(self) -> str:
|
||||
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
|
||||
return self.CLAIM_ORDER
|
||||
|
||||
def claim_from_top(self) -> int:
|
||||
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
|
||||
return self.CLAIM_FROM_TOP
|
||||
|
||||
def get_next_atomic(self, shallow: bool=True) -> ModelType | None:
|
||||
"""
|
||||
claim a random object from the top n=50 objects in the queue (atomically updates status=queued->started for claimed object)
|
||||
optimized for minimizing contention on the queue with other actors selecting from the same list
|
||||
slightly faster than claim_any_obj() which selects randomly from the entire queue but needs to know the total count
|
||||
"""
|
||||
Model = self.get_model() # e.g. ArchiveResult
|
||||
table = f'{Model._meta.app_label}_{Model._meta.model_name}' # e.g. core_archiveresult
|
||||
|
||||
where_sql = self.claim_sql_where()
|
||||
set_sql = self.claim_sql_set()
|
||||
order_by_sql = self.claim_sql_order()
|
||||
choose_from_top = self.claim_from_top()
|
||||
|
||||
with db.connection.cursor() as cursor:
|
||||
# subquery gets the pool of the top 50 candidates sorted by sort and order
|
||||
# main query selects a random one from that pool
|
||||
cursor.execute(f"""
|
||||
UPDATE {table}
|
||||
SET {set_sql}
|
||||
WHERE {where_sql} and id = (
|
||||
SELECT id FROM (
|
||||
SELECT id FROM {table}
|
||||
WHERE {where_sql}
|
||||
ORDER BY {order_by_sql}
|
||||
LIMIT {choose_from_top}
|
||||
) candidates
|
||||
ORDER BY RANDOM()
|
||||
LIMIT 1
|
||||
)
|
||||
RETURNING id;
|
||||
""")
|
||||
result = cursor.fetchone()
|
||||
|
||||
if result is None:
|
||||
return None # If no rows were claimed, return None
|
||||
|
||||
if shallow:
|
||||
# shallow: faster, returns potentially incomplete object instance missing some django auto-populated fields:
|
||||
columns = [col[0] for col in cursor.description or ['id']]
|
||||
return Model(**dict(zip(columns, result)))
|
||||
|
||||
# if not shallow do one extra query to get a more complete object instance (load it fully from scratch)
|
||||
return Model.objects.get(id=result[0])
|
||||
|
||||
@abstractmethod
|
||||
def tick(self, obj: ModelType) -> None:
|
||||
"""override this to process the object"""
|
||||
print(f'[blue]🏃♂️ {self}.tick()[/blue]', obj.abid or obj.id)
|
||||
# For example:
|
||||
# do_some_task(obj)
|
||||
# do_something_else(obj)
|
||||
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
|
||||
raise NotImplementedError('tick() must be implemented by the Actor subclass')
|
||||
|
||||
def on_startup(self) -> None:
|
||||
if self.mode == 'thread':
|
||||
self.pid = get_native_id() # thread id
|
||||
print(f'[green]🏃♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
|
||||
else:
|
||||
self.pid = os.getpid() # process id
|
||||
print(f'[green]🏃♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
|
||||
# abx.pm.hook.on_actor_startup(self)
|
||||
|
||||
def on_shutdown(self, err: BaseException | None=None) -> None:
|
||||
print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
|
||||
# abx.pm.hook.on_actor_shutdown(self)
|
||||
|
||||
def on_tick_start(self, obj: ModelType) -> None:
|
||||
# print(f'🏃♂️ {self}.on_tick_start()', obj.abid or obj.id)
|
||||
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
|
||||
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
|
||||
pass
|
||||
|
||||
def on_tick_end(self, obj: ModelType) -> None:
|
||||
# print(f'🏃♂️ {self}.on_tick_end()', obj.abid or obj.id)
|
||||
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
|
||||
# self.timer.end()
|
||||
pass
|
||||
|
||||
def on_tick_exception(self, obj: ModelType, err: BaseException) -> None:
|
||||
print(f'[red]🏃♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
|
||||
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
|
archivebox/actors/admin.py (new file): 3 lines
@@ -0,0 +1,3 @@
from django.contrib import admin

# Register your models here.
6
archivebox/actors/apps.py
Normal file
|
@ -0,0 +1,6 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class ActorsConfig(AppConfig):
|
||||
default_auto_field = "django.db.models.BigAutoField"
|
||||
name = "actors"
|
3
archivebox/actors/models.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from django.db import models
|
||||
|
||||
# Create your models here.
|
244
archivebox/actors/orchestrator.py
Normal file
|
@ -0,0 +1,244 @@
|
|||
__package__ = 'archivebox.actors'
|
||||
|
||||
import os
|
||||
import time
|
||||
import itertools
|
||||
from typing import Dict, Type, Literal, ClassVar
|
||||
from django.utils.functional import classproperty
|
||||
|
||||
from multiprocessing import Process, cpu_count
|
||||
from threading import Thread, get_native_id
|
||||
|
||||
|
||||
from rich import print
|
||||
|
||||
from django.db.models import QuerySet
|
||||
|
||||
from django.apps import apps
|
||||
from .actor import ActorType
|
||||
|
||||
class Orchestrator:
|
||||
pid: int
|
||||
idle_count: int = 0
|
||||
actor_types: Dict[str, Type[ActorType]]
|
||||
mode: Literal['thread', 'process'] = 'process'
|
||||
|
||||
def __init__(self, actor_types: Dict[str, Type[ActorType]] | None = None, mode: Literal['thread', 'process'] | None=None):
|
||||
self.actor_types = actor_types or self.actor_types or self.autodiscover_actor_types()
|
||||
self.mode = mode or self.mode
|
||||
|
||||
def __repr__(self) -> str:
|
||||
label = 'tid' if self.mode == 'thread' else 'pid'
|
||||
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return self.__repr__()
|
||||
|
||||
@classproperty
|
||||
def name(cls) -> str:
|
||||
return cls.__name__ # type: ignore
|
||||
|
||||
def fork_as_thread(self):
|
||||
self.thread = Thread(target=self.runloop)
|
||||
self.thread.start()
|
||||
assert self.thread.native_id is not None
|
||||
return self.thread.native_id
|
||||
|
||||
def fork_as_process(self):
|
||||
self.process = Process(target=self.runloop)
|
||||
self.process.start()
|
||||
assert self.process.pid is not None
|
||||
return self.process.pid
|
||||
|
||||
def start(self) -> int:
|
||||
if self.mode == 'thread':
|
||||
return self.fork_as_thread()
|
||||
elif self.mode == 'process':
|
||||
return self.fork_as_process()
|
||||
raise ValueError(f'Invalid orchestrator mode: {self.mode}')
|
||||
|
||||
@classmethod
|
||||
def autodiscover_actor_types(cls) -> Dict[str, Type[ActorType]]:
|
||||
# returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
|
||||
# override this method in a subclass to customize the actor types that are used
|
||||
# return {'Snapshot': SnapshotActorType, 'ArchiveResult_chrome': ChromeActorType, ...}
|
||||
return {
|
||||
# look through all models and find all classes that inherit from ActorType
|
||||
# actor_type.__name__: actor_type
|
||||
# for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def get_orphaned_objects(cls, all_queues) -> list:
|
||||
# returns a list of objects that are past their retry_at but not present in any actor type's queue (i.e. left unclaimed)
|
||||
all_queued_ids = itertools.chain(*[queue.values_list('id', flat=True) for queue in all_queues.values()])
|
||||
orphaned_objects = []
|
||||
for model in apps.get_models():
|
||||
if hasattr(model, 'retry_at'):
|
||||
orphaned_objects.extend(model.objects.filter(retry_at__lt=timezone.now()).exclude(id__in=all_queued_ids))
|
||||
return orphaned_objects
|
||||
|
||||
def on_startup(self):
|
||||
if self.mode == 'thread':
|
||||
self.pid = get_native_id()
|
||||
print(f'[green]👨✈️ {self}.on_startup() STARTUP (THREAD)[/green]')
|
||||
elif self.mode == 'process':
|
||||
self.pid = os.getpid()
|
||||
print(f'[green]👨✈️ {self}.on_startup() STARTUP (PROCESS)[/green]')
|
||||
# abx.pm.hook.on_orchestrator_startup(self)
|
||||
|
||||
def on_shutdown(self, err: BaseException | None = None):
|
||||
print(f'[grey53]👨✈️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
|
||||
# abx.pm.hook.on_orchestrator_shutdown(self)
|
||||
|
||||
def on_tick_started(self, all_queues):
|
||||
# total_pending = sum(queue.count() for queue in all_queues.values())
|
||||
# print(f'👨✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
|
||||
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
|
||||
pass
|
||||
|
||||
def on_tick_finished(self, all_queues, all_existing_actors, all_spawned_actors):
|
||||
if all_spawned_actors:
|
||||
total_queue_length = sum(queue.count() for queue in all_queues.values())
|
||||
print(f'[grey53]👨✈️ {self}.on_tick_finished() queue={total_queue_length} existing_actors={len(all_existing_actors)} spawned_actors={len(all_spawned_actors)}[/grey53]')
|
||||
# abx.pm.hook.on_orchestrator_tick_finished(self, actor_types, all_queues)
|
||||
|
||||
def on_idle(self, all_queues):
|
||||
# print(f'👨✈️ {self}.on_idle()')
|
||||
# abx.pm.hook.on_orchestrator_idle(self)
|
||||
# check for orphaned objects left behind
|
||||
if self.idle_count == 60:
|
||||
orphaned_objects = self.get_orphaned_objects(all_queues)
|
||||
if orphaned_objects:
|
||||
print('[red]👨✈️ WARNING: some objects may not be processed, no actor has claimed them after 60s:[/red]', orphaned_objects)
|
||||
|
||||
def runloop(self):
|
||||
self.on_startup()
|
||||
try:
|
||||
while True:
|
||||
all_queues = {
|
||||
actor_type: actor_type.get_queue()
|
||||
for actor_type in self.actor_types.values()
|
||||
}
|
||||
if not all_queues:
|
||||
raise Exception('Failed to find any actor_types to process')
|
||||
|
||||
self.on_tick_started(all_queues)
|
||||
|
||||
all_existing_actors = []
|
||||
all_spawned_actors = []
|
||||
|
||||
for actor_type, queue in all_queues.items():
|
||||
try:
|
||||
existing_actors = actor_type.get_running_actors()
|
||||
all_existing_actors.extend(existing_actors)
|
||||
actors_to_spawn = actor_type.get_actors_to_spawn(queue, existing_actors)
|
||||
for launch_kwargs in actors_to_spawn:
|
||||
new_actor_pid = actor_type.start(mode='process', **launch_kwargs)
|
||||
all_spawned_actors.append(new_actor_pid)
|
||||
except Exception as err:
|
||||
print(f'🏃♂️ ERROR: {self} Failed to get {actor_type} queue & running actors', err)
|
||||
except BaseException:
|
||||
raise
|
||||
|
||||
if not any(queue.exists() for queue in all_queues.values()):
|
||||
self.on_idle(all_queues)
|
||||
self.idle_count += 1
|
||||
time.sleep(1)
|
||||
else:
|
||||
self.idle_count = 0
|
||||
|
||||
self.on_tick_finished(all_queues, all_existing_actors, all_spawned_actors)
|
||||
time.sleep(1)
|
||||
|
||||
except BaseException as err:
|
||||
if isinstance(err, KeyboardInterrupt):
|
||||
print()
|
||||
else:
|
||||
print(f'\n[red]🏃♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
|
||||
self.on_shutdown(err=err)
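# Minimal usage sketch (assumes an actor class like the FaviconActor defined further below):
# an Orchestrator can also be run in-process, e.g. in thread mode for tests:
#
#   orchestrator = Orchestrator(actor_types={'FaviconActor': FaviconActor}, mode='thread')
#   tid = orchestrator.start()   # returns the thread's native_id; the runloop keeps spawning actors as queues fill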
|
||||
|
||||
|
||||
|
||||
from archivebox.config.django import setup_django
|
||||
|
||||
setup_django()
|
||||
|
||||
from core.models import ArchiveResult, Snapshot
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from django import db
|
||||
from django.db import connection
|
||||
|
||||
|
||||
from crawls.actors import CrawlActor
|
||||
from .actor_snapshot import SnapshotActor
|
||||
|
||||
from abx_plugin_singlefile.actors import SinglefileActor
|
||||
|
||||
|
||||
class FaviconActor(ActorType[ArchiveResult]):
|
||||
CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
|
||||
CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
|
||||
CLAIM_SET: ClassVar[str] = 'status = "started"'
|
||||
|
||||
@classproperty
|
||||
def QUERYSET(cls) -> QuerySet:
|
||||
return ArchiveResult.objects.filter(status='failed', extractor='favicon')
|
||||
|
||||
def tick(self, obj: ArchiveResult):
|
||||
print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
|
||||
updated = ArchiveResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
|
||||
if not updated:
|
||||
raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
|
||||
obj.refresh_from_db()
|
||||
obj.save()
|
||||
|
||||
|
||||
class ExtractorsOrchestrator(Orchestrator):
|
||||
actor_types = {
|
||||
'CrawlActor': CrawlActor,
|
||||
'SnapshotActor': SnapshotActor,
|
||||
'FaviconActor': FaviconActor,
|
||||
'SinglefileActor': SinglefileActor,
|
||||
}
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
orchestrator = ExtractorsOrchestrator()
|
||||
orchestrator.start()
|
||||
|
||||
snap = Snapshot.objects.last()
|
||||
assert snap is not None
|
||||
created = 0
|
||||
while True:
|
||||
time.sleep(0.05)
|
||||
# try:
|
||||
# ArchiveResult.objects.bulk_create([
|
||||
# ArchiveResult(
|
||||
# id=uuid.uuid4(),
|
||||
# snapshot=snap,
|
||||
# status='failed',
|
||||
# extractor='favicon',
|
||||
# cmd=['echo', '"hello"'],
|
||||
# cmd_version='1.0',
|
||||
# pwd='.',
|
||||
# start_ts=timezone.now(),
|
||||
# end_ts=timezone.now(),
|
||||
# created_at=timezone.now(),
|
||||
# modified_at=timezone.now(),
|
||||
# created_by_id=1,
|
||||
# )
|
||||
# for _ in range(100)
|
||||
# ])
|
||||
# created += 100
|
||||
# if created % 1000 == 0:
|
||||
# print(f'[blue]Created {created} ArchiveResults...[/blue]')
|
||||
# time.sleep(25)
|
||||
# except Exception as err:
|
||||
# print(err)
|
||||
# db.connections.close_all()
|
||||
# except BaseException as err:
|
||||
# print(err)
|
||||
# break
|
286
archivebox/actors/statemachine.py
Normal file
|
@ -0,0 +1,286 @@
|
|||
from statemachine import State, StateMachine
|
||||
from django.db import models
|
||||
from multiprocessing import Process
|
||||
import psutil
|
||||
import time
|
||||
|
||||
# State Machine Definitions
|
||||
#################################################
|
||||
|
||||
class SnapshotMachine(StateMachine):
|
||||
"""State machine for managing Snapshot lifecycle."""
|
||||
|
||||
# States
|
||||
queued = State(initial=True)
|
||||
started = State()
|
||||
sealed = State(final=True)
|
||||
|
||||
# Transitions
|
||||
start = queued.to(started, cond='can_start')
|
||||
seal = started.to(sealed, cond='is_finished')
|
||||
|
||||
# Events
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
def __init__(self, snapshot):
|
||||
self.snapshot = snapshot
|
||||
super().__init__()
|
||||
|
||||
def can_start(self):
|
||||
return True
|
||||
|
||||
def is_finished(self):
|
||||
return not self.snapshot.has_pending_archiveresults()
|
||||
|
||||
def before_start(self):
|
||||
"""Pre-start validation and setup."""
|
||||
self.snapshot.cleanup_dir()
|
||||
|
||||
def after_start(self):
|
||||
"""Post-start side effects."""
|
||||
self.snapshot.create_pending_archiveresults()
|
||||
self.snapshot.update_indices()
|
||||
self.snapshot.bump_retry_at(seconds=10)
|
||||
|
||||
def before_seal(self):
|
||||
"""Pre-seal validation and cleanup."""
|
||||
self.snapshot.cleanup_dir()
|
||||
|
||||
def after_seal(self):
|
||||
"""Post-seal actions."""
|
||||
self.snapshot.update_indices()
|
||||
self.snapshot.seal_dir()
|
||||
self.snapshot.upload_dir()
|
||||
self.snapshot.retry_at = None
|
||||
self.snapshot.save()
|
||||
|
||||
|
||||
class ArchiveResultMachine(StateMachine):
|
||||
"""State machine for managing ArchiveResult lifecycle."""
|
||||
|
||||
# States
|
||||
queued = State(initial=True)
|
||||
started = State()
|
||||
succeeded = State(final=True)
|
||||
backoff = State()
|
||||
failed = State(final=True)
|
||||
|
||||
# Transitions
|
||||
start = queued.to(started, cond='can_start')
|
||||
succeed = started.to(succeeded, cond='extractor_succeeded')
|
||||
backoff = started.to(backoff, unless='extractor_succeeded')
|
||||
retry = backoff.to(queued, cond='can_retry')
|
||||
fail = backoff.to(failed, unless='can_retry')
|
||||
|
||||
# Events
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(cond='extractor_still_running') |
|
||||
started.to(succeeded, cond='extractor_succeeded') |
|
||||
started.to(backoff, unless='extractor_succeeded') |
|
||||
backoff.to.itself(cond='still_waiting_to_retry') |
|
||||
backoff.to(queued, cond='can_retry') |
|
||||
backoff.to(failed, unless='can_retry')
|
||||
)
|
||||
|
||||
def __init__(self, archiveresult):
|
||||
self.archiveresult = archiveresult
|
||||
super().__init__()
|
||||
|
||||
def can_start(self):
|
||||
return True
|
||||
|
||||
def extractor_still_running(self):
|
||||
return self.archiveresult.start_ts > time.now() - timedelta(seconds=5)
|
||||
|
||||
def extractor_succeeded(self):
|
||||
# return check_if_extractor_succeeded(self.archiveresult)
|
||||
return self.archiveresult.start_ts < time.now() - timedelta(seconds=5)
|
||||
|
||||
def can_retry(self):
|
||||
return self.archiveresult.retries < self.archiveresult.max_retries
|
||||
|
||||
def before_start(self):
|
||||
"""Pre-start initialization."""
|
||||
self.archiveresult.retries += 1
|
||||
self.archiveresult.start_ts = time.now()
|
||||
self.archiveresult.output = None
|
||||
self.archiveresult.error = None
|
||||
|
||||
def after_start(self):
|
||||
"""Post-start execution."""
|
||||
self.archiveresult.bump_retry_at(seconds=self.archiveresult.timeout + 5)
|
||||
execute_extractor(self.archiveresult)
|
||||
self.archiveresult.snapshot.bump_retry_at(seconds=5)
|
||||
|
||||
def before_succeed(self):
|
||||
"""Pre-success validation."""
|
||||
self.archiveresult.output = get_archiveresult_output(self.archiveresult)
|
||||
|
||||
def after_succeed(self):
|
||||
"""Post-success cleanup."""
|
||||
self.archiveresult.end_ts = time.now()
|
||||
self.archiveresult.retry_at = None
|
||||
self.archiveresult.update_indices()
|
||||
|
||||
def before_backoff(self):
|
||||
"""Pre-backoff error capture."""
|
||||
self.archiveresult.error = get_archiveresult_error(self.archiveresult)
|
||||
|
||||
def after_backoff(self):
|
||||
"""Post-backoff retry scheduling."""
|
||||
self.archiveresult.end_ts = time.now()
|
||||
self.archiveresult.bump_retry_at(
|
||||
seconds=self.archiveresult.timeout * self.archiveresult.retries
|
||||
)
|
||||
self.archiveresult.update_indices()
|
||||
|
||||
def before_fail(self):
|
||||
"""Pre-failure finalization."""
|
||||
self.archiveresult.retry_at = None
|
||||
|
||||
def after_fail(self):
|
||||
"""Post-failure cleanup."""
|
||||
self.archiveresult.update_indices()
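# Usage sketch (assuming the python-statemachine API, where each event is exposed as a method
# on the machine instance): a caller can drive an ArchiveResult through its lifecycle with:
#
#   machine = ArchiveResultMachine(archiveresult)
#   machine.tick()                    # fires whichever transition's cond/unless guards currently pass
#   print(machine.current_state.id)   # 'queued', 'started', 'backoff', 'succeeded', or 'failed'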
|
||||
|
||||
# Models
|
||||
#################################################
|
||||
|
||||
class Snapshot(models.Model):
|
||||
status = models.CharField(max_length=32, default='queued')
|
||||
retry_at = models.DateTimeField(null=True)
|
||||
|
||||
@property
|
||||
def sm(self):
|
||||
"""Get the state machine for this snapshot."""
|
||||
return SnapshotMachine(self)
|
||||
|
||||
def has_pending_archiveresults(self):
|
||||
return self.archiveresult_set.exclude(
|
||||
status__in=['succeeded', 'failed']
|
||||
).exists()
|
||||
|
||||
def bump_retry_at(self, seconds):
|
||||
self.retry_at = time.now() + timedelta(seconds=seconds)
|
||||
self.save()
|
||||
|
||||
def cleanup_dir(self):
|
||||
cleanup_snapshot_dir(self)
|
||||
|
||||
def create_pending_archiveresults(self):
|
||||
create_snapshot_pending_archiveresults(self)
|
||||
|
||||
def update_indices(self):
|
||||
update_snapshot_index_json(self)
|
||||
update_snapshot_index_html(self)
|
||||
|
||||
def seal_dir(self):
|
||||
seal_snapshot_dir(self)
|
||||
|
||||
def upload_dir(self):
|
||||
upload_snapshot_dir(self)
|
||||
|
||||
|
||||
class ArchiveResult(models.Model):
|
||||
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
|
||||
status = models.CharField(max_length=32, default='queued')
|
||||
retry_at = models.DateTimeField(null=True)
|
||||
retries = models.IntegerField(default=0)
|
||||
max_retries = models.IntegerField(default=3)
|
||||
timeout = models.IntegerField(default=60)
|
||||
start_ts = models.DateTimeField(null=True)
|
||||
end_ts = models.DateTimeField(null=True)
|
||||
output = models.TextField(null=True)
|
||||
error = models.TextField(null=True)
|
||||
|
||||
def get_machine(self):
|
||||
return ArchiveResultMachine(self)
|
||||
|
||||
def bump_retry_at(self, seconds):
|
||||
self.retry_at = time.now() + timedelta(seconds=seconds)
|
||||
self.save()
|
||||
|
||||
def update_indices(self):
|
||||
update_archiveresult_index_json(self)
|
||||
update_archiveresult_index_html(self)
|
||||
|
||||
|
||||
# Actor System
|
||||
#################################################
|
||||
|
||||
class BaseActor:
|
||||
MAX_TICK_TIME = 60
|
||||
|
||||
def tick(self, obj):
|
||||
"""Process a single object through its state machine."""
|
||||
machine = obj.get_machine()
|
||||
|
||||
if machine.is_queued:
|
||||
if machine.can_start():
|
||||
machine.start()
|
||||
|
||||
elif machine.is_started:
|
||||
if machine.can_seal():
|
||||
machine.seal()
|
||||
|
||||
elif machine.is_backoff:
|
||||
if machine.can_retry():
|
||||
machine.retry()
|
||||
else:
|
||||
machine.fail()
|
||||
|
||||
|
||||
class Orchestrator:
|
||||
"""Main orchestrator that manages all actors."""
|
||||
|
||||
def __init__(self):
|
||||
self.pid = None
|
||||
|
||||
@classmethod
|
||||
def spawn(cls):
|
||||
orchestrator = cls()
|
||||
proc = Process(target=orchestrator.runloop)
|
||||
proc.start()
|
||||
return proc.pid
|
||||
|
||||
def runloop(self):
|
||||
self.pid = os.getpid()
|
||||
abx.pm.hook.on_orchestrator_startup(self)
|
||||
|
||||
try:
|
||||
while True:
|
||||
self.process_queue(Snapshot)
|
||||
self.process_queue(ArchiveResult)
|
||||
time.sleep(0.1)
|
||||
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
abx.pm.hook.on_orchestrator_shutdown(self)
|
||||
|
||||
def process_queue(self, model):
|
||||
retry_at_reached = Q(retry_at__isnull=True) | Q(retry_at__lte=time.now())
|
||||
queue = model.objects.filter(retry_at_reached)
|
||||
|
||||
if queue.exists():
|
||||
actor = BaseActor()
|
||||
for obj in queue:
|
||||
try:
|
||||
with transaction.atomic():
|
||||
actor.tick(obj)
|
||||
except Exception as e:
|
||||
abx.pm.hook.on_actor_tick_exception(actor, obj, e)
|
||||
|
||||
|
||||
# Periodic Tasks
|
||||
#################################################
|
||||
|
||||
@djhuey.periodic_task(schedule=djhuey.crontab(minute='*'))
|
||||
def ensure_orchestrator_running():
|
||||
"""Ensure orchestrator is running, start if not."""
|
||||
if not any(p.name().startswith('Orchestrator') for p in psutil.process_iter()):
|
||||
Orchestrator.spawn()
|
3
archivebox/actors/tests.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
3
archivebox/actors/views.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
from django.shortcuts import render
|
||||
|
||||
# Create your views here.
|
|
@ -1,4 +1,5 @@
|
|||
__package__ = 'archivebox.config'
|
||||
__package__ = 'config'
|
||||
__order__ = 200
|
||||
|
||||
from .paths import (
|
||||
PACKAGE_DIR, # noqa
|
||||
|
@ -8,35 +9,28 @@ from .paths import (
|
|||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
|
||||
|
||||
import abx
|
||||
|
||||
# import abx
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_INSTALLED_APPS():
|
||||
# return ['config']
|
||||
# def get_CONFIG():
|
||||
# from .common import (
|
||||
# SHELL_CONFIG,
|
||||
# STORAGE_CONFIG,
|
||||
# GENERAL_CONFIG,
|
||||
# SERVER_CONFIG,
|
||||
# ARCHIVING_CONFIG,
|
||||
# SEARCH_BACKEND_CONFIG,
|
||||
# )
|
||||
# return {
|
||||
# 'SHELL_CONFIG': SHELL_CONFIG,
|
||||
# 'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
# 'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
# 'SERVER_CONFIG': SERVER_CONFIG,
|
||||
# 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
# 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
# }
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .common import (
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def ready():
|
||||
for config in get_CONFIG().values():
|
||||
config.validate()
|
||||
# @abx.hookimpl
|
||||
# def ready():
|
||||
# for config in get_CONFIG().values():
|
||||
# config.validate()
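# Hypothetical example: a pip-installed abx plugin can contribute its own config section
# through the same get_CONFIG() hook (class and field names below are illustrative only):
#
#   from pydantic import Field
#   from abx_spec_config.base_configset import BaseConfigSet
#
#   class WgetConfig(BaseConfigSet):
#       WGET_BINARY: str = Field(default='wget')
#
#   @abx.hookimpl
#   def get_CONFIG():
#       return {'WGET_CONFIG': WgetConfig()}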
|
||||
|
|
|
@ -9,16 +9,18 @@ from configparser import ConfigParser
|
|||
|
||||
from benedict import benedict
|
||||
|
||||
import archivebox
|
||||
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
from archivebox.misc.logging import stderr
|
||||
|
||||
|
||||
def get_real_name(key: str) -> str:
|
||||
"""get the current canonical name for a given deprecated config key"""
|
||||
from django.conf import settings
|
||||
"""get the up-to-date canonical name for a given old alias or current key"""
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
for section in settings.CONFIGS.values():
|
||||
for section in CONFIGS.values():
|
||||
try:
|
||||
return section.aliases[key]
|
||||
except KeyError:
|
||||
|
@ -115,17 +117,15 @@ def load_config_file() -> Optional[benedict]:
|
|||
|
||||
|
||||
def section_for_key(key: str) -> Any:
|
||||
from django.conf import settings
|
||||
for config_section in settings.CONFIGS.values():
|
||||
for config_section in archivebox.pm.hook.get_CONFIGS().values():
|
||||
if hasattr(config_section, key):
|
||||
return config_section
|
||||
return None
|
||||
raise ValueError(f'No config section found for key: {key}')
|
||||
|
||||
|
||||
def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
|
||||
|
||||
import abx.archivebox.reads
|
||||
from archivebox.misc.system import atomic_write
|
||||
|
||||
CONFIG_HEADER = (
|
||||
|
@ -175,7 +175,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
|||
updated_config = {}
|
||||
try:
|
||||
# validate the updated_config by attempting to re-parse it
|
||||
updated_config = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
|
||||
updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
# something went horribly wrong, revert to the previous version
|
||||
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
|
||||
|
@ -233,11 +233,11 @@ def load_config(defaults: Dict[str, Any],
|
|||
return benedict(extended_config)
|
||||
|
||||
def load_all_config():
|
||||
import abx.archivebox.reads
|
||||
import abx
|
||||
|
||||
flat_config = benedict()
|
||||
|
||||
for config_section in abx.archivebox.reads.get_CONFIGS().values():
|
||||
for config_section in abx.pm.hook.get_CONFIGS().values():
|
||||
config_section.__init__()
|
||||
flat_config.update(config_section.model_dump())
|
||||
|
|
@ -10,7 +10,7 @@ from rich import print
|
|||
from pydantic import Field, field_validator
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx_spec_config.base_configset import BaseConfigSet
|
||||
|
||||
from .constants import CONSTANTS
|
||||
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
|
||||
|
@ -45,8 +45,6 @@ class ShellConfig(BaseConfigSet):
|
|||
def BUILD_TIME(self) -> str:
|
||||
return get_BUILD_TIME()
|
||||
|
||||
# def VERSIONS_AVAILABLE() -> bool # .check_for_update.get_versions_available_on_github(c)},
|
||||
# def CAN_UPGRADE() -> bool # .check_for_update.can_upgrade(c)},
|
||||
|
||||
SHELL_CONFIG = ShellConfig()
|
||||
|
||||
|
|
|
@ -1,3 +1,15 @@
|
|||
"""
|
||||
Constants are for things that never change at runtime.
|
||||
(but they can change from run-to-run or machine-to-machine)
|
||||
|
||||
DATA_DIR will never change at runtime, but you can run
|
||||
archivebox from inside a different DATA_DIR on the same machine.
|
||||
|
||||
This is loaded very early in the archivebox startup flow, so nothing in this file
|
||||
or imported from this file should import anything from archivebox.config.common,
|
||||
django, other INSTALLED_APPS, or anything else that is not in a standard library.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
|
||||
import re
|
||||
|
@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
|
|||
|
||||
@classmethod
|
||||
def __getitem__(cls, key: str):
|
||||
# so it behaves like a dict[key] == dict.key or object attr
|
||||
return getattr(cls, key)
|
||||
|
||||
@classmethod
|
||||
def __benedict__(cls):
|
||||
# when casting to benedict, only include uppercase keys that don't start with an underscore
|
||||
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
|
||||
|
||||
@classmethod
|
||||
|
@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
|
|||
CONSTANTS = ConstantsDict()
|
||||
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
|
||||
|
||||
# add all key: values to globals() for easier importing
|
||||
globals().update(CONSTANTS)
|
||||
# add all key: values to globals() for easier importing, e.g.:
|
||||
# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
|
||||
# globals().update(CONSTANTS)
|
||||
|
|
|
@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
|||
return
|
||||
|
||||
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
|
||||
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
|
||||
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
|
||||
|
||||
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
|
||||
|
||||
|
@ -97,7 +97,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
|
|||
except Exception as e:
|
||||
bump_startup_progress_bar(advance=1000)
|
||||
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version', 'init'))
|
||||
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
|
||||
if not is_using_meta_cmd:
|
||||
# show error message to user only if they're not running a meta command / just trying to get help
|
||||
STDERR.print()
|
||||
|
|
|
@ -45,7 +45,7 @@ def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
|
|||
@cache
|
||||
def get_COMMIT_HASH() -> Optional[str]:
|
||||
try:
|
||||
git_dir = PACKAGE_DIR / '../.git'
|
||||
git_dir = PACKAGE_DIR.parent / '.git'
|
||||
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
|
||||
commit_hash = git_dir.joinpath(ref).read_text().strip()
|
||||
return commit_hash
|
||||
|
@ -53,7 +53,7 @@ def get_COMMIT_HASH() -> Optional[str]:
|
|||
pass
|
||||
|
||||
try:
|
||||
return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
|
||||
return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
@ -62,8 +62,12 @@ def get_COMMIT_HASH() -> Optional[str]:
|
|||
@cache
|
||||
def get_BUILD_TIME() -> str:
|
||||
if IN_DOCKER:
|
||||
try:
|
||||
# if we're in the archivebox official docker image, /VERSION.txt will contain the build time
|
||||
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
|
||||
return docker_build_end_time
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
|
||||
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
|
||||
|
|
|
@ -14,8 +14,8 @@ from django.utils.html import format_html, mark_safe
|
|||
from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import abx.archivebox.reads
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
|
@ -65,7 +65,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
|||
|
||||
@render_with_table_view
|
||||
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
rows = {
|
||||
|
@ -81,12 +81,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
|
||||
relevant_configs = {
|
||||
key: val
|
||||
for key, val in settings.FLAT_CONFIG.items()
|
||||
for key, val in FLAT_CONFIG.items()
|
||||
if '_BINARY' in key or '_VERSION' in key
|
||||
}
|
||||
|
||||
for plugin_id, plugin in abx.archivebox.reads.get_PLUGINS().items():
|
||||
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
if not plugin.hooks.get('get_BINARIES'):
|
||||
continue
|
||||
|
||||
|
@ -131,17 +130,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
binary = None
|
||||
plugin = None
|
||||
for plugin_id in abx.archivebox.reads.get_PLUGINS().keys():
|
||||
loaded_plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
try:
|
||||
for loaded_binary in loaded_plugin.hooks.get_BINARIES().values():
|
||||
for loaded_binary in plugin['hooks'].get_BINARIES().values():
|
||||
if loaded_binary.name == key:
|
||||
binary = loaded_binary
|
||||
plugin = loaded_plugin
|
||||
plugin = plugin
|
||||
# break # last write wins
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
@ -161,7 +159,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
|||
"name": binary.name,
|
||||
"description": binary.abspath,
|
||||
"fields": {
|
||||
'plugin': plugin.package,
|
||||
'plugin': plugin['package'],
|
||||
'binprovider': binary.loaded_binprovider,
|
||||
'abspath': binary.loaded_abspath,
|
||||
'version': binary.loaded_version,
|
||||
|
@ -215,9 +213,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
return color
|
||||
return 'black'
|
||||
|
||||
for plugin_id in settings.PLUGINS.keys():
|
||||
|
||||
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
plugin.hooks.get_BINPROVIDERS = plugin.hooks.get('get_BINPROVIDERS', lambda: {})
|
||||
plugin.hooks.get_BINARIES = plugin.hooks.get('get_BINARIES', lambda: {})
|
||||
plugin.hooks.get_CONFIG = plugin.hooks.get('get_CONFIG', lambda: {})
|
||||
|
@ -263,7 +259,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
|||
|
||||
assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
|
||||
|
||||
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
|
||||
plugin = abx.get_plugin(plugin_id)
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
|
|
|
@ -1,2 +1,31 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
import abx
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from core.admin import register_admin
|
||||
register_admin(admin_site)
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from archivebox.config.common import (
|
||||
SHELL_CONFIG,
|
||||
STORAGE_CONFIG,
|
||||
GENERAL_CONFIG,
|
||||
SERVER_CONFIG,
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
return {
|
||||
'SHELL_CONFIG': SHELL_CONFIG,
|
||||
'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
'SERVER_CONFIG': SERVER_CONFIG,
|
||||
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
}
|
||||
|
||||
|
|
73
archivebox/core/actors.py
Normal file
|
@ -0,0 +1,73 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
from typing import ClassVar
|
||||
|
||||
from rich import print
|
||||
|
||||
from django.db.models import QuerySet
|
||||
from django.utils import timezone
|
||||
from datetime import timedelta
|
||||
from core.models import Snapshot
|
||||
|
||||
from actors.actor import ActorType
|
||||
|
||||
|
||||
class SnapshotActor(ActorType[Snapshot]):
|
||||
|
||||
QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
|
||||
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
|
||||
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
|
||||
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
|
||||
CLAIM_FROM_TOP: ClassVar[int] = 50 # the number of objects to consider when atomically getting the next object from the queue
|
||||
|
||||
# model_type: Type[ModelType]
|
||||
MAX_CONCURRENT_ACTORS: ClassVar[int] = 4 # min 2, max 8, up to 60% of available cpu cores
|
||||
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
|
||||
|
||||
def claim_sql_where(self) -> str:
|
||||
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
|
||||
return self.CLAIM_WHERE
|
||||
|
||||
def claim_sql_set(self) -> str:
|
||||
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
|
||||
retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
|
||||
# format as 2024-10-31 10:14:33.240903
|
||||
retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
|
||||
return f"{self.CLAIM_SET}, retry_at = '{retry_at_str}'"  # quote the datetime so the generated SQL is valid
|
||||
|
||||
def claim_sql_order(self) -> str:
|
||||
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
|
||||
return self.CLAIM_ORDER
|
||||
|
||||
def claim_from_top(self) -> int:
|
||||
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
|
||||
return self.CLAIM_FROM_TOP
|
||||
|
||||
def tick(self, obj: Snapshot) -> None:
|
||||
"""override this to process the object"""
|
||||
print(f'[blue]🏃♂️ {self}.tick()[/blue]', obj.abid or obj.id)
|
||||
# For example:
|
||||
# do_some_task(obj)
|
||||
# do_something_else(obj)
|
||||
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
|
||||
# raise NotImplementedError('tick() must be implemented by the Actor subclass')
|
||||
|
||||
def on_shutdown(self, err: BaseException | None=None) -> None:
|
||||
print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
|
||||
# abx.pm.hook.on_actor_shutdown(self)
|
||||
|
||||
def on_tick_start(self, obj: Snapshot) -> None:
|
||||
# print(f'🏃♂️ {self}.on_tick_start()', obj.abid or obj.id)
|
||||
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
|
||||
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
|
||||
pass
|
||||
|
||||
def on_tick_end(self, obj: Snapshot) -> None:
|
||||
# print(f'🏃♂️ {self}.on_tick_end()', obj.abid or obj.id)
|
||||
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
|
||||
# self.timer.end()
|
||||
pass
|
||||
|
||||
def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
|
||||
print(f'[red]🏃♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
|
||||
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
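# Hypothetical sketch of another actor built on the same knobs (class and helper names are
# illustrative only): narrow the claim to a single extractor and lower the concurrency cap.
#
#   class WgetActor(ActorType[ArchiveResult]):
#       QUERYSET: ClassVar[QuerySet] = ArchiveResult.objects.filter(status='queued', extractor='wget')
#       CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "wget"'
#       CLAIM_SET: ClassVar[str] = 'status = "started"'
#       CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
#       MAX_CONCURRENT_ACTORS: ClassVar[int] = 2
#
#       def tick(self, obj: ArchiveResult) -> None:
#           run_wget_extractor(obj)   # hypothetical helper, not defined in this codebase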
|
|
@ -8,7 +8,7 @@ from django.utils.html import format_html, mark_safe
|
|||
from django.core.exceptions import ValidationError
|
||||
from django.urls import reverse, resolve
|
||||
from django.utils import timezone
|
||||
from django.forms import forms
|
||||
from django_jsonform.forms.fields import JSONFormField
|
||||
|
||||
from huey_monitor.admin import TaskModel
|
||||
|
||||
|
@ -83,7 +83,7 @@ class ArchiveResultInline(admin.TabularInline):
|
|||
formset.form.base_fields['cmd_version'].initial = '-'
|
||||
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
|
||||
formset.form.base_fields['created_by'].initial = request.user
|
||||
formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
|
||||
formset.form.base_fields['cmd'] = JSONFormField(initial=['-'])
|
||||
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
|
||||
|
||||
if obj is not None:
|
||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
|
|||
|
||||
from django.contrib import admin
|
||||
|
||||
import abx.django.use
|
||||
import archivebox
|
||||
|
||||
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_header = 'ArchiveBox'
|
||||
|
@ -37,6 +37,6 @@ def register_admin_site():
|
|||
sites.site = archivebox_admin
|
||||
|
||||
# register all plugins admin classes
|
||||
abx.django.use.register_admin(archivebox_admin)
|
||||
archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
|
||||
|
||||
return archivebox_admin
|
||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
|
|||
|
||||
from django.apps import AppConfig
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
|
||||
|
@ -10,16 +10,11 @@ class CoreConfig(AppConfig):
|
|||
|
||||
def ready(self):
|
||||
"""Register the archivebox.core.admin_site as the main django admin site"""
|
||||
from django.conf import settings
|
||||
archivebox.pm.hook.ready(settings=settings)
|
||||
|
||||
from core.admin_site import register_admin_site
|
||||
register_admin_site()
|
||||
|
||||
abx.pm.hook.ready()
|
||||
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
|
||||
def register_admin(admin_site):
|
||||
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
|
||||
from core.admin import register_admin
|
||||
register_admin(admin_site)
|
||||
|
|
|
@ -8,21 +8,25 @@ import os
|
|||
import json
|
||||
|
||||
from pathlib import Path
|
||||
from datetime import timedelta
|
||||
|
||||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
from django.utils.text import slugify
|
||||
from django.utils import timezone
|
||||
from django.core.cache import cache
|
||||
from django.urls import reverse, reverse_lazy
|
||||
from django.db.models import Case, When, Value, IntegerField
|
||||
from django.contrib import admin
|
||||
from django.conf import settings
|
||||
|
||||
from statemachine.mixins import MachineMixin
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
|
||||
from queues.tasks import bg_archive_snapshot
|
||||
# from crawls.models import Crawl
|
||||
from crawls.models import Crawl
|
||||
# from machine.models import Machine, NetworkInterface
|
||||
|
||||
from archivebox.misc.system import get_dir_size
|
||||
|
@ -152,7 +156,7 @@ class SnapshotManager(models.Manager):
|
|||
return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
|
||||
|
||||
|
||||
class Snapshot(ABIDModel):
|
||||
class Snapshot(ABIDModel, MachineMixin):
|
||||
abid_prefix = 'snp_'
|
||||
abid_ts_src = 'self.created_at'
|
||||
abid_uri_src = 'self.url'
|
||||
|
@ -160,6 +164,17 @@ class Snapshot(ABIDModel):
|
|||
abid_rand_src = 'self.id'
|
||||
abid_drift_allowed = True
|
||||
|
||||
state_field_name = 'status'
|
||||
state_machine_name = 'core.statemachines.SnapshotMachine'
|
||||
state_machine_attr = 'sm'
|
||||
|
||||
class SnapshotStatus(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
SEALED = 'sealed', 'Sealed'
|
||||
|
||||
status = models.CharField(max_length=15, default=SnapshotStatus.QUEUED, null=False, blank=False)
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
|
||||
abid = ABIDField(prefix=abid_prefix)
|
||||
|
||||
|
@ -171,7 +186,7 @@ class Snapshot(ABIDModel):
|
|||
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
|
||||
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
||||
|
||||
# crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
|
||||
crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
|
||||
|
||||
url = models.URLField(unique=True, db_index=True)
|
||||
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
|
||||
|
@ -397,6 +412,25 @@ class Snapshot(ABIDModel):
|
|||
self.tags.clear()
|
||||
self.tags.add(*tags_id)
|
||||
|
||||
def has_pending_archiveresults(self) -> bool:
|
||||
pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
|
||||
pending_archiveresults = self.archiveresult_set.filter(status__in=pending_statuses)
|
||||
return pending_archiveresults.exists()
|
||||
|
||||
def create_pending_archiveresults(self) -> list['ArchiveResult']:
|
||||
archiveresults = []
|
||||
for extractor in EXTRACTORS:
|
||||
archiveresult, _created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=self,
|
||||
extractor=extractor,
|
||||
status=ArchiveResult.ArchiveResultStatus.QUEUED,
|
||||
)
|
||||
archiveresults.append(archiveresult)
|
||||
return archiveresults
|
||||
|
||||
def bump_retry_at(self, seconds: int = 10):
|
||||
self.retry_at = timezone.now() + timedelta(seconds=seconds)
|
||||
|
||||
|
||||
# def get_storage_dir(self, create=True, symlink=True) -> Path:
|
||||
# date_str = self.bookmarked_at.strftime('%Y%m%d')
|
||||
|
@ -453,6 +487,20 @@ class ArchiveResult(ABIDModel):
|
|||
abid_rand_src = 'self.id'
|
||||
abid_drift_allowed = True
|
||||
|
||||
state_field_name = 'status'
|
||||
state_machine_name = 'core.statemachines.ArchiveResultMachine'
|
||||
state_machine_attr = 'sm'
|
||||
|
||||
class ArchiveResultStatus(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
SUCCEEDED = 'succeeded', 'Succeeded'
|
||||
FAILED = 'failed', 'Failed'
|
||||
SKIPPED = 'skipped', 'Skipped'
|
||||
BACKOFF = 'backoff', 'Waiting to retry'
|
||||
|
||||
status = models.CharField(max_length=15, choices=ArchiveResultStatus.choices, default=ArchiveResultStatus.QUEUED, null=False, blank=False)
|
||||
|
||||
EXTRACTOR_CHOICES = (
|
||||
('htmltotext', 'htmltotext'),
|
||||
('git', 'git'),
|
||||
|
@ -469,11 +517,7 @@ class ArchiveResult(ABIDModel):
|
|||
('title', 'title'),
|
||||
('wget', 'wget'),
|
||||
)
|
||||
STATUS_CHOICES = [
|
||||
("succeeded", "succeeded"),
|
||||
("failed", "failed"),
|
||||
("skipped", "skipped")
|
||||
]
|
||||
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
|
||||
abid = ABIDField(prefix=abid_prefix)
|
||||
|
@ -491,7 +535,6 @@ class ArchiveResult(ABIDModel):
|
|||
output = models.CharField(max_length=1024)
|
||||
start_ts = models.DateTimeField(db_index=True)
|
||||
end_ts = models.DateTimeField()
|
||||
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
|
||||
|
||||
# the network interface that was used to download this result
|
||||
# uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
|
||||
|
@ -552,7 +595,15 @@ class ArchiveResult(ABIDModel):
|
|||
return link.canonical_outputs().get(f'{self.extractor}_path')
|
||||
|
||||
def output_exists(self) -> bool:
|
||||
return os.access(self.output_path(), os.R_OK)
|
||||
return os.path.exists(self.output_path())
|
||||
|
||||
def bump_retry_at(self, seconds: int = 10):
|
||||
self.retry_at = timezone.now() + timedelta(seconds=seconds)
|
||||
|
||||
def create_output_dir(self):
|
||||
snap_dir = self.snapshot_dir
|
||||
snap_dir.mkdir(parents=True, exist_ok=True)
|
||||
return snap_dir / self.output_path()
|
||||
|
||||
|
||||
# def get_storage_dir(self, create=True, symlink=True):
|
||||
|
|
|
@ -9,13 +9,12 @@ from pathlib import Path
|
|||
from django.utils.crypto import get_random_string
|
||||
|
||||
import abx
|
||||
import abx.archivebox
|
||||
import abx.archivebox.reads
|
||||
import abx.django.use
|
||||
import archivebox
|
||||
|
||||
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
|
||||
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
|
||||
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
|
||||
|
||||
|
||||
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
|
||||
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
|
||||
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
|
||||
|
@ -25,45 +24,8 @@ IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--v
|
|||
### ArchiveBox Plugin Settings
|
||||
################################################################################
|
||||
|
||||
PLUGIN_HOOKSPECS = [
|
||||
'abx.django.hookspec',
|
||||
'abx.pydantic_pkgr.hookspec',
|
||||
'abx.archivebox.hookspec',
|
||||
]
|
||||
abx.register_hookspecs(PLUGIN_HOOKSPECS)
|
||||
|
||||
BUILTIN_PLUGIN_DIRS = {
|
||||
'archivebox': PACKAGE_DIR,
|
||||
'plugins_pkg': PACKAGE_DIR / 'plugins_pkg',
|
||||
'plugins_auth': PACKAGE_DIR / 'plugins_auth',
|
||||
'plugins_search': PACKAGE_DIR / 'plugins_search',
|
||||
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
|
||||
}
|
||||
USER_PLUGIN_DIRS = {
|
||||
# 'user_plugins': DATA_DIR / 'user_plugins',
|
||||
}
|
||||
|
||||
# Discover ArchiveBox plugins
|
||||
BUILTIN_PLUGINS = abx.get_plugins_in_dirs(BUILTIN_PLUGIN_DIRS)
|
||||
PIP_PLUGINS = abx.get_pip_installed_plugins(group='archivebox')
|
||||
USER_PLUGINS = abx.get_plugins_in_dirs(USER_PLUGIN_DIRS)
|
||||
ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
|
||||
|
||||
# Load ArchiveBox plugins
|
||||
PLUGIN_MANAGER = abx.pm
|
||||
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
|
||||
PLUGINS = abx.archivebox.reads.get_PLUGINS()
|
||||
|
||||
# Load ArchiveBox config from plugins
|
||||
CONFIGS = abx.archivebox.reads.get_CONFIGS()
|
||||
CONFIG = FLAT_CONFIG = abx.archivebox.reads.get_FLAT_CONFIG()
|
||||
BINPROVIDERS = abx.archivebox.reads.get_BINPROVIDERS()
|
||||
BINARIES = abx.archivebox.reads.get_BINARIES()
|
||||
EXTRACTORS = abx.archivebox.reads.get_EXTRACTORS()
|
||||
SEARCHBACKENDS = abx.archivebox.reads.get_SEARCHBACKENDS()
|
||||
# REPLAYERS = abx.archivebox.reads.get_REPLAYERS()
|
||||
# ADMINDATAVIEWS = abx.archivebox.reads.get_ADMINDATAVIEWS()
|
||||
|
||||
ALL_PLUGINS = archivebox.ALL_PLUGINS
|
||||
LOADED_PLUGINS = archivebox.LOADED_PLUGINS
|
||||
|
||||
################################################################################
|
||||
### Django Core Settings
|
||||
|
@ -102,7 +64,8 @@ INSTALLED_APPS = [
|
|||
# 'abid_utils', # handles ABID ID creation, handling, and models
|
||||
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
|
||||
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
|
||||
'queues', # handles starting and managing background workers and processes
|
||||
'actors', # handles starting and managing background workers and processes (orchestrators and actors)
|
||||
'queues', # handles starting and managing background workers and processes (supervisord)
|
||||
'seeds', # handles Seed model and URL source management
|
||||
'crawls', # handles Crawl and CrawlSchedule models and management
|
||||
'personas', # handles Persona and session management
|
||||
|
@ -110,7 +73,7 @@ INSTALLED_APPS = [
|
|||
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
|
||||
|
||||
# ArchiveBox plugins
|
||||
*abx.django.use.get_INSTALLED_APPS(), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
|
||||
*abx.as_list(abx.pm.hook.get_INSTALLED_APPS()), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
|
||||
|
||||
# 3rd-party apps from PyPI that need to be loaded last
|
||||
'admin_data_views', # handles rendering some convenient automatic read-only views of data in Django admin
|
||||
|
@ -125,6 +88,7 @@ INSTALLED_APPS = [
|
|||
|
||||
|
||||
|
||||
|
||||
MIDDLEWARE = [
|
||||
'core.middleware.TimezoneMiddleware',
|
||||
'django.middleware.security.SecurityMiddleware',
|
||||
|
@ -135,7 +99,7 @@ MIDDLEWARE = [
|
|||
'core.middleware.ReverseProxyAuthMiddleware',
|
||||
'django.contrib.messages.middleware.MessageMiddleware',
|
||||
'core.middleware.CacheControlMiddleware',
|
||||
*abx.django.use.get_MIDDLEWARES(),
|
||||
*abx.as_list(abx.pm.hook.get_MIDDLEWARES()),
|
||||
]
|
||||
|
||||
|
||||
|
@ -148,7 +112,7 @@ MIDDLEWARE = [
|
|||
AUTHENTICATION_BACKENDS = [
|
||||
'django.contrib.auth.backends.RemoteUserBackend',
|
||||
'django.contrib.auth.backends.ModelBackend',
|
||||
*abx.django.use.get_AUTHENTICATION_BACKENDS(),
|
||||
*abx.as_list(abx.pm.hook.get_AUTHENTICATION_BACKENDS()),
|
||||
]
|
||||
|
||||
|
||||
|
@ -169,7 +133,7 @@ AUTHENTICATION_BACKENDS = [
|
|||
|
||||
STATIC_URL = '/static/'
|
||||
TEMPLATES_DIR_NAME = 'templates'
|
||||
CUSTOM_TEMPLATES_ENABLED = os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK) and CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir()
|
||||
CUSTOM_TEMPLATES_ENABLED = os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK)
|
||||
STATICFILES_DIRS = [
|
||||
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
|
||||
# *[
|
||||
|
@ -177,7 +141,7 @@ STATICFILES_DIRS = [
|
|||
# for plugin_dir in PLUGIN_DIRS.values()
|
||||
# if (plugin_dir / 'static').is_dir()
|
||||
# ],
|
||||
*abx.django.use.get_STATICFILES_DIRS(),
|
||||
*abx.as_list(abx.pm.hook.get_STATICFILES_DIRS()),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
|
||||
]
|
||||
|
||||
|
@ -188,7 +152,7 @@ TEMPLATE_DIRS = [
|
|||
# for plugin_dir in PLUGIN_DIRS.values()
|
||||
# if (plugin_dir / 'templates').is_dir()
|
||||
# ],
|
||||
*abx.django.use.get_TEMPLATE_DIRS(),
|
||||
*abx.as_list(abx.pm.hook.get_TEMPLATE_DIRS()),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
|
||||
str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
|
||||
|
@ -228,7 +192,7 @@ SQLITE_CONNECTION_OPTIONS = {
|
|||
# https://gcollazo.com/optimal-sqlite-settings-for-django/
|
||||
# https://litestream.io/tips/#busy-timeout
|
||||
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
|
||||
"timeout": 5,
|
||||
"timeout": 10,
|
||||
"check_same_thread": False,
|
||||
"transaction_mode": "IMMEDIATE",
|
||||
"init_command": (
|
||||
|
@ -267,7 +231,7 @@ if not IS_GETTING_VERSION_OR_HELP: # dont create queue.sqlite3 file
|
|||
HUEY = {
|
||||
"huey_class": "huey.SqliteHuey",
|
||||
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
|
||||
"name": "system_tasks",
|
||||
"name": "commands",
|
||||
"results": True,
|
||||
"store_none": True,
|
||||
"immediate": False,
|
||||
|
@ -288,11 +252,11 @@ if not IS_GETTING_VERSION_OR_HELP: # dont create queue.sqlite3 file
|
|||
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
|
||||
# https://github.com/gaiacoop/django-huey
|
||||
DJANGO_HUEY = {
|
||||
"default": "system_tasks",
|
||||
"default": "commands",
|
||||
"queues": {
|
||||
HUEY["name"]: HUEY.copy(),
|
||||
# more registered here at plugin import-time by BaseQueue.register()
|
||||
**abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME),
|
||||
**abx.as_dict(abx.pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME)),
|
||||
},
|
||||
}
|
||||
|
||||
|
@ -517,7 +481,7 @@ ADMIN_DATA_VIEWS = {
|
|||
"name": "log",
|
||||
},
|
||||
},
|
||||
*abx.django.use.get_ADMIN_DATA_VIEWS_URLS(),
|
||||
*abx.as_list(abx.pm.hook.get_ADMIN_DATA_VIEWS_URLS()),
|
||||
],
|
||||
}
|
||||
|
||||
|
@ -611,7 +575,4 @@ if DEBUG_REQUESTS_TRACKER:
|
|||
# JET_TOKEN = 'some-api-token-here'
|
||||
|
||||
|
||||
abx.django.use.register_checks()
|
||||
# abx.archivebox.reads.register_all_hooks(globals())
|
||||
|
||||
# import ipdb; ipdb.set_trace()
|
||||
|
|
|
@ -163,11 +163,6 @@ SETTINGS_LOGGING = {
|
|||
"level": "DEBUG",
|
||||
"propagate": False,
|
||||
},
|
||||
"plugins_extractor": {
|
||||
"handlers": ["default", "logfile"],
|
||||
"level": "DEBUG",
|
||||
"propagate": False,
|
||||
},
|
||||
"httpx": {
|
||||
"handlers": ["outbound_webhooks"],
|
||||
"level": "INFO",
|
||||
|
|
115
archivebox/core/statemachines.py
Normal file
|
@ -0,0 +1,115 @@
|
|||
__package__ = 'archivebox.snapshots'
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
from core.models import Snapshot, ArchiveResult
|
||||
|
||||
# State Machine Definitions
|
||||
#################################################
|
||||
|
||||
|
||||
class SnapshotMachine(StateMachine, strict_states=True):
|
||||
"""State machine for managing Snapshot lifecycle."""
|
||||
|
||||
model: Snapshot
|
||||
|
||||
# States
|
||||
queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
|
||||
started = State(value=Snapshot.SnapshotStatus.STARTED)
|
||||
sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start', internal=True) |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', internal=True) |
|
||||
started.to(sealed, cond='is_finished')
|
||||
)
|
||||
|
||||
def __init__(self, snapshot, *args, **kwargs):
|
||||
self.snapshot = snapshot
|
||||
super().__init__(snapshot, *args, **kwargs)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
return self.snapshot.seed and self.snapshot.seed.uri
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
return not self.snapshot.has_pending_archiveresults()
|
||||
|
||||
def on_started(self):
|
||||
self.snapshot.create_pending_archiveresults()
|
||||
self.snapshot.bump_retry_at(seconds=60)
|
||||
self.snapshot.save()
|
||||
|
||||
def on_sealed(self):
|
||||
self.snapshot.retry_at = None
|
||||
self.snapshot.save()
|
||||
|
||||
class ArchiveResultMachine(StateMachine, strict_states=True):
|
||||
"""State machine for managing ArchiveResult lifecycle."""
|
||||
|
||||
model: ArchiveResult
|
||||
|
||||
# States
|
||||
queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
|
||||
started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
|
||||
backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
|
||||
succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
|
||||
failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)
|
||||
|
||||
# Tick Event
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start', internal=True) |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', internal=True) |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed') |
|
||||
started.to(backoff, cond='is_backoff') |
|
||||
backoff.to.itself(unless='can_start', internal=True) |
|
||||
backoff.to(started, cond='can_start') |
|
||||
backoff.to(succeeded, cond='is_succeeded') |
|
||||
backoff.to(failed, cond='is_failed')
|
||||
)
|
||||
|
||||
def __init__(self, archiveresult, *args, **kwargs):
|
||||
self.archiveresult = archiveresult
|
||||
super().__init__(archiveresult, *args, **kwargs)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
return self.archiveresult.snapshot and self.archiveresult.snapshot.is_started()
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
return self.archiveresult.output_exists()
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
return not self.archiveresult.output_exists()
|
||||
|
||||
def is_backoff(self) -> bool:
|
||||
return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF
|
||||
|
||||
def on_started(self):
|
||||
self.archiveresult.start_ts = timezone.now()
|
||||
self.archiveresult.create_output_dir()
|
||||
self.archiveresult.bump_retry_at(seconds=60)
|
||||
self.archiveresult.save()
|
||||
|
||||
def on_backoff(self):
|
||||
self.archiveresult.bump_retry_at(seconds=60)
|
||||
self.archiveresult.save()
|
||||
|
||||
def on_succeeded(self):
|
||||
self.archiveresult.end_ts = timezone.now()
|
||||
self.archiveresult.save()
|
||||
|
||||
def on_failed(self):
|
||||
self.archiveresult.end_ts = timezone.now()
|
||||
self.archiveresult.save()
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
print(f"after '{event}' from '{source.id}' to '{target.id}'")
|
||||
# self.archiveresult.save_merkle_index()
|
||||
# self.archiveresult.save_html_index()
|
||||
# self.archiveresult.save_json_index()
|
||||
return "after_transition"
|
|
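For context on how these machines are meant to be driven: the tick event is written to be safe to call repeatedly, staying in place until its conditions become true. A rough sketch of a caller, illustrative only (the worker loop itself is not part of this file, and the queued/started status filter assumes SnapshotStatus mirrors the CrawlStatus values shown later in this diff):

from django.utils import timezone

from core.models import Snapshot
from core.statemachines import SnapshotMachine

# hypothetical orchestrator pass over snapshots that are due for another attempt
for snapshot in Snapshot.objects.filter(retry_at__lte=timezone.now()):
    machine = SnapshotMachine(snapshot)
    machine.tick()   # internal no-op until can_start()/is_finished() are satisfied
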
@ -12,7 +12,6 @@ from django.views import View
|
|||
from django.views.generic.list import ListView
|
||||
from django.views.generic import FormView
|
||||
from django.db.models import Q
|
||||
from django.conf import settings
|
||||
from django.contrib import messages
|
||||
from django.contrib.auth.mixins import UserPassesTestMixin
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
|
@ -21,6 +20,7 @@ from django.utils.decorators import method_decorator
|
|||
from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import archivebox
|
||||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
|
@ -32,9 +32,8 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
|
|||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
from archivebox.misc.serve_static import serve_static_with_byterange_support
|
||||
|
||||
from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||
from ..logging_util import printable_filesize
|
||||
from ..search import query_search_index
|
||||
from archivebox.logging_util import printable_filesize
|
||||
from archivebox.search import query_search_index
|
||||
|
||||
|
||||
class HomepageView(View):
|
||||
|
@ -69,7 +68,7 @@ class SnapshotView(View):
|
|||
and embed_path
|
||||
and os.access(abs_path, os.R_OK)
|
||||
and abs_path.exists()):
|
||||
if abs_path.is_dir() and not any(abs_path.glob('*.*')):
|
||||
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
|
||||
continue
|
||||
|
||||
result_info = {
|
||||
|
@ -103,7 +102,7 @@ class SnapshotView(View):
|
|||
|
||||
# iterate through all the files in the snapshot dir and add the biggest ones to the result list
|
||||
snap_dir = Path(snapshot.link_dir)
|
||||
assert os.access(snap_dir, os.R_OK) and os.access(snap_dir, os.X_OK)
|
||||
assert os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK)
|
||||
|
||||
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
|
||||
extension = result_file.suffix.lstrip('.').lower()
|
||||
|
@ -154,7 +153,7 @@ class SnapshotView(View):
|
|||
'status_color': 'success' if link.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
|
||||
'warc_path': warc_path,
|
||||
'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
|
||||
'SAVE_ARCHIVE_DOT_ORG': archivebox.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
|
||||
'best_result': best_result,
|
||||
|
@ -500,21 +499,25 @@ class HealthCheckView(View):
|
|||
|
||||
|
||||
def find_config_section(key: str) -> str:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
if key in CONSTANTS_CONFIG:
|
||||
return 'CONSTANT'
|
||||
matching_sections = [
|
||||
section_id for section_id, section in settings.CONFIGS.items() if key in section.model_fields
|
||||
section_id for section_id, section in CONFIGS.items() if key in section.model_fields
|
||||
]
|
||||
section = matching_sections[0] if matching_sections else 'DYNAMIC'
|
||||
return section
|
||||
|
||||
def find_config_default(key: str) -> str:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
if key in CONSTANTS_CONFIG:
|
||||
return str(CONSTANTS_CONFIG[key])
|
||||
|
||||
default_val = None
|
||||
|
||||
for config in settings.CONFIGS.values():
|
||||
for config in CONFIGS.values():
|
||||
if key in config.model_fields:
|
||||
default_val = config.model_fields[key].default
|
||||
break
|
||||
|
@ -530,7 +533,9 @@ def find_config_default(key: str) -> str:
|
|||
return default_val
|
||||
|
||||
def find_config_type(key: str) -> str:
|
||||
for config in settings.CONFIGS.values():
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
for config in CONFIGS.values():
|
||||
if hasattr(config, key):
|
||||
type_hints = get_type_hints(config)
|
||||
try:
|
||||
|
@ -547,6 +552,7 @@ def key_is_safe(key: str) -> bool:
|
|||
|
||||
@render_with_table_view
|
||||
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
|
@ -560,7 +566,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
# "Aliases": [],
|
||||
}
|
||||
|
||||
for section_id, section in reversed(list(settings.CONFIGS.items())):
|
||||
for section_id, section in reversed(list(CONFIGS.items())):
|
||||
for key, field in section.model_fields.items():
|
||||
rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '')
|
||||
rows['Key'].append(ItemLink(key, key=key))
|
||||
|
@ -570,7 +576,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
|
||||
# rows['Aliases'].append(', '.join(find_config_aliases(key)))
|
||||
|
||||
|
||||
section = 'CONSTANT'
|
||||
for key in CONSTANTS_CONFIG.keys():
|
||||
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
|
||||
|
@ -589,6 +594,8 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
|||
|
||||
@render_with_item_view
|
||||
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
|
@ -597,7 +604,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
|||
|
||||
if key in CONSTANTS_CONFIG:
|
||||
section_header = mark_safe(f'[CONSTANTS] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, hardcoded by ArchiveBox)</small>')
|
||||
elif key in settings.FLAT_CONFIG:
|
||||
elif key in FLAT_CONFIG:
|
||||
section_header = mark_safe(f'data / ArchiveBox.conf [{find_config_section(key)}] <b><code style="color: lightgray">{key}</code></b>')
|
||||
else:
|
||||
section_header = mark_safe(f'[DYNAMIC CONFIG] <b><code style="color: lightgray">{key}</code></b> <small>(read-only, calculated at runtime)</small>')
|
||||
|
@ -613,7 +620,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
|||
"fields": {
|
||||
'Key': key,
|
||||
'Type': find_config_type(key),
|
||||
'Value': settings.FLAT_CONFIG.get(key, settings.CONFIGS.get(key, None)) if key_is_safe(key) else '********',
|
||||
'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********',
|
||||
},
|
||||
"help_texts": {
|
||||
'Key': mark_safe(f'''
|
||||
|
@ -635,13 +642,13 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
|||
<code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
|
||||
</a>
|
||||
<br/><br/>
|
||||
<p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
|
||||
<p style="display: {"block" if key in FLAT_CONFIG else "none"}">
|
||||
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
|
||||
<br/><br/>
|
||||
<code>archivebox config --set {key}="{
|
||||
val.strip("'")
|
||||
if (val := find_config_default(key)) else
|
||||
(repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
|
||||
(repr(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
|
||||
}"</code>
|
||||
</p>
|
||||
'''),
|
||||
|
|
69 archivebox/crawls/actors.py Normal file

@ -0,0 +1,69 @@
__package__ = 'archivebox.crawls'

from typing import ClassVar

from rich import print

from django.db.models import QuerySet

from crawls.models import Crawl

from actors.actor import ActorType


class CrawlActor(ActorType[Crawl]):

    QUERYSET: ClassVar[QuerySet] = Crawl.objects.filter(status='queued')
    CLAIM_WHERE: ClassVar[str] = 'status = "queued"'  # the WHERE clause to filter the objects when atomically getting the next object from the queue
    CLAIM_SET: ClassVar[str] = 'status = "started"'   # the SET clause to claim the object when atomically getting the next object from the queue
    CLAIM_ORDER: ClassVar[str] = 'created_at DESC'    # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
    CLAIM_FROM_TOP: ClassVar[int] = 50                # the number of objects to consider when atomically getting the next object from the queue

    # model_type: Type[ModelType]
    MAX_CONCURRENT_ACTORS: ClassVar[int] = 4          # min 2, max 8, up to 60% of available cpu cores
    MAX_TICK_TIME: ClassVar[int] = 60                 # maximum duration in seconds to process a single object

    def claim_sql_where(self) -> str:
        """override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
        return self.CLAIM_WHERE

    def claim_sql_set(self) -> str:
        """override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
        return self.CLAIM_SET

    def claim_sql_order(self) -> str:
        """override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
        return self.CLAIM_ORDER

    def claim_from_top(self) -> int:
        """override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
        return self.CLAIM_FROM_TOP

    def tick(self, obj: Crawl) -> None:
        """override this to process the object"""
        print(f'[blue]🏃♂️ {self}.tick()[/blue]', obj.abid or obj.id)
        # For example:
        # do_some_task(obj)
        # do_something_else(obj)
        # obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
        # raise NotImplementedError('tick() must be implemented by the Actor subclass')

    def on_shutdown(self, err: BaseException | None=None) -> None:
        print(f'[grey53]🏃♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
        # abx.pm.hook.on_actor_shutdown(self)

    def on_tick_start(self, obj: Crawl) -> None:
        # print(f'🏃♂️ {self}.on_tick_start()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_start(self, obj_to_process)
        # self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
        pass

    def on_tick_end(self, obj: Crawl) -> None:
        # print(f'🏃♂️ {self}.on_tick_end()', obj.abid or obj.id)
        # abx.pm.hook.on_actor_tick_end(self, obj_to_process)
        # self.timer.end()
        pass

    def on_tick_exception(self, obj: Crawl, err: BaseException) -> None:
        print(f'[red]🏃♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
        # abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)

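The CLAIM_WHERE / CLAIM_SET / CLAIM_ORDER / CLAIM_FROM_TOP values above are fragments of a single atomic claim query rather than four separate queries. A rough sketch of how the base class presumably composes them (the real query builder lives in actors.actor.ActorType, which is not part of this hunk; the table name and exact SQL shape here are placeholders):

# illustrative only, inside ActorType the claim step composes roughly into:
claim_sql = f'''
    UPDATE crawls_crawl
    SET {self.claim_sql_set()}              -- e.g. status = "started"
    WHERE id IN (
        SELECT id FROM crawls_crawl
        WHERE {self.claim_sql_where()}      -- e.g. status = "queued"
        ORDER BY {self.claim_sql_order()}   -- e.g. created_at DESC
        LIMIT {self.claim_from_top()}       -- e.g. 50
    )
'''
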
@ -1,13 +1,20 @@
|
|||
__package__ = 'archivebox.crawls'
|
||||
|
||||
from typing import TYPE_CHECKING
|
||||
from django_stubs_ext.db.models import TypedModelMeta
|
||||
|
||||
from datetime import timedelta
|
||||
|
||||
from django.db import models
|
||||
from django.db.models import Q
|
||||
from django.core.validators import MaxValueValidator, MinValueValidator
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.urls import reverse_lazy
|
||||
from django.utils import timezone
|
||||
|
||||
from statemachine.mixins import MachineMixin
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from core.models import Snapshot
|
||||
|
||||
from seeds.models import Seed
|
||||
|
||||
|
@ -42,7 +49,8 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
|
|||
return self.crawl_set.first()
|
||||
|
||||
|
||||
class Crawl(ABIDModel, ModelWithHealthStats):
|
||||
|
||||
class Crawl(ABIDModel, ModelWithHealthStats, MachineMixin):
|
||||
"""
|
||||
A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
|
||||
|
||||
|
@ -55,10 +63,22 @@ class Crawl(ABIDModel, ModelWithHealthStats):
|
|||
abid_prefix = 'crl_'
|
||||
abid_ts_src = 'self.created_at'
|
||||
abid_uri_src = 'self.seed.uri'
|
||||
abid_subtype_src = 'self.persona_id'
|
||||
abid_subtype_src = 'self.persona'
|
||||
abid_rand_src = 'self.id'
|
||||
abid_drift_allowed = True
|
||||
|
||||
state_field_name = 'status'
|
||||
state_machine_name = 'crawls.statemachines.CrawlMachine'
|
||||
state_machine_attr = 'sm'
|
||||
bind_events_as_methods = True
|
||||
|
||||
class CrawlStatus(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
SEALED = 'sealed', 'Sealed'
|
||||
|
||||
status = models.CharField(choices=CrawlStatus.choices, max_length=15, default=CrawlStatus.QUEUED, null=False, blank=False)
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
|
||||
abid = ABIDField(prefix=abid_prefix)
|
||||
|
||||
|
@ -66,6 +86,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
|
|||
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
|
||||
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
|
||||
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
|
||||
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
|
||||
|
@ -79,7 +100,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
|
|||
# schedule = models.JSONField()
|
||||
# config = models.JSONField()
|
||||
|
||||
# snapshot_set: models.Manager['Snapshot']
|
||||
snapshot_set: models.Manager['Snapshot']
|
||||
|
||||
|
||||
class Meta(TypedModelMeta):
|
||||
|
@ -103,6 +124,28 @@ class Crawl(ABIDModel, ModelWithHealthStats):
|
|||
def api_docs_url(self) -> str:
|
||||
return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
|
||||
|
||||
def has_pending_archiveresults(self) -> bool:
|
||||
from core.models import ArchiveResult
|
||||
|
||||
pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
|
||||
|
||||
snapshot_ids = self.snapshot_set.values_list('id', flat=True)
|
||||
pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, status__in=pending_statuses)
|
||||
return pending_archiveresults.exists()
|
||||
|
||||
def create_root_snapshot(self) -> 'Snapshot':
|
||||
from core.models import Snapshot
|
||||
|
||||
root_snapshot, _ = Snapshot.objects.get_or_create(
|
||||
crawl=self,
|
||||
url=self.seed.uri,
|
||||
)
|
||||
return root_snapshot
|
||||
|
||||
def bump_retry_at(self, seconds: int = 10):
|
||||
self.retry_at = timezone.now() + timedelta(seconds=seconds)
|
||||
self.save()
|
||||
|
||||
|
||||
class Outlink(models.Model):
|
||||
"""A record of a link found on a page, pointing to another page."""
|
||||
|
|
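The state_field_name / state_machine_name / state_machine_attr fields added above are the MachineMixin wiring that connects this model to crawls.statemachines.CrawlMachine (added later in this diff). A rough usage sketch, illustrative only (assumes an existing Seed; exact persistence timing depends on python-statemachine's MachineMixin):

crawl = Crawl.objects.create(seed=seed, max_depth=1)   # begins life as CrawlStatus.QUEUED
crawl.sm.tick()            # the 'sm' attr comes from state_machine_attr = 'sm'
print(crawl.status)        # 'started' once can_start() passes, 'sealed' once nothing is pending
print(crawl.retry_at)      # pushed ~10s ahead by on_started() via bump_retry_at(seconds=10)
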
48 archivebox/crawls/statemachines.py Normal file

@ -0,0 +1,48 @@
__package__ = 'archivebox.crawls'

from statemachine import State, StateMachine

from crawls.models import Crawl

# State Machine Definitions
#################################################


class CrawlMachine(StateMachine, strict_states=True):
    """State machine for managing Crawl lifecycle."""

    model: Crawl

    # States
    queued = State(value=Crawl.CrawlStatus.QUEUED, initial=True)
    started = State(value=Crawl.CrawlStatus.STARTED)
    sealed = State(value=Crawl.CrawlStatus.SEALED, final=True)

    # Tick Event
    tick = (
        queued.to.itself(unless='can_start', internal=True) |
        queued.to(started, cond='can_start') |
        started.to.itself(unless='is_finished', internal=True) |
        started.to(sealed, cond='is_finished')
    )

    def __init__(self, crawl, *args, **kwargs):
        self.crawl = crawl
        super().__init__(crawl, *args, **kwargs)

    def can_start(self) -> bool:
        return self.crawl.seed and self.crawl.seed.uri

    def is_finished(self) -> bool:
        return not self.crawl.has_pending_archiveresults()


    def on_started(self):
        self.crawl.create_root_snapshot()
        self.crawl.bump_retry_at(seconds=10)
        self.crawl.save()

    def on_sealed(self):
        self.crawl.retry_at = None
        self.crawl.save()

@ -27,43 +27,29 @@ from ..logging_util import (
|
|||
log_archive_method_finished,
|
||||
)
|
||||
|
||||
from .title import should_save_title, save_title
|
||||
from .favicon import should_save_favicon, save_favicon
|
||||
from .wget import should_save_wget, save_wget
|
||||
from .singlefile import should_save_singlefile, save_singlefile
|
||||
from .readability import should_save_readability, save_readability
|
||||
from .mercury import should_save_mercury, save_mercury
|
||||
from .htmltotext import should_save_htmltotext, save_htmltotext
|
||||
from .pdf import should_save_pdf, save_pdf
|
||||
from .screenshot import should_save_screenshot, save_screenshot
|
||||
from .dom import should_save_dom, save_dom
|
||||
from .git import should_save_git, save_git
|
||||
from .media import should_save_media, save_media
|
||||
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
||||
from .headers import should_save_headers, save_headers
|
||||
|
||||
|
||||
ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
|
||||
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
|
||||
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
|
||||
|
||||
def get_default_archive_methods() -> List[ArchiveMethodEntry]:
|
||||
# TODO: move to abx.pm.hook.get_EXTRACTORS()
|
||||
return [
|
||||
('favicon', should_save_favicon, save_favicon),
|
||||
('headers', should_save_headers, save_headers),
|
||||
('singlefile', should_save_singlefile, save_singlefile),
|
||||
('pdf', should_save_pdf, save_pdf),
|
||||
('screenshot', should_save_screenshot, save_screenshot),
|
||||
('dom', should_save_dom, save_dom),
|
||||
('wget', should_save_wget, save_wget),
|
||||
# keep title, readability, and htmltotext below wget and singlefile, as they depend on them
|
||||
('title', should_save_title, save_title),
|
||||
('readability', should_save_readability, save_readability),
|
||||
('mercury', should_save_mercury, save_mercury),
|
||||
('htmltotext', should_save_htmltotext, save_htmltotext),
|
||||
('git', should_save_git, save_git),
|
||||
('media', should_save_media, save_media),
|
||||
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
||||
# ('favicon', should_save_favicon, save_favicon),
|
||||
# ('headers', should_save_headers, save_headers),
|
||||
# ('singlefile', should_save_singlefile, save_singlefile),
|
||||
# ('pdf', should_save_pdf, save_pdf),
|
||||
# ('screenshot', should_save_screenshot, save_screenshot),
|
||||
# ('dom', should_save_dom, save_dom),
|
||||
# ('wget', should_save_wget, save_wget),
|
||||
# # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
|
||||
# ('title', should_save_title, save_title),
|
||||
# ('readability', should_save_readability, save_readability),
|
||||
# ('mercury', should_save_mercury, save_mercury),
|
||||
# ('htmltotext', should_save_htmltotext, save_htmltotext),
|
||||
# ('git', should_save_git, save_git),
|
||||
# ('media', should_save_media, save_media),
|
||||
# ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
|
||||
]
|
||||
|
||||
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
|
||||
|
|
|
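The commented-out list above is being migrated toward plugin-provided extractors, but the (name, should_save, save) tuple shape and the ShouldSaveFunction / SaveFunction aliases still describe how callers consume it. A minimal sketch of the calling convention (link, out_dir, overwrite, and timeout are placeholder variables for illustration):

for name, should_save, save_func in get_default_archive_methods():
    if should_save(link, out_dir, overwrite):          # ShouldSaveFunction
        result = save_func(link, out_dir, timeout)     # SaveFunction -> ArchiveResult
        print(name, result.status)
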
@ -8,6 +8,8 @@ from typing import List, Optional, Iterator, Mapping
|
|||
from django.utils.html import format_html, mark_safe # type: ignore
|
||||
from django.core.cache import cache
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.misc.system import atomic_write
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
|
@ -19,7 +21,6 @@ from archivebox.misc.util import (
|
|||
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.config.version import get_COMMIT_HASH
|
||||
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
|
||||
|
||||
from .schema import Link
|
||||
from ..logging_util import printable_filesize
|
||||
|
@ -80,7 +81,9 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
|
|||
@enforce_types
|
||||
def link_details_template(link: Link) -> str:
|
||||
|
||||
from ..extractors.wget import wget_output_path
|
||||
from abx_plugin_wget_extractor.wget import wget_output_path
|
||||
|
||||
SAVE_ARCHIVE_DOT_ORG = abx.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG
|
||||
|
||||
link_info = link._asdict(extended=True)
|
||||
|
||||
|
@ -102,7 +105,7 @@ def link_details_template(link: Link) -> str:
|
|||
'status': 'archived' if link.is_archived else 'not yet archived',
|
||||
'status_color': 'success' if link.is_archived else 'danger',
|
||||
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
|
||||
'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
|
||||
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
|
||||
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
|
||||
})
|
||||
|
||||
|
|
|
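The per-plugin config import (ARCHIVEDOTORG_CONFIG) is replaced here by a runtime lookup through the plugin manager; the flat view appears to be assembled from each plugin's get_CONFIG() hookimpl (see the abx-plugin-* packages later in this diff). A minimal sketch of the new read path:

import abx

FLAT_CONFIG = abx.pm.hook.get_FLAT_CONFIG()     # merged key/value view across all plugin configs
if FLAT_CONFIG.SAVE_ARCHIVE_DOT_ORG:
    ...   # e.g. render the archive.org row in the details template
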
@ -8,6 +8,8 @@ from pathlib import Path
|
|||
from datetime import datetime, timezone
|
||||
from typing import List, Optional, Iterator, Any, Union
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.config import VERSION, DATA_DIR, CONSTANTS
|
||||
from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
|
||||
|
||||
|
@ -19,8 +21,6 @@ from archivebox.misc.util import enforce_types
|
|||
|
||||
@enforce_types
|
||||
def generate_json_index_from_links(links: List[Link], with_headers: bool):
|
||||
from django.conf import settings
|
||||
|
||||
MAIN_INDEX_HEADER = {
|
||||
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
|
||||
'schema': 'archivebox.index.json',
|
||||
|
@ -33,11 +33,10 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
|
|||
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||
'dependencies': settings.BINARIES,
|
||||
'dependencies': dict(abx.pm.hook.get_BINARIES()),
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
if with_headers:
|
||||
output = {
|
||||
**MAIN_INDEX_HEADER,
|
||||
|
|
|
@ -17,9 +17,9 @@ from dataclasses import dataclass, asdict, field, fields
|
|||
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from archivebox.config import ARCHIVE_DIR, CONSTANTS
|
||||
import abx
|
||||
|
||||
from plugins_extractor.favicon.config import FAVICON_CONFIG
|
||||
from archivebox.config import ARCHIVE_DIR, CONSTANTS
|
||||
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from archivebox.misc.util import ts_to_date_str, parse_date
|
||||
|
@ -426,7 +426,10 @@ class Link:
|
|||
def canonical_outputs(self) -> Dict[str, Optional[str]]:
|
||||
"""predict the expected output paths that should be present after archiving"""
|
||||
|
||||
from ..extractors.wget import wget_output_path
|
||||
from abx_plugin_wget.wget import wget_output_path
|
||||
|
||||
FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon
|
||||
|
||||
# TODO: banish this awful duplication from the codebase and import these
|
||||
# from their respective extractor files
|
||||
canonical = {
|
||||
|
|
|
@ -8,9 +8,10 @@ from django.db import models
|
|||
from django.utils import timezone
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
import abx.archivebox.reads
|
||||
import abx
|
||||
import archivebox
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider
|
||||
from pydantic_pkgr import Binary, BinProvider
|
||||
from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
|
||||
|
||||
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
|
||||
|
@ -180,7 +181,7 @@ class NetworkInterface(ABIDModel, ModelWithHealthStats):
|
|||
|
||||
|
||||
class InstalledBinaryManager(models.Manager):
|
||||
def get_from_db_or_cache(self, binary: BaseBinary) -> 'InstalledBinary':
|
||||
def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary':
|
||||
"""Get or create an InstalledBinary record for a Binary on the local machine"""
|
||||
|
||||
global _CURRENT_BINARIES
|
||||
|
@ -216,7 +217,7 @@ class InstalledBinaryManager(models.Manager):
|
|||
# if binary was not yet loaded from filesystem, do it now
|
||||
# this is expensive, we have to find its abspath, version, and sha256, but it's necessary
|
||||
# to make sure we have a good, up-to-date record of it in the DB & in-memory cache
|
||||
binary = binary.load(fresh=True)
|
||||
binary = archivebox.pm.hook.binary_load(binary=binary, fresh=True)
|
||||
|
||||
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
|
||||
|
||||
|
@ -291,8 +292,8 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
|
|||
if not hasattr(self, 'machine'):
|
||||
self.machine = Machine.objects.current()
|
||||
if not self.binprovider:
|
||||
all_known_binproviders = list(abx.archivebox.reads.get_BINPROVIDERS().values())
|
||||
binary = BaseBinary(name=self.name, binproviders=all_known_binproviders).load(fresh=True)
|
||||
all_known_binproviders = list(abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values())
|
||||
binary = archivebox.pm.hook.binary_load(binary=Binary(name=self.name, binproviders=all_known_binproviders), fresh=True)
|
||||
self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None
|
||||
if not self.abspath:
|
||||
self.abspath = self.BINPROVIDER.get_abspath(self.name)
|
||||
|
@ -304,16 +305,16 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
|
|||
super().clean(*args, **kwargs)
|
||||
|
||||
@cached_property
|
||||
def BINARY(self) -> BaseBinary:
|
||||
for binary in abx.archivebox.reads.get_BINARIES().values():
|
||||
def BINARY(self) -> Binary:
|
||||
for binary in abx.as_dict(archivebox.pm.hook.get_BINARIES()).values():
|
||||
if binary.name == self.name:
|
||||
return binary
|
||||
raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
|
||||
# TODO: we could technically reconstruct it from scratch, but why would we ever want to do that?
|
||||
|
||||
@cached_property
|
||||
def BINPROVIDER(self) -> BaseBinProvider:
|
||||
for binprovider in abx.archivebox.reads.get_BINPROVIDERS().values():
|
||||
def BINPROVIDER(self) -> BinProvider:
|
||||
for binprovider in abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values():
|
||||
if binprovider.name == self.binprovider:
|
||||
return binprovider
|
||||
raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
|
||||
|
@ -321,7 +322,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
|
|||
# maybe not a good idea to provide this? Binary in DB is a record of the binary's config
|
||||
# whereas a loaded binary is a not-yet saved instance that may not have the same config
|
||||
# why would we want to load a binary record from the db when it could be freshly loaded?
|
||||
def load_from_db(self) -> BaseBinary:
|
||||
def load_from_db(self) -> Binary:
|
||||
# TODO: implement defaults arg in pydantic_pkgr
|
||||
# return self.BINARY.load(defaults={
|
||||
# 'binprovider': self.BINPROVIDER,
|
||||
|
@ -330,7 +331,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
|
|||
# 'sha256': self.sha256,
|
||||
# })
|
||||
|
||||
return BaseBinary.model_validate({
|
||||
return Binary.model_validate({
|
||||
**self.BINARY.model_dump(),
|
||||
'abspath': self.abspath and Path(self.abspath),
|
||||
'version': self.version,
|
||||
|
@ -340,5 +341,5 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
|
|||
'overrides': self.BINARY.overrides,
|
||||
})
|
||||
|
||||
def load_fresh(self) -> BaseBinary:
|
||||
return self.BINARY.load(fresh=True)
|
||||
def load_fresh(self) -> Binary:
|
||||
return archivebox.pm.hook.binary_load(binary=self.BINARY, fresh=True)
|
||||
|
|
|
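InstalledBinary now resolves binaries through the binary_load / binary_load_or_install hooks instead of calling .load() on the binary directly. A rough sketch of the call, mirroring the usage above (the binary name is chosen purely for illustration):

import archivebox
from pydantic_pkgr import Binary
from abx_plugin_default_binproviders import apt, brew, env

wget = Binary(name='wget', binproviders=[apt, brew, env])
loaded = archivebox.pm.hook.binary_load(binary=wget, fresh=True)
assert loaded.loaded_binprovider and loaded.loaded_abspath and loaded.loaded_version and loaded.loaded_sha256
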
@ -14,6 +14,10 @@ from crontab import CronTab, CronSlices
|
|||
from django.db.models import QuerySet
|
||||
from django.utils import timezone
|
||||
|
||||
from pydantic_pkgr import Binary
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.misc.util import enforce_types # type: ignore
|
||||
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
|
||||
|
@ -22,7 +26,7 @@ from archivebox.misc.logging import stderr, hint
|
|||
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
|
||||
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
|
||||
from archivebox.config.permissions import SudoPermission, IN_DOCKER
|
||||
from archivebox.config.configfile import (
|
||||
from archivebox.config.collection import (
|
||||
write_config_file,
|
||||
load_all_config,
|
||||
get_real_name,
|
||||
|
@ -195,15 +199,13 @@ def version(quiet: bool=False,
|
|||
console = Console()
|
||||
prnt = console.print
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, apt, brew, env
|
||||
from abx_plugin_default_binproviders import apt, brew, env
|
||||
|
||||
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
|
||||
from archivebox.config.paths import get_data_locations, get_code_locations
|
||||
|
||||
from plugins_auth.ldap.config import LDAP_CONFIG
|
||||
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
|
||||
|
||||
|
||||
# 0.7.1
|
||||
|
@ -242,7 +244,7 @@ def version(quiet: bool=False,
|
|||
f'SUDO={CONSTANTS.IS_ROOT}',
|
||||
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
|
||||
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
|
||||
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
|
||||
f'LDAP={LDAP_ENABLED}',
|
||||
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
|
||||
)
|
||||
prnt()
|
||||
|
@ -264,7 +266,8 @@ def version(quiet: bool=False,
|
|||
|
||||
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
|
||||
failures = []
|
||||
for name, binary in list(settings.BINARIES.items()):
|
||||
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
|
||||
for name, binary in list(BINARIES.items()):
|
||||
if binary.name == 'archivebox':
|
||||
continue
|
||||
|
||||
|
@ -295,14 +298,15 @@ def version(quiet: bool=False,
|
|||
|
||||
prnt()
|
||||
prnt('[gold3][i] Package Managers:[/gold3]')
|
||||
for name, binprovider in list(settings.BINPROVIDERS.items()):
|
||||
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
|
||||
for name, binprovider in list(BINPROVIDERS.items()):
|
||||
err = None
|
||||
|
||||
if binproviders and binprovider.name not in binproviders:
|
||||
continue
|
||||
|
||||
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
|
||||
loaded_bin = binprovider.INSTALLER_BINARY or BaseBinary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
|
||||
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
|
||||
|
||||
abspath = None
|
||||
if loaded_bin.abspath:
|
||||
|
@ -1050,9 +1054,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
# - recommend user re-run with sudo if any deps need to be installed as root
|
||||
|
||||
from rich import print
|
||||
from django.conf import settings
|
||||
|
||||
from archivebox import CONSTANTS
|
||||
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.config.paths import get_or_create_working_lib_dir
|
||||
|
||||
|
@ -1075,11 +1077,11 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
|
||||
package_manager_names = ', '.join(
|
||||
f'[yellow]{binprovider.name}[/yellow]'
|
||||
for binprovider in list(settings.BINPROVIDERS.values())
|
||||
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
|
||||
if not binproviders or (binproviders and binprovider.name in binproviders)
|
||||
)
|
||||
print(f'[+] Setting up package managers {package_manager_names}...')
|
||||
for binprovider in list(settings.BINPROVIDERS.values()):
|
||||
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
|
||||
if binproviders and binprovider.name not in binproviders:
|
||||
continue
|
||||
try:
|
||||
|
@ -1092,7 +1094,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
|
||||
print()
|
||||
|
||||
for binary in list(settings.BINARIES.values()):
|
||||
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
|
||||
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
|
||||
# obviously must already be installed if we are running
|
||||
continue
|
||||
|
@ -1122,7 +1124,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
sys.stderr.write("\033[00m\n") # reset
|
||||
else:
|
||||
result = binary.load_or_install(binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
|
||||
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
if result and result['loaded_version']:
|
||||
break
|
||||
except Exception as e:
|
||||
|
@ -1133,7 +1136,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
sys.stderr.write("\033[00m\n") # reset
|
||||
else:
|
||||
binary.load_or_install(fresh=True, dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
|
||||
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
|
||||
if IS_ROOT and LIB_DIR:
|
||||
with SudoPermission(uid=0):
|
||||
if ARCHIVEBOX_USER == 0:
|
||||
|
@ -1157,7 +1161,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
|
|||
|
||||
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
|
||||
|
||||
from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
|
||||
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
|
||||
|
||||
extra_args = []
|
||||
if binproviders:
|
||||
|
@ -1183,8 +1187,6 @@ def config(config_options_str: Optional[str]=None,
|
|||
out_dir: Path=DATA_DIR) -> None:
|
||||
"""Get and set your ArchiveBox project configuration values"""
|
||||
|
||||
import abx.archivebox.reads
|
||||
|
||||
from rich import print
|
||||
|
||||
check_data_folder()
|
||||
|
@ -1198,7 +1200,8 @@ def config(config_options_str: Optional[str]=None,
|
|||
elif config_options_str:
|
||||
config_options = config_options_str.split('\n')
|
||||
|
||||
from django.conf import settings
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
CONFIGS = archivebox.pm.hook.get_CONFIGS()
|
||||
|
||||
config_options = config_options or []
|
||||
|
||||
|
@ -1208,8 +1211,8 @@ def config(config_options_str: Optional[str]=None,
|
|||
if search:
|
||||
if config_options:
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
|
||||
for config_section in settings.CONFIGS.values():
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
for config_section in CONFIGS.values():
|
||||
aliases = config_section.aliases
|
||||
|
||||
for search_key in config_options:
|
||||
|
@ -1228,15 +1231,15 @@ def config(config_options_str: Optional[str]=None,
|
|||
elif get or no_args:
|
||||
if config_options:
|
||||
config_options = [get_real_name(key) for key in config_options]
|
||||
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
|
||||
failed_config = [key for key in config_options if key not in settings.FLAT_CONFIG]
|
||||
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
|
||||
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
|
||||
if failed_config:
|
||||
stderr()
|
||||
stderr('[X] These options failed to get', color='red')
|
||||
stderr(' {}'.format('\n '.join(config_options)))
|
||||
raise SystemExit(1)
|
||||
else:
|
||||
matching_config = settings.FLAT_CONFIG
|
||||
matching_config = FLAT_CONFIG
|
||||
|
||||
print(printable_config(matching_config))
|
||||
raise SystemExit(not matching_config)
|
||||
|
@ -1257,20 +1260,20 @@ def config(config_options_str: Optional[str]=None,
|
|||
if key != raw_key:
|
||||
stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
|
||||
|
||||
if key in settings.FLAT_CONFIG:
|
||||
if key in FLAT_CONFIG:
|
||||
new_config[key] = val.strip()
|
||||
else:
|
||||
failed_options.append(line)
|
||||
|
||||
if new_config:
|
||||
before = settings.FLAT_CONFIG
|
||||
before = FLAT_CONFIG
|
||||
matching_config = write_config_file(new_config)
|
||||
after = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
|
||||
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
|
||||
print(printable_config(matching_config))
|
||||
|
||||
side_effect_changes = {}
|
||||
for key, val in after.items():
|
||||
if key in settings.FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
|
||||
if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
|
||||
side_effect_changes[key] = after[key]
|
||||
# import ipdb; ipdb.set_trace()
|
||||
|
||||
|
@ -1312,7 +1315,7 @@ def schedule(add: bool=False,
|
|||
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
|
||||
|
||||
check_data_folder()
|
||||
from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
|
||||
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
|
||||
from archivebox.config.permissions import USER
|
||||
|
||||
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
|
||||
|
|
|
@ -201,6 +201,7 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
|||
|
||||
|
||||
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
|
||||
import archivebox
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
|
||||
|
@ -209,6 +210,8 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
|
|||
|
||||
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
|
||||
|
||||
assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
|
||||
|
||||
if not must_exist and not os.path.isdir(lib_dir):
|
||||
return True
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ from archivebox import CONSTANTS # noqa
|
|||
from ..main import * # noqa
|
||||
from ..cli import CLI_SUBCOMMANDS
|
||||
|
||||
CONFIG = settings.FLAT_CONFIG
|
||||
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
@ -55,6 +55,5 @@ if __name__ == '__main__':
|
|||
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')
|
||||
prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]')
|
||||
prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]')
|
||||
prnt(' archivebox.plugins_extractor.wget.apps.WGET_EXTRACTOR.extract(snap.id) [grey53]# call an extractor directly[/]')
|
||||
prnt(' snap.archiveresult_set.all() [grey53]# see extractor results[/]')
|
||||
prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')
|
||||
|
|
|
@ -5,7 +5,7 @@ import requests
|
|||
import json as pyjson
|
||||
import http.cookiejar
|
||||
|
||||
from typing import List, Optional, Any
|
||||
from typing import List, Optional, Any, Callable
|
||||
from pathlib import Path
|
||||
from inspect import signature
|
||||
from functools import wraps
|
||||
|
@ -19,14 +19,13 @@ from requests.exceptions import RequestException, ReadTimeout
|
|||
from base32_crockford import encode as base32_encode # type: ignore
|
||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||
try:
|
||||
import chardet
|
||||
import chardet # type:ignore
|
||||
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
|
||||
except ImportError:
|
||||
detect_encoding = lambda rawdata: "utf-8"
|
||||
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
from .logging import COLOR_DICT
|
||||
|
||||
|
@ -126,6 +125,7 @@ def is_static_file(url: str):
|
|||
def enforce_types(func):
|
||||
"""
|
||||
Enforce function arg and kwarg types at runtime using its python3 type hints
|
||||
Simpler version of pydantic @validate_call decorator
|
||||
"""
|
||||
# TODO: check return type as well
|
||||
|
||||
|
@ -186,11 +186,11 @@ def str_between(string: str, start: str, end: str=None) -> str:
|
|||
|
||||
|
||||
@enforce_types
|
||||
def parse_date(date: Any) -> Optional[datetime]:
|
||||
def parse_date(date: Any) -> datetime:
|
||||
"""Parse unix timestamps, iso format, and human-readable strings"""
|
||||
|
||||
if date is None:
|
||||
return None
|
||||
return None # type: ignore
|
||||
|
||||
if isinstance(date, datetime):
|
||||
if date.tzinfo is None:
|
||||
|
@ -212,6 +212,8 @@ def parse_date(date: Any) -> Optional[datetime]:
|
|||
def download_url(url: str, timeout: int=None) -> str:
|
||||
"""Download the contents of a remote url and return the text"""
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
|
||||
session = requests.Session()
|
||||
|
||||
|
@ -241,8 +243,12 @@ def download_url(url: str, timeout: int=None) -> str:
|
|||
return url.rsplit('/', 1)[-1]
|
||||
|
||||
@enforce_types
|
||||
def get_headers(url: str, timeout: int=None) -> str:
|
||||
def get_headers(url: str, timeout: int | None=None) -> str:
|
||||
"""Download the contents of a remote url and return the headers"""
|
||||
# TODO: get rid of this and use an abx pluggy hook instead
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
|
||||
|
||||
try:
|
||||
|
@ -283,6 +289,7 @@ def get_headers(url: str, timeout: int=None) -> str:
|
|||
def ansi_to_html(text: str) -> str:
|
||||
"""
|
||||
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
|
||||
Simple way to render colored CLI stdout/stderr in HTML properly, Textual/rich is probably better though.
|
||||
"""
|
||||
|
||||
TEMPLATE = '<span style="color: rgb{}"><br>'
|
||||
|
@ -306,13 +313,13 @@ def ansi_to_html(text: str) -> str:
|
|||
@enforce_types
|
||||
def dedupe(options: List[str]) -> List[str]:
|
||||
"""
|
||||
Deduplicates the given options. Options that come later clobber earlier
|
||||
conflicting options.
|
||||
Deduplicates the given CLI args by key=value. Options that come later override earlier.
|
||||
"""
|
||||
deduped = {}
|
||||
|
||||
for option in options:
|
||||
deduped[option.split('=')[0]] = option
|
||||
key = option.split('=')[0]
|
||||
deduped[key] = option
|
||||
|
||||
return list(deduped.values())
|
||||
|
||||
|
@ -345,6 +352,9 @@ class ExtendedEncoder(pyjson.JSONEncoder):
|
|||
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
|
||||
return tuple(obj)
|
||||
|
||||
elif isinstance(obj, Callable):
|
||||
return str(obj)
|
||||
|
||||
return pyjson.JSONEncoder.default(self, obj)
|
||||
|
||||
|
||||
|
|
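The reworded dedupe() docstring is easiest to check with a concrete case: the later --timeout wins, while first-seen key order is preserved (flag names are illustrative):

>>> dedupe(['--timeout=10', '--depth=1', '--timeout=30'])
['--timeout=30', '--depth=1']
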
|
@ -1,14 +1,11 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
import json
|
||||
|
||||
from typing import IO, Iterable
|
||||
|
||||
from ..index.schema import Link
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
)
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
from ..index.schema import Link
|
||||
from .generic_json import jsonObjectToLink
|
||||
|
||||
def parse_line(line: str):
|
||||
|
|
|
@ -6,8 +6,7 @@ import re
|
|||
from typing import IO, Iterable, Optional
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pocket import Pocket
|
||||
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.system import atomic_write
|
||||
|
@ -22,7 +21,7 @@ API_DB_PATH = CONSTANTS.SOURCES_DIR / 'pocket_api.db'
|
|||
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
|
||||
|
||||
|
||||
def get_pocket_articles(api: Pocket, since=None, page=0):
|
||||
def get_pocket_articles(api, since=None, page=0):
|
||||
body, headers = api.get(
|
||||
state='archive',
|
||||
sort='oldest',
|
||||
|
@ -94,7 +93,9 @@ def should_parse_as_pocket_api(text: str) -> bool:
|
|||
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
"""Parse bookmarks from the Pocket API"""
|
||||
|
||||
from archivebox.plugins_extractor.pocket.config import POCKET_CONFIG
|
||||
from pocket import Pocket
|
||||
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
|
||||
input_buffer.seek(0)
|
||||
pattern = re.compile(r"^pocket:\/\/(\w+)")
|
||||
|
@ -102,7 +103,7 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
|
|||
if should_parse_as_pocket_api(line):
|
||||
|
||||
username = pattern.search(line).group(1)
|
||||
api = Pocket(POCKET_CONFIG.POCKET_CONSUMER_KEY, POCKET_CONFIG.POCKET_ACCESS_TOKENS[username])
|
||||
api = Pocket(FLAT_CONFIG.POCKET_CONSUMER_KEY, FLAT_CONFIG.POCKET_ACCESS_TOKENS[username])
|
||||
api.last_since = None
|
||||
|
||||
for article in get_pocket_articles(api, since=read_since(username)):
|
||||
|
|
|
@ -8,9 +8,10 @@ from datetime import datetime
|
|||
from typing import IO, Iterable, Optional
|
||||
from configparser import ConfigParser
|
||||
|
||||
import abx
|
||||
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.system import atomic_write
|
||||
from archivebox.plugins_extractor.readwise.config import READWISE_CONFIG
|
||||
|
||||
from ..index.schema import Link
|
||||
|
||||
|
@ -62,26 +63,30 @@ def link_from_article(article: dict, sources: list):
|
|||
|
||||
|
||||
def write_cursor(username: str, since: str):
|
||||
if not READWISE_CONFIG.READWISE_DB_PATH.exists():
|
||||
atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
|
||||
READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
|
||||
|
||||
if not READWISE_DB_PATH.exists():
|
||||
atomic_write(READWISE_DB_PATH, "")
|
||||
|
||||
since_file = ConfigParser()
|
||||
since_file.optionxform = str
|
||||
since_file.read(READWISE_CONFIG.READWISE_DB_PATH)
|
||||
since_file.read(READWISE_DB_PATH)
|
||||
|
||||
since_file[username] = {"since": since}
|
||||
|
||||
with open(READWISE_CONFIG.READWISE_DB_PATH, "w+") as new:
|
||||
with open(READWISE_DB_PATH, "w+") as new:
|
||||
since_file.write(new)
|
||||
|
||||
|
||||
def read_cursor(username: str) -> Optional[str]:
|
||||
if not READWISE_CONFIG.READWISE_DB_PATH.exists():
|
||||
atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
|
||||
READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
|
||||
|
||||
if not READWISE_DB_PATH.exists():
|
||||
atomic_write(READWISE_DB_PATH, "")
|
||||
|
||||
config_file = ConfigParser()
|
||||
config_file.optionxform = str
|
||||
config_file.read(READWISE_CONFIG.READWISE_DB_PATH)
|
||||
config_file.read(READWISE_DB_PATH)
|
||||
|
||||
return config_file.get(username, "since", fallback=None)
|
||||
|
||||
|
@ -97,12 +102,14 @@ def should_parse_as_readwise_reader_api(text: str) -> bool:
|
|||
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
|
||||
"""Parse bookmarks from the Readwise Reader API"""
|
||||
|
||||
READWISE_READER_TOKENS = abx.pm.hook.get_CONFIG().READWISE_READER_TOKENS
|
||||
|
||||
input_buffer.seek(0)
|
||||
pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
|
||||
for line in input_buffer:
|
||||
if should_parse_as_readwise_reader_api(line):
|
||||
username = pattern.search(line).group(1)
|
||||
api = ReadwiseReaderAPI(READWISE_CONFIG.READWISE_READER_TOKENS[username], cursor=read_cursor(username))
|
||||
api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
|
||||
|
||||
for article in get_readwise_reader_articles(api):
|
||||
yield link_from_article(article, sources=[line])
|
||||
|
|
39 archivebox/pkgs/__init__.py Normal file

@ -0,0 +1,39 @@
import sys
import importlib
from pathlib import Path

PKGS_DIR = Path(__file__).parent

VENDORED_PKGS = [
    'abx',
    # 'pydantic-pkgr',
]

# scan ./pkgs and add all dirs present to list of available VENDORED_PKGS
for subdir in reversed(sorted(PKGS_DIR.iterdir())):
    if subdir.is_dir() and subdir.name not in VENDORED_PKGS and not subdir.name.startswith('_'):
        VENDORED_PKGS.append(subdir.name)


def load_vendored_pkgs():
    """Add archivebox/vendor to sys.path and import all vendored libraries present within"""
    if str(PKGS_DIR) not in sys.path:
        sys.path.append(str(PKGS_DIR))

    for pkg_name in VENDORED_PKGS:
        pkg_dir = PKGS_DIR / pkg_name
        assert pkg_dir.is_dir(), f'Required vendored pkg {pkg_name} could not be found in {pkg_dir}'

        try:
            lib = importlib.import_module(pkg_name)
            # print(f"Successfully imported lib from environment {pkg_name}")
        except ImportError:
            sys.path.append(str(pkg_dir))
            try:
                lib = importlib.import_module(pkg_name)
                # print(f"Successfully imported lib from vendored fallback {pkg_name}: {inspect.getfile(lib)}")
            except ImportError as e:
                print(f"Failed to import lib from environment or vendored fallback {pkg_name}: {e}", file=sys.stderr)
                sys.exit(1)

@ -0,0 +1,21 @@
|
|||
__label__ = 'Archive.org'
|
||||
__homepage__ = 'https://archive.org'
|
||||
|
||||
import abx
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import ARCHIVEDOTORG_CONFIG
|
||||
|
||||
return {
|
||||
'ARCHIVEDOTORG_CONFIG': ARCHIVEDOTORG_CONFIG
|
||||
}
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
|
||||
#
|
||||
# return {
|
||||
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
|
||||
# }
|
|
@ -1,7 +1,4 @@
|
|||
__package__ = 'plugins_extractor.archivedotorg'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx_spec_config.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class ArchivedotorgConfig(BaseConfigSet):
|
18 archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml Normal file
|
@ -0,0 +1,18 @@
|
|||
[project]
|
||||
name = "abx-plugin-archivedotorg"
|
||||
version = "2024.10.28"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"abx>=0.1.0",
|
||||
"abx-spec-config>=0.1.0",
|
||||
"abx-plugin-curl>=2024.10.24",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project.entry-points.abx]
|
||||
abx_plugin_archivedotorg = "abx_plugin_archivedotorg"
|
|
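The [project.entry-points.abx] table is what makes each of these packages discoverable once pip-installed; plugins in the 'abx' entry-point group can be enumerated with the stdlib alone. A minimal sketch, separate from the project's own loader:

from importlib.metadata import entry_points

for ep in entry_points(group='abx'):     # requires Python 3.10+, matching requires-python above
    plugin = ep.load()                   # e.g. the abx_plugin_archivedotorg module
    print(ep.name, plugin)
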
@ -0,0 +1,34 @@
|
|||
__label__ = 'Chrome'
|
||||
__author__ = 'ArchiveBox'
|
||||
|
||||
import abx
|
||||
|
||||
@abx.hookimpl
|
||||
def get_CONFIG():
|
||||
from .config import CHROME_CONFIG
|
||||
|
||||
return {
|
||||
'CHROME_CONFIG': CHROME_CONFIG
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def get_BINARIES():
|
||||
from .binaries import CHROME_BINARY
|
||||
|
||||
return {
|
||||
'chrome': CHROME_BINARY,
|
||||
}
|
||||
|
||||
@abx.hookimpl
|
||||
def ready():
|
||||
from .config import CHROME_CONFIG
|
||||
CHROME_CONFIG.validate()
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# return {
|
||||
# 'pdf': PDF_EXTRACTOR,
|
||||
# 'screenshot': SCREENSHOT_EXTRACTOR,
|
||||
# 'dom': DOM_EXTRACTOR,
|
||||
# }
|
@@ -1,5 +1,3 @@
__package__ = 'plugins_extractor.chrome'

import os
import platform
from pathlib import Path

@@ -7,21 +5,22 @@ from typing import List, Optional

from pydantic import InstanceOf
from pydantic_pkgr import (
    Binary,
    BinProvider,
    BinName,
    BinaryOverrides,
    bin_abspath,
)

from abx.archivebox.base_binary import BaseBinary, env, apt, brew
import abx

# Depends on Other Plugins:
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from abx_plugin_default_binproviders import apt, brew, env
from abx_plugin_puppeteer.binproviders import PUPPETEER_BINPROVIDER
from abx_plugin_playwright.binproviders import PLAYWRIGHT_BINPROVIDER


from .config import CHROME_CONFIG

CHROMIUM_BINARY_NAMES_LINUX = [
    "chromium",
    "chromium-browser",

@@ -48,12 +47,13 @@ CHROME_BINARY_NAMES_MACOS = [
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS

APT_DEPENDENCIES = [
    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
CHROME_APT_DEPENDENCIES = [
    'apt-transport-https', 'at-spi2-common',
    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
    'chromium-browser',
]


@@ -80,7 +80,7 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
###################### Config ##########################


class ChromeBinary(BaseBinary):
class ChromeBinary(Binary):
    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]

@@ -95,7 +95,7 @@ class ChromeBinary(BaseBinary):
            'packages': ['chromium'],   # playwright install chromium
        },
        apt.name: {
            'packages': APT_DEPENDENCIES,
            'packages': CHROME_APT_DEPENDENCIES,
        },
        brew.name: {
            'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],

@@ -104,10 +104,9 @@ class ChromeBinary(BaseBinary):
    @staticmethod
    def symlink_to_lib(binary, bin_dir=None) -> None:
        from archivebox.config.common import STORAGE_CONFIG
        bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
        bin_dir = bin_dir or abx.pm.hook.get_BIN_DIR()

        if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
        if not (binary.abspath and os.path.isfile(binary.abspath)):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)

@@ -121,7 +120,7 @@ class ChromeBinary(BaseBinary):
            # otherwise on linux we can symlink directly to binary executable
            symlink.unlink(missing_ok=True)
            symlink.symlink_to(binary.abspath)
        except Exception as err:
        except Exception:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
            # not actually needed, we can just run without it
            pass

@@ -132,14 +131,17 @@ class ChromeBinary(BaseBinary):
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

        if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
            lock_file.unlink()
        try:
            linux_lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
            linux_lock_file.unlink(missing_ok=True)
        except Exception:
            pass

        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
                lock_file.unlink()
            try:
                (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink(missing_ok=True)
            except Exception:
                pass
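ChromeBinary above declares which binproviders can supply the chrome executable and how each one should install it. A hedged sketch of the minimal shape of such a Binary subclass, using a simpler illustrative binary; the field names mirror this diff, while load_or_install() is assumed from pydantic_pkgr's usual interface and is left commented out:

from typing import List

from pydantic import InstanceOf
from pydantic_pkgr import Binary, BinName, BinProvider, EnvProvider

env = EnvProvider()                 # resolves binaries already present on $PATH

class WgetBinary(Binary):           # illustrative example, not part of this diff
    name: BinName = 'wget'
    binproviders_supported: List[InstanceOf[BinProvider]] = [env]

# wget = WgetBinary().load_or_install()   # assumed API: resolve abspath + version via the first provider that succeeds
# print(wget.abspath, wget.version)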
@@ -1,5 +1,3 @@
__package__ = 'plugins_extractor.chrome'

import os
from pathlib import Path
from typing import List, Optional

@@ -7,8 +5,8 @@ from typing import List, Optional
from pydantic import Field
from pydantic_pkgr import bin_abspath

from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import env
from abx_spec_config.base_configset import BaseConfigSet
from abx_plugin_default_binproviders import env

from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG

@@ -81,15 +79,16 @@ class ChromeConfig(BaseConfigSet):
    # Chrome Binary
    CHROME_BINARY: str = Field(default='chrome')
    CHROME_DEFAULT_ARGS: List[str] = Field(default=[
        '--virtual-time-budget=15000',
        '--disable-features=DarkMode',
        "--run-all-compositor-stages-before-draw",
        "--hide-scrollbars",
        "--autoplay-policy=no-user-gesture-required",
        "--no-first-run",
        "--use-fake-ui-for-media-stream",
        "--use-fake-device-for-media-stream",
        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
        "--no-first-run",                                    # dont show any first run ui / setup prompts
        '--virtual-time-budget=15000',                       # accellerate any animations on the page by 15s into the future
        '--disable-features=DarkMode',                       # disable dark mode for archiving
        "--run-all-compositor-stages-before-draw",           # dont draw partially rendered content, wait until everything is ready
        "--hide-scrollbars",                                 # hide scrollbars to prevent layout shift / scrollbar visible in screenshots
        "--autoplay-policy=no-user-gesture-required",        # allow media autoplay without user gesture (e.g. on mobile)
        "--use-fake-ui-for-media-stream",                    # provide fake camera if site tries to request camera access
        "--use-fake-device-for-media-stream",                # provide fake camera if site tries to request camera access
        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",   # ignore chrome updates
        "--force-gpu-mem-available-mb=4096",                 # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
    ])
    CHROME_EXTRA_ARGS: List[str] = Field(default=[])

@@ -196,6 +195,7 @@ class ChromeConfig(BaseConfigSet):
            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))

            # if CHROME_USER_DATA_DIR is set but folder is empty, create a new profile inside it
            if not os.path.isfile(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME / 'Preferences'):
                STDERR.print(f'[green] + creating new Chrome profile in: {pretty_path(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME)}[/green]')
                cmd_args.remove('--no-first-run')
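CHROME_DEFAULT_ARGS and CHROME_EXTRA_ARGS are combined into the final command line when a page is archived. A rough sketch of how such a command line comes together; the values below are hypothetical stand-ins for the config fields above, and the screenshot invocation is just one example of a headless run, not ArchiveBox's exact call:

# hypothetical stand-ins for CHROME_CONFIG.CHROME_DEFAULT_ARGS / CHROME_EXTRA_ARGS
default_args = ['--no-first-run', '--hide-scrollbars', '--virtual-time-budget=15000']
extra_args = ['--window-size=1440,2000']           # user-supplied additions

cmd = ['chromium', '--headless', *default_args, *extra_args, '--screenshot', 'https://example.com']
print(' '.join(cmd))
# subprocess.run(cmd, timeout=60)                  # commented out: requires a local chromium install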
archivebox/pkgs/abx-plugin-chrome/pyproject.toml (new file, 18 lines)
@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-chrome"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "abx-spec-config>=0.1.0",
    "abx-spec-pydantic-pkgr>=0.1.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_chrome = "abx_plugin_chrome"
archivebox/pkgs/abx-plugin-curl/README.md (new file, empty)

archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
import abx


@abx.hookimpl
def get_CONFIG():
    from .config import CURL_CONFIG

    return {
        'curl': CURL_CONFIG
    }

@abx.hookimpl
def get_BINARIES():
    from .binaries import CURL_BINARY

    return {
        'curl': CURL_BINARY,
    }
@@ -1,17 +1,17 @@
__package__ = 'plugins_extractor.curl'
__package__ = 'abx_plugin_curl'

from typing import List

from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from pydantic_pkgr import BinProvider, BinName, Binary

from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx_plugin_default_binproviders import apt, brew, env


from .config import CURL_CONFIG


class CurlBinary(BaseBinary):
class CurlBinary(Binary):
    name: BinName = CURL_CONFIG.CURL_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@@ -1,11 +1,11 @@
__package__ = 'plugins_extractor.curl'
__package__ = 'abx_plugin_curl'

from typing import List, Optional
from pathlib import Path

from pydantic import Field

from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet

from archivebox.config.common import ARCHIVING_CONFIG
archivebox/pkgs/abx-plugin-curl/pyproject.toml (new file, 18 lines)
@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-curl"
version = "2024.10.24"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "abx-spec-config>=0.1.0",
    "abx-spec-pydantic-pkgr>=0.1.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_curl = "abx_plugin_curl"
@@ -0,0 +1,23 @@

import abx

from typing import Dict

from pydantic_pkgr import (
    AptProvider,
    BrewProvider,
    EnvProvider,
    BinProvider,
)

apt = APT_BINPROVIDER = AptProvider()
brew = BREW_BINPROVIDER = BrewProvider()
env = ENV_BINPROVIDER = EnvProvider()


@abx.hookimpl(tryfirst=True)
def get_BINPROVIDERS() -> Dict[str, BinProvider]:
    return {
        'apt': APT_BINPROVIDER,
        'brew': BREW_BINPROVIDER,
        'env': ENV_BINPROVIDER,
    }
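Because this hookimpl is registered with tryfirst=True, its apt/brew/env providers are produced ahead of any providers contributed by other plugins. A hedged sketch of how a host might collect them, assuming abx.pm follows pluggy's result-ordering semantics; the merge helper is illustrative, not ArchiveBox's actual code:

import abx

def collect_binproviders() -> dict:
    binproviders = {}
    for result in abx.pm.hook.get_BINPROVIDERS():     # list of dicts, defaults (apt/brew/env) first
        for name, provider in result.items():
            binproviders.setdefault(name, provider)   # first registration of a name wins
    return binproviders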
@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-default-binproviders"
version = "2024.10.24"
description = "Default BinProviders for ABX (apt, brew, env)"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "pydantic-pkgr>=0.5.4",
    "abx-spec-pydantic-pkgr>=0.1.0",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_default_binproviders = "abx_plugin_default_binproviders"
archivebox/pkgs/abx-plugin-favicon/README.md (new file, empty)

@@ -0,0 +1,29 @@
__label__ = 'Favicon'
__version__ = '2024.10.24'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = [
    'abx>=0.1.0',
    'abx-spec-config>=0.1.0',
    'abx-plugin-curl-extractor>=2024.10.24',
]

import abx


@abx.hookimpl
def get_CONFIG():
    from .config import FAVICON_CONFIG

    return {
        'FAVICON_CONFIG': FAVICON_CONFIG
    }


# @abx.hookimpl
# def get_EXTRACTORS():
#     from .extractors import FAVICON_EXTRACTOR

#     return {
#         'favicon': FAVICON_EXTRACTOR,
#     }
@@ -1,7 +1,4 @@
__package__ = 'plugins_extractor.favicon'


from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet


class FaviconConfig(BaseConfigSet):
archivebox/pkgs/abx-plugin-favicon/pyproject.toml (new file, 18 lines)
@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-favicon"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "abx-spec-config>=0.1.0",
    "abx-plugin-curl>=2024.10.28",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_favicon = "abx_plugin_favicon"
archivebox/pkgs/abx-plugin-git/README.md (new file, empty)

archivebox/pkgs/abx-plugin-git/abx_plugin_git/__init__.py (new file, 29 lines)
@@ -0,0 +1,29 @@
__package__ = 'abx_plugin_git'
__label__ = 'Git'

import abx


@abx.hookimpl
def get_CONFIG():
    from .config import GIT_CONFIG

    return {
        'GIT_CONFIG': GIT_CONFIG
    }

@abx.hookimpl
def get_BINARIES():
    from .binaries import GIT_BINARY

    return {
        'git': GIT_BINARY,
    }

@abx.hookimpl
def get_EXTRACTORS():
    from .extractors import GIT_EXTRACTOR

    return {
        'git': GIT_EXTRACTOR,
    }
@@ -1,17 +1,17 @@
__package__ = 'plugins_extractor.git'
__package__ = 'abx_plugin_git'

from typing import List

from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from pydantic_pkgr import BinProvider, BinName, Binary

from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx_plugin_default_binproviders import apt, brew, env

from .config import GIT_CONFIG



class GitBinary(BaseBinary):
class GitBinary(Binary):
    name: BinName = GIT_CONFIG.GIT_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@@ -1,10 +1,10 @@
__package__ = 'plugins_extractor.git'
__package__ = 'abx_plugin_git'

from typing import List

from pydantic import Field

from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet

from archivebox.config.common import ARCHIVING_CONFIG
archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py (new file, 15 lines)
@@ -0,0 +1,15 @@
__package__ = 'abx_plugin_git'

# from pathlib import Path

# from .binaries import GIT_BINARY


# class GitExtractor(BaseExtractor):
#     name: ExtractorName = 'git'
#     binary: str = GIT_BINARY.name

#     def get_output_path(self, snapshot) -> Path | None:
#         return snapshot.as_link() / 'git'

# GIT_EXTRACTOR = GitExtractor()
@@ -16,8 +16,8 @@ from archivebox.misc.util import (
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError

from archivebox.plugins_extractor.git.config import GIT_CONFIG
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
from abx_plugin_git.config import GIT_CONFIG
from abx_plugin_git.binaries import GIT_BINARY


def get_output_path():
archivebox/pkgs/abx-plugin-git/pyproject.toml (new file, 19 lines)
@@ -0,0 +1,19 @@
[project]
name = "abx-plugin-git"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
    "abx>=0.1.0",
    "abx-spec-config>=0.1.0",
    "abx-spec-pydantic-pkgr>=0.1.0",
    "abx-plugin-default-binproviders>=2024.10.24",
]

[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"

[project.entry-points.abx]
abx_plugin_git = "abx_plugin_git"
archivebox/pkgs/abx-plugin-htmltotext/README.md (new file, empty)

@@ -0,0 +1,22 @@
__package__ = 'abx_plugin_htmltotext'
__label__ = 'HTML-to-Text'

import abx


@abx.hookimpl
def get_CONFIG():
    from .config import HTMLTOTEXT_CONFIG

    return {
        'HTMLTOTEXT_CONFIG': HTMLTOTEXT_CONFIG
    }


# @abx.hookimpl
# def get_EXTRACTORS():
#     from .extractors import FAVICON_EXTRACTOR

#     return {
#         'htmltotext': FAVICON_EXTRACTOR,
#     }
@@ -1,7 +1,4 @@
__package__ = 'plugins_extractor.htmltotext'


from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet


class HtmltotextConfig(BaseConfigSet):
Some files were not shown because too many files have changed in this diff.