v0.8.6-rc: Moving plugins to independent python packages with finite state machine interfaces (#1576)

Nick Sweeting 2024-11-03 15:57:12 -05:00 committed by GitHub
commit b7b3addbab
GPG key ID: B5690EEEBB952194
261 changed files with 5687 additions and 2995 deletions


@ -102,7 +102,7 @@ jobs:
# TODO: remove this exception for windows once we get tests passing on that platform
if: ${{ !contains(matrix.os, 'windows') }}
run: |
python -m pytest -s --basetemp=tests/out --ignore=archivebox/vendor --ignore=deb_dist --ignore=pip_dist --ignore=brew_dist
python -m pytest -s --basetemp=tests/out --ignore=archivebox/pkgs
docker_tests:
runs-on: ubuntu-latest

6
.gitmodules vendored

@ -1,9 +1,3 @@
[submodule "docs"]
path = docs
url = https://github.com/ArchiveBox/ArchiveBox.wiki.git
[submodule "archivebox/vendor/pocket"]
path = archivebox/vendor/pocket
url = https://github.com/tapanpandita/pocket
[submodule "archivebox/vendor/pydantic-pkgr"]
path = archivebox/vendor/pydantic-pkgr
url = https://github.com/ArchiveBox/pydantic-pkgr


@ -3,4 +3,4 @@ ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E
select = F,E9,W
max-line-length = 130
max-complexity = 10
exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv
exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv,data,data*


@ -13,8 +13,8 @@ __package__ = 'archivebox'
import os
import sys
from pathlib import Path
from typing import cast
ASCII_LOGO = """
@ -47,11 +47,54 @@ from .monkey_patches import * # noqa
# print('LOADING VENDORED LIBRARIES')
from .vendor import load_vendored_libs # noqa
load_vendored_libs()
from .pkgs import load_vendored_pkgs # noqa
load_vendored_pkgs()
# print('DONE LOADING VENDORED LIBRARIES')
# Load ABX Plugin Specifications + Default Implementations
import abx # noqa
import abx_spec_archivebox # noqa
import abx_spec_config # noqa
import abx_spec_pydantic_pkgr # noqa
import abx_spec_django # noqa
import abx_spec_searchbackend # noqa
abx.pm.add_hookspecs(abx_spec_config.PLUGIN_SPEC)
abx.pm.register(abx_spec_config.PLUGIN_SPEC())
abx.pm.add_hookspecs(abx_spec_pydantic_pkgr.PLUGIN_SPEC)
abx.pm.register(abx_spec_pydantic_pkgr.PLUGIN_SPEC())
abx.pm.add_hookspecs(abx_spec_django.PLUGIN_SPEC)
abx.pm.register(abx_spec_django.PLUGIN_SPEC())
abx.pm.add_hookspecs(abx_spec_searchbackend.PLUGIN_SPEC)
abx.pm.register(abx_spec_searchbackend.PLUGIN_SPEC())
# Cast to ArchiveBoxPluginSpec to enable static type checking of pm.hook.call() methods
abx.pm = cast(abx.ABXPluginManager[abx_spec_archivebox.ArchiveBoxPluginSpec], abx.pm)
pm = abx.pm
# Load all pip-installed ABX-compatible plugins
ABX_ECOSYSTEM_PLUGINS = abx.get_pip_installed_plugins(group='abx')
# Load all built-in ArchiveBox plugins
ARCHIVEBOX_BUILTIN_PLUGINS = {
'config': PACKAGE_DIR / 'config',
'core': PACKAGE_DIR / 'core',
# 'search': PACKAGE_DIR / 'search',
# 'core': PACKAGE_DIR / 'core',
}
# Load all user-defined ArchiveBox plugins
USER_PLUGINS = abx.find_plugins_in_dir(Path(os.getcwd()) / 'user_plugins')
# Import all plugins and register them with ABX Plugin Manager
ALL_PLUGINS = {**ABX_ECOSYSTEM_PLUGINS, **ARCHIVEBOX_BUILTIN_PLUGINS, **USER_PLUGINS}
LOADED_PLUGINS = abx.load_plugins(ALL_PLUGINS)
# Setup basic config, constants, paths, and version
from .config.constants import CONSTANTS # noqa
from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .config.version import VERSION # noqa
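For context, here is a hedged sketch of what one of these pip-installed ABX plugins looks like from the loader's point of view. The package name abx_plugin_example and its contents are illustrative, not part of this commit: the package declares an entry point in the 'abx' group (e.g. [project.entry-points.abx] example = "abx_plugin_example" in its pyproject.toml) so get_pip_installed_plugins() can find it, and its module exposes @abx.hookimpl functions that fire once load_plugins() registers it.

# abx_plugin_example/__init__.py   (hypothetical third-party plugin package)
__package__ = 'abx_plugin_example'
__label__ = 'Example Plugin'

import abx


@abx.hookimpl
def get_CONFIG():
    # contributes no config of its own, just demonstrates the hook shape
    return {}


@abx.hookimpl
def ready():
    # called once Django finishes setting up (see abx_spec_django above)
    print('[+] abx_plugin_example loaded')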


@ -1,131 +0,0 @@
__package__ = 'abx'
import importlib
from pathlib import Path
from typing import Dict, Callable, List
from . import hookspec as base_spec
from abx.hookspec import hookimpl, hookspec # noqa
from abx.manager import pm, PluginManager # noqa
pm.add_hookspecs(base_spec)
###### PLUGIN DISCOVERY AND LOADING ########################################################
def get_plugin_order(plugin_entrypoint: Path):
order = 999
try:
# if .plugin_order file exists, use it to set the load priority
order = int((plugin_entrypoint.parent / '.plugin_order').read_text())
except FileNotFoundError:
pass
return (order, plugin_entrypoint)
def register_hookspecs(hookspecs: List[str]):
"""
Register all the hookspecs from a list of module names.
"""
for hookspec_import_path in hookspecs:
hookspec_module = importlib.import_module(hookspec_import_path)
pm.add_hookspecs(hookspec_module)
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
"""
Find all the plugins in a given directory. Just looks for an __init__.py file.
"""
return {
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
if plugin_entrypoint.parent.name != 'abx'
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
def get_pip_installed_plugins(group='abx'):
"""replaces pm.load_setuptools_entrypoints("abx"), finds plugins that registered entrypoints via pip"""
import importlib.metadata
DETECTED_PLUGINS = {} # module_name: module_dir_path
for dist in list(importlib.metadata.distributions()):
for entrypoint in dist.entry_points:
if entrypoint.group != group or pm.is_blocked(entrypoint.name):
continue
DETECTED_PLUGINS[entrypoint.name] = Path(entrypoint.load().__file__).parent
# pm.register(plugin, name=ep.name)
# pm._plugin_distinfo.append((plugin, DistFacade(dist)))
return DETECTED_PLUGINS
def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]):
"""
Get the mapping of dir_name: {plugin_id: plugin_dir} for all plugins in the given directories.
"""
DETECTED_PLUGINS = {}
for plugin_prefix, plugin_dir in plugin_dirs.items():
DETECTED_PLUGINS.update(find_plugins_in_dir(plugin_dir, prefix=plugin_prefix))
return DETECTED_PLUGINS
# Load all plugins from pip packages, archivebox built-ins, and user plugins
def load_plugins(plugins_dict: Dict[str, Path]):
"""
Load all the plugins from a dictionary of module names and directory paths.
"""
LOADED_PLUGINS = {}
for plugin_module, plugin_dir in plugins_dict.items():
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
plugin_module_loaded = importlib.import_module(plugin_module)
pm.register(plugin_module_loaded)
LOADED_PLUGINS[plugin_module] = plugin_module_loaded.PLUGIN
# print(f' √ Loaded plugin: {plugin_module}')
return LOADED_PLUGINS
def get_registered_plugins():
"""
Get all the plugins registered with Pluggy.
"""
plugins = {}
plugin_to_distinfo = dict(pm.list_plugin_distinfo())
for plugin in pm.get_plugins():
plugin_info = {
"name": plugin.__name__,
"hooks": [h.name for h in pm.get_hookcallers(plugin) or ()],
}
distinfo = plugin_to_distinfo.get(plugin)
if distinfo:
plugin_info["version"] = distinfo.version
plugin_info["name"] = (
getattr(distinfo, "name", None) or distinfo.project_name
)
plugins[plugin_info["name"]] = plugin_info
return plugins
def get_plugin_hooks(plugin_pkg: str | None) -> Dict[str, Callable]:
"""
Get all the functions marked with @hookimpl on a module.
"""
if not plugin_pkg:
return {}
hooks = {}
plugin_module = importlib.import_module(plugin_pkg)
for attr_name in dir(plugin_module):
if attr_name.startswith('_'):
continue
try:
attr = getattr(plugin_module, attr_name)
if isinstance(attr, Callable):
hooks[attr_name] = None
pm.parse_hookimpl_opts(plugin_module, attr_name)
hooks[attr_name] = attr
except Exception as e:
print(f'Error getting hookimpls for {plugin_pkg}: {e}')
return hooks
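To make the discovery and ordering contract above concrete, a user plugin that find_plugins_in_dir() and load_plugins() would pick up could look like the following sketch. The directory name my_plugin, the PLUGIN dict, and the .plugin_order value are all illustrative; load_plugins() only requires that the module import cleanly and expose a PLUGIN attribute.

# user_plugins/my_plugin/.plugin_order   -> optional, contains a single integer, e.g. 10
# user_plugins/my_plugin/__init__.py
__package__ = 'my_plugin'

from abx import hookimpl

# read by load_plugins() right after the module is imported and registered
PLUGIN = {'my_plugin': {'id': 'my_plugin', 'package': __package__}}


@hookimpl
def get_PLUGIN():
    return PLUGIN


@hookimpl
def get_EXTRACTORS():
    return {}   # contributes no extractors; shown only to illustrate the hook shape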


@ -1,30 +0,0 @@
__package__ = 'abx.archivebox'
import os
import importlib
from typing import Dict
from pathlib import Path
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
LOADED_PLUGINS = {}
for plugin_module, plugin_dir in reversed(plugins_dict.items()):
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
# 1. register the plugin module directly in case it contains any loose hookimpls (e.g. in __init__.py)
try:
plugin_module_loaded = importlib.import_module(plugin_module)
pm.register(plugin_module_loaded)
except Exception as e:
print(f'Error registering plugin: {plugin_module} - {e}')
# 2. then try to import plugin_module.apps as well
if os.access(plugin_dir / 'apps.py', os.R_OK):
plugin_apps = importlib.import_module(plugin_module + '.apps')
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
# print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
return LOADED_PLUGINS


@ -1,106 +0,0 @@
__package__ = "abx.archivebox"
import os
from typing import Optional, cast
from typing_extensions import Self
from pydantic import validate_call
from pydantic_pkgr import (
Binary,
BinProvider,
BinProviderName,
AptProvider,
BrewProvider,
EnvProvider,
)
from archivebox.config.permissions import ARCHIVEBOX_USER
class BaseBinProvider(BinProvider):
# TODO: add install/load/load_or_install methods as abx.hookimpl methods
@property
def admin_url(self) -> str:
# e.g. /admin/environment/binproviders/NpmBinProvider/ TODO
return "/admin/environment/binaries/"
class BaseBinary(Binary):
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:
from archivebox.config.common import STORAGE_CONFIG
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
return
try:
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
symlink.chmod(0o777) # make sure it's executable by everyone
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@validate_call
def load(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
if fresh:
binary = super().load(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
else:
# get cached binary from db
try:
from machine.models import InstalledBinary
installed_binary = InstalledBinary.objects.get_from_db_or_cache(self) # type: ignore
binary = InstalledBinary.load_from_db(installed_binary)
except Exception:
# maybe we are not in a DATA dir so there is no db, fallback to reading from fs
# (e.g. when archivebox version is run outside of a DATA dir)
binary = super().load(**kwargs)
return cast(Self, binary)
@validate_call
def install(self, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
binary = super().install(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
@validate_call
def load_or_install(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
try:
binary = self.load(fresh=fresh)
if binary and binary.version:
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
except Exception:
pass
return self.install(**kwargs)
@property
def admin_url(self) -> str:
# e.g. /admin/environment/config/LdapConfig/
return f"/admin/environment/binaries/{self.name}/"
class AptBinProvider(AptProvider, BaseBinProvider):
name: BinProviderName = "apt"
class BrewBinProvider(BrewProvider, BaseBinProvider):
name: BinProviderName = "brew"
class EnvBinProvider(EnvProvider, BaseBinProvider):
name: BinProviderName = "env"
euid: Optional[int] = ARCHIVEBOX_USER
apt = AptBinProvider()
brew = BrewBinProvider()
env = EnvBinProvider()
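As a usage sketch (not code from this diff), a plugin-defined binary built on these base classes might look like the following. The 'wget' name is illustrative, and binproviders_supported/BinName are assumed to come from pydantic_pkgr as it is used elsewhere in this commit:

from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinName, BinProvider

class WgetBinary(BaseBinary):
    name: BinName = 'wget'
    # try the system package managers first, then fall back to whatever is on $PATH
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

WGET_BINARY = WgetBinary()
# WGET_BINARY.load_or_install() resolves abspath/version/sha256 via apt/brew/env
# and symlinks the result into LIB_DIR/bin as implemented in BaseBinary above.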


@ -1,219 +0,0 @@
__package__ = 'abx.archivebox'
import json
import os
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
from typing_extensions import Self
from pathlib import Path
from pydantic import model_validator, AfterValidator
from pydantic_pkgr import BinName
from django.utils.functional import cached_property
from django.utils import timezone
import abx
from .base_binary import BaseBinary
def no_empty_args(args: List[str]) -> List[str]:
assert all(len(arg) for arg in args)
return args
ExtractorName = Literal['wget', 'warc', 'media', 'singlefile'] | str
HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
class BaseExtractor:
name: ExtractorName
binary: BinName
output_path_func: HandlerFuncStr = 'self.get_output_path'
should_extract_func: HandlerFuncStr = 'self.should_extract'
extract_func: HandlerFuncStr = 'self.extract'
exec_func: HandlerFuncStr = 'self.exec'
default_args: CmdArgsList = []
extra_args: CmdArgsList = []
args: Optional[CmdArgsList] = None
@model_validator(mode='after')
def validate_model(self) -> Self:
if self.args is None:
self.args = [*self.default_args, *self.extra_args]
return self
def get_output_path(self, snapshot) -> Path:
return Path(self.__class__.__name__.lower())
def should_extract(self, uri: str, config: dict | None=None) -> bool:
try:
assert self.detect_installed_binary().version
except Exception:
raise
# could not load binary
return False
# output_dir = self.get_output_path(snapshot)
# if output_dir.glob('*.*'):
# return False
return True
@abx.hookimpl
def extract(self, snapshot_id: str) -> Dict[str, Any]:
from core.models import Snapshot
from archivebox import CONSTANTS
snapshot = Snapshot.objects.get(id=snapshot_id)
if not self.should_extract(snapshot):
return {}
status = 'failed'
start_ts = timezone.now()
uplink = self.detect_network_interface()
installed_binary = self.detect_installed_binary()
machine = installed_binary.machine
assert uplink.machine == installed_binary.machine # it would be *very* weird if this wasn't true
output_dir = CONSTANTS.DATA_DIR / '.tmp' / 'extractors' / self.name / str(snapshot.abid)
output_dir.mkdir(parents=True, exist_ok=True)
# execute the extractor binary with the given args
args = [snapshot.url, *self.args] if self.args is not None else [snapshot.url, *self.default_args, *self.extra_args]
cmd = [str(installed_binary.abspath), *args]
proc = self.exec(installed_binary=installed_binary, args=args, cwd=output_dir)
# collect the output
end_ts = timezone.now()
output_files = list(str(path.relative_to(output_dir)) for path in output_dir.glob('**/*.*'))
stdout = proc.stdout.strip()
stderr = proc.stderr.strip()
output_json = None
output_text = stdout
try:
output_json = json.loads(stdout.strip())
output_text = None
except json.JSONDecodeError:
pass
errors = []
if proc.returncode == 0:
status = 'success'
else:
errors.append(f'{installed_binary.name} returned non-zero exit code: {proc.returncode}')
# increment health stats counters
if status == 'success':
machine.record_health_success()
uplink.record_health_success()
installed_binary.record_health_success()
else:
machine.record_health_failure()
uplink.record_health_failure()
installed_binary.record_health_failure()
return {
'extractor': self.name,
'snapshot': {
'id': snapshot.id,
'abid': snapshot.abid,
'url': snapshot.url,
'created_by_id': snapshot.created_by_id,
},
'machine': {
'id': machine.id,
'abid': machine.abid,
'guid': machine.guid,
'hostname': machine.hostname,
'hw_in_docker': machine.hw_in_docker,
'hw_in_vm': machine.hw_in_vm,
'hw_manufacturer': machine.hw_manufacturer,
'hw_product': machine.hw_product,
'hw_uuid': machine.hw_uuid,
'os_arch': machine.os_arch,
'os_family': machine.os_family,
'os_platform': machine.os_platform,
'os_release': machine.os_release,
'os_kernel': machine.os_kernel,
},
'uplink': {
'id': uplink.id,
'abid': uplink.abid,
'mac_address': uplink.mac_address,
'ip_public': uplink.ip_public,
'ip_local': uplink.ip_local,
'dns_server': uplink.dns_server,
'hostname': uplink.hostname,
'iface': uplink.iface,
'isp': uplink.isp,
'city': uplink.city,
'region': uplink.region,
'country': uplink.country,
},
'binary': {
'id': installed_binary.id,
'abid': installed_binary.abid,
'name': installed_binary.name,
'binprovider': installed_binary.binprovider,
'abspath': installed_binary.abspath,
'version': installed_binary.version,
'sha256': installed_binary.sha256,
},
'cmd': cmd,
'stdout': stdout,
'stderr': stderr,
'returncode': proc.returncode,
'start_ts': start_ts,
'end_ts': end_ts,
'status': status,
'errors': errors,
'output_dir': str(output_dir.relative_to(CONSTANTS.DATA_DIR)),
'output_files': output_files,
'output_json': output_json or {},
'output_text': output_text or '',
}
# TODO: move this to a hookimpl
def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
cwd = cwd or Path(os.getcwd())
binary = self.load_binary(installed_binary=installed_binary)
return binary.exec(cmd=args, cwd=cwd)
@cached_property
def BINARY(self) -> BaseBinary:
import abx.archivebox.reads
for binary in abx.archivebox.reads.get_BINARIES().values():
if binary.name == self.binary:
return binary
raise ValueError(f'Binary {self.binary} not found')
def detect_installed_binary(self):
from machine.models import InstalledBinary
# hydrates binary from DB/cache if record of installed version is recent enough
# otherwise it finds it from scratch by detecting installed version/abspath/sha256 on host
return InstalledBinary.objects.get_from_db_or_cache(self.BINARY)
def load_binary(self, installed_binary=None) -> BaseBinary:
installed_binary = installed_binary or self.detect_installed_binary()
return installed_binary.load_from_db()
def detect_network_interface(self):
from machine.models import NetworkInterface
return NetworkInterface.objects.current()
@abx.hookimpl
def get_EXTRACTORS(self):
return [self]
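For orientation, a minimal concrete extractor built on this base class might look like the sketch below; the extractor name, binary, and args are illustrative, and the extract()/exec() plumbing shown above does the actual work:

from pathlib import Path

class FaviconExtractor(BaseExtractor):
    name: ExtractorName = 'favicon'
    binary: BinName = 'curl'                           # resolved via get_BINARIES() at runtime
    default_args: CmdArgsList = ['--silent', '--max-time', '10', '--output', 'favicon.ico']

    def get_output_path(self, snapshot) -> Path:
        return Path('favicon.ico')

    def should_extract(self, uri: str, config: dict | None = None) -> bool:
        # only attempt http(s) URLs; binary availability is checked separately
        return uri.startswith('http')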


@ -1,25 +0,0 @@
__package__ = 'abx.archivebox'
import abx
class BaseReplayer:
"""Describes how to render an ArchiveResult in several contexts"""
url_pattern: str = '*'
row_template: str = 'plugins/generic_replayer/templates/row.html'
embed_template: str = 'plugins/generic_replayer/templates/embed.html'
fullpage_template: str = 'plugins/generic_replayer/templates/fullpage.html'
# row_view: LazyImportStr = 'plugins.generic_replayer.views.row_view'
# embed_view: LazyImportStr = 'plugins.generic_replayer.views.embed_view'
# fullpage_view: LazyImportStr = 'plugins.generic_replayer.views.fullpage_view'
# icon_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
# thumbnail_view: LazyImportStr = 'plugins.generic_replayer.views.get_icon'
@abx.hookimpl
def get_REPLAYERS(self):
return [self]
# TODO: add hookimpl methods for get_row_template, get_embed_template, get_fullpage_template, etc...


@ -1,25 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Iterable, List
import abc
class BaseSearchBackend(abc.ABC):
name: str
@staticmethod
@abc.abstractmethod
def index(snapshot_id: str, texts: List[str]):
return
@staticmethod
@abc.abstractmethod
def flush(snapshot_ids: Iterable[str]):
return
@staticmethod
@abc.abstractmethod
def search(text: str) -> List[str]:
raise NotImplementedError("search method must be implemented by subclass")
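To illustrate the contract, a toy in-memory backend satisfying this ABC could look like this (purely illustrative, not a backend shipped by this commit):

from typing import Dict, Iterable, List

class InMemorySearchBackend(BaseSearchBackend):
    name: str = 'inmemory'
    _index: Dict[str, List[str]] = {}

    @staticmethod
    def index(snapshot_id: str, texts: List[str]):
        InMemorySearchBackend._index[snapshot_id] = list(texts)

    @staticmethod
    def flush(snapshot_ids: Iterable[str]):
        for snapshot_id in snapshot_ids:
            InMemorySearchBackend._index.pop(snapshot_id, None)

    @staticmethod
    def search(text: str) -> List[str]:
        return [sid for sid, texts in InMemorySearchBackend._index.items()
                if any(text in t for t in texts)]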


@ -1,52 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Dict, Any
from .. import hookspec
from .base_binary import BaseBinary, BaseBinProvider
from .base_configset import BaseConfigSet
from .base_extractor import BaseExtractor
from .base_searchbackend import BaseSearchBackend
@hookspec
def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
return {}
@hookspec
def get_CONFIG() -> Dict[str, BaseConfigSet]:
return {}
@hookspec
def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
return {}
@hookspec
def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
return {}
# @hookspec
# def get_REPLAYERS() -> Dict[str, BaseReplayer]:
# return {}
# @hookspec
# def get_ADMINDATAVIEWS():
# return {}
# @hookspec
# def get_QUEUES():
# return {}
##############################################################
# provided by abx.pydantic_pkgr.hookspec:
# @hookspec
# def get_BINARIES() -> Dict[str, BaseBinary]:
# return {}
# @hookspec
# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
# return {}


@ -1,160 +0,0 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, Set, Any, TYPE_CHECKING
from benedict import benedict
import abx
from .. import pm
if TYPE_CHECKING:
from .base_configset import BaseConfigSet
from .base_binary import BaseBinary, BaseBinProvider
from .base_extractor import BaseExtractor
from .base_searchbackend import BaseSearchBackend
# from .base_replayer import BaseReplayer
# from .base_queue import BaseQueue
# from .base_admindataview import BaseAdminDataView
# API exposed to ArchiveBox code
def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
return benedict({
plugin_id: plugin
for plugin_dict in pm.hook.get_PLUGIN()
for plugin_id, plugin in plugin_dict.items()
})
def get_PLUGIN(plugin_id: str) -> Dict[str, Any]:
plugin_info = get_PLUGINS().get(plugin_id, {})
package = plugin_info.get('package', plugin_info.get('PACKAGE', None))
if not package:
return {'id': plugin_id, 'hooks': {}}
module = importlib.import_module(package)
hooks = abx.get_plugin_hooks(module.__package__)
assert plugin_info and (plugin_info.get('id') or plugin_info.get('ID') or hooks)
return benedict({
'id': plugin_id,
'label': getattr(module, '__label__', plugin_id),
'module': module,
'package': module.__package__,
'hooks': hooks,
'version': getattr(module, '__version__', '999.999.999'),
'author': getattr(module, '__author__', 'Unknown'),
'homepage': getattr(module, '__homepage__', 'https://github.com/ArchiveBox/ArchiveBox'),
'dependencies': getattr(module, '__dependencies__', []),
'source_code': module.__file__,
**plugin_info,
})
def get_HOOKS() -> Set[str]:
return {
hook_name
for plugin_id in get_PLUGINS().keys()
for hook_name in get_PLUGIN(plugin_id).hooks
}
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
return benedict({
config_id: configset
for plugin_configs in pm.hook.get_CONFIG()
for config_id, configset in plugin_configs.items()
})
def get_FLAT_CONFIG() -> Dict[str, Any]:
return benedict({
key: value
for configset in get_CONFIGS().values()
for key, value in configset.model_dump().items()
})
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
# TODO: move these to plugins
from abx.archivebox.base_binary import apt, brew, env
builtin_binproviders = {
'env': env,
'apt': apt,
'brew': brew,
}
return benedict({
binprovider_id: binprovider
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
for binprovider_id, binprovider in plugin_binproviders.items()
})
def get_BINARIES() -> Dict[str, 'BaseBinary']:
return benedict({
binary_id: binary
for plugin_binaries in pm.hook.get_BINARIES()
for binary_id, binary in plugin_binaries.items()
})
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
return benedict({
extractor_id: extractor
for plugin_extractors in pm.hook.get_EXTRACTORS()
for extractor_id, extractor in plugin_extractors.items()
})
# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
# return benedict({
# replayer.id: replayer
# for plugin_replayers in pm.hook.get_REPLAYERS()
# for replayer in plugin_replayers
# })
# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
# return benedict({
# admin_dataview.id: admin_dataview
# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
# for admin_dataview in plugin_admin_dataviews
# })
# def get_QUEUES() -> Dict[str, 'BaseQueue']:
# return benedict({
# queue.id: queue
# for plugin_queues in pm.hook.get_QUEUES()
# for queue in plugin_queues
# })
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
return benedict({
searchbackend_id: searchbackend
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
for searchbackend_id,searchbackend in plugin_searchbackends.items()
})
def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
"""Get all the relevant config for the given scope, in correct precedence order"""
from django.conf import settings
default_config: benedict = defaults or settings.CONFIG
snapshot = snapshot or (archiveresult and archiveresult.snapshot)
crawl = crawl or (snapshot and snapshot.crawl)
seed = seed or (crawl and crawl.seed)
persona = persona or (crawl and crawl.persona)
persona_config = persona.config if persona else {}
seed_config = seed.config if seed else {}
crawl_config = crawl.config if crawl else {}
snapshot_config = snapshot.config if snapshot else {}
archiveresult_config = archiveresult.config if archiveresult else {}
extra_config = extra_config or {}
return {
**default_config, # defaults / config file / environment variables
**persona_config, # lowest precedence
**seed_config,
**crawl_config,
**snapshot_config,
**archiveresult_config,
**extra_config, # highest precedence
}
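A quick worked example of the precedence implemented above (all values made up): if the defaults set TIMEOUT=60, the crawl config sets TIMEOUT=120, and extra_config sets TIMEOUT=30, later dicts in the merge win, so the scoped value resolves to 30.

merged = {
    **{'TIMEOUT': 60},     # defaults / config file / environment variables
    **{'TIMEOUT': 120},    # crawl-level config
    **{'TIMEOUT': 30},     # extra_config, highest precedence
}
assert merged['TIMEOUT'] == 30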


@ -1 +0,0 @@
__package__ = 'abx.django'


@ -1,13 +0,0 @@
__package__ = 'abx.django'
from django.apps import AppConfig
class ABXConfig(AppConfig):
name = 'abx'
def ready(self):
import abx
from django.conf import settings
abx.pm.hook.ready(settings=settings)


@ -1,125 +0,0 @@
__package__ = 'abx.django'
from ..hookspec import hookspec
###########################################################################################
@hookspec
def get_INSTALLED_APPS():
"""Return a list of apps to add to INSTALLED_APPS"""
# e.g. ['your_plugin_type.plugin_name']
return []
# @hookspec
# def register_INSTALLED_APPS(INSTALLED_APPS):
# """Mutate INSTALLED_APPS in place to add your app in a specific position"""
# # idx_of_contrib = INSTALLED_APPS.index('django.contrib.auth')
# # INSTALLED_APPS.insert(idx_of_contrib + 1, 'your_plugin_type.plugin_name')
# pass
@hookspec
def get_TEMPLATE_DIRS():
return [] # e.g. ['your_plugin_type/plugin_name/templates']
# @hookspec
# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
# """Install django settings"""
# # e.g. TEMPLATE_DIRS.insert(0, 'your_plugin_type/plugin_name/templates')
# pass
@hookspec
def get_STATICFILES_DIRS():
return [] # e.g. ['your_plugin_type/plugin_name/static']
# @hookspec
# def register_STATICFILES_DIRS(STATICFILES_DIRS):
# """Mutate STATICFILES_DIRS in place to add your static dirs in a specific position"""
# # e.g. STATICFILES_DIRS.insert(0, 'your_plugin_type/plugin_name/static')
# pass
@hookspec
def get_MIDDLEWARE():
return [] # e.g. ['your_plugin_type.plugin_name.middleware.YourMiddleware']
# @hookspec
# def register_MIDDLEWARE(MIDDLEWARE):
# """Mutate MIDDLEWARE in place to add your middleware in a specific position"""
# # e.g. MIDDLEWARE.insert(0, 'your_plugin_type.plugin_name.middleware.YourMiddleware')
# pass
@hookspec
def get_AUTHENTICATION_BACKENDS():
return [] # e.g. ['django_auth_ldap.backend.LDAPBackend']
# @hookspec
# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
# """Mutate AUTHENTICATION_BACKENDS in place to add your auth backends in a specific position"""
# # e.g. AUTHENTICATION_BACKENDS.insert(0, 'your_plugin_type.plugin_name.backend.YourBackend')
# pass
@hookspec
def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME):
return [] # e.g. [{'name': 'your_plugin_type.plugin_name', 'HUEY': {...}}]
# @hookspec
# def register_DJANGO_HUEY(DJANGO_HUEY):
# """Mutate DJANGO_HUEY in place to add your huey queues in a specific position"""
# # e.g. DJANGO_HUEY['queues']['some_queue_name']['some_setting'] = 'some_value'
# pass
@hookspec
def get_ADMIN_DATA_VIEWS_URLS():
return []
# @hookspec
# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
# """Mutate ADMIN_DATA_VIEWS in place to add your admin data views in a specific position"""
# # e.g. ADMIN_DATA_VIEWS['URLS'].insert(0, 'your_plugin_type/plugin_name/admin_data_views.py')
# pass
# @hookspec
# def register_settings(settings):
# """Mutate settings in place to add your settings / modify existing settings"""
# # settings.SOME_KEY = 'some_value'
# pass
###########################################################################################
@hookspec
def get_urlpatterns():
return [] # e.g. [path('your_plugin_type/plugin_name/url.py', your_view)]
# @hookspec
# def register_urlpatterns(urlpatterns):
# """Mutate urlpatterns in place to add your urlpatterns in a specific position"""
# # e.g. urlpatterns.insert(0, path('your_plugin_type/plugin_name/url.py', your_view))
# pass
###########################################################################################
@hookspec
def register_checks():
"""Register django checks with django system checks system"""
pass
@hookspec
def register_admin(admin_site):
"""Register django admin views/models with the main django admin site instance"""
pass
###########################################################################################
@hookspec
def ready():
"""Called when Django apps app.ready() are triggered"""
pass
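As a hedged sketch, a plugin implementing a few of these Django hookspecs might look like the following; the app label, template path, and view module are hypothetical:

import abx

@abx.hookimpl
def get_INSTALLED_APPS():
    # add this plugin's Django app so its models/admin/migrations are discovered
    return ['abx_plugin_example']

@abx.hookimpl
def get_TEMPLATE_DIRS():
    return ['abx_plugin_example/templates']

@abx.hookimpl
def get_urlpatterns():
    from django.urls import path
    from abx_plugin_example.views import example_view   # hypothetical view
    return [path('plugins/example/', example_view)]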


@ -1,101 +0,0 @@
__package__ = 'abx.django'
import itertools
# from benedict import benedict
from .. import pm
def get_INSTALLED_APPS():
return itertools.chain(*reversed(pm.hook.get_INSTALLED_APPS()))
# def register_INSTALLLED_APPS(INSTALLED_APPS):
# pm.hook.register_INSTALLED_APPS(INSTALLED_APPS=INSTALLED_APPS)
def get_MIDDLEWARES():
return itertools.chain(*reversed(pm.hook.get_MIDDLEWARE()))
# def register_MIDDLEWARES(MIDDLEWARE):
# pm.hook.register_MIDDLEWARE(MIDDLEWARE=MIDDLEWARE)
def get_AUTHENTICATION_BACKENDS():
return itertools.chain(*reversed(pm.hook.get_AUTHENTICATION_BACKENDS()))
# def register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS):
# pm.hook.register_AUTHENTICATION_BACKENDS(AUTHENTICATION_BACKENDS=AUTHENTICATION_BACKENDS)
def get_STATICFILES_DIRS():
return itertools.chain(*reversed(pm.hook.get_STATICFILES_DIRS()))
# def register_STATICFILES_DIRS(STATICFILES_DIRS):
# pm.hook.register_STATICFILES_DIRS(STATICFILES_DIRS=STATICFILES_DIRS)
def get_TEMPLATE_DIRS():
return itertools.chain(*reversed(pm.hook.get_TEMPLATE_DIRS()))
# def register_TEMPLATE_DIRS(TEMPLATE_DIRS):
# pm.hook.register_TEMPLATE_DIRS(TEMPLATE_DIRS=TEMPLATE_DIRS)
def get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME='queue.sqlite3'):
HUEY_QUEUES = {}
for plugin_result in pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=QUEUE_DATABASE_NAME):
HUEY_QUEUES.update(plugin_result)
return HUEY_QUEUES
# def register_DJANGO_HUEY(DJANGO_HUEY):
# pm.hook.register_DJANGO_HUEY(DJANGO_HUEY=DJANGO_HUEY)
def get_ADMIN_DATA_VIEWS_URLS():
return itertools.chain(*reversed(pm.hook.get_ADMIN_DATA_VIEWS_URLS()))
# def register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS):
# pm.hook.register_ADMIN_DATA_VIEWS(ADMIN_DATA_VIEWS=ADMIN_DATA_VIEWS)
# def register_settings(settings):
# # convert settings dict to an benedict so we can set values using settings.attr = xyz notation
# settings_as_obj = benedict(settings, keypath_separator=None)
# # set default values for settings that are used by plugins
# # settings_as_obj.INSTALLED_APPS = settings_as_obj.get('INSTALLED_APPS', [])
# # settings_as_obj.MIDDLEWARE = settings_as_obj.get('MIDDLEWARE', [])
# # settings_as_obj.AUTHENTICATION_BACKENDS = settings_as_obj.get('AUTHENTICATION_BACKENDS', [])
# # settings_as_obj.STATICFILES_DIRS = settings_as_obj.get('STATICFILES_DIRS', [])
# # settings_as_obj.TEMPLATE_DIRS = settings_as_obj.get('TEMPLATE_DIRS', [])
# # settings_as_obj.DJANGO_HUEY = settings_as_obj.get('DJANGO_HUEY', {'queues': {}})
# # settings_as_obj.ADMIN_DATA_VIEWS = settings_as_obj.get('ADMIN_DATA_VIEWS', {'URLS': []})
# # # call all the hook functions to mutate the settings values in-place
# # register_INSTALLLED_APPS(settings_as_obj.INSTALLED_APPS)
# # register_MIDDLEWARES(settings_as_obj.MIDDLEWARE)
# # register_AUTHENTICATION_BACKENDS(settings_as_obj.AUTHENTICATION_BACKENDS)
# # register_STATICFILES_DIRS(settings_as_obj.STATICFILES_DIRS)
# # register_TEMPLATE_DIRS(settings_as_obj.TEMPLATE_DIRS)
# # register_DJANGO_HUEY(settings_as_obj.DJANGO_HUEY)
# # register_ADMIN_DATA_VIEWS(settings_as_obj.ADMIN_DATA_VIEWS)
# # calls Plugin.settings(settings) on each registered plugin
# pm.hook.register_settings(settings=settings_as_obj)
# # then finally update the settings globals() object will all the new settings
# # settings.update(settings_as_obj)
def get_urlpatterns():
return list(itertools.chain(*pm.hook.get_urlpatterns()))
def register_urlpatterns(urlpatterns):
pm.hook.register_urlpatterns(urlpatterns=urlpatterns)
def register_checks():
"""register any django system checks"""
pm.hook.register_checks()
def register_admin(admin_site):
"""register any django admin models/views with the main django admin site instance"""
pm.hook.register_admin(admin_site=admin_site)
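On the consuming side, the project settings would splice these aggregated values in, roughly like this sketch (the exact settings.py wiring is not shown in this diff):

# settings.py (sketch)
import abx.django.use

INSTALLED_APPS = [
    'django.contrib.admin',
    'django.contrib.auth',
    # ...core apps...
    *abx.django.use.get_INSTALLED_APPS(),          # apps contributed by plugins
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    # ...core middleware...
    *abx.django.use.get_MIDDLEWARES(),
]

AUTHENTICATION_BACKENDS = [
    'django.contrib.auth.backends.ModelBackend',
    *abx.django.use.get_AUTHENTICATION_BACKENDS(),
]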


@ -1,22 +0,0 @@
from pathlib import Path
from pluggy import HookimplMarker
from pluggy import HookspecMarker
spec = hookspec = HookspecMarker("abx")
impl = hookimpl = HookimplMarker("abx")
@hookspec
@hookimpl
def get_system_user() -> str:
# Beware $HOME may not match current EUID, UID, PUID, SUID, there are edge cases
- sudo (EUID != UID != SUID)
# - running with an autodetected UID based on data dir ownership
# but mapping of UID:username is broken because it was created
# by a different host system, e.g. 911's $HOME outside of docker
# might be /usr/lib/lxd instead of /home/archivebox
- running as a user that doesn't have a home directory
- home directory is set to a path that doesn't exist, or is inside a dir we can't read
return Path('~').expanduser().name
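For reference, this is standard pluggy usage: because get_system_user is both a spec and a default impl, any registered plugin can layer its own implementation on top, and callers get every result back as a list. A small sketch, with the 'archivebox' value being hypothetical:

import abx

@abx.hookimpl
def get_system_user() -> str:
    # e.g. a deployment-specific plugin could pin the reported user
    return 'archivebox'

# once the module containing the impl above is registered via abx.pm.register(...),
# abx.pm.hook.get_system_user() returns one entry per registered implementation,
# e.g. ['archivebox', '<default from this file>']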


@ -1,30 +0,0 @@
import inspect
import pluggy
class PluginManager(pluggy.PluginManager):
"""
Patch to fix pluggy's PluginManager to work with pydantic models.
See: https://github.com/pytest-dev/pluggy/pull/536
"""
def parse_hookimpl_opts(self, plugin, name: str) -> pluggy.HookimplOpts | None:
# IMPORTANT: @property methods can have side effects, and are never hookimpls
# if attr is a property, skip it in advance
plugin_class = plugin if inspect.isclass(plugin) else type(plugin)
if isinstance(getattr(plugin_class, name, None), property):
return None
# pydantic model fields are like attrs and also can never be hookimpls
plugin_is_pydantic_obj = hasattr(plugin, "__pydantic_core_schema__")
if plugin_is_pydantic_obj and name in getattr(plugin, "model_fields", {}):
# pydantic models mess with the class and attr __signature__
# so inspect.isroutine(...) throws exceptions and can't be used
return None
try:
return super().parse_hookimpl_opts(plugin, name)
except AttributeError:
return super().parse_hookimpl_opts(type(plugin), name)
pm = PluginManager("abx")
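A minimal sketch of why this patch matters (assuming pydantic v2): registering a BaseModel instance as a plugin would normally make pluggy introspect every model field and property, which is exactly what breaks per the comments above; with the patched parse_hookimpl_opts those attributes are skipped and only real @hookimpl methods are collected. The ExamplePlugin class is hypothetical:

from pydantic import BaseModel
from pluggy import HookimplMarker

hookimpl = HookimplMarker("abx")     # same marker name used by the abx project

class ExamplePlugin(BaseModel):
    verbose: bool = True             # a pydantic field: must never be treated as a hookimpl

    @hookimpl
    def get_CONFIG(self):
        return {}

pm.register(ExamplePlugin())         # safe: fields/properties are filtered out above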


@ -1 +0,0 @@
__package__ = 'abx.pydantic_pkgr'


@ -1,13 +0,0 @@
from ..hookspec import hookspec
###########################################################################################
@hookspec
def get_BINPROVIDERS():
return {}
@hookspec
def get_BINARIES():
return {}

313
archivebox/actors/actor.py Normal file

@ -0,0 +1,313 @@
__package__ = 'archivebox.actors'
import os
import time
from abc import ABC, abstractmethod
from typing import ClassVar, Generic, TypeVar, Any, cast, Literal, Type
from django.utils.functional import classproperty
from rich import print
import psutil
from django import db
from django.db import models
from django.db.models import QuerySet
from multiprocessing import Process, cpu_count
from threading import Thread, get_native_id
# from archivebox.logging_util import TimedProgress
LaunchKwargs = dict[str, Any]
ModelType = TypeVar('ModelType', bound=models.Model)
class ActorType(ABC, Generic[ModelType]):
"""
Base class for all actors. Usage:
class FaviconActor(ActorType[ArchiveResult]):
QUERYSET: ClassVar[QuerySet] = ArchiveResult.objects.filter(status='queued', extractor='favicon')
CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
ATOMIC: ClassVar[bool] = True
def claim_sql_set(self, obj: ArchiveResult) -> str:
# SQL fields to update atomically while claiming an object from the queue
retry_at = datetime.now() + timedelta(seconds=self.MAX_TICK_TIME)
return f"status = 'started', locked_by = {self.pid}, retry_at = {retry_at}"
def tick(self, obj: ArchiveResult) -> None:
run_favicon_extractor(obj)
ArchiveResult.objects.filter(pk=obj.pk, status='started').update(status='success')
"""
pid: int
idle_count: int = 0
launch_kwargs: LaunchKwargs = {}
mode: Literal['thread', 'process'] = 'process'
MAX_CONCURRENT_ACTORS: ClassVar[int] = min(max(2, int(cpu_count() * 0.6)), 8) # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
QUERYSET: ClassVar[QuerySet] # the QuerySet to claim objects from
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
CLAIM_FROM_TOP: ClassVar[int] = MAX_CONCURRENT_ACTORS * 10 # the number of objects to consider when atomically getting the next object from the queue
ATOMIC: ClassVar[bool] = True # whether to atomically fetch+claim the next object in one step, or fetch and lock it in two steps
# model_type: Type[ModelType]
_SPAWNED_ACTOR_PIDS: ClassVar[list[psutil.Process]] = [] # record all the pids of Actors spawned by this class
def __init__(self, mode: Literal['thread', 'process']|None=None, **launch_kwargs: LaunchKwargs):
self.mode = mode or self.mode
self.launch_kwargs = launch_kwargs or dict(self.launch_kwargs)
@classproperty
def name(cls) -> str:
return cls.__name__ # type: ignore
def __str__(self) -> str:
return self.__repr__()
def __repr__(self) -> str:
"""FaviconActor[pid=1234]"""
label = 'pid' if self.mode == 'process' else 'tid'
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
### Class Methods: Called by Orchestrator on ActorType class before it has been spawned
@classmethod
def get_running_actors(cls) -> list[int]:
"""returns a list of pids of all running actors of this type"""
# WARNING: only works for process actors, not thread actors
if cls.mode == 'thread':
raise NotImplementedError('get_running_actors() is not implemented for thread actors')
return [
proc.pid for proc in cls._SPAWNED_ACTOR_PIDS
if proc.is_running() and proc.status() != 'zombie'
]
@classmethod
def get_actors_to_spawn(cls, queue: QuerySet, running_actors: list[int]) -> list[LaunchKwargs]:
"""Get a list of launch kwargs for the number of actors to spawn based on the queue and currently running actors"""
queue_length = queue.count()
if not queue_length: # queue is empty, spawn 0 actors
return []
actors_to_spawn: list[LaunchKwargs] = []
max_spawnable = cls.MAX_CONCURRENT_ACTORS - len(running_actors)
# spawning new actors is expensive, avoid spawning all the actors at once. To stagger them,
# let the next orchestrator tick handle starting another 2 on the next tick()
# if queue_length > 10: # queue is long, spawn as many as possible
# actors_to_spawn += max_spawnable * [{}]
if queue_length > 4: # queue is medium, spawn 1 or 2 actors
actors_to_spawn += min(2, max_spawnable) * [{**cls.launch_kwargs}]
else: # queue is short, spawn 1 actor
actors_to_spawn += min(1, max_spawnable) * [{**cls.launch_kwargs}]
return actors_to_spawn
@classmethod
def start(cls, mode: Literal['thread', 'process']='process', **launch_kwargs: LaunchKwargs) -> int:
if mode == 'thread':
return cls.fork_actor_as_thread(**launch_kwargs)
elif mode == 'process':
return cls.fork_actor_as_process(**launch_kwargs)
raise ValueError(f'Invalid actor mode: {mode} must be "thread" or "process"')
@classmethod
def fork_actor_as_thread(cls, **launch_kwargs: LaunchKwargs) -> int:
"""Spawn a new background thread running the actor's runloop"""
actor = cls(mode='thread', **launch_kwargs)
bg_actor_thread = Thread(target=actor.runloop)
bg_actor_thread.start()
assert bg_actor_thread.native_id is not None
return bg_actor_thread.native_id
@classmethod
def fork_actor_as_process(cls, **launch_kwargs: LaunchKwargs) -> int:
"""Spawn a new background process running the actor's runloop"""
actor = cls(mode='process', **launch_kwargs)
bg_actor_process = Process(target=actor.runloop)
bg_actor_process.start()
assert bg_actor_process.pid is not None
cls._SPAWNED_ACTOR_PIDS.append(psutil.Process(pid=bg_actor_process.pid))
return bg_actor_process.pid
@classmethod
def get_model(cls) -> Type[ModelType]:
# wish this was a @classproperty but Generic[ModelType] return type can't be statically inferred for @classproperty
return cls.QUERYSET.model
@classmethod
def get_queue(cls) -> QuerySet:
"""override this to provide your queryset as the queue"""
# return ArchiveResult.objects.filter(status='queued', extractor__in=('pdf', 'dom', 'screenshot'))
return cls.QUERYSET
### Instance Methods: Called by Actor after it has been spawned (i.e. forked as a thread or process)
def runloop(self):
"""The main runloop that starts running when the actor is spawned (as subprocess or thread) and exits when the queue is empty"""
self.on_startup()
try:
while True:
obj_to_process: ModelType | None = None
try:
obj_to_process = cast(ModelType, self.get_next(atomic=self.ATOMIC))
except Exception:
pass
if obj_to_process:
self.idle_count = 0 # reset idle count if we got an object
else:
if self.idle_count >= 30:
break # stop looping and exit if queue is empty and we have idled for 30sec
else:
# print('Actor runloop()', f'pid={self.pid}', 'queue empty, rechecking...')
self.idle_count += 1
time.sleep(1)
continue
self.on_tick_start(obj_to_process)
# Process the object
try:
self.tick(obj_to_process)
except Exception as err:
print(f'[red]🏃‍♂️ ERROR: {self}.tick()[/red]', err)
db.connections.close_all() # always reset the db connection after an exception to clear any pending transactions
self.on_tick_exception(obj_to_process, err)
finally:
self.on_tick_end(obj_to_process)
self.on_shutdown(err=None)
except BaseException as err:
if isinstance(err, KeyboardInterrupt):
print()
else:
print(f'\n[red]🏃‍♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
self.on_shutdown(err=err)
def get_next(self, atomic: bool | None=None) -> ModelType | None:
"""get the next object from the queue, atomically locking it if self.atomic=True"""
if atomic is None:
atomic = self.ATOMIC
if atomic:
# fetch and claim the next object from in the queue in one go atomically
obj = self.get_next_atomic()
else:
# two-step claim: fetch the next object and lock it in a separate query
obj = self.get_queue().last()
assert obj and self.lock_next(obj), f'Unable to fetch+lock the next {self.get_model().__name__} object from {self}.QUEUE'
return obj
def lock_next(self, obj: ModelType) -> bool:
"""override this to implement a custom two-step (non-atomic)lock mechanism"""
# For example:
# assert obj._model.objects.filter(pk=obj.pk, status='queued').update(status='started', locked_by=self.pid)
# Not needed if using get_next_and_lock() to claim the object atomically
# print(f'[blue]🏃‍♂️ {self}.lock()[/blue]', obj.abid or obj.id)
return True
def claim_sql_where(self) -> str:
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
return self.CLAIM_WHERE
def claim_sql_set(self) -> str:
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
return self.CLAIM_SET
def claim_sql_order(self) -> str:
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
return self.CLAIM_ORDER
def claim_from_top(self) -> int:
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
return self.CLAIM_FROM_TOP
def get_next_atomic(self, shallow: bool=True) -> ModelType | None:
"""
claim a random object from the top n=50 objects in the queue (atomically updates status=queued->started for claimed object)
optimized for minimizing contention on the queue with other actors selecting from the same list
slightly faster than claim_any_obj() which selects randomly from the entire queue but needs to know the total count
"""
Model = self.get_model() # e.g. ArchiveResult
table = f'{Model._meta.app_label}_{Model._meta.model_name}' # e.g. core_archiveresult
where_sql = self.claim_sql_where()
set_sql = self.claim_sql_set()
order_by_sql = self.claim_sql_order()
choose_from_top = self.claim_from_top()
with db.connection.cursor() as cursor:
# subquery gets the pool of the top 50 candidates sorted by sort and order
# main query selects a random one from that pool
cursor.execute(f"""
UPDATE {table}
SET {set_sql}
WHERE {where_sql} and id = (
SELECT id FROM (
SELECT id FROM {table}
WHERE {where_sql}
ORDER BY {order_by_sql}
LIMIT {choose_from_top}
) candidates
ORDER BY RANDOM()
LIMIT 1
)
RETURNING id;
""")
result = cursor.fetchone()
if result is None:
return None # If no rows were claimed, return None
if shallow:
# shallow: faster, returns potentially incomplete object instance missing some django auto-populated fields:
columns = [col[0] for col in cursor.description or ['id']]
return Model(**dict(zip(columns, result)))
# if not shallow do one extra query to get a more complete object instance (load it fully from scratch)
return Model.objects.get(id=result[0])
@abstractmethod
def tick(self, obj: ModelType) -> None:
"""override this to process the object"""
print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
# For example:
# do_some_task(obj)
# do_something_else(obj)
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
raise NotImplementedError('tick() must be implemented by the Actor subclass')
def on_startup(self) -> None:
if self.mode == 'thread':
self.pid = get_native_id() # thread id
print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (THREAD)[/green]')
else:
self.pid = os.getpid() # process id
print(f'[green]🏃‍♂️ {self}.on_startup() STARTUP (PROCESS)[/green]')
# abx.pm.hook.on_actor_startup(self)
def on_shutdown(self, err: BaseException | None=None) -> None:
print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
# abx.pm.hook.on_actor_shutdown(self)
def on_tick_start(self, obj: ModelType) -> None:
# print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
pass
def on_tick_end(self, obj: ModelType) -> None:
# print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
# self.timer.end()
pass
def on_tick_exception(self, obj: ModelType, err: BaseException) -> None:
print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
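Putting the pieces above together: a concrete actor declares its QUERYSET/CLAIM_* constants and a tick() method (like the FaviconActor example in the class docstring), and the Orchestrator in the next file decides how many copies to launch each tick. A hedged usage sketch of spawning workers by hand:

# assuming a FaviconActor subclass like the one in the ActorType docstring above
queue = FaviconActor.get_queue()
running_pids = FaviconActor.get_running_actors()
for launch_kwargs in FaviconActor.get_actors_to_spawn(queue, running_pids):
    pid = FaviconActor.start(mode='process', **launch_kwargs)
    running_pids.append(pid)
print(f'{len(running_pids)} favicon actors working through {queue.count()} queued results')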


@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.


@ -0,0 +1,6 @@
from django.apps import AppConfig
class ActorsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "actors"


@ -0,0 +1,3 @@
from django.db import models
# Create your models here.


@ -0,0 +1,244 @@
__package__ = 'archivebox.actors'
import os
import time
import itertools
from typing import Dict, Type, Literal, ClassVar
from django.utils.functional import classproperty
from multiprocessing import Process, cpu_count
from threading import Thread, get_native_id
from rich import print
from django.db.models import QuerySet
from django.apps import apps
from .actor import ActorType
class Orchestrator:
pid: int
idle_count: int = 0
actor_types: Dict[str, Type[ActorType]]
mode: Literal['thread', 'process'] = 'process'
def __init__(self, actor_types: Dict[str, Type[ActorType]] | None = None, mode: Literal['thread', 'process'] | None=None):
self.actor_types = actor_types or self.actor_types or self.autodiscover_actor_types()
self.mode = mode or self.mode
def __repr__(self) -> str:
label = 'tid' if self.mode == 'thread' else 'pid'
return f'[underline]{self.name}[/underline]\\[{label}={self.pid}]'
def __str__(self) -> str:
return self.__repr__()
@classproperty
def name(cls) -> str:
return cls.__name__ # type: ignore
def fork_as_thread(self):
self.thread = Thread(target=self.runloop)
self.thread.start()
assert self.thread.native_id is not None
return self.thread.native_id
def fork_as_process(self):
self.process = Process(target=self.runloop)
self.process.start()
assert self.process.pid is not None
return self.process.pid
def start(self) -> int:
if self.mode == 'thread':
return self.fork_as_thread()
elif self.mode == 'process':
return self.fork_as_process()
raise ValueError(f'Invalid orchestrator mode: {self.mode}')
@classmethod
def autodiscover_actor_types(cls) -> Dict[str, Type[ActorType]]:
# returns a Dict of all discovered {actor_type_id: ActorType} across the codebase
# override this method in a subclass to customize the actor types that are used
# return {'Snapshot': SnapshotActorType, 'ArchiveResult_chrome': ChromeActorType, ...}
return {
# look through all models and find all classes that inherit from ActorType
# actor_type.__name__: actor_type
# for actor_type in abx.pm.hook.get_all_ACTORS_TYPES().values()
}
@classmethod
def get_orphaned_objects(cls, all_queues) -> list:
# returns a list of objects that are in the queues of all actor types but not in the queues of any other actor types
all_queued_ids = itertools.chain(*[queue.values_list('id', flat=True) for queue in all_queues.values()])
orphaned_objects = []
for model in apps.get_models():
if hasattr(model, 'retry_at'):
orphaned_objects.extend(model.objects.filter(retry_at__lt=timezone.now()).exclude(id__in=all_queued_ids))
return orphaned_objects
def on_startup(self):
if self.mode == 'thread':
self.pid = get_native_id()
print(f'[green]👨‍✈️ {self}.on_startup() STARTUP (THREAD)[/green]')
elif self.mode == 'process':
self.pid = os.getpid()
print(f'[green]👨‍✈️ {self}.on_startup() STARTUP (PROCESS)[/green]')
# abx.pm.hook.on_orchestrator_startup(self)
def on_shutdown(self, err: BaseException | None = None):
print(f'[grey53]👨‍✈️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
# abx.pm.hook.on_orchestrator_shutdown(self)
def on_tick_started(self, all_queues):
# total_pending = sum(queue.count() for queue in all_queues.values())
# print(f'👨‍✈️ {self}.on_tick_started()', f'total_pending={total_pending}')
# abx.pm.hook.on_orchestrator_tick_started(self, actor_types, all_queues)
pass
def on_tick_finished(self, all_queues, all_existing_actors, all_spawned_actors):
if all_spawned_actors:
total_queue_length = sum(queue.count() for queue in all_queues.values())
print(f'[grey53]👨‍✈️ {self}.on_tick_finished() queue={total_queue_length} existing_actors={len(all_existing_actors)} spawned_actors={len(all_spawned_actors)}[/grey53]')
# abx.pm.hook.on_orchestrator_tick_finished(self, actor_types, all_queues)
def on_idle(self, all_queues):
# print(f'👨‍✈️ {self}.on_idle()')
# abx.pm.hook.on_orchestrator_idle(self)
# check for orphaned objects left behind
if self.idle_count == 60:
orphaned_objects = self.get_orphaned_objects(all_queues)
if orphaned_objects:
print('[red]👨‍✈️ WARNING: some objects may not be processed, no actor has claimed them after 60s:[/red]', orphaned_objects)
def runloop(self):
self.on_startup()
try:
while True:
all_queues = {
actor_type: actor_type.get_queue()
for actor_type in self.actor_types.values()
}
if not all_queues:
raise Exception('Failed to find any actor_types to process')
self.on_tick_started(all_queues)
all_existing_actors = []
all_spawned_actors = []
for actor_type, queue in all_queues.items():
try:
existing_actors = actor_type.get_running_actors()
all_existing_actors.extend(existing_actors)
actors_to_spawn = actor_type.get_actors_to_spawn(queue, existing_actors)
for launch_kwargs in actors_to_spawn:
new_actor_pid = actor_type.start(mode='process', **launch_kwargs)
all_spawned_actors.append(new_actor_pid)
except Exception as err:
print(f'🏃‍♂️ ERROR: {self} Failed to get {actor_type} queue & running actors', err)
except BaseException:
raise
if not any(queue.exists() for queue in all_queues.values()):
self.on_idle(all_queues)
self.idle_count += 1
time.sleep(1)
else:
self.idle_count = 0
self.on_tick_finished(all_queues, all_existing_actors, all_spawned_actors)
time.sleep(1)
except BaseException as err:
if isinstance(err, KeyboardInterrupt):
print()
else:
print(f'\n[red]🏃‍♂️ {self}.runloop() FATAL:[/red]', err.__class__.__name__, err)
self.on_shutdown(err=err)
from archivebox.config.django import setup_django
setup_django()
from core.models import ArchiveResult, Snapshot
from django.utils import timezone
from django import db
from django.db import connection
from crawls.actors import CrawlActor
from .actor_snapshot import SnapshotActor
from abx_plugin_singlefile.actors import SinglefileActor
class FaviconActor(ActorType[ArchiveResult]):
CLAIM_ORDER: ClassVar[str] = 'created_at DESC'
CLAIM_WHERE: ClassVar[str] = 'status = "queued" AND extractor = "favicon"'
CLAIM_SET: ClassVar[str] = 'status = "started"'
@classproperty
def QUERYSET(cls) -> QuerySet:
return ArchiveResult.objects.filter(status='failed', extractor='favicon')
def tick(self, obj: ArchiveResult):
print(f'[grey53]{self}.tick({obj.abid or obj.id}, status={obj.status}) remaining:[/grey53]', self.get_queue().count())
updated = ArchiveResult.objects.filter(id=obj.id, status='started').update(status='success') == 1
if not updated:
raise Exception(f'Failed to update {obj.abid or obj.id}, interrupted by another actor writing to the same object')
obj.refresh_from_db()
obj.save()
class ExtractorsOrchestrator(Orchestrator):
actor_types = {
'CrawlActor': CrawlActor,
'SnapshotActor': SnapshotActor,
'FaviconActor': FaviconActor,
'SinglefileActor': SinglefileActor,
}
if __name__ == '__main__':
orchestrator = ExtractorsOrchestrator()
orchestrator.start()
snap = Snapshot.objects.last()
assert snap is not None
created = 0
while True:
time.sleep(0.05)
# try:
# ArchiveResult.objects.bulk_create([
# ArchiveResult(
# id=uuid.uuid4(),
# snapshot=snap,
# status='failed',
# extractor='favicon',
# cmd=['echo', '"hello"'],
# cmd_version='1.0',
# pwd='.',
# start_ts=timezone.now(),
# end_ts=timezone.now(),
# created_at=timezone.now(),
# modified_at=timezone.now(),
# created_by_id=1,
# )
# for _ in range(100)
# ])
# created += 100
# if created % 1000 == 0:
# print(f'[blue]Created {created} ArchiveResults...[/blue]')
# time.sleep(25)
# except Exception as err:
# print(err)
# db.connections.close_all()
# except BaseException as err:
# print(err)
# break


@ -0,0 +1,286 @@
from statemachine import State, StateMachine
from django.db import models
from django.utils import timezone
from datetime import timedelta
from multiprocessing import Process
import psutil
import time
# State Machine Definitions
#################################################
class SnapshotMachine(StateMachine):
"""State machine for managing Snapshot lifecycle."""
# States
queued = State(initial=True)
started = State()
sealed = State(final=True)
# Transitions
start = queued.to(started, cond='can_start')
seal = started.to(sealed, cond='is_finished')
# Events
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
def __init__(self, snapshot):
self.snapshot = snapshot
super().__init__()
def can_start(self):
return True
def is_finished(self):
return not self.snapshot.has_pending_archiveresults()
def before_start(self):
"""Pre-start validation and setup."""
self.snapshot.cleanup_dir()
def after_start(self):
"""Post-start side effects."""
self.snapshot.create_pending_archiveresults()
self.snapshot.update_indices()
self.snapshot.bump_retry_at(seconds=10)
def before_seal(self):
"""Pre-seal validation and cleanup."""
self.snapshot.cleanup_dir()
def after_seal(self):
"""Post-seal actions."""
self.snapshot.update_indices()
self.snapshot.seal_dir()
self.snapshot.upload_dir()
self.snapshot.retry_at = None
self.snapshot.save()
class ArchiveResultMachine(StateMachine):
"""State machine for managing ArchiveResult lifecycle."""
# States
queued = State(initial=True)
started = State()
succeeded = State(final=True)
backoff = State()
failed = State(final=True)
# Transitions
start = queued.to(started, cond='can_start')
succeed = started.to(succeeded, cond='extractor_succeeded')
backoff = started.to(backoff, unless='extractor_succeeded')
retry = backoff.to(queued, cond='can_retry')
fail = backoff.to(failed, unless='can_retry')
# Events
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(cond='extractor_still_running') |
started.to(succeeded, cond='extractor_succeeded') |
started.to(backoff, unless='extractor_succeeded') |
backoff.to.itself(cond='still_waiting_to_retry') |
backoff.to(queued, cond='can_retry') |
backoff.to(failed, unless='can_retry')
)
def __init__(self, archiveresult):
self.archiveresult = archiveresult
super().__init__()
def can_start(self):
return True
def extractor_still_running(self):
return self.archiveresult.start_ts > timezone.now() - timedelta(seconds=5)
def extractor_succeeded(self):
# return check_if_extractor_succeeded(self.archiveresult)
        return self.archiveresult.start_ts < timezone.now() - timedelta(seconds=5)
def can_retry(self):
return self.archiveresult.retries < self.archiveresult.max_retries
def before_start(self):
"""Pre-start initialization."""
self.archiveresult.retries += 1
        self.archiveresult.start_ts = timezone.now()
self.archiveresult.output = None
self.archiveresult.error = None
def after_start(self):
"""Post-start execution."""
self.archiveresult.bump_retry_at(seconds=self.archiveresult.timeout + 5)
execute_extractor(self.archiveresult)
self.archiveresult.snapshot.bump_retry_at(seconds=5)
def before_succeed(self):
"""Pre-success validation."""
self.archiveresult.output = get_archiveresult_output(self.archiveresult)
def after_succeed(self):
"""Post-success cleanup."""
        self.archiveresult.end_ts = timezone.now()
self.archiveresult.retry_at = None
self.archiveresult.update_indices()
    def before_back_off(self):
        """Pre-backoff error capture."""
        self.archiveresult.error = get_archiveresult_error(self.archiveresult)
    def after_back_off(self):
        """Post-backoff retry scheduling."""
        self.archiveresult.end_ts = timezone.now()
        self.archiveresult.bump_retry_at(
            seconds=self.archiveresult.timeout * self.archiveresult.retries
        )
self.archiveresult.update_indices()
def before_fail(self):
"""Pre-failure finalization."""
self.archiveresult.retry_at = None
def after_fail(self):
"""Post-failure cleanup."""
self.archiveresult.update_indices()
# Models
#################################################
class Snapshot(models.Model):
status = models.CharField(max_length=32, default='queued')
retry_at = models.DateTimeField(null=True)
    def get_machine(self):
        """Get the state machine for this snapshot (matches ArchiveResult.get_machine() and BaseActor.tick())."""
        return SnapshotMachine(self)
def has_pending_archiveresults(self):
return self.archiveresult_set.exclude(
status__in=['succeeded', 'failed']
).exists()
def bump_retry_at(self, seconds):
        self.retry_at = timezone.now() + timedelta(seconds=seconds)
self.save()
def cleanup_dir(self):
cleanup_snapshot_dir(self)
def create_pending_archiveresults(self):
create_snapshot_pending_archiveresults(self)
def update_indices(self):
update_snapshot_index_json(self)
update_snapshot_index_html(self)
def seal_dir(self):
seal_snapshot_dir(self)
def upload_dir(self):
upload_snapshot_dir(self)
class ArchiveResult(models.Model):
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
status = models.CharField(max_length=32, default='queued')
retry_at = models.DateTimeField(null=True)
retries = models.IntegerField(default=0)
max_retries = models.IntegerField(default=3)
timeout = models.IntegerField(default=60)
start_ts = models.DateTimeField(null=True)
end_ts = models.DateTimeField(null=True)
output = models.TextField(null=True)
error = models.TextField(null=True)
def get_machine(self):
return ArchiveResultMachine(self)
def bump_retry_at(self, seconds):
        self.retry_at = timezone.now() + timedelta(seconds=seconds)
self.save()
def update_indices(self):
update_archiveresult_index_json(self)
update_archiveresult_index_html(self)
# Actor System
#################################################
class BaseActor:
MAX_TICK_TIME = 60
def tick(self, obj):
"""Process a single object through its state machine."""
machine = obj.get_machine()
if machine.is_queued:
if machine.can_start():
machine.start()
elif machine.is_started:
            if machine.is_finished():
machine.seal()
elif machine.is_backoff:
if machine.can_retry():
machine.retry()
else:
machine.fail()
class Orchestrator:
"""Main orchestrator that manages all actors."""
def __init__(self):
self.pid = None
@classmethod
def spawn(cls):
orchestrator = cls()
proc = Process(target=orchestrator.runloop)
proc.start()
return proc.pid
def runloop(self):
self.pid = os.getpid()
abx.pm.hook.on_orchestrator_startup(self)
try:
while True:
self.process_queue(Snapshot)
self.process_queue(ArchiveResult)
time.sleep(0.1)
except (KeyboardInterrupt, SystemExit):
abx.pm.hook.on_orchestrator_shutdown(self)
def process_queue(self, model):
        retry_at_reached = Q(retry_at__isnull=True) | Q(retry_at__lte=timezone.now())
queue = model.objects.filter(retry_at_reached)
if queue.exists():
actor = BaseActor()
for obj in queue:
try:
with transaction.atomic():
actor.tick(obj)
except Exception as e:
abx.pm.hook.on_actor_tick_exception(actor, obj, e)
# Periodic Tasks
#################################################
@djhuey.periodic_task(schedule=djhuey.crontab(minute='*'))
def ensure_orchestrator_running():
"""Ensure orchestrator is running, start if not."""
if not any(p.name().startswith('Orchestrator') for p in psutil.process_iter()):
Orchestrator.spawn()
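In this design the only responsibilities left to callers are creating queued rows and making sure an Orchestrator process exists; the state machines and their hooks handle everything else. A minimal usage sketch, using only the names defined above (the extractor and index helper functions referenced by the hooks remain placeholders):

# Usage sketch for the design above: enqueue work, then let tick() drive it
# from queued -> started -> sealed / succeeded / failed.
snapshot = Snapshot.objects.create(status='queued')
Orchestrator.spawn()   # normally guaranteed by ensure_orchestrator_running()
# The orchestrator's runloop now repeatedly calls BaseActor.tick(snapshot),
# firing only the transitions whose can_start() / is_finished() guards pass.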

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

@ -1,4 +1,5 @@
__package__ = 'archivebox.config'
__package__ = 'config'
__order__ = 200
from .paths import (
PACKAGE_DIR, # noqa
@ -8,35 +9,28 @@ from .paths import (
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
import abx
# import abx
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return ['config']
# def get_CONFIG():
# from .common import (
# SHELL_CONFIG,
# STORAGE_CONFIG,
# GENERAL_CONFIG,
# SERVER_CONFIG,
# ARCHIVING_CONFIG,
# SEARCH_BACKEND_CONFIG,
# )
# return {
# 'SHELL_CONFIG': SHELL_CONFIG,
# 'STORAGE_CONFIG': STORAGE_CONFIG,
# 'GENERAL_CONFIG': GENERAL_CONFIG,
# 'SERVER_CONFIG': SERVER_CONFIG,
# 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
# 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
# }
@abx.hookimpl
def get_CONFIG():
from .common import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL_CONFIG': SHELL_CONFIG,
'STORAGE_CONFIG': STORAGE_CONFIG,
'GENERAL_CONFIG': GENERAL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
}
@abx.hookimpl
def ready():
for config in get_CONFIG().values():
config.validate()
# @abx.hookimpl
# def ready():
# for config in get_CONFIG().values():
# config.validate()
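Callers that previously read django.conf.settings.CONFIGS now read the merged result of every plugin's get_CONFIG() hookimpl through the plugin manager. A short illustrative sketch of that consumer side (the TIMEOUT key is only an example):

import archivebox
# every plugin's get_CONFIG() return value, merged into one {section_name: ConfigSet} mapping
CONFIGS = archivebox.pm.hook.get_CONFIGS()
print(CONFIGS['GENERAL_CONFIG'].model_dump())
# flattened {KEY: value} view across all sections
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
print(FLAT_CONFIG.get('TIMEOUT'))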

View file

@ -9,16 +9,18 @@ from configparser import ConfigParser
from benedict import benedict
import archivebox
from archivebox.config.constants import CONSTANTS
from archivebox.misc.logging import stderr
def get_real_name(key: str) -> str:
"""get the current canonical name for a given deprecated config key"""
from django.conf import settings
"""get the up-to-date canonical name for a given old alias or current key"""
CONFIGS = archivebox.pm.hook.get_CONFIGS()
for section in settings.CONFIGS.values():
for section in CONFIGS.values():
try:
return section.aliases[key]
except KeyError:
@ -115,17 +117,15 @@ def load_config_file() -> Optional[benedict]:
def section_for_key(key: str) -> Any:
from django.conf import settings
for config_section in settings.CONFIGS.values():
for config_section in archivebox.pm.hook.get_CONFIGS().values():
if hasattr(config_section, key):
return config_section
return None
raise ValueError(f'No config section found for key: {key}')
def write_config_file(config: Dict[str, str]) -> benedict:
"""load the ini-formatted config file from DATA_DIR/Archivebox.conf"""
import abx.archivebox.reads
from archivebox.misc.system import atomic_write
CONFIG_HEADER = (
@ -175,7 +175,7 @@ def write_config_file(config: Dict[str, str]) -> benedict:
updated_config = {}
try:
# validate the updated_config by attempting to re-parse it
updated_config = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
except BaseException: # lgtm [py/catch-base-exception]
# something went horribly wrong, revert to the previous version
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
@ -233,11 +233,11 @@ def load_config(defaults: Dict[str, Any],
return benedict(extended_config)
def load_all_config():
import abx.archivebox.reads
import abx
flat_config = benedict()
for config_section in abx.archivebox.reads.get_CONFIGS().values():
for config_section in abx.pm.hook.get_CONFIGS().values():
config_section.__init__()
flat_config.update(config_section.model_dump())

View file

@ -10,7 +10,7 @@ from rich import print
from pydantic import Field, field_validator
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
@ -45,8 +45,6 @@ class ShellConfig(BaseConfigSet):
def BUILD_TIME(self) -> str:
return get_BUILD_TIME()
# def VERSIONS_AVAILABLE() -> bool # .check_for_update.get_versions_available_on_github(c)},
# def CAN_UPGRADE() -> bool # .check_for_update.can_upgrade(c)},
SHELL_CONFIG = ShellConfig()

View file

@ -1,3 +1,15 @@
"""
Constants are for things that never change at runtime.
(but they can change from run-to-run or machine-to-machine)
DATA_DIR will never change at runtime, but you can run
archivebox from inside a different DATA_DIR on the same machine.
This is loaded very early in the archivebox startup flow, so nothing in this file
or imported from this file should import anything from archivebox.config.common,
django, other INSTALLED_APPS, or anything else that is not in the standard library.
"""
__package__ = 'archivebox.config'
import re
@ -197,10 +209,12 @@ class ConstantsDict(Mapping):
@classmethod
def __getitem__(cls, key: str):
# so it behaves like a dict[key] == dict.key or object attr
return getattr(cls, key)
@classmethod
def __benedict__(cls):
# when casting to benedict, only include uppercase keys that don't start with an underscore
return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})
@classmethod
@ -214,5 +228,6 @@ class ConstantsDict(Mapping):
CONSTANTS = ConstantsDict()
CONSTANTS_CONFIG = CONSTANTS.__benedict__()
# add all key: values to globals() for easier importing
globals().update(CONSTANTS)
# add all key: values to globals() for easier importing, e.g.:
# from archivebox.config.constants import IS_ROOT, PERSONAS_DIR, ...
# globals().update(CONSTANTS)

View file

@ -60,7 +60,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
return
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25, visible=False)
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
@ -97,7 +97,7 @@ def setup_django(check_db=False, in_memory_db=False) -> None:
except Exception as e:
bump_startup_progress_bar(advance=1000)
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version', 'init'))
is_using_meta_cmd = any(ignored_subcommand in sys.argv for ignored_subcommand in ('help', 'version', '--help', '--version'))
if not is_using_meta_cmd:
# show error message to user only if they're not running a meta command / just trying to get help
STDERR.print()

View file

@ -45,7 +45,7 @@ def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
@cache
def get_COMMIT_HASH() -> Optional[str]:
try:
git_dir = PACKAGE_DIR / '../.git'
git_dir = PACKAGE_DIR.parent / '.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
@ -53,7 +53,7 @@ def get_COMMIT_HASH() -> Optional[str]:
pass
try:
return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
return list((PACKAGE_DIR.parent / '.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
@ -62,8 +62,12 @@ def get_COMMIT_HASH() -> Optional[str]:
@cache
def get_BUILD_TIME() -> str:
if IN_DOCKER:
try:
# if we're in the archivebox official docker image, /VERSION.txt will contain the build time
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
except Exception:
pass
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')

View file

@ -14,8 +14,8 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import abx.archivebox.reads
import abx
import archivebox
from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date
@ -65,7 +65,7 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
@render_with_table_view
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
@ -81,12 +81,11 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
relevant_configs = {
key: val
for key, val in settings.FLAT_CONFIG.items()
for key, val in FLAT_CONFIG.items()
if '_BINARY' in key or '_VERSION' in key
}
for plugin_id, plugin in abx.archivebox.reads.get_PLUGINS().items():
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
for plugin_id, plugin in abx.get_all_plugins().items():
if not plugin.hooks.get('get_BINARIES'):
continue
@ -131,17 +130,16 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'
binary = None
plugin = None
for plugin_id in abx.archivebox.reads.get_PLUGINS().keys():
loaded_plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
for plugin_id, plugin in abx.get_all_plugins().items():
try:
for loaded_binary in loaded_plugin.hooks.get_BINARIES().values():
for loaded_binary in plugin['hooks'].get_BINARIES().values():
if loaded_binary.name == key:
binary = loaded_binary
plugin = loaded_plugin
plugin = plugin
# break # last write wins
except Exception as e:
print(e)
@ -161,7 +159,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"name": binary.name,
"description": binary.abspath,
"fields": {
'plugin': plugin.package,
'plugin': plugin['package'],
'binprovider': binary.loaded_binprovider,
'abspath': binary.loaded_abspath,
'version': binary.loaded_version,
@ -215,9 +213,7 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
return color
return 'black'
for plugin_id in settings.PLUGINS.keys():
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
for plugin_id, plugin in abx.get_all_plugins().items():
plugin.hooks.get_BINPROVIDERS = plugin.hooks.get('get_BINPROVIDERS', lambda: {})
plugin.hooks.get_BINARIES = plugin.hooks.get('get_BINARIES', lambda: {})
plugin.hooks.get_CONFIG = plugin.hooks.get('get_CONFIG', lambda: {})
@ -263,7 +259,7 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
plugin = abx.get_plugin(plugin_id)
return ItemContext(
slug=key,

View file

@ -1,2 +1,31 @@
__package__ = 'archivebox.core'
import abx
@abx.hookimpl
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin
register_admin(admin_site)
@abx.hookimpl
def get_CONFIG():
from archivebox.config.common import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL_CONFIG': SHELL_CONFIG,
'STORAGE_CONFIG': STORAGE_CONFIG,
'GENERAL_CONFIG': GENERAL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
}

archivebox/core/actors.py Normal file
View file

@ -0,0 +1,73 @@
__package__ = 'archivebox.core'
from typing import ClassVar
from rich import print
from django.db.models import QuerySet
from django.utils import timezone
from datetime import timedelta
from core.models import Snapshot
from actors.actor import ActorType
class SnapshotActor(ActorType[Snapshot]):
QUERYSET: ClassVar[QuerySet] = Snapshot.objects.filter(status='queued')
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
CLAIM_FROM_TOP: ClassVar[int] = 50 # the number of objects to consider when atomically getting the next object from the queue
# model_type: Type[ModelType]
MAX_CONCURRENT_ACTORS: ClassVar[int] = 4 # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
def claim_sql_where(self) -> str:
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
return self.CLAIM_WHERE
def claim_sql_set(self) -> str:
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
retry_at = timezone.now() + timedelta(seconds=self.MAX_TICK_TIME)
# format as 2024-10-31 10:14:33.240903
retry_at_str = retry_at.strftime('%Y-%m-%d %H:%M:%S.%f')
        return f"{self.CLAIM_SET}, retry_at = '{retry_at_str}'"
def claim_sql_order(self) -> str:
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
return self.CLAIM_ORDER
def claim_from_top(self) -> int:
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
return self.CLAIM_FROM_TOP
def tick(self, obj: Snapshot) -> None:
"""override this to process the object"""
print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
# For example:
# do_some_task(obj)
# do_something_else(obj)
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
# raise NotImplementedError('tick() must be implemented by the Actor subclass')
def on_shutdown(self, err: BaseException | None=None) -> None:
print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
# abx.pm.hook.on_actor_shutdown(self)
def on_tick_start(self, obj: Snapshot) -> None:
# print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
pass
def on_tick_end(self, obj: Snapshot) -> None:
# print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
# self.timer.end()
pass
def on_tick_exception(self, obj: Snapshot, err: BaseException) -> None:
print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)
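The CLAIM_* clauses above are assembled into a single atomic claim query by the ActorType base class in the actors app, which is not shown in this hunk. Purely to illustrate how the WHERE / SET / ORDER BY / LIMIT pieces are meant to fit together (the table name assumes Django's default core_snapshot), the rendered query would look roughly like:

# Illustration only: the real query assembly lives in actors.actor.ActorType.
example_claim_sql = """
    UPDATE core_snapshot
    SET status = "started", retry_at = '2024-10-31 10:15:33.240903'
    WHERE id = (
        SELECT id FROM core_snapshot
        WHERE status = "queued"
        ORDER BY created_at DESC
        LIMIT 1    -- chosen from the top CLAIM_FROM_TOP candidates
    )
    RETURNING id;
"""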

View file

@ -8,7 +8,7 @@ from django.utils.html import format_html, mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from django.forms import forms
from django_jsonform.forms.fields import JSONFormField
from huey_monitor.admin import TaskModel
@ -83,7 +83,7 @@ class ArchiveResultInline(admin.TabularInline):
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
formset.form.base_fields['cmd'] = JSONFormField(initial=['-'])
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
if obj is not None:

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
from django.contrib import admin
import abx.django.use
import archivebox
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
@ -37,6 +37,6 @@ def register_admin_site():
sites.site = archivebox_admin
# register all plugins admin classes
abx.django.use.register_admin(archivebox_admin)
archivebox.pm.hook.register_admin(admin_site=archivebox_admin)
return archivebox_admin

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
from django.apps import AppConfig
import abx
import archivebox
class CoreConfig(AppConfig):
@ -10,16 +10,11 @@ class CoreConfig(AppConfig):
def ready(self):
"""Register the archivebox.core.admin_site as the main django admin site"""
from django.conf import settings
archivebox.pm.hook.ready(settings=settings)
from core.admin_site import register_admin_site
register_admin_site()
abx.pm.hook.ready()
@abx.hookimpl
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin
register_admin(admin_site)

View file

@ -8,21 +8,25 @@ import os
import json
from pathlib import Path
from datetime import timedelta
from django.db import models
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.utils import timezone
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField
from django.contrib import admin
from django.conf import settings
from statemachine.mixins import MachineMixin
from archivebox.config import CONSTANTS
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from queues.tasks import bg_archive_snapshot
# from crawls.models import Crawl
from crawls.models import Crawl
# from machine.models import Machine, NetworkInterface
from archivebox.misc.system import get_dir_size
@ -152,7 +156,7 @@ class SnapshotManager(models.Manager):
return super().get_queryset().prefetch_related('tags', 'archiveresult_set') # .annotate(archiveresult_count=models.Count('archiveresult')).distinct()
class Snapshot(ABIDModel):
class Snapshot(ABIDModel, MachineMixin):
abid_prefix = 'snp_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.url'
@ -160,6 +164,17 @@ class Snapshot(ABIDModel):
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.SnapshotMachine'
state_machine_attr = 'sm'
class SnapshotStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SEALED = 'sealed', 'Sealed'
status = models.CharField(max_length=15, default=SnapshotStatus.QUEUED, null=False, blank=False)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
@ -171,7 +186,7 @@ class Snapshot(ABIDModel):
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
# crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
@ -397,6 +412,25 @@ class Snapshot(ABIDModel):
self.tags.clear()
self.tags.add(*tags_id)
def has_pending_archiveresults(self) -> bool:
pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
pending_archiveresults = self.archiveresult_set.filter(status__in=pending_statuses)
return pending_archiveresults.exists()
def create_pending_archiveresults(self) -> list['ArchiveResult']:
archiveresults = []
for extractor in EXTRACTORS:
archiveresult, _created = ArchiveResult.objects.get_or_create(
snapshot=self,
extractor=extractor,
status=ArchiveResult.ArchiveResultStatus.QUEUED,
)
archiveresults.append(archiveresult)
return archiveresults
def bump_retry_at(self, seconds: int = 10):
self.retry_at = timezone.now() + timedelta(seconds=seconds)
# def get_storage_dir(self, create=True, symlink=True) -> Path:
# date_str = self.bookmarked_at.strftime('%Y%m%d')
@ -453,6 +487,20 @@ class ArchiveResult(ABIDModel):
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'core.statemachines.ArchiveResultMachine'
state_machine_attr = 'sm'
class ArchiveResultStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
SKIPPED = 'skipped', 'Skipped'
BACKOFF = 'backoff', 'Waiting to retry'
status = models.CharField(max_length=15, choices=ArchiveResultStatus.choices, default=ArchiveResultStatus.QUEUED, null=False, blank=False)
EXTRACTOR_CHOICES = (
('htmltotext', 'htmltotext'),
('git', 'git'),
@ -469,11 +517,7 @@ class ArchiveResult(ABIDModel):
('title', 'title'),
('wget', 'wget'),
)
STATUS_CHOICES = [
("succeeded", "succeeded"),
("failed", "failed"),
("skipped", "skipped")
]
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
@ -491,7 +535,6 @@ class ArchiveResult(ABIDModel):
output = models.CharField(max_length=1024)
start_ts = models.DateTimeField(db_index=True)
end_ts = models.DateTimeField()
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
# the network interface that was used to download this result
# uplink = models.ForeignKey(NetworkInterface, on_delete=models.SET_NULL, null=True, blank=True, verbose_name='Network Interface Used')
@ -552,7 +595,15 @@ class ArchiveResult(ABIDModel):
return link.canonical_outputs().get(f'{self.extractor}_path')
def output_exists(self) -> bool:
return os.access(self.output_path(), os.R_OK)
return os.path.exists(self.output_path())
def bump_retry_at(self, seconds: int = 10):
self.retry_at = timezone.now() + timedelta(seconds=seconds)
def create_output_dir(self):
snap_dir = self.snapshot_dir
snap_dir.mkdir(parents=True, exist_ok=True)
return snap_dir / self.output_path()
# def get_storage_dir(self, create=True, symlink=True):

View file

@ -9,13 +9,12 @@ from pathlib import Path
from django.utils.crypto import get_random_string
import abx
import abx.archivebox
import abx.archivebox.reads
import abx.django.use
import archivebox
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS # noqa
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
@ -25,45 +24,8 @@ IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--v
### ArchiveBox Plugin Settings
################################################################################
PLUGIN_HOOKSPECS = [
'abx.django.hookspec',
'abx.pydantic_pkgr.hookspec',
'abx.archivebox.hookspec',
]
abx.register_hookspecs(PLUGIN_HOOKSPECS)
BUILTIN_PLUGIN_DIRS = {
'archivebox': PACKAGE_DIR,
'plugins_pkg': PACKAGE_DIR / 'plugins_pkg',
'plugins_auth': PACKAGE_DIR / 'plugins_auth',
'plugins_search': PACKAGE_DIR / 'plugins_search',
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
}
USER_PLUGIN_DIRS = {
# 'user_plugins': DATA_DIR / 'user_plugins',
}
# Discover ArchiveBox plugins
BUILTIN_PLUGINS = abx.get_plugins_in_dirs(BUILTIN_PLUGIN_DIRS)
PIP_PLUGINS = abx.get_pip_installed_plugins(group='archivebox')
USER_PLUGINS = abx.get_plugins_in_dirs(USER_PLUGIN_DIRS)
ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
# Load ArchiveBox plugins
PLUGIN_MANAGER = abx.pm
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
PLUGINS = abx.archivebox.reads.get_PLUGINS()
# Load ArchiveBox config from plugins
CONFIGS = abx.archivebox.reads.get_CONFIGS()
CONFIG = FLAT_CONFIG = abx.archivebox.reads.get_FLAT_CONFIG()
BINPROVIDERS = abx.archivebox.reads.get_BINPROVIDERS()
BINARIES = abx.archivebox.reads.get_BINARIES()
EXTRACTORS = abx.archivebox.reads.get_EXTRACTORS()
SEARCHBACKENDS = abx.archivebox.reads.get_SEARCHBACKENDS()
# REPLAYERS = abx.archivebox.reads.get_REPLAYERS()
# ADMINDATAVIEWS = abx.archivebox.reads.get_ADMINDATAVIEWS()
ALL_PLUGINS = archivebox.ALL_PLUGINS
LOADED_PLUGINS = archivebox.LOADED_PLUGINS
################################################################################
### Django Core Settings
@ -102,7 +64,8 @@ INSTALLED_APPS = [
# 'abid_utils', # handles ABID ID creation, handling, and models
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes
'actors', # handles starting and managing background workers and processes (orchestrators and actors)
'queues', # handles starting and managing background workers and processes (supervisord)
'seeds', # handles Seed model and URL source management
'crawls', # handles Crawl and CrawlSchedule models and management
'personas', # handles Persona and session management
@ -110,7 +73,7 @@ INSTALLED_APPS = [
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
# ArchiveBox plugins
*abx.django.use.get_INSTALLED_APPS(), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
*abx.as_list(abx.pm.hook.get_INSTALLED_APPS()), # all plugin django-apps found in archivebox/plugins_* and data/user_plugins,
# 3rd-party apps from PyPI that need to be loaded last
'admin_data_views', # handles rendering some convenient automatic read-only views of data in Django admin
@ -125,6 +88,7 @@ INSTALLED_APPS = [
MIDDLEWARE = [
'core.middleware.TimezoneMiddleware',
'django.middleware.security.SecurityMiddleware',
@ -135,7 +99,7 @@ MIDDLEWARE = [
'core.middleware.ReverseProxyAuthMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
'core.middleware.CacheControlMiddleware',
*abx.django.use.get_MIDDLEWARES(),
*abx.as_list(abx.pm.hook.get_MIDDLEWARES()),
]
@ -148,7 +112,7 @@ MIDDLEWARE = [
AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.RemoteUserBackend',
'django.contrib.auth.backends.ModelBackend',
*abx.django.use.get_AUTHENTICATION_BACKENDS(),
*abx.as_list(abx.pm.hook.get_AUTHENTICATION_BACKENDS()),
]
@ -169,7 +133,7 @@ AUTHENTICATION_BACKENDS = [
STATIC_URL = '/static/'
TEMPLATES_DIR_NAME = 'templates'
CUSTOM_TEMPLATES_ENABLED = os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK) and CONSTANTS.CUSTOM_TEMPLATES_DIR.is_dir()
CUSTOM_TEMPLATES_ENABLED = os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK)
STATICFILES_DIRS = [
*([str(CONSTANTS.CUSTOM_TEMPLATES_DIR / 'static')] if CUSTOM_TEMPLATES_ENABLED else []),
# *[
@ -177,7 +141,7 @@ STATICFILES_DIRS = [
# for plugin_dir in PLUGIN_DIRS.values()
# if (plugin_dir / 'static').is_dir()
# ],
*abx.django.use.get_STATICFILES_DIRS(),
*abx.as_list(abx.pm.hook.get_STATICFILES_DIRS()),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'static'),
]
@ -188,7 +152,7 @@ TEMPLATE_DIRS = [
# for plugin_dir in PLUGIN_DIRS.values()
# if (plugin_dir / 'templates').is_dir()
# ],
*abx.django.use.get_TEMPLATE_DIRS(),
*abx.as_list(abx.pm.hook.get_TEMPLATE_DIRS()),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'core'),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME / 'admin'),
str(PACKAGE_DIR / TEMPLATES_DIR_NAME),
@ -228,7 +192,7 @@ SQLITE_CONNECTION_OPTIONS = {
# https://gcollazo.com/optimal-sqlite-settings-for-django/
# https://litestream.io/tips/#busy-timeout
# https://docs.djangoproject.com/en/5.1/ref/databases/#setting-pragma-options
"timeout": 5,
"timeout": 10,
"check_same_thread": False,
"transaction_mode": "IMMEDIATE",
"init_command": (
@ -267,7 +231,7 @@ if not IS_GETTING_VERSION_OR_HELP: # dont create queue.sqlite3 file
HUEY = {
"huey_class": "huey.SqliteHuey",
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
"name": "system_tasks",
"name": "commands",
"results": True,
"store_none": True,
"immediate": False,
@ -288,11 +252,11 @@ if not IS_GETTING_VERSION_OR_HELP: # dont create queue.sqlite3 file
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
"default": "system_tasks",
"default": "commands",
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
**abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME),
**abx.as_dict(abx.pm.hook.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME)),
},
}
@ -517,7 +481,7 @@ ADMIN_DATA_VIEWS = {
"name": "log",
},
},
*abx.django.use.get_ADMIN_DATA_VIEWS_URLS(),
*abx.as_list(abx.pm.hook.get_ADMIN_DATA_VIEWS_URLS()),
],
}
@ -611,7 +575,4 @@ if DEBUG_REQUESTS_TRACKER:
# JET_TOKEN = 'some-api-token-here'
abx.django.use.register_checks()
# abx.archivebox.reads.register_all_hooks(globals())
# import ipdb; ipdb.set_trace()

View file

@ -163,11 +163,6 @@ SETTINGS_LOGGING = {
"level": "DEBUG",
"propagate": False,
},
"plugins_extractor": {
"handlers": ["default", "logfile"],
"level": "DEBUG",
"propagate": False,
},
"httpx": {
"handlers": ["outbound_webhooks"],
"level": "INFO",

View file

@ -0,0 +1,115 @@
__package__ = 'archivebox.snapshots'
from django.utils import timezone
from statemachine import State, StateMachine
from core.models import Snapshot, ArchiveResult
# State Machine Definitions
#################################################
class SnapshotMachine(StateMachine, strict_states=True):
"""State machine for managing Snapshot lifecycle."""
model: Snapshot
# States
queued = State(value=Snapshot.SnapshotStatus.QUEUED, initial=True)
started = State(value=Snapshot.SnapshotStatus.STARTED)
sealed = State(value=Snapshot.SnapshotStatus.SEALED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start', internal=True) |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', internal=True) |
started.to(sealed, cond='is_finished')
)
def __init__(self, snapshot, *args, **kwargs):
self.snapshot = snapshot
super().__init__(snapshot, *args, **kwargs)
def can_start(self) -> bool:
return self.snapshot.seed and self.snapshot.seed.uri
def is_finished(self) -> bool:
return not self.snapshot.has_pending_archiveresults()
def on_started(self):
self.snapshot.create_pending_archiveresults()
self.snapshot.bump_retry_at(seconds=60)
self.snapshot.save()
def on_sealed(self):
self.snapshot.retry_at = None
self.snapshot.save()
class ArchiveResultMachine(StateMachine, strict_states=True):
"""State machine for managing ArchiveResult lifecycle."""
model: ArchiveResult
# States
queued = State(value=ArchiveResult.ArchiveResultStatus.QUEUED, initial=True)
started = State(value=ArchiveResult.ArchiveResultStatus.STARTED)
backoff = State(value=ArchiveResult.ArchiveResultStatus.BACKOFF)
succeeded = State(value=ArchiveResult.ArchiveResultStatus.SUCCEEDED, final=True)
failed = State(value=ArchiveResult.ArchiveResultStatus.FAILED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start', internal=True) |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', internal=True) |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(backoff, cond='is_backoff') |
backoff.to.itself(unless='can_start', internal=True) |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed')
)
def __init__(self, archiveresult, *args, **kwargs):
self.archiveresult = archiveresult
super().__init__(archiveresult, *args, **kwargs)
def can_start(self) -> bool:
return self.archiveresult.snapshot and self.archiveresult.snapshot.is_started()
def is_succeeded(self) -> bool:
return self.archiveresult.output_exists()
def is_failed(self) -> bool:
return not self.archiveresult.output_exists()
def is_backoff(self) -> bool:
return self.archiveresult.status == ArchiveResult.ArchiveResultStatus.BACKOFF
def on_started(self):
self.archiveresult.start_ts = timezone.now()
self.archiveresult.create_output_dir()
self.archiveresult.bump_retry_at(seconds=60)
self.archiveresult.save()
def on_backoff(self):
self.archiveresult.bump_retry_at(seconds=60)
self.archiveresult.save()
def on_succeeded(self):
self.archiveresult.end_ts = timezone.now()
self.archiveresult.save()
def on_failed(self):
self.archiveresult.end_ts = timezone.now()
self.archiveresult.save()
def after_transition(self, event: str, source: State, target: State):
print(f"after '{event}' from '{source.id}' to '{target.id}'")
# self.archiveresult.save_merkle_index()
# self.archiveresult.save_html_index()
# self.archiveresult.save_json_index()
return "after_transition"

View file

@ -12,7 +12,6 @@ from django.views import View
from django.views.generic.list import ListView
from django.views.generic import FormView
from django.db.models import Q
from django.conf import settings
from django.contrib import messages
from django.contrib.auth.mixins import UserPassesTestMixin
from django.views.decorators.csrf import csrf_exempt
@ -21,6 +20,7 @@ from django.utils.decorators import method_decorator
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import archivebox
from core.models import Snapshot
from core.forms import AddLinkForm
@ -32,9 +32,8 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.serve_static import serve_static_with_byterange_support
from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from ..logging_util import printable_filesize
from ..search import query_search_index
from archivebox.logging_util import printable_filesize
from archivebox.search import query_search_index
class HomepageView(View):
@ -69,7 +68,7 @@ class SnapshotView(View):
and embed_path
and os.access(abs_path, os.R_OK)
and abs_path.exists()):
if abs_path.is_dir() and not any(abs_path.glob('*.*')):
if os.path.isdir(abs_path) and not any(abs_path.glob('*.*')):
continue
result_info = {
@ -103,7 +102,7 @@ class SnapshotView(View):
        # iterate through all the files in the snapshot dir and add the biggest ones to the result list
snap_dir = Path(snapshot.link_dir)
assert os.access(snap_dir, os.R_OK) and os.access(snap_dir, os.X_OK)
assert os.path.isdir(snap_dir) and os.access(snap_dir, os.R_OK)
for result_file in (*snap_dir.glob('*'), *snap_dir.glob('*/*')):
extension = result_file.suffix.lstrip('.').lower()
@ -154,7 +153,7 @@ class SnapshotView(View):
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'warc_path': warc_path,
'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
'SAVE_ARCHIVE_DOT_ORG': archivebox.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']),
'best_result': best_result,
@ -500,21 +499,25 @@ class HealthCheckView(View):
def find_config_section(key: str) -> str:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
if key in CONSTANTS_CONFIG:
return 'CONSTANT'
matching_sections = [
section_id for section_id, section in settings.CONFIGS.items() if key in section.model_fields
section_id for section_id, section in CONFIGS.items() if key in section.model_fields
]
section = matching_sections[0] if matching_sections else 'DYNAMIC'
return section
def find_config_default(key: str) -> str:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
if key in CONSTANTS_CONFIG:
return str(CONSTANTS_CONFIG[key])
default_val = None
for config in settings.CONFIGS.values():
for config in CONFIGS.values():
if key in config.model_fields:
default_val = config.model_fields[key].default
break
@ -530,7 +533,9 @@ def find_config_default(key: str) -> str:
return default_val
def find_config_type(key: str) -> str:
for config in settings.CONFIGS.values():
CONFIGS = archivebox.pm.hook.get_CONFIGS()
for config in CONFIGS.values():
if hasattr(config, key):
type_hints = get_type_hints(config)
try:
@ -547,6 +552,7 @@ def key_is_safe(key: str) -> bool:
@render_with_table_view
def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
@ -560,7 +566,7 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# "Aliases": [],
}
for section_id, section in reversed(list(settings.CONFIGS.items())):
for section_id, section in reversed(list(CONFIGS.items())):
for key, field in section.model_fields.items():
rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
@ -570,7 +576,6 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# rows['Documentation'].append(mark_safe(f'Wiki: <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#{key.lower()}">{key}</a>'))
# rows['Aliases'].append(', '.join(find_config_aliases(key)))
section = 'CONSTANT'
for key in CONSTANTS_CONFIG.keys():
rows['Section'].append(section) # section.replace('_', ' ').title().replace(' Config', '')
@ -589,6 +594,8 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
CONFIGS = archivebox.pm.hook.get_CONFIGS()
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
@ -597,7 +604,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
if key in CONSTANTS_CONFIG:
section_header = mark_safe(f'[CONSTANTS] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, hardcoded by ArchiveBox)</small>')
elif key in settings.FLAT_CONFIG:
elif key in FLAT_CONFIG:
section_header = mark_safe(f'data / ArchiveBox.conf &nbsp; [{find_config_section(key)}] &nbsp; <b><code style="color: lightgray">{key}</code></b>')
else:
section_header = mark_safe(f'[DYNAMIC CONFIG] &nbsp; <b><code style="color: lightgray">{key}</code></b> &nbsp; <small>(read-only, calculated at runtime)</small>')
@ -613,7 +620,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': settings.FLAT_CONFIG.get(key, settings.CONFIGS.get(key, None)) if key_is_safe(key) else '********',
'Value': FLAT_CONFIG.get(key, CONFIGS.get(key, None)) if key_is_safe(key) else '********',
},
"help_texts": {
'Key': mark_safe(f'''
@ -635,13 +642,13 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
<code>{find_config_default(key) or '↗️ See in ArchiveBox source code...'}</code>
</a>
<br/><br/>
<p style="display: {"block" if key in settings.FLAT_CONFIG else "none"}">
<p style="display: {"block" if key in FLAT_CONFIG else "none"}">
<i>To change this value, edit <code>data/ArchiveBox.conf</code> or run:</i>
<br/><br/>
<code>archivebox config --set {key}="{
val.strip("'")
if (val := find_config_default(key)) else
(repr(settings.FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
(repr(FLAT_CONFIG[key] if key_is_safe(key) else '********')).strip("'")
}"</code>
</p>
'''),

View file

@ -0,0 +1,69 @@
__package__ = 'archivebox.crawls'
from typing import ClassVar
from rich import print
from django.db.models import QuerySet
from crawls.models import Crawl
from actors.actor import ActorType
class CrawlActor(ActorType[Crawl]):
QUERYSET: ClassVar[QuerySet] = Crawl.objects.filter(status='queued')
CLAIM_WHERE: ClassVar[str] = 'status = "queued"' # the WHERE clause to filter the objects when atomically getting the next object from the queue
CLAIM_SET: ClassVar[str] = 'status = "started"' # the SET clause to claim the object when atomically getting the next object from the queue
CLAIM_ORDER: ClassVar[str] = 'created_at DESC' # the ORDER BY clause to sort the objects with when atomically getting the next object from the queue
CLAIM_FROM_TOP: ClassVar[int] = 50 # the number of objects to consider when atomically getting the next object from the queue
# model_type: Type[ModelType]
MAX_CONCURRENT_ACTORS: ClassVar[int] = 4 # min 2, max 8, up to 60% of available cpu cores
MAX_TICK_TIME: ClassVar[int] = 60 # maximum duration in seconds to process a single object
def claim_sql_where(self) -> str:
"""override this to implement a custom WHERE clause for the atomic claim step e.g. "status = 'queued' AND locked_by = NULL" """
return self.CLAIM_WHERE
def claim_sql_set(self) -> str:
"""override this to implement a custom SET clause for the atomic claim step e.g. "status = 'started' AND locked_by = {self.pid}" """
return self.CLAIM_SET
def claim_sql_order(self) -> str:
"""override this to implement a custom ORDER BY clause for the atomic claim step e.g. "created_at DESC" """
return self.CLAIM_ORDER
def claim_from_top(self) -> int:
"""override this to implement a custom number of objects to consider when atomically claiming the next object from the top of the queue"""
return self.CLAIM_FROM_TOP
def tick(self, obj: Crawl) -> None:
"""override this to process the object"""
print(f'[blue]🏃‍♂️ {self}.tick()[/blue]', obj.abid or obj.id)
# For example:
# do_some_task(obj)
# do_something_else(obj)
# obj._model.objects.filter(pk=obj.pk, status='started').update(status='success')
# raise NotImplementedError('tick() must be implemented by the Actor subclass')
def on_shutdown(self, err: BaseException | None=None) -> None:
print(f'[grey53]🏃‍♂️ {self}.on_shutdown() SHUTTING DOWN[/grey53]', err or '[green](gracefully)[/green]')
# abx.pm.hook.on_actor_shutdown(self)
def on_tick_start(self, obj: Crawl) -> None:
# print(f'🏃‍♂️ {self}.on_tick_start()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_start(self, obj_to_process)
# self.timer = TimedProgress(self.MAX_TICK_TIME, prefix=' ')
pass
def on_tick_end(self, obj: Crawl) -> None:
# print(f'🏃‍♂️ {self}.on_tick_end()', obj.abid or obj.id)
# abx.pm.hook.on_actor_tick_end(self, obj_to_process)
# self.timer.end()
pass
def on_tick_exception(self, obj: Crawl, err: BaseException) -> None:
print(f'[red]🏃‍♂️ {self}.on_tick_exception()[/red]', obj.abid or obj.id, err)
# abx.pm.hook.on_actor_tick_exception(self, obj_to_process, err)

View file

@ -1,13 +1,20 @@
__package__ = 'archivebox.crawls'
from typing import TYPE_CHECKING
from django_stubs_ext.db.models import TypedModelMeta
from datetime import timedelta
from django.db import models
from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.utils import timezone
from django.urls import reverse_lazy
from django.utils import timezone
from statemachine.mixins import MachineMixin
if TYPE_CHECKING:
from core.models import Snapshot
from seeds.models import Seed
@ -42,7 +49,8 @@ class CrawlSchedule(ABIDModel, ModelWithHealthStats):
return self.crawl_set.first()
class Crawl(ABIDModel, ModelWithHealthStats):
class Crawl(ABIDModel, ModelWithHealthStats, MachineMixin):
"""
A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
@ -55,10 +63,22 @@ class Crawl(ABIDModel, ModelWithHealthStats):
abid_prefix = 'crl_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.seed.uri'
abid_subtype_src = 'self.persona_id'
abid_subtype_src = 'self.persona'
abid_rand_src = 'self.id'
abid_drift_allowed = True
state_field_name = 'status'
state_machine_name = 'crawls.statemachines.CrawlMachine'
state_machine_attr = 'sm'
bind_events_as_methods = True
class CrawlStatus(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SEALED = 'sealed', 'Sealed'
status = models.CharField(choices=CrawlStatus.choices, max_length=15, default=CrawlStatus.QUEUED, null=False, blank=False)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
@ -66,6 +86,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
@ -79,7 +100,7 @@ class Crawl(ABIDModel, ModelWithHealthStats):
# schedule = models.JSONField()
# config = models.JSONField()
# snapshot_set: models.Manager['Snapshot']
snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
@ -103,6 +124,28 @@ class Crawl(ABIDModel, ModelWithHealthStats):
def api_docs_url(self) -> str:
return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
def has_pending_archiveresults(self) -> bool:
from core.models import ArchiveResult
pending_statuses = [ArchiveResult.ArchiveResultStatus.QUEUED, ArchiveResult.ArchiveResultStatus.STARTED]
snapshot_ids = self.snapshot_set.values_list('id', flat=True)
pending_archiveresults = ArchiveResult.objects.filter(snapshot_id__in=snapshot_ids, status__in=pending_statuses)
return pending_archiveresults.exists()
def create_root_snapshot(self) -> 'Snapshot':
from core.models import Snapshot
root_snapshot, _ = Snapshot.objects.get_or_create(
crawl=self,
url=self.seed.uri,
)
return root_snapshot
def bump_retry_at(self, seconds: int = 10):
self.retry_at = timezone.now() + timedelta(seconds=seconds)
self.save()
class Outlink(models.Model):
"""A record of a link found on a page, pointing to another page."""

View file

@ -0,0 +1,48 @@
__package__ = 'archivebox.crawls'
from statemachine import State, StateMachine
from crawls.models import Crawl
# State Machine Definitions
#################################################
class CrawlMachine(StateMachine, strict_states=True):
"""State machine for managing Crawl lifecycle."""
model: Crawl
# States
queued = State(value=Crawl.CrawlStatus.QUEUED, initial=True)
started = State(value=Crawl.CrawlStatus.STARTED)
sealed = State(value=Crawl.CrawlStatus.SEALED, final=True)
# Tick Event
tick = (
queued.to.itself(unless='can_start', internal=True) |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', internal=True) |
started.to(sealed, cond='is_finished')
)
def __init__(self, crawl, *args, **kwargs):
self.crawl = crawl
super().__init__(crawl, *args, **kwargs)
def can_start(self) -> bool:
return self.crawl.seed and self.crawl.seed.uri
def is_finished(self) -> bool:
return not self.crawl.has_pending_archiveresults()
def on_started(self):
self.crawl.create_root_snapshot()
self.crawl.bump_retry_at(seconds=10)
self.crawl.save()
def on_sealed(self):
self.crawl.retry_at = None
self.crawl.save()
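Combined with the Snapshot and ArchiveResult machines, this yields the overall cascade: a queued Crawl starts and creates its root Snapshot, the Snapshot creates its queued ArchiveResults, and the Crawl only seals once none of them are pending. A rough sketch of nudging that cascade along (assuming a queued Crawl with a valid Seed already exists in the database):

from crawls.models import Crawl

# Rough sketch of the cascade: Crawl -> root Snapshot -> ArchiveResults.
crawl = Crawl.objects.filter(status=Crawl.CrawlStatus.QUEUED).first()
if crawl and crawl.seed and crawl.seed.uri:
    crawl.sm.tick()    # queued -> started: on_started() creates the root Snapshot
    print(crawl.status, crawl.snapshot_set.count())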

View file

@ -27,43 +27,29 @@ from ..logging_util import (
log_archive_method_finished,
)
from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .singlefile import should_save_singlefile, save_singlefile
from .readability import should_save_readability, save_readability
from .mercury import should_save_mercury, save_mercury
from .htmltotext import should_save_htmltotext, save_htmltotext
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers
ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
def get_default_archive_methods() -> List[ArchiveMethodEntry]:
# TODO: move to abx.pm.hook.get_EXTRACTORS()
return [
('favicon', should_save_favicon, save_favicon),
('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('wget', should_save_wget, save_wget),
# keep title, readability, and htmltotext below wget and singlefile, as they depend on them
('title', should_save_title, save_title),
('readability', should_save_readability, save_readability),
('mercury', should_save_mercury, save_mercury),
('htmltotext', should_save_htmltotext, save_htmltotext),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
# ('favicon', should_save_favicon, save_favicon),
# ('headers', should_save_headers, save_headers),
# ('singlefile', should_save_singlefile, save_singlefile),
# ('pdf', should_save_pdf, save_pdf),
# ('screenshot', should_save_screenshot, save_screenshot),
# ('dom', should_save_dom, save_dom),
# ('wget', should_save_wget, save_wget),
# # keep title, readability, and htmltotext below wget and singlefile, as they depend on them
# ('title', should_save_title, save_title),
# ('readability', should_save_readability, save_readability),
# ('mercury', should_save_mercury, save_mercury),
# ('htmltotext', should_save_htmltotext, save_htmltotext),
# ('git', should_save_git, save_git),
# ('media', should_save_media, save_media),
# ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
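With the legacy hardcoded list commented out, extractors are expected to be discovered through the plugin manager instead, per the TODO above. A rough sketch of what that lookup looks like under the new system (mirroring how other callers in this changeset read plugin hooks):

import abx
# Rough sketch: extractors are now contributed by abx plugins rather than hardcoded here.
EXTRACTORS = abx.as_dict(abx.pm.hook.get_EXTRACTORS())
for name, extractor in EXTRACTORS.items():
    print(name, extractor)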
ARCHIVE_METHODS_INDEXING_PRECEDENCE = [

View file

@ -8,6 +8,8 @@ from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe # type: ignore
from django.core.cache import cache
import abx
from archivebox.misc.system import atomic_write
from archivebox.misc.util import (
enforce_types,
@ -19,7 +21,6 @@ from archivebox.misc.util import (
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from .schema import Link
from ..logging_util import printable_filesize
@ -80,7 +81,9 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
@enforce_types
def link_details_template(link: Link) -> str:
from ..extractors.wget import wget_output_path
from abx_plugin_wget_extractor.wget import wget_output_path
SAVE_ARCHIVE_DOT_ORG = abx.pm.hook.get_FLAT_CONFIG().SAVE_ARCHIVE_DOT_ORG
link_info = link._asdict(extended=True)
@ -102,7 +105,7 @@ def link_details_template(link: Link) -> str:
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG,
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS,
})

View file

@ -8,6 +8,8 @@ from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional, Iterator, Any, Union
import abx
from archivebox.config import VERSION, DATA_DIR, CONSTANTS
from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
@ -19,8 +21,6 @@ from archivebox.misc.util import enforce_types
@enforce_types
def generate_json_index_from_links(links: List[Link], with_headers: bool):
from django.conf import settings
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
@ -33,11 +33,10 @@ def generate_json_index_from_links(links: List[Link], with_headers: bool):
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
'dependencies': settings.BINARIES,
'dependencies': dict(abx.pm.hook.get_BINARIES()),
},
}
if with_headers:
output = {
**MAIN_INDEX_HEADER,

View file

@ -17,9 +17,9 @@ from dataclasses import dataclass, asdict, field, fields
from django.utils.functional import cached_property
from archivebox.config import ARCHIVE_DIR, CONSTANTS
import abx
from plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.config import ARCHIVE_DIR, CONSTANTS
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import ts_to_date_str, parse_date
@ -426,7 +426,10 @@ class Link:
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from ..extractors.wget import wget_output_path
from abx_plugin_wget.wget import wget_output_path
FAVICON_CONFIG = abx.pm.hook.get_CONFIGS().favicon
# TODO: banish this awful duplication from the codebase and import these
# from their respective extractor files
canonical = {

View file

@ -8,9 +8,10 @@ from django.db import models
from django.utils import timezone
from django.utils.functional import cached_property
import abx.archivebox.reads
import abx
import archivebox
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider
from pydantic_pkgr import Binary, BinProvider
from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
@ -180,7 +181,7 @@ class NetworkInterface(ABIDModel, ModelWithHealthStats):
class InstalledBinaryManager(models.Manager):
def get_from_db_or_cache(self, binary: BaseBinary) -> 'InstalledBinary':
def get_from_db_or_cache(self, binary: Binary) -> 'InstalledBinary':
"""Get or create an InstalledBinary record for a Binary on the local machine"""
global _CURRENT_BINARIES
@ -216,7 +217,7 @@ class InstalledBinaryManager(models.Manager):
# if binary was not yet loaded from filesystem, do it now
# this is expensive, we have to find its abspath, version, and sha256, but it's necessary
# to make sure we have a good, up-to-date record of it in the DB & in-memory cache
binary = binary.load(fresh=True)
binary = archivebox.pm.hook.binary_load(binary=binary, fresh=True)
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
@ -291,8 +292,8 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
if not hasattr(self, 'machine'):
self.machine = Machine.objects.current()
if not self.binprovider:
all_known_binproviders = list(abx.archivebox.reads.get_BINPROVIDERS().values())
binary = BaseBinary(name=self.name, binproviders=all_known_binproviders).load(fresh=True)
all_known_binproviders = list(abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values())
binary = archivebox.pm.hook.binary_load(binary=Binary(name=self.name, binproviders=all_known_binproviders), fresh=True)
self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None
if not self.abspath:
self.abspath = self.BINPROVIDER.get_abspath(self.name)
@ -304,16 +305,16 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
super().clean(*args, **kwargs)
@cached_property
def BINARY(self) -> BaseBinary:
for binary in abx.archivebox.reads.get_BINARIES().values():
def BINARY(self) -> Binary:
for binary in abx.as_dict(archivebox.pm.hook.get_BINARIES()).values():
if binary.name == self.name:
return binary
raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
# TODO: we could technically reconstruct it from scratch, but why would we ever want to do that?
@cached_property
def BINPROVIDER(self) -> BaseBinProvider:
for binprovider in abx.archivebox.reads.get_BINPROVIDERS().values():
def BINPROVIDER(self) -> BinProvider:
for binprovider in abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS()).values():
if binprovider.name == self.binprovider:
return binprovider
raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
@ -321,7 +322,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
# maybe not a good idea to provide this? Binary in DB is a record of the binary's config
# whereas a loaded binary is a not-yet saved instance that may not have the same config
# why would we want to load a binary record from the db when it could be freshly loaded?
def load_from_db(self) -> BaseBinary:
def load_from_db(self) -> Binary:
# TODO: implement defaults arg in pydantic_pkgr
# return self.BINARY.load(defaults={
# 'binprovider': self.BINPROVIDER,
@ -330,7 +331,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
# 'sha256': self.sha256,
# })
return BaseBinary.model_validate({
return Binary.model_validate({
**self.BINARY.model_dump(),
'abspath': self.abspath and Path(self.abspath),
'version': self.version,
@ -340,5 +341,5 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
'overrides': self.BINARY.overrides,
})
def load_fresh(self) -> BaseBinary:
return self.BINARY.load(fresh=True)
def load_fresh(self) -> Binary:
return archivebox.pm.hook.binary_load(binary=self.BINARY, fresh=True)
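Binary loading now goes through plugin-manager hooks rather than calling .load()/.load_or_install() on the binary directly. A hedged sketch of the two call shapes used in this file, with a hypothetical wget binary:

import archivebox
from pydantic_pkgr import Binary
from abx_plugin_default_binproviders import apt, brew, env

wget_binary = Binary(name='wget', binproviders=[apt, brew, env])  # hypothetical example binary

# resolve abspath/version/sha256 for a binary that should already be installed
loaded = archivebox.pm.hook.binary_load(binary=wget_binary, fresh=True)

# or resolve it and install it via one of its binproviders if it is missing
installed = archivebox.pm.hook.binary_load_or_install(binary=wget_binary, fresh=True)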

View file

@ -14,6 +14,10 @@ from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from django.utils import timezone
from pydantic_pkgr import Binary
import abx
import archivebox
from archivebox.misc.checks import check_data_folder
from archivebox.misc.util import enforce_types # type: ignore
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
@ -22,7 +26,7 @@ from archivebox.misc.logging import stderr, hint
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.permissions import SudoPermission, IN_DOCKER
from archivebox.config.configfile import (
from archivebox.config.collection import (
write_config_file,
load_all_config,
get_real_name,
@ -195,15 +199,13 @@ def version(quiet: bool=False,
console = Console()
prnt = console.print
from django.conf import settings
from abx.archivebox.base_binary import BaseBinary, apt, brew, env
from abx_plugin_default_binproviders import apt, brew, env
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
from archivebox.config.paths import get_data_locations, get_code_locations
from plugins_auth.ldap.config import LDAP_CONFIG
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
# 0.7.1
@ -242,7 +244,7 @@ def version(quiet: bool=False,
f'SUDO={CONSTANTS.IS_ROOT}',
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
f'LDAP={LDAP_ENABLED}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
prnt()
@ -264,7 +266,8 @@ def version(quiet: bool=False,
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
failures = []
for name, binary in list(settings.BINARIES.items()):
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
for name, binary in list(BINARIES.items()):
if binary.name == 'archivebox':
continue
@ -295,14 +298,15 @@ def version(quiet: bool=False,
prnt()
prnt('[gold3][i] Package Managers:[/gold3]')
for name, binprovider in list(settings.BINPROVIDERS.items()):
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
for name, binprovider in list(BINPROVIDERS.items()):
err = None
if binproviders and binprovider.name not in binproviders:
continue
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
loaded_bin = binprovider.INSTALLER_BINARY or BaseBinary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
abspath = None
if loaded_bin.abspath:
@ -1050,9 +1054,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
# - recommend user re-run with sudo if any deps need to be installed as root
from rich import print
from django.conf import settings
from archivebox import CONSTANTS
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import get_or_create_working_lib_dir
@ -1075,11 +1077,11 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
package_manager_names = ', '.join(
f'[yellow]{binprovider.name}[/yellow]'
for binprovider in list(settings.BINPROVIDERS.values())
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
if not binproviders or (binproviders and binprovider.name in binproviders)
)
print(f'[+] Setting up package managers {package_manager_names}...')
for binprovider in list(settings.BINPROVIDERS.values()):
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
if binproviders and binprovider.name not in binproviders:
continue
try:
@ -1092,7 +1094,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
print()
for binary in list(settings.BINARIES.values()):
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
# obviously must already be installed if we are running
continue
@ -1122,7 +1124,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
result = binary.load_or_install(binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if result and result['loaded_version']:
break
except Exception as e:
@ -1133,7 +1136,8 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
binary.load_or_install(fresh=True, dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if IS_ROOT and LIB_DIR:
with SudoPermission(uid=0):
if ARCHIVEBOX_USER == 0:
@ -1157,7 +1161,7 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
@ -1183,8 +1187,6 @@ def config(config_options_str: Optional[str]=None,
out_dir: Path=DATA_DIR) -> None:
"""Get and set your ArchiveBox project configuration values"""
import abx.archivebox.reads
from rich import print
check_data_folder()
@ -1198,7 +1200,8 @@ def config(config_options_str: Optional[str]=None,
elif config_options_str:
config_options = config_options_str.split('\n')
from django.conf import settings
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIGS = archivebox.pm.hook.get_CONFIGS()
config_options = config_options or []
@ -1208,8 +1211,8 @@ def config(config_options_str: Optional[str]=None,
if search:
if config_options:
config_options = [get_real_name(key) for key in config_options]
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
for config_section in settings.CONFIGS.values():
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
for config_section in CONFIGS.values():
aliases = config_section.aliases
for search_key in config_options:
@ -1228,15 +1231,15 @@ def config(config_options_str: Optional[str]=None,
elif get or no_args:
if config_options:
config_options = [get_real_name(key) for key in config_options]
matching_config = {key: settings.FLAT_CONFIG[key] for key in config_options if key in settings.FLAT_CONFIG}
failed_config = [key for key in config_options if key not in settings.FLAT_CONFIG]
matching_config = {key: FLAT_CONFIG[key] for key in config_options if key in FLAT_CONFIG}
failed_config = [key for key in config_options if key not in FLAT_CONFIG]
if failed_config:
stderr()
stderr('[X] These options failed to get', color='red')
stderr(' {}'.format('\n '.join(config_options)))
raise SystemExit(1)
else:
matching_config = settings.FLAT_CONFIG
matching_config = FLAT_CONFIG
print(printable_config(matching_config))
raise SystemExit(not matching_config)
@ -1257,20 +1260,20 @@ def config(config_options_str: Optional[str]=None,
if key != raw_key:
stderr(f'[i] Note: The config option {raw_key} has been renamed to {key}, please use the new name going forwards.', color='lightyellow')
if key in settings.FLAT_CONFIG:
if key in FLAT_CONFIG:
new_config[key] = val.strip()
else:
failed_options.append(line)
if new_config:
before = settings.FLAT_CONFIG
before = FLAT_CONFIG
matching_config = write_config_file(new_config)
after = {**load_all_config(), **abx.archivebox.reads.get_FLAT_CONFIG()}
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
print(printable_config(matching_config))
side_effect_changes = {}
for key, val in after.items():
if key in settings.FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
if key in FLAT_CONFIG and (str(before[key]) != str(after[key])) and (key not in matching_config):
side_effect_changes[key] = after[key]
# import ipdb; ipdb.set_trace()
@ -1312,7 +1315,7 @@ def schedule(add: bool=False,
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder()
from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
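Throughout this file, settings.BINARIES and settings.BINPROVIDERS are replaced by hook calls that return one mapping per plugin, flattened with abx.as_dict(). A minimal sketch of the new access pattern, assuming plugins are loaded:

import abx
import archivebox

# each plugin contributes its own dict via get_BINARIES()/get_BINPROVIDERS();
# abx.as_dict() merges the per-plugin results into a single flat mapping
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())

for name, binary in BINARIES.items():
    print(name, binary.name)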

View file

@ -201,6 +201,7 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
import archivebox
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.misc.logging import STDERR
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
@ -209,6 +210,8 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
assert lib_dir == archivebox.pm.hook.get_LIB_DIR(), "lib_dir is not the same as the one in the flat config"
if not must_exist and not os.path.isdir(lib_dir):
return True

View file

@ -23,7 +23,7 @@ from archivebox import CONSTANTS # noqa
from ..main import * # noqa
from ..cli import CLI_SUBCOMMANDS
CONFIG = settings.FLAT_CONFIG
CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CLI_COMMAND_NAMES = ", ".join(CLI_SUBCOMMANDS.keys())
if __name__ == '__main__':
@ -55,6 +55,5 @@ if __name__ == '__main__':
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')
prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]')
prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]')
prnt(' archivebox.plugins_extractor.wget.apps.WGET_EXTRACTOR.extract(snap.id) [grey53]# call an extractor directly[/]')
prnt(' snap.archiveresult_set.all() [grey53]# see extractor results[/]')
prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')

View file

@ -5,7 +5,7 @@ import requests
import json as pyjson
import http.cookiejar
from typing import List, Optional, Any
from typing import List, Optional, Any, Callable
from pathlib import Path
from inspect import signature
from functools import wraps
@ -19,14 +19,13 @@ from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try:
import chardet
import chardet # type:ignore
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
detect_encoding = lambda rawdata: "utf-8"
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.constants import CONSTANTS
from .logging import COLOR_DICT
@ -126,6 +125,7 @@ def is_static_file(url: str):
def enforce_types(func):
"""
Enforce function arg and kwarg types at runtime using its python3 type hints
A simpler version of the pydantic @validate_call decorator
"""
# TODO: check return type as well
@ -186,11 +186,11 @@ def str_between(string: str, start: str, end: str=None) -> str:
@enforce_types
def parse_date(date: Any) -> Optional[datetime]:
def parse_date(date: Any) -> datetime:
"""Parse unix timestamps, iso format, and human-readable strings"""
if date is None:
return None
return None # type: ignore
if isinstance(date, datetime):
if date.tzinfo is None:
@ -212,6 +212,8 @@ def parse_date(date: Any) -> Optional[datetime]:
def download_url(url: str, timeout: int=None) -> str:
"""Download the contents of a remote url and return the text"""
from archivebox.config.common import ARCHIVING_CONFIG
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
session = requests.Session()
@ -241,8 +243,12 @@ def download_url(url: str, timeout: int=None) -> str:
return url.rsplit('/', 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
def get_headers(url: str, timeout: int | None=None) -> str:
"""Download the contents of a remote url and return the headers"""
# TODO: get rid of this and use an abx pluggy hook instead
from archivebox.config.common import ARCHIVING_CONFIG
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
try:
@ -283,6 +289,7 @@ def get_headers(url: str, timeout: int=None) -> str:
def ansi_to_html(text: str) -> str:
"""
Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html
Simple way to render colored CLI stdout/stderr in HTML properly; Textual/rich is probably better though.
"""
TEMPLATE = '<span style="color: rgb{}"><br>'
@ -306,13 +313,13 @@ def ansi_to_html(text: str) -> str:
@enforce_types
def dedupe(options: List[str]) -> List[str]:
"""
Deduplicates the given options. Options that come later clobber earlier
conflicting options.
Deduplicates the given CLI args by key=value. Options that come later override earlier.
"""
deduped = {}
for option in options:
deduped[option.split('=')[0]] = option
key = option.split('=')[0]
deduped[key] = option
return list(deduped.values())
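For example (hypothetical call, later duplicates win):

dedupe(['--timeout=10', '--quiet', '--timeout=60'])
# -> ['--timeout=60', '--quiet']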
@ -345,6 +352,9 @@ class ExtendedEncoder(pyjson.JSONEncoder):
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
elif isinstance(obj, Callable):
return str(obj)
return pyjson.JSONEncoder.default(self, obj)

View file

@ -1,14 +1,11 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from ..index.schema import Link
from archivebox.misc.util import (
enforce_types,
)
from archivebox.misc.util import enforce_types
from ..index.schema import Link
from .generic_json import jsonObjectToLink
def parse_line(line: str):

View file

@ -6,8 +6,7 @@ import re
from typing import IO, Iterable, Optional
from configparser import ConfigParser
from pocket import Pocket
import archivebox
from archivebox.config import CONSTANTS
from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write
@ -22,7 +21,7 @@ API_DB_PATH = CONSTANTS.SOURCES_DIR / 'pocket_api.db'
_BROKEN_PROTOCOL_RE = re.compile('^(http[s]?)(:/(?!/))')
def get_pocket_articles(api: Pocket, since=None, page=0):
def get_pocket_articles(api, since=None, page=0):
body, headers = api.get(
state='archive',
sort='oldest',
@ -94,7 +93,9 @@ def should_parse_as_pocket_api(text: str) -> bool:
def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Pocket API"""
from archivebox.plugins_extractor.pocket.config import POCKET_CONFIG
from pocket import Pocket
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
input_buffer.seek(0)
pattern = re.compile(r"^pocket:\/\/(\w+)")
@ -102,7 +103,7 @@ def parse_pocket_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
if should_parse_as_pocket_api(line):
username = pattern.search(line).group(1)
api = Pocket(POCKET_CONFIG.POCKET_CONSUMER_KEY, POCKET_CONFIG.POCKET_ACCESS_TOKENS[username])
api = Pocket(FLAT_CONFIG.POCKET_CONSUMER_KEY, FLAT_CONFIG.POCKET_ACCESS_TOKENS[username])
api.last_since = None
for article in get_pocket_articles(api, since=read_since(username)):

View file

@ -8,9 +8,10 @@ from datetime import datetime
from typing import IO, Iterable, Optional
from configparser import ConfigParser
import abx
from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write
from archivebox.plugins_extractor.readwise.config import READWISE_CONFIG
from ..index.schema import Link
@ -62,26 +63,30 @@ def link_from_article(article: dict, sources: list):
def write_cursor(username: str, since: str):
if not READWISE_CONFIG.READWISE_DB_PATH.exists():
atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
if not READWISE_DB_PATH.exists():
atomic_write(READWISE_DB_PATH, "")
since_file = ConfigParser()
since_file.optionxform = str
since_file.read(READWISE_CONFIG.READWISE_DB_PATH)
since_file.read(READWISE_DB_PATH)
since_file[username] = {"since": since}
with open(READWISE_CONFIG.READWISE_DB_PATH, "w+") as new:
with open(READWISE_DB_PATH, "w+") as new:
since_file.write(new)
def read_cursor(username: str) -> Optional[str]:
if not READWISE_CONFIG.READWISE_DB_PATH.exists():
atomic_write(READWISE_CONFIG.READWISE_DB_PATH, "")
READWISE_DB_PATH = abx.pm.hook.get_CONFIG().READWISE_DB_PATH
if not READWISE_DB_PATH.exists():
atomic_write(READWISE_DB_PATH, "")
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(READWISE_CONFIG.READWISE_DB_PATH)
config_file.read(READWISE_DB_PATH)
return config_file.get(username, "since", fallback=None)
@ -97,12 +102,14 @@ def should_parse_as_readwise_reader_api(text: str) -> bool:
def parse_readwise_reader_api_export(input_buffer: IO[str], **_kwargs) -> Iterable[Link]:
"""Parse bookmarks from the Readwise Reader API"""
READWISE_READER_TOKENS = abx.pm.hook.get_CONFIG().READWISE_READER_TOKENS
input_buffer.seek(0)
pattern = re.compile(r"^readwise-reader:\/\/(\w+)")
for line in input_buffer:
if should_parse_as_readwise_reader_api(line):
username = pattern.search(line).group(1)
api = ReadwiseReaderAPI(READWISE_CONFIG.READWISE_READER_TOKENS[username], cursor=read_cursor(username))
api = ReadwiseReaderAPI(READWISE_READER_TOKENS[username], cursor=read_cursor(username))
for article in get_readwise_reader_articles(api):
yield link_from_article(article, sources=[line])

View file

@ -0,0 +1,39 @@
import sys
import importlib
from pathlib import Path
PKGS_DIR = Path(__file__).parent
VENDORED_PKGS = [
'abx',
# 'pydantic-pkgr',
]
# scan ./pkgs and add all dirs present to list of available VENDORED_PKGS
for subdir in reversed(sorted(PKGS_DIR.iterdir())):
if subdir.is_dir() and subdir.name not in VENDORED_PKGS and not subdir.name.startswith('_'):
VENDORED_PKGS.append(subdir.name)
def load_vendored_pkgs():
"""Add archivebox/vendor to sys.path and import all vendored libraries present within"""
if str(PKGS_DIR) not in sys.path:
sys.path.append(str(PKGS_DIR))
for pkg_name in VENDORED_PKGS:
pkg_dir = PKGS_DIR / pkg_name
assert pkg_dir.is_dir(), f'Required vendored pkg {pkg_name} could not be found in {pkg_dir}'
try:
lib = importlib.import_module(pkg_name)
# print(f"Successfully imported lib from environment {pkg_name}")
except ImportError:
sys.path.append(str(pkg_dir))
try:
lib = importlib.import_module(pkg_name)
# print(f"Successfully imported lib from vendored fallback {pkg_name}: {inspect.getfile(lib)}")
except ImportError as e:
print(f"Failed to import lib from environment or vendored fallback {pkg_name}: {e}", file=sys.stderr)
sys.exit(1)

View file

@ -0,0 +1,21 @@
__label__ = 'Archive.org'
__homepage__ = 'https://archive.org'
import abx
@abx.hookimpl
def get_CONFIG():
from .config import ARCHIVEDOTORG_CONFIG
return {
'ARCHIVEDOTORG_CONFIG': ARCHIVEDOTORG_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
#
# return {
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
# }

View file

@ -1,7 +1,4 @@
__package__ = 'plugins_extractor.archivedotorg'
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
class ArchivedotorgConfig(BaseConfigSet):

View file

@ -0,0 +1,18 @@
[project]
name = "abx-plugin-archivedotorg"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-plugin-curl>=2024.10.24",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_archivedotorg = "abx_plugin_archivedotorg"
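Each plugin package advertises itself under the abx entry-point group like this, which is what allows pip-installed plugins to be discovered at startup. A hedged sketch of that discovery using only the standard library (Python 3.10+):

from importlib.metadata import entry_points

# iterate every installed package that declares an [project.entry-points.abx] entry
for ep in entry_points(group='abx'):
    plugin_module = ep.load()  # e.g. the abx_plugin_archivedotorg module
    print(ep.name, plugin_module.__name__)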

View file

@ -0,0 +1,34 @@
__label__ = 'Chrome'
__author__ = 'ArchiveBox'
import abx
@abx.hookimpl
def get_CONFIG():
from .config import CHROME_CONFIG
return {
'CHROME_CONFIG': CHROME_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CHROME_BINARY
return {
'chrome': CHROME_BINARY,
}
@abx.hookimpl
def ready():
from .config import CHROME_CONFIG
CHROME_CONFIG.validate()
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

View file

@ -1,5 +1,3 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
@ -7,21 +5,22 @@ from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
Binary,
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
import abx
# Depends on Other Plugins:
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from abx_plugin_default_binproviders import apt, brew, env
from abx_plugin_puppeteer.binproviders import PUPPETEER_BINPROVIDER
from abx_plugin_playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
"chromium-browser",
@ -48,12 +47,13 @@ CHROME_BINARY_NAMES_MACOS = [
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
CHROME_APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common',
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
'chromium-browser',
]
@ -80,7 +80,7 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
###################### Config ##########################
class ChromeBinary(BaseBinary):
class ChromeBinary(Binary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
@ -95,7 +95,7 @@ class ChromeBinary(BaseBinary):
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
'packages': CHROME_APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],
@ -104,10 +104,9 @@ class ChromeBinary(BaseBinary):
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:
from archivebox.config.common import STORAGE_CONFIG
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
bin_dir = bin_dir or abx.pm.hook.get_BIN_DIR()
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
if not (binary.abspath and os.path.isfile(binary.abspath)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
@ -121,7 +120,7 @@ class ChromeBinary(BaseBinary):
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
except Exception:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@ -132,14 +131,17 @@ class ChromeBinary(BaseBinary):
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
try:
linux_lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
linux_lock_file.unlink(missing_ok=True)
except Exception:
pass
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
try:
(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink(missing_ok=True)
except Exception:
pass

View file

@ -1,5 +1,3 @@
__package__ = 'plugins_extractor.chrome'
import os
from pathlib import Path
from typing import List, Optional
@ -7,8 +5,8 @@ from typing import List, Optional
from pydantic import Field
from pydantic_pkgr import bin_abspath
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import env
from abx_spec_config.base_configset import BaseConfigSet
from abx_plugin_default_binproviders import env
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
@ -81,15 +79,16 @@ class ChromeConfig(BaseConfigSet):
# Chrome Binary
CHROME_BINARY: str = Field(default='chrome')
CHROME_DEFAULT_ARGS: List[str] = Field(default=[
'--virtual-time-budget=15000',
'--disable-features=DarkMode',
"--run-all-compositor-stages-before-draw",
"--hide-scrollbars",
"--autoplay-policy=no-user-gesture-required",
"--no-first-run",
"--use-fake-ui-for-media-stream",
"--use-fake-device-for-media-stream",
"--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
"--no-first-run", # dont show any first run ui / setup prompts
'--virtual-time-budget=15000', # accellerate any animations on the page by 15s into the future
'--disable-features=DarkMode', # disable dark mode for archiving
"--run-all-compositor-stages-before-draw", # dont draw partially rendered content, wait until everything is ready
"--hide-scrollbars", # hide scrollbars to prevent layout shift / scrollbar visible in screenshots
"--autoplay-policy=no-user-gesture-required", # allow media autoplay without user gesture (e.g. on mobile)
"--use-fake-ui-for-media-stream", # provide fake camera if site tries to request camera access
"--use-fake-device-for-media-stream", # provide fake camera if site tries to request camera access
"--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'", # ignore chrome updates
"--force-gpu-mem-available-mb=4096", # allows for longer full page screenshots https://github.com/puppeteer/puppeteer/issues/5530
])
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
@ -196,6 +195,7 @@ class ChromeConfig(BaseConfigSet):
cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))
# if CHROME_USER_DATA_DIR is set but folder is empty, create a new profile inside it
if not os.path.isfile(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME / 'Preferences'):
STDERR.print(f'[green] + creating new Chrome profile in: {pretty_path(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME)}[/green]')
cmd_args.remove('--no-first-run')

View file

@ -0,0 +1,18 @@
[project]
name = "abx-plugin-chrome"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-spec-pydantic-pkgr>=0.1.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_chrome = "abx_plugin_chrome"

View file

@ -0,0 +1,18 @@
import abx
@abx.hookimpl
def get_CONFIG():
from .config import CURL_CONFIG
return {
'curl': CURL_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CURL_BINARY
return {
'curl': CURL_BINARY,
}
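These two hookimpls are all a minimal plugin needs to expose its config and binaries to the rest of the system. A hypothetical user plugin following the same pattern might look like:

# hypothetical user_plugins/my_plugin/__init__.py, mirroring the pattern above
import abx

@abx.hookimpl
def get_CONFIG():
    from .config import MY_CONFIG       # assumed BaseConfigSet subclass
    return {'my_plugin': MY_CONFIG}

@abx.hookimpl
def get_BINARIES():
    from .binaries import MY_BINARY     # assumed pydantic_pkgr Binary
    return {'mybin': MY_BINARY}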

View file

@ -1,17 +1,17 @@
__package__ = 'plugins_extractor.curl'
__package__ = 'abx_plugin_curl'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from pydantic_pkgr import BinProvider, BinName, Binary
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx_plugin_default_binproviders import apt, brew, env
from .config import CURL_CONFIG
class CurlBinary(BaseBinary):
class CurlBinary(Binary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

View file

@ -1,11 +1,11 @@
__package__ = 'plugins_extractor.curl'
__package__ = 'abx_plugin_curl'
from typing import List, Optional
from pathlib import Path
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG

View file

@ -0,0 +1,18 @@
[project]
name = "abx-plugin-curl"
version = "2024.10.24"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-spec-pydantic-pkgr>=0.1.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_curl = "abx_plugin_curl"

View file

@ -0,0 +1,23 @@
import abx
from typing import Dict
from pydantic_pkgr import (
AptProvider,
BrewProvider,
EnvProvider,
BinProvider,
)
apt = APT_BINPROVIDER = AptProvider()
brew = BREW_BINPROVIDER = BrewProvider()
env = ENV_BINPROVIDER = EnvProvider()
@abx.hookimpl(tryfirst=True)
def get_BINPROVIDERS() -> Dict[str, BinProvider]:
return {
'apt': APT_BINPROVIDER,
'brew': BREW_BINPROVIDER,
'env': ENV_BINPROVIDER,
}
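tryfirst=True makes pluggy call this hookimpl before other plugins' get_BINPROVIDERS implementations, so apt/brew/env land first in the merged mapping. Plugins can also import these providers directly, as the CurlBinary above and GitBinary below do; a minimal hedged sketch of a binary reusing them:

from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import Binary, BinProvider, BinName
from abx_plugin_default_binproviders import apt, brew, env

# hypothetical binary definition that reuses the shared default providers
class WgetBinary(Binary):
    name: BinName = 'wget'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]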

View file

@ -0,0 +1,18 @@
[project]
name = "abx-plugin-default-binproviders"
version = "2024.10.24"
description = "Default BinProviders for ABX (apt, brew, env)"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"pydantic-pkgr>=0.5.4",
"abx-spec-pydantic-pkgr>=0.1.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_default_binproviders = "abx_plugin_default_binproviders"

View file

@ -0,0 +1,29 @@
__label__ = 'Favicon'
__version__ = '2024.10.24'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
__dependencies__ = [
'abx>=0.1.0',
'abx-spec-config>=0.1.0',
'abx-plugin-curl-extractor>=2024.10.24',
]
import abx
@abx.hookimpl
def get_CONFIG():
from .config import FAVICON_CONFIG
return {
'FAVICON_CONFIG': FAVICON_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import FAVICON_EXTRACTOR
# return {
# 'favicon': FAVICON_EXTRACTOR,
# }

View file

@ -1,7 +1,4 @@
__package__ = 'plugins_extractor.favicon'
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
class FaviconConfig(BaseConfigSet):

View file

@ -0,0 +1,18 @@
[project]
name = "abx-plugin-favicon"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-plugin-curl>=2024.10.28",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_favicon = "abx_plugin_favicon"

View file

View file

@ -0,0 +1,29 @@
__package__ = 'abx_plugin_git'
__label__ = 'Git'
import abx
@abx.hookimpl
def get_CONFIG():
from .config import GIT_CONFIG
return {
'GIT_CONFIG': GIT_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import GIT_BINARY
return {
'git': GIT_BINARY,
}
@abx.hookimpl
def get_EXTRACTORS():
from .extractors import GIT_EXTRACTOR
return {
'git': GIT_EXTRACTOR,
}

View file

@ -1,17 +1,17 @@
__package__ = 'plugins_extractor.git'
__package__ = 'abx_plugin_git'
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName
from pydantic_pkgr import BinProvider, BinName, Binary
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx_plugin_default_binproviders import apt, brew, env
from .config import GIT_CONFIG
class GitBinary(BaseBinary):
class GitBinary(Binary):
name: BinName = GIT_CONFIG.GIT_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

View file

@ -1,10 +1,10 @@
__package__ = 'plugins_extractor.git'
__package__ = 'abx_plugin_git'
from typing import List
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
from archivebox.config.common import ARCHIVING_CONFIG

View file

@ -0,0 +1,15 @@
__package__ = 'abx_plugin_git'
# from pathlib import Path
# from .binaries import GIT_BINARY
# class GitExtractor(BaseExtractor):
# name: ExtractorName = 'git'
# binary: str = GIT_BINARY.name
# def get_output_path(self, snapshot) -> Path | None:
# return snapshot.as_link() / 'git'
# GIT_EXTRACTOR = GitExtractor()

View file

@ -16,8 +16,8 @@ from archivebox.misc.util import (
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.plugins_extractor.git.config import GIT_CONFIG
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
from abx_plugin_git.config import GIT_CONFIG
from abx_plugin_git.binaries import GIT_BINARY
def get_output_path():

View file

@ -0,0 +1,19 @@
[project]
name = "abx-plugin-git"
version = "2024.10.28"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-spec-pydantic-pkgr>=0.1.0",
"abx-plugin-default-binproviders>=2024.10.24",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_git = "abx_plugin_git"

View file

@ -0,0 +1,22 @@
__package__ = 'abx_plugin_htmltotext'
__label__ = 'HTML-to-Text'
import abx
@abx.hookimpl
def get_CONFIG():
from .config import HTMLTOTEXT_CONFIG
return {
'HTMLTOTEXT_CONFIG': HTMLTOTEXT_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import FAVICON_EXTRACTOR
# return {
# 'htmltotext': FAVICON_EXTRACTOR,
# }

View file

@ -1,7 +1,4 @@
__package__ = 'plugins_extractor.htmltotext'
from abx.archivebox.base_configset import BaseConfigSet
from abx_spec_config.base_configset import BaseConfigSet
class HtmltotextConfig(BaseConfigSet):

Some files were not shown because too many files have changed in this diff.