diff --git a/Dockerfile b/Dockerfile index e5bcf397..c6358a1e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -300,10 +300,15 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH # Setup ArchiveBox runtime config WORKDIR "$DATA_DIR" RUN openssl rand -hex 16 > /etc/machine-id \ - && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp" + && mkdir -p "/tmp/archivebox" \ + && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp/archivebox" \ + && mkdir -p "/usr/share/archivebox/lib" \ + && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/usr/share/archivebox/lib" \ ENV GOOGLE_API_KEY=no \ GOOGLE_DEFAULT_CLIENT_ID=no \ GOOGLE_DEFAULT_CLIENT_SECRET=no \ + TMP_DIR=/tmp/archivebox \ + LIB_DIR=/usr/share/archivebox/lib \ ALLOWED_HOSTS=* # Print version for nice docker finish summary diff --git a/README.md b/README.md index 3019962d..bda15ae2 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ curl -fsSL 'https://get.archivebox.io' | sh - [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats) - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats) - [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC -- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) +- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) - [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode) - Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!) - Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345) diff --git a/archivebox/Architecture.md b/archivebox/Architecture.md new file mode 100644 index 00000000..c365088a --- /dev/null +++ b/archivebox/Architecture.md @@ -0,0 +1,172 @@ +# ArchiveBox UI + +## Page: Getting Started + +### What do you want to capture? + +- Save some URLs now -> [Add page] + - Paste some URLs to archive now + - Upload a file containing URLs (bookmarks.html export, RSS.xml feed, markdown file, word doc, PDF, etc.) + - Pull in URLs to archive from a remote location (e.g. 
RSS feed URL, remote TXT file, JSON file, etc.) + +- Import URLs from a browser -> [Import page] + - Desktop: Get the ArchiveBox Chrome/Firefox extension + - Mobile: Get the ArchiveBox iOS App / Android App + - Upload a bookmarks.html export file + - Upload a browser_history.sqlite3 export file + +- Import URLs from a 3rd party bookmarking service -> [Sync page] + - Pocket + - Pinboard + - Instapaper + - Wallabag + - Zapier, N8N, IFTTT, etc. + - Upload a bookmarks.html export, bookmarks.json, RSS, etc. file + +- Archive URLs on a schedule -> [Schedule page] + +- Archive an entire website -> [Crawl page] + - What starting URL/domain? + - How deep? + - Follow links to external domains? + - Follow links to parent URLs? + - Maximum number of pages to save? + - Maximum number of requests/minute? + +- Crawl for URLs with a search engine and save automatically + - +- Some URLs on a schedule +- Save an entire website (e.g. `https://example.com`) +- Save results matching a search query (e.g. "site:example.com") +- Save a social media feed (e.g. `https://x.com/user/1234567890`) + +-------------------------------------------------------------------------------- + +### Crawls App + +- Archive an entire website -> [Crawl page] + - What are the seed URLs? + - How many hops to follow? + - Follow links to external domains? + - Follow links to parent URLs? + - Maximum number of pages to save? + - Maximum number of requests/minute? + + +-------------------------------------------------------------------------------- + +### Scheduler App + + +- Archive URLs on a schedule -> [Schedule page] + - What URL(s)? + - How often? + - Do you want to discard old snapshots after x amount of time? + - Any filter rules? + - Want to be notified when changes are detected -> redirect[Alerts app/create new alert(crawl=self)] + + +* Choose Schedule check for new URLs: Schedule.objects.get(pk=xyz) + - 1 minute + - 5 minutes + - 1 hour + - 1 day + + * Choose Destination Crawl to archive URLs using : Crawl.objects.get(pk=xyz) + - Tags + - Persona + - Created By ID + - Config + - Filters + - URL patterns to include + - URL patterns to exclude + - ONLY_NEW= Ignore URLs if already saved once / save URL each time it appears / only save is last save > x time ago + + +-------------------------------------------------------------------------------- + +### Sources App (For managing sources that ArchiveBox pulls URLs in from) + +- Add a new source to pull URLs in from (WIZARD) + - Choose URI: + - [x] Web UI + - [x] CLI + - Local filesystem path (directory to monitor for new files containing URLs) + - Remote URL (RSS/JSON/XML feed) + - Chrome browser profile sync (login using gmail to pull bookmarks/history) + - Pocket, Pinboard, Instapaper, Wallabag, etc. + - Zapier, N8N, IFTTT, etc. + - Local server filesystem path (directory to monitor for new files containing URLs) + - Google drive (directory to monitor for new files containing URLs) + - Remote server FTP/SFTP/SCP path (directory to monitor for new files containing URLs) + - AWS/S3/B2/GCP bucket (directory to monitor for new files containing URLs) + - XBrowserSync (login to pull bookmarks) + - Choose extractor + - auto + - RSS + - Pocket + - etc. + - Specify extra Config, e.g. + - credentials + - extractor tuning options (e.g. verify_ssl, cookies, etc.) 
+ +- Provide credentials for the source + - API Key + - Username / Password + - OAuth + +-------------------------------------------------------------------------------- + +### Alerts App + +- Create a new alert, choose condition + - Get notified when a site goes down ( None: + self.num_uses_failed += 1 + self.save() + + def record_health_success(self) -> None: + self.num_uses_succeeded += 1 + self.save() + + def reset_health(self) -> None: + # move all the failures to successes when resetting so we dont lose track of the total count + self.num_uses_succeeded = self.num_uses_failed + self.num_uses_succeeded + self.num_uses_failed = 0 + self.save() + + @property + def health(self) -> int: + total_uses = max((self.num_uses_failed + self.num_uses_succeeded, 1)) + success_pct = (self.num_uses_succeeded / total_uses) * 100 + return round(success_pct) + + + + + + + + + + #################################################### # Django helpers diff --git a/archivebox/abx/__init__.py b/archivebox/abx/__init__.py index afda37a3..c571a2e3 100644 --- a/archivebox/abx/__init__.py +++ b/archivebox/abx/__init__.py @@ -2,11 +2,11 @@ __package__ = 'abx' import importlib from pathlib import Path -from typing import Dict +from typing import Dict, Callable, List from . import hookspec as base_spec -from .hookspec import hookimpl, hookspec # noqa -from .manager import pm, PluginManager # noqa +from abx.hookspec import hookimpl, hookspec # noqa +from abx.manager import pm, PluginManager # noqa pm.add_hookspecs(base_spec) @@ -23,21 +23,28 @@ def get_plugin_order(plugin_entrypoint: Path): pass return (order, plugin_entrypoint) -def register_hookspecs(hookspecs): +def register_hookspecs(hookspecs: List[str]): + """ + Register all the hookspecs from a list of module names. + """ for hookspec_import_path in hookspecs: hookspec_module = importlib.import_module(hookspec_import_path) pm.add_hookspecs(hookspec_module) def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]: + """ + Find all the plugins in a given directory. Just looks for an __init__.py file. + """ return { f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent - for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order) + for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order) + if plugin_entrypoint.parent.name != 'abx' } # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip" def get_pip_installed_plugins(group='abx'): - """replaces pm.load_setuptools_entrypoints("abx")""" + """replaces pm.load_setuptools_entrypoints("abx"), finds plugins that registered entrypoints via pip""" import importlib.metadata DETECTED_PLUGINS = {} # module_name: module_dir_path @@ -52,6 +59,9 @@ def get_pip_installed_plugins(group='abx'): def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]): + """ + Get the mapping of dir_name: {plugin_id: plugin_dir} for all plugins in the given directories. + """ DETECTED_PLUGINS = {} for plugin_prefix, plugin_dir in plugin_dirs.items(): DETECTED_PLUGINS.update(find_plugins_in_dir(plugin_dir, prefix=plugin_prefix)) @@ -61,6 +71,9 @@ def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]): # Load all plugins from pip packages, archivebox built-ins, and user plugins def load_plugins(plugins_dict: Dict[str, Path]): + """ + Load all the plugins from a dictionary of module names and directory paths. 
+ """ LOADED_PLUGINS = {} for plugin_module, plugin_dir in plugins_dict.items(): # print(f'Loading plugin: {plugin_module} from {plugin_dir}') @@ -71,6 +84,9 @@ def load_plugins(plugins_dict: Dict[str, Path]): return LOADED_PLUGINS def get_registered_plugins(): + """ + Get all the plugins registered with Pluggy. + """ plugins = {} plugin_to_distinfo = dict(pm.list_plugin_distinfo()) for plugin in pm.get_plugins(): @@ -88,3 +104,28 @@ def get_registered_plugins(): return plugins + + +def get_plugin_hooks(plugin_pkg: str | None) -> Dict[str, Callable]: + """ + Get all the functions marked with @hookimpl on a module. + """ + if not plugin_pkg: + return {} + + hooks = {} + + plugin_module = importlib.import_module(plugin_pkg) + for attr_name in dir(plugin_module): + if attr_name.startswith('_'): + continue + try: + attr = getattr(plugin_module, attr_name) + if isinstance(attr, Callable): + hooks[attr_name] = None + pm.parse_hookimpl_opts(plugin_module, attr_name) + hooks[attr_name] = attr + except Exception as e: + print(f'Error getting hookimpls for {plugin_pkg}: {e}') + + return hooks diff --git a/archivebox/abx/archivebox/__init__.py b/archivebox/abx/archivebox/__init__.py index ddbcc4e4..58bbb447 100644 --- a/archivebox/abx/archivebox/__init__.py +++ b/archivebox/abx/archivebox/__init__.py @@ -10,31 +10,21 @@ from pathlib import Path def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]): """Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py""" LOADED_PLUGINS = {} - for plugin_module, plugin_dir in plugins_dict.items(): + for plugin_module, plugin_dir in reversed(plugins_dict.items()): # print(f'Loading plugin: {plugin_module} from {plugin_dir}') - archivebox_plugins_found = [] - # 1. register the plugin module directly in case it contains any look hookimpls (e.g. in __init__.py) - plugin_module_loaded = importlib.import_module(plugin_module) - pm.register(plugin_module_loaded) - if hasattr(plugin_module_loaded, 'PLUGIN'): - archivebox_plugins_found.append(plugin_module_loaded.PLUGIN) + try: + plugin_module_loaded = importlib.import_module(plugin_module) + pm.register(plugin_module_loaded) + except Exception as e: + print(f'Error registering plugin: {plugin_module} - {e}') + # 2. then try to import plugin_module.apps as well if os.access(plugin_dir / 'apps.py', os.R_OK): plugin_apps = importlib.import_module(plugin_module + '.apps') pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class) - if hasattr(plugin_apps, 'PLUGIN'): - archivebox_plugins_found.append(plugin_apps.PLUGIN) - - # 3. 
then try to look for plugin_module.PLUGIN and register it + all its hooks - for ab_plugin in archivebox_plugins_found: - pm.register(ab_plugin) - for hook in ab_plugin.hooks: - hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin - pm.register(hook) - LOADED_PLUGINS[plugin_module] = ab_plugin - # print(f' √ Loaded plugin: {LOADED_PLUGINS}') + # print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}') return LOADED_PLUGINS diff --git a/archivebox/abx/archivebox/base_admindataview.py b/archivebox/abx/archivebox/base_admindataview.py deleted file mode 100644 index 32cf49fc..00000000 --- a/archivebox/abx/archivebox/base_admindataview.py +++ /dev/null @@ -1,38 +0,0 @@ -__package__ = 'abx.archivebox' - -from typing import Dict - -import abx - -from .base_hook import BaseHook, HookType - - -class BaseAdminDataView(BaseHook): - hook_type: HookType = "ADMINDATAVIEW" - - name: str = 'example_admin_data_view_list' - verbose_name: str = 'Data View' - route: str = '/__OVERRIDE_THIS__/' - view: str = 'plugins_example.example.views.example_view_list' - - items: Dict[str, str] = { - 'route': '/', - "name": 'example_admin_data_view_item', - 'view': 'plugins_example.example.views.example_view_item', - } - - @abx.hookimpl - def get_ADMINDATAVIEWS(self): - return [self] - - @abx.hookimpl - def get_ADMIN_DATA_VIEWS_URLS(self): - """routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']""" - route = { - "route": self.route, - "view": self.view, - "name": self.verbose_name, - "items": self.items, - } - return [route] - diff --git a/archivebox/abx/archivebox/base_binary.py b/archivebox/abx/archivebox/base_binary.py index 45735a1b..afa4f192 100644 --- a/archivebox/abx/archivebox/base_binary.py +++ b/archivebox/abx/archivebox/base_binary.py @@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS from archivebox.config.permissions import ARCHIVEBOX_USER import abx -from .base_hook import BaseHook, HookType -class BaseBinProvider(BaseHook, BinProvider): - hook_type: HookType = "BINPROVIDER" - +class BaseBinProvider(BinProvider): # TODO: add install/load/load_or_install methods as abx.hookimpl methods @@ -36,12 +33,12 @@ class BaseBinProvider(BaseHook, BinProvider): def get_BINPROVIDERS(self): return [self] -class BaseBinary(BaseHook, Binary): - hook_type: HookType = "BINARY" +class BaseBinary(Binary): @staticmethod def symlink_to_lib(binary, bin_dir=None) -> None: - bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR + from archivebox.config.common import STORAGE_CONFIG + bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin' if not (binary.abspath and os.access(binary.abspath, os.R_OK)): return @@ -59,9 +56,10 @@ class BaseBinary(BaseHook, Binary): @validate_call def load(self, fresh=False, **kwargs) -> Self: + from archivebox.config.common import STORAGE_CONFIG if fresh: binary = super().load(**kwargs) - self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin') else: # get cached binary from db try: @@ -76,16 +74,18 @@ class BaseBinary(BaseHook, Binary): @validate_call def install(self, **kwargs) -> Self: + from archivebox.config.common import STORAGE_CONFIG binary = super().install(**kwargs) - self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin') return binary @validate_call def load_or_install(self, fresh=False, **kwargs) -> Self: + from archivebox.config.common 
import STORAGE_CONFIG try: binary = self.load(fresh=fresh) if binary and binary.version: - self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR) + self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin') return binary except Exception: pass diff --git a/archivebox/abx/archivebox/base_configset.py b/archivebox/abx/archivebox/base_configset.py index be7b89c3..700d7caa 100644 --- a/archivebox/abx/archivebox/base_configset.py +++ b/archivebox/abx/archivebox/base_configset.py @@ -1,8 +1,13 @@ __package__ = 'abx.archivebox' import os +import sys +import re from pathlib import Path -from typing import Type, Tuple, Callable, ClassVar +from typing import Type, Tuple, Callable, ClassVar, Dict, Any + +import toml +from rich import print from benedict import benedict from pydantic import model_validator, TypeAdapter @@ -11,15 +16,18 @@ from pydantic_settings.sources import TomlConfigSettingsSource from pydantic_pkgr import func_takes_args_or_kwargs -import abx -from .base_hook import BaseHook, HookType from . import toml_util PACKAGE_DIR = Path(__file__).resolve().parent.parent DATA_DIR = Path(os.getcwd()).resolve() +ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf" +ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak" + +AUTOFIXES_HEADER = "[AUTOFIXES]" +AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:" class FlatTomlConfigSettingsSource(TomlConfigSettingsSource): @@ -55,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource): super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data) -class ArchiveBoxBaseConfig(BaseSettings): +class BaseConfigSet(BaseSettings): """ This is the base class for an ArchiveBox ConfigSet. It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables. 
@@ -85,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings): loc_by_alias=False, validate_assignment=True, validate_return=True, - revalidate_instances="always", + revalidate_instances="subclass-instances", ) load_from_defaults: ClassVar[bool] = True @@ -103,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings): ) -> Tuple[PydanticBaseSettingsSource, ...]: """Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables""" - ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf" - ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak" - # import ipdb; ipdb.set_trace() precedence_order = {} @@ -154,27 +159,36 @@ class ArchiveBoxBaseConfig(BaseSettings): def fill_defaults(self): """Populate any unset values using function provided as their default""" - for key, field in self.model_fields.items(): - value = getattr(self, key) - - if isinstance(value, Callable): - # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected - if func_takes_args_or_kwargs(value): - # assemble dict of existing field values to pass to default factory functions - config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False)) - computed_default = field.default(config_so_far) - else: - # otherwise it's a pure function with no args, just call it - computed_default = field.default() - - # coerce/check to make sure default factory return value matches type annotation - TypeAdapter(field.annotation).validate_python(computed_default) - - # set generated default value as final validated value - setattr(self, key, computed_default) + for key in self.model_fields.keys(): + if isinstance(getattr(self, key), Callable): + if self.load_from_defaults: + computed_default = self.get_default_value(key) + # set generated default value as final validated value + setattr(self, key, computed_default) return self - def update_in_place(self, warn=True, **kwargs): + def get_default_value(self, key): + """Get the default value for a given config key""" + field = self.model_fields[key] + value = getattr(self, key) + + if isinstance(value, Callable): + # if value is a function, execute it to get the actual value, passing existing config as a dict arg if expected + if func_takes_args_or_kwargs(value): + # assemble dict of existing field values to pass to default factory functions + config_so_far = benedict(self.model_dump(include=set(self.model_fields.keys()), warnings=False)) + computed_default = field.default(config_so_far) + else: + # otherwise it's a pure function with no args, just call it + computed_default = field.default() + + # coerce/check to make sure default factory return value matches type annotation + TypeAdapter(field.annotation).validate_python(computed_default) + + return computed_default + return value + + def update_in_place(self, warn=True, persist=False, hint='', **kwargs): """ Update the config with new values. Use this sparingly! We should almost never be updating config at runtime. Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment @@ -182,48 +196,106 @@ class ArchiveBoxBaseConfig(BaseSettings): Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it. SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue. 
""" + from archivebox.misc.toml_util import CustomTOMLEncoder + if warn: - print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:') + fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run' + print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr) + + # set the new values in the environment for key, value in kwargs.items(): os.environ[key] = str(value) original_value = getattr(self, key) if warn: print(f' {key}={original_value} -> {value}') + + # if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section + try: + if persist and ARCHIVEBOX_CONFIG_FILE.is_file(): + autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder()) + + existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip() + if AUTOFIXES_HEADER in existing_config: + existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip() + else: + existing_autofixes = '' + + new_config = '\n'.join(line for line in [ + existing_config, + '\n' + AUTOFIXES_HEADER, + AUTOFIXES_SUBHEADER, + existing_autofixes, + autofixes_to_add, + ] if line.strip()).strip() + '\n' + ARCHIVEBOX_CONFIG_FILE.write_text(new_config) + except Exception: + pass self.__init__() + if warn: + print(file=sys.stderr) + return self - def as_legacy_config_schema(self): + @property + def toml_section_header(self): + """Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG""" + class_name = self.__class__.__name__ + return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_') + + + def from_defaults(self) -> Dict[str, Any]: + """Get the dictionary of {key: value} config loaded from the default values""" + class OnlyDefaultsConfig(self.__class__): + load_from_defaults = True + load_from_configfile = False + load_from_environment = False + return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) + + def from_configfile(self) -> Dict[str, Any]: + """Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf""" + class OnlyConfigFileConfig(self.__class__): + load_from_defaults = False + load_from_configfile = True + load_from_environment = False + return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) + + def from_environment(self) -> Dict[str, Any]: + """Get the dictionary of {key: value} config loaded from the environment variables""" + class OnlyEnvironmentConfig(self.__class__): + load_from_defaults = False + load_from_configfile = False + load_from_environment = True + return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys()))) + + def from_computed(self) -> Dict[str, Any]: + """Get the dictionary of {key: value} config loaded from the computed fields""" + return benedict(self.model_dump(include=set(self.model_computed_fields.keys()))) + + + def to_toml_dict(self, defaults=False) -> Dict[str, Any]: + """Get the current config as a TOML-ready dict""" + config_dict = {} + for key, value in benedict(self).items(): + if defaults or value != self.get_default_value(key): + config_dict[key] = value + + return benedict({self.toml_section_header: config_dict}) + + def 
to_toml_str(self, defaults=False) -> str: + """Get the current config as a TOML string""" + from archivebox.misc.toml_util import CustomTOMLEncoder + + toml_dict = self.to_toml_dict(defaults=defaults) + if not toml_dict[self.toml_section_header]: + # if the section is empty, don't write it + toml_dict.pop(self.toml_section_header) + + return toml.dumps(toml_dict, encoder=CustomTOMLEncoder()) + + def as_legacy_config_schema(self) -> Dict[str, Any]: # shim for backwards compatibility with old config schema style model_values = self.model_dump() return benedict({ key: {'type': field.annotation, 'default': model_values[key]} for key, field in self.model_fields.items() }) - - -class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg] - hook_type: ClassVar[HookType] = 'CONFIG' - - # @abx.hookimpl - # def ready(self, settings): - # # reload config from environment, in case it's been changed by any other plugins - # self.__init__() - - - @abx.hookimpl - def get_CONFIGS(self): - try: - return {self.id: self} - except Exception as e: - # raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}') - print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}') - return {} - - @abx.hookimpl - def get_FLAT_CONFIG(self): - try: - return self.model_dump() - except Exception as e: - # raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}') - print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}') - return {} diff --git a/archivebox/abx/archivebox/base_extractor.py b/archivebox/abx/archivebox/base_extractor.py index 7391f106..81ea2200 100644 --- a/archivebox/abx/archivebox/base_extractor.py +++ b/archivebox/abx/archivebox/base_extractor.py @@ -14,7 +14,6 @@ from django.utils import timezone import abx -from .base_hook import BaseHook, HookType from .base_binary import BaseBinary @@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))] CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)] -class BaseExtractor(BaseHook): - hook_type: HookType = 'EXTRACTOR' +class BaseExtractor: name: ExtractorName binary: BinName @@ -51,9 +49,9 @@ class BaseExtractor(BaseHook): def get_output_path(self, snapshot) -> Path: - return Path(self.id.lower()) + return Path(self.__class__.__name__.lower()) - def should_extract(self, snapshot) -> bool: + def should_extract(self, uri: str, config: dict | None=None) -> bool: try: assert self.detect_installed_binary().version except Exception: @@ -197,8 +195,8 @@ class BaseExtractor(BaseHook): @cached_property def BINARY(self) -> BaseBinary: - import abx.archivebox.use - for binary in abx.archivebox.use.get_BINARIES().values(): + import abx.archivebox.reads + for binary in abx.archivebox.reads.get_BINARIES().values(): if binary.name == self.binary: return binary raise ValueError(f'Binary {self.binary} not found') diff --git a/archivebox/abx/archivebox/base_hook.py b/archivebox/abx/archivebox/base_hook.py deleted file mode 100644 index b2dfe58b..00000000 --- a/archivebox/abx/archivebox/base_hook.py +++ /dev/null @@ -1,80 +0,0 @@ -__package__ = 'abx.archivebox' - -import inspect -from huey.api import TaskWrapper - -from pathlib import Path -from typing import Tuple, Literal, ClassVar, get_args -from pydantic import BaseModel, ConfigDict -from django.utils.functional import cached_property - -import abx - -HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 
'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND'] -hook_type_names: Tuple[HookType] = get_args(HookType) - -class BaseHook(BaseModel): - model_config = ConfigDict( - extra="allow", - arbitrary_types_allowed=True, - from_attributes=True, - populate_by_name=True, - validate_defaults=True, - validate_assignment=False, - revalidate_instances="subclass-instances", - ignored_types=(TaskWrapper, cached_property), - ) - - hook_type: ClassVar[HookType] # e.g. = 'CONFIG' - - # verbose_name: str = Field() - - _is_registered: bool = False - _is_ready: bool = False - - - @property - def id(self) -> str: - return self.__class__.__name__ - - @property - def hook_module(self) -> str: - """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet""" - return f'{self.__module__}.{self.__class__.__name__}' - - @property - def hook_file(self) -> Path: - """e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet""" - return Path(inspect.getfile(self.__class__)) - - @property - def plugin_module(self) -> str: - """e.g. plugins_extractor.singlefile""" - return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0] - - @property - def plugin_dir(self) -> Path: - return Path(inspect.getfile(self.__class__)).parent.resolve() - - @property - def admin_url(self) -> str: - # e.g. /admin/environment/config/LdapConfig/ - return f"/admin/environment/{self.hook_type.lower()}/{self.id}/" - - - @abx.hookimpl - def register(self, settings): - """Called when django.apps.AppConfig.ready() is called""" - - # print("REGISTERED HOOK:", self.hook_module) - self._is_registered = True - - - @abx.hookimpl - def ready(self): - """Called when django.apps.AppConfig.ready() is called""" - - assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!" - - # print("READY HOOK:", self.hook_module) - self._is_ready = True diff --git a/archivebox/abx/archivebox/base_plugin.py b/archivebox/abx/archivebox/base_plugin.py deleted file mode 100644 index 5f0d1d0e..00000000 --- a/archivebox/abx/archivebox/base_plugin.py +++ /dev/null @@ -1,154 +0,0 @@ -__package__ = 'abx.archivebox' - -import abx -import inspect -from pathlib import Path - -from django.apps import AppConfig - -from typing import List, Type, Dict -from typing_extensions import Self - -from pydantic import ( - BaseModel, - ConfigDict, - Field, - model_validator, - InstanceOf, - computed_field, -) -from benedict import benedict - -from .base_hook import BaseHook, HookType - -class BasePlugin(BaseModel): - model_config = ConfigDict( - extra='forbid', - arbitrary_types_allowed=True, - populate_by_name=True, - from_attributes=True, - validate_defaults=False, - validate_assignment=False, - revalidate_instances="always", - # frozen=True, - ) - - # Required by AppConfig: - app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name) - verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.) - docs_url: str = Field(default=None) # e.g. 'https://github.com/...' 
- - # All the hooks the plugin will install: - hooks: List[InstanceOf[BaseHook]] = Field(default=[]) - - _is_registered: bool = False - _is_ready: bool = False - - @computed_field - @property - def id(self) -> str: - return self.__class__.__name__ - - @property - def name(self) -> str: - return self.app_label - - # @computed_field - @property - def plugin_module(self) -> str: # DottedImportPath - """ " - Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS). - e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm' - """ - return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0] - - - @property - def plugin_module_full(self) -> str: # DottedImportPath - """e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'""" - return f"{self.__module__}.{self.__class__.__name__}" - - # @computed_field - @property - def plugin_dir(self) -> Path: - return Path(inspect.getfile(self.__class__)).parent.resolve() - - @model_validator(mode='after') - def validate(self) -> Self: - """Validate the plugin's build-time configuration here before it's registered in Django at runtime.""" - - # VERY IMPORTANT: - # preserve references to original default objects, - # pydantic deepcopies them by default which breaks mutability - # see https://github.com/pydantic/pydantic/issues/7608 - # if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example - # and calling .__init__() on one of them will not update the other - self.hooks = self.model_fields['hooks'].default - - assert self.app_label and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name' - - # assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema." - - return self - - @property - def AppConfig(plugin_self) -> Type[AppConfig]: - """Generate a Django AppConfig class for this plugin.""" - - - class PluginAppConfig(AppConfig): - """Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS.""" - name = plugin_self.plugin_module - app_label = plugin_self.app_label - verbose_name = plugin_self.verbose_name - - default_auto_field = 'django.db.models.AutoField' - - # handled by abx.hookimpl ready() - # def ready(self): - # from django.conf import settings - # plugin_self.ready(settings) - - return PluginAppConfig - - @property - def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]: - return benedict({hook.id: hook for hook in self.hooks}) - - @property - def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]: - hooks = benedict({}) - for hook in self.hooks: - hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({}) - hooks[hook.hook_type][hook.id] = hook - return hooks - - - - @abx.hookimpl - def register(self, settings): - from archivebox.config.legacy import bump_startup_progress_bar - - self._is_registered = True - bump_startup_progress_bar() - - # print('β—£----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------β—’') - # print() - - @abx.hookimpl - def ready(self, settings=None): - """Runs any runtime code needed when AppConfig.ready() is called (after all models are imported).""" - - from archivebox.config.legacy import bump_startup_progress_bar - - assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!" 
- self._is_ready = True - - # settings.PLUGINS[self.id]._is_ready = True - bump_startup_progress_bar() - - - @abx.hookimpl - def get_INSTALLED_APPS(self): - return [self.plugin_module] - diff --git a/archivebox/abx/archivebox/base_queue.py b/archivebox/abx/archivebox/base_queue.py deleted file mode 100644 index a50ed4ce..00000000 --- a/archivebox/abx/archivebox/base_queue.py +++ /dev/null @@ -1,106 +0,0 @@ -__package__ = 'abx.archivebox' - -import importlib - -from typing import Dict, List, TYPE_CHECKING -from pydantic import Field, InstanceOf -from benedict import benedict - -if TYPE_CHECKING: - from huey.api import TaskWrapper - -import abx - -from .base_hook import BaseHook, HookType -from .base_binary import BaseBinary - - - -class BaseQueue(BaseHook): - hook_type: HookType = 'QUEUE' - - name: str = Field() # e.g. 'singlefile' - - binaries: List[InstanceOf[BaseBinary]] = Field() - - @property - def tasks(self) -> Dict[str, 'TaskWrapper']: - """Return an dict of all the background worker tasks defined in the plugin's tasks.py file.""" - tasks = importlib.import_module(f"{self.plugin_module}.tasks") - - all_tasks = {} - - for task_name, task in tasks.__dict__.items(): - # if attr is a Huey task and its queue_name matches our hook's queue name - if hasattr(task, "task_class") and task.huey.name == self.name: - all_tasks[task_name] = task - - return benedict(all_tasks) - - def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict: - """Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues'].""" - return { - "huey_class": "huey.SqliteHuey", - "filename": QUEUE_DATABASE_NAME, - "name": self.name, - "results": True, - "store_none": True, - "immediate": False, - "utc": True, - "consumer": { - "workers": 1, - "worker_type": "thread", - "initial_delay": 0.1, # Smallest polling interval, same as -d. - "backoff": 1.15, # Exponential backoff using this rate, -b. - "max_delay": 10.0, # Max possible polling interval, -m. - "scheduler_interval": 1, # Check schedule every second, -s. - "periodic": True, # Enable crontab feature. - "check_worker_health": True, # Enable worker health checks. - "health_check_interval": 1, # Check worker health every second. 
- }, - } - - def get_supervisord_config(self, settings) -> dict: - """Ge the config dict used to tell sueprvisord to start a huey consumer for this queue.""" - return { - "name": f"worker_{self.name}", - "command": f"archivebox manage djangohuey --queue {self.name}", - "stdout_logfile": f"logs/worker_{self.name}.log", - "redirect_stderr": "true", - "autorestart": "true", - "autostart": "false", - } - - def start_supervisord_worker(self, settings, lazy=True): - from queues.supervisor_util import get_or_create_supervisord_process, start_worker - print() - try: - supervisor = get_or_create_supervisord_process(daemonize=False) - except Exception as e: - print(f"Error starting worker for queue {self.name}: {e}") - return None - print() - worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy) - - # Update settings.WORKERS to include this worker - settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({}) - settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True) - - return worker - - @abx.hookimpl - def get_QUEUES(self): - return [self] - - @abx.hookimpl - def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME): - """queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']""" - return { - self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME) - } - - - # @abx.hookimpl - # def ready(self, settings): - # self.start_supervisord_worker(settings, lazy=True) - # super().ready(settings) diff --git a/archivebox/abx/archivebox/base_replayer.py b/archivebox/abx/archivebox/base_replayer.py index 7b51ae47..097a9e94 100644 --- a/archivebox/abx/archivebox/base_replayer.py +++ b/archivebox/abx/archivebox/base_replayer.py @@ -2,14 +2,10 @@ __package__ = 'abx.archivebox' import abx -from .base_hook import BaseHook, HookType - -class BaseReplayer(BaseHook): +class BaseReplayer: """Describes how to render an ArchiveResult in several contexts""" - hook_type: HookType = 'REPLAYER' - url_pattern: str = '*' row_template: str = 'plugins/generic_replayer/templates/row.html' diff --git a/archivebox/abx/archivebox/base_searchbackend.py b/archivebox/abx/archivebox/base_searchbackend.py index 6465dafd..72713ab8 100644 --- a/archivebox/abx/archivebox/base_searchbackend.py +++ b/archivebox/abx/archivebox/base_searchbackend.py @@ -1,33 +1,25 @@ __package__ = 'abx.archivebox' from typing import Iterable, List -from pydantic import Field - -import abx -from .base_hook import BaseHook, HookType +import abc -class BaseSearchBackend(BaseHook): - hook_type: HookType = 'SEARCHBACKEND' - - name: str = Field() # e.g. 'singlefile' - - - # TODO: move these to a hookimpl +class BaseSearchBackend(abc.ABC): + name: str @staticmethod + @abc.abstractmethod def index(snapshot_id: str, texts: List[str]): return @staticmethod + @abc.abstractmethod def flush(snapshot_ids: Iterable[str]): return @staticmethod + @abc.abstractmethod def search(text: str) -> List[str]: raise NotImplementedError("search method must be implemented by subclass") - - @abx.hookimpl - def get_SEARCHBACKENDS(self): - return [self] + diff --git a/archivebox/abx/archivebox/effects.py b/archivebox/abx/archivebox/effects.py new file mode 100644 index 00000000..8f0e54f3 --- /dev/null +++ b/archivebox/abx/archivebox/effects.py @@ -0,0 +1,20 @@ +""" +Hookspec for side-effects that ArchiveBox plugins can trigger. + +(e.g. network requests, binary execution, remote API calls, external library calls, etc.) 
+""" + +__package__ = 'abx.archivebox' + +import abx + + +@abx.hookspec +def check_remote_seed_connection(urls, extractor, credentials, created_by): + pass + + +@abx.hookspec +def exec_extractor(url, extractor, credentials, config): + pass + diff --git a/archivebox/abx/archivebox/events.py b/archivebox/abx/archivebox/events.py new file mode 100644 index 00000000..d3384318 --- /dev/null +++ b/archivebox/abx/archivebox/events.py @@ -0,0 +1,45 @@ +""" +Hookspec for ArchiveBox system events that plugins can hook into. + +Loosely modeled after Django's signals architecture. +https://docs.djangoproject.com/en/5.1/ref/signals/ +""" + +__package__ = 'abx.archivebox' + +import abx + + + +@abx.hookspec +def on_crawl_schedule_tick(crawl_schedule): + pass + + + + +@abx.hookspec +def on_seed_post_save(seed, created=False): + ... + +@abx.hookspec +def on_crawl_post_save(crawl, created=False): + ... + + +@abx.hookspec +def on_snapshot_post_save(snapshot, created=False): + ... + +# @abx.hookspec +# def on_snapshot_post_delete(snapshot): +# ... + + +@abx.hookspec +def on_archiveresult_post_save(archiveresult, created=False): + ... + +# @abx.hookspec +# def on_archiveresult_post_delete(archiveresult): +# ... diff --git a/archivebox/abx/archivebox/hookspec.py b/archivebox/abx/archivebox/hookspec.py index 1d08aa56..bfcb93b8 100644 --- a/archivebox/abx/archivebox/hookspec.py +++ b/archivebox/abx/archivebox/hookspec.py @@ -4,32 +4,49 @@ from typing import Dict, Any from .. import hookspec - -@hookspec -def get_CONFIGS(): - return {} - -@hookspec -def get_EXTRACTORS(): - return {} - -@hookspec -def get_REPLAYERS(): - return {} - -@hookspec -def get_ADMINDATAVIEWS(): - return {} - -@hookspec -def get_QUEUES(): - return {} - -@hookspec -def get_SEARCHBACKENDS(): - return {} +from .base_binary import BaseBinary, BaseBinProvider +from .base_configset import BaseConfigSet +from .base_extractor import BaseExtractor +from .base_searchbackend import BaseSearchBackend @hookspec -def extract(snapshot_id) -> Dict[str, Any]: +def get_PLUGIN() -> Dict[str, Dict[str, Any]]: return {} + +@hookspec +def get_CONFIG() -> Dict[str, BaseConfigSet]: + return {} + + + +@hookspec +def get_EXTRACTORS() -> Dict[str, BaseExtractor]: + return {} + +@hookspec +def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]: + return {} + +# @hookspec +# def get_REPLAYERS() -> Dict[str, BaseReplayer]: +# return {} + +# @hookspec +# def get_ADMINDATAVIEWS(): +# return {} + +# @hookspec +# def get_QUEUES(): +# return {} + + +############################################################## +# provided by abx.pydantic_pkgr.hookspec: +# @hookspec +# def get_BINARIES() -> Dict[str, BaseBinary]: +# return {} + +# @hookspec +# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]: +# return {} diff --git a/archivebox/abx/archivebox/reads.py b/archivebox/abx/archivebox/reads.py new file mode 100644 index 00000000..4b12b560 --- /dev/null +++ b/archivebox/abx/archivebox/reads.py @@ -0,0 +1,160 @@ +__package__ = 'abx.archivebox' + +import importlib +from typing import Dict, Set, Any, TYPE_CHECKING + +from benedict import benedict + +import abx +from .. 
import pm + +if TYPE_CHECKING: + from .base_configset import BaseConfigSet + from .base_binary import BaseBinary, BaseBinProvider + from .base_extractor import BaseExtractor + from .base_searchbackend import BaseSearchBackend + # from .base_replayer import BaseReplayer + # from .base_queue import BaseQueue + # from .base_admindataview import BaseAdminDataView + +# API exposed to ArchiveBox code + +def get_PLUGINS() -> Dict[str, Dict[str, Any]]: + return benedict({ + plugin_id: plugin + for plugin_dict in pm.hook.get_PLUGIN() + for plugin_id, plugin in plugin_dict.items() + }) + +def get_PLUGIN(plugin_id: str) -> Dict[str, Any]: + plugin_info = get_PLUGINS().get(plugin_id, {}) + package = plugin_info.get('package', plugin_info.get('PACKAGE', None)) + if not package: + return {'id': plugin_id, 'hooks': {}} + module = importlib.import_module(package) + hooks = abx.get_plugin_hooks(module.__package__) + assert plugin_info and (plugin_info.get('id') or plugin_info.get('ID') or hooks) + + return benedict({ + 'id': plugin_id, + 'label': getattr(module, '__label__', plugin_id), + 'module': module, + 'package': module.__package__, + 'hooks': hooks, + 'version': getattr(module, '__version__', '999.999.999'), + 'author': getattr(module, '__author__', 'Unknown'), + 'homepage': getattr(module, '__homepage__', 'https://github.com/ArchiveBox/ArchiveBox'), + 'dependencies': getattr(module, '__dependencies__', []), + 'source_code': module.__file__, + **plugin_info, + }) + + +def get_HOOKS() -> Set[str]: + return { + hook_name + for plugin_id in get_PLUGINS().keys() + for hook_name in get_PLUGIN(plugin_id).hooks + } + +def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: + return benedict({ + config_id: configset + for plugin_configs in pm.hook.get_CONFIG() + for config_id, configset in plugin_configs.items() + }) + + +def get_FLAT_CONFIG() -> Dict[str, Any]: + return benedict({ + key: value + for configset in get_CONFIGS().values() + for key, value in configset.model_dump().items() + }) + +def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']: + # TODO: move these to plugins + from abx.archivebox.base_binary import apt, brew, env + builtin_binproviders = { + 'apt': apt, + 'brew': brew, + 'env': env, + } + + return benedict({ + binprovider_id: binprovider + for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()] + for binprovider_id, binprovider in plugin_binproviders.items() + }) + +def get_BINARIES() -> Dict[str, 'BaseBinary']: + return benedict({ + binary_id: binary + for plugin_binaries in pm.hook.get_BINARIES() + for binary_id, binary in plugin_binaries.items() + }) + +def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']: + return benedict({ + extractor_id: extractor + for plugin_extractors in pm.hook.get_EXTRACTORS() + for extractor_id, extractor in plugin_extractors.items() + }) + +# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']: +# return benedict({ +# replayer.id: replayer +# for plugin_replayers in pm.hook.get_REPLAYERS() +# for replayer in plugin_replayers +# }) + +# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']: +# return benedict({ +# admin_dataview.id: admin_dataview +# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS() +# for admin_dataview in plugin_admin_dataviews +# }) + +# def get_QUEUES() -> Dict[str, 'BaseQueue']: +# return benedict({ +# queue.id: queue +# for plugin_queues in pm.hook.get_QUEUES() +# for queue in plugin_queues +# }) + +def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']: + return benedict({ + searchbackend_id: 
searchbackend + for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS() + for searchbackend_id,searchbackend in plugin_searchbackends.items() + }) + + + +def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None): + """Get all the relevant config for the given scope, in correct precedence order""" + + from django.conf import settings + default_config: benedict = defaults or settings.CONFIG + + snapshot = snapshot or (archiveresult and archiveresult.snapshot) + crawl = crawl or (snapshot and snapshot.crawl) + seed = seed or (crawl and crawl.seed) + persona = persona or (crawl and crawl.persona) + + persona_config = persona.config if persona else {} + seed_config = seed.config if seed else {} + crawl_config = crawl.config if crawl else {} + snapshot_config = snapshot.config if snapshot else {} + archiveresult_config = archiveresult.config if archiveresult else {} + extra_config = extra_config or {} + + return { + **default_config, # defaults / config file / environment variables + **persona_config, # lowest precedence + **seed_config, + **crawl_config, + **snapshot_config, + **archiveresult_config, + **extra_config, # highest precedence + } diff --git a/archivebox/abx/archivebox/use.py b/archivebox/abx/archivebox/use.py deleted file mode 100644 index e958b62f..00000000 --- a/archivebox/abx/archivebox/use.py +++ /dev/null @@ -1,130 +0,0 @@ -__package__ = 'abx.archivebox' - -from typing import Dict, Any, TYPE_CHECKING - -from django.utils import timezone -from benedict import benedict - -from .. import pm - -if TYPE_CHECKING: - from .base_hook import BaseHook - from .base_configset import BaseConfigSet - from .base_binary import BaseBinary, BaseBinProvider - from .base_extractor import BaseExtractor - from .base_replayer import BaseReplayer - from .base_queue import BaseQueue - from .base_admindataview import BaseAdminDataView - from .base_searchbackend import BaseSearchBackend - -# API exposed to ArchiveBox code - -def get_PLUGINS(): - return benedict({ - plugin.PLUGIN.id: plugin.PLUGIN - for plugin in pm.get_plugins() - }) - -def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']: - return benedict({ - hook.id: hook - for plugin in PLUGINS.values() - for hook in plugin.hooks - }) - -def get_CONFIGS() -> Dict[str, 'BaseConfigSet']: - return benedict({ - config_id: config - for plugin_configs in pm.hook.get_CONFIGS() - for config_id, config in plugin_configs.items() - }) - -def get_FLAT_CONFIG() -> Dict[str, Any]: - return benedict({ - key: value - for plugin_config_dict in pm.hook.get_FLAT_CONFIG() - for key, value in plugin_config_dict.items() - }) - -def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']: - # TODO: move these to plugins - from abx.archivebox.base_binary import apt, brew, env - builtin_binproviders = [apt, brew, env] - - return benedict({ - binprovider.id: binprovider - for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()] - for binprovider in plugin_binproviders - }) - -def get_BINARIES() -> Dict[str, 'BaseBinary']: - return benedict({ - binary.id: binary - for plugin_binaries in pm.hook.get_BINARIES() - for binary in plugin_binaries - }) - -def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']: - return benedict({ - extractor.id: extractor - for plugin_extractors in pm.hook.get_EXTRACTORS() - for extractor in plugin_extractors - }) - -def get_REPLAYERS() -> Dict[str, 'BaseReplayer']: - return benedict({ - replayer.id: replayer - for plugin_replayers in 
pm.hook.get_REPLAYERS() - for replayer in plugin_replayers - }) - -def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']: - return benedict({ - admin_dataview.id: admin_dataview - for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS() - for admin_dataview in plugin_admin_dataviews - }) - -def get_QUEUES() -> Dict[str, 'BaseQueue']: - return benedict({ - queue.id: queue - for plugin_queues in pm.hook.get_QUEUES() - for queue in plugin_queues - }) - -def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']: - return benedict({ - searchbackend.id: searchbackend - for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS() - for searchbackend in plugin_searchbackends - }) - - -########################### - - -def register_all_hooks(settings): - pm.hook.register(settings=settings) - - - -def extract(url_or_snapshot_id): - from core.models import Snapshot - - url, snapshot_abid, snapshot_id = None, None, None - snapshot = None - if '://' in url_or_snapshot_id: - url = url_or_snapshot_id - try: - snapshot = Snapshot.objects.get(url=url) - except Snapshot.DoesNotExist: - snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now()) - snapshot.save() - elif '-' in url_or_snapshot_id: - snapshot_id = url_or_snapshot_id - snapshot = Snapshot.objects.get(id=snapshot_id) - else: - snapshot_abid = url_or_snapshot_id - snapshot = Snapshot.objects.get(abid=snapshot_abid) - - return pm.hook.extract(snapshot_id=snapshot.id) diff --git a/archivebox/abx/archivebox/writes.py b/archivebox/abx/archivebox/writes.py new file mode 100644 index 00000000..0c4566b4 --- /dev/null +++ b/archivebox/abx/archivebox/writes.py @@ -0,0 +1,133 @@ +__package__ = 'abx.archivebox' + +import importlib +from typing import Dict, Set, Any, TYPE_CHECKING + +from benedict import benedict + +from django.conf import settings + +import abx +from .. import pm + + +@abx.hookimpl +def get_or_create_snapshot(crawl, url, config): + pass + +@abx.hookimpl +def update_crawl_schedule_next_run_at(crawl_schedule, next_run_at): + pass + +@abx.hookimpl +def create_crawl_copy(crawl_to_copy, schedule): + pass + +@abx.hookimpl +def create_crawl(seed, depth, tags_str, persona, created_by, config, schedule): + pass + + + + +def create_crawl_from_ui_action(urls, extractor, credentials, depth, tags_str, persona, created_by, crawl_config): + if seed_is_remote(urls, extractor, credentials): + # user's seed is a remote source that will provide the urls (e.g. RSS feed URL, Pocket API, etc.) 
+ uri, extractor, credentials = abx.archivebox.effects.check_remote_seed_connection(urls, extractor, credentials, created_by) + else: + # user's seed is some raw text they provided to parse for urls, save it to a file then load the file as a Seed + uri = abx.archivebox.writes.write_raw_urls_to_local_file(urls, extractor, tags_str, created_by) # file:///data/sources/some_import.txt + + seed = abx.archivebox.writes.get_or_create_seed(uri=remote_uri, extractor, credentials, created_by) + # abx.archivebox.events.on_seed_created(seed) + + crawl = abx.archivebox.writes.create_crawl(seed=seed, depth=depth, tags_str=tags_str, persona=persona, created_by=created_by, config=crawl_config, schedule=None) + abx.archivebox.events.on_crawl_created(crawl) + + +@abx.hookimpl(specname='on_crawl_schedule_tick') +def create_crawl_from_crawlschedule_if_due(crawl_schedule): + # make sure it's not too early to run this scheduled import (makes this function indepmpotent / safe to call multiple times / every second) + if timezone.now() < crawl_schedule.next_run_at: + # it's not time to run it yet, wait for the next tick + return + else: + # we're going to run it now, bump the next run time so that no one else runs it at the same time as us + abx.archivebox.writes.update_crawl_schedule_next_run_at(crawl_schedule, next_run_at=crawl_schedule.next_run_at + crawl_schedule.interval) + + crawl_to_copy = None + try: + crawl_to_copy = crawl_schedule.crawl_set.first() # alternatively use .last() to copy most recent crawl instead of very first crawl + except Crawl.DoesNotExist: + # there is no template crawl to base the next one off of + # user must add at least one crawl to a schedule that serves as the template for all future repeated crawls + return + + new_crawl = abx.archivebox.writes.create_crawl_copy(crawl_to_copy=crawl_to_copy, schedule=crawl_schedule) + abx.archivebox.events.on_crawl_created(new_crawl) + + +@abx.hookimpl(specname='on_crawl_post_save') +def create_root_snapshot_from_seed(crawl): + # create a snapshot for the seed URI which kicks off the crawl + # only a single extractor will run on it, which will produce outlinks which get added back to the crawl + root_snapshot, created = abx.archivebox.writes.get_or_create_snapshot(crawl=crawl, url=crawl.seed.uri, config={ + 'extractors': ( + abx.archivebox.reads.get_extractors_that_produce_outlinks() + if crawl.seed.extractor == 'auto' else + [crawl.seed.extractor] + ), + **crawl.seed.config, + }) + if created: + abx.archivebox.events.on_snapshot_created(root_snapshot) + abx.archivebox.writes.update_crawl_stats(started_at=timezone.now()) + + +@abx.hookimpl(specname='on_snapshot_created') +def create_archiveresults_pending_from_snapshot(snapshot, config): + config = get_scope_config( + # defaults=settings.CONFIG_FROM_DEFAULTS, + # configfile=settings.CONFIG_FROM_FILE, + # environment=settings.CONFIG_FROM_ENVIRONMENT, + persona=archiveresult.snapshot.crawl.persona, + seed=archiveresult.snapshot.crawl.seed, + crawl=archiveresult.snapshot.crawl, + snapshot=archiveresult.snapshot, + archiveresult=archiveresult, + # extra_config=extra_config, + ) + + extractors = abx.archivebox.reads.get_extractors_for_snapshot(snapshot, config) + for extractor in extractors: + archiveresult, created = abx.archivebox.writes.get_or_create_archiveresult_pending( + snapshot=snapshot, + extractor=extractor, + status='pending' + ) + if created: + abx.archivebox.events.on_archiveresult_created(archiveresult) + + + +@abx.hookimpl(specname='on_archiveresult_updated') +def 
diff --git a/archivebox/abx/django/hookspec.py b/archivebox/abx/django/hookspec.py
index 04bb359b..87f8e520 100644
--- a/archivebox/abx/django/hookspec.py
+++ b/archivebox/abx/django/hookspec.py
@@ -110,6 +110,11 @@ def register_checks():
     """Register django checks with django system checks system"""
     pass
 
+@hookspec
+def register_admin(admin_site):
+    """Register django admin views/models with the main django admin site instance"""
+    pass
+
 
 ###########################################################################################
 
diff --git a/archivebox/abx/django/use.py b/archivebox/abx/django/use.py
index 87d3f9bd..a52ada3b 100644
--- a/archivebox/abx/django/use.py
+++ b/archivebox/abx/django/use.py
@@ -96,3 +96,6 @@ def register_checks():
     """register any django system checks"""
     pm.hook.register_checks()
 
+def register_admin(admin_site):
+    """register any django admin models/views with the main django admin site instance"""
+    pm.hook.register_admin(admin_site=admin_site)
diff --git a/archivebox/api/admin.py b/archivebox/api/admin.py
new file mode 100644
index 00000000..f478815d
--- /dev/null
+++ b/archivebox/api/admin.py
@@ -0,0 +1,31 @@
+__package__ = 'archivebox.api'
+
+from signal_webhooks.admin import WebhookAdmin
+from signal_webhooks.utils import get_webhook_model
+
+from abid_utils.admin import ABIDModelAdmin
+
+from api.models import APIToken
+
+
+class APITokenAdmin(ABIDModelAdmin):
+    list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires')
+    sort_fields = ('abid', 'created_at', 'created_by', 'expires')
+    readonly_fields = ('created_at', 'modified_at', 'abid_info')
+    search_fields = ('id', 'abid', 'created_by__username', 'token')
+    fields = ('created_by', 'token', 'expires', *readonly_fields)
+
+    list_filter = ('created_by',)
+    ordering = ['-created_at']
+    list_per_page = 100
+
+
+class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin):
+    list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display)
+    sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error')
+    readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields)
+
+
+def register_admin(admin_site):
+    admin_site.register(APIToken, APITokenAdmin)
+    admin_site.register(get_webhook_model(), CustomWebhookAdmin)
diff --git a/archivebox/api/apps.py b/archivebox/api/apps.py
index d7b8b0d9..35b1238e 100644
--- a/archivebox/api/apps.py
+++ b/archivebox/api/apps.py
@@ -2,10 +2,14 @@ __package__ = 'archivebox.api'
from django.apps import AppConfig +import abx class APIConfig(AppConfig): name = 'api' - def ready(self): - pass + +@abx.hookimpl +def register_admin(admin_site): + from api.admin import register_admin + register_admin(admin_site) diff --git a/archivebox/api/v1_core.py b/archivebox/api/v1_core.py index 471ddbe7..bcc957ee 100644 --- a/archivebox/api/v1_core.py +++ b/archivebox/api/v1_core.py @@ -6,7 +6,6 @@ from typing import List, Optional, Union, Any from datetime import datetime from django.db.models import Q -from django.shortcuts import get_object_or_404 from django.core.exceptions import ValidationError from django.contrib.auth import get_user_model @@ -16,7 +15,6 @@ from ninja.errors import HttpError from core.models import Snapshot, ArchiveResult, Tag from api.models import APIToken, OutboundWebhook -from abid_utils.abid import ABID from .auth import API_AUTH_METHODS @@ -397,11 +395,70 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True): +# class CrawlSchema(Schema): +# TYPE: str = 'core.models.Crawl' + +# id: UUID +# abid: str + +# modified_at: datetime +# created_at: datetime +# created_by_id: str +# created_by_username: str + +# urls: str +# depth: int +# parser: str + +# # snapshots: List[SnapshotSchema] + +# @staticmethod +# def resolve_created_by_id(obj): +# return str(obj.created_by_id) + +# @staticmethod +# def resolve_created_by_username(obj): +# User = get_user_model() +# return User.objects.get(id=obj.created_by_id).username + +# @staticmethod +# def resolve_snapshots(obj, context): +# if context['request'].with_snapshots: +# return obj.snapshot_set.all().distinct() +# return Snapshot.objects.none() + + +# @router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl") +# def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False): +# """Get a specific Crawl by id or abid.""" +# crawl = None +# request.with_snapshots = with_snapshots +# request.with_archiveresults = with_archiveresults + +# try: +# crawl = Crawl.objects.get(abid__icontains=crawl_id) +# except Exception: +# pass + +# try: +# crawl = crawl or Crawl.objects.get(id__icontains=crawl_id) +# except Exception: +# pass +# return crawl + + +# [..., CrawlSchema] @router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any") def get_any(request, abid: str): request.with_snapshots = False request.with_archiveresults = False + if abid.startswith(APIToken.abid_prefix): + raise HttpError(403, 'APIToken objects are not accessible via REST API') + + if abid.startswith(OutboundWebhook.abid_prefix): + raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API') + response = None try: response = response or get_snapshot(request, abid) @@ -417,11 +474,13 @@ def get_any(request, abid: str): response = response or get_tag(request, abid) except Exception: pass - - if abid.startswith(APIToken.abid_prefix): - raise HttpError(403, 'APIToken objects are not accessible via REST API') - if abid.startswith(OutboundWebhook.abid_prefix): - raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API') + # try: + # response = response or get_crawl(request, abid) + # except Exception: + # pass - raise HttpError(404, 'Object with given ABID not found') + if not response: + raise HttpError(404, 'Object with given ABID not found') + + return response diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index ab532a04..57750918 100644 --- a/archivebox/cli/__init__.py +++ 
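(Aside, not part of the diff.) The reworked `get_any()` above refuses secret-bearing ABID prefixes before attempting any lookups, tries each object type in turn, and only returns 404 once every resolver has failed. A generic sketch of that dispatch order, with made-up prefixes and resolver callables standing in for the real `get_snapshot`/`get_archiveresult`/`get_tag` views:

```python
from typing import Callable, Iterable, Optional

RESTRICTED_PREFIXES = ('apt_', 'whk_')   # made-up example prefixes for APIToken / OutboundWebhook ABIDs

def resolve_any(abid: str, resolvers: Iterable[Callable[[str], Optional[object]]]) -> object:
    # refuse secret-bearing object types up front, before attempting any lookups
    if abid.startswith(RESTRICTED_PREFIXES):
        raise PermissionError('This object type is not accessible via the REST API')   # ~ HttpError(403, ...)
    for resolver in resolvers:
        try:
            obj = resolver(abid)
            if obj is not None:
                return obj              # first object type that matches wins
        except Exception:
            continue                    # lookup failed, fall through to the next object type
    raise LookupError('Object with given ABID not found')                              # ~ HttpError(404, ...)

# usage sketch: try snapshots first, then fall back to other object types
print(resolve_any('snp_01234', [lambda a: None, lambda a: {'abid': a, 'type': 'Snapshot'}]))
```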
b/archivebox/cli/__init__.py @@ -164,13 +164,18 @@ def run_subcommand(subcommand: str, # print('DATA_DIR is', DATA_DIR) # print('pwd is', os.getcwd()) - cmd_requires_db = subcommand in archive_cmds + cmd_requires_db = (subcommand in archive_cmds) init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args check_db = cmd_requires_db and not init_pending setup_django(in_memory_db=subcommand in fake_db, check_db=check_db) + for ignore_pattern in ('help', '-h', '--help', 'version', '--version'): + if ignore_pattern in sys.argv[:4]: + cmd_requires_db = False + break + if subcommand in archive_cmds: if cmd_requires_db: check_migrations() diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index d70352e0..a3e96681 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -5,5 +5,34 @@ from .paths import ( DATA_DIR, # noqa ARCHIVE_DIR, # noqa ) -from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa +from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa from .version import VERSION # noqa + + +import abx + + +# @abx.hookimpl +# def get_INSTALLED_APPS(): +# return ['config'] + + +@abx.hookimpl +def get_CONFIG(): + from .common import ( + SHELL_CONFIG, + STORAGE_CONFIG, + GENERAL_CONFIG, + SERVER_CONFIG, + ARCHIVING_CONFIG, + SEARCH_BACKEND_CONFIG, + ) + return { + 'SHELL_CONFIG': SHELL_CONFIG, + 'STORAGE_CONFIG': STORAGE_CONFIG, + 'GENERAL_CONFIG': GENERAL_CONFIG, + 'SERVER_CONFIG': SERVER_CONFIG, + 'ARCHIVING_CONFIG': ARCHIVING_CONFIG, + 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG, + } + diff --git a/archivebox/config/apps.py b/archivebox/config/apps.py deleted file mode 100644 index e56a9179..00000000 --- a/archivebox/config/apps.py +++ /dev/null @@ -1,57 +0,0 @@ -__package__ = 'archivebox.config' - -from typing import List -from pydantic import InstanceOf - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_hook import BaseHook - - -from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa -from .common import ( - ShellConfig, # noqa: F401 - StorageConfig, # noqa: F401 - GeneralConfig, # noqa: F401 - ServerConfig, # noqa: F401 - ArchivingConfig, # noqa: F401 - SearchBackendConfig, # noqa: F401 - SHELL_CONFIG, - STORAGE_CONFIG, - GENERAL_CONFIG, - SERVER_CONFIG, - ARCHIVING_CONFIG, - SEARCH_BACKEND_CONFIG, -) - -###################### Config ########################## - - -class ConfigPlugin(BasePlugin): - app_label: str = 'CONFIG' - verbose_name: str = 'Configuration' - - hooks: List[InstanceOf[BaseHook]] = [ - SHELL_CONFIG, - GENERAL_CONFIG, - STORAGE_CONFIG, - SERVER_CONFIG, - ARCHIVING_CONFIG, - SEARCH_BACKEND_CONFIG, - ] - - -PLUGIN = ConfigPlugin() -DJANGO_APP = PLUGIN.AppConfig - - - -# # register django apps -# @abx.hookimpl -# def get_INSTALLED_APPS(): -# return [DJANGO_APP.name] - -# # register configs -# @abx.hookimpl -# def register_CONFIG(): -# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values() - diff --git a/archivebox/config/common.py b/archivebox/config/common.py index b17fde09..e9903d41 100644 --- a/archivebox/config/common.py +++ b/archivebox/config/common.py @@ -1,18 +1,18 @@ __package__ = 'archivebox.config' +import os import sys import shutil - +import tempfile from typing import Dict, Optional from pathlib import Path from rich import print -from pydantic import Field, field_validator, computed_field +from pydantic import Field, field_validator, computed_field, model_validator from django.utils.crypto import 
get_random_string from abx.archivebox.base_configset import BaseConfigSet - from .constants import CONSTANTS from .version import get_COMMIT_HASH, get_BUILD_TIME from .permissions import IN_DOCKER @@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet): VERSIONS_AVAILABLE: bool = False # .check_for_update.get_versions_available_on_github(c)}, CAN_UPGRADE: bool = False # .check_for_update.can_upgrade(c)}, - @computed_field @property def TERM_WIDTH(self) -> int: @@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig() class StorageConfig(BaseConfigSet): + # TMP_DIR must be a local, fast, readable/writable dir by archivebox user, + # must be a short path due to unix path length restrictions for socket files (<100 chars) + # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets + TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR) + + # LIB_DIR must be a local, fast, readable/writable dir by archivebox user, + # must be able to contain executable binaries (up to 5GB size) + # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow + LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR) + OUTPUT_PERMISSIONS: str = Field(default='644') RESTRICT_FILE_NAMES: str = Field(default='windows') ENFORCE_ATOMIC_WRITES: bool = Field(default=True) diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index e8ea9958..b8019f99 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -1,6 +1,5 @@ __package__ = 'archivebox.config' -import os import re import sys @@ -97,14 +96,10 @@ class ConstantsDict(Mapping): # Runtime dirs TMP_DIR_NAME: str = 'tmp' - TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID + DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323 + LIB_DIR_NAME: str = 'lib' - LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE - LIB_PIP_DIR: Path = LIB_DIR / 'pip' - LIB_NPM_DIR: Path = LIB_DIR / 'npm' - LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers' - LIB_BIN_DIR: Path = LIB_DIR / 'bin' - BIN_DIR: Path = LIB_BIN_DIR + DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker # Config constants TIMEZONE: str = 'UTC' @@ -198,91 +193,7 @@ class ConstantsDict(Mapping): ".archivebox_id", "Dockerfile", )) - - CODE_LOCATIONS = benedict({ - 'PACKAGE_DIR': { - 'path': (PACKAGE_DIR).resolve(), - 'enabled': True, - 'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable - }, - 'TEMPLATES_DIR': { - 'path': TEMPLATES_DIR.resolve(), - 'enabled': True, - 'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK), # read + list - }, - 'CUSTOM_TEMPLATES_DIR': { - 'path': CUSTOM_TEMPLATES_DIR.resolve(), - 'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR), - 'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK), # read - }, - 'USER_PLUGINS_DIR': { - 'path': USER_PLUGINS_DIR.resolve(), - 'enabled': os.path.isdir(USER_PLUGINS_DIR), - 'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK), # read - }, - 'LIB_DIR': { - 'path': LIB_DIR.resolve(), - 'enabled': True, - 'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK), # read + write - }, - }) - DATA_LOCATIONS = benedict({ - "DATA_DIR": { - "path": DATA_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK), - "is_mount": 
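(Aside, not part of the diff.) The `TMP_DIR` comment above ("<100 chars") refers to the AF_UNIX `sun_path` limit, roughly 108 bytes on Linux and 104 on macOS: socket files such as `supervisord.sock` created under a deeply nested data dir will fail to bind. A rough sketch of that constraint, with a hypothetical `tmp_dir_is_socket_safe()` helper:

```python
from pathlib import Path

SUN_PATH_LIMIT = 108   # typical Linux limit for AF_UNIX socket paths; macOS is ~104

def tmp_dir_is_socket_safe(tmp_dir: Path, socket_name: str = 'supervisord.sock') -> bool:
    """Hypothetical helper: would a socket file created inside tmp_dir fit under the limit?"""
    return len(str(tmp_dir / socket_name).encode()) < SUN_PATH_LIMIT

print(tmp_dir_is_socket_safe(Path('/tmp/archivebox/abc5d851')))                    # True: short path binds fine
print(tmp_dir_is_socket_safe(Path('/mnt/nas/archive/' + 'deeply-nested/' * 10)))   # False: too long to bind
```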
os.path.ismount(DATA_DIR.resolve()), - }, - "CONFIG_FILE": { - "path": CONFIG_FILE.resolve(), - "enabled": True, - "is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK), - }, - "SQL_INDEX": { - "path": DATABASE_FILE.resolve(), - "enabled": True, - "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), - "is_mount": os.path.ismount(DATABASE_FILE.resolve()), - }, - "QUEUE_DATABASE": { - "path": QUEUE_DATABASE_FILE.resolve(), - "enabled": True, - "is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK), - "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()), - }, - "ARCHIVE_DIR": { - "path": ARCHIVE_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK), - "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()), - }, - "SOURCES_DIR": { - "path": SOURCES_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK), - }, - "PERSONAS_DIR": { - "path": PERSONAS_DIR.resolve(), - "enabled": os.path.isdir(PERSONAS_DIR), - "is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK), # read + write - }, - "LOGS_DIR": { - "path": LOGS_DIR.resolve(), - "enabled": True, - "is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK), # read + write - }, - 'TMP_DIR': { - 'path': TMP_DIR.resolve(), - 'enabled': True, - 'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK), # read + write - }, - # "CACHE_DIR": { - # "path": CACHE_DIR.resolve(), - # "enabled": True, - # "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write - # }, - }) @classmethod def __getitem__(cls, key: str): diff --git a/archivebox/config/legacy.py b/archivebox/config/legacy.py index f53a9b29..99b497ca 100644 --- a/archivebox/config/legacy.py +++ b/archivebox/config/legacy.py @@ -50,13 +50,11 @@ from ..misc.logging import ( ) from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG -from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.wget.apps import WGET_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG +from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG +from archivebox.plugins_extractor.wget.config import WGET_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG ANSI = SHELL_CONFIG.ANSI -LDAP = LDAP_CONFIG.LDAP_ENABLED ############################### Config Schema ################################## @@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = { 'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(), - 'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(), - # 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(), # 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(), @@ -262,6 +258,9 @@ def load_config_val(key: str, elif type is list or type is dict: return json.loads(val) + + elif type is Path: + return Path(val) raise Exception('Config values can only be str, bool, int, or json') @@ -578,7 +577,7 @@ def setup_django(out_dir: Path 
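(Aside, not part of the diff.) `load_config_val()` in config/legacy.py now also parses `Path`-typed values, presumably so the new Path-typed `TMP_DIR`/`LIB_DIR` options can be loaded like any other config key. A condensed sketch of that type dispatch; `parse_config_value()` is a made-up name and the real function covers more cases:

```python
import json
from pathlib import Path

def parse_config_value(val: str, type):
    if type is bool:
        return val.lower() in ('true', 'yes', '1')
    if type is int:
        return int(val)
    if type is list or type is dict:
        return json.loads(val)
    if type is Path:
        return Path(val)        # the newly supported case
    if type is str:
        return val
    raise Exception('Config values can only be str, bool, int, Path, or json')

assert parse_config_value('/data/tmp/abc5d851', Path) == Path('/data/tmp/abc5d851')
assert parse_config_value('true', bool) is True
```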
| None=None, check_db=False, config: benedict=CON with SudoPermission(uid=0): # running as root is a special case where it's ok to be a bit slower # make sure data dir is always owned by the correct user - os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"') + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null') os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null') bump_startup_progress_bar() diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py index 217dfbe9..1f582881 100644 --- a/archivebox/config/paths.py +++ b/archivebox/config/paths.py @@ -1,12 +1,16 @@ __package__ = 'archivebox.config' import os +import socket import hashlib +import tempfile import platform from pathlib import Path from functools import cache from datetime import datetime +from benedict import benedict + from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER ############################################################################################# @@ -41,7 +45,8 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str: try: # only persist collection_id file if we already have an index.sqlite3 file present # otherwise we might be running in a directory that is not a collection, no point creating cruft files - if os.path.isfile(DATABASE_FILE) and os.access(DATA_DIR, os.W_OK) or force_create: + collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK) + if collection_is_active or force_create: collection_id_file.write_text(collection_id) # if we're running as root right now, make sure the collection_id file is owned by the archivebox user @@ -87,7 +92,7 @@ def get_machine_type() -> str: return LIB_DIR_SCOPE -def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool: +def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool: """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)""" current_uid, current_gid = os.geteuid(), os.getegid() uid, gid = uid or current_uid, gid or current_gid @@ -100,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No test_file.unlink() return True except (IOError, OSError, PermissionError): - pass - + if chown: + # try fixing it using sudo permissions + with SudoPermission(uid=uid, fallback=fallback): + os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null') + return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False) return False +def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool: + """Check if a given directory can contain unix sockets (e.g. 
/tmp/supervisord.sock)""" + from archivebox.logging_util import pretty_path + + try: + socket_path = str(dir_path / '.test_socket.sock') + s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) + try: + os.remove(socket_path) + except OSError: + pass + s.bind(socket_path) + s.close() + try: + os.remove(socket_path) + except OSError: + pass + except Exception as e: + raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e + + return True + + +def create_and_chown_dir(dir_path: Path) -> None: + with SudoPermission(uid=0, fallback=True): + dir_path.mkdir(parents=True, exist_ok=True) + os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null') + os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &') + +@cache +def get_or_create_working_tmp_dir(autofix=True, quiet=False): + from archivebox import CONSTANTS + from archivebox.config.common import STORAGE_CONFIG + from archivebox.misc.checks import check_tmp_dir + + # try a few potential directories in order of preference + CANDIDATES = [ + STORAGE_CONFIG.TMP_DIR, # + CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/ + Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512 + Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512 + Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512 + Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512 + Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d + Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5 + ] + for candidate in CANDIDATES: + try: + create_and_chown_dir(candidate) + except Exception: + pass + if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True): + if autofix and STORAGE_CONFIG.TMP_DIR != candidate: + STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet) + return candidate + + if not quiet: + raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!') + +@cache +def get_or_create_working_lib_dir(autofix=True, quiet=False): + from archivebox import CONSTANTS + from archivebox.config.common import STORAGE_CONFIG + from archivebox.misc.checks import check_lib_dir + + # try a few potential directories in order of preference + CANDIDATES = [ + STORAGE_CONFIG.LIB_DIR, # + CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker + Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5 + *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5 + Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5 + ] + + for candidate in CANDIDATES: + try: + create_and_chown_dir(candidate) + except Exception: + pass + if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True): + if autofix and STORAGE_CONFIG.LIB_DIR != candidate: + STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet) + return candidate + + if not quiet: + raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!') + + + +@cache +def get_data_locations(): + from archivebox.config import CONSTANTS + from archivebox.config.common import STORAGE_CONFIG + + return 
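(Aside, not part of the diff.) `get_or_create_working_tmp_dir()` and `get_or_create_working_lib_dir()` both walk a preference-ordered list of candidate directories and settle on the first one that is actually creatable and writable. The same pattern, reduced to a generic helper with made-up names:

```python
import tempfile
from pathlib import Path

def first_writable_dir(candidates: list[Path]) -> Path:
    """Made-up helper: create each candidate if needed and return the first one we can write to."""
    for candidate in candidates:
        try:
            candidate.mkdir(parents=True, exist_ok=True)
            probe = candidate / '.write_test'
            probe.write_text('ok')
            probe.unlink()
            return candidate
        except OSError:
            continue          # read-only mount / permission denied: try the next location
    raise OSError(f'No writable directory found, tried: {candidates}')

print(first_writable_dir([Path('/var/run/example'), Path(tempfile.gettempdir()) / 'example']))
```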
benedict({ + "DATA_DIR": { + "path": DATA_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK), + "is_mount": os.path.ismount(DATA_DIR.resolve()), + }, + "CONFIG_FILE": { + "path": CONSTANTS.CONFIG_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK), + }, + "SQL_INDEX": { + "path": DATABASE_FILE.resolve(), + "enabled": True, + "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK), + "is_mount": os.path.ismount(DATABASE_FILE.resolve()), + }, + "QUEUE_DATABASE": { + "path": CONSTANTS.QUEUE_DATABASE_FILE, + "enabled": True, + "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK), + "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE), + }, + "ARCHIVE_DIR": { + "path": ARCHIVE_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK), + "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()), + }, + "SOURCES_DIR": { + "path": CONSTANTS.SOURCES_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK), + }, + "PERSONAS_DIR": { + "path": CONSTANTS.PERSONAS_DIR.resolve(), + "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR), + "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write + }, + "LOGS_DIR": { + "path": CONSTANTS.LOGS_DIR.resolve(), + "enabled": True, + "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write + }, + 'TMP_DIR': { + 'path': STORAGE_CONFIG.TMP_DIR.resolve(), + 'enabled': True, + 'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK), # read + write + }, + # "CACHE_DIR": { + # "path": CACHE_DIR.resolve(), + # "enabled": True, + # "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write + # }, + }) + +@cache +def get_code_locations(): + from archivebox.config import CONSTANTS + from archivebox.config.common import STORAGE_CONFIG + + return benedict({ + 'PACKAGE_DIR': { + 'path': (PACKAGE_DIR).resolve(), + 'enabled': True, + 'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable + }, + 'TEMPLATES_DIR': { + 'path': CONSTANTS.TEMPLATES_DIR.resolve(), + 'enabled': True, + 'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list + }, + 'CUSTOM_TEMPLATES_DIR': { + 'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(), + 'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR), + 'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK), # read + }, + 'USER_PLUGINS_DIR': { + 'path': CONSTANTS.USER_PLUGINS_DIR.resolve(), + 'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR), + 'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read + }, + 'LIB_DIR': { + 'path': STORAGE_CONFIG.LIB_DIR.resolve(), + 'enabled': 
True, + 'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK), # read + write + }, + }) + # @cache diff --git a/archivebox/config/views.py b/archivebox/config/views.py index eb1adbe8..db2c7eaa 100644 --- a/archivebox/config/views.py +++ b/archivebox/config/views.py @@ -2,6 +2,7 @@ __package__ = 'abx.archivebox' import os import inspect +from pathlib import Path from typing import Any, List, Dict, cast from benedict import benedict @@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink +import abx.archivebox.reads + from archivebox.config import CONSTANTS from archivebox.misc.util import parse_date @@ -82,8 +85,12 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: if '_BINARY' in key or '_VERSION' in key } - for plugin in settings.PLUGINS.values(): - for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): + for plugin_id, plugin in abx.archivebox.reads.get_PLUGINS().items(): + plugin = abx.archivebox.reads.get_PLUGIN(plugin_id) + if not plugin.hooks.get('get_BINARIES'): + continue + + for binary in plugin.hooks.get_BINARIES().values(): try: installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary) binary = installed_binary.load_from_db() @@ -92,7 +99,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext: rows['Binary Name'].append(ItemLink(binary.name, key=binary.name)) rows['Found Version'].append(f'βœ… {binary.loaded_version}' if binary.loaded_version else '❌ missing') - rows['From Plugin'].append(plugin.plugin_module) + rows['From Plugin'].append(plugin.package) rows['Provided By'].append( ', '.join( f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name @@ -128,11 +135,16 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: binary = None plugin = None - for loaded_plugin in settings.PLUGINS.values(): - for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values(): - if loaded_binary.name == key: - binary = loaded_binary - plugin = loaded_plugin + for plugin_id in abx.archivebox.reads.get_PLUGINS().keys(): + loaded_plugin = abx.archivebox.reads.get_PLUGIN(plugin_id) + try: + for loaded_binary in loaded_plugin.hooks.get_BINARIES().values(): + if loaded_binary.name == key: + binary = loaded_binary + plugin = loaded_plugin + # break # last write wins + except Exception as e: + print(e) assert plugin and binary, f'Could not find a binary matching the specified name: {key}' @@ -149,7 +161,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: "name": binary.name, "description": binary.abspath, "fields": { - 'plugin': plugin.name, + 'plugin': plugin.package, 'binprovider': binary.loaded_binprovider, 'abspath': binary.loaded_abspath, 'version': binary.loaded_version, @@ -170,28 +182,68 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext: assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' 
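(Aside, not part of the diff.) The admin views above treat every plugin hook as optional: a plugin that doesn't implement `get_CONFIG`/`get_BINARIES`/`get_BINPROVIDERS` is read as returning an empty mapping instead of raising. A toy illustration of that defensive access, using a plain dict in place of the real plugin hook registry:

```python
def hook_result(plugin_hooks: dict, hook_name: str) -> dict:
    """Call the named hook if the plugin provides it, otherwise pretend it returned {}."""
    return plugin_hooks.get(hook_name, lambda: {})()

example_plugin_hooks = {'get_BINARIES': lambda: {'wget': '/usr/bin/wget'}}
print(hook_result(example_plugin_hooks, 'get_BINARIES'))   # {'wget': '/usr/bin/wget'}
print(hook_result(example_plugin_hooks, 'get_CONFIG'))     # {} -> hook not implemented, nothing to show
```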
rows = { - "Name": [], - "verbose_name": [], - "module": [], - "source_code": [], - "hooks": [], + "Label": [], + "Version": [], + "Author": [], + "Package": [], + "Source Code": [], + "Config": [], + "Binaries": [], + "Package Managers": [], + # "Search Backends": [], } + config_colors = { + '_BINARY': '#339', + 'USE_': 'green', + 'SAVE_': 'green', + '_ARGS': '#33e', + 'KEY': 'red', + 'COOKIES': 'red', + 'AUTH': 'red', + 'SECRET': 'red', + 'TOKEN': 'red', + 'PASSWORD': 'red', + 'TIMEOUT': '#533', + 'RETRIES': '#533', + 'MAX': '#533', + 'MIN': '#533', + } + def get_color(key): + for pattern, color in config_colors.items(): + if pattern in key: + return color + return 'black' - for plugin in settings.PLUGINS.values(): - # try: - # plugin.load_binaries() - # except Exception as e: - # print(e) - - rows['Name'].append(ItemLink(plugin.id, key=plugin.id)) - rows['verbose_name'].append(mark_safe(f'{plugin.verbose_name}')) - rows['module'].append(str(plugin.plugin_module)) - rows['source_code'].append(str(plugin.plugin_dir)) - rows['hooks'].append(mark_safe(', '.join( - f'{hook.id}' - for hook in plugin.hooks + for plugin_id in settings.PLUGINS.keys(): + + plugin = abx.archivebox.reads.get_PLUGIN(plugin_id) + plugin.hooks.get_BINPROVIDERS = plugin.hooks.get('get_BINPROVIDERS', lambda: {}) + plugin.hooks.get_BINARIES = plugin.hooks.get('get_BINARIES', lambda: {}) + plugin.hooks.get_CONFIG = plugin.hooks.get('get_CONFIG', lambda: {}) + + rows['Label'].append(ItemLink(plugin.label, key=plugin.package)) + rows['Version'].append(str(plugin.version)) + rows['Author'].append(mark_safe(f'{plugin.author}')) + rows['Package'].append(ItemLink(plugin.package, key=plugin.package)) + rows['Source Code'].append(format_html('{}', str(plugin.source_code).replace(str(Path('~').expanduser()), '~'))) + rows['Config'].append(mark_safe(''.join( + f'{key}={value}
' + for configdict in plugin.hooks.get_CONFIG().values() + for key, value in benedict(configdict).items() ))) + rows['Binaries'].append(mark_safe(', '.join( + f'{binary.name}' + for binary in plugin.hooks.get_BINARIES().values() + ))) + rows['Package Managers'].append(mark_safe(', '.join( + f'{binprovider.name}' + for binprovider in plugin.hooks.get_BINPROVIDERS().values() + ))) + # rows['Search Backends'].append(mark_safe(', '.join( + # f'{searchbackend.name}' + # for searchbackend in plugin.SEARCHBACKENDS.values() + # ))) return TableContext( title="Installed plugins", @@ -203,28 +255,33 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: assert request.user.is_superuser, 'Must be a superuser to view configuration settings.' - plugin = None - for loaded_plugin in settings.PLUGINS.values(): - if loaded_plugin.id == key: - plugin = loaded_plugin + plugin_id = None + for check_plugin_id, loaded_plugin in settings.PLUGINS.items(): + if check_plugin_id.split('.')[-1] == key.split('.')[-1]: + plugin_id = check_plugin_id + break - assert plugin, f'Could not find a plugin matching the specified name: {key}' + assert plugin_id, f'Could not find a plugin matching the specified name: {key}' - try: - plugin = plugin.load_binaries() - except Exception as e: - print(e) + plugin = abx.archivebox.reads.get_PLUGIN(plugin_id) return ItemContext( slug=key, title=key, data=[ { - "name": plugin.id, - "description": plugin.verbose_name, + "name": plugin.package, + "description": plugin.label, "fields": { + "id": plugin.id, + "package": plugin.package, + "label": plugin.label, + "version": plugin.version, + "author": plugin.author, + "homepage": plugin.homepage, + "dependencies": getattr(plugin, 'DEPENDENCIES', []), + "source_code": plugin.source_code, "hooks": plugin.hooks, - "schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))), }, "help_texts": { # TODO diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index bd2c5459..9cf894a4 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -1,859 +1,20 @@ __package__ = 'archivebox.core' -import os +from django.contrib.auth import get_user_model -from pathlib import Path - -from django.contrib import admin, messages -from django.urls import path, reverse, resolve -from django.utils import timezone -from django.utils.functional import cached_property -from django.utils.html import format_html -from django.utils.safestring import mark_safe -from django.contrib.auth import get_user_model, get_permission_codename -from django.contrib.auth.admin import UserAdmin -from django.core.paginator import Paginator -from django.core.exceptions import ValidationError -from django.template import Template, RequestContext -from django.conf import settings -from django import forms - -from signal_webhooks.admin import WebhookAdmin -from signal_webhooks.utils import get_webhook_model - -from archivebox.config import VERSION, DATA_DIR -from archivebox.misc.util import htmldecode, urldecode from core.models import Snapshot, ArchiveResult, Tag -from core.mixins import SearchResultsAdminMixin -from api.models import APIToken -from abid_utils.admin import ABIDModelAdmin -from queues.tasks import bg_archive_links, bg_add -from machine.models import Machine, NetworkInterface, InstalledBinary +from core.admin_tags import TagAdmin +from core.admin_snapshots import SnapshotAdmin +from core.admin_archiveresults import ArchiveResultAdmin +from core.admin_users import UserAdmin -from 
index.html import snapshot_icons -from logging_util import printable_filesize -from main import remove -from extractors import archive_links +import abx -CONFIG = settings.FLAT_CONFIG - -GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} - -# Admin URLs -# /admin/ -# /admin/login/ -# /admin/core/ -# /admin/core/snapshot/ -# /admin/core/snapshot/:uuid/ -# /admin/core/tag/ -# /admin/core/tag/:uuid/ - - -# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel - - -class ArchiveBoxAdmin(admin.AdminSite): - site_header = 'ArchiveBox' - index_title = 'Links' - site_title = 'Index' - namespace = 'admin' - - -class CustomUserAdmin(UserAdmin): - sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined'] - list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined'] - readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set') - fieldsets = [*UserAdmin.fieldsets, ('Data', {'fields': readonly_fields})] - - @admin.display(description='Snapshots') - def snapshot_set(self, obj): - total_count = obj.snapshot_set.count() - return mark_safe('
'.join( - format_html( - '[{}] πŸ“… {} {}', - snap.pk, - snap.abid, - snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', - snap.url[:64], - ) - for snap in obj.snapshot_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - @admin.display(description='Archive Result Logs') - def archiveresult_set(self, obj): - total_count = obj.archiveresult_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] πŸ“… {} πŸ“„ {} {}', - result.pk, - result.abid, - result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...', - result.extractor, - result.snapshot.url[:64], - ) - for result in obj.archiveresult_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - @admin.display(description='Tags') - def tag_set(self, obj): - total_count = obj.tag_set.count() - return mark_safe(', '.join( - format_html( - '{}', - tag.pk, - tag.name, - ) - for tag in obj.tag_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - @admin.display(description='API Tokens') - def apitoken_set(self, obj): - total_count = obj.apitoken_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] {} (expires {})', - apitoken.pk, - apitoken.abid, - apitoken.token_redacted[:64], - apitoken.expires, - ) - for apitoken in obj.apitoken_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - @admin.display(description='API Outbound Webhooks') - def outboundwebhook_set(self, obj): - total_count = obj.outboundwebhook_set.count() - return mark_safe('
'.join( - format_html( - '
[{}] {} -> {}', - outboundwebhook.pk, - outboundwebhook.abid, - outboundwebhook.referenced_model, - outboundwebhook.endpoint, - ) - for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10] - ) + f'
{total_count} total records...') - - - - -archivebox_admin = ArchiveBoxAdmin() -archivebox_admin.register(get_user_model(), CustomUserAdmin) -archivebox_admin.disable_action('delete_selected') - -# archivebox_admin.register(CustomPlugin) - -# patch admin with methods to add data views (implemented by admin_data_views package) -# https://github.com/MrThearMan/django-admin-data-views -# https://mrthearman.github.io/django-admin-data-views/setup/ -############### Additional sections are defined in settings.ADMIN_DATA_VIEWS ######### -from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls - -archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) -archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore -archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore -archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) - - -from huey_monitor.apps import HueyMonitorConfig -HueyMonitorConfig.verbose_name = 'Background Workers' - -from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin -archivebox_admin.register(SignalInfoModel, SignalInfoModelAdmin) - - -class CustomTaskModelAdmin(TaskModelAdmin): - actions = ["delete_selected"] - - def has_delete_permission(self, request, obj=None): - codename = get_permission_codename("delete", self.opts) - return request.user.has_perm("%s.%s" % (self.opts.app_label, codename)) - - -archivebox_admin.register(TaskModel, CustomTaskModelAdmin) - -def result_url(result: TaskModel) -> str: - url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)]) - return format_html('See progress...'.format(url=url)) - - -class AccelleratedPaginator(Paginator): - """ - Accellerated Pagniator ignores DISTINCT when counting total number of rows. - Speeds up SELECT Count(*) on Admin views by >20x. 
- https://hakibenita.com/optimizing-the-django-admin-paginator - """ - - @cached_property - def count(self): - if self.object_list._has_filters(): # type: ignore - # fallback to normal count method on filtered queryset - return super().count - else: - # otherwise count total rows in a separate fast query - return self.object_list.model.objects.count() - - # Alternative approach for PostgreSQL: fallback count takes > 200ms - # from django.db import connection, transaction, OperationalError - # with transaction.atomic(), connection.cursor() as cursor: - # cursor.execute('SET LOCAL statement_timeout TO 200;') - # try: - # return super().count - # except OperationalError: - # return 9999999999999 - - -class ArchiveResultInline(admin.TabularInline): - name = 'Archive Results Log' - model = ArchiveResult - parent_model = Snapshot - # fk_name = 'snapshot' - extra = 0 - sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') - readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') - fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output') - # exclude = ('id',) - ordering = ('end_ts',) - show_change_link = True - # # classes = ['collapse'] - # # list_display_links = ['abid'] - - def get_parent_object_from_request(self, request): - resolved = resolve(request.path_info) - try: - return self.parent_model.objects.get(pk=resolved.kwargs['object_id']) - except (self.parent_model.DoesNotExist, ValidationError): - return self.parent_model.objects.get(pk=self.parent_model.id_from_abid(resolved.kwargs['object_id'])) - - @admin.display( - description='Completed', - ordering='end_ts', - ) - def completed(self, obj): - return format_html('

{}

', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S')) - - def result_id(self, obj): - return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) - - def command(self, obj): - return format_html('{}', " ".join(obj.cmd or [])) - - def version(self, obj): - return format_html('{}', obj.cmd_version or '-') - - def get_formset(self, request, obj=None, **kwargs): - formset = super().get_formset(request, obj, **kwargs) - snapshot = self.get_parent_object_from_request(request) - - # import ipdb; ipdb.set_trace() - # formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() - - # default values for new entries - formset.form.base_fields['status'].initial = 'succeeded' - formset.form.base_fields['start_ts'].initial = timezone.now() - formset.form.base_fields['end_ts'].initial = timezone.now() - formset.form.base_fields['cmd_version'].initial = '-' - formset.form.base_fields['pwd'].initial = str(snapshot.link_dir) - formset.form.base_fields['created_by'].initial = request.user - formset.form.base_fields['cmd'] = forms.JSONField(initial=['-']) - formset.form.base_fields['output'].initial = 'Manually recorded cmd output...' - - if obj is not None: - # hidden values for existing entries and new entries - formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget() - formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget() - formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget() - formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget() - formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget() - formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget() - return formset - - def get_readonly_fields(self, request, obj=None): - if obj is not None: - return self.readonly_fields - else: - return [] - - -class TagInline(admin.TabularInline): - model = Tag.snapshot_set.through # type: ignore - # fk_name = 'snapshot' - fields = ('id', 'tag') - extra = 1 - # min_num = 1 - max_num = 1000 - autocomplete_fields = ( - 'tag', - ) - -from django.contrib.admin.helpers import ActionForm -from django.contrib.admin.widgets import FilteredSelectMultiple - -# class AutocompleteTags: -# model = Tag -# search_fields = ['name'] -# name = 'name' -# # source_field = 'name' -# remote_field = Tag._meta.get_field('name') - -# class AutocompleteTagsAdminStub: -# name = 'admin' - - -class SnapshotActionForm(ActionForm): - tags = forms.ModelMultipleChoiceField( - label='Edit tags', - queryset=Tag.objects.all(), - required=False, - widget=FilteredSelectMultiple( - 'core_tag__name', - False, - ), - ) - - # TODO: allow selecting actions for specific extractors? is this useful? 
- # extractor = forms.ChoiceField( - # choices=ArchiveResult.EXTRACTOR_CHOICES, - # required=False, - # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) - # ) - - - - - -@admin.register(Snapshot, site=archivebox_admin) -class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): - list_display = ('created_at', 'title_str', 'files', 'size', 'url_str') - sort_fields = ('title_str', 'url_str', 'created_at') - readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir') - search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name') - list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') - fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields) - ordering = ['-created_at'] - actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] - inlines = [TagInline, ArchiveResultInline] - list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000) - - action_form = SnapshotActionForm - paginator = AccelleratedPaginator - - save_on_top = True - show_full_result_count = False - - def changelist_view(self, request, extra_context=None): - self.request = request - extra_context = extra_context or {} - try: - return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) - except Exception as e: - self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') - return super().changelist_view(request, GLOBAL_CONTEXT) - - - def get_urls(self): - urls = super().get_urls() - custom_urls = [ - path('grid/', self.admin_site.admin_view(self.grid_view), name='grid') - ] - return custom_urls + urls - - # def get_queryset(self, request): - # # tags_qs = SnapshotTag.objects.all().select_related('tag') - # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) - - # self.request = request - # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) - - @admin.action( - description="Imported Timestamp" - ) - def imported_timestamp(self, obj): - context = RequestContext(self.request, { - 'bookmarked_date': obj.bookmarked, - 'timestamp': obj.timestamp, - }) - - html = Template("""{{bookmarked_date}} ({{timestamp}})""") - return mark_safe(html.render(context)) - - # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') - # return f'{pretty_time} ({obj.timestamp})' - - # TODO: figure out a different way to do this, you cant nest forms so this doenst work - # def action(self, obj): - # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 - # # action: update_snapshots - # # select_across: 0 - # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 - # return format_html( - # ''' - #
- # - # - # - # - # - # - # - #
- # ''', - # csrf.get_token(self.request), - # obj.pk, - # ) - - def admin_actions(self, obj): - return format_html( - # URL Hash: {}
- ''' - Summary page ➑️     - Result files πŸ“‘     - Admin actions βš™οΈ - ''', - obj.timestamp, - obj.timestamp, - obj.pk, - ) - - def status_info(self, obj): - return format_html( - # URL Hash: {}
- ''' - Archived: {} ({} files {})     - Favicon:     - Status code: {}    
- Server: {}     - Content type: {}     - Extension: {}     - ''', - 'βœ…' if obj.is_archived else '❌', - obj.num_outputs, - self.size(obj) or '0kb', - f'/archive/{obj.timestamp}/favicon.ico', - obj.status_code or '-', - obj.headers and obj.headers.get('Server') or '-', - obj.headers and obj.headers.get('Content-Type') or '-', - obj.extension or '-', - ) - - @admin.display( - description='Title', - ordering='title', - ) - def title_str(self, obj): - tags = ''.join( - format_html('{} ', tag.pk, tag.name) - for tag in obj.tags.all() - if str(tag.name).strip() - ) - return format_html( - '' - '' - '' - '' - '{}' - '', - obj.archive_path, - obj.archive_path, - obj.archive_path, - 'fetched' if obj.latest_title or obj.title else 'pending', - urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' - ) + mark_safe(f' {tags}') - - @admin.display( - description='Files Saved', - # ordering='archiveresult_count', - ) - def files(self, obj): - # return '-' - return snapshot_icons(obj) - - - @admin.display( - # ordering='archiveresult_count' - ) - def size(self, obj): - archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size - if archive_size: - size_txt = printable_filesize(archive_size) - if archive_size > 52428800: - size_txt = mark_safe(f'{size_txt}') - else: - size_txt = mark_safe('...') - return format_html( - '{}', - obj.archive_path, - size_txt, - ) - - - @admin.display( - description='Original URL', - ordering='url', - ) - def url_str(self, obj): - return format_html( - '{}', - obj.url, - obj.url[:128], - ) - - def grid_view(self, request, extra_context=None): - - # cl = self.get_changelist_instance(request) - - # Save before monkey patching to restore for changelist list view - saved_change_list_template = self.change_list_template - saved_list_per_page = self.list_per_page - saved_list_max_show_all = self.list_max_show_all - - # Monkey patch here plus core_tags.py - self.change_list_template = 'private_index_grid.html' - self.list_per_page = CONFIG.SNAPSHOTS_PER_PAGE - self.list_max_show_all = self.list_per_page - - # Call monkey patched view - rendered_response = self.changelist_view(request, extra_context=extra_context) - - # Restore values - self.change_list_template = saved_change_list_template - self.list_per_page = saved_list_per_page - self.list_max_show_all = saved_list_max_show_all - - return rendered_response - - # for debugging, uncomment this to print all requests: - # def changelist_view(self, request, extra_context=None): - # print('[*] Got request', request.method, request.POST) - # return super().changelist_view(request, extra_context=None) - - @admin.action( - description="ℹ️ Get Title" - ) - def update_titles(self, request, queryset): - links = [snapshot.as_link() for snapshot in queryset] - if len(links) < 3: - # run syncronously if there are only 1 or 2 links - archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR) - messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.") - else: - # otherwise run in a background worker - result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR}) - messages.success( - request, - mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. 
{result_url(result)}"), - ) - - @admin.action( - description="⬇️ Get Missing" - ) - def update_snapshots(self, request, queryset): - links = [snapshot.as_link() for snapshot in queryset] - - result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR}) - - messages.success( - request, - mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"), - ) - - - @admin.action( - description="πŸ†• Archive Again" - ) - def resnapshot_snapshot(self, request, queryset): - for snapshot in queryset: - timestamp = timezone.now().isoformat('T', 'seconds') - new_url = snapshot.url.split('#')[0] + f'#{timestamp}' - - result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) - - messages.success( - request, - mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"), - ) - - @admin.action( - description="πŸ”„ Redo" - ) - def overwrite_snapshots(self, request, queryset): - links = [snapshot.as_link() for snapshot in queryset] - - result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR}) - - messages.success( - request, - mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"), - ) - - @admin.action( - description="☠️ Delete" - ) - def delete_snapshots(self, request, queryset): - remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) - messages.success( - request, - mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), - ) - - - @admin.action( - description="+" - ) - def add_tags(self, request, queryset): - tags = request.POST.getlist('tags') - print('[+] Adding tags', tags, 'to Snapshots', queryset) - for obj in queryset: - obj.tags.add(*tags) - messages.success( - request, - f"Added {len(tags)} tags to {queryset.count()} Snapshots.", - ) - - - @admin.action( - description="–" - ) - def remove_tags(self, request, queryset): - tags = request.POST.getlist('tags') - print('[-] Removing tags', tags, 'to Snapshots', queryset) - for obj in queryset: - obj.tags.remove(*tags) - messages.success( - request, - f"Removed {len(tags)} tags from {queryset.count()} Snapshots.", - ) - - -# @admin.register(SnapshotTag, site=archivebox_admin) -# class SnapshotTagAdmin(ABIDModelAdmin): -# list_display = ('id', 'snapshot', 'tag') -# sort_fields = ('id', 'snapshot', 'tag') -# search_fields = ('id', 'snapshot_id', 'tag_id') -# fields = ('snapshot', 'id') -# actions = ['delete_selected'] -# ordering = ['-id'] - - - -@admin.register(Tag, site=archivebox_admin) -class TagAdmin(ABIDModelAdmin): - list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots') - list_filter = ('created_at', 'created_by') - sort_fields = ('name', 'slug', 'abid', 'created_by', 'created_at') - readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots') - search_fields = ('abid', 'name', 'slug') - fields = ('name', 'created_by', *readonly_fields) - actions = ['delete_selected'] - ordering = ['-created_at'] - - paginator = AccelleratedPaginator - - - def num_snapshots(self, tag): - return format_html( - '{} total', - tag.id, - tag.snapshot_set.count(), - ) - - def snapshots(self, tag): - total_count = tag.snapshot_set.count() - return mark_safe('
'.join( - format_html( - '[{}] {}', - snap.pk, - snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', - snap.url[:64], - ) - for snap in tag.snapshot_set.order_by('-downloaded_at')[:10] - ) + (f'
{total_count} total snapshots...')) - - -@admin.register(ArchiveResult, site=archivebox_admin) -class ArchiveResultAdmin(ABIDModelAdmin): - list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') - sort_fields = ('start_ts', 'extractor', 'status') - readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary') - search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') - fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields) - autocomplete_fields = ['snapshot'] - - list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') - ordering = ['-start_ts'] - list_per_page = CONFIG.SNAPSHOTS_PER_PAGE - - paginator = AccelleratedPaginator - save_on_top = True - - actions = ['delete_selected'] - - class Meta: - verbose_name = 'Archive Result' - verbose_name_plural = 'Archive Results' - - def change_view(self, request, object_id, form_url="", extra_context=None): - self.request = request - return super().change_view(request, object_id, form_url, extra_context) - - @admin.display( - description='Snapshot Info' - ) - def snapshot_info(self, result): - return format_html( - '[{}]   {}   {}
', - result.snapshot.timestamp, - result.snapshot.abid, - result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'), - result.snapshot.url[:128], - ) - - - @admin.display( - description='Snapshot Tags' - ) - def tags_str(self, result): - return result.snapshot.tags_str() - - def cmd_str(self, result): - return format_html( - '
{}
', - ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), - ) - - def output_str(self, result): - return format_html( - '↗️
{}
', - result.snapshot.timestamp, - result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', - result.output, - ) - - def output_summary(self, result): - snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1] - output_str = format_html( - '
{}

', - result.output, - ) - output_str += format_html('See result files ...
', str(result.snapshot.timestamp))
-        path_from_output_str = (snapshot_dir / result.output)
-        output_str += format_html('{}/{}

', str(snapshot_dir), str(result.output)) - if os.access(path_from_output_str, os.R_OK): - root_dir = str(path_from_output_str) - else: - root_dir = str(snapshot_dir) - - # print(root_dir, str(list(os.walk(root_dir)))) - - for root, dirs, files in os.walk(root_dir): - depth = root.replace(root_dir, '').count(os.sep) + 1 - if depth > 2: - continue - indent = ' ' * 4 * (depth) - output_str += format_html('{}{}/
', indent, os.path.basename(root)) - indentation_str = ' ' * 4 * (depth + 1) - for filename in sorted(files): - is_hidden = filename.startswith('.') - output_str += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) - - return output_str + format_html('
') - - - -@admin.register(APIToken, site=archivebox_admin) -class APITokenAdmin(ABIDModelAdmin): - list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires') - sort_fields = ('abid', 'created_at', 'created_by', 'expires') - readonly_fields = ('created_at', 'modified_at', 'abid_info') - search_fields = ('id', 'abid', 'created_by__username', 'token') - fields = ('created_by', 'token', 'expires', *readonly_fields) - - list_filter = ('created_by',) - ordering = ['-created_at'] - list_per_page = 100 - -@admin.register(get_webhook_model(), site=archivebox_admin) -class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin): - list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display) - sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error') - readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields) - - -@admin.register(Machine, site=archivebox_admin) -class MachineAdmin(ABIDModelAdmin): - list_display = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health') - sort_fields = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') - # search_fields = ('id', 'abid', 'guid', 'hostname', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release') - - readonly_fields = ('guid', 'created_at', 'modified_at', 'abid_info', 'ips') - fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed') - - list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] - - @admin.display( - description='Public IP', - ordering='networkinterface__ip_public', - ) - def ips(self, machine): - return format_html( - '{}', - machine.abid, - ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)), - ) - -@admin.register(NetworkInterface, site=archivebox_admin) -class NetworkInterfaceAdmin(ABIDModelAdmin): - list_display = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health') - sort_fields = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address') - search_fields = ('abid', 'machine__abid', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country') - - readonly_fields = ('machine', 'created_at', 'modified_at', 'abid_info', 'mac_address', 'ip_public', 'ip_local', 'dns_server') - fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed') - - list_filter = ('isp', 'country', 'region') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] - - @admin.display( - description='Machine', - ordering='machine__abid', - ) - def machine_info(self, iface): - return format_html( - '[{}]   {}', - iface.machine.id, - iface.machine.abid, - iface.machine.hostname, - ) - -@admin.register(InstalledBinary, site=archivebox_admin) -class 
InstalledBinaryAdmin(ABIDModelAdmin): - list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health') - sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256') - search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256') - - readonly_fields = ('created_at', 'modified_at', 'abid_info') - fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') - - list_filter = ('name', 'binprovider', 'machine_id') - ordering = ['-created_at'] - list_per_page = 100 - actions = ["delete_selected"] - - @admin.display( - description='Machine', - ordering='machine__abid', - ) - def machine_info(self, installed_binary): - return format_html( - '[{}]   {}', - installed_binary.machine.id, - installed_binary.machine.abid, - installed_binary.machine.hostname, - ) +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(get_user_model(), UserAdmin) + admin_site.register(ArchiveResult, ArchiveResultAdmin) + admin_site.register(Snapshot, SnapshotAdmin) + admin_site.register(Tag, TagAdmin) diff --git a/archivebox/core/admin_archiveresults.py b/archivebox/core/admin_archiveresults.py new file mode 100644 index 00000000..aff7b1df --- /dev/null +++ b/archivebox/core/admin_archiveresults.py @@ -0,0 +1,199 @@ +__package__ = 'archivebox.core' + +import os +from pathlib import Path + +from django.contrib import admin +from django.utils.html import format_html, mark_safe +from django.core.exceptions import ValidationError +from django.urls import reverse, resolve +from django.utils import timezone +from django.forms import forms + +from huey_monitor.admin import TaskModel + +import abx + +from archivebox.config import DATA_DIR +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.paginators import AccelleratedPaginator + +from abid_utils.admin import ABIDModelAdmin + +from core.models import ArchiveResult, Snapshot + + + + +def result_url(result: TaskModel) -> str: + url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)]) + return format_html('See progress...'.format(url=url)) + + + +class ArchiveResultInline(admin.TabularInline): + name = 'Archive Results Log' + model = ArchiveResult + parent_model = Snapshot + # fk_name = 'snapshot' + extra = 0 + sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version') + readonly_fields = ('id', 'result_id', 'completed', 'command', 'version') + fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output') + # exclude = ('id',) + ordering = ('end_ts',) + show_change_link = True + # # classes = ['collapse'] + # # list_display_links = ['abid'] + + def get_parent_object_from_request(self, request): + resolved = resolve(request.path_info) + try: + return self.parent_model.objects.get(pk=resolved.kwargs['object_id']) + except (self.parent_model.DoesNotExist, ValidationError): + return self.parent_model.objects.get(pk=self.parent_model.id_from_abid(resolved.kwargs['object_id'])) + + @admin.display( + description='Completed', + ordering='end_ts', + ) + def completed(self, obj): + return format_html('

{}

', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S')) + + def result_id(self, obj): + return format_html('[{}]', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid) + + def command(self, obj): + return format_html('{}', " ".join(obj.cmd or [])) + + def version(self, obj): + return format_html('{}', obj.cmd_version or '-') + + def get_formset(self, request, obj=None, **kwargs): + formset = super().get_formset(request, obj, **kwargs) + snapshot = self.get_parent_object_from_request(request) + + # import ipdb; ipdb.set_trace() + # formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget() + + # default values for new entries + formset.form.base_fields['status'].initial = 'succeeded' + formset.form.base_fields['start_ts'].initial = timezone.now() + formset.form.base_fields['end_ts'].initial = timezone.now() + formset.form.base_fields['cmd_version'].initial = '-' + formset.form.base_fields['pwd'].initial = str(snapshot.link_dir) + formset.form.base_fields['created_by'].initial = request.user + formset.form.base_fields['cmd'] = forms.JSONField(initial=['-']) + formset.form.base_fields['output'].initial = 'Manually recorded cmd output...' + + if obj is not None: + # hidden values for existing entries and new entries + formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget() + formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget() + formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget() + formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget() + formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget() + formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget() + return formset + + def get_readonly_fields(self, request, obj=None): + if obj is not None: + return self.readonly_fields + else: + return [] + + + +class ArchiveResultAdmin(ABIDModelAdmin): + list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str') + sort_fields = ('start_ts', 'extractor', 'status') + readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary') + search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp') + fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields) + autocomplete_fields = ['snapshot'] + + list_filter = ('status', 'extractor', 'start_ts', 'cmd_version') + ordering = ['-start_ts'] + list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE + + paginator = AccelleratedPaginator + save_on_top = True + + actions = ['delete_selected'] + + class Meta: + verbose_name = 'Archive Result' + verbose_name_plural = 'Archive Results' + + def change_view(self, request, object_id, form_url="", extra_context=None): + self.request = request + return super().change_view(request, object_id, form_url, extra_context) + + @admin.display( + description='Snapshot Info' + ) + def snapshot_info(self, result): + return format_html( + '[{}]   {}   {}
', + result.snapshot.timestamp, + result.snapshot.abid, + result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'), + result.snapshot.url[:128], + ) + + + @admin.display( + description='Snapshot Tags' + ) + def tags_str(self, result): + return result.snapshot.tags_str() + + def cmd_str(self, result): + return format_html( + '
{}
', + ' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd), + ) + + def output_str(self, result): + return format_html( + '↗️
{}
', + result.snapshot.timestamp, + result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html', + result.output, + ) + + def output_summary(self, result): + snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1] + output_str = format_html( + '
{}

', + result.output, + ) + output_str += format_html('See result files ...
', str(result.snapshot.timestamp))
+        path_from_output_str = (snapshot_dir / result.output)
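+        # render the expected output path below, then walk it (falling back to the snapshot
+        # dir when the output path isn't readable) at most 2 levels deep to list the result files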
+        output_str += format_html('{}/{}

', str(snapshot_dir), str(result.output)) + if os.access(path_from_output_str, os.R_OK): + root_dir = str(path_from_output_str) + else: + root_dir = str(snapshot_dir) + + # print(root_dir, str(list(os.walk(root_dir)))) + + for root, dirs, files in os.walk(root_dir): + depth = root.replace(root_dir, '').count(os.sep) + 1 + if depth > 2: + continue + indent = ' ' * 4 * (depth) + output_str += format_html('{}{}/
', indent, os.path.basename(root)) + indentation_str = ' ' * 4 * (depth + 1) + for filename in sorted(files): + is_hidden = filename.startswith('.') + output_str += format_html('{}{}
', int(not is_hidden), indentation_str, filename.strip()) + + return output_str + format_html('
') + + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(ArchiveResult, ArchiveResultAdmin) diff --git a/archivebox/core/admin_site.py b/archivebox/core/admin_site.py new file mode 100644 index 00000000..de92db8c --- /dev/null +++ b/archivebox/core/admin_site.py @@ -0,0 +1,42 @@ +__package__ = 'archivebox.core' + +from django.contrib import admin + +import abx.django.use + +class ArchiveBoxAdmin(admin.AdminSite): + site_header = 'ArchiveBox' + index_title = 'Admin Views' + site_title = 'Admin' + namespace = 'admin' + + +archivebox_admin = ArchiveBoxAdmin() +archivebox_admin.disable_action('delete_selected') +# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel + + + +# patch admin with methods to add data views (implemented by admin_data_views package) +# https://github.com/MrThearMan/django-admin-data-views +# https://mrthearman.github.io/django-admin-data-views/setup/ +from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls +archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin) +archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore +archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore +archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin) +############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS ######### + + +def register_admin_site(): + """Replace the default admin site with our custom ArchiveBox admin site.""" + from django.contrib import admin + from django.contrib.admin import sites + + admin.site = archivebox_admin + sites.site = archivebox_admin + + # register all plugins admin classes + abx.django.use.register_admin(archivebox_admin) + + return archivebox_admin diff --git a/archivebox/core/admin_snapshots.py b/archivebox/core/admin_snapshots.py new file mode 100644 index 00000000..2bd08421 --- /dev/null +++ b/archivebox/core/admin_snapshots.py @@ -0,0 +1,357 @@ + +__package__ = 'archivebox.core' + +import os +from pathlib import Path + +from django.contrib import admin, messages +from django.urls import path +from django.utils.html import format_html, mark_safe +from django.utils import timezone +from django import forms +from django.template import Template, RequestContext +from django.contrib.admin.helpers import ActionForm +from django.contrib.admin.widgets import FilteredSelectMultiple + +from archivebox.config import DATA_DIR, VERSION +from archivebox.config.common import SERVER_CONFIG +from archivebox.misc.util import htmldecode, urldecode +from archivebox.misc.paginators import AccelleratedPaginator +from archivebox.search.admin import SearchResultsAdminMixin + +from archivebox.logging_util import printable_filesize +from archivebox.index.html import snapshot_icons +from archivebox.extractors import archive_links +from archivebox.main import remove + +from archivebox.abid_utils.admin import ABIDModelAdmin +from archivebox.queues.tasks import bg_archive_links, bg_add + +from core.models import Tag +from core.admin_tags import TagInline +from core.admin_archiveresults import ArchiveResultInline, result_url + + +GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False} + + + +class SnapshotActionForm(ActionForm): + tags = forms.ModelMultipleChoiceField( + label='Edit tags', + 
queryset=Tag.objects.all(), + required=False, + widget=FilteredSelectMultiple( + 'core_tag__name', + False, + ), + ) + + # TODO: allow selecting actions for specific extractors? is this useful? + # extractor = forms.ChoiceField( + # choices=ArchiveResult.EXTRACTOR_CHOICES, + # required=False, + # widget=forms.MultileChoiceField(attrs={'class': "form-control"}) + # ) + + +class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin): + list_display = ('created_at', 'title_str', 'files', 'size', 'url_str') + sort_fields = ('title_str', 'url_str', 'created_at') + readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir') + search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name') + list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name') + fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields) + ordering = ['-created_at'] + actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots'] + inlines = [TagInline, ArchiveResultInline] + list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000) + + action_form = SnapshotActionForm + paginator = AccelleratedPaginator + + save_on_top = True + show_full_result_count = False + + def changelist_view(self, request, extra_context=None): + self.request = request + extra_context = extra_context or {} + try: + return super().changelist_view(request, extra_context | GLOBAL_CONTEXT) + except Exception as e: + self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}') + return super().changelist_view(request, GLOBAL_CONTEXT) + + + def get_urls(self): + urls = super().get_urls() + custom_urls = [ + path('grid/', self.admin_site.admin_view(self.grid_view), name='grid') + ] + return custom_urls + urls + + # def get_queryset(self, request): + # # tags_qs = SnapshotTag.objects.all().select_related('tag') + # # prefetch = Prefetch('snapshottag_set', queryset=tags_qs) + + # self.request = request + # return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult')) + + @admin.action( + description="Imported Timestamp" + ) + def imported_timestamp(self, obj): + context = RequestContext(self.request, { + 'bookmarked_date': obj.bookmarked, + 'timestamp': obj.timestamp, + }) + + html = Template("""{{bookmarked_date}} ({{timestamp}})""") + return mark_safe(html.render(context)) + + # pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S') + # return f'{pretty_time} ({obj.timestamp})' + + # TODO: figure out a different way to do this, you cant nest forms so this doenst work + # def action(self, obj): + # # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0 + # # action: update_snapshots + # # select_across: 0 + # # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3 + # return format_html( + # ''' + #
+ # + # + # + # + # + # + # + #
+ # ''', + # csrf.get_token(self.request), + # obj.pk, + # ) + + def admin_actions(self, obj): + return format_html( + # URL Hash: {}
+ ''' + Summary page ➑️     + Result files πŸ“‘     + Admin actions βš™οΈ + ''', + obj.timestamp, + obj.timestamp, + obj.pk, + ) + + def status_info(self, obj): + return format_html( + # URL Hash: {}
+ ''' + Archived: {} ({} files {})     + Favicon:     + Status code: {}    
+ Server: {}     + Content type: {}     + Extension: {}     + ''', + 'βœ…' if obj.is_archived else '❌', + obj.num_outputs, + self.size(obj) or '0kb', + f'/archive/{obj.timestamp}/favicon.ico', + obj.status_code or '-', + obj.headers and obj.headers.get('Server') or '-', + obj.headers and obj.headers.get('Content-Type') or '-', + obj.extension or '-', + ) + + @admin.display( + description='Title', + ordering='title', + ) + def title_str(self, obj): + tags = ''.join( + format_html('{} ', tag.pk, tag.name) + for tag in obj.tags.all() + if str(tag.name).strip() + ) + return format_html( + '' + '' + '' + '' + '{}' + '', + obj.archive_path, + obj.archive_path, + obj.archive_path, + 'fetched' if obj.latest_title or obj.title else 'pending', + urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...' + ) + mark_safe(f' {tags}') + + @admin.display( + description='Files Saved', + # ordering='archiveresult_count', + ) + def files(self, obj): + # return '-' + return snapshot_icons(obj) + + + @admin.display( + # ordering='archiveresult_count' + ) + def size(self, obj): + archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size + if archive_size: + size_txt = printable_filesize(archive_size) + if archive_size > 52428800: + size_txt = mark_safe(f'{size_txt}') + else: + size_txt = mark_safe('...') + return format_html( + '{}', + obj.archive_path, + size_txt, + ) + + + @admin.display( + description='Original URL', + ordering='url', + ) + def url_str(self, obj): + return format_html( + '{}', + obj.url, + obj.url[:128], + ) + + def grid_view(self, request, extra_context=None): + + # cl = self.get_changelist_instance(request) + + # Save before monkey patching to restore for changelist list view + saved_change_list_template = self.change_list_template + saved_list_per_page = self.list_per_page + saved_list_max_show_all = self.list_max_show_all + + # Monkey patch here plus core_tags.py + self.change_list_template = 'private_index_grid.html' + self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE + self.list_max_show_all = self.list_per_page + + # Call monkey patched view + rendered_response = self.changelist_view(request, extra_context=extra_context) + + # Restore values + self.change_list_template = saved_change_list_template + self.list_per_page = saved_list_per_page + self.list_max_show_all = saved_list_max_show_all + + return rendered_response + + # for debugging, uncomment this to print all requests: + # def changelist_view(self, request, extra_context=None): + # print('[*] Got request', request.method, request.POST) + # return super().changelist_view(request, extra_context=None) + + @admin.action( + description="ℹ️ Get Title" + ) + def update_titles(self, request, queryset): + links = [snapshot.as_link() for snapshot in queryset] + if len(links) < 3: + # run syncronously if there are only 1 or 2 links + archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR) + messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.") + else: + # otherwise run in a background worker + result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR}) + messages.success( + request, + mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. 
{result_url(result)}"), + ) + + @admin.action( + description="⬇️ Get Missing" + ) + def update_snapshots(self, request, queryset): + links = [snapshot.as_link() for snapshot in queryset] + + result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR}) + + messages.success( + request, + mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"), + ) + + + @admin.action( + description="πŸ†• Archive Again" + ) + def resnapshot_snapshot(self, request, queryset): + for snapshot in queryset: + timestamp = timezone.now().isoformat('T', 'seconds') + new_url = snapshot.url.split('#')[0] + f'#{timestamp}' + + result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()}) + + messages.success( + request, + mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"), + ) + + @admin.action( + description="πŸ”„ Redo" + ) + def overwrite_snapshots(self, request, queryset): + links = [snapshot.as_link() for snapshot in queryset] + + result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR}) + + messages.success( + request, + mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"), + ) + + @admin.action( + description="☠️ Delete" + ) + def delete_snapshots(self, request, queryset): + remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR) + messages.success( + request, + mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."), + ) + + + @admin.action( + description="+" + ) + def add_tags(self, request, queryset): + tags = request.POST.getlist('tags') + print('[+] Adding tags', tags, 'to Snapshots', queryset) + for obj in queryset: + obj.tags.add(*tags) + messages.success( + request, + f"Added {len(tags)} tags to {queryset.count()} Snapshots.", + ) + + + @admin.action( + description="–" + ) + def remove_tags(self, request, queryset): + tags = request.POST.getlist('tags') + print('[-] Removing tags', tags, 'to Snapshots', queryset) + for obj in queryset: + obj.tags.remove(*tags) + messages.success( + request, + f"Removed {len(tags)} tags from {queryset.count()} Snapshots.", + ) diff --git a/archivebox/core/admin_tags.py b/archivebox/core/admin_tags.py new file mode 100644 index 00000000..495c801f --- /dev/null +++ b/archivebox/core/admin_tags.py @@ -0,0 +1,165 @@ +__package__ = 'archivebox.core' + +from django.contrib import admin +from django.utils.html import format_html, mark_safe + +import abx + +from abid_utils.admin import ABIDModelAdmin +from archivebox.misc.paginators import AccelleratedPaginator + +from core.models import Tag + + +class TagInline(admin.TabularInline): + model = Tag.snapshot_set.through # type: ignore + # fk_name = 'snapshot' + fields = ('id', 'tag') + extra = 1 + # min_num = 1 + max_num = 1000 + autocomplete_fields = ( + 'tag', + ) + + +# class AutocompleteTags: +# model = Tag +# search_fields = ['name'] +# name = 'name' +# # source_field = 'name' +# remote_field = Tag._meta.get_field('name') + +# class AutocompleteTagsAdminStub: +# name = 'admin' + + +# class TaggedItemInline(admin.TabularInline): +# readonly_fields = ('object_link',) +# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields) +# model = TaggedItem +# extra = 1 +# show_change_link = True + +# @admin.display(description='object') +# def object_link(self, obj): +# obj = 
obj.content_type.get_object_for_this_type(pk=obj.object_id) +# return format_html('[{}]', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj)) + + +class TagAdmin(ABIDModelAdmin): + list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots') + list_filter = ('created_at', 'created_by') + sort_fields = ('name', 'slug', 'abid', 'created_by', 'created_at') + readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots') + search_fields = ('abid', 'name', 'slug') + fields = ('name', 'created_by', *readonly_fields) + actions = ['delete_selected', 'merge_tags'] + ordering = ['-created_at'] + # inlines = [TaggedItemInline] + + paginator = AccelleratedPaginator + + + def num_snapshots(self, tag): + return format_html( + '{} total', + tag.id, + tag.snapshot_set.count(), + ) + + def snapshots(self, tag): + total_count = tag.snapshot_set.count() + return mark_safe('
'.join( + format_html( + '[{}] {}', + snap.pk, + snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', + snap.url[:64], + ) + for snap in tag.snapshot_set.order_by('-downloaded_at')[:10] + ) + (f'
{total_count} total snapshots...')) + + # def get_urls(self): + # urls = super().get_urls() + # custom_urls = [ + # path( + # "merge-tags/", + # self.admin_site.admin_view(self.merge_tags_view), + # name="taggit_tag_merge_tags", + # ), + # ] + # return custom_urls + urls + + # @admin.action(description="Merge selected tags") + # def merge_tags(self, request, queryset): + # selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME) + # if not selected: + # self.message_user(request, "Please select at least one tag.") + # return redirect(request.get_full_path()) + + # selected_tag_ids = ",".join(selected) + # redirect_url = f"{request.get_full_path()}merge-tags/" + + # request.session["selected_tag_ids"] = selected_tag_ids + + # return redirect(redirect_url) + + # def merge_tags_view(self, request): + # selected_tag_ids = request.session.get("selected_tag_ids", "").split(",") + # if request.method == "POST": + # form = MergeTagsForm(request.POST) + # if form.is_valid(): + # new_tag_name = form.cleaned_data["new_tag_name"] + # new_tag, created = Tag.objects.get_or_create(name=new_tag_name) + # with transaction.atomic(): + # for tag_id in selected_tag_ids: + # tag = Tag.objects.get(id=tag_id) + # tagged_items = TaggedItem.objects.filter(tag=tag) + # for tagged_item in tagged_items: + # if TaggedItem.objects.filter( + # tag=new_tag, + # content_type=tagged_item.content_type, + # object_id=tagged_item.object_id, + # ).exists(): + # # we have the new tag as well, so we can just + # # remove the tag association + # tagged_item.delete() + # else: + # # point this taggedItem to the new one + # tagged_item.tag = new_tag + # tagged_item.save() + + # # delete the old tag + # if tag.id != new_tag.id: + # tag.delete() + + # self.message_user(request, "Tags have been merged", level="success") + # # clear the selected_tag_ids from session after merge is complete + # request.session.pop("selected_tag_ids", None) + + # return redirect("..") + # else: + # self.message_user(request, "Form is invalid.", level="error") + + # context = { + # "form": MergeTagsForm(), + # "selected_tag_ids": selected_tag_ids, + # } + # return render(request, "admin/taggit/merge_tags_form.html", context) + + +# @admin.register(SnapshotTag, site=archivebox_admin) +# class SnapshotTagAdmin(ABIDModelAdmin): +# list_display = ('id', 'snapshot', 'tag') +# sort_fields = ('id', 'snapshot', 'tag') +# search_fields = ('id', 'snapshot_id', 'tag_id') +# fields = ('snapshot', 'id') +# actions = ['delete_selected'] +# ordering = ['-id'] + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(Tag, TagAdmin) + diff --git a/archivebox/core/admin_users.py b/archivebox/core/admin_users.py new file mode 100644 index 00000000..259d2daf --- /dev/null +++ b/archivebox/core/admin_users.py @@ -0,0 +1,91 @@ +__package__ = 'archivebox.core' + +from django.contrib import admin +from django.contrib.auth.admin import UserAdmin +from django.utils.html import format_html, mark_safe +from django.contrib.auth import get_user_model + +import abx + + +class CustomUserAdmin(UserAdmin): + sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined'] + list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined'] + readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set') + fieldsets = [*UserAdmin.fieldsets, ('Data', {'fields': readonly_fields})] + + @admin.display(description='Snapshots') + def snapshot_set(self, obj): + total_count = 
obj.snapshot_set.count() + return mark_safe('
'.join( + format_html( + '
[{}] πŸ“… {} {}', + snap.pk, + snap.abid, + snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...', + snap.url[:64], + ) + for snap in obj.snapshot_set.order_by('-modified_at')[:10] + ) + f'
{total_count} total records...') + + @admin.display(description='Archive Result Logs') + def archiveresult_set(self, obj): + total_count = obj.archiveresult_set.count() + return mark_safe('
'.join( + format_html( + '
[{}] πŸ“… {} πŸ“„ {} {}', + result.pk, + result.abid, + result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...', + result.extractor, + result.snapshot.url[:64], + ) + for result in obj.archiveresult_set.order_by('-modified_at')[:10] + ) + f'
{total_count} total records...') + + @admin.display(description='Tags') + def tag_set(self, obj): + total_count = obj.tag_set.count() + return mark_safe(', '.join( + format_html( + '{}', + tag.pk, + tag.name, + ) + for tag in obj.tag_set.order_by('-modified_at')[:10] + ) + f'
{total_count} total records...') + + @admin.display(description='API Tokens') + def apitoken_set(self, obj): + total_count = obj.apitoken_set.count() + return mark_safe('
'.join( + format_html( + '
[{}] {} (expires {})', + apitoken.pk, + apitoken.abid, + apitoken.token_redacted[:64], + apitoken.expires, + ) + for apitoken in obj.apitoken_set.order_by('-modified_at')[:10] + ) + f'
{total_count} total records...') + + @admin.display(description='API Outbound Webhooks') + def outboundwebhook_set(self, obj): + total_count = obj.outboundwebhook_set.count() + return mark_safe('
'.join( + format_html( + '
[{}] {} -> {}', + outboundwebhook.pk, + outboundwebhook.abid, + outboundwebhook.referenced_model, + outboundwebhook.endpoint, + ) + for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10] + ) + f'
{total_count} total records...') + + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(get_user_model(), CustomUserAdmin) diff --git a/archivebox/core/apps.py b/archivebox/core/apps.py index f955cb7d..d29e6266 100644 --- a/archivebox/core/apps.py +++ b/archivebox/core/apps.py @@ -2,27 +2,22 @@ __package__ = 'archivebox.core' from django.apps import AppConfig +import abx + class CoreConfig(AppConfig): name = 'core' def ready(self): - # register our custom admin as the primary django admin - from django.contrib import admin - from django.contrib.admin import sites - from core.admin import archivebox_admin - - admin.site = archivebox_admin - sites.site = archivebox_admin - - - # register signal handlers - from .auth import register_signals - - register_signals() + """Register the archivebox.core.admin_site as the main django admin site""" + from core.admin_site import register_admin_site + register_admin_site() -# from django.contrib.admin.apps import AdminConfig -# class CoreAdminConfig(AdminConfig): -# default_site = "core.admin.get_admin_site" + +@abx.hookimpl +def register_admin(admin_site): + """Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site""" + from core.admin import register_admin + register_admin(admin_site) diff --git a/archivebox/core/auth.py b/archivebox/core/auth.py deleted file mode 100644 index b3892322..00000000 --- a/archivebox/core/auth.py +++ /dev/null @@ -1,12 +0,0 @@ -__package__ = 'archivebox.core' - - -from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG - -def register_signals(): - - if LDAP_CONFIG.LDAP_ENABLED: - import django_auth_ldap.backend - from .auth_ldap import create_user - - django_auth_ldap.backend.populate_user.connect(create_user) diff --git a/archivebox/core/auth_ldap.py b/archivebox/core/auth_ldap.py deleted file mode 100644 index 7e94c316..00000000 --- a/archivebox/core/auth_ldap.py +++ /dev/null @@ -1,8 +0,0 @@ -from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG - -def create_user(sender, user=None, ldap_user=None, **kwargs): - if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER: - user.is_superuser = True - - user.is_staff = True - print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})') diff --git a/archivebox/core/migrations/0075_crawl.py b/archivebox/core/migrations/0075_crawl.py deleted file mode 100644 index 6018ad97..00000000 --- a/archivebox/core/migrations/0075_crawl.py +++ /dev/null @@ -1,101 +0,0 @@ -# Generated by Django 5.1.1 on 2024-10-01 02:10 - -import abid_utils.models -import charidfield.fields -import django.core.validators -import django.db.models.deletion -from django.conf import settings -from django.db import migrations, models - - -class Migration(migrations.Migration): - - dependencies = [ - ("core", "0074_alter_snapshot_downloaded_at"), - migrations.swappable_dependency(settings.AUTH_USER_MODEL), - ] - - operations = [ - migrations.CreateModel( - name="Crawl", - fields=[ - ( - "id", - models.UUIDField( - default=None, - editable=False, - primary_key=True, - serialize=False, - unique=True, - verbose_name="ID", - ), - ), - ( - "abid", - charidfield.fields.CharIDField( - blank=True, - db_index=True, - default=None, - help_text="ABID-format identifier for this entity (e.g. 
snp_01BJQMF54D093DXEAWZ6JYRPAQ)", - max_length=30, - null=True, - prefix="crl_", - unique=True, - ), - ), - ( - "created_at", - abid_utils.models.AutoDateTimeField(db_index=True, default=None), - ), - ("modified_at", models.DateTimeField(auto_now=True)), - ("urls", models.TextField()), - ( - "depth", - models.PositiveSmallIntegerField( - default=1, - validators=[ - django.core.validators.MinValueValidator(0), - django.core.validators.MaxValueValidator(2), - ], - ), - ), - ( - "parser", - models.CharField( - choices=[ - ("auto", "auto"), - ("pocket_api", "Pocket API"), - ("readwise_reader_api", "Readwise Reader API"), - ("wallabag_atom", "Wallabag Atom"), - ("pocket_html", "Pocket HTML"), - ("pinboard_rss", "Pinboard RSS"), - ("shaarli_rss", "Shaarli RSS"), - ("medium_rss", "Medium RSS"), - ("netscape_html", "Netscape HTML"), - ("rss", "Generic RSS"), - ("json", "Generic JSON"), - ("jsonl", "Generic JSONL"), - ("html", "Generic HTML"), - ("txt", "Generic TXT"), - ("url_list", "URL List"), - ], - default="auto", - max_length=32, - ), - ), - ( - "created_by", - models.ForeignKey( - default=None, - on_delete=django.db.models.deletion.CASCADE, - related_name="crawl_set", - to=settings.AUTH_USER_MODEL, - ), - ), - ], - options={ - "verbose_name": "Crawl", - "verbose_name_plural": "Crawls", - }, - ), - ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 5b97eb73..79776b7f 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -15,7 +15,6 @@ from django.utils.text import slugify from django.core.cache import cache from django.urls import reverse, reverse_lazy from django.db.models import Case, When, Value, IntegerField -from django.core.validators import MaxValueValidator, MinValueValidator from django.contrib import admin from django.conf import settings @@ -23,6 +22,7 @@ from archivebox.config import CONSTANTS from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField from queues.tasks import bg_archive_snapshot +# from crawls.models import Crawl # from machine.models import Machine, NetworkInterface from archivebox.misc.system import get_dir_size @@ -30,7 +30,6 @@ from archivebox.misc.util import parse_date, base_url from ..index.schema import Link from ..index.html import snapshot_icons from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS -from ..parsers import PARSERS # class BaseModel(models.Model): @@ -45,9 +44,11 @@ from ..parsers import PARSERS + + class Tag(ABIDModel): """ - Based on django-taggit model + ABID base. + Loosely based on django-taggit model + ABID base. 
""" abid_prefix = 'tag_' abid_ts_src = 'self.created_at' @@ -68,7 +69,7 @@ class Tag(ABIDModel): # slug is autoset on save from name, never set it manually snapshot_set: models.Manager['Snapshot'] - crawl_set: models.Manager['Crawl'] + # crawl_set: models.Manager['Crawl'] class Meta(TypedModelMeta): verbose_name = "Tag" @@ -82,9 +83,13 @@ class Tag(ABIDModel): if i is not None: slug += "_%d" % i return slug + + def clean(self, *args, **kwargs): + self.slug = self.slug or self.slugify(self.name) + super().clean(*args, **kwargs) def save(self, *args, **kwargs): - if self._state.adding and not self.slug: + if self._state.adding: self.slug = self.slugify(self.name) # if name is different but slug conficts with another tags slug, append a counter @@ -114,6 +119,8 @@ class Tag(ABIDModel): def api_docs_url(self) -> str: return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag' + + class SnapshotTag(models.Model): id = models.AutoField(primary_key=True) @@ -136,69 +143,6 @@ class SnapshotTag(models.Model): # unique_together = [('crawl', 'tag')] -class Crawl(ABIDModel): - abid_prefix = 'crl_' - abid_ts_src = 'self.created_at' - abid_uri_src = 'self.urls' - abid_subtype_src = 'self.crawler' - abid_rand_src = 'self.id' - abid_drift_allowed = True - - # CRAWLER_CHOICES = ( - # ('breadth_first', 'Breadth-First'), - # ('depth_first', 'Depth-First'), - # ) - PARSER_CHOICES = ( - ('auto', 'auto'), - *((parser_key, value[0]) for parser_key, value in PARSERS.items()), - ) - - id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') - abid = ABIDField(prefix=abid_prefix) - - created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set') - created_at = AutoDateTimeField(default=None, null=False, db_index=True) - modified_at = models.DateTimeField(auto_now=True) - - urls = models.TextField(blank=False, null=False) - depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)]) - parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32) - - # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32) - # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag') - # schedule = models.JSONField() - # config = models.JSONField() - - - class Meta(TypedModelMeta): - verbose_name = 'Crawl' - verbose_name_plural = 'Crawls' - - def __str__(self): - return self.parser - - @cached_property - def crawl_dir(self): - return Path() - - @property - def api_url(self) -> str: - # /api/v1/core/crawl/{uulid} - return reverse_lazy('api-1:get_crawl', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}' - - @property - def api_docs_url(self) -> str: - return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl' - - # def get_absolute_url(self): - # return f'/crawls/{self.abid}' - - def crawl(self): - # write self.urls to sources/crawl____YYYYMMDDHHMMSS.txt - # run parse_links(sources/crawl____YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link: - # create a Snapshot - # enqueue task bg_archive_snapshot(snapshot) - pass @@ -227,6 +171,8 @@ class Snapshot(ABIDModel): bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True) downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) + # crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, 
null=True, blank=True, related_name='snapshot_set') + url = models.URLField(unique=True, db_index=True) timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False) tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) @@ -561,9 +507,10 @@ class ArchiveResult(ABIDModel): # return f'[{self.abid}] πŸ“… {self.start_ts.strftime("%Y-%m-%d %H:%M")} πŸ“„ {self.extractor} {self.snapshot.url}' return self.extractor - @cached_property - def machine(self): - return self.iface.machine if self.iface else None + # TODO: finish connecting machine.models + # @cached_property + # def machine(self): + # return self.iface.machine if self.iface else None @cached_property def snapshot_dir(self): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 3c2c40f0..3810954e 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -10,7 +10,7 @@ from django.utils.crypto import get_random_string import abx import abx.archivebox -import abx.archivebox.use +import abx.archivebox.reads import abx.django.use from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS @@ -19,8 +19,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3] - - +IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--version' in sys.argv or '--help' in sys.argv ################################################################################ ### ArchiveBox Plugin Settings @@ -41,7 +40,7 @@ BUILTIN_PLUGIN_DIRS = { 'plugins_extractor': PACKAGE_DIR / 'plugins_extractor', } USER_PLUGIN_DIRS = { - 'user_plugins': DATA_DIR / 'user_plugins', + # 'user_plugins': DATA_DIR / 'user_plugins', } # Discover ArchiveBox plugins @@ -52,19 +51,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS} # Load ArchiveBox plugins PLUGIN_MANAGER = abx.pm -PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS) -HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS) +abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS) +PLUGINS = abx.archivebox.reads.get_PLUGINS() # Load ArchiveBox config from plugins -CONFIGS = abx.archivebox.use.get_CONFIGS() -FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG() -BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS() -BINARIES = abx.archivebox.use.get_BINARIES() -EXTRACTORS = abx.archivebox.use.get_EXTRACTORS() -REPLAYERS = abx.archivebox.use.get_REPLAYERS() -ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS() -QUEUES = abx.archivebox.use.get_QUEUES() -SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS() +CONFIGS = abx.archivebox.reads.get_CONFIGS() +CONFIG = FLAT_CONFIG = abx.archivebox.reads.get_FLAT_CONFIG() +BINPROVIDERS = abx.archivebox.reads.get_BINPROVIDERS() +BINARIES = abx.archivebox.reads.get_BINARIES() +EXTRACTORS = abx.archivebox.reads.get_EXTRACTORS() +SEARCHBACKENDS = abx.archivebox.reads.get_SEARCHBACKENDS() +# REPLAYERS = abx.archivebox.reads.get_REPLAYERS() +# ADMINDATAVIEWS = abx.archivebox.reads.get_ADMINDATAVIEWS() ################################################################################ @@ -101,10 +99,13 @@ INSTALLED_APPS = [ 'django_object_actions', # provides easy Django Admin action buttons on change views 
https://github.com/crccheck/django-object-actions # Our ArchiveBox-provided apps - #'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) + # 'abid_utils', # handles ABID ID creation, handling, and models + 'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here) 'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc. 'queues', # handles starting and managing background workers and processes - 'abid_utils', # handles ABID ID creation, handling, and models + 'seeds', # handles Seed model and URL source management + 'crawls', # handles Crawl and CrawlSchedule models and management + 'personas', # handles Persona and session management 'core', # core django model with Snapshot, ArchiveResult, etc. 'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc. @@ -262,37 +263,38 @@ MIGRATION_MODULES = {'signal_webhooks': None} DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField' -HUEY = { - "huey_class": "huey.SqliteHuey", - "filename": CONSTANTS.QUEUE_DATABASE_FILENAME, - "name": "system_tasks", - "results": True, - "store_none": True, - "immediate": False, - "utc": True, - "consumer": { - "workers": 1, - "worker_type": "thread", - "initial_delay": 0.1, # Smallest polling interval, same as -d. - "backoff": 1.15, # Exponential backoff using this rate, -b. - "max_delay": 10.0, # Max possible polling interval, -m. - "scheduler_interval": 1, # Check schedule every second, -s. - "periodic": True, # Enable crontab feature. - "check_worker_health": True, # Enable worker health checks. - "health_check_interval": 1, # Check worker health every second. - }, -} +if not IS_GETTING_VERSION_OR_HELP: # dont create queue.sqlite3 file if we're just running to get --version or --help + HUEY = { + "huey_class": "huey.SqliteHuey", + "filename": CONSTANTS.QUEUE_DATABASE_FILENAME, + "name": "system_tasks", + "results": True, + "store_none": True, + "immediate": False, + "utc": True, + "consumer": { + "workers": 1, + "worker_type": "thread", + "initial_delay": 0.1, # Smallest polling interval, same as -d. + "backoff": 1.15, # Exponential backoff using this rate, -b. + "max_delay": 10.0, # Max possible polling interval, -m. + "scheduler_interval": 1, # Check schedule every second, -s. + "periodic": True, # Enable crontab feature. + "check_worker_health": True, # Enable worker health checks. + "health_check_interval": 1, # Check worker health every second. 
+ }, + } -# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up -# https://github.com/gaiacoop/django-huey -DJANGO_HUEY = { - "default": "system_tasks", - "queues": { - HUEY["name"]: HUEY.copy(), - # more registered here at plugin import-time by BaseQueue.register() - **abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME), - }, -} + # https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up + # https://github.com/gaiacoop/django-huey + DJANGO_HUEY = { + "default": "system_tasks", + "queues": { + HUEY["name"]: HUEY.copy(), + # more registered here at plugin import-time by BaseQueue.register() + **abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME), + }, + } class HueyDBRouter: """ @@ -410,7 +412,7 @@ SHELL_PLUS_PRINT_SQL = False IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner'] IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell' if IS_SHELL: - os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py') + os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'misc' / 'shell_welcome_message.py') ################################################################################ @@ -610,6 +612,6 @@ if DEBUG_REQUESTS_TRACKER: abx.django.use.register_checks() -abx.archivebox.use.register_all_hooks(globals()) +# abx.archivebox.reads.register_all_hooks(globals()) # import ipdb; ipdb.set_trace() diff --git a/archivebox/core/urls.py b/archivebox/core/urls.py index e9eb4bca..6143e566 100644 --- a/archivebox/core/urls.py +++ b/archivebox/core/urls.py @@ -5,9 +5,10 @@ from django.views import static from django.conf import settings from django.views.generic.base import RedirectView -from .admin import archivebox_admin -from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView -from .serve_static import serve_static +from archivebox.misc.serve_static import serve_static + +from core.admin_site import archivebox_admin +from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView # GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306 # from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 205dc201..d423c146 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -24,16 +24,15 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view from core.models import Snapshot from core.forms import AddLinkForm -from core.admin import result_url from queues.tasks import bg_add from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.misc.util import base_url, htmlencode, ts_to_date_str +from archivebox.misc.serve_static import serve_static_with_byterange_support -from .serve_static import serve_static_with_byterange_support -from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG +from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG from ..logging_util import printable_filesize from ..search import query_search_index @@ -452,6 +451,8 @@ class AddView(UserPassesTestMixin, FormView): } def form_valid(self, form): + from core.admin_archiveresults import result_url + url = form.cleaned_data["url"] print(f'[+] Adding URL: {url}') parser = form.cleaned_data["parser"] @@ -502,7 +503,7 @@ def find_config_section(key: 
str) -> str: if key in CONSTANTS_CONFIG: return 'CONSTANT' matching_sections = [ - section.id for section in settings.CONFIGS.values() if key in section.model_fields + section_id for section_id, section in settings.CONFIGS.items() if key in section.model_fields ] section = matching_sections[0] if matching_sections else 'DYNAMIC' return section @@ -559,9 +560,9 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext: # "Aliases": [], } - for section in reversed(list(settings.CONFIGS.values())): + for section_id, section in reversed(list(settings.CONFIGS.items())): for key, field in section.model_fields.items(): - rows['Section'].append(section.id) # section.replace('_', ' ').title().replace(' Config', '') + rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '') rows['Key'].append(ItemLink(key, key=key)) rows['Type'].append(format_html('{}', find_config_type(key))) rows['Value'].append(mark_safe(f'{getattr(section, key)}') if key_is_safe(key) else '******** (redacted)') @@ -612,7 +613,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont "fields": { 'Key': key, 'Type': find_config_type(key), - 'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********', + 'Value': settings.FLAT_CONFIG.get(key, settings.CONFIGS.get(key, None)) if key_is_safe(key) else '********', }, "help_texts": { 'Key': mark_safe(f''' diff --git a/archivebox/plugins_extractor/singlefile/migrations/__init__.py b/archivebox/crawls/__init__.py similarity index 100% rename from archivebox/plugins_extractor/singlefile/migrations/__init__.py rename to archivebox/crawls/__init__.py diff --git a/archivebox/crawls/admin.py b/archivebox/crawls/admin.py new file mode 100644 index 00000000..89892178 --- /dev/null +++ b/archivebox/crawls/admin.py @@ -0,0 +1,28 @@ +__package__ = 'archivebox.crawls' + +import abx + +from abid_utils.admin import ABIDModelAdmin + +from crawls.models import Crawl + + + +class CrawlAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls') + sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls') + search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls') + + readonly_fields = ('created_at', 'modified_at', 'abid_info') + fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields) + + list_filter = ('depth', 'parser', 'created_by') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(Crawl, CrawlAdmin) diff --git a/archivebox/crawls/apps.py b/archivebox/crawls/apps.py new file mode 100644 index 00000000..e7bf709b --- /dev/null +++ b/archivebox/crawls/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class CrawlsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "crawls" diff --git a/archivebox/plugins_search/sqlite/__init__.py b/archivebox/crawls/migrations/__init__.py similarity index 100% rename from archivebox/plugins_search/sqlite/__init__.py rename to archivebox/crawls/migrations/__init__.py diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py new file mode 100644 index 00000000..a806d889 --- /dev/null +++ b/archivebox/crawls/models.py @@ -0,0 +1,164 @@ +__package__ = 'archivebox.crawls' + +from django_stubs_ext.db.models import TypedModelMeta + +from django.db import models +from django.db.models import Q +from django.core.validators import 
MaxValueValidator, MinValueValidator +from django.conf import settings +from django.utils import timezone +from django.urls import reverse_lazy + +from seeds.models import Seed + +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats + + +class CrawlSchedule(ABIDModel, ModelWithHealthStats): + """ + A record for a job that should run repeatedly on a given schedule. + + It pulls from a given Seed and creates a new Crawl for each scheduled run. + The new Crawl will inherit all the properties of the crawl_template Crawl. + """ + abid_prefix = 'sch_' + abid_ts_src = 'self.created_at' + abid_uri_src = 'self.created_by_id' + abid_subtype_src = 'self.schedule' + abid_rand_src = 'self.id' + + schedule = models.CharField(max_length=64, blank=False, null=False) + + is_enabled = models.BooleanField(default=True) + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) + + crawl_set: models.Manager['Crawl'] + + @property + def template(self): + """The base crawl that each new scheduled job should copy as a template""" + return self.crawl_set.first() + + +class Crawl(ABIDModel, ModelWithHealthStats): + """ + A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak. + + A new Crawl should be created for each loading from a Seed (because it can produce a different set of URLs every time its loaded). + E.g. every scheduled import from an RSS feed should create a new Crawl, and more loadings from the same seed each create a new Crawl + + Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a + file URI e.g. file:///sources/_{ui,cli}_add.txt containing the user's input. 
+ """ + abid_prefix = 'crl_' + abid_ts_src = 'self.created_at' + abid_uri_src = 'self.seed.uri' + abid_subtype_src = 'self.persona_id' + abid_rand_src = 'self.id' + abid_drift_allowed = True + + id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') + abid = ABIDField(prefix=abid_prefix) + + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set') + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + + seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False) + max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)]) + tags_str = models.CharField(max_length=1024, blank=True, null=False, default='') + persona = models.CharField(max_length=32, blank=True, null=False, default='auto') + config = models.JSONField(default=dict) + + schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True) + + # crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32) + # tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag') + # schedule = models.JSONField() + # config = models.JSONField() + + # snapshot_set: models.Manager['Snapshot'] + + + class Meta(TypedModelMeta): + verbose_name = 'Crawl' + verbose_name_plural = 'Crawls' + + @property + def template(self): + """If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off""" + if not self.schedule: + return None + return self.schedule.template + + @property + def api_url(self) -> str: + # /api/v1/core/crawl/{uulid} + # TODO: implement get_crawl + return reverse_lazy('api-1:get_crawl', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}' + + @property + def api_docs_url(self) -> str: + return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl' + + +class Outlink(models.Model): + """A record of a link found on a page, pointing to another page.""" + id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') + + src = models.URLField() # parent page where the outlink/href was found e.g. https://example.com/downloads + dst = models.URLField() # remote location the child outlink/href points to e.g. https://example.com/downloads/some_file.pdf + + crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set') + via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set') + + class Meta: + unique_together = (('src', 'dst', 'via'),) + + + + + +# @abx.hookimpl.on_archiveresult_created +# def exec_archiveresult_extractor_effects(archiveresult): +# config = get_scope_config(...) 
+ +# # abx.archivebox.writes.update_archiveresult_started(archiveresult, start_ts=timezone.now()) +# # abx.archivebox.events.on_archiveresult_updated(archiveresult) + +# # check if it should be skipped +# if not abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config): +# abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped') +# abx.archivebox.events.on_archiveresult_skipped(archiveresult, config) +# return + +# # run the extractor method and save the output back to the archiveresult +# try: +# output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config) +# abx.archivebox.writes.update_archiveresult_succeeded(archiveresult, output=output, error=None, end_ts=timezone.now()) +# except Exception as e: +# abx.archivebox.writes.update_archiveresult_failed(archiveresult, error=e, end_ts=timezone.now()) + +# # bump the modified time on the archiveresult and Snapshot +# abx.archivebox.events.on_archiveresult_updated(archiveresult) +# abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot) + + +# @abx.hookimpl.reads.get_outlink_parents +# def get_outlink_parents(url, crawl_pk=None, config=None): +# scope = Q(dst=url) +# if crawl_pk: +# scope = scope | Q(via__snapshot__crawl_id=crawl_pk) + +# parent = list(Outlink.objects.filter(scope)) +# if not parent: +# # base case: we reached the top of the chain, no more parents left +# return [] + +# # recursive case: there is another parent above us, get its parents +# yield parent[0] +# yield from get_outlink_parents(parent[0].src, crawl_pk=crawl_pk, config=config) + + diff --git a/archivebox/crawls/tests.py b/archivebox/crawls/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/crawls/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/crawls/views.py b/archivebox/crawls/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/crawls/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. 
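To make the new crawls data model concrete, here is a minimal usage sketch of how a Seed, Crawl, CrawlSchedule, and Outlink are intended to relate, based only on the fields and docstrings in crawls/models.py above. It is illustrative, not part of this PR: the Seed constructor, the example URIs, and the schedule string are assumptions (only `seed.uri` is implied by `abid_uri_src`), and it presumes a configured Django environment with these apps installed.

```python
import uuid

from seeds.models import Seed                      # Seed's fields are assumed; only `uri` is implied by this diff
from crawls.models import Crawl, CrawlSchedule, Outlink

# 1. An "Add" from the Web UI/CLI gets written to a sources file and wrapped in a Seed.
seed = Seed(uri='file:///sources/_ui_add.txt')      # hypothetical constructor, mirroring the docstring's example URI

# 2. Each load of that Seed becomes a fresh Crawl (the same Seed can expand to different URLs every time).
crawl = Crawl(id=uuid.uuid4(), seed=seed, max_depth=1, persona='auto', tags_str='docs,example')

# 3. A CrawlSchedule re-runs the Seed on a schedule; its first Crawl acts as the template for later runs.
nightly = CrawlSchedule(schedule='@daily', is_enabled=True)   # `schedule` is a free-form CharField here
crawl.schedule = nightly

# 4. Links found while archiving a page are recorded as Outlinks, so later passes can expand the
#    crawl outward up to `max_depth` hops from the seed URLs.
link = Outlink(id=uuid.uuid4(), src='https://example.com/', dst='https://example.com/about', crawl=crawl)
```

Persisting these records would also require the non-null `created_by` / `created_at` fields; the ABID values are populated on `save()`, as the `# populate ABID` comments later in this diff suggest.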
diff --git a/archivebox/extractors/archive_org.py b/archivebox/extractors/archive_org.py index e3451b7b..ff7297cd 100644 --- a/archivebox/extractors/archive_org.py +++ b/archivebox/extractors/archive_org.py @@ -8,8 +8,9 @@ from collections import defaultdict from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..logging_util import TimedProgress diff --git a/archivebox/extractors/dom.py b/archivebox/extractors/dom.py index b770fd46..07057a44 100644 --- a/archivebox/extractors/dom.py +++ b/archivebox/extractors/dom.py @@ -11,6 +11,9 @@ from archivebox.misc.util import ( ) from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'output.html' @@ -18,7 +21,6 @@ def get_output_path(): @enforce_types def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if is_static_file(link.url): return False @@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """print HTML of site to file using chrome --dump-html""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py index 06bc1386..09cfae44 100644 --- a/archivebox/extractors/favicon.py +++ b/archivebox/extractors/favicon.py @@ -4,8 +4,9 @@ from pathlib import Path from archivebox.misc.system import chmod_file, run from archivebox.misc.util import enforce_types, domain, dedupe -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..logging_util import TimedProgress diff --git a/archivebox/extractors/git.py b/archivebox/extractors/git.py index 2ae08064..9ac71d3e 100644 --- a/archivebox/extractors/git.py +++ b/archivebox/extractors/git.py @@ -13,10 +13,12 @@ from archivebox.misc.util import ( without_query, without_fragment, ) -from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY from ..logging_util import TimedProgress from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError +from archivebox.plugins_extractor.git.config import GIT_CONFIG +from archivebox.plugins_extractor.git.binaries import GIT_BINARY + def get_output_path(): return 'git/' diff --git a/archivebox/extractors/headers.py b/archivebox/extractors/headers.py index 85946619..e49907cb 100644 --- 
a/archivebox/extractors/headers.py +++ b/archivebox/extractors/headers.py @@ -10,7 +10,8 @@ from archivebox.misc.util import ( get_headers, dedupe, ) -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY from ..index.schema import Link, ArchiveResult, ArchiveOutput from ..logging_util import TimedProgress diff --git a/archivebox/extractors/media.py b/archivebox/extractors/media.py index 9f3d80d5..c1f3bbc9 100644 --- a/archivebox/extractors/media.py +++ b/archivebox/extractors/media.py @@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.ytdlp.config import YTDLP_CONFIG +from plugins_extractor.ytdlp.binaries import YTDLP_BINARY def get_output_path(): return 'media/' @@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None): @enforce_types def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.ytdlp.apps import YTDLP_CONFIG if is_static_file(link.url): return False @@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult: """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp""" - - # from plugins_extractor.chrome.apps import CHROME_CONFIG - from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG - YTDLP_BIN = YTDLP_BINARY.load() assert YTDLP_BIN.abspath and YTDLP_BIN.version diff --git a/archivebox/extractors/mercury.py b/archivebox/extractors/mercury.py index a0cb86fa..08be60ad 100644 --- a/archivebox/extractors/mercury.py +++ b/archivebox/extractors/mercury.py @@ -12,7 +12,8 @@ from archivebox.misc.util import ( enforce_types, is_static_file, ) -from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY +from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG +from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY from ..logging_util import TimedProgress diff --git a/archivebox/extractors/pdf.py b/archivebox/extractors/pdf.py index 78b54f34..d3310ba1 100644 --- a/archivebox/extractors/pdf.py +++ b/archivebox/extractors/pdf.py @@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import ( enforce_types, is_static_file, ) +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'output.pdf' @@ -18,7 +21,6 @@ def get_output_path(): @enforce_types def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if 
is_static_file(link.url): return False @@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """print PDF of site to file using chrome --headless""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/readability.py b/archivebox/extractors/readability.py index 9205167a..ccfde023 100644 --- a/archivebox/extractors/readability.py +++ b/archivebox/extractors/readability.py @@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile from typing import Optional import json -from ..index.schema import Link, ArchiveResult, ArchiveError from archivebox.misc.system import run, atomic_write from archivebox.misc.util import enforce_types, is_static_file +from ..index.schema import Link, ArchiveResult, ArchiveError from ..logging_util import TimedProgress from .title import get_html +from plugins_extractor.readability.config import READABILITY_CONFIG +from plugins_extractor.readability.binaries import READABILITY_BINARY + + def get_output_path(): return 'readability/' @@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None): @enforce_types def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.readability.apps import READABILITY_CONFIG if is_static_file(link.url): return False @@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult: """download reader friendly version using @mozilla/readability""" - from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY - READABILITY_BIN = READABILITY_BINARY.load() assert READABILITY_BIN.abspath and READABILITY_BIN.version diff --git a/archivebox/extractors/screenshot.py b/archivebox/extractors/screenshot.py index 9ed7016e..adc309aa 100644 --- a/archivebox/extractors/screenshot.py +++ b/archivebox/extractors/screenshot.py @@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors' from pathlib import Path from typing import Optional -from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file +from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY + def get_output_path(): return 'screenshot.png' @@ -15,7 +18,6 @@ def get_output_path(): @enforce_types def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.chrome.apps import CHROME_CONFIG if is_static_file(link.url): return False @@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """take screenshot of site using chrome --headless""" - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py index 
470d5da3..6988fd25 100644 --- a/archivebox/extractors/singlefile.py +++ b/archivebox/extractors/singlefile.py @@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file from archivebox.misc.util import enforce_types, is_static_file, dedupe from ..logging_util import TimedProgress +from plugins_extractor.chrome.config import CHROME_CONFIG +from plugins_extractor.chrome.binaries import CHROME_BINARY +from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG +from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY + def get_output_path(): return 'singlefile.html' @@ -17,7 +22,6 @@ def get_output_path(): @enforce_types def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool: - from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG if is_static_file(link.url): return False @@ -26,15 +30,12 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: if not overwrite and (out_dir / get_output_path()).exists(): return False - return SINGLEFILE_CONFIG.SAVE_SINGLEFILE + return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE @enforce_types def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult: """download full site using single-file""" - - from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY - from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY CHROME_BIN = CHROME_BINARY.load() assert CHROME_BIN.abspath and CHROME_BIN.version diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py index fa528a97..ceefb699 100644 --- a/archivebox/extractors/title.py +++ b/archivebox/extractors/title.py @@ -11,7 +11,9 @@ from archivebox.misc.util import ( htmldecode, dedupe, ) -from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY +from archivebox.plugins_extractor.curl.config import CURL_CONFIG +from archivebox.plugins_extractor.curl.binaries import CURL_BINARY + from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError from ..logging_util import TimedProgress diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py index 2107ac1b..416e797e 100644 --- a/archivebox/extractors/wget.py +++ b/archivebox/extractors/wget.py @@ -17,8 +17,8 @@ from archivebox.misc.util import ( urldecode, dedupe, ) -from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG - +from archivebox.plugins_extractor.wget.config import WGET_CONFIG +from archivebox.plugins_extractor.wget.binaries import WGET_BINARY from ..logging_util import TimedProgress from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError diff --git a/archivebox/index/html.py b/archivebox/index/html.py index b46e9911..eae93e67 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -19,7 +19,7 @@ from archivebox.misc.util import ( from archivebox.config import CONSTANTS, DATA_DIR, VERSION from archivebox.config.common import SERVER_CONFIG from archivebox.config.version import get_COMMIT_HASH -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG +from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG from .schema import Link from ..logging_util import printable_filesize diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index bdd93df4..a3c0e967 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -19,7 +19,7 @@ from 
django.utils.functional import cached_property from archivebox.config import ARCHIVE_DIR, CONSTANTS -from plugins_extractor.favicon.apps import FAVICON_CONFIG +from plugins_extractor.favicon.config import FAVICON_CONFIG from archivebox.misc.system import get_dir_size from archivebox.misc.util import ts_to_date_str, parse_date diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index cb07d546..18b811e7 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -160,4 +160,4 @@ def apply_migrations(out_dir: Path=DATA_DIR) -> List[str]: @enforce_types def get_admins(out_dir: Path=DATA_DIR) -> List[str]: from django.contrib.auth.models import User - return User.objects.filter(is_superuser=True) + return User.objects.filter(is_superuser=True).exclude(username='system') diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index f4503a1f..33ab0766 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -510,7 +510,7 @@ def log_removal_finished(all_links: int, to_remove: int): ### Helpers @enforce_types -def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str: +def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str: """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc""" pwd = str(Path(pwd)) # .resolve() path = str(path) @@ -520,7 +520,10 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str: # replace long absolute paths with ./ relative ones to save on terminal output width if path.startswith(pwd) and (pwd != '/') and path != pwd: - path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1) + if color: + path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1) + else: + path = path.replace(pwd, '.', 1) # quote paths containing spaces if ' ' in path: diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py new file mode 100644 index 00000000..e3039a78 --- /dev/null +++ b/archivebox/machine/admin.py @@ -0,0 +1,94 @@ +__package__ = 'archivebox.machine' + +import abx + +from django.contrib import admin +from django.utils.html import format_html + +from abid_utils.admin import ABIDModelAdmin + +from machine.models import Machine, NetworkInterface, InstalledBinary + + + +class MachineAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health') + sort_fields = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid') + # search_fields = ('id', 'abid', 'guid', 'hostname', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release') + + readonly_fields = ('guid', 'created_at', 'modified_at', 'abid_info', 'ips') + fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed') + + list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display( + description='Public IP', + ordering='networkinterface__ip_public', + ) + def ips(self, machine): + return format_html( + '{}', + machine.abid, + ', 
'.join(machine.networkinterface_set.values_list('ip_public', flat=True)), + ) + +class NetworkInterfaceAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health') + sort_fields = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address') + search_fields = ('abid', 'machine__abid', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country') + + readonly_fields = ('machine', 'created_at', 'modified_at', 'abid_info', 'mac_address', 'ip_public', 'ip_local', 'dns_server') + fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed') + + list_filter = ('isp', 'country', 'region') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display( + description='Machine', + ordering='machine__abid', + ) + def machine_info(self, iface): + return format_html( + '[{}]   {}', + iface.machine.id, + iface.machine.abid, + iface.machine.hostname, + ) + +class InstalledBinaryAdmin(ABIDModelAdmin): + list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health') + sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256') + search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256') + + readonly_fields = ('created_at', 'modified_at', 'abid_info') + fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed') + + list_filter = ('name', 'binprovider', 'machine_id') + ordering = ['-created_at'] + list_per_page = 100 + actions = ["delete_selected"] + + @admin.display( + description='Machine', + ordering='machine__abid', + ) + def machine_info(self, installed_binary): + return format_html( + '[{}]   {}', + installed_binary.machine.id, + installed_binary.machine.abid, + installed_binary.machine.hostname, + ) + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(Machine, MachineAdmin) + admin_site.register(NetworkInterface, NetworkInterfaceAdmin) + admin_site.register(InstalledBinary, InstalledBinaryAdmin) diff --git a/archivebox/machine/apps.py b/archivebox/machine/apps.py index 960ffefe..73ae3b6c 100644 --- a/archivebox/machine/apps.py +++ b/archivebox/machine/apps.py @@ -2,9 +2,17 @@ __package__ = 'archivebox.machine' from django.apps import AppConfig +import abx + class MachineConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' name = 'machine' verbose_name = 'Machine Info' + + +@abx.hookimpl +def register_admin(admin_site): + from machine.admin import register_admin + register_admin(admin_site) diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index b1854d44..229e1d83 100644 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -8,66 +8,41 @@ from django.db import models from django.utils import timezone from django.utils.functional import cached_property +import abx.archivebox.reads -import abx.archivebox.use from abx.archivebox.base_binary import BaseBinary, BaseBinProvider -from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField +from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats 
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats -CURRENT_MACHINE = None # global cache for the current machine -CURRENT_INTERFACE = None # global cache for the current network interface -CURRENT_BINARIES = {} # global cache for the currently installed binaries +_CURRENT_MACHINE = None # global cache for the current machine +_CURRENT_INTERFACE = None # global cache for the current network interface +_CURRENT_BINARIES = {} # global cache for the currently installed binaries + + MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 # 1 week (how often should we check for OS/hardware changes?) NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 # 1 hour (how often should we check for public IP/private IP/DNS changes?) INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60 # 30min (how often should we check for changes to locally installed binaries?) -class ModelWithHealthStats(models.Model): - num_uses_failed = models.PositiveIntegerField(default=0) - num_uses_succeeded = models.PositiveIntegerField(default=0) - - class Meta: - abstract = True - - def record_health_failure(self) -> None: - self.num_uses_failed += 1 - self.save() - - def record_health_success(self) -> None: - self.num_uses_succeeded += 1 - self.save() - - def reset_health(self) -> None: - # move all the failures to successes when resetting so we dont lose track of the total count - self.num_uses_succeeded = self.num_uses_failed + self.num_uses_succeeded - self.num_uses_failed = 0 - self.save() - - @property - def health(self) -> int: - total_uses = max((self.num_uses_failed + self.num_uses_succeeded, 1)) - success_pct = (self.num_uses_succeeded / total_uses) * 100 - return round(success_pct) - class MachineManager(models.Manager): def current(self) -> 'Machine': """Get the current machine that ArchiveBox is running on.""" - global CURRENT_MACHINE - if CURRENT_MACHINE: - expires_at = CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL) + global _CURRENT_MACHINE + if _CURRENT_MACHINE: + expires_at = _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL) if timezone.now() < expires_at: # assume current machine cant change *while archivebox is actively running on it* # it's not strictly impossible to swap hardware while code is running, # but its rare and unusual so we check only once per week # (e.g. VMWare can live-migrate a VM to a new host while it's running) - return CURRENT_MACHINE + return _CURRENT_MACHINE else: - CURRENT_MACHINE = None + _CURRENT_MACHINE = None - CURRENT_MACHINE, _created = self.update_or_create( + _CURRENT_MACHINE, _created = self.update_or_create( guid=get_host_guid(), defaults={ 'hostname': socket.gethostname(), @@ -76,11 +51,14 @@ class MachineManager(models.Manager): 'stats': get_host_stats(), }, ) - CURRENT_MACHINE.save() # populate ABID + _CURRENT_MACHINE.save() # populate ABID - return CURRENT_MACHINE + return _CURRENT_MACHINE + class Machine(ABIDModel, ModelWithHealthStats): + """Audit log entry for a physical machine that was used to do archiving.""" + abid_prefix = 'mxn_' abid_ts_src = 'self.created_at' abid_uri_src = 'self.guid' @@ -113,6 +91,7 @@ class Machine(ABIDModel, ModelWithHealthStats): # STATS COUNTERS stats = models.JSONField(default=dict, null=False) # e.g. 
{"cpu_load": [1.25, 2.4, 1.4], "mem_swap_used_pct": 56, ...} + # num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats # num_uses_succeeded = models.PositiveIntegerField(default=0) @@ -127,18 +106,18 @@ class NetworkInterfaceManager(models.Manager): def current(self) -> 'NetworkInterface': """Get the current network interface for the current machine.""" - global CURRENT_INTERFACE - if CURRENT_INTERFACE: + global _CURRENT_INTERFACE + if _CURRENT_INTERFACE: # assume the current network interface (public IP, DNS servers, etc.) wont change more than once per hour - expires_at = CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL) + expires_at = _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL) if timezone.now() < expires_at: - return CURRENT_INTERFACE + return _CURRENT_INTERFACE else: - CURRENT_INTERFACE = None + _CURRENT_INTERFACE = None machine = Machine.objects.current() net_info = get_host_network() - CURRENT_INTERFACE, _created = self.update_or_create( + _CURRENT_INTERFACE, _created = self.update_or_create( machine=machine, ip_public=net_info.pop('ip_public'), ip_local=net_info.pop('ip_local'), @@ -146,14 +125,16 @@ class NetworkInterfaceManager(models.Manager): dns_server=net_info.pop('dns_server'), defaults=net_info, ) - CURRENT_INTERFACE.save() # populate ABID + _CURRENT_INTERFACE.save() # populate ABID - return CURRENT_INTERFACE + return _CURRENT_INTERFACE class NetworkInterface(ABIDModel, ModelWithHealthStats): + """Audit log entry for a physical network interface / internet connection that was used to do archiving.""" + abid_prefix = 'ixf_' abid_ts_src = 'self.machine.created_at' abid_uri_src = 'self.machine.guid' @@ -183,7 +164,7 @@ class NetworkInterface(ABIDModel, ModelWithHealthStats): region = models.CharField(max_length=63, default=None, null=False) # e.g. California country = models.CharField(max_length=63, default=None, null=False) # e.g. 
United States - # STATS COUNTERS (from ModelWithHealthStats) + # STATS COUNTERS (inherited from ModelWithHealthStats) # num_uses_failed = models.PositiveIntegerField(default=0) # num_uses_succeeded = models.PositiveIntegerField(default=0) @@ -202,8 +183,8 @@ class InstalledBinaryManager(models.Manager): def get_from_db_or_cache(self, binary: BaseBinary) -> 'InstalledBinary': """Get or create an InstalledBinary record for a Binary on the local machine""" - global CURRENT_BINARIES - cached_binary = CURRENT_BINARIES.get(binary.id) + global _CURRENT_BINARIES + cached_binary = _CURRENT_BINARIES.get(binary.name) if cached_binary: expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL) if timezone.now() < expires_at: @@ -218,7 +199,7 @@ or binary.sha256 != cached_binary.sha256 ) if is_different_from_cache: - CURRENT_BINARIES.pop(binary.id) + _CURRENT_BINARIES.pop(binary.name) else: return cached_binary else: @@ -229,7 +210,7 @@ return cached_binary else: # cached binary is too old, reload it from scratch - CURRENT_BINARIES.pop(binary.id) + _CURRENT_BINARIES.pop(binary.name) if not binary.abspath or not binary.version or not binary.sha256: # if binary was not yet loaded from filesystem, do it now @@ -239,7 +220,7 @@ assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256' - CURRENT_BINARIES[binary.id], _created = self.update_or_create( + _CURRENT_BINARIES[binary.name], _created = self.update_or_create( machine=Machine.objects.current(), name=binary.name, binprovider=binary.loaded_binprovider.name, @@ -247,7 +228,7 @@ abspath=str(binary.loaded_abspath), sha256=str(binary.loaded_sha256), ) - cached_binary = CURRENT_BINARIES[binary.id] + cached_binary = _CURRENT_BINARIES[binary.name] cached_binary.save() # populate ABID # if we get this far, make sure the DB record matches the in-memory cache @@ -282,11 +263,11 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats): version = models.CharField(max_length=32, default=None, null=False, blank=True) sha256 = models.CharField(max_length=64, default=None, null=False, blank=True) - # MUTABLE PROPERTIES + # MUTABLE PROPERTIES (TODO) # is_pinned = models.BooleanField(default=False) # i.e. should this binary supersede other binaries with the same name on the host? # is_valid = models.BooleanField(default=True) # i.e. is this binary still available on the host?
- # STATS COUNTERS (from ModelWithHealthStats) + # STATS COUNTERS (inherited from ModelWithHealthStats) # num_uses_failed = models.PositiveIntegerField(default=0) # num_uses_succeeded = models.PositiveIntegerField(default=0) @@ -310,7 +291,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats): if not hasattr(self, 'machine'): self.machine = Machine.objects.current() if not self.binprovider: - all_known_binproviders = list(abx.archivebox.use.get_BINPROVIDERS().values()) + all_known_binproviders = list(abx.archivebox.reads.get_BINPROVIDERS().values()) binary = BaseBinary(name=self.name, binproviders=all_known_binproviders).load(fresh=True) self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None if not self.abspath: @@ -324,7 +305,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats): @cached_property def BINARY(self) -> BaseBinary: - for binary in abx.archivebox.use.get_BINARIES().values(): + for binary in abx.archivebox.reads.get_BINARIES().values(): if binary.name == self.name: return binary raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it') @@ -332,7 +313,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats): @cached_property def BINPROVIDER(self) -> BaseBinProvider: - for binprovider in abx.archivebox.use.get_BINPROVIDERS().values(): + for binprovider in abx.archivebox.reads.get_BINPROVIDERS().values(): if binprovider.name == self.binprovider: return binprovider raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})') diff --git a/archivebox/main.py b/archivebox/main.py index 8caabd80..5ed3973f 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -189,14 +189,16 @@ def version(quiet: bool=False, if quiet or '--version' in sys.argv: return + from rich.panel import Panel from rich.console import Console console = Console() prnt = console.print - from plugins_auth.ldap.apps import LDAP_CONFIG + from plugins_auth.ldap.config import LDAP_CONFIG from django.conf import settings from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID + from archivebox.config.paths import get_data_locations, get_code_locations from abx.archivebox.base_binary import BaseBinary, apt, brew, env @@ -221,7 +223,7 @@ def version(quiet: bool=False, f'PLATFORM={platform.platform()}', f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''), ) - OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount + OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() prnt( f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}', @@ -240,6 +242,21 @@ def version(quiet: bool=False, #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually ) prnt() + + if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)): + PANEL_TEXT = '\n'.join(( + # '', + # f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]', + '', + '[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox 
version[/green] again...', + ' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.', + '', + ' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]', + '', + )) + prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]')) + prnt() + return prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]') failures = [] @@ -299,13 +316,13 @@ def version(quiet: bool=False, prnt() prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]') - for name, path in CONSTANTS.CODE_LOCATIONS.items(): + for name, path in get_code_locations().items(): prnt(printable_folder_status(name, path), overflow='ignore', crop=False) prnt() if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK): prnt('[bright_yellow][i] Data locations:[/bright_yellow]') - for name, path in CONSTANTS.DATA_LOCATIONS.items(): + for name, path in get_data_locations().items(): prnt(printable_folder_status(name, path), overflow='ignore', crop=False) from archivebox.misc.checks import check_data_dir_permissions @@ -395,7 +412,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}') # from django.contrib.auth.models import User - # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists(): + # if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists(): # print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI)) # call_command("createsuperuser", interactive=True) @@ -486,9 +503,13 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat html_index.rename(f"{index_name}.html") CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True) - CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True) - CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True) - + CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True) + CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True) + + from archivebox.config.common import STORAGE_CONFIG + STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True) + STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True) + if install: run_subcommand('install', pwd=out_dir) @@ -1115,14 +1136,14 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina from django.contrib.auth import get_user_model User = get_user_model() - if not User.objects.filter(is_superuser=True).exists(): + if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green') stderr(' archivebox manage createsuperuser') # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir) print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr) - from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY + from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY extra_args = [] if binproviders: @@ -1253,7 +1274,7 @@ def schedule(add: bool=False, """Set ArchiveBox to regularly import URLs at specific times using cron""" check_data_folder() - from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY + from 
archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY from archivebox.config.permissions import USER Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True) @@ -1399,46 +1420,43 @@ def server(runserver_args: Optional[List[str]]=None, from django.core.management import call_command from django.contrib.auth.models import User + if not User.objects.filter(is_superuser=True).exclude(username='system').exists(): + print() + # print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]') + print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:') + print(' [green]archivebox manage createsuperuser[/green]') + print() - print('[green][+] Starting ArchiveBox webserver...[/green]') - print(' > Logging errors to ./logs/errors.log') - if not User.objects.filter(is_superuser=True).exists(): - print('[yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow]') - print() - print(' [violet]Hint:[/violet] To create an admin user, run:') - print(' archivebox manage createsuperuser') - print() + host = '127.0.0.1' + port = '8000' + try: + host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0] + if ':' in host_and_port: + host, port = host_and_port.split(':') + else: + if '.' in host_and_port: + host = host_and_port + else: + port = host_and_port + except IndexError: + pass + + print('[green][+] Starting ArchiveBox webserver...[/green]') + print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') + print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]') + print(' > Writing ArchiveBox error log to ./logs/errors.log') if SHELL_CONFIG.DEBUG: if not reload: runserver_args.append('--noreload') # '--insecure' call_command("runserver", *runserver_args) else: - host = '127.0.0.1' - port = '8000' - - try: - host_and_port = [arg for arg in runserver_args if arg.replace('.', '').replace(':', '').isdigit()][0] - if ':' in host_and_port: - host, port = host_and_port.split(':') - else: - if '.' 
in host_and_port: - host = host_and_port - else: - port = host_and_port - except IndexError: - pass - - print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]') - from queues.supervisor_util import start_server_workers print() - start_server_workers(host=host, port=port, daemonize=False) - print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]") diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py index 4d12b7df..5fe02055 100644 --- a/archivebox/misc/checks.py +++ b/archivebox/misc/checks.py @@ -5,16 +5,24 @@ import sys from pathlib import Path from rich import print +from rich.panel import Panel -# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE +# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE to anything other than builtin python libraries # this file is imported by archivebox/__init__.py # and any imports here will be imported by EVERYTHING else # so this file should only be used for pure python checks # that don't need to import other parts of ArchiveBox +# if a check needs to import other parts of ArchiveBox, +# the imports should be done inside the check function +# and you should make sure if you need to import any django stuff +# that the check is called after django.setup() has been called + def check_data_folder() -> None: from archivebox import DATA_DIR, ARCHIVE_DIR + from archivebox.config import CONSTANTS + from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir() if not archive_dir_exists: @@ -30,13 +38,27 @@ def check_data_folder() -> None: raise SystemExit(2) + # Create data dir subdirs + create_and_chown_dir(CONSTANTS.SOURCES_DIR) + create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default') + create_and_chown_dir(CONSTANTS.LOGS_DIR) + # create_and_chown_dir(CONSTANTS.CACHE_DIR) + + # Create /tmp and /lib dirs if they don't exist + get_or_create_working_tmp_dir(autofix=True, quiet=False) + get_or_create_working_lib_dir(autofix=True, quiet=False) + + # Check data dir permissions, /tmp, and /lib permissions + check_data_dir_permissions() + def check_migrations(): - from archivebox import DATA_DIR, CONSTANTS + from archivebox import DATA_DIR from ..index.sql import list_migrations pending_migrations = [name for status, name in list_migrations() if not status] + is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init']) - if pending_migrations: + if pending_migrations and not is_migrating: print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]') print(f' {DATA_DIR}', file=sys.stderr) print(file=sys.stderr) @@ -44,13 +66,6 @@ def check_migrations(): print(' archivebox init', file=sys.stderr) raise SystemExit(3) - CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True) - CONSTANTS.LOGS_DIR.mkdir(exist_ok=True) - # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True) - (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True) - (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True) - - def check_io_encoding(): PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8') @@ -127,3 +142,98 @@ def check_data_dir_permissions(): STDERR.print(' 
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]') STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]') STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]') + + from archivebox.config.common import STORAGE_CONFIG + + # Check /tmp dir permissions + check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True) + + # Check /lib dir permissions + check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True) + + +def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True): + from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir + from archivebox.misc.logging import STDERR + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.config.common import STORAGE_CONFIG + from archivebox.logging_util import pretty_path + + tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR + socket_file = tmp_dir.absolute().resolve() / "supervisord.sock" + + if not must_exist and not os.path.isdir(tmp_dir): + # just check that its viable based on its length (because dir may not exist yet, we cant check if its writable) + return len(f'file://{socket_file}') <= 96 + + tmp_is_valid = False + try: + tmp_is_valid = dir_is_writable(tmp_dir) + tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir) + assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}' + assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.' 
+ return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = '\n'.join(( + '', + f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]', + f' [yellow]{e}[/yellow]', + '', + '[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.', + ' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).', + f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', + ' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.', + ' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]', + '', + '[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:', + f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]', + '', + )) + STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.')) + STDERR.print() + if throw: + raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e + return False + + +def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True): + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + from archivebox.misc.logging import STDERR + from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir + from archivebox.config.common import STORAGE_CONFIG + from archivebox.logging_util import pretty_path + + lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR + + if not must_exist and not os.path.isdir(lib_dir): + return True + + lib_is_valid = False + try: + lib_is_valid = dir_is_writable(lib_dir) + assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}' + return True + except Exception as e: + if not quiet: + STDERR.print() + ERROR_TEXT = '\n'.join(( + '', + f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]', + f' [yellow]{e}[/yellow]', + '', + '[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.', + f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).', + ' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).', + ' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]', + '', + '[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:', + f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]', + '', + )) + STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]')) + STDERR.print() + if throw: + raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e + return 
False diff --git a/archivebox/misc/paginators.py b/archivebox/misc/paginators.py new file mode 100644 index 00000000..2e623a65 --- /dev/null +++ b/archivebox/misc/paginators.py @@ -0,0 +1,30 @@ +__package__ = 'archivebox.misc' + +from django.core.paginator import Paginator +from django.utils.functional import cached_property + + +class AccelleratedPaginator(Paginator): + """ + Accelerated Paginator that ignores DISTINCT when counting the total number of rows. + Speeds up SELECT Count(*) on Admin views by >20x. + https://hakibenita.com/optimizing-the-django-admin-paginator + """ + + @cached_property + def count(self): + if self.object_list._has_filters(): # type: ignore + # fall back to the normal count method on filtered querysets + return super().count + else: + # otherwise count total rows in a separate fast query + return self.object_list.model.objects.count() + + # Alternative approach for PostgreSQL: fallback count takes > 200ms + # from django.db import connection, transaction, OperationalError + # with transaction.atomic(), connection.cursor() as cursor: + # cursor.execute('SET LOCAL statement_timeout TO 200;') + # try: + # return super().count + # except OperationalError: + # return 9999999999999 diff --git a/archivebox/core/serve_static.py b/archivebox/misc/serve_static.py similarity index 100% rename from archivebox/core/serve_static.py rename to archivebox/misc/serve_static.py diff --git a/archivebox/core/shell_welcome_message.py b/archivebox/misc/shell_welcome_message.py similarity index 94% rename from archivebox/core/shell_welcome_message.py rename to archivebox/misc/shell_welcome_message.py index b1ed1b58..5b85e6bd 100644 --- a/archivebox/core/shell_welcome_message.py +++ b/archivebox/misc/shell_welcome_message.py @@ -49,7 +49,7 @@ if __name__ == '__main__': prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!') prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]') - prnt(' [link=https://docs.archivebox.io/en/latest/modules.html]https://docs.archivebox.io/en/latest/modules.html[/link]') + prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]') prnt() prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]') prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ?
after anything to get help[/]') diff --git a/archivebox/misc/toml_util.py b/archivebox/misc/toml_util.py index d4784335..9dd51d1b 100644 --- a/archivebox/misc/toml_util.py +++ b/archivebox/misc/toml_util.py @@ -82,10 +82,10 @@ class JSONSchemaWithLambdas(GenerateJsonSchema): if isinstance(default, Callable): return '{{lambda ' + inspect.getsource(default).split('=lambda ')[-1].strip()[:-1] + '}}' return to_jsonable_python( - default, - timedelta_mode=config.ser_json_timedelta, - bytes_mode=config.ser_json_bytes, - serialize_unknown=True + default, + timedelta_mode=config.ser_json_timedelta, + bytes_mode=config.ser_json_bytes, + serialize_unknown=True ) # for computed_field properties render them like this instead: diff --git a/archivebox/personas/__init__.py b/archivebox/personas/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/personas/admin.py b/archivebox/personas/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/archivebox/personas/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/archivebox/personas/apps.py b/archivebox/personas/apps.py new file mode 100644 index 00000000..02c85655 --- /dev/null +++ b/archivebox/personas/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class SessionsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "personas" diff --git a/archivebox/personas/migrations/__init__.py b/archivebox/personas/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/personas/models.py b/archivebox/personas/models.py new file mode 100644 index 00000000..0b5c693b --- /dev/null +++ b/archivebox/personas/models.py @@ -0,0 +1,67 @@ +from django.db import models + +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats + +from django.conf import settings + + +# class Persona(ABIDModel, ModelWithHealthStats): +# """Aka a "SessionType", its a template for a crawler browsing session containing some config.""" + +# abid_prefix = 'prs_' +# abid_ts_src = 'self.created_at' +# abid_uri_src = 'self.name' +# abid_subtype_src = 'self.created_by' +# abid_rand_src = 'self.id' + +# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID') +# abid = ABIDField(prefix=abid_prefix) + +# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) +# created_at = AutoDateTimeField(default=None, null=False, db_index=True) +# modified_at = models.DateTimeField(auto_now=True) + +# name = models.CharField(max_length=100, blank=False, null=False, editable=False) + +# persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False) +# config = models.JSONField(default=dict) +# # e.g. { +# # USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36', +# # COOKIES_TXT_FILE: '/path/to/cookies.txt', +# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir', +# # CHECK_SSL_VALIDITY: False, +# # SAVE_ARCHIVE_DOT_ORG: True, +# # CHROME_BINARY: 'chromium' +# # ... 
+# # } +# # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='') +# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='') + +# class Meta: +# verbose_name = 'Session Type' +# verbose_name_plural = 'Session Types' +# unique_together = (('created_by', 'name'),) + + +# def clean(self): +# self.persona_dir = settings.PERSONAS_DIR / self.name +# assert self.persona_dir == settings.PERSONAS_DIR / self.name, f'Persona dir {self.persona_dir} must match settings.PERSONAS_DIR / self.name' + + +# # make sure config keys all exist in FLAT_CONFIG +# # make sure config values all match expected types +# pass + +# def save(self, *args, **kwargs): +# self.full_clean() + +# # make sure basic file structure is present in persona_dir: +# # - PERSONAS_DIR / self.name / +# # - chrome_profile/ +# # - chrome_downloads/ +# # - chrome_extensions/ +# # - cookies.txt +# # - auth.json +# # - config.json # json dump of the model + +# super().save(*args, **kwargs) diff --git a/archivebox/personas/tests.py b/archivebox/personas/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/personas/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/personas/views.py b/archivebox/personas/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/personas/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. diff --git a/archivebox/plugins_auth/ldap/__init__.py b/archivebox/plugins_auth/ldap/__init__.py index e69de29b..66d5ad88 100644 --- a/archivebox/plugins_auth/ldap/__init__.py +++ b/archivebox/plugins_auth/ldap/__init__.py @@ -0,0 +1,72 @@ +__package__ = 'plugins_auth.ldap' +__id__ = 'ldap' +__label__ = 'LDAP' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap' +__dependencies__ = ['pip'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + __id__: { + 'id': __id__, + 'package': __package__, + 'label': __label__, + 'version': __version__, + 'author': __author__, + 'homepage': __homepage__, + 'dependencies': __dependencies__, + } + } + + + +@abx.hookimpl +def get_CONFIG(): + from .config import LDAP_CONFIG + return { + __id__: LDAP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import LDAP_BINARY + + return { + 'ldap': LDAP_BINARY, + } + + +def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs): + """ + Invoked after LDAP authenticates a user, but before they have a local User account created. + ArchiveBox requires staff/superuser status to view the admin at all, so we must create a user + + set staff and superuser when LDAP authenticates a new person. + """ + from django.conf import settings + + if user is None: + return # not authenticated at all + + if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER: + user.is_superuser = True # authenticated via LDAP, but user is not set up in DB yet + + user.is_staff = True + print(f'[!] 
WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})') + + +@abx.hookimpl +def ready(): + """ + Called at AppConfig.ready() time (settings + models are all loaded) + """ + from django.conf import settings + + if settings.CONFIGS.ldap.LDAP_ENABLED: + # tell django-auth-ldap to call our function when a user is authenticated via LDAP + import django_auth_ldap.backend + django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user) diff --git a/archivebox/plugins_auth/ldap/apps.py b/archivebox/plugins_auth/ldap/binaries.py similarity index 74% rename from archivebox/plugins_auth/ldap/apps.py rename to archivebox/plugins_auth/ldap/binaries.py index 0cb74da1..cc932183 100644 --- a/archivebox/plugins_auth/ldap/apps.py +++ b/archivebox/plugins_auth/ldap/binaries.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.plugins_auth.ldap' +__package__ = 'plugins_auth.ldap' import inspect @@ -9,15 +9,14 @@ from pydantic import InstanceOf from pydantic_pkgr import BinaryOverrides, SemVer -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_hook import BaseHook + from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt -from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES -from .settings import LDAP_CONFIG, get_ldap_lib +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES + +from .config import get_ldap_lib -###################### Config ########################## def get_LDAP_LIB_path(paths=()): LDAP_LIB = get_ldap_lib()[0] @@ -34,10 +33,12 @@ def get_LDAP_LIB_path(paths=()): return lib_path return None + def get_LDAP_LIB_version(): LDAP_LIB = get_ldap_lib()[0] return LDAP_LIB and SemVer(LDAP_LIB.__version__) + class LdapBinary(BaseBinary): name: str = 'ldap' description: str = 'LDAP Authentication' @@ -67,17 +68,3 @@ class LdapBinary(BaseBinary): } LDAP_BINARY = LdapBinary() - - -class LdapAuthPlugin(BasePlugin): - app_label: str = 'ldap' - verbose_name: str = 'LDAP Authentication' - - hooks: List[InstanceOf[BaseHook]] = [ - LDAP_CONFIG, - *([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []), - ] - - -PLUGIN = LdapAuthPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_auth/ldap/settings.py b/archivebox/plugins_auth/ldap/config.py similarity index 99% rename from archivebox/plugins_auth/ldap/settings.py rename to archivebox/plugins_auth/ldap/config.py index 0685e1b5..fb124273 100644 --- a/archivebox/plugins_auth/ldap/settings.py +++ b/archivebox/plugins_auth/ldap/config.py @@ -1,4 +1,4 @@ -__package__ = 'archivebox.plugins_auth.ldap' +__package__ = 'plugins_auth.ldap' import sys diff --git a/archivebox/plugins_extractor/archivedotorg/__init__.py b/archivebox/plugins_extractor/archivedotorg/__init__.py new file mode 100644 index 00000000..a5c24932 --- /dev/null +++ b/archivebox/plugins_extractor/archivedotorg/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_extractor.archivedotorg' +__label__ = 'archivedotorg' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://archive.org' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'archivedotorg': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': 
__homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import ARCHIVEDOTORG_CONFIG + + return { + 'archivedotorg': ARCHIVEDOTORG_CONFIG + } + + +# @abx.hookimpl +# def get_EXTRACTORS(): +# from .extractors import ARCHIVEDOTORG_EXTRACTOR +# +# return { +# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/archivedotorg/apps.py b/archivebox/plugins_extractor/archivedotorg/apps.py deleted file mode 100644 index a06b5108..00000000 --- a/archivebox/plugins_extractor/archivedotorg/apps.py +++ /dev/null @@ -1,28 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.archivedotorg' - -from typing import List - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook - -###################### Config ########################## - - -class ArchivedotorgConfig(BaseConfigSet): - SAVE_ARCHIVE_DOT_ORG: bool = True - - -ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig() - - -class ArchivedotorgPlugin(BasePlugin): - app_label: str = 'archivedotorg' - verbose_name: str = 'Archive.org' - - hooks: List[BaseHook] = [ - ARCHIVEDOTORG_CONFIG - ] - -PLUGIN = ArchivedotorgPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/archivedotorg/config.py b/archivebox/plugins_extractor/archivedotorg/config.py new file mode 100644 index 00000000..bebb6c98 --- /dev/null +++ b/archivebox/plugins_extractor/archivedotorg/config.py @@ -0,0 +1,11 @@ +__package__ = 'plugins_extractor.archivedotorg' + + +from abx.archivebox.base_configset import BaseConfigSet + + +class ArchivedotorgConfig(BaseConfigSet): + SAVE_ARCHIVE_DOT_ORG: bool = True + + +ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig() diff --git a/archivebox/plugins_extractor/chrome/__init__.py b/archivebox/plugins_extractor/chrome/__init__.py index e69de29b..9b254655 100644 --- a/archivebox/plugins_extractor/chrome/__init__.py +++ b/archivebox/plugins_extractor/chrome/__init__.py @@ -0,0 +1,65 @@ +__package__ = 'plugins_extractor.chrome' +__label__ = 'chrome' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'chrome': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import CHROME_CONFIG + + return { + 'chrome': CHROME_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import CHROME_BINARY + + return { + 'chrome': CHROME_BINARY, + } + +# @abx.hookimpl +# def get_EXTRACTORS(): +# return { +# 'pdf': PDF_EXTRACTOR, +# 'screenshot': SCREENSHOT_EXTRACTOR, +# 'dom': DOM_EXTRACTOR, +# } + +# Hooks Available: + +# Events: +# on_crawl_schedule_tick +# on_seed_post_save +# on_crawl_post_save +# on_snapshot_post_save +# on_archiveresult_post_save + + +# create_root_snapshot_from_seed +# create_archiveresults_pending_from_snapshot +# create_crawl_from_crawlschedule_if_due +# create_crawl_copy_from_template +# + + +# create_crawl_from_crawlschedule_if_due diff --git a/archivebox/plugins_extractor/chrome/binaries.py b/archivebox/plugins_extractor/chrome/binaries.py new file mode 100644 index 00000000..d2ece7c5 --- /dev/null +++ b/archivebox/plugins_extractor/chrome/binaries.py @@ -0,0 +1,148 @@ 
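The chrome package follows the same abx hookimpl layout as archivedotorg above: plugin metadata in get_PLUGIN(), settings in get_CONFIG(), installable binaries in get_BINARIES(). As a rough sketch of how such hookimpls could be aggregated into a single registry (illustrative only; abx's real plugin manager is pluggy-based and not shown in this diff, and collect_plugins is a hypothetical helper):

    import importlib

    def collect_plugins(module_names):
        # merge the {plugin_id: metadata} dicts returned by each plugin's get_PLUGIN() hookimpl
        registry = {}
        for name in module_names:
            module = importlib.import_module(name)
            registry.update(module.get_PLUGIN())
        return registry

    # collect_plugins(['plugins_extractor.archivedotorg', 'plugins_extractor.chrome'])
    # -> {'archivedotorg': {'LABEL': 'archivedotorg', ...}, 'chrome': {'LABEL': 'chrome', ...}}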
+__package__ = 'plugins_extractor.chrome' + +import os +import platform +from pathlib import Path +from typing import List, Optional + +from pydantic import InstanceOf +from pydantic_pkgr import ( + BinProvider, + BinName, + BinaryOverrides, + bin_abspath, +) + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +# Depends on Other Plugins: +from archivebox.config import CONSTANTS +from archivebox.config.common import SHELL_CONFIG +from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER +from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER + + +from .config import CHROME_CONFIG +CHROMIUM_BINARY_NAMES_LINUX = [ + "chromium", + "chromium-browser", + "chromium-browser-beta", + "chromium-browser-unstable", + "chromium-browser-canary", + "chromium-browser-dev", +] +CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"] +CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS + +CHROME_BINARY_NAMES_LINUX = [ + "google-chrome", + "google-chrome-stable", + "google-chrome-beta", + "google-chrome-canary", + "google-chrome-unstable", + "google-chrome-dev", + "chrome" +] +CHROME_BINARY_NAMES_MACOS = [ + "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome", + "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary", +] +CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS + +APT_DEPENDENCIES = [ + 'apt-transport-https', 'at-spi2-common', 'chromium-browser', + 'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei', + 'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2', + 'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1', + 'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings', +] + + +def autodetect_system_chrome_install(PATH=None) -> Optional[Path]: + for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES: + abspath = bin_abspath(bin_name, PATH=env.PATH) + if abspath: + return abspath + return None + +def create_macos_app_symlink(target: Path, shortcut: Path): + """ + on macOS, some binaries are inside of .app, so we need to + create a tiny bash script instead of a symlink + (so that ../ parent relationships are relative to original .app instead of callsite dir) + """ + # TODO: should we enforce this? is it useful in any other situation? + # if platform.system().lower() != 'darwin': + # raise Exception(...) 
+ shortcut.unlink(missing_ok=True) + shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""") + shortcut.chmod(0o777) # make sure it's executable by everyone + +###################### Config ########################## + + +class ChromeBinary(BaseBinary): + name: BinName = CHROME_CONFIG.CHROME_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew] + + overrides: BinaryOverrides = { + env.name: { + 'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable + }, + PUPPETEER_BINPROVIDER.name: { + 'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable + }, + PLAYWRIGHT_BINPROVIDER.name: { + 'packages': ['chromium'], # playwright install chromium + }, + apt.name: { + 'packages': APT_DEPENDENCIES, + }, + brew.name: { + 'packages': ['--cask', 'chromium'], + }, + } + + @staticmethod + def symlink_to_lib(binary, bin_dir=None) -> None: + from archivebox.config.common import STORAGE_CONFIG + bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin' + + if not (binary.abspath and os.access(binary.abspath, os.F_OK)): + return + + bin_dir.mkdir(parents=True, exist_ok=True) + symlink = bin_dir / binary.name + + try: + if platform.system().lower() == 'darwin': + # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink + create_macos_app_symlink(binary.abspath, symlink) + else: + # otherwise on linux we can symlink directly to binary executable + symlink.unlink(missing_ok=True) + symlink.symlink_to(binary.abspath) + except Exception as err: + # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}') + # not actually needed, we can just run without it + pass + + @staticmethod + def chrome_cleanup_lockfile(): + """ + Cleans up any state or runtime files that chrome leaves behind when killed by + a timeout or other error + """ + lock_file = Path("~/.config/chromium/SingletonLock").expanduser() + + if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK): + lock_file.unlink() + + if CHROME_CONFIG.CHROME_USER_DATA_DIR: + if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK): + (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink() # remove the lockfile in the configured profile dir, not the default one + + + +CHROME_BINARY = ChromeBinary() + diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/config.py similarity index 59% rename from archivebox/plugins_extractor/chrome/apps.py rename to archivebox/plugins_extractor/chrome/config.py index f9e310c5..be943a94 100644 --- a/archivebox/plugins_extractor/chrome/apps.py +++ b/archivebox/plugins_extractor/chrome/config.py @@ -1,35 +1,18 @@ -__package__ = 'archivebox.plugins_extractor.chrome' +__package__ = 'plugins_extractor.chrome' import os -import sys -import platform + from pathlib import Path from typing import List, Optional -# Depends on other PyPI/vendor packages: -from rich import print -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import ( - BinProvider, - BinName, - BinaryOverrides, - bin_abspath, -) +from pydantic import Field, model_validator +from pydantic_pkgr import bin_abspath -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from 
abx.archivebox.base_binary import env -# Depends on Other Plugins: -from archivebox.config import CONSTANTS from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG -from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER -from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER - +from archivebox.misc.logging import STDERR from archivebox.misc.util import dedupe @@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet): @model_validator(mode='after') def validate_use_chrome(self): if self.USE_CHROME and self.CHROME_TIMEOUT < 15: - print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr) - print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr) - print(file=sys.stderr) + STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]') + STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.') + STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles') + STDERR.print() # if user has specified a user data dir, make sure its valid if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK): # check to make sure user_data_dir/ exists if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir(): - print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr) - print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr) - print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr) - print(' For more info see:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr) + STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]') + STDERR.print(f' {self.CHROME_USER_DATA_DIR}') + STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.') + STDERR.print(' For more info see:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR') if '/Default' in str(self.CHROME_USER_DATA_DIR): - print(file=sys.stderr) - print(' Try removing /Default from the end e.g.:', file=sys.stderr) - print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr) + STDERR.print() + STDERR.print(' Try removing /Default from the end e.g.:') + STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0])) # hard error is too annoying here, instead just set it to nothing # raise SystemExit(2) - self.CHROME_USER_DATA_DIR = None + self.update_in_place(CHROME_USER_DATA_DIR=None) else: - self.CHROME_USER_DATA_DIR = None + if self.CHROME_USER_DATA_DIR is not None: + 
self.update_in_place(CHROME_USER_DATA_DIR=None) return self @@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet): CHROME_CONFIG = ChromeConfig() - -class ChromeBinary(BaseBinary): - name: BinName = CHROME_CONFIG.CHROME_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew] - - overrides: BinaryOverrides = { - env.name: { - 'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable - }, - PUPPETEER_BINPROVIDER.name: { - 'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable - }, - PLAYWRIGHT_BINPROVIDER.name: { - 'packages': ['chromium'], # playwright install chromium - }, - apt.name: { - 'packages': APT_DEPENDENCIES, - }, - brew.name: { - 'packages': ['--cask', 'chromium'], - }, - } - - @staticmethod - def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None: - if not (binary.abspath and os.access(binary.abspath, os.F_OK)): - return - - bin_dir.mkdir(parents=True, exist_ok=True) - symlink = bin_dir / binary.name - - try: - if platform.system().lower() == 'darwin': - # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink - create_macos_app_symlink(binary.abspath, symlink) - else: - # otherwise on linux we can symlink directly to binary executable - symlink.unlink(missing_ok=True) - symlink.symlink_to(binary.abspath) - except Exception as err: - # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}') - # not actually needed, we can just run without it - pass - - @staticmethod - def chrome_cleanup_lockfile(): - """ - Cleans up any state or runtime files that chrome leaves behind when killed by - a timeout or other error - """ - lock_file = Path("~/.config/chromium/SingletonLock").expanduser() - - if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK): - lock_file.unlink() - - if CHROME_CONFIG.CHROME_USER_DATA_DIR: - if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK): - lock_file.unlink() - - - -CHROME_BINARY = ChromeBinary() - - -class ChromePlugin(BasePlugin): - app_label: str = 'chrome' - verbose_name: str = 'Chrome Browser' - - hooks: List[InstanceOf[BaseHook]] = [ - CHROME_CONFIG, - CHROME_BINARY, - ] - - - -PLUGIN = ChromePlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/curl/__init__.py b/archivebox/plugins_extractor/curl/__init__.py new file mode 100644 index 00000000..99af0107 --- /dev/null +++ b/archivebox/plugins_extractor/curl/__init__.py @@ -0,0 +1,38 @@ +__package__ = 'plugins_extractor.curl' +__label__ = 'curl' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/curl/curl' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'curl': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import CURL_CONFIG + + return { + 'curl': CURL_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import CURL_BINARY + + return { + 'curl': CURL_BINARY, + } diff --git a/archivebox/plugins_extractor/curl/apps.py b/archivebox/plugins_extractor/curl/apps.py deleted file mode 100644 index c496611b..00000000 --- a/archivebox/plugins_extractor/curl/apps.py +++ /dev/null @@ -1,79 +0,0 @@ -__package__ = 'plugins_extractor.curl' - 
-from typing import List, Optional -from pathlib import Path - -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG -from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG -from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG - -class CurlConfig(BaseConfigSet): - - SAVE_TITLE: bool = Field(default=True) - SAVE_HEADERS: bool = Field(default=True) - USE_CURL: bool = Field(default=lambda c: - ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG - or FAVICON_CONFIG.SAVE_FAVICON - or c.SAVE_HEADERS - or c.SAVE_TITLE - ) - - CURL_BINARY: str = Field(default='curl') - CURL_ARGS: List[str] = [ - '--silent', - '--location', - '--compressed', - ] - CURL_EXTRA_ARGS: List[str] = [] - - CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - -CURL_CONFIG = CurlConfig() - - -class CurlBinary(BaseBinary): - name: BinName = CURL_CONFIG.CURL_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -CURL_BINARY = CurlBinary() - - -# class CurlExtractor(BaseExtractor): -# name: ExtractorName = 'curl' -# binary: str = CURL_BINARY.name - -# def get_output_path(self, snapshot) -> Path | None: -# curl_index_path = curl_output_path(snapshot.as_link()) -# if curl_index_path: -# return Path(curl_index_path) -# return None - -# CURL_EXTRACTOR = CurlExtractor() - - - -class CurlPlugin(BasePlugin): - app_label: str = 'curl' - verbose_name: str = 'CURL' - - hooks: List[InstanceOf[BaseHook]] = [ - CURL_CONFIG, - CURL_BINARY, - # CURL_EXTRACTOR, - ] - - -PLUGIN = CurlPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/curl/binaries.py b/archivebox/plugins_extractor/curl/binaries.py new file mode 100644 index 00000000..41ff9616 --- /dev/null +++ b/archivebox/plugins_extractor/curl/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.curl' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +from .config import CURL_CONFIG + + +class CurlBinary(BaseBinary): + name: BinName = CURL_CONFIG.CURL_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +CURL_BINARY = CurlBinary() diff --git a/archivebox/plugins_extractor/curl/config.py b/archivebox/plugins_extractor/curl/config.py new file mode 100644 index 00000000..14996f66 --- /dev/null +++ b/archivebox/plugins_extractor/curl/config.py @@ -0,0 +1,33 @@ +__package__ = 'plugins_extractor.curl' + +from typing import List, Optional +from pathlib import Path + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class CurlConfig(BaseConfigSet): + + SAVE_TITLE: bool = Field(default=True) + SAVE_HEADERS: bool = Field(default=True) + USE_CURL: bool = Field(default=True) + + CURL_BINARY: str = Field(default='curl') + 
CURL_ARGS: List[str] = [ + '--silent', + '--location', + '--compressed', + ] + CURL_EXTRA_ARGS: List[str] = [] + + CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + +CURL_CONFIG = CurlConfig() diff --git a/archivebox/plugins_extractor/favicon/__init__.py b/archivebox/plugins_extractor/favicon/__init__.py new file mode 100644 index 00000000..3fa84560 --- /dev/null +++ b/archivebox/plugins_extractor/favicon/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_extractor.favicon' +__label__ = 'favicon' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/archivebox' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'favicon': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import FAVICON_CONFIG + + return { + 'favicon': FAVICON_CONFIG + } + + +# @abx.hookimpl +# def get_EXTRACTORS(): +# from .extractors import FAVICON_EXTRACTOR + +# return { +# 'favicon': FAVICON_EXTRACTOR, +# } diff --git a/archivebox/plugins_extractor/favicon/apps.py b/archivebox/plugins_extractor/favicon/apps.py deleted file mode 100644 index bfaae21e..00000000 --- a/archivebox/plugins_extractor/favicon/apps.py +++ /dev/null @@ -1,30 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.favicon' - -from typing import List - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook - -###################### Config ########################## - - -class FaviconConfig(BaseConfigSet): - SAVE_FAVICON: bool = True - - FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}' - - -FAVICON_CONFIG = FaviconConfig() - - -class FaviconPlugin(BasePlugin): - app_label: str = 'favicon' - verbose_name: str = 'Favicon' - - hooks: List[BaseHook] = [ - FAVICON_CONFIG - ] - -PLUGIN = FaviconPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/favicon/config.py b/archivebox/plugins_extractor/favicon/config.py new file mode 100644 index 00000000..6073ef87 --- /dev/null +++ b/archivebox/plugins_extractor/favicon/config.py @@ -0,0 +1,13 @@ +__package__ = 'plugins_extractor.favicon' + + +from abx.archivebox.base_configset import BaseConfigSet + + +class FaviconConfig(BaseConfigSet): + SAVE_FAVICON: bool = True + + FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}' + + +FAVICON_CONFIG = FaviconConfig() diff --git a/archivebox/plugins_extractor/git/__init__.py b/archivebox/plugins_extractor/git/__init__.py new file mode 100644 index 00000000..db18919f --- /dev/null +++ b/archivebox/plugins_extractor/git/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.git' +__label__ = 'git' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/git/git' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'git': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + 
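The GIT_DOMAINS and GIT_ARGS settings consumed by these hooks are defined in ./config.py further down in this diff. A minimal sketch of how a comma-separated GIT_DOMAINS value is typically matched against a candidate URL (should_save_git is a hypothetical helper, not part of this changeset):

    from urllib.parse import urlparse

    def should_save_git(url: str, git_domains: str) -> bool:
        # treat GIT_DOMAINS as a comma-separated allowlist of hosts worth cloning
        domains = {d.strip() for d in git_domains.split(',') if d.strip()}
        return urlparse(url).netloc in domains or url.endswith('.git')

    # should_save_git('https://github.com/git/git', 'github.com,gitlab.com')  -> True
    # should_save_git('https://example.com/page', 'github.com,gitlab.com')    -> False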
+@abx.hookimpl +def get_CONFIG(): + from .config import GIT_CONFIG + + return { + 'git': GIT_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import GIT_BINARY + + return { + 'git': GIT_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import GIT_EXTRACTOR + + return { + 'git': GIT_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/git/apps.py b/archivebox/plugins_extractor/git/apps.py deleted file mode 100644 index ebdc9e9f..00000000 --- a/archivebox/plugins_extractor/git/apps.py +++ /dev/null @@ -1,66 +0,0 @@ -__package__ = 'plugins_extractor.git' - -from typing import List -from pathlib import Path - -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG - - -class GitConfig(BaseConfigSet): - - SAVE_GIT: bool = True - - GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') - - GIT_BINARY: str = Field(default='git') - GIT_ARGS: List[str] = [ - '--recursive', - ] - GIT_EXTRA_ARGS: List[str] = [] - - GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - - -GIT_CONFIG = GitConfig() - - -class GitBinary(BaseBinary): - name: BinName = GIT_CONFIG.GIT_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -GIT_BINARY = GitBinary() - - -class GitExtractor(BaseExtractor): - name: ExtractorName = 'git' - binary: str = GIT_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - return snapshot.as_link() / 'git' - -GIT_EXTRACTOR = GitExtractor() - - - -class GitPlugin(BasePlugin): - app_label: str = 'git' - verbose_name: str = 'GIT' - - hooks: List[InstanceOf[BaseHook]] = [ - GIT_CONFIG, - GIT_BINARY, - GIT_EXTRACTOR, - ] - - -PLUGIN = GitPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/git/binaries.py b/archivebox/plugins_extractor/git/binaries.py new file mode 100644 index 00000000..8d990769 --- /dev/null +++ b/archivebox/plugins_extractor/git/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.git' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from .config import GIT_CONFIG + + + +class GitBinary(BaseBinary): + name: BinName = GIT_CONFIG.GIT_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +GIT_BINARY = GitBinary() diff --git a/archivebox/plugins_extractor/git/config.py b/archivebox/plugins_extractor/git/config.py new file mode 100644 index 00000000..3d890d62 --- /dev/null +++ b/archivebox/plugins_extractor/git/config.py @@ -0,0 +1,28 @@ +__package__ = 'plugins_extractor.git' + +from typing import List + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class GitConfig(BaseConfigSet): + + SAVE_GIT: bool = True + + GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht') + + GIT_BINARY: str = Field(default='git') + GIT_ARGS: 
List[str] = [ + '--recursive', + ] + GIT_EXTRA_ARGS: List[str] = [] + + GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + + +GIT_CONFIG = GitConfig() diff --git a/archivebox/plugins_extractor/git/extractors.py b/archivebox/plugins_extractor/git/extractors.py new file mode 100644 index 00000000..350f1b82 --- /dev/null +++ b/archivebox/plugins_extractor/git/extractors.py @@ -0,0 +1,17 @@ +__package__ = 'plugins_extractor.git' + +from pathlib import Path + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import GIT_BINARY + + +class GitExtractor(BaseExtractor): + name: ExtractorName = 'git' + binary: str = GIT_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + return snapshot.as_link() / 'git' + +GIT_EXTRACTOR = GitExtractor() diff --git a/archivebox/plugins_extractor/mercury/__init__.py b/archivebox/plugins_extractor/mercury/__init__.py new file mode 100644 index 00000000..10aca671 --- /dev/null +++ b/archivebox/plugins_extractor/mercury/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.mercury' +__label__ = 'mercury' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/postlight/mercury-parser' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'mercury': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import MERCURY_CONFIG + + return { + 'mercury': MERCURY_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import MERCURY_BINARY + + return { + 'mercury': MERCURY_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import MERCURY_EXTRACTOR + + return { + 'mercury': MERCURY_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/mercury/apps.py b/archivebox/plugins_extractor/mercury/apps.py deleted file mode 100644 index 926bbdca..00000000 --- a/archivebox/plugins_extractor/mercury/apps.py +++ /dev/null @@ -1,80 +0,0 @@ -__package__ = 'plugins_extractor.mercury' - -from typing import List, Optional -from pathlib import Path - -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG -from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -class MercuryConfig(BaseConfigSet): - - SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY') - - MERCURY_BINARY: str = Field(default='postlight-parser') - MERCURY_EXTRA_ARGS: List[str] = [] - - SAVE_MERCURY_REQUISITES: bool = Field(default=True) - MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) - - MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - 
- -MERCURY_CONFIG = MercuryConfig() - - -class MercuryBinary(BaseBinary): - name: BinName = MERCURY_CONFIG.MERCURY_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: { - 'packages': ['@postlight/parser@^2.2.3'], - }, - SYS_NPM_BINPROVIDER.name: { - 'packages': ['@postlight/parser@^2.2.3'], - 'install': lambda: None, # never try to install things into global prefix - }, - env.name: { - 'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None, - }, - } - -MERCURY_BINARY = MercuryBinary() - - -class MercuryExtractor(BaseExtractor): - name: ExtractorName = 'mercury' - binary: str = MERCURY_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - return snapshot.link_dir / 'mercury' / 'content.html' - -MERCURY_EXTRACTOR = MercuryExtractor() - - - -class MercuryPlugin(BasePlugin): - app_label: str = 'mercury' - verbose_name: str = 'MERCURY' - - hooks: List[InstanceOf[BaseHook]] = [ - MERCURY_CONFIG, - MERCURY_BINARY, - MERCURY_EXTRACTOR, - ] - - -PLUGIN = MercuryPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/mercury/binaries.py b/archivebox/plugins_extractor/mercury/binaries.py new file mode 100644 index 00000000..b07055fd --- /dev/null +++ b/archivebox/plugins_extractor/mercury/binaries.py @@ -0,0 +1,32 @@ +__package__ = 'plugins_extractor.mercury' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath + +from abx.archivebox.base_binary import BaseBinary, env + +from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config import MERCURY_CONFIG + + +class MercuryBinary(BaseBinary): + name: BinName = MERCURY_CONFIG.MERCURY_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: { + 'packages': ['@postlight/parser@^2.2.3'], + }, + SYS_NPM_BINPROVIDER.name: { + 'packages': ['@postlight/parser@^2.2.3'], + 'install': lambda: None, # never try to install things into global prefix + }, + env.name: { + 'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None, + }, + } + +MERCURY_BINARY = MercuryBinary() diff --git a/archivebox/plugins_extractor/mercury/config.py b/archivebox/plugins_extractor/mercury/config.py new file mode 100644 index 00000000..49c92b73 --- /dev/null +++ b/archivebox/plugins_extractor/mercury/config.py @@ -0,0 +1,31 @@ +__package__ = 'plugins_extractor.mercury' + +from typing import List, Optional +from pathlib import Path + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG + + + +class MercuryConfig(BaseConfigSet): + + SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY') + + MERCURY_BINARY: str = Field(default='postlight-parser') + MERCURY_EXTRA_ARGS: List[str] = [] + + SAVE_MERCURY_REQUISITES: bool = Field(default=True) + MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) + + MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + 
MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + + +MERCURY_CONFIG = MercuryConfig() diff --git a/archivebox/plugins_extractor/mercury/extractors.py b/archivebox/plugins_extractor/mercury/extractors.py new file mode 100644 index 00000000..5d91b0e0 --- /dev/null +++ b/archivebox/plugins_extractor/mercury/extractors.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.mercury' + +from pathlib import Path + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import MERCURY_BINARY + + + +class MercuryExtractor(BaseExtractor): + name: ExtractorName = 'mercury' + binary: str = MERCURY_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + return snapshot.link_dir / 'mercury' / 'content.html' + + +MERCURY_EXTRACTOR = MercuryExtractor() diff --git a/archivebox/plugins_extractor/readability/__init__.py b/archivebox/plugins_extractor/readability/__init__.py new file mode 100644 index 00000000..2ef1a1a8 --- /dev/null +++ b/archivebox/plugins_extractor/readability/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_extractor.readability' +__label__ = 'readability' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/readability-extractor' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'readability': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import READABILITY_CONFIG + + return { + 'readability': READABILITY_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import READABILITY_BINARY + + return { + 'readability': READABILITY_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import READABILITY_EXTRACTOR + + return { + 'readability': READABILITY_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/readability/apps.py b/archivebox/plugins_extractor/readability/apps.py deleted file mode 100644 index bf215c5f..00000000 --- a/archivebox/plugins_extractor/readability/apps.py +++ /dev/null @@ -1,86 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.readability' - -from pathlib import Path -from typing import List -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor -from abx.archivebox.base_hook import BaseHook - -# Depends on Other Plugins: -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -###################### Config ########################## - -class ReadabilityConfig(BaseConfigSet): - SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') - - READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - - READABILITY_BINARY: str = Field(default='readability-extractor') - # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args - - -READABILITY_CONFIG = ReadabilityConfig() - - -READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' - -class 
ReadabilityBinary(BaseBinary): - name: BinName = READABILITY_CONFIG.READABILITY_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]}, - SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages - } - - - - -READABILITY_BINARY = ReadabilityBinary() - - -class ReadabilityExtractor(BaseExtractor): - name: str = 'readability' - binary: BinName = READABILITY_BINARY.name - - def get_output_path(self, snapshot) -> Path: - return Path(snapshot.link_dir) / 'readability' / 'content.html' - - -READABILITY_BINARY = ReadabilityBinary() -READABILITY_EXTRACTOR = ReadabilityExtractor() - -# class ReadabilityQueue(BaseQueue): -# name: str = 'singlefile' - -# binaries: List[InstanceOf[BaseBinary]] = [READABILITY_BINARY] - -# READABILITY_QUEUE = ReadabilityQueue() - -class ReadabilityPlugin(BasePlugin): - app_label: str ='readability' - verbose_name: str = 'Readability' - - hooks: List[InstanceOf[BaseHook]] = [ - READABILITY_CONFIG, - READABILITY_BINARY, - READABILITY_EXTRACTOR, - # READABILITY_QUEUE, - ] - - - -PLUGIN = ReadabilityPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/readability/binaries.py b/archivebox/plugins_extractor/readability/binaries.py new file mode 100644 index 00000000..43343924 --- /dev/null +++ b/archivebox/plugins_extractor/readability/binaries.py @@ -0,0 +1,27 @@ +__package__ = 'plugins_extractor.readability' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config import READABILITY_CONFIG + + +READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor' + +class ReadabilityBinary(BaseBinary): + name: BinName = READABILITY_CONFIG.READABILITY_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]}, + SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None}, # prevent modifying system global npm packages + } + + +READABILITY_BINARY = ReadabilityBinary() diff --git a/archivebox/plugins_extractor/readability/config.py b/archivebox/plugins_extractor/readability/config.py new file mode 100644 index 00000000..8066d56c --- /dev/null +++ b/archivebox/plugins_extractor/readability/config.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.readability' + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class ReadabilityConfig(BaseConfigSet): + SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY') + + READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + + READABILITY_BINARY: str = Field(default='readability-extractor') + # READABILITY_EXTRA_ARGS: List[str] = [] # readability-extractor doesn't take any extra args + + +READABILITY_CONFIG = ReadabilityConfig() diff --git a/archivebox/plugins_extractor/readability/extractors.py b/archivebox/plugins_extractor/readability/extractors.py new file mode 100644 index 00000000..eb8ea165 
--- /dev/null +++ b/archivebox/plugins_extractor/readability/extractors.py @@ -0,0 +1,20 @@ +__package__ = 'plugins_extractor.readability' + +from pathlib import Path + +from pydantic_pkgr import BinName + +from abx.archivebox.base_extractor import BaseExtractor + +from .binaries import READABILITY_BINARY + + +class ReadabilityExtractor(BaseExtractor): + name: str = 'readability' + binary: BinName = READABILITY_BINARY.name + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'readability' / 'content.html' + + +READABILITY_EXTRACTOR = ReadabilityExtractor() diff --git a/archivebox/plugins_extractor/singlefile/__init__.py b/archivebox/plugins_extractor/singlefile/__init__.py index e69de29b..cd72adb8 100644 --- a/archivebox/plugins_extractor/singlefile/__init__.py +++ b/archivebox/plugins_extractor/singlefile/__init__.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_extractor.singlefile' +__label__ = 'singlefile' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/gildas-lormeau/singlefile' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'singlefile': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SINGLEFILE_CONFIG + + return { + 'singlefile': SINGLEFILE_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import SINGLEFILE_BINARY + + return { + 'singlefile': SINGLEFILE_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import SINGLEFILE_EXTRACTOR + + return { + 'singlefile': SINGLEFILE_EXTRACTOR, + } + +# @abx.hookimpl +# def get_INSTALLED_APPS(): +# # needed to load ./models.py +# return [__package__] diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py deleted file mode 100644 index a160f9bd..00000000 --- a/archivebox/plugins_extractor/singlefile/apps.py +++ /dev/null @@ -1,110 +0,0 @@ -__package__ = 'archivebox.plugins_extractor.singlefile' - -from pathlib import Path -from typing import List, Optional -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env -from abx.archivebox.base_extractor import BaseExtractor -from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook - -# Depends on Other Plugins: -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER - -###################### Config ########################## - -class SinglefileConfig(BaseConfigSet): - SAVE_SINGLEFILE: bool = True - - SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - SINGLEFILE_BINARY: str = Field(default='single-file') - SINGLEFILE_EXTRA_ARGS: List[str] = [] - - -SINGLEFILE_CONFIG = 
SinglefileConfig() - - -SINGLEFILE_MIN_VERSION = '1.1.54' -SINGLEFILE_MAX_VERSION = '1.1.60' - - -class SinglefileBinary(BaseBinary): - name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - overrides: BinaryOverrides = { - LIB_NPM_BINPROVIDER.name: { - "abspath": lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH), - "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], - }, - SYS_NPM_BINPROVIDER.name: { - "abspath": lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH) - or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH), - "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], - "install": lambda: None, - }, - env.name: { - 'abspath': lambda: - bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH) - or bin_abspath('single-file', PATH=env.PATH) - or bin_abspath('single-file-node.js', PATH=env.PATH), - }, - } - - -SINGLEFILE_BINARY = SinglefileBinary() - -PLUGIN_BINARIES = [SINGLEFILE_BINARY] - -class SinglefileExtractor(BaseExtractor): - name: str = 'singlefile' - binary: BinName = SINGLEFILE_BINARY.name - - def get_output_path(self, snapshot) -> Path: - return Path(snapshot.link_dir) / 'singlefile.html' - - -SINGLEFILE_BINARY = SinglefileBinary() -SINGLEFILE_EXTRACTOR = SinglefileExtractor() - -class SinglefileQueue(BaseQueue): - name: str = 'singlefile' - - binaries: List[InstanceOf[BaseBinary]] = [SINGLEFILE_BINARY] - -SINGLEFILE_QUEUE = SinglefileQueue() - -class SinglefilePlugin(BasePlugin): - app_label: str ='singlefile' - verbose_name: str = 'SingleFile' - - hooks: List[InstanceOf[BaseHook]] = [ - SINGLEFILE_CONFIG, - SINGLEFILE_BINARY, - SINGLEFILE_EXTRACTOR, - SINGLEFILE_QUEUE, - ] - - - -PLUGIN = SinglefilePlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/singlefile/binaries.py b/archivebox/plugins_extractor/singlefile/binaries.py new file mode 100644 index 00000000..0c8a1bab --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/binaries.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_extractor.singlefile' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER + +from .config import SINGLEFILE_CONFIG + + +SINGLEFILE_MIN_VERSION = '1.1.54' +SINGLEFILE_MAX_VERSION = '1.1.60' + + +class SinglefileBinary(BaseBinary): + name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + overrides: BinaryOverrides = { + LIB_NPM_BINPROVIDER.name: { + "abspath": lambda: + bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH), + "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], + }, + SYS_NPM_BINPROVIDER.name: { + "abspath": lambda: + 
bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH) + or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH), + "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"], + "install": lambda: None, + }, + env.name: { + 'abspath': lambda: + bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH) + or bin_abspath('single-file', PATH=env.PATH) + or bin_abspath('single-file-node.js', PATH=env.PATH), + }, + } + + +SINGLEFILE_BINARY = SinglefileBinary() diff --git a/archivebox/plugins_extractor/singlefile/config.py b/archivebox/plugins_extractor/singlefile/config.py new file mode 100644 index 00000000..7d27031e --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/config.py @@ -0,0 +1,25 @@ +__package__ = 'plugins_extractor.singlefile' + +from pathlib import Path +from typing import List, Optional + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG + + +class SinglefileConfig(BaseConfigSet): + SAVE_SINGLEFILE: bool = True + + SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + SINGLEFILE_BINARY: str = Field(default='single-file') + SINGLEFILE_EXTRA_ARGS: List[str] = [] + + +SINGLEFILE_CONFIG = SinglefileConfig() diff --git a/archivebox/plugins_extractor/singlefile/extractors.py b/archivebox/plugins_extractor/singlefile/extractors.py new file mode 100644 index 00000000..fedbe801 --- /dev/null +++ b/archivebox/plugins_extractor/singlefile/extractors.py @@ -0,0 +1,19 @@ +__package__ = 'plugins_extractor.singlefile' + +from pathlib import Path + +from pydantic_pkgr import BinName +from abx.archivebox.base_extractor import BaseExtractor + +from .binaries import SINGLEFILE_BINARY + + +class SinglefileExtractor(BaseExtractor): + name: str = 'singlefile' + binary: BinName = SINGLEFILE_BINARY.name + + def get_output_path(self, snapshot) -> Path: + return Path(snapshot.link_dir) / 'singlefile.html' + + +SINGLEFILE_EXTRACTOR = SinglefileExtractor() diff --git a/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py b/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py deleted file mode 100644 index 74ef955c..00000000 --- a/archivebox/plugins_extractor/singlefile/migrations/0001_initial.py +++ /dev/null @@ -1,26 +0,0 @@ -# Generated by Django 5.1.1 on 2024-09-10 05:05 - -from django.db import migrations - - -class Migration(migrations.Migration): - - initial = True - - dependencies = [ - ('core', '0074_alter_snapshot_downloaded_at'), - ] - - operations = [ - migrations.CreateModel( - name='SinglefileResult', - fields=[ - ], - options={ - 'proxy': True, - 'indexes': [], - 'constraints': [], - }, - bases=('core.archiveresult',), - ), - ] diff --git a/archivebox/plugins_extractor/singlefile/tasks.py b/archivebox/plugins_extractor/singlefile/tasks.py deleted file mode 100644 index 8ab2bd95..00000000 --- a/archivebox/plugins_extractor/singlefile/tasks.py +++ /dev/null @@ -1,40 +0,0 @@ -__package__ = 'archivebox.queues' - -import time - -from django.core.cache import cache - -from huey import crontab -from django_huey import db_task, on_startup, 
db_periodic_task -from huey_monitor.models import TaskModel -from huey_monitor.tqdm import ProcessInfo - -@db_task(queue="singlefile", context=True) -def extract(url, out_dir, config, task=None, parent_task_id=None): - if task and parent_task_id: - TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) - - process_info = ProcessInfo(task, desc="extract_singlefile", parent_task_id=parent_task_id, total=1) - - time.sleep(5) - - process_info.update(n=1) - return {'output': 'singlefile.html', 'status': 'succeeded'} - - -# @on_startup(queue='singlefile') -# def start_singlefile_queue(): -# print("[+] Starting singlefile worker...") -# update_version.call_local() - - -# @db_periodic_task(crontab(minute='*/5'), queue='singlefile') -# def update_version(): -# print('[*] Updating singlefile version... 5 minute interval') -# from django.conf import settings - -# bin = settings.BINARIES.SinglefileBinary.load() -# if bin.version: -# cache.set(f"bin:abspath:{bin.name}", bin.abspath) -# cache.set(f"bin:version:{bin.name}:{bin.abspath}", bin.version) -# print('[√] Updated singlefile version:', bin.version, bin.abspath) diff --git a/archivebox/plugins_extractor/wget/__init__.py b/archivebox/plugins_extractor/wget/__init__.py new file mode 100644 index 00000000..506ad7bf --- /dev/null +++ b/archivebox/plugins_extractor/wget/__init__.py @@ -0,0 +1,47 @@ +__package__ = 'plugins_extractor.wget' +__label__ = 'wget' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/wget' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'wget': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import WGET_CONFIG + + return { + 'wget': WGET_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import WGET_BINARY + + return { + 'wget': WGET_BINARY, + } + +@abx.hookimpl +def get_EXTRACTORS(): + from .extractors import WGET_EXTRACTOR, WARC_EXTRACTOR + + return { + 'wget': WGET_EXTRACTOR, + 'warc': WARC_EXTRACTOR, + } diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py deleted file mode 100644 index 1e54376b..00000000 --- a/archivebox/plugins_extractor/wget/apps.py +++ /dev/null @@ -1,127 +0,0 @@ -__package__ = 'plugins_extractor.wget' - -import sys -from typing import List, Optional -from pathlib import Path -from subprocess import run, DEVNULL - -from rich import print -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import BinProvider, BinName - -from abx.archivebox.base_plugin import BasePlugin, BaseHook -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_extractor import BaseExtractor, ExtractorName - -from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG -from .wget_util import wget_output_path - - -class WgetConfig(BaseConfigSet): - - SAVE_WGET: bool = True - SAVE_WARC: bool = True - - USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC) - - WGET_BINARY: str = Field(default='wget') - WGET_ARGS: List[str] = [ - '--no-verbose', - '--adjust-extension', - '--convert-links', - '--force-directories', - '--backup-converted', - '--span-hosts', - '--no-parent', - '-e', 'robots=off', - ] 
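# Illustrative sketch (not part of the patch itself): WGET_ARGS carries the
# defaults above and WGET_EXTRA_ARGS lets users append their own flags. A
# hypothetical assembly of the final argv, only to show how the two lists are
# meant to combine (the real command is built by the wget extractor elsewhere):

def build_wget_cmd(binary, default_args, extra_args, timeout, url):
    return [
        binary,
        *default_args,            # '--no-verbose', '--adjust-extension', ...
        *extra_args,              # user-supplied additions from WGET_EXTRA_ARGS
        f'--timeout={timeout}',   # wget's standard timeout flag
        url,
    ]

# e.g. build_wget_cmd('wget', ['--no-verbose'], [], 60, 'https://example.com')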
- WGET_EXTRA_ARGS: List[str] = [] - - SAVE_WGET_REQUISITES: bool = Field(default=True) - WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) - - WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) - WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) - WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) - - @model_validator(mode='after') - def validate_use_ytdlp(self): - if self.USE_WGET and self.WGET_TIMEOUT < 10: - print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' wget will fail to archive any sites if set to less than ~20 seconds.', file=sys.stderr) - print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr) - print(file=sys.stderr) - return self - - @property - def WGET_AUTO_COMPRESSION(self) -> bool: - if hasattr(self, '_WGET_AUTO_COMPRESSION'): - return self._WGET_AUTO_COMPRESSION - try: - cmd = [ - self.WGET_BINARY, - "--compression=auto", - "--help", - ] - self._WGET_AUTO_COMPRESSION = not run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=3).returncode - return self._WGET_AUTO_COMPRESSION - except (FileNotFoundError, OSError): - self._WGET_AUTO_COMPRESSION = False - return False - -WGET_CONFIG = WgetConfig() - - -class WgetBinary(BaseBinary): - name: BinName = WGET_CONFIG.WGET_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - -WGET_BINARY = WgetBinary() - - -class WgetExtractor(BaseExtractor): - name: ExtractorName = 'wget' - binary: BinName = WGET_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - wget_index_path = wget_output_path(snapshot.as_link()) - if wget_index_path: - return Path(wget_index_path) - return None - -WGET_EXTRACTOR = WgetExtractor() - - -class WarcExtractor(BaseExtractor): - name: ExtractorName = 'warc' - binary: BinName = WGET_BINARY.name - - def get_output_path(self, snapshot) -> Path | None: - warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz')) - if warc_files: - return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0] - return None - - -WARC_EXTRACTOR = WarcExtractor() - - -class WgetPlugin(BasePlugin): - app_label: str = 'wget' - verbose_name: str = 'WGET' - - hooks: List[InstanceOf[BaseHook]] = [ - WGET_CONFIG, - WGET_BINARY, - WGET_EXTRACTOR, - WARC_EXTRACTOR, - ] - - -PLUGIN = WgetPlugin() -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/wget/binaries.py b/archivebox/plugins_extractor/wget/binaries.py new file mode 100644 index 00000000..6198beac --- /dev/null +++ b/archivebox/plugins_extractor/wget/binaries.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_extractor.wget' + +from typing import List + + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from .config import WGET_CONFIG + + +class WgetBinary(BaseBinary): + name: BinName = WGET_CONFIG.WGET_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + +WGET_BINARY = WgetBinary() diff --git 
a/archivebox/plugins_extractor/wget/config.py b/archivebox/plugins_extractor/wget/config.py new file mode 100644 index 00000000..2cc99668 --- /dev/null +++ b/archivebox/plugins_extractor/wget/config.py @@ -0,0 +1,72 @@ +__package__ = 'plugins_extractor.wget' + +import subprocess +from typing import List, Optional +from pathlib import Path + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG +from archivebox.misc.logging import STDERR + + +class WgetConfig(BaseConfigSet): + + SAVE_WGET: bool = True + SAVE_WARC: bool = True + + USE_WGET: bool = Field(default=lambda c: c.SAVE_WGET or c.SAVE_WARC) + + WGET_BINARY: str = Field(default='wget') + WGET_ARGS: List[str] = [ + '--no-verbose', + '--adjust-extension', + '--convert-links', + '--force-directories', + '--backup-converted', + '--span-hosts', + '--no-parent', + '-e', 'robots=off', + ] + WGET_EXTRA_ARGS: List[str] = [] + + SAVE_WGET_REQUISITES: bool = Field(default=True) + WGET_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES) + + WGET_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT) + WGET_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + WGET_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT) + WGET_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE) + + @model_validator(mode='after') + def validate_use_ytdlp(self): + if self.USE_WGET and self.WGET_TIMEOUT < 10: + STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.WGET_TIMEOUT} seconds)[/red]') + STDERR.print(' wget will fail to archive any sites if set to less than ~20 seconds.') + STDERR.print(' (Setting it somewhere over 60 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') + STDERR.print() + return self + + @property + def WGET_AUTO_COMPRESSION(self) -> bool: + if hasattr(self, '_WGET_AUTO_COMPRESSION'): + return self._WGET_AUTO_COMPRESSION + try: + cmd = [ + self.WGET_BINARY, + "--compression=auto", + "--help", + ] + self._WGET_AUTO_COMPRESSION = not subprocess.run(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=3).returncode + return self._WGET_AUTO_COMPRESSION + except (FileNotFoundError, OSError): + self._WGET_AUTO_COMPRESSION = False + return False + +WGET_CONFIG = WgetConfig() + diff --git a/archivebox/plugins_extractor/wget/extractors.py b/archivebox/plugins_extractor/wget/extractors.py new file mode 100644 index 00000000..86fa3923 --- /dev/null +++ b/archivebox/plugins_extractor/wget/extractors.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.wget' + +from pathlib import Path + +from pydantic_pkgr import BinName + +from abx.archivebox.base_extractor import BaseExtractor, ExtractorName + +from .binaries import WGET_BINARY +from .wget_util import wget_output_path + +class WgetExtractor(BaseExtractor): + name: ExtractorName = 'wget' + binary: BinName = WGET_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + wget_index_path = wget_output_path(snapshot.as_link()) + if wget_index_path: + return Path(wget_index_path) + return None + +WGET_EXTRACTOR = WgetExtractor() + + +class WarcExtractor(BaseExtractor): + name: ExtractorName = 'warc' + binary: BinName = 
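# Illustrative sketch (not part of the patch itself): WGET_AUTO_COMPRESSION
# above feature-detects the --compression=auto flag by running
# `wget --compression=auto --help` and checking the exit code. The same probe
# pattern extracted into a standalone helper:

import subprocess

def binary_supports_flag(binary: str, flag: str, timeout: int = 3) -> bool:
    """Return True if `<binary> <flag> --help` exits 0, False otherwise."""
    try:
        proc = subprocess.run(
            [binary, flag, '--help'],
            stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=timeout,
        )
        return proc.returncode == 0
    except (FileNotFoundError, OSError, subprocess.TimeoutExpired):
        return False

# e.g. binary_supports_flag('wget', '--compression=auto')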
WGET_BINARY.name + + def get_output_path(self, snapshot) -> Path | None: + warc_files = list((Path(snapshot.link_dir) / 'warc').glob('*.warc.gz')) + if warc_files: + return sorted(warc_files, key=lambda x: x.stat().st_size, reverse=True)[0] + return None + + +WARC_EXTRACTOR = WarcExtractor() + diff --git a/archivebox/plugins_extractor/ytdlp/__init__.py b/archivebox/plugins_extractor/ytdlp/__init__.py index e69de29b..26157c24 100644 --- a/archivebox/plugins_extractor/ytdlp/__init__.py +++ b/archivebox/plugins_extractor/ytdlp/__init__.py @@ -0,0 +1,37 @@ +__package__ = 'plugins_extractor.ytdlp' +__label__ = 'YT-DLP' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/yt-dlp/yt-dlp' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'ytdlp': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import YTDLP_CONFIG + + return { + 'ytdlp': YTDLP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import YTDLP_BINARY, FFMPEG_BINARY + + return { + 'ytdlp': YTDLP_BINARY, + 'ffmpeg': FFMPEG_BINARY, + } diff --git a/archivebox/plugins_extractor/ytdlp/apps.py b/archivebox/plugins_extractor/ytdlp/apps.py deleted file mode 100644 index 742c742b..00000000 --- a/archivebox/plugins_extractor/ytdlp/apps.py +++ /dev/null @@ -1,98 +0,0 @@ -import sys -from typing import List -from subprocess import run, PIPE - -from rich import print -from pydantic import InstanceOf, Field, model_validator, AliasChoices -from pydantic_pkgr import BinProvider, BinName, BinaryOverrides - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_hook import BaseHook - -from archivebox.config.common import ARCHIVING_CONFIG -from plugins_pkg.pip.apps import pip - -###################### Config ########################## - - -class YtdlpConfig(BaseConfigSet): - USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA')) - - YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY') - YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS') - - YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) - YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT) - - @model_validator(mode='after') - def validate_use_ytdlp(self): - if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20: - print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! 
(currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]', file=sys.stderr) - print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.', file=sys.stderr) - print(' (Setting it somewhere over 60 seconds is recommended)', file=sys.stderr) - print(file=sys.stderr) - print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media', file=sys.stderr) - print(file=sys.stderr) - return self - - -YTDLP_CONFIG = YtdlpConfig() - - - -class YtdlpBinary(BaseBinary): - name: BinName = YTDLP_CONFIG.YTDLP_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env] - -YTDLP_BINARY = YtdlpBinary() - - -class FfmpegBinary(BaseBinary): - name: BinName = 'ffmpeg' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - 'env': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH), - 'version': lambda: run(['ffmpeg', '-version'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - 'apt': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH), - 'version': lambda: run(['apt', 'show', 'ffmpeg'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - 'brew': { - # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH), - 'version': lambda: run(['brew', 'info', 'ffmpeg', '--quiet'], stdout=PIPE, stderr=PIPE, text=True).stdout, - }, - } - - # def get_ffmpeg_version(self) -> Optional[str]: - # return self.exec(cmd=['-version']).stdout - -FFMPEG_BINARY = FfmpegBinary() - - -# class YtdlpExtractor(BaseExtractor): -# name: str = 'ytdlp' -# binary: str = 'ytdlp' - - - -class YtdlpPlugin(BasePlugin): - app_label: str = 'ytdlp' - verbose_name: str = 'YT-DLP' - docs_url: str = 'https://github.com/yt-dlp/yt-dlp' - - hooks: List[InstanceOf[BaseHook]] = [ - YTDLP_CONFIG, - YTDLP_BINARY, - FFMPEG_BINARY, - ] - - -PLUGIN = YtdlpPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_extractor/ytdlp/binaries.py b/archivebox/plugins_extractor/ytdlp/binaries.py new file mode 100644 index 00000000..730de2dc --- /dev/null +++ b/archivebox/plugins_extractor/ytdlp/binaries.py @@ -0,0 +1,42 @@ +__package__ = 'plugins_extractor.ytdlp' + +import subprocess +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + +from plugins_pkg.pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER + +from .config import YTDLP_CONFIG + + +class YtdlpBinary(BaseBinary): + name: BinName = YTDLP_CONFIG.YTDLP_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env] + +YTDLP_BINARY = YtdlpBinary() + + +class FfmpegBinary(BaseBinary): + name: BinName = 'ffmpeg' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + 'env': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=env.PATH), + 'version': lambda: subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True).stdout, + }, + 'apt': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=apt.PATH), + 'version': lambda: subprocess.run(['apt', 'show', 'ffmpeg'], capture_output=True, text=True).stdout, + }, + 'brew': { + # 'abspath': lambda: shutil.which('ffmpeg', PATH=brew.PATH), + 
'version': lambda: subprocess.run(['brew', 'info', 'ffmpeg', '--quiet'], capture_output=True, text=True).stdout, + }, + } + +FFMPEG_BINARY = FfmpegBinary() diff --git a/archivebox/plugins_extractor/ytdlp/config.py b/archivebox/plugins_extractor/ytdlp/config.py new file mode 100644 index 00000000..abe442bf --- /dev/null +++ b/archivebox/plugins_extractor/ytdlp/config.py @@ -0,0 +1,35 @@ +__package__ = 'plugins_extractor.ytdlp' + +from typing import List + +from pydantic import Field, model_validator, AliasChoices + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import ARCHIVING_CONFIG +from archivebox.misc.logging import STDERR + + +class YtdlpConfig(BaseConfigSet): + USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA')) + + YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY') + YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS') + + YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY) + YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT) + + @model_validator(mode='after') + def validate_use_ytdlp(self): + if self.USE_YTDLP and self.YTDLP_TIMEOUT < 20: + STDERR.print(f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]') + STDERR.print(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.') + STDERR.print(' (Setting it somewhere over 60 seconds is recommended)') + STDERR.print() + STDERR.print(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:') + STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media') + STDERR.print() + return self + + +YTDLP_CONFIG = YtdlpConfig() diff --git a/archivebox/plugins_pkg/npm/__init__.py b/archivebox/plugins_pkg/npm/__init__.py index e69de29b..60b418eb 100644 --- a/archivebox/plugins_pkg/npm/__init__.py +++ b/archivebox/plugins_pkg/npm/__init__.py @@ -0,0 +1,47 @@ +__package__ = 'plugins_pkg.npm' +__label__ = 'npm' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://www.npmjs.com/' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'npm': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import NPM_CONFIG + + return { + 'npm': NPM_CONFIG, + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY + + return { + 'node': NODE_BINARY, + 'npm': NPM_BINARY, + 'npx': NPX_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER + + return { + 'lib_npm': LIB_NPM_BINPROVIDER, + 'sys_npm': SYS_NPM_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/npm/apps.py b/archivebox/plugins_pkg/npm/apps.py deleted file mode 100644 index 0ef53c36..00000000 --- a/archivebox/plugins_pkg/npm/apps.py +++ /dev/null @@ -1,114 +0,0 @@ -__package__ = 'archivebox.plugins_pkg.npm' - -from pathlib import Path -from typing import List, Optional - -from pydantic import InstanceOf, model_validator - -from pydantic_pkgr import BinProvider, NpmProvider, BinName, PATHStr, BinProviderName, BinaryOverrides - -from archivebox.config import DATA_DIR, CONSTANTS - -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from 
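# Illustrative sketch (not part of the patch itself): YtdlpConfig above keeps
# the legacy option names working through pydantic aliases (USE_YOUTUBEDL /
# SAVE_MEDIA map to USE_YTDLP, YOUTUBEDL_BINARY maps to YTDLP_BINARY). A
# minimal demonstration of that mapping with plain pydantic v2, assuming
# BaseConfigSet follows normal alias semantics:

from pydantic import BaseModel, Field, AliasChoices

class LegacyAliasDemo(BaseModel):
    USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))

print(LegacyAliasDemo.model_validate({'SAVE_MEDIA': False}).USE_YTDLP)  # -> False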
abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew -from abx.archivebox.base_hook import BaseHook - - -###################### Config ########################## - - -class NpmDependencyConfigs(BaseConfigSet): - # USE_NPM: bool = True - # NPM_BINARY: str = Field(default='npm') - # NPM_ARGS: Optional[List[str]] = Field(default=None) - # NPM_EXTRA_ARGS: List[str] = [] - # NPM_DEFAULT_ARGS: List[str] = [] - pass - - -DEFAULT_GLOBAL_CONFIG = { -} -NPM_CONFIG = NpmDependencyConfigs(**DEFAULT_GLOBAL_CONFIG) - - -OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' -NEW_NODE_BIN_PATH = CONSTANTS.LIB_NPM_DIR / 'node_modules' / '.bin' - -class SystemNpmBinProvider(NpmProvider, BaseBinProvider): - name: BinProviderName = "sys_npm" - - npm_prefix: Optional[Path] = None - -class LibNpmBinProvider(NpmProvider, BaseBinProvider): - name: BinProviderName = "lib_npm" - PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' - - npm_prefix: Optional[Path] = CONSTANTS.LIB_NPM_DIR - - @model_validator(mode='after') - def validate_path(self): - assert self.npm_prefix == NEW_NODE_BIN_PATH.parent.parent - return self - - -SYS_NPM_BINPROVIDER = SystemNpmBinProvider() -LIB_NPM_BINPROVIDER = LibNpmBinProvider() -npm = LIB_NPM_BINPROVIDER - -class NodeBinary(BaseBinary): - name: BinName = 'node' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['nodejs']}, - } - - -NODE_BINARY = NodeBinary() - - -class NpmBinary(BaseBinary): - name: BinName = 'npm' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['npm']}, # already installed when nodejs is installed - brew.name: {'install': lambda: None}, # already installed when nodejs is installed - } - -NPM_BINARY = NpmBinary() - - -class NpxBinary(BaseBinary): - name: BinName = 'npx' - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'install': lambda: None}, # already installed when nodejs is installed - brew.name: {'install': lambda: None}, # already installed when nodejs is installed - } - -NPX_BINARY = NpxBinary() - - - - - -class NpmPlugin(BasePlugin): - app_label: str = 'npm' - verbose_name: str = 'NPM' - - hooks: List[InstanceOf[BaseHook]] = [ - NPM_CONFIG, - SYS_NPM_BINPROVIDER, - LIB_NPM_BINPROVIDER, - NODE_BINARY, - NPM_BINARY, - NPX_BINARY, - ] - - -PLUGIN = NpmPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/npm/binaries.py b/archivebox/plugins_pkg/npm/binaries.py new file mode 100644 index 00000000..dd9e6214 --- /dev/null +++ b/archivebox/plugins_pkg/npm/binaries.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_pkg.npm' + + +from typing import List + +from pydantic import InstanceOf + +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides + + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +class NodeBinary(BaseBinary): + name: BinName = 'node' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['nodejs']}, + } + + +NODE_BINARY = NodeBinary() + + +class NpmBinary(BaseBinary): + name: BinName = 'npm' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['npm']}, # already installed when nodejs is installed + brew.name: {'install': lambda: None}, # 
already installed when nodejs is installed + } + +NPM_BINARY = NpmBinary() + + +class NpxBinary(BaseBinary): + name: BinName = 'npx' + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'install': lambda: None}, # already installed when nodejs is installed + brew.name: {'install': lambda: None}, # already installed when nodejs is installed + } + +NPX_BINARY = NpxBinary() + diff --git a/archivebox/plugins_pkg/npm/binproviders.py b/archivebox/plugins_pkg/npm/binproviders.py new file mode 100644 index 00000000..b1b83168 --- /dev/null +++ b/archivebox/plugins_pkg/npm/binproviders.py @@ -0,0 +1,42 @@ +__package__ = 'plugins_pkg.npm' + +from pathlib import Path +from typing import Optional + +from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName + +from archivebox.config import DATA_DIR, CONSTANTS + +from abx.archivebox.base_binary import BaseBinProvider + + + +OLD_NODE_BIN_PATH = DATA_DIR / 'node_modules' / '.bin' +NEW_NODE_BIN_PATH = CONSTANTS.DEFAULT_LIB_DIR / 'npm' / 'node_modules' / '.bin' + + +class SystemNpmBinProvider(NpmProvider, BaseBinProvider): + name: BinProviderName = "sys_npm" + + npm_prefix: Optional[Path] = None + + +class LibNpmBinProvider(NpmProvider, BaseBinProvider): + name: BinProviderName = "lib_npm" + PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' + + npm_prefix: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'npm' + + def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.npm_prefix = STORAGE_CONFIG.LIB_DIR / 'npm' + self.PATH = f'{STORAGE_CONFIG.LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}' + + super().setup() + + +SYS_NPM_BINPROVIDER = SystemNpmBinProvider() +LIB_NPM_BINPROVIDER = LibNpmBinProvider() +npm = LIB_NPM_BINPROVIDER diff --git a/archivebox/plugins_pkg/npm/config.py b/archivebox/plugins_pkg/npm/config.py new file mode 100644 index 00000000..f69cfdd2 --- /dev/null +++ b/archivebox/plugins_pkg/npm/config.py @@ -0,0 +1,20 @@ +__package__ = 'plugins_pkg.npm' + + +from abx.archivebox.base_configset import BaseConfigSet + + +###################### Config ########################## + + +class NpmDependencyConfigs(BaseConfigSet): + # USE_NPM: bool = True + # NPM_BINARY: str = Field(default='npm') + # NPM_ARGS: Optional[List[str]] = Field(default=None) + # NPM_EXTRA_ARGS: List[str] = [] + # NPM_DEFAULT_ARGS: List[str] = [] + pass + + +NPM_CONFIG = NpmDependencyConfigs() + diff --git a/archivebox/plugins_pkg/pip/__init__.py b/archivebox/plugins_pkg/pip/__init__.py index e69de29b..c1be27b1 100644 --- a/archivebox/plugins_pkg/pip/__init__.py +++ b/archivebox/plugins_pkg/pip/__init__.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_pkg.pip' +__label__ = 'pip' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/pypa/pip' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'pip': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PIP_CONFIG + + return { + 'pip': PIP_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY + + return { + 'archivebox': ARCHIVEBOX_BINARY, + 'python': PYTHON_BINARY, + 'django': 
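# Illustrative sketch (not part of the patch itself): LibNpmBinProvider.setup()
# above re-points npm_prefix and PATH at STORAGE_CONFIG.LIB_DIR whenever the
# user has moved LIB_DIR away from the default, while keeping the default and
# legacy node_modules/.bin locations as fallbacks. The same precedence in
# isolation (directory names here are examples, not the real constants):

from pathlib import Path

def resolve_npm_paths(lib_dir: Path, default_lib_dir: Path, data_dir: Path):
    """Return (npm_prefix, PATH) preferring the configured lib dir over the defaults."""
    npm_prefix = lib_dir / 'npm'
    new_bin = lib_dir / 'npm' / 'node_modules' / '.bin'
    default_bin = default_lib_dir / 'npm' / 'node_modules' / '.bin'
    old_bin = data_dir / 'node_modules' / '.bin'   # legacy per-collection location
    return npm_prefix, f'{new_bin}:{default_bin}:{old_bin}'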
DJANGO_BINARY, + 'sqlite': SQLITE_BINARY, + 'pip': PIP_BINARY, + 'pipx': PIPX_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER + + return { + 'sys_pip': SYS_PIP_BINPROVIDER, + 'venv_pip': VENV_PIP_BINPROVIDER, + 'lib_pip': LIB_PIP_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/pip/apps.py b/archivebox/plugins_pkg/pip/binaries.py similarity index 62% rename from archivebox/plugins_pkg/pip/apps.py rename to archivebox/plugins_pkg/pip/binaries.py index 6ad1a5da..d4709edb 100644 --- a/archivebox/plugins_pkg/pip/apps.py +++ b/archivebox/plugins_pkg/pip/binaries.py @@ -1,105 +1,27 @@ -__package__ = 'archivebox.plugins_pkg.pip' +__package__ = 'plugins_pkg.pip' -import os import sys -import site from pathlib import Path -from typing import List, Optional -from pydantic import InstanceOf, Field, model_validator, validate_call +from typing import List +from pydantic import InstanceOf, Field, model_validator import django import django.db.backends.sqlite3.base from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type] -from pydantic_pkgr import BinProvider, PipProvider, BinName, BinProviderName, BinaryOverrides, SemVer +from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, SemVer -from archivebox.config import CONSTANTS, VERSION +from archivebox import VERSION -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env, apt, brew -from abx.archivebox.base_hook import BaseHook -from ...misc.logging import hint +from archivebox.misc.logging import hint +from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER ###################### Config ########################## -class PipDependencyConfigs(BaseConfigSet): - USE_PIP: bool = True - PIP_BINARY: str = Field(default='pip') - PIP_ARGS: Optional[List[str]] = Field(default=None) - PIP_EXTRA_ARGS: List[str] = [] - PIP_DEFAULT_ARGS: List[str] = [] - -PIP_CONFIG = PipDependencyConfigs() - - -class SystemPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "sys_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = None # global pip scope - - def on_install(self, bin_name: str, **kwargs): - # never modify system pip packages - return 'refusing to install packages globally with system pip, use a venv instead' - -class SystemPipxBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "pipx" - INSTALLER_BIN: BinName = "pipx" - - pip_venv: Optional[Path] = None # global pipx scope - - -IS_INSIDE_VENV = sys.prefix != sys.base_prefix - -class VenvPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "venv_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib')) - - def setup(self): - """never attempt to create a venv here, this is just used to detect if we are inside an existing one""" - return None - - -class LibPipBinProvider(PipProvider, BaseBinProvider): - name: BinProviderName = "lib_pip" - INSTALLER_BIN: BinName = "pip" - - pip_venv: Optional[Path] = CONSTANTS.LIB_PIP_DIR / 'venv' - -SYS_PIP_BINPROVIDER = SystemPipBinProvider() -PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() -VENV_PIP_BINPROVIDER = VenvPipBinProvider() -LIB_PIP_BINPROVIDER = LibPipBinProvider() -pip = LIB_PIP_BINPROVIDER - -# 
ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path) -assert VENV_PIP_BINPROVIDER.pip_venv is not None -assert LIB_PIP_BINPROVIDER.pip_venv is not None - -major, minor, patch = sys.version_info[:3] -site_packages_dir = f'lib/python{major}.{minor}/site-packages' - -LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,) -VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,) -USER_SITE_PACKAGES = site.getusersitepackages() -SYS_SITE_PACKAGES = site.getsitepackages() - -ALL_SITE_PACKAGES = ( - *LIB_SITE_PACKAGES, - *VENV_SITE_PACKAGES, - *USER_SITE_PACKAGES, - *SYS_SITE_PACKAGES, -) -for site_packages_dir in ALL_SITE_PACKAGES: - if site_packages_dir not in sys.path: - sys.path.append(str(site_packages_dir)) - class ArchiveboxBinary(BaseBinary): name: BinName = 'archivebox' @@ -237,27 +159,3 @@ class PipxBinary(BaseBinary): binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env] PIPX_BINARY = PipxBinary() - - -class PipPlugin(BasePlugin): - app_label: str = 'pip' - verbose_name: str = 'PIP' - - hooks: List[InstanceOf[BaseHook]] = [ - PIP_CONFIG, - SYS_PIP_BINPROVIDER, - PIPX_PIP_BINPROVIDER, - VENV_PIP_BINPROVIDER, - LIB_PIP_BINPROVIDER, - PIP_BINARY, - PIPX_BINARY, - ARCHIVEBOX_BINARY, - PYTHON_BINARY, - SQLITE_BINARY, - DJANGO_BINARY, - ] - - -PLUGIN = PipPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/pip/binproviders.py b/archivebox/plugins_pkg/pip/binproviders.py new file mode 100644 index 00000000..e51dc780 --- /dev/null +++ b/archivebox/plugins_pkg/pip/binproviders.py @@ -0,0 +1,88 @@ +__package__ = 'plugins_pkg.pip' + +import os +import sys +import site +from pathlib import Path +from typing import Optional + +from pydantic_pkgr import PipProvider, BinName, BinProviderName + +from archivebox.config import CONSTANTS + +from abx.archivebox.base_binary import BaseBinProvider + + +###################### Config ########################## + +class SystemPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "sys_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = None # global pip scope + + def on_install(self, bin_name: str, **kwargs): + # never modify system pip packages + return 'refusing to install packages globally with system pip, use a venv instead' + +class SystemPipxBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "pipx" + INSTALLER_BIN: BinName = "pipx" + + pip_venv: Optional[Path] = None # global pipx scope + + +IS_INSIDE_VENV = sys.prefix != sys.base_prefix + +class VenvPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "venv_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib')) + + def setup(self): + """never attempt to create a venv here, this is just used to detect if we are inside an existing one""" + return None + + +class LibPipBinProvider(PipProvider, BaseBinProvider): + name: BinProviderName = "lib_pip" + INSTALLER_BIN: BinName = "pip" + + pip_venv: Optional[Path] = CONSTANTS.DEFAULT_LIB_DIR / 'pip' / 'venv' + + def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.pip_venv = 
STORAGE_CONFIG.LIB_DIR / 'pip' / 'venv' + + super().setup() + +SYS_PIP_BINPROVIDER = SystemPipBinProvider() +PIPX_PIP_BINPROVIDER = SystemPipxBinProvider() +VENV_PIP_BINPROVIDER = VenvPipBinProvider() +LIB_PIP_BINPROVIDER = LibPipBinProvider() +pip = LIB_PIP_BINPROVIDER + +# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path) +assert VENV_PIP_BINPROVIDER.pip_venv is not None +assert LIB_PIP_BINPROVIDER.pip_venv is not None + +major, minor, patch = sys.version_info[:3] +site_packages_dir = f'lib/python{major}.{minor}/site-packages' + +LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,) +VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,) +USER_SITE_PACKAGES = site.getusersitepackages() +SYS_SITE_PACKAGES = site.getsitepackages() + +ALL_SITE_PACKAGES = ( + *LIB_SITE_PACKAGES, + *VENV_SITE_PACKAGES, + *USER_SITE_PACKAGES, + *SYS_SITE_PACKAGES, +) +for site_packages_dir in ALL_SITE_PACKAGES: + if site_packages_dir not in sys.path: + sys.path.append(str(site_packages_dir)) diff --git a/archivebox/plugins_pkg/pip/config.py b/archivebox/plugins_pkg/pip/config.py new file mode 100644 index 00000000..26cf0f8e --- /dev/null +++ b/archivebox/plugins_pkg/pip/config.py @@ -0,0 +1,16 @@ +__package__ = 'pip' + +from typing import List, Optional +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + + +class PipDependencyConfigs(BaseConfigSet): + USE_PIP: bool = True + PIP_BINARY: str = Field(default='pip') + PIP_ARGS: Optional[List[str]] = Field(default=None) + PIP_EXTRA_ARGS: List[str] = [] + PIP_DEFAULT_ARGS: List[str] = [] + +PIP_CONFIG = PipDependencyConfigs() diff --git a/archivebox/plugins_pkg/playwright/__init__.py b/archivebox/plugins_pkg/playwright/__init__.py index e69de29b..0f66f42c 100644 --- a/archivebox/plugins_pkg/playwright/__init__.py +++ b/archivebox/plugins_pkg/playwright/__init__.py @@ -0,0 +1,44 @@ +__package__ = 'plugins_pkg.playwright' +__label__ = 'playwright' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/microsoft/playwright-python' + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'playwright': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PLAYWRIGHT_CONFIG + + return { + 'playwright': PLAYWRIGHT_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import PLAYWRIGHT_BINARY + + return { + 'playwright': PLAYWRIGHT_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import PLAYWRIGHT_BINPROVIDER + + return { + 'playwright': PLAYWRIGHT_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/playwright/binaries.py b/archivebox/plugins_pkg/playwright/binaries.py new file mode 100644 index 00000000..0ef63646 --- /dev/null +++ b/archivebox/plugins_pkg/playwright/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_pkg.playwright' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinName, BinProvider + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER + +from .config import PLAYWRIGHT_CONFIG + + + + +class PlaywrightBinary(BaseBinary): + name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY + + binproviders_supported: 
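# Illustrative sketch (not part of the patch itself): the pip binproviders
# module above derives the venv/lib site-packages directories from the running
# interpreter version and appends any missing ones to sys.path so packages
# installed there stay importable. The core of that computation on its own
# (the venv path passed in is a made-up example):

import sys
from pathlib import Path

def site_packages_for(venv: Path) -> Path:
    major, minor = sys.version_info[:2]
    return venv / f'lib/python{major}.{minor}/site-packages'

def ensure_importable(*dirs: Path) -> None:
    for d in dirs:
        if str(d) not in sys.path:
            sys.path.append(str(d))

# e.g. ensure_importable(site_packages_for(Path('/opt/archivebox/lib/pip/venv')))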
List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env] + + +PLAYWRIGHT_BINARY = PlaywrightBinary() diff --git a/archivebox/plugins_pkg/playwright/apps.py b/archivebox/plugins_pkg/playwright/binproviders.py similarity index 75% rename from archivebox/plugins_pkg/playwright/apps.py rename to archivebox/plugins_pkg/playwright/binproviders.py index 131d8726..68e62bb5 100644 --- a/archivebox/plugins_pkg/playwright/apps.py +++ b/archivebox/plugins_pkg/playwright/binproviders.py @@ -1,15 +1,13 @@ -__package__ = 'archivebox.plugins_pkg.playwright' +__package__ = 'plugins_pkg.playwright' import os import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, computed_field, Field +from pydantic import computed_field, Field from pydantic_pkgr import ( BinName, - BinProvider, BinProviderName, BinProviderOverrides, InstallArgs, @@ -22,56 +20,29 @@ from pydantic_pkgr import ( from archivebox.config import CONSTANTS -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from abx.archivebox.base_binary import BaseBinProvider, env -from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER +from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER + +from .binaries import PLAYWRIGHT_BINARY -###################### Config ########################## - - -class PlaywrightConfigs(BaseConfigSet): - # PLAYWRIGHT_BINARY: str = Field(default='wget') - # PLAYWRIGHT_ARGS: Optional[List[str]] = Field(default=None) - # PLAYWRIGHT_EXTRA_ARGS: List[str] = [] - # PLAYWRIGHT_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] - pass - - -PLAYWRIGHT_CONFIG = PlaywrightConfigs() - -LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR - - - -class PlaywrightBinary(BaseBinary): - name: BinName = "playwright" - - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env] - - - -PLAYWRIGHT_BINARY = PlaywrightBinary() +MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright") +LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright") class PlaywrightBinProvider(BaseBinProvider): name: BinProviderName = "playwright" INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name - PATH: PATHStr = f"{CONSTANTS.LIB_BIN_DIR}:{DEFAULT_ENV_PATH}" + PATH: PATHStr = f"{CONSTANTS.DEFAULT_LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}" playwright_browsers_dir: Path = ( - Path("~/Library/Caches/ms-playwright").expanduser() # macos playwright cache dir + MACOS_PLAYWRIGHT_CACHE_DIR.expanduser() if OPERATING_SYSTEM == "darwin" else - Path("~/.cache/ms-playwright").expanduser() # linux playwright cache dir + LINUX_PLAYWRIGHT_CACHE_DIR.expanduser() ) - playwright_install_args: List[str] = ["install"] # --with-deps + playwright_install_args: List[str] = ["install"] packages_handler: BinProviderOverrides = Field(default={ "chrome": ["chromium"], @@ -85,6 +56,11 @@ class PlaywrightBinProvider(BaseBinProvider): return PLAYWRIGHT_BINARY.load().abspath def setup(self) -> None: + # update paths from config if they arent the default + from archivebox.config.common import STORAGE_CONFIG + if 
STORAGE_CONFIG.LIB_DIR != CONSTANTS.DEFAULT_LIB_DIR: + self.PATH = f"{STORAGE_CONFIG.LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}" + assert SYS_PIP_BINPROVIDER.INSTALLER_BIN_ABSPATH, "Pip bin provider not initialized" if self.playwright_browsers_dir: @@ -183,21 +159,3 @@ class PlaywrightBinProvider(BaseBinProvider): return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip() PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider() - - - -class PlaywrightPlugin(BasePlugin): - app_label: str = 'playwright' - verbose_name: str = 'Playwright (PIP)' - - hooks: List[InstanceOf[BaseHook]] = [ - PLAYWRIGHT_CONFIG, - PLAYWRIGHT_BINPROVIDER, - PLAYWRIGHT_BINARY, - ] - - - -PLUGIN = PlaywrightPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/playwright/config.py b/archivebox/plugins_pkg/playwright/config.py new file mode 100644 index 00000000..23f22efc --- /dev/null +++ b/archivebox/plugins_pkg/playwright/config.py @@ -0,0 +1,10 @@ +__package__ = 'playwright' + +from abx.archivebox.base_configset import BaseConfigSet + + +class PlaywrightConfigs(BaseConfigSet): + PLAYWRIGHT_BINARY: str = 'playwright' + + +PLAYWRIGHT_CONFIG = PlaywrightConfigs() diff --git a/archivebox/plugins_pkg/puppeteer/__init__.py b/archivebox/plugins_pkg/puppeteer/__init__.py index e69de29b..7acc5b1b 100644 --- a/archivebox/plugins_pkg/puppeteer/__init__.py +++ b/archivebox/plugins_pkg/puppeteer/__init__.py @@ -0,0 +1,46 @@ +__package__ = 'plugins_pkg.puppeteer' +__label__ = 'puppeteer' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/puppeteer/puppeteer' +__dependencies__ = ['npm'] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'puppeteer': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import PUPPETEER_CONFIG + + return { + 'puppeteer': PUPPETEER_CONFIG + } + +@abx.hookimpl +def get_BINARIES(): + from .binaries import PUPPETEER_BINARY + + return { + 'puppeteer': PUPPETEER_BINARY, + } + +@abx.hookimpl +def get_BINPROVIDERS(): + from .binproviders import PUPPETEER_BINPROVIDER + + return { + 'puppeteer': PUPPETEER_BINPROVIDER, + } diff --git a/archivebox/plugins_pkg/puppeteer/binaries.py b/archivebox/plugins_pkg/puppeteer/binaries.py new file mode 100644 index 00000000..7e592bba --- /dev/null +++ b/archivebox/plugins_pkg/puppeteer/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_pkg.puppeteer' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinName + + +from abx.archivebox.base_binary import BaseBinary, env + +from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER + + +###################### Config ########################## + + +class PuppeteerBinary(BaseBinary): + name: BinName = "puppeteer" + + binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] + + +PUPPETEER_BINARY = PuppeteerBinary() diff --git a/archivebox/plugins_pkg/puppeteer/apps.py b/archivebox/plugins_pkg/puppeteer/binproviders.py similarity index 74% rename from archivebox/plugins_pkg/puppeteer/apps.py rename to archivebox/plugins_pkg/puppeteer/binproviders.py index 8dad3392..2ef0eb7a 100644 --- a/archivebox/plugins_pkg/puppeteer/apps.py +++ b/archivebox/plugins_pkg/puppeteer/binproviders.py @@ -1,14 +1,12 @@ -__package__ = 'archivebox.plugins_pkg.puppeteer' 
+__package__ = 'plugins_pkg.puppeteer' import os import platform from pathlib import Path from typing import List, Optional, Dict, ClassVar -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field +from pydantic import Field from pydantic_pkgr import ( - BinProvider, BinName, BinProviderName, BinProviderOverrides, @@ -20,53 +18,21 @@ from pydantic_pkgr import ( from archivebox.config import CONSTANTS from archivebox.config.permissions import ARCHIVEBOX_USER -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, env -# from abx.archivebox.base_extractor import BaseExtractor -# from abx.archivebox.base_queue import BaseQueue -from abx.archivebox.base_hook import BaseHook +from abx.archivebox.base_binary import BaseBinProvider -# Depends on Other Plugins: -from plugins_pkg.npm.apps import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER - - -###################### Config ########################## - - -class PuppeteerConfigs(BaseConfigSet): - # PUPPETEER_BINARY: str = Field(default='wget') - # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None) - # PUPPETEER_EXTRA_ARGS: List[str] = [] - # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] - pass - - -PUPPETEER_CONFIG = PuppeteerConfigs() - -LIB_DIR_BROWSERS = CONSTANTS.LIB_BROWSERS_DIR - - -class PuppeteerBinary(BaseBinary): - name: BinName = "puppeteer" - - binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env] - - -PUPPETEER_BINARY = PuppeteerBinary() +from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER class PuppeteerBinProvider(BaseBinProvider): name: BinProviderName = "puppeteer" INSTALLER_BIN: BinName = "npx" - PATH: PATHStr = str(CONSTANTS.LIB_BIN_DIR) + PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin') euid: Optional[int] = ARCHIVEBOX_USER - puppeteer_browsers_dir: Path = LIB_DIR_BROWSERS - puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install", "--path", str(LIB_DIR_BROWSERS)] + puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers' + puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"] packages_handler: BinProviderOverrides = Field(default={ "chrome": lambda: @@ -76,6 +42,11 @@ class PuppeteerBinProvider(BaseBinProvider): _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {} def setup(self) -> None: + # update paths from config + from archivebox.config.common import STORAGE_CONFIG + self.puppeteer_browsers_dir = STORAGE_CONFIG.LIB_DIR / 'browsers' + self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin') + assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized" if self.puppeteer_browsers_dir: @@ -121,7 +92,7 @@ class PuppeteerBinProvider(BaseBinProvider): # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}') - install_args = [*self.puppeteer_install_args] + install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)] proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages]) @@ -157,20 +128,3 @@ PUPPETEER_BINPROVIDER = PuppeteerBinProvider() # "binproviders_supported": self.binproviders_supported, # } # ) - - -class PuppeteerPlugin(BasePlugin): - app_label: str ='puppeteer' - verbose_name: str = 'Puppeteer (NPM)' - - hooks: List[InstanceOf[BaseHook]] = [ - PUPPETEER_CONFIG, - 
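# Illustrative sketch (not part of the patch itself): PuppeteerBinProvider now
# appends --path <browsers_dir> at install time instead of baking it into
# puppeteer_install_args, so the browsers directory can follow the configured
# LIB_DIR. The argv that ends up being executed looks roughly like this
# (the npx path, directory, and package name are placeholders):

def build_puppeteer_install_cmd(npx_abspath, browsers_dir, packages):
    install_args = ['--yes', '@puppeteer/browsers', 'install']
    return [npx_abspath, *install_args, '--path', str(browsers_dir), *packages]

# e.g. build_puppeteer_install_cmd('/usr/bin/npx', '/data/lib/browsers', ['chrome'])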
PUPPETEER_BINPROVIDER, - PUPPETEER_BINARY, - ] - - - -PLUGIN = PuppeteerPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_pkg/puppeteer/config.py b/archivebox/plugins_pkg/puppeteer/config.py new file mode 100644 index 00000000..b76d0779 --- /dev/null +++ b/archivebox/plugins_pkg/puppeteer/config.py @@ -0,0 +1,18 @@ +__package__ = 'plugins_pkg.puppeteer' + + +from abx.archivebox.base_configset import BaseConfigSet + + +###################### Config ########################## + + +class PuppeteerConfig(BaseConfigSet): + PUPPETEER_BINARY: str = 'puppeteer' + # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None) + # PUPPETEER_EXTRA_ARGS: List[str] = [] + # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}'] + pass + + +PUPPETEER_CONFIG = PuppeteerConfig() diff --git a/archivebox/plugins_search/ripgrep/__init__.py b/archivebox/plugins_search/ripgrep/__init__.py index e69de29b..ac1e417c 100644 --- a/archivebox/plugins_search/ripgrep/__init__.py +++ b/archivebox/plugins_search/ripgrep/__init__.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_search.ripgrep' +__label__ = 'ripgrep' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/BurntSushi/ripgrep' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'ripgrep': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import RIPGREP_CONFIG + + return { + 'ripgrep': RIPGREP_CONFIG + } + + +@abx.hookimpl +def get_BINARIES(): + from .binaries import RIPGREP_BINARY + + return { + 'ripgrep': RIPGREP_BINARY + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import RIPGREP_SEARCH_BACKEND + + return { + 'ripgrep': RIPGREP_SEARCH_BACKEND, + } diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py deleted file mode 100644 index 27d0f5e1..00000000 --- a/archivebox/plugins_search/ripgrep/apps.py +++ /dev/null @@ -1,114 +0,0 @@ -__package__ = 'archivebox.plugins_search.ripgrep' - -import re -from pathlib import Path -from subprocess import run -from typing import List, Iterable -# from typing_extensions import Self - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, apt, brew -from abx.archivebox.base_hook import BaseHook -from abx.archivebox.base_searchbackend import BaseSearchBackend - -# Depends on Other Plugins: -from archivebox.config import CONSTANTS -from archivebox.config.common import SEARCH_BACKEND_CONFIG - -###################### Config ########################## - -class RipgrepConfig(BaseConfigSet): - RIPGREP_BINARY: str = Field(default='rg') - - RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg') - RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [ - # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md - f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}', - '--type-not=ignore', - '--ignore-case', - '--files-with-matches', - '--regexp', - ]) - RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR - -RIPGREP_CONFIG = RipgrepConfig() - - - -class 
RipgrepBinary(BaseBinary): - name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] - - overrides: BinaryOverrides = { - apt.name: {'packages': ['ripgrep']}, - brew.name: {'packages': ['ripgrep']}, - } - -RIPGREP_BINARY = RipgrepBinary() - -# regex to match archive//... snapshot dir names -TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/') - -class RipgrepSearchBackend(BaseSearchBackend): - name: str = 'ripgrep' - docs_url: str = 'https://github.com/BurntSushi/ripgrep' - - @staticmethod - def index(snapshot_id: str, texts: List[str]): - return - - @staticmethod - def flush(snapshot_ids: Iterable[str]): - return - - @staticmethod - def search(text: str) -> List[str]: - from core.models import Snapshot - - ripgrep_binary = RIPGREP_BINARY.load() - if not ripgrep_binary.version: - raise Exception("ripgrep binary not found, install ripgrep to use this search backend") - - cmd = [ - ripgrep_binary.abspath, - *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, - text, - str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR), - ] - proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True) - timestamps = set() - for path in proc.stdout.splitlines(): - ts = TIMESTAMP_REGEX.findall(path) - if ts: - timestamps.add(ts[0]) - - snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] - - return snap_ids - -RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend() - - - - -class RipgrepSearchPlugin(BasePlugin): - app_label: str ='ripgrep' - verbose_name: str = 'Ripgrep' - - hooks: List[InstanceOf[BaseHook]] = [ - RIPGREP_CONFIG, - RIPGREP_BINARY, - RIPGREP_SEARCH_BACKEND, - ] - - - -PLUGIN = RipgrepSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_search/ripgrep/binaries.py b/archivebox/plugins_search/ripgrep/binaries.py new file mode 100644 index 00000000..710a1ef0 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/binaries.py @@ -0,0 +1,23 @@ +__package__ = 'plugins_search.ripgrep' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env, apt, brew + + +from .config import RIPGREP_CONFIG + + +class RipgrepBinary(BaseBinary): + name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env] + + overrides: BinaryOverrides = { + apt.name: {'packages': ['ripgrep']}, + brew.name: {'packages': ['ripgrep']}, + } + +RIPGREP_BINARY = RipgrepBinary() diff --git a/archivebox/plugins_search/ripgrep/config.py b/archivebox/plugins_search/ripgrep/config.py new file mode 100644 index 00000000..726c21e8 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/config.py @@ -0,0 +1,29 @@ +__package__ = 'plugins_search.ripgrep' + +from pathlib import Path +from typing import List + +from pydantic import Field + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config import CONSTANTS +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + +class RipgrepConfig(BaseConfigSet): + RIPGREP_BINARY: str = Field(default='rg') + + RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg') + RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [ + # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md + f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}', + '--type-not=ignore', + '--ignore-case', + 
'--files-with-matches', + '--regexp', + ]) + RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR + RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT) + +RIPGREP_CONFIG = RipgrepConfig() diff --git a/archivebox/plugins_search/ripgrep/searchbackend.py b/archivebox/plugins_search/ripgrep/searchbackend.py new file mode 100644 index 00000000..3c30af85 --- /dev/null +++ b/archivebox/plugins_search/ripgrep/searchbackend.py @@ -0,0 +1,55 @@ +__package__ = 'plugins_search.ripgrep' + +import re +import subprocess + +from typing import List, Iterable + +from abx.archivebox.base_searchbackend import BaseSearchBackend + +from .binaries import RIPGREP_BINARY +from .config import RIPGREP_CONFIG + + + +# regex to match archive//... snapshot dir names +TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/') + +class RipgrepSearchBackend(BaseSearchBackend): + name: str = 'ripgrep' + docs_url: str = 'https://github.com/BurntSushi/ripgrep' + + @staticmethod + def index(snapshot_id: str, texts: List[str]): + return + + @staticmethod + def flush(snapshot_ids: Iterable[str]): + return + + @staticmethod + def search(text: str) -> List[str]: + from core.models import Snapshot + + ripgrep_binary = RIPGREP_BINARY.load() + if not ripgrep_binary.version: + raise Exception("ripgrep binary not found, install ripgrep to use this search backend") + + cmd = [ + ripgrep_binary.abspath, + *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, + text, + str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR), + ] + proc = subprocess.run(cmd, timeout=RIPGREP_CONFIG.RIPGREP_TIMEOUT, capture_output=True, text=True) + timestamps = set() + for path in proc.stdout.splitlines(): + ts = TIMESTAMP_REGEX.findall(path) + if ts: + timestamps.add(ts[0]) + + snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] + + return snap_ids + +RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend() diff --git a/archivebox/plugins_search/sonic/__init__.py b/archivebox/plugins_search/sonic/__init__.py index e69de29b..4b81b0be 100644 --- a/archivebox/plugins_search/sonic/__init__.py +++ b/archivebox/plugins_search/sonic/__init__.py @@ -0,0 +1,48 @@ +__package__ = 'plugins_search.sonic' +__label__ = 'sonic' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/valeriansaliou/sonic' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'sonic': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SONIC_CONFIG + + return { + 'sonic': SONIC_CONFIG + } + + +@abx.hookimpl +def get_BINARIES(): + from .binaries import SONIC_BINARY + + return { + 'sonic': SONIC_BINARY + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import SONIC_SEARCH_BACKEND + + return { + 'sonic': SONIC_SEARCH_BACKEND, + } diff --git a/archivebox/plugins_search/sonic/apps.py b/archivebox/plugins_search/sonic/apps.py deleted file mode 100644 index d62d1f12..00000000 --- a/archivebox/plugins_search/sonic/apps.py +++ /dev/null @@ -1,131 +0,0 @@ -__package__ = 'archivebox.plugins_search.sonic' - -import sys -from typing import List, Generator, cast - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field, model_validator -from pydantic_pkgr import BinProvider, BinaryOverrides, BinName - -# Depends on other Django apps: -from abx.archivebox.base_plugin 
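# Illustrative sketch (not part of the patch itself): RipgrepSearchBackend.search()
# above maps ripgrep's matching file paths back to snapshot timestamps with
# TIMESTAMP_REGEX, then resolves those timestamps to Snapshot primary keys.
# The path-to-timestamp step on its own, with made-up example paths:

import re

TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')

paths = [
    '/data/archive/1712345678.123/singlefile.html',
    '/data/archive/1712345678.123/warc/archive.warc.gz',
    '/data/archive/1712399999.456/output.pdf',
]
timestamps = set()
for path in paths:
    ts = TIMESTAMP_REGEX.findall(path)
    if ts:
        timestamps.add(ts[0])
print(sorted(timestamps))  # ['1712345678.123', '1712399999.456']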
import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_binary import BaseBinary, env, brew -from abx.archivebox.base_hook import BaseHook -from abx.archivebox.base_searchbackend import BaseSearchBackend - -# Depends on Other Plugins: -from archivebox.config.common import SEARCH_BACKEND_CONFIG - -SONIC_LIB = None -try: - import sonic - SONIC_LIB = sonic -except ImportError: - SONIC_LIB = None - -###################### Config ########################## - -class SonicConfig(BaseConfigSet): - SONIC_BINARY: str = Field(default='sonic') - - SONIC_HOST: str = Field(default='localhost', alias='SEARCH_BACKEND_HOST_NAME') - SONIC_PORT: int = Field(default=1491, alias='SEARCH_BACKEND_PORT') - SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD') - SONIC_COLLECTION: str = Field(default='archivebox') - SONIC_BUCKET: str = Field(default='archivebox') - - SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000) - SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000) - SONIC_MAX_RETRIES: int = Field(default=5) - - @model_validator(mode='after') - def validate_sonic_port(self): - if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None: - sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n') - # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap - # sys.exit(1) - SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') - return self - -SONIC_CONFIG = SonicConfig() - - -class SonicBinary(BaseBinary): - name: BinName = SONIC_CONFIG.SONIC_BINARY - binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo - - overrides: BinaryOverrides = { - brew.name: {'packages': ['sonic']}, - # cargo.name: {'packages': ['sonic-server']}, # TODO: add cargo - } - - # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally - # def on_get_version(self): - # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - # return SemVer.parse(str(ingestcl.protocol)) - -SONIC_BINARY = SonicBinary() - - - -class SonicSearchBackend(BaseSearchBackend): - name: str = 'sonic' - docs_url: str = 'https://github.com/valeriansaliou/sonic' - - @staticmethod - def index(snapshot_id: str, texts: List[str]): - error_count = 0 - with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - for text in texts: - chunks = ( - text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH] - for i in range( - 0, - min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH), - SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH, - ) - ) - try: - for chunk in chunks: - ingestcl.push(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, snapshot_id, str(chunk)) - except Exception as err: - print(f'[!] 
Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') - error_count += 1 - if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES: - raise - - @staticmethod - def flush(snapshot_ids: Generator[str, None, None]): - with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: - for id in snapshot_ids: - ingestcl.flush_object(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, str(id)) - - - @staticmethod - def search(text: str) -> List[str]: - with sonic.SearchClient(SONIC_CONFIG.SONIC_HOST, SONIC_CONFIG.SONIC_PORT, SONIC_CONFIG.SONIC_PASSWORD) as querycl: - snap_ids = cast(List[str], querycl.query(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, text)) - return [str(id) for id in snap_ids] - - -SONIC_SEARCH_BACKEND = SonicSearchBackend() - - - - -class SonicSearchPlugin(BasePlugin): - app_label: str ='sonic' - verbose_name: str = 'Sonic' - - hooks: List[InstanceOf[BaseHook]] = [ - SONIC_CONFIG, - *([SONIC_BINARY] if (SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic') else []), - SONIC_SEARCH_BACKEND, - ] - - - -PLUGIN = SonicSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/plugins_search/sonic/binaries.py b/archivebox/plugins_search/sonic/binaries.py new file mode 100644 index 00000000..eab987c5 --- /dev/null +++ b/archivebox/plugins_search/sonic/binaries.py @@ -0,0 +1,27 @@ +__package__ = 'plugins_search.sonic' + +from typing import List + +from pydantic import InstanceOf +from pydantic_pkgr import BinProvider, BinaryOverrides, BinName + +from abx.archivebox.base_binary import BaseBinary, env, brew + +from .config import SONIC_CONFIG + + +class SonicBinary(BaseBinary): + name: BinName = SONIC_CONFIG.SONIC_BINARY + binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env] # TODO: add cargo + + overrides: BinaryOverrides = { + brew.name: {'packages': ['sonic']}, + # cargo.name: {'packages': ['sonic-server']}, # TODO: add cargo + } + + # TODO: add version checking over protocol? 
for when sonic backend is on remote server and binary is not installed locally + # def on_get_version(self): + # with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + # return SemVer.parse(str(ingestcl.protocol)) + +SONIC_BINARY = SonicBinary() diff --git a/archivebox/plugins_search/sonic/config.py b/archivebox/plugins_search/sonic/config.py new file mode 100644 index 00000000..a16c8c42 --- /dev/null +++ b/archivebox/plugins_search/sonic/config.py @@ -0,0 +1,44 @@ +__package__ = 'plugins_search.sonic' + +import sys + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + +SONIC_LIB = None +try: + import sonic + SONIC_LIB = sonic +except ImportError: + SONIC_LIB = None + +###################### Config ########################## + + +class SonicConfig(BaseConfigSet): + SONIC_BINARY: str = Field(default='sonic') + + SONIC_HOST: str = Field(default='localhost', alias='SEARCH_BACKEND_HOST_NAME') + SONIC_PORT: int = Field(default=1491, alias='SEARCH_BACKEND_PORT') + SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD') + SONIC_COLLECTION: str = Field(default='archivebox') + SONIC_BUCKET: str = Field(default='archivebox') + + SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000) + SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000) + SONIC_MAX_RETRIES: int = Field(default=5) + + @model_validator(mode='after') + def validate_sonic_port(self): + if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None: + sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n') + # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap + # sys.exit(1) + SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') + return self + +SONIC_CONFIG = SonicConfig() diff --git a/archivebox/plugins_search/sonic/searchbackend.py b/archivebox/plugins_search/sonic/searchbackend.py new file mode 100644 index 00000000..1662e5b2 --- /dev/null +++ b/archivebox/plugins_search/sonic/searchbackend.py @@ -0,0 +1,51 @@ +__package__ = 'plugins_search.sonic' + +from typing import List, Generator, cast + +from abx.archivebox.base_searchbackend import BaseSearchBackend + + +from .config import SONIC_CONFIG, SONIC_LIB + + +class SonicSearchBackend(BaseSearchBackend): + name: str = 'sonic' + docs_url: str = 'https://github.com/valeriansaliou/sonic' + + @staticmethod + def index(snapshot_id: str, texts: List[str]): + error_count = 0 + with SONIC_LIB.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + for text in texts: + chunks = ( + text[i:i+SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH] + for i in range( + 0, + min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH), + SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH, + ) + ) + try: + for chunk in chunks: + ingestcl.push(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, snapshot_id, str(chunk)) + except Exception as err: + print(f'[!] 
Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}') + error_count += 1 + if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES: + raise + + @staticmethod + def flush(snapshot_ids: Generator[str, None, None]): + with SONIC_LIB.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl: + for id in snapshot_ids: + ingestcl.flush_object(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, str(id)) + + + @staticmethod + def search(text: str) -> List[str]: + with SONIC_LIB.SearchClient(SONIC_CONFIG.SONIC_HOST, SONIC_CONFIG.SONIC_PORT, SONIC_CONFIG.SONIC_PASSWORD) as querycl: + snap_ids = cast(List[str], querycl.query(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, text)) + return [str(id) for id in snap_ids] + + +SONIC_SEARCH_BACKEND = SonicSearchBackend() diff --git a/archivebox/plugins_search/sqlitefts/__init__.py b/archivebox/plugins_search/sqlitefts/__init__.py new file mode 100644 index 00000000..63fb1b12 --- /dev/null +++ b/archivebox/plugins_search/sqlitefts/__init__.py @@ -0,0 +1,39 @@ +__package__ = 'plugins_search.sqlitefts' +__label__ = 'sqlitefts' +__version__ = '2024.10.14' +__author__ = 'ArchiveBox' +__homepage__ = 'https://github.com/ArchiveBox/archivebox' +__dependencies__ = [] + +import abx + + +@abx.hookimpl +def get_PLUGIN(): + return { + 'sqlitefts': { + 'PACKAGE': __package__, + 'LABEL': __label__, + 'VERSION': __version__, + 'AUTHOR': __author__, + 'HOMEPAGE': __homepage__, + 'DEPENDENCIES': __dependencies__, + } + } + +@abx.hookimpl +def get_CONFIG(): + from .config import SQLITEFTS_CONFIG + + return { + 'sqlitefts': SQLITEFTS_CONFIG + } + + +@abx.hookimpl +def get_SEARCHBACKENDS(): + from .searchbackend import SQLITEFTS_SEARCH_BACKEND + + return { + 'sqlitefts': SQLITEFTS_SEARCH_BACKEND, + } diff --git a/archivebox/plugins_search/sqlitefts/config.py b/archivebox/plugins_search/sqlitefts/config.py new file mode 100644 index 00000000..77209f27 --- /dev/null +++ b/archivebox/plugins_search/sqlitefts/config.py @@ -0,0 +1,73 @@ +__package__ = 'plugins_search.sqlitefts' + +import sys +import sqlite3 +from typing import Callable + +from django.core.exceptions import ImproperlyConfigured + +from pydantic import Field, model_validator + +from abx.archivebox.base_configset import BaseConfigSet + +from archivebox.config.common import SEARCH_BACKEND_CONFIG + + + +###################### Config ########################## + +class SqliteftsConfig(BaseConfigSet): + SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE') + SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS') + SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH') + + # Not really meant to be user-modified, just here as constants + SQLITEFTS_DB: str = Field(default='search.sqlite3') + SQLITEFTS_TABLE: str = Field(default='snapshot_fts') + SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts') + SQLITEFTS_COLUMN: str = Field(default='texts') + + @model_validator(mode='after') + def validate_fts_separate_database(self): + if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB: + sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n') + SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') + return self + + @property + def get_connection(self) -> Callable[[], sqlite3.Connection]: + # Make 
get_connection callable, because `django.db.connection.cursor()` + # has to be called to get a context manager, but sqlite3.Connection + # is a context manager without being called. + if self.SQLITEFTS_SEPARATE_DATABASE: + return lambda: sqlite3.connect(self.SQLITEFTS_DB) + else: + from django.db import connection as database + return database.cursor + + @property + def SQLITE_BIND(self) -> str: + if self.SQLITEFTS_SEPARATE_DATABASE: + return "?" + else: + return "%s" + + @property + def SQLITE_LIMIT_LENGTH(self) -> int: + from django.db import connection as database + + # Only Python >= 3.11 supports sqlite3.Connection.getlimit(), + # so fall back to the default if the API to get the real value isn't present + try: + limit_id = sqlite3.SQLITE_LIMIT_LENGTH # type: ignore[attr-defined] + + if self.SQLITEFTS_SEPARATE_DATABASE: + cursor = self.get_connection() + return cursor.connection.getlimit(limit_id) # type: ignore[attr-defined] + else: + with database.temporary_connection() as cursor: # type: ignore[attr-defined] + return cursor.connection.getlimit(limit_id) + except (AttributeError, ImproperlyConfigured): + return self.SQLITEFTS_MAX_LENGTH + +SQLITEFTS_CONFIG = SqliteftsConfig() diff --git a/archivebox/plugins_search/sqlite/apps.py b/archivebox/plugins_search/sqlitefts/searchbackend.py similarity index 66% rename from archivebox/plugins_search/sqlite/apps.py rename to archivebox/plugins_search/sqlitefts/searchbackend.py index 67917f19..630bdd4c 100644 --- a/archivebox/plugins_search/sqlite/apps.py +++ b/archivebox/plugins_search/sqlitefts/searchbackend.py @@ -1,83 +1,12 @@ -__package__ = 'archivebox.plugins_search.sqlite' +__package__ = 'plugins_search.sqlitefts' -import sys import codecs import sqlite3 -from typing import List, Iterable, Callable +from typing import List, Iterable -from django.core.exceptions import ImproperlyConfigured - -# Depends on other PyPI/vendor packages: -from pydantic import InstanceOf, Field, model_validator - -# Depends on other Django apps: -from abx.archivebox.base_plugin import BasePlugin -from abx.archivebox.base_configset import BaseConfigSet -from abx.archivebox.base_hook import BaseHook from abx.archivebox.base_searchbackend import BaseSearchBackend -# Depends on Other Plugins: -from archivebox.config.common import SEARCH_BACKEND_CONFIG - - - -###################### Config ########################## - -class SqliteftsConfig(BaseConfigSet): - SQLITEFTS_SEPARATE_DATABASE: bool = Field(default=True, alias='FTS_SEPARATE_DATABASE') - SQLITEFTS_TOKENIZERS: str = Field(default='porter unicode61 remove_diacritics 2', alias='FTS_TOKENIZERS') - SQLITEFTS_MAX_LENGTH: int = Field(default=int(1e9), alias='FTS_SQLITE_MAX_LENGTH') - - # Not really meant to be user-modified, just here as constants - SQLITEFTS_DB: str = Field(default='search.sqlite3') - SQLITEFTS_TABLE: str = Field(default='snapshot_fts') - SQLITEFTS_ID_TABLE: str = Field(default='snapshot_id_fts') - SQLITEFTS_COLUMN: str = Field(default='texts') - - @model_validator(mode='after') - def validate_fts_separate_database(self): - if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sqlite' and self.SQLITEFTS_SEPARATE_DATABASE and not self.SQLITEFTS_DB: - sys.stderr.write('[X] Error: SQLITEFTS_DB must be set if SQLITEFTS_SEPARATE_DATABASE is True\n') - SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') - return self - - @property - def get_connection(self) -> Callable[[], sqlite3.Connection]: - # Make get_connection callable, because `django.db.connection.cursor()` - # has to be called 
to get a context manager, but sqlite3.Connection - # is a context manager without being called. - if self.SQLITEFTS_SEPARATE_DATABASE: - return lambda: sqlite3.connect(self.SQLITEFTS_DB) - else: - from django.db import connection as database - return database.cursor - - @property - def SQLITE_BIND(self) -> str: - if self.SQLITEFTS_SEPARATE_DATABASE: - return "?" - else: - return "%s" - - @property - def SQLITE_LIMIT_LENGTH(self) -> int: - from django.db import connection as database - - # Only Python >= 3.11 supports sqlite3.Connection.getlimit(), - # so fall back to the default if the API to get the real value isn't present - try: - limit_id = sqlite3.SQLITE_LIMIT_LENGTH # type: ignore[attr-defined] - - if self.SQLITEFTS_SEPARATE_DATABASE: - cursor = self.get_connection() - return cursor.connection.getlimit(limit_id) # type: ignore[attr-defined] - else: - with database.temporary_connection() as cursor: # type: ignore[attr-defined] - return cursor.connection.getlimit(limit_id) - except (AttributeError, ImproperlyConfigured): - return self.SQLITEFTS_MAX_LENGTH - -SQLITEFTS_CONFIG = SqliteftsConfig() +from .config import SQLITEFTS_CONFIG @@ -242,20 +171,3 @@ class SqliteftsSearchBackend(BaseSearchBackend): _handle_query_exception(e) SQLITEFTS_SEARCH_BACKEND = SqliteftsSearchBackend() - - - -class SqliteftsSearchPlugin(BasePlugin): - app_label: str ='sqlitefts' - verbose_name: str = 'SQLite FTS5 Search' - - hooks: List[InstanceOf[BaseHook]] = [ - SQLITEFTS_CONFIG, - SQLITEFTS_SEARCH_BACKEND, - ] - - - -PLUGIN = SqliteftsSearchPlugin() -# PLUGIN.register(settings) -DJANGO_APP = PLUGIN.AppConfig diff --git a/archivebox/queues/admin.py b/archivebox/queues/admin.py new file mode 100644 index 00000000..aee5788b --- /dev/null +++ b/archivebox/queues/admin.py @@ -0,0 +1,26 @@ +__package__ = 'archivebox.queues' + +import abx + +from django.contrib.auth import get_permission_codename + +from huey_monitor.apps import HueyMonitorConfig +from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin + + +HueyMonitorConfig.verbose_name = 'Background Workers' + + +class CustomTaskModelAdmin(TaskModelAdmin): + actions = ["delete_selected"] + + def has_delete_permission(self, request, obj=None): + codename = get_permission_codename("delete", self.opts) + return request.user.has_perm("%s.%s" % (self.opts.app_label, codename)) + + + +@abx.hookimpl +def register_admin(admin_site): + admin_site.register(TaskModel, CustomTaskModelAdmin) + admin_site.register(SignalInfoModel, SignalInfoModelAdmin) diff --git a/archivebox/queues/apps.py b/archivebox/queues/apps.py index 1555e810..4a83d483 100644 --- a/archivebox/queues/apps.py +++ b/archivebox/queues/apps.py @@ -1,6 +1,14 @@ from django.apps import AppConfig +import abx + class QueuesConfig(AppConfig): default_auto_field = 'django.db.models.BigAutoField' name = 'queues' + + +@abx.hookimpl +def register_admin(admin_site): + from queues.admin import register_admin + register_admin(admin_site) diff --git a/archivebox/queues/settings.py b/archivebox/queues/settings.py deleted file mode 100644 index ab1a975c..00000000 --- a/archivebox/queues/settings.py +++ /dev/null @@ -1,33 +0,0 @@ -import tempfile -from pathlib import Path - -from archivebox.config import CONSTANTS -from archivebox.config.paths import get_collection_id - -DATA_DIR = CONSTANTS.DATA_DIR -LOGS_DIR = CONSTANTS.LOGS_DIR -TMP_DIR = CONSTANTS.TMP_DIR - -SUPERVISORD_CONFIG_FILE = TMP_DIR / "supervisord.conf" -PID_FILE = TMP_DIR / "supervisord.pid" -SOCK_FILE = TMP_DIR / 
"supervisord.sock" -LOG_FILE = TMP_DIR / "supervisord.log" -WORKERS_DIR = TMP_DIR / "workers" - - -def get_sock_file(): - TMP_DIR.mkdir(parents=True, exist_ok=True) - - if len(str(SOCK_FILE)) > 100: - # socket absolute paths cannot be longer than 108 characters on some systems - # symlink it to a shorter path and use that instead - - # use tmpfile to atomically overwrite any existing symlink - symlink = Path(tempfile.gettempdir()) / f"archivebox_supervisord_{get_collection_id()}.sock.tmp" - symlink.unlink(missing_ok=True) - symlink.symlink_to(SOCK_FILE) - symlink.rename(str(symlink).replace('.sock.tmp', '.sock')) - assert len(str(symlink)) <= 100, f'Failed to create supervisord SOCK_FILE, system tmp dir location is too long {symlink} (unix only allows 108 characters for socket paths)' - return symlink - - return SOCK_FILE diff --git a/archivebox/queues/supervisor_util.py b/archivebox/queues/supervisor_util.py index 1dc87395..f181da08 100644 --- a/archivebox/queues/supervisor_util.py +++ b/archivebox/queues/supervisor_util.py @@ -1,23 +1,39 @@ __package__ = 'archivebox.queues' +import sys import time import signal import psutil import shutil import subprocess + +from typing import Dict, cast, Iterator from pathlib import Path +from functools import cache + from rich import print - -from typing import Dict, cast - from supervisor.xmlrpc import SupervisorTransport from xmlrpc.client import ServerProxy +from archivebox.config import CONSTANTS +from archivebox.config.paths import get_or_create_working_tmp_dir from archivebox.config.permissions import ARCHIVEBOX_USER +from archivebox.misc.logging import STDERR +from archivebox.logging_util import pretty_path -from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, get_sock_file, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR +LOG_FILE_NAME = "supervisord.log" +CONFIG_FILE_NAME = "supervisord.conf" +PID_FILE_NAME = "supervisord.pid" +WORKERS_DIR_NAME = "workers" -from typing import Iterator +@cache +def get_sock_file(): + """Get the path to the supervisord socket file, symlinking to a shorter path if needed due to unix path length limits""" + TMP_DIR = get_or_create_working_tmp_dir(autofix=True, quiet=False) + assert TMP_DIR, "Failed to find or create a writable TMP_DIR!" + socket_file = TMP_DIR / "supervisord.sock" + + return socket_file def follow(file, sleep_sec=0.1) -> Iterator[str]: """ Yield each line from a file as they are written. 
@@ -35,24 +51,30 @@ def follow(file, sleep_sec=0.1) -> Iterator[str]: def create_supervisord_config(): + SOCK_FILE = get_sock_file() + WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME + CONFIG_FILE = SOCK_FILE.parent / CONFIG_FILE_NAME + PID_FILE = SOCK_FILE.parent / PID_FILE_NAME + LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME + config_content = f""" [supervisord] nodaemon = true environment = IS_SUPERVISORD_PARENT="true" -pidfile = {TMP_DIR}/{PID_FILE.name} -logfile = {LOGS_DIR}/{LOG_FILE.name} -childlogdir = {LOGS_DIR} -directory = {DATA_DIR} +pidfile = {PID_FILE} +logfile = {LOG_FILE} +childlogdir = {CONSTANTS.LOGS_DIR} +directory = {CONSTANTS.DATA_DIR} strip_ansi = true nocleanup = true user = {ARCHIVEBOX_USER} [unix_http_server] -file = {get_sock_file()} +file = {SOCK_FILE} chmod = 0700 [supervisorctl] -serverurl = unix://{get_sock_file()} +serverurl = unix://{SOCK_FILE} [rpcinterface:supervisor] supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface @@ -61,9 +83,14 @@ supervisor.rpcinterface_factory = supervisor.rpcinterface:make_main_rpcinterface files = {WORKERS_DIR}/*.conf """ - SUPERVISORD_CONFIG_FILE.write_text(config_content) + CONFIG_FILE.write_text(config_content) + Path.mkdir(WORKERS_DIR, exist_ok=True) + (WORKERS_DIR / 'initial_startup.conf').write_text('') # hides error about "no files found to include" when supervisord starts def create_worker_config(daemon): + SOCK_FILE = get_sock_file() + WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME + Path.mkdir(WORKERS_DIR, exist_ok=True) name = daemon['name'] @@ -80,13 +107,14 @@ def create_worker_config(daemon): def get_existing_supervisord_process(): + SOCK_FILE = get_sock_file() try: - transport = SupervisorTransport(None, None, f"unix://{get_sock_file()}") + transport = SupervisorTransport(None, None, f"unix://{SOCK_FILE}") server = ServerProxy("http://localhost", transport=transport) current_state = cast(Dict[str, int | str], server.supervisor.getState()) if current_state["statename"] == "RUNNING": pid = server.supervisor.getPID() - print(f"[πŸ¦Έβ€β™‚οΈ] Supervisord connected (pid={pid}) via unix://{str(get_sock_file()).replace(str(DATA_DIR), '.')}.") + print(f"[πŸ¦Έβ€β™‚οΈ] Supervisord connected (pid={pid}) via unix://{pretty_path(SOCK_FILE)}.") return server.supervisor except FileNotFoundError: return None @@ -95,58 +123,83 @@ def get_existing_supervisord_process(): return None def stop_existing_supervisord_process(): + SOCK_FILE = get_sock_file() + PID_FILE = SOCK_FILE.parent / PID_FILE_NAME + try: - pid = int(PID_FILE.read_text()) - except FileNotFoundError: - return - except ValueError: - PID_FILE.unlink() - return + try: + pid = int(PID_FILE.read_text()) + except (FileNotFoundError, ValueError): + return - try: - print(f"[πŸ¦Έβ€β™‚οΈ] Stopping supervisord process (pid={pid})...") - proc = psutil.Process(pid) - proc.terminate() - proc.wait() - except Exception: - pass - try: - PID_FILE.unlink() - except FileNotFoundError: - pass + try: + print(f"[πŸ¦Έβ€β™‚οΈ] Stopping supervisord process (pid={pid})...") + proc = psutil.Process(pid) + proc.terminate() + proc.wait() + except (Exception, BrokenPipeError, IOError): + pass + finally: + try: + # clear PID file and socket file + PID_FILE.unlink(missing_ok=True) + get_sock_file().unlink(missing_ok=True) + except Exception: + pass def start_new_supervisord_process(daemonize=False): + SOCK_FILE = get_sock_file() + WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME + LOG_FILE = CONSTANTS.LOGS_DIR / LOG_FILE_NAME + CONFIG_FILE = SOCK_FILE.parent 
/ CONFIG_FILE_NAME + PID_FILE = SOCK_FILE.parent / PID_FILE_NAME + print(f"[πŸ¦Έβ€β™‚οΈ] Supervisord starting{' in background' if daemonize else ''}...") - # Create a config file in the current working directory + pretty_log_path = pretty_path(LOG_FILE) + print(f" > Writing supervisord logs to: {pretty_log_path}") + print(f" > Writing task worker logs to: {pretty_log_path.replace('supervisord.log', 'worker_*.log')}") + print(f' > Using supervisord config file: {pretty_path(CONFIG_FILE)}') + print(f" > Using supervisord UNIX socket: {pretty_path(SOCK_FILE)}") + print() # clear out existing stale state files shutil.rmtree(WORKERS_DIR, ignore_errors=True) PID_FILE.unlink(missing_ok=True) get_sock_file().unlink(missing_ok=True) - SUPERVISORD_CONFIG_FILE.unlink(missing_ok=True) + CONFIG_FILE.unlink(missing_ok=True) + # create the supervisord config file create_supervisord_config() # Start supervisord + # panel = Panel(f"Starting supervisord with config: {SUPERVISORD_CONFIG_FILE}") + # with Live(panel, refresh_per_second=1) as live: + subprocess.Popen( - f"supervisord --configuration={SUPERVISORD_CONFIG_FILE}", + f"supervisord --configuration={CONFIG_FILE}", stdin=None, shell=True, start_new_session=daemonize, ) def exit_signal_handler(signum, frame): - if signum != 13: - print(f"\n[πŸ¦Έβ€β™‚οΈ] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...") + if signum == 2: + STDERR.print("\n[πŸ›‘] Got Ctrl+C. Terminating child processes...") + elif signum != 13: + STDERR.print(f"\n[πŸ¦Έβ€β™‚οΈ] Supervisord got stop signal ({signal.strsignal(signum)}). Terminating child processes...") stop_existing_supervisord_process() raise SystemExit(0) # Monitor for termination signals and cleanup child processes if not daemonize: - signal.signal(signal.SIGINT, exit_signal_handler) - signal.signal(signal.SIGHUP, exit_signal_handler) - signal.signal(signal.SIGPIPE, exit_signal_handler) - signal.signal(signal.SIGTERM, exit_signal_handler) + try: + signal.signal(signal.SIGINT, exit_signal_handler) + signal.signal(signal.SIGHUP, exit_signal_handler) + signal.signal(signal.SIGPIPE, exit_signal_handler) + signal.signal(signal.SIGTERM, exit_signal_handler) + except Exception: + # signal handlers only work in main thread + pass # otherwise supervisord will containue in background even if parent proc is ends (aka daemon mode) time.sleep(2) @@ -154,14 +207,32 @@ def start_new_supervisord_process(daemonize=False): return get_existing_supervisord_process() def get_or_create_supervisord_process(daemonize=False): + SOCK_FILE = get_sock_file() + WORKERS_DIR = SOCK_FILE.parent / WORKERS_DIR_NAME + supervisor = get_existing_supervisord_process() if supervisor is None: stop_existing_supervisord_process() supervisor = start_new_supervisord_process(daemonize=daemonize) time.sleep(0.5) + # wait up to 5s in case supervisord is slow to start + if not supervisor: + for _ in range(10): + if supervisor is not None: + print() + break + sys.stdout.write('.') + sys.stdout.flush() + time.sleep(0.5) + supervisor = get_existing_supervisord_process() + else: + print() + assert supervisor, "Failed to start supervisord or connect to it!" 
supervisor.getPID() # make sure it doesn't throw an exception + + (WORKERS_DIR / 'initial_startup.conf').unlink(missing_ok=True) return supervisor @@ -242,9 +313,9 @@ def tail_worker_logs(log_path: str): for line in follow(f): if '://' in line: live.console.print(f"Working on: {line.strip()}") - table.add_row("123124234", line.strip()) - except KeyboardInterrupt: - print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") + # table.add_row("123124234", line.strip()) + except (KeyboardInterrupt, BrokenPipeError, IOError): + STDERR.print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") except SystemExit: pass @@ -321,12 +392,12 @@ def start_server_workers(host='0.0.0.0', port='8000', daemonize=False): if not daemonize: try: watch_worker(supervisor, "worker_daphne") - except KeyboardInterrupt: - print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") + except (KeyboardInterrupt, BrokenPipeError, IOError): + STDERR.print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") except SystemExit: pass except BaseException as e: - print(f"\n[πŸ›‘] Got {e.__class__.__name__} exception, stopping web server gracefully...") + STDERR.print(f"\n[πŸ›‘] Got {e.__class__.__name__} exception, stopping web server gracefully...") raise finally: stop_worker(supervisor, "worker_daphne") @@ -350,12 +421,12 @@ def start_cli_workers(watch=False): if watch: try: watch_worker(supervisor, "worker_system_tasks") - except KeyboardInterrupt: - print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") + except (KeyboardInterrupt, BrokenPipeError, IOError): + STDERR.print("\n[πŸ›‘] Got Ctrl+C, stopping gracefully...") except SystemExit: pass except BaseException as e: - print(f"\n[πŸ›‘] Got {e.__class__.__name__} exception, stopping web server gracefully...") + STDERR.print(f"\n[πŸ›‘] Got {e.__class__.__name__} exception, stopping web server gracefully...") raise finally: stop_worker(supervisor, "worker_system_tasks") diff --git a/archivebox/queues/tasks.py b/archivebox/queues/tasks.py index dd22bbd6..acfeab0b 100644 --- a/archivebox/queues/tasks.py +++ b/archivebox/queues/tasks.py @@ -1,5 +1,8 @@ __package__ = 'archivebox.queues' +from functools import wraps +from django.utils import timezone + from django_huey import db_task, task from huey_monitor.models import TaskModel @@ -7,6 +10,38 @@ from huey_monitor.tqdm import ProcessInfo from .supervisor_util import get_or_create_supervisord_process +# @db_task(queue="system_tasks", context=True, schedule=1) +# def scheduler_tick(): +# print('SCHEDULER TICK', timezone.now().isoformat()) +# # abx.archivebox.events.on_scheduler_runloop_start(timezone.now(), machine=Machine.objects.get_current_machine()) + +# # abx.archivebox.events.on_scheduler_tick_start(timezone.now(), machine=Machine.objects.get_current_machine()) + +# scheduled_crawls = CrawlSchedule.objects.filter(is_enabled=True) +# scheduled_crawls_due = scheduled_crawls.filter(next_run_at__lte=timezone.now()) + +# for scheduled_crawl in scheduled_crawls_due: +# try: +# abx.archivebox.events.on_crawl_schedule_tick(scheduled_crawl) +# except Exception as e: +# abx.archivebox.events.on_crawl_schedule_failure(timezone.now(), machine=Machine.objects.get_current_machine(), error=e, schedule=scheduled_crawl) + +# # abx.archivebox.events.on_scheduler_tick_end(timezone.now(), machine=Machine.objects.get_current_machine(), tasks=scheduled_tasks_due) + +def db_task_with_parent(func): + """Decorator for db_task that sets the parent task for the db_task""" + + @wraps(func) + def wrapper(*args, **kwargs): + task = kwargs.get('task') + parent_task_id = 
kwargs.get('parent_task_id') + + if task and parent_task_id: + TaskModel.objects.set_parent_task(main_task_id=parent_task_id, sub_task_id=task.id) + + return func(*args, **kwargs) + + return wrapper @db_task(queue="system_tasks", context=True) def bg_add(add_kwargs, task=None, parent_task_id=None): diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index f7394171..2e7d4f69 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -6,7 +6,7 @@ from typing import List, Union from django.db.models import QuerySet from django.conf import settings -import abx.archivebox.use +import abx.archivebox.reads from archivebox.index.schema import Link from archivebox.misc.util import enforce_types @@ -57,7 +57,7 @@ def get_indexable_content(results: QuerySet): def import_backend(): - for backend in abx.archivebox.use.get_SEARCHBACKENDS().values(): + for backend in abx.archivebox.reads.get_SEARCHBACKENDS().values(): if backend.name == SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE: return backend raise Exception(f'Could not load {SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE} as search backend') diff --git a/archivebox/core/mixins.py b/archivebox/search/admin.py similarity index 96% rename from archivebox/core/mixins.py rename to archivebox/search/admin.py index 6dbab974..42aadf6f 100644 --- a/archivebox/core/mixins.py +++ b/archivebox/search/admin.py @@ -1,3 +1,5 @@ +__package__ = 'archivebox.search' + from django.contrib import messages from archivebox.search import query_search_index diff --git a/archivebox/seeds/__init__.py b/archivebox/seeds/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/seeds/admin.py b/archivebox/seeds/admin.py new file mode 100644 index 00000000..8c38f3f3 --- /dev/null +++ b/archivebox/seeds/admin.py @@ -0,0 +1,3 @@ +from django.contrib import admin + +# Register your models here. diff --git a/archivebox/seeds/apps.py b/archivebox/seeds/apps.py new file mode 100644 index 00000000..38eb4fde --- /dev/null +++ b/archivebox/seeds/apps.py @@ -0,0 +1,6 @@ +from django.apps import AppConfig + + +class SeedsConfig(AppConfig): + default_auto_field = "django.db.models.BigAutoField" + name = "seeds" diff --git a/archivebox/seeds/migrations/__init__.py b/archivebox/seeds/migrations/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/archivebox/seeds/models.py b/archivebox/seeds/models.py new file mode 100644 index 00000000..b0d83b2e --- /dev/null +++ b/archivebox/seeds/models.py @@ -0,0 +1,67 @@ +__package__ = 'archivebox.seeds' + + +from datetime import datetime + +from django_stubs_ext.db.models import TypedModelMeta + +from django.db import models +from django.db.models import Q +from django.core.validators import MaxValueValidator, MinValueValidator +from django.conf import settings +from django.utils import timezone +from django.utils.functional import cached_property +from django.urls import reverse_lazy + +from pathlib import Path + + +from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats + + +class Seed(ABIDModel, ModelWithHealthStats): + """ + A fountain that produces URLs (+metadata) each time it's queried e.g. + - file:///data/sources/2024-01-02_11-57-51__cli_add.txt + - file:///data/sources/2024-01-02_11-57-51__web_ui_add.txt + - file:///Users/squash/Library/Application Support/Google/Chrome/Default/Bookmarks + - https://getpocket.com/user/nikisweeting/feed + - https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml + - ... 
+ Each query of a Seed can produce the same list of URLs, or a different list each time. + The list of URLs it returns is used to create a new Crawl and seed it with new pending Snapshots. + + When a crawl is created, a root_snapshot is initially created with a URI set to the Seed URI. + The seed's preferred extractor is executed on that URI, which produces an ArchiveResult containing outlinks. + The outlinks then get turned into new pending Snapshots under the same crawl, + and the cycle repeats until Crawl.max_depth. + + Each consumption of a Seed by an Extractor can produce new urls, as Seeds can point to + stateful remote services, files with contents that change, directories that have new files within, etc. + """ + + abid_prefix = 'src_' + abid_ts_src = 'self.created_at' + abid_uri_src = 'self.uri' + abid_subtype_src = 'self.extractor' + abid_rand_src = 'self.id' + abid_drift_allowed = True + + uri = models.URLField(max_length=255, blank=False, null=False, unique=True) # unique source location where URLs will be loaded from + + extractor = models.CharField(default='auto', max_length=32) # suggested extractor to use to load this URL source + tags_str = models.CharField(max_length=255, null=False, blank=True, default='') # tags to attach to any URLs that come from this source + config = models.JSONField(default=dict) # extra config to put in scope when loading URLs from this source + + created_at = AutoDateTimeField(default=None, null=False, db_index=True) + modified_at = models.DateTimeField(auto_now=True) + created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False) + + @property + def source_type(self): + # e.g. http/https:// + # file:// + # pocketapi:// + # s3:// + # etc.. + return self.uri.split('://')[0].lower() diff --git a/archivebox/seeds/tests.py b/archivebox/seeds/tests.py new file mode 100644 index 00000000..7ce503c2 --- /dev/null +++ b/archivebox/seeds/tests.py @@ -0,0 +1,3 @@ +from django.test import TestCase + +# Create your tests here. diff --git a/archivebox/seeds/views.py b/archivebox/seeds/views.py new file mode 100644 index 00000000..91ea44a2 --- /dev/null +++ b/archivebox/seeds/views.py @@ -0,0 +1,3 @@ +from django.shortcuts import render + +# Create your views here. 
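The `Seed` model docstring above describes the Seed → Crawl → Snapshot lifecycle in prose. As a rough illustration of how such a record might be created and inspected (a hypothetical sketch, not code from this patch — it assumes migrations for the new `seeds` app have been generated and applied, and that the `AutoDateTimeField`/ABID fields populate themselves on save):

```python
# Hypothetical usage sketch for the new Seed model, e.g. from `archivebox shell`.
# The URI, tag, and username values below are invented for illustration.
from django.contrib.auth import get_user_model
from seeds.models import Seed

User = get_user_model()

seed = Seed.objects.create(
    uri='https://getpocket.com/user/example/feed',   # remote source that yields URLs when queried
    extractor='auto',                                # let the parser be auto-detected
    tags_str='pocket,feeds',                         # tags to attach to URLs pulled from this seed
    created_by=User.objects.get(username='admin'),
)

# source_type is derived from the scheme portion of the URI
assert seed.source_type == 'https'
```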
diff --git a/archivebox/templates/static/admin.css b/archivebox/templates/static/admin.css index 103e5bf3..9cd14b78 100755 --- a/archivebox/templates/static/admin.css +++ b/archivebox/templates/static/admin.css @@ -68,6 +68,21 @@ body.model-snapshot.change-list #content .object-tools { background: #772948; } +#content .adv-data textarea { + width: 82vw; + max-width: 100%; + min-height: 100px; + height: auto; + background-color: #145454; + color: #f1f1fd; + font-size: 12px; + font-family: monospace; + border-radius: 8px; + line-height: 1.2; + padding: 6px 9px; +} + + #content .object-tools { margin-top: -35px; margin-right: -10px; diff --git a/pyproject.toml b/pyproject.toml index 91587387..a56e3948 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,7 +75,7 @@ dependencies = [ "python-benedict[io,parse]>=0.33.2", "pydantic-settings>=2.5.2", "atomicwrites==1.4.1", - "django-taggit==1.3.0", + "django-taggit==6.1.0", "base32-crockford==0.3.0", # "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7", "pydantic-pkgr>=0.5.3", diff --git a/uv.lock b/uv.lock index f0456652..bbe8cdc7 100644 --- a/uv.lock +++ b/uv.lock @@ -141,7 +141,7 @@ requires-dist = [ { name = "django-pydantic-field", specifier = ">=0.3.10" }, { name = "django-signal-webhooks", specifier = ">=0.3.0" }, { name = "django-stubs", specifier = ">=5.0.4" }, - { name = "django-taggit", specifier = "==1.3.0" }, + { name = "django-taggit", specifier = "==6.1.0" }, { name = "feedparser", specifier = ">=6.0.11" }, { name = "ipython", specifier = ">=8.27.0" }, { name = "mypy-extensions", specifier = ">=1.0.0" }, @@ -882,14 +882,14 @@ wheels = [ [[package]] name = "django-taggit" -version = "1.3.0" +version = "6.1.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "django" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c2/9e/1f8a8511d58f0c9fa539a93581a3744d93b46316d40c7d297464c57e9b50/django-taggit-1.3.0.tar.gz", hash = "sha256:4a833bf71f4c2deddd9745924eee53be1c075d7f0020a06f12e29fa3d752732d", size = 46986 } +sdist = { url = "https://files.pythonhosted.org/packages/34/a6/f1beaf8f552fe90c153cc039316ebab942c23dfbc88588dde081fefca816/django_taggit-6.1.0.tar.gz", hash = "sha256:c4d1199e6df34125dd36db5eb0efe545b254dec3980ce5dd80e6bab3e78757c3", size = 38151 } wheels = [ - { url = "https://files.pythonhosted.org/packages/56/3e/dde2d7151bc0c4ac65d225e611a85e54a897c551507e8eca2c06a083f3f4/django_taggit-1.3.0-py3-none-any.whl", hash = "sha256:609b0223d8a652f3fae088b7fd29f294fdadaca2d7931d45c27d6c59b02fdf31", size = 45709 }, + { url = "https://files.pythonhosted.org/packages/6b/34/4185c345530b91d05cb82e05d07148f481a5eb5dc2ac44e092b3daa6f206/django_taggit-6.1.0-py3-none-any.whl", hash = "sha256:ab776264bbc76cb3d7e49e1bf9054962457831bd21c3a42db9138b41956e4cf0", size = 75749 }, ] [[package]] diff --git a/website/shadcn-theme.css b/website/shadcn-theme.css new file mode 100644 index 00000000..6655a907 --- /dev/null +++ b/website/shadcn-theme.css @@ -0,0 +1,47 @@ +@layer base { + :root { + --background: 339 49% 100%; + --foreground: 339 66% 4%; + --muted: 99 20% 93%; + --muted-foreground: 99 9% 39%; + --popover: 339 49% 100%; + --popover-foreground: 339 66% 4%; + --card: 339 49% 100%; + --card-foreground: 339 66% 4%; + --border: 339 7% 94%; + --input: 339 7% 94%; + --primary: 339 48% 41%; + --primary-foreground: 0 0% 100%; + --secondary: 99 48% 41%; + --secondary-foreground: 0 0% 100%; + --accent: 219 48% 41%; + --accent-foreground: 0 0% 100%; + --destructive: 19 85% 36%; + --destructive-foreground: 
0 0% 100%; + --ring: 339 48% 41%; + --radius: 0.5rem; + } + + .dark { + --background: 339 32% 4%; + --foreground: 339 18% 99%; + --muted: 99 20% 7%; + --muted-foreground: 99 9% 61%; + --popover: 339 32% 4%; + --popover-foreground: 339 18% 99%; + --card: 339 32% 4%; + --card-foreground: 339 18% 99%; + --border: 339 7% 12%; + --input: 339 7% 12%; + --primary: 339 48% 41%; + --primary-foreground: 0 0% 100%; + --secondary: 99 48% 41%; + --secondary-foreground: 0 0% 100%; + --accent: 219 48% 41%; + --accent-foreground: 0 0% 100%; + --destructive: 19 85% 53%; + --destructive-foreground: 0 0% 0%; + --ring: 339 48% 41%; + } + } + \ No newline at end of file
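Taken together, the recurring pattern in this diff is that each `plugins_search/*` package drops its old `BasePlugin`-based `apps.py` and instead exposes plain `@abx.hookimpl` functions from `__init__.py`, which `archivebox/search/import_backend()` then discovers via `abx.archivebox.reads.get_SEARCHBACKENDS()` and filters by `SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE`. A minimal sketch of what a third-party backend could look like under this scheme (the package name `plugins_search.grepdb` and the class are invented for illustration; only the hook name and the `BaseSearchBackend` interface come from the patch itself):

```python
__package__ = 'plugins_search.grepdb'   # hypothetical plugin package

from typing import Iterable, List

import abx
from abx.archivebox.base_searchbackend import BaseSearchBackend


class GrepDbSearchBackend(BaseSearchBackend):
    name: str = 'grepdb'

    @staticmethod
    def index(snapshot_id: str, texts: List[str]):
        ...  # write the snapshot's texts into this backend's index

    @staticmethod
    def flush(snapshot_ids: Iterable[str]):
        ...  # remove the given snapshots from the index

    @staticmethod
    def search(text: str) -> List[str]:
        return []  # return matching Snapshot pks as strings


GREPDB_SEARCH_BACKEND = GrepDbSearchBackend()


@abx.hookimpl
def get_SEARCHBACKENDS():
    # aggregated by abx and read back via abx.archivebox.reads.get_SEARCHBACKENDS()
    return {'grepdb': GREPDB_SEARCH_BACKEND}
```

To activate such a backend, a user would presumably set `SEARCH_BACKEND_ENGINE=grepdb`, mirroring how the ripgrep/sonic/sqlitefts backends are selected in this patch.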