Merge branch 'newmodels' into dev

Nick Sweeting 2024-10-21 00:38:56 -07:00
commit 354c1ede35
185 changed files with 5524 additions and 3819 deletions

View file

@ -300,10 +300,15 @@ RUN --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
RUN openssl rand -hex 16 > /etc/machine-id \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
&& mkdir -p "/tmp/archivebox" \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp/archivebox" \
&& mkdir -p "/usr/share/archivebox/lib" \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/usr/share/archivebox/lib" \
ENV GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
TMP_DIR=/tmp/archivebox \
LIB_DIR=/usr/share/archivebox/lib \
ALLOWED_HOSTS=*
# Print version for nice docker finish summary

View file

@ -130,7 +130,7 @@ curl -fsSL 'https://get.archivebox.io' | sh
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (yt-dlp), articles (readability), code (git), etc.](#output-formats)
- [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
- [**Uses standard, durable, long-term formats**](#output-formats) like HTML, JSON, PDF, PNG, MP4, TXT, and WARC
- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox)
- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox)
- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode)
- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!)
- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345)

archivebox/Architecture.md (new file, 172 lines)
View file

@ -0,0 +1,172 @@
# ArchiveBox UI
## Page: Getting Started
### What do you want to capture?
- Save some URLs now -> [Add page]
- Paste some URLs to archive now
- Upload a file containing URLs (bookmarks.html export, RSS.xml feed, markdown file, word doc, PDF, etc.)
- Pull in URLs to archive from a remote location (e.g. RSS feed URL, remote TXT file, JSON file, etc.)
- Import URLs from a browser -> [Import page]
- Desktop: Get the ArchiveBox Chrome/Firefox extension
- Mobile: Get the ArchiveBox iOS App / Android App
- Upload a bookmarks.html export file
- Upload a browser_history.sqlite3 export file
- Import URLs from a 3rd party bookmarking service -> [Sync page]
- Pocket
- Pinboard
- Instapaper
- Wallabag
- Zapier, N8N, IFTTT, etc.
- Upload a bookmarks.html export, bookmarks.json, RSS, etc. file
- Archive URLs on a schedule -> [Schedule page]
- Archive an entire website -> [Crawl page]
- What starting URL/domain?
- How deep?
- Follow links to external domains?
- Follow links to parent URLs?
- Maximum number of pages to save?
- Maximum number of requests/minute?
- Crawl for URLs with a search engine and save automatically
- Some URLs on a schedule
- Save an entire website (e.g. `https://example.com`)
- Save results matching a search query (e.g. "site:example.com")
- Save a social media feed (e.g. `https://x.com/user/1234567890`)
--------------------------------------------------------------------------------
### Crawls App
- Archive an entire website -> [Crawl page]
- What are the seed URLs?
- How many hops to follow?
- Follow links to external domains?
- Follow links to parent URLs?
- Maximum number of pages to save?
- Maximum number of requests/minute?
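These parameters map naturally onto a Django model. A minimal sketch, with hypothetical field names (not the final schema):

```python
from django.db import models

class Crawl(models.Model):
    seed_urls = models.TextField()                                    # what are the seed URLs?
    max_depth = models.PositiveSmallIntegerField(default=0)           # how many hops to follow?
    follow_external = models.BooleanField(default=False)              # follow links to external domains?
    follow_parents = models.BooleanField(default=False)               # follow links to parent URLs?
    max_pages = models.PositiveIntegerField(null=True, blank=True)    # maximum number of pages to save
    max_requests_per_minute = models.PositiveIntegerField(null=True, blank=True)  # rate limit
```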
--------------------------------------------------------------------------------
### Scheduler App
- Archive URLs on a schedule -> [Schedule page]
- What URL(s)?
- How often?
- Do you want to discard old snapshots after x amount of time?
- Any filter rules?
- Want to be notified when changes are detected -> redirect[Alerts app/create new alert(crawl=self)]
* Choose Schedule to check for new URLs: Schedule.objects.get(pk=xyz)
- 1 minute
- 5 minutes
- 1 hour
- 1 day
* Choose destination Crawl to archive URLs with: Crawl.objects.get(pk=xyz)
- Tags
- Persona
- Created By ID
- Config
- Filters
- URL patterns to include
- URL patterns to exclude
- ONLY_NEW= Ignore URLs if already saved once / save URL each time it appears / only save if last save > x time ago
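A rough sketch of the schedule record implied by the questions above (all names are hypothetical):

```python
from django.db import models

class CrawlSchedule(models.Model):
    CHECK_INTERVALS = [('1m', '1 minute'), ('5m', '5 minutes'), ('1h', '1 hour'), ('1d', '1 day')]

    interval = models.CharField(max_length=8, choices=CHECK_INTERVALS)  # how often to check for new URLs
    crawl = models.ForeignKey('Crawl', on_delete=models.CASCADE)        # destination Crawl template
    retention = models.DurationField(null=True, blank=True)             # discard old snapshots after this long
    filter_rules = models.JSONField(default=dict)                       # include/exclude patterns, ONLY_NEW policy
```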
--------------------------------------------------------------------------------
### Sources App (For managing sources that ArchiveBox pulls URLs in from)
- Add a new source to pull URLs in from (WIZARD)
- Choose URI:
- [x] Web UI
- [x] CLI
- Local filesystem path (directory to monitor for new files containing URLs)
- Remote URL (RSS/JSON/XML feed)
- Chrome browser profile sync (login using gmail to pull bookmarks/history)
- Pocket, Pinboard, Instapaper, Wallabag, etc.
- Zapier, N8N, IFTTT, etc.
- Local server filesystem path (directory to monitor for new files containing URLs)
- Google drive (directory to monitor for new files containing URLs)
- Remote server FTP/SFTP/SCP path (directory to monitor for new files containing URLs)
- AWS/S3/B2/GCP bucket (directory to monitor for new files containing URLs)
- XBrowserSync (login to pull bookmarks)
- Choose extractor
- auto
- RSS
- Pocket
- etc.
- Specify extra Config, e.g.
- credentials
- extractor tuning options (e.g. verify_ssl, cookies, etc.)
- Provide credentials for the source
- API Key
- Username / Password
- OAuth
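The same source options could be sketched as a model (field names and types are assumptions):

```python
from django.db import models

class Source(models.Model):
    uri = models.CharField(max_length=255)                       # file:// path, https:// feed, s3:// bucket, pocket://, etc.
    extractor = models.CharField(max_length=32, default='auto')  # auto / RSS / Pocket / ...
    config = models.JSONField(default=dict)                      # extractor tuning options, e.g. verify_ssl, cookies
    credentials = models.JSONField(default=dict)                 # API key / username+password / OAuth tokens
```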
--------------------------------------------------------------------------------
### Alerts App
- Create a new alert, choose condition
- Get notified when a site goes down (<x% success ratio for Snapshots)
- Get notified when a site changes visually more than x% (screenshot diff)
- Get notified when a site's text content changes more than x% (text diff)
- Get notified when a keyword appears
- Get notified when a keyword disappears
- When an AI prompt returns some result
- Choose alert threshold:
- any condition is met
- all conditions are met
- condition is met for x% of URLs
- condition is met for x% of time
- Choose how to notify: (List[AlertDestination])
- maximum alert frequency
- destination type: email / Slack / Webhook / Google Sheet / logfile
- destination info:
- email address(es)
- Slack channel
- Webhook URL
- Choose scope:
- Choose ArchiveResult scope (extractors): (a query that returns ArchiveResult.objects QuerySet)
- All extractors
- Only screenshots
- Only readability / mercury text
- Only video
- Only html
- Only headers
- Choose Snapshot scope (URL): (a query that returns Snapshot.objects QuerySet)
- All domains
- Specific domain
- All domains in a tag
- All domains in a tag category
- All URLs matching a certain regex pattern
- Choose crawl scope: (a query that returns Crawl.objects QuerySet)
- All crawls
- Specific crawls
- crawls by a certain user
- crawls using a certain persona
class AlertDestination(models.Model):
destination_type: [email, slack, webhook, google_sheet, local logfile, b2/s3/gcp bucket, etc.]
maximum_frequency
filter_rules
credentials
alert_template: JINJA2 json/text template that gets populated with alert contents
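Fleshed out as a runnable Django model, the AlertDestination sketch above might look like this (field types are assumptions):

```python
from django.db import models

class AlertDestination(models.Model):
    DESTINATION_TYPES = [
        ('email', 'email'), ('slack', 'slack'), ('webhook', 'webhook'),
        ('google_sheet', 'google_sheet'), ('logfile', 'logfile'), ('bucket', 'bucket'),
    ]

    destination_type = models.CharField(max_length=32, choices=DESTINATION_TYPES)
    maximum_frequency = models.DurationField(null=True, blank=True)  # rate-limit how often alerts can fire
    filter_rules = models.JSONField(default=dict)
    credentials = models.JSONField(default=dict)
    alert_template = models.TextField(blank=True)  # Jinja2 json/text template populated with alert contents
```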

View file

@ -56,7 +56,7 @@ from .config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .config.version import VERSION # noqa
__version__ = VERSION
__author__ = 'Nick Sweeting'
__author__ = 'ArchiveBox'
__license__ = 'MIT'
ASCII_ICON = """

View file

@ -10,12 +10,11 @@ from django.shortcuts import redirect
from django_object_actions import DjangoObjectActions, action
from api.auth import get_or_create_api_token
from archivebox.misc.util import parse_date
from .abid import ABID
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):
"""highlight each character in red that differs with the char at the same index in compare_val"""
@ -37,6 +36,8 @@ def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color
))
def get_abid_info(self, obj, request=None):
from archivebox.api.auth import get_or_create_api_token
try:
#abid_diff = f' != obj.ABID: {highlight_diff(obj.ABID, obj.abid)} ❌' if str(obj.ABID) != str(obj.abid) else ' == .ABID ✅'

View file

@ -321,6 +321,44 @@ class ABIDModel(models.Model):
def get_absolute_url(self):
return self.api_docs_url
class ModelWithHealthStats(models.Model):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
class Meta:
abstract = True
def record_health_failure(self) -> None:
self.num_uses_failed += 1
self.save()
def record_health_success(self) -> None:
self.num_uses_succeeded += 1
self.save()
def reset_health(self) -> None:
# move all the failures to successes when resetting so we don't lose track of the total count
self.num_uses_succeeded = self.num_uses_failed + self.num_uses_succeeded
self.num_uses_failed = 0
self.save()
@property
def health(self) -> int:
total_uses = max((self.num_uses_failed + self.num_uses_succeeded, 1))
success_pct = (self.num_uses_succeeded / total_uses) * 100
return round(success_pct)
####################################################
# Django helpers
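A minimal usage sketch of the new mixin above, assuming a hypothetical InstalledBinary model:

```python
import subprocess
from django.db import models

class InstalledBinary(ModelWithHealthStats):   # inherits num_uses_failed/succeeded + health helpers
    name = models.CharField(max_length=63)

def use_binary(binary: InstalledBinary) -> None:
    try:
        subprocess.run([binary.name, '--version'], check=True, capture_output=True)
        binary.record_health_success()
    except Exception:
        binary.record_health_failure()
    # binary.health is a 0-100 success percentage
```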

View file

@ -2,11 +2,11 @@ __package__ = 'abx'
import importlib
from pathlib import Path
from typing import Dict
from typing import Dict, Callable, List
from . import hookspec as base_spec
from .hookspec import hookimpl, hookspec # noqa
from .manager import pm, PluginManager # noqa
from abx.hookspec import hookimpl, hookspec # noqa
from abx.manager import pm, PluginManager # noqa
pm.add_hookspecs(base_spec)
@ -23,21 +23,28 @@ def get_plugin_order(plugin_entrypoint: Path):
pass
return (order, plugin_entrypoint)
def register_hookspecs(hookspecs):
def register_hookspecs(hookspecs: List[str]):
"""
Register all the hookspecs from a list of module names.
"""
for hookspec_import_path in hookspecs:
hookspec_module = importlib.import_module(hookspec_import_path)
pm.add_hookspecs(hookspec_module)
def find_plugins_in_dir(plugins_dir: Path, prefix: str) -> Dict[str, Path]:
"""
Find all the plugins in a given directory. Just looks for an __init__.py file.
"""
return {
f"{prefix}.{plugin_entrypoint.parent.name}": plugin_entrypoint.parent
for plugin_entrypoint in sorted(plugins_dir.glob("*/apps.py"), key=get_plugin_order)
for plugin_entrypoint in sorted(plugins_dir.glob("*/__init__.py"), key=get_plugin_order)
if plugin_entrypoint.parent.name != 'abx'
} # "plugins_pkg.pip": "/app/archivebox/plugins_pkg/pip"
def get_pip_installed_plugins(group='abx'):
"""replaces pm.load_setuptools_entrypoints("abx")"""
"""replaces pm.load_setuptools_entrypoints("abx"), finds plugins that registered entrypoints via pip"""
import importlib.metadata
DETECTED_PLUGINS = {} # module_name: module_dir_path
@ -52,6 +59,9 @@ def get_pip_installed_plugins(group='abx'):
def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]):
"""
Get the mapping of dir_name: {plugin_id: plugin_dir} for all plugins in the given directories.
"""
DETECTED_PLUGINS = {}
for plugin_prefix, plugin_dir in plugin_dirs.items():
DETECTED_PLUGINS.update(find_plugins_in_dir(plugin_dir, prefix=plugin_prefix))
@ -61,6 +71,9 @@ def get_plugins_in_dirs(plugin_dirs: Dict[str, Path]):
# Load all plugins from pip packages, archivebox built-ins, and user plugins
def load_plugins(plugins_dict: Dict[str, Path]):
"""
Load all the plugins from a dictionary of module names and directory paths.
"""
LOADED_PLUGINS = {}
for plugin_module, plugin_dir in plugins_dict.items():
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
@ -71,6 +84,9 @@ def load_plugins(plugins_dict: Dict[str, Path]):
return LOADED_PLUGINS
def get_registered_plugins():
"""
Get all the plugins registered with Pluggy.
"""
plugins = {}
plugin_to_distinfo = dict(pm.list_plugin_distinfo())
for plugin in pm.get_plugins():
@ -88,3 +104,28 @@ def get_registered_plugins():
return plugins
def get_plugin_hooks(plugin_pkg: str | None) -> Dict[str, Callable]:
"""
Get all the functions marked with @hookimpl on a module.
"""
if not plugin_pkg:
return {}
hooks = {}
plugin_module = importlib.import_module(plugin_pkg)
for attr_name in dir(plugin_module):
if attr_name.startswith('_'):
continue
try:
attr = getattr(plugin_module, attr_name)
if isinstance(attr, Callable):
hooks[attr_name] = None
pm.parse_hookimpl_opts(plugin_module, attr_name)
hooks[attr_name] = attr
except Exception as e:
print(f'Error getting hookimpls for {plugin_pkg}: {e}')
return hooks
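Putting the helpers above together, a typical startup sequence might look like this (the directory layout and plugin id are hypothetical):

```python
from pathlib import Path
import abx

# 1. register hookspec modules so Pluggy knows the valid hook signatures
abx.register_hookspecs(['abx.hookspec'])

# 2. discover plugins installed via pip and plugins in local dirs (looks for __init__.py)
plugins = {
    **abx.get_pip_installed_plugins(group='abx'),
    **abx.get_plugins_in_dirs({'user_plugins': Path('./user_plugins')}),
}

# 3. import + register each plugin module with the plugin manager
abx.load_plugins(plugins)

# 4. inspect what got registered, or list the @hookimpl functions on one plugin
print(abx.get_registered_plugins().keys())
print(abx.get_plugin_hooks('user_plugins.my_plugin').keys())  # hypothetical plugin id
```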

View file

@ -10,31 +10,21 @@ from pathlib import Path
def load_archivebox_plugins(pm, plugins_dict: Dict[str, Path]):
"""Load archivebox plugins, very similar to abx.load_plugins but it looks for a pydantic PLUGIN model + hooks in apps.py"""
LOADED_PLUGINS = {}
for plugin_module, plugin_dir in plugins_dict.items():
for plugin_module, plugin_dir in reversed(plugins_dict.items()):
# print(f'Loading plugin: {plugin_module} from {plugin_dir}')
archivebox_plugins_found = []
# 1. register the plugin module directly in case it contains any loose hookimpls (e.g. in __init__.py)
try:
plugin_module_loaded = importlib.import_module(plugin_module)
pm.register(plugin_module_loaded)
if hasattr(plugin_module_loaded, 'PLUGIN'):
archivebox_plugins_found.append(plugin_module_loaded.PLUGIN)
except Exception as e:
print(f'Error registering plugin: {plugin_module} - {e}')
# 2. then try to import plugin_module.apps as well
if os.access(plugin_dir / 'apps.py', os.R_OK):
plugin_apps = importlib.import_module(plugin_module + '.apps')
pm.register(plugin_apps) # register the whole .apps in case it contains loose hookimpls (not in a class)
if hasattr(plugin_apps, 'PLUGIN'):
archivebox_plugins_found.append(plugin_apps.PLUGIN)
# 3. then try to look for plugin_module.PLUGIN and register it + all its hooks
for ab_plugin in archivebox_plugins_found:
pm.register(ab_plugin)
for hook in ab_plugin.hooks:
hook.__signature__ = hook.__class__.__signature__ # fix to make pydantic model usable as Pluggy plugin
pm.register(hook)
LOADED_PLUGINS[plugin_module] = ab_plugin
# print(f' √ Loaded plugin: {LOADED_PLUGINS}')
# print(f' √ Loaded plugin: {plugin_module} {len(archivebox_plugins_found) * "🧩"}')
return LOADED_PLUGINS

View file

@ -1,38 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Dict
import abx
from .base_hook import BaseHook, HookType
class BaseAdminDataView(BaseHook):
hook_type: HookType = "ADMINDATAVIEW"
name: str = 'example_admin_data_view_list'
verbose_name: str = 'Data View'
route: str = '/__OVERRIDE_THIS__/'
view: str = 'plugins_example.example.views.example_view_list'
items: Dict[str, str] = {
'route': '<str:key>/',
"name": 'example_admin_data_view_item',
'view': 'plugins_example.example.views.example_view_item',
}
@abx.hookimpl
def get_ADMINDATAVIEWS(self):
return [self]
@abx.hookimpl
def get_ADMIN_DATA_VIEWS_URLS(self):
"""routes to be added to django.conf.settings.ADMIN_DATA_VIEWS['urls']"""
route = {
"route": self.route,
"view": self.view,
"name": self.verbose_name,
"items": self.items,
}
return [route]

View file

@ -18,12 +18,9 @@ from archivebox.config import CONSTANTS
from archivebox.config.permissions import ARCHIVEBOX_USER
import abx
from .base_hook import BaseHook, HookType
class BaseBinProvider(BaseHook, BinProvider):
hook_type: HookType = "BINPROVIDER"
class BaseBinProvider(BinProvider):
# TODO: add install/load/load_or_install methods as abx.hookimpl methods
@ -36,12 +33,12 @@ class BaseBinProvider(BaseHook, BinProvider):
def get_BINPROVIDERS(self):
return [self]
class BaseBinary(BaseHook, Binary):
hook_type: HookType = "BINARY"
class BaseBinary(Binary):
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:
bin_dir = bin_dir or CONSTANTS.LIB_BIN_DIR
from archivebox.config.common import STORAGE_CONFIG
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
if not (binary.abspath and os.access(binary.abspath, os.R_OK)):
return
@ -59,9 +56,10 @@ class BaseBinary(BaseHook, Binary):
@validate_call
def load(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
if fresh:
binary = super().load(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
else:
# get cached binary from db
try:
@ -76,16 +74,18 @@ class BaseBinary(BaseHook, Binary):
@validate_call
def install(self, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
binary = super().install(**kwargs)
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
@validate_call
def load_or_install(self, fresh=False, **kwargs) -> Self:
from archivebox.config.common import STORAGE_CONFIG
try:
binary = self.load(fresh=fresh)
if binary and binary.version:
self.symlink_to_lib(binary=binary, bin_dir=CONSTANTS.LIB_BIN_DIR)
self.symlink_to_lib(binary=binary, bin_dir=STORAGE_CONFIG.LIB_DIR / 'bin')
return binary
except Exception:
pass
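A usage sketch of the new STORAGE_CONFIG.LIB_DIR flow, assuming a hypothetical WgetBinary (the binproviders_supported field follows the pydantic_pkgr convention):

```python
from typing import List
from pydantic import InstanceOf
from pydantic_pkgr import BinProvider, BinName

from abx.archivebox.base_binary import BaseBinary, apt, brew, env

class WgetBinary(BaseBinary):
    name: BinName = 'wget'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

# finds wget on $PATH (or installs it via apt/brew), then symlinks it into
# STORAGE_CONFIG.LIB_DIR / 'bin' as a side effect of load_or_install()
binary = WgetBinary().load_or_install()
print(binary.abspath, binary.version)
```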

View file

@ -1,8 +1,13 @@
__package__ = 'abx.archivebox'
import os
import sys
import re
from pathlib import Path
from typing import Type, Tuple, Callable, ClassVar
from typing import Type, Tuple, Callable, ClassVar, Dict, Any
import toml
from rich import print
from benedict import benedict
from pydantic import model_validator, TypeAdapter
@ -11,15 +16,18 @@ from pydantic_settings.sources import TomlConfigSettingsSource
from pydantic_pkgr import func_takes_args_or_kwargs
import abx
from .base_hook import BaseHook, HookType
from . import toml_util
PACKAGE_DIR = Path(__file__).resolve().parent.parent
DATA_DIR = Path(os.getcwd()).resolve()
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
AUTOFIXES_HEADER = "[AUTOFIXES]"
AUTOFIXES_SUBHEADER = "# The following config was added automatically to fix problems detected at startup:"
class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
@ -55,7 +63,7 @@ class FlatTomlConfigSettingsSource(TomlConfigSettingsSource):
super(TomlConfigSettingsSource, self).__init__(settings_cls, self.toml_data)
class ArchiveBoxBaseConfig(BaseSettings):
class BaseConfigSet(BaseSettings):
"""
This is the base class for an ArchiveBox ConfigSet.
It handles loading values from schema defaults, ArchiveBox.conf TOML config, and environment variables.
@ -85,7 +93,7 @@ class ArchiveBoxBaseConfig(BaseSettings):
loc_by_alias=False,
validate_assignment=True,
validate_return=True,
revalidate_instances="always",
revalidate_instances="subclass-instances",
)
load_from_defaults: ClassVar[bool] = True
@ -103,9 +111,6 @@ class ArchiveBoxBaseConfig(BaseSettings):
) -> Tuple[PydanticBaseSettingsSource, ...]:
"""Defines the config precedence order: Schema defaults -> ArchiveBox.conf (TOML) -> Environment variables"""
ARCHIVEBOX_CONFIG_FILE = DATA_DIR / "ArchiveBox.conf"
ARCHIVEBOX_CONFIG_FILE_BAK = ARCHIVEBOX_CONFIG_FILE.parent / ".ArchiveBox.conf.bak"
# import ipdb; ipdb.set_trace()
precedence_order = {}
@ -154,7 +159,17 @@ class ArchiveBoxBaseConfig(BaseSettings):
def fill_defaults(self):
"""Populate any unset values using function provided as their default"""
for key, field in self.model_fields.items():
for key in self.model_fields.keys():
if isinstance(getattr(self, key), Callable):
if self.load_from_defaults:
computed_default = self.get_default_value(key)
# set generated default value as final validated value
setattr(self, key, computed_default)
return self
def get_default_value(self, key):
"""Get the default value for a given config key"""
field = self.model_fields[key]
value = getattr(self, key)
if isinstance(value, Callable):
@ -170,11 +185,10 @@ class ArchiveBoxBaseConfig(BaseSettings):
# coerce/check to make sure default factory return value matches type annotation
TypeAdapter(field.annotation).validate_python(computed_default)
# set generated default value as final validated value
setattr(self, key, computed_default)
return self
return computed_default
return value
def update_in_place(self, warn=True, **kwargs):
def update_in_place(self, warn=True, persist=False, hint='', **kwargs):
"""
Update the config with new values. Use this sparingly! We should almost never be updating config at runtime.
Sets them in the environment so they propagate to spawned subprocesses / across future re-__init__()s and reload from environment
@ -182,48 +196,106 @@ class ArchiveBoxBaseConfig(BaseSettings):
Example acceptable use case: user config says SEARCH_BACKEND_ENGINE=sonic but sonic_client pip library is not installed so we cannot use it.
SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep') can be used to reset it back to ripgrep so we can continue.
"""
from archivebox.misc.toml_util import CustomTOMLEncoder
if warn:
print('[!] WARNING: Some of the provided user config values cannot be used, temporarily ignoring them:')
fix_scope = 'in ArchiveBox.conf' if persist else 'just for current run'
print(f'[yellow]:warning: WARNING: Some config cannot be used as-is, fixing automatically {fix_scope}:[/yellow] {hint}', file=sys.stderr)
# set the new values in the environment
for key, value in kwargs.items():
os.environ[key] = str(value)
original_value = getattr(self, key)
if warn:
print(f' {key}={original_value} -> {value}')
# if persist=True, write config changes to data/ArchiveBox.conf [AUTOFIXES] section
try:
if persist and ARCHIVEBOX_CONFIG_FILE.is_file():
autofixes_to_add = benedict(kwargs).to_toml(encoder=CustomTOMLEncoder())
existing_config = ARCHIVEBOX_CONFIG_FILE.read_text().split(AUTOFIXES_HEADER, 1)[0].strip()
if AUTOFIXES_HEADER in existing_config:
existing_autofixes = existing_config.split(AUTOFIXES_HEADER, 1)[-1].strip().replace(AUTOFIXES_SUBHEADER, '').replace(AUTOFIXES_HEADER, '').strip()
else:
existing_autofixes = ''
new_config = '\n'.join(line for line in [
existing_config,
'\n' + AUTOFIXES_HEADER,
AUTOFIXES_SUBHEADER,
existing_autofixes,
autofixes_to_add,
] if line.strip()).strip() + '\n'
ARCHIVEBOX_CONFIG_FILE.write_text(new_config)
except Exception:
pass
self.__init__()
if warn:
print(file=sys.stderr)
return self
def as_legacy_config_schema(self):
@property
def toml_section_header(self):
"""Convert the class name to a TOML section header e.g. ShellConfig -> SHELL_CONFIG"""
class_name = self.__class__.__name__
return re.sub('([A-Z]+)', r'_\1', class_name).upper().strip('_')
def from_defaults(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the default values"""
class OnlyDefaultsConfig(self.__class__):
load_from_defaults = True
load_from_configfile = False
load_from_environment = False
return benedict(OnlyDefaultsConfig().model_dump(exclude_unset=False, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_configfile(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the configfile ArchiveBox.conf"""
class OnlyConfigFileConfig(self.__class__):
load_from_defaults = False
load_from_configfile = True
load_from_environment = False
return benedict(OnlyConfigFileConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_environment(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the environment variables"""
class OnlyEnvironmentConfig(self.__class__):
load_from_defaults = False
load_from_configfile = False
load_from_environment = True
return benedict(OnlyEnvironmentConfig().model_dump(exclude_unset=True, exclude_defaults=False, exclude=set(self.model_computed_fields.keys())))
def from_computed(self) -> Dict[str, Any]:
"""Get the dictionary of {key: value} config loaded from the computed fields"""
return benedict(self.model_dump(include=set(self.model_computed_fields.keys())))
def to_toml_dict(self, defaults=False) -> Dict[str, Any]:
"""Get the current config as a TOML-ready dict"""
config_dict = {}
for key, value in benedict(self).items():
if defaults or value != self.get_default_value(key):
config_dict[key] = value
return benedict({self.toml_section_header: config_dict})
def to_toml_str(self, defaults=False) -> str:
"""Get the current config as a TOML string"""
from archivebox.misc.toml_util import CustomTOMLEncoder
toml_dict = self.to_toml_dict(defaults=defaults)
if not toml_dict[self.toml_section_header]:
# if the section is empty, don't write it
toml_dict.pop(self.toml_section_header)
return toml.dumps(toml_dict, encoder=CustomTOMLEncoder())
def as_legacy_config_schema(self) -> Dict[str, Any]:
# shim for backwards compatibility with old config schema style
model_values = self.model_dump()
return benedict({
key: {'type': field.annotation, 'default': model_values[key]}
for key, field in self.model_fields.items()
})
class BaseConfigSet(ArchiveBoxBaseConfig, BaseHook): # type: ignore[type-arg]
hook_type: ClassVar[HookType] = 'CONFIG'
# @abx.hookimpl
# def ready(self, settings):
# # reload config from environment, in case it's been changed by any other plugins
# self.__init__()
@abx.hookimpl
def get_CONFIGS(self):
try:
return {self.id: self}
except Exception as e:
# raise Exception(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing CONFIGS for {type(self)}: {e.__class__.__name__}: {e}')
return {}
@abx.hookimpl
def get_FLAT_CONFIG(self):
try:
return self.model_dump()
except Exception as e:
# raise Exception(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
print(f'Error computing FLAT_CONFIG for {type(self)}: {e.__class__.__name__}: {e}')
return {}
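A sketch of how a subclass interacts with the new persist/TOML helpers (the SearchBackendConfig class and its keys are illustrative):

```python
from pydantic import Field
from abx.archivebox.base_configset import BaseConfigSet

class SearchBackendConfig(BaseConfigSet):        # section header becomes [SEARCH_BACKEND_CONFIG]
    SEARCH_BACKEND_ENGINE: str = Field(default='ripgrep')
    SEARCH_BACKEND_TIMEOUT: int = Field(default=90)

config = SearchBackendConfig()

# runtime autofix: falls back to ripgrep and (optionally) persists the fix
# into the [AUTOFIXES] section of data/ArchiveBox.conf
config.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep', persist=True, hint='sonic client not installed')

print(config.toml_section_header)         # SEARCH_BACKEND_CONFIG
print(config.to_toml_str(defaults=True))  # current values serialized as TOML
```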

View file

@ -14,7 +14,6 @@ from django.utils import timezone
import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
@ -28,8 +27,7 @@ HandlerFuncStr = Annotated[str, AfterValidator(lambda s: s.startswith('self.'))]
CmdArgsList = Annotated[List[str] | Tuple[str, ...], AfterValidator(no_empty_args)]
class BaseExtractor(BaseHook):
hook_type: HookType = 'EXTRACTOR'
class BaseExtractor:
name: ExtractorName
binary: BinName
@ -51,9 +49,9 @@ class BaseExtractor(BaseHook):
def get_output_path(self, snapshot) -> Path:
return Path(self.id.lower())
return Path(self.__class__.__name__.lower())
def should_extract(self, snapshot) -> bool:
def should_extract(self, uri: str, config: dict | None=None) -> bool:
try:
assert self.detect_installed_binary().version
except Exception:
@ -197,8 +195,8 @@ class BaseExtractor(BaseHook):
@cached_property
def BINARY(self) -> BaseBinary:
import abx.archivebox.use
for binary in abx.archivebox.use.get_BINARIES().values():
import abx.archivebox.reads
for binary in abx.archivebox.reads.get_BINARIES().values():
if binary.name == self.binary:
return binary
raise ValueError(f'Binary {self.binary} not found')

View file

@ -1,80 +0,0 @@
__package__ = 'abx.archivebox'
import inspect
from huey.api import TaskWrapper
from pathlib import Path
from typing import Tuple, Literal, ClassVar, get_args
from pydantic import BaseModel, ConfigDict
from django.utils.functional import cached_property
import abx
HookType = Literal['CONFIG', 'BINPROVIDER', 'BINARY', 'EXTRACTOR', 'REPLAYER', 'CHECK', 'ADMINDATAVIEW', 'QUEUE', 'SEARCHBACKEND']
hook_type_names: Tuple[HookType] = get_args(HookType)
class BaseHook(BaseModel):
model_config = ConfigDict(
extra="allow",
arbitrary_types_allowed=True,
from_attributes=True,
populate_by_name=True,
validate_defaults=True,
validate_assignment=False,
revalidate_instances="subclass-instances",
ignored_types=(TaskWrapper, cached_property),
)
hook_type: ClassVar[HookType] # e.g. = 'CONFIG'
# verbose_name: str = Field()
_is_registered: bool = False
_is_ready: bool = False
@property
def id(self) -> str:
return self.__class__.__name__
@property
def hook_module(self) -> str:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return f'{self.__module__}.{self.__class__.__name__}'
@property
def hook_file(self) -> Path:
"""e.g. plugins_extractor.singlefile.apps.SinglefileConfigSet"""
return Path(inspect.getfile(self.__class__))
@property
def plugin_module(self) -> str:
"""e.g. plugins_extractor.singlefile"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit(".apps.", 1)[0]
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@property
def admin_url(self) -> str:
# e.g. /admin/environment/config/LdapConfig/
return f"/admin/environment/{self.hook_type.lower()}/{self.id}/"
@abx.hookimpl
def register(self, settings):
"""Called when django.apps.AppConfig.ready() is called"""
# print("REGISTERED HOOK:", self.hook_module)
self._is_registered = True
@abx.hookimpl
def ready(self):
"""Called when django.apps.AppConfig.ready() is called"""
assert self._is_registered, f"Tried to run {self.hook_module}.ready() but it was never registered!"
# print("READY HOOK:", self.hook_module)
self._is_ready = True

View file

@ -1,154 +0,0 @@
__package__ = 'abx.archivebox'
import abx
import inspect
from pathlib import Path
from django.apps import AppConfig
from typing import List, Type, Dict
from typing_extensions import Self
from pydantic import (
BaseModel,
ConfigDict,
Field,
model_validator,
InstanceOf,
computed_field,
)
from benedict import benedict
from .base_hook import BaseHook, HookType
class BasePlugin(BaseModel):
model_config = ConfigDict(
extra='forbid',
arbitrary_types_allowed=True,
populate_by_name=True,
from_attributes=True,
validate_defaults=False,
validate_assignment=False,
revalidate_instances="always",
# frozen=True,
)
# Required by AppConfig:
app_label: str = Field() # e.g. 'singlefile' (one-word machine-readable representation, to use as url-safe id/db-table prefix_/attr name)
verbose_name: str = Field() # e.g. 'SingleFile' (human-readable *short* label, for use in column names, form labels, etc.)
docs_url: str = Field(default=None) # e.g. 'https://github.com/...'
# All the hooks the plugin will install:
hooks: List[InstanceOf[BaseHook]] = Field(default=[])
_is_registered: bool = False
_is_ready: bool = False
@computed_field
@property
def id(self) -> str:
return self.__class__.__name__
@property
def name(self) -> str:
return self.app_label
# @computed_field
@property
def plugin_module(self) -> str: # DottedImportPath
""" "
Dotted import path of the plugin's module (after its loaded via settings.INSTALLED_APPS).
e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin' -> 'plugins_pkg.npm'
"""
return f"{self.__module__}.{self.__class__.__name__}".split("archivebox.", 1)[-1].rsplit('.apps.', 1)[0]
@property
def plugin_module_full(self) -> str: # DottedImportPath
"""e.g. 'archivebox.plugins_pkg.npm.apps.NpmPlugin'"""
return f"{self.__module__}.{self.__class__.__name__}"
# @computed_field
@property
def plugin_dir(self) -> Path:
return Path(inspect.getfile(self.__class__)).parent.resolve()
@model_validator(mode='after')
def validate(self) -> Self:
"""Validate the plugin's build-time configuration here before it's registered in Django at runtime."""
# VERY IMPORTANT:
# preserve references to original default objects,
# pydantic deepcopies them by default which breaks mutability
# see https://github.com/pydantic/pydantic/issues/7608
# if we dont do this, then plugins_extractor.SINGLEFILE_CONFIG != settings.CONFIGS.SingleFileConfig for example
# and calling .__init__() on one of them will not update the other
self.hooks = self.model_fields['hooks'].default
assert self.name and self.app_label and self.verbose_name, f'{self.__class__.__name__} is missing .name or .app_label or .verbose_name'
# assert json.dumps(self.model_json_schema(), indent=4), f"Plugin {self.plugin_module} has invalid JSON schema."
return self
@property
def AppConfig(plugin_self) -> Type[AppConfig]:
"""Generate a Django AppConfig class for this plugin."""
class PluginAppConfig(AppConfig):
"""Django AppConfig for plugin, allows it to be loaded as a Django app listed in settings.INSTALLED_APPS."""
name = plugin_self.plugin_module
app_label = plugin_self.app_label
verbose_name = plugin_self.verbose_name
default_auto_field = 'django.db.models.AutoField'
# handled by abx.hookimpl ready()
# def ready(self):
# from django.conf import settings
# plugin_self.ready(settings)
return PluginAppConfig
@property
def HOOKS_BY_ID(self) -> Dict[str, InstanceOf[BaseHook]]:
return benedict({hook.id: hook for hook in self.hooks})
@property
def HOOKS_BY_TYPE(self) -> Dict[HookType, Dict[str, InstanceOf[BaseHook]]]:
hooks = benedict({})
for hook in self.hooks:
hooks[hook.hook_type] = hooks.get(hook.hook_type) or benedict({})
hooks[hook.hook_type][hook.id] = hook
return hooks
@abx.hookimpl
def register(self, settings):
from archivebox.config.legacy import bump_startup_progress_bar
self._is_registered = True
bump_startup_progress_bar()
# print('◣----------------- REGISTERED PLUGIN:', self.plugin_module, '-----------------◢')
# print()
@abx.hookimpl
def ready(self, settings=None):
"""Runs any runtime code needed when AppConfig.ready() is called (after all models are imported)."""
from archivebox.config.legacy import bump_startup_progress_bar
assert self._is_registered, f"Tried to run {self.plugin_module}.ready() but it was never registered!"
self._is_ready = True
# settings.PLUGINS[self.id]._is_ready = True
bump_startup_progress_bar()
@abx.hookimpl
def get_INSTALLED_APPS(self):
return [self.plugin_module]

View file

@ -1,106 +0,0 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, List, TYPE_CHECKING
from pydantic import Field, InstanceOf
from benedict import benedict
if TYPE_CHECKING:
from huey.api import TaskWrapper
import abx
from .base_hook import BaseHook, HookType
from .base_binary import BaseBinary
class BaseQueue(BaseHook):
hook_type: HookType = 'QUEUE'
name: str = Field() # e.g. 'singlefile'
binaries: List[InstanceOf[BaseBinary]] = Field()
@property
def tasks(self) -> Dict[str, 'TaskWrapper']:
"""Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
all_tasks = {}
for task_name, task in tasks.__dict__.items():
# if attr is a Huey task and its queue_name matches our hook's queue name
if hasattr(task, "task_class") and task.huey.name == self.name:
all_tasks[task_name] = task
return benedict(all_tasks)
def get_django_huey_config(self, QUEUE_DATABASE_NAME) -> dict:
"""Get the config dict to insert into django.conf.settings.DJANGO_HUEY['queues']."""
return {
"huey_class": "huey.SqliteHuey",
"filename": QUEUE_DATABASE_NAME,
"name": self.name,
"results": True,
"store_none": True,
"immediate": False,
"utc": True,
"consumer": {
"workers": 1,
"worker_type": "thread",
"initial_delay": 0.1, # Smallest polling interval, same as -d.
"backoff": 1.15, # Exponential backoff using this rate, -b.
"max_delay": 10.0, # Max possible polling interval, -m.
"scheduler_interval": 1, # Check schedule every second, -s.
"periodic": True, # Enable crontab feature.
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
def get_supervisord_config(self, settings) -> dict:
"""Ge the config dict used to tell sueprvisord to start a huey consumer for this queue."""
return {
"name": f"worker_{self.name}",
"command": f"archivebox manage djangohuey --queue {self.name}",
"stdout_logfile": f"logs/worker_{self.name}.log",
"redirect_stderr": "true",
"autorestart": "true",
"autostart": "false",
}
def start_supervisord_worker(self, settings, lazy=True):
from queues.supervisor_util import get_or_create_supervisord_process, start_worker
print()
try:
supervisor = get_or_create_supervisord_process(daemonize=False)
except Exception as e:
print(f"Error starting worker for queue {self.name}: {e}")
return None
print()
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
# Update settings.WORKERS to include this worker
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
settings.WORKERS[self.id] = worker
return worker
@abx.hookimpl
def get_QUEUES(self):
return [self]
@abx.hookimpl
def get_DJANGO_HUEY_QUEUES(self, QUEUE_DATABASE_NAME):
"""queue configs to be added to django.conf.settings.DJANGO_HUEY['queues']"""
return {
self.name: self.get_django_huey_config(QUEUE_DATABASE_NAME)
}
# @abx.hookimpl
# def ready(self, settings):
# self.start_supervisord_worker(settings, lazy=True)
# super().ready(settings)

View file

@ -2,14 +2,10 @@ __package__ = 'abx.archivebox'
import abx
from .base_hook import BaseHook, HookType
class BaseReplayer(BaseHook):
class BaseReplayer:
"""Describes how to render an ArchiveResult in several contexts"""
hook_type: HookType = 'REPLAYER'
url_pattern: str = '*'
row_template: str = 'plugins/generic_replayer/templates/row.html'

View file

@ -1,33 +1,25 @@
__package__ = 'abx.archivebox'
from typing import Iterable, List
from pydantic import Field
import abx
from .base_hook import BaseHook, HookType
import abc
class BaseSearchBackend(BaseHook):
hook_type: HookType = 'SEARCHBACKEND'
name: str = Field() # e.g. 'singlefile'
# TODO: move these to a hookimpl
class BaseSearchBackend(abc.ABC):
name: str
@staticmethod
@abc.abstractmethod
def index(snapshot_id: str, texts: List[str]):
return
@staticmethod
@abc.abstractmethod
def flush(snapshot_ids: Iterable[str]):
return
@staticmethod
@abc.abstractmethod
def search(text: str) -> List[str]:
raise NotImplementedError("search method must be implemented by subclass")
@abx.hookimpl
def get_SEARCHBACKENDS(self):
return [self]

View file

@ -0,0 +1,20 @@
"""
Hookspec for side-effects that ArchiveBox plugins can trigger.
(e.g. network requests, binary execution, remote API calls, external library calls, etc.)
"""
__package__ = 'abx.archivebox'
import abx
@abx.hookspec
def check_remote_seed_connection(urls, extractor, credentials, created_by):
pass
@abx.hookspec
def exec_extractor(url, extractor, credentials, config):
pass
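A plugin would provide a matching @abx.hookimpl with the same name as the spec; a minimal sketch (the requests-based connectivity check is illustrative only):

```python
import abx

@abx.hookimpl
def check_remote_seed_connection(urls, extractor, credentials, created_by):
    # illustrative: verify the remote feed URL is reachable before creating a Seed
    import requests
    requests.head(urls, timeout=10).raise_for_status()
    return urls, extractor, credentials
```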

View file

@ -0,0 +1,45 @@
"""
Hookspec for ArchiveBox system events that plugins can hook into.
Loosely modeled after Django's signals architecture.
https://docs.djangoproject.com/en/5.1/ref/signals/
"""
__package__ = 'abx.archivebox'
import abx
@abx.hookspec
def on_crawl_schedule_tick(crawl_schedule):
pass
@abx.hookspec
def on_seed_post_save(seed, created=False):
...
@abx.hookspec
def on_crawl_post_save(crawl, created=False):
...
@abx.hookspec
def on_snapshot_post_save(snapshot, created=False):
...
# @abx.hookspec
# def on_snapshot_post_delete(snapshot):
# ...
@abx.hookspec
def on_archiveresult_post_save(archiveresult, created=False):
...
# @abx.hookspec
# def on_archiveresult_post_delete(archiveresult):
# ...

View file

@ -4,32 +4,49 @@ from typing import Dict, Any
from .. import hookspec
@hookspec
def get_CONFIGS():
return {}
@hookspec
def get_EXTRACTORS():
return {}
@hookspec
def get_REPLAYERS():
return {}
@hookspec
def get_ADMINDATAVIEWS():
return {}
@hookspec
def get_QUEUES():
return {}
@hookspec
def get_SEARCHBACKENDS():
return {}
from .base_binary import BaseBinary, BaseBinProvider
from .base_configset import BaseConfigSet
from .base_extractor import BaseExtractor
from .base_searchbackend import BaseSearchBackend
@hookspec
def extract(snapshot_id) -> Dict[str, Any]:
def get_PLUGIN() -> Dict[str, Dict[str, Any]]:
return {}
@hookspec
def get_CONFIG() -> Dict[str, BaseConfigSet]:
return {}
@hookspec
def get_EXTRACTORS() -> Dict[str, BaseExtractor]:
return {}
@hookspec
def get_SEARCHBACKENDS() -> Dict[str, BaseSearchBackend]:
return {}
# @hookspec
# def get_REPLAYERS() -> Dict[str, BaseReplayer]:
# return {}
# @hookspec
# def get_ADMINDATAVIEWS():
# return {}
# @hookspec
# def get_QUEUES():
# return {}
##############################################################
# provided by abx.pydantic_pkgr.hookspec:
# @hookspec
# def get_BINARIES() -> Dict[str, BaseBinary]:
# return {}
# @hookspec
# def get_BINPROVIDERS() -> Dict[str, BaseBinProvider]:
# return {}

View file

@ -0,0 +1,160 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, Set, Any, TYPE_CHECKING
from benedict import benedict
import abx
from .. import pm
if TYPE_CHECKING:
from .base_configset import BaseConfigSet
from .base_binary import BaseBinary, BaseBinProvider
from .base_extractor import BaseExtractor
from .base_searchbackend import BaseSearchBackend
# from .base_replayer import BaseReplayer
# from .base_queue import BaseQueue
# from .base_admindataview import BaseAdminDataView
# API exposed to ArchiveBox code
def get_PLUGINS() -> Dict[str, Dict[str, Any]]:
return benedict({
plugin_id: plugin
for plugin_dict in pm.hook.get_PLUGIN()
for plugin_id, plugin in plugin_dict.items()
})
def get_PLUGIN(plugin_id: str) -> Dict[str, Any]:
plugin_info = get_PLUGINS().get(plugin_id, {})
package = plugin_info.get('package', plugin_info.get('PACKAGE', None))
if not package:
return {'id': plugin_id, 'hooks': {}}
module = importlib.import_module(package)
hooks = abx.get_plugin_hooks(module.__package__)
assert plugin_info and (plugin_info.get('id') or plugin_info.get('ID') or hooks)
return benedict({
'id': plugin_id,
'label': getattr(module, '__label__', plugin_id),
'module': module,
'package': module.__package__,
'hooks': hooks,
'version': getattr(module, '__version__', '999.999.999'),
'author': getattr(module, '__author__', 'Unknown'),
'homepage': getattr(module, '__homepage__', 'https://github.com/ArchiveBox/ArchiveBox'),
'dependencies': getattr(module, '__dependencies__', []),
'source_code': module.__file__,
**plugin_info,
})
def get_HOOKS() -> Set[str]:
return {
hook_name
for plugin_id in get_PLUGINS().keys()
for hook_name in get_PLUGIN(plugin_id).hooks
}
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
return benedict({
config_id: configset
for plugin_configs in pm.hook.get_CONFIG()
for config_id, configset in plugin_configs.items()
})
def get_FLAT_CONFIG() -> Dict[str, Any]:
return benedict({
key: value
for configset in get_CONFIGS().values()
for key, value in configset.model_dump().items()
})
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
# TODO: move these to plugins
from abx.archivebox.base_binary import apt, brew, env
builtin_binproviders = {
'apt': apt,
'brew': brew,
'env': env,
}
return benedict({
binprovider_id: binprovider
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
for binprovider_id, binprovider in plugin_binproviders.items()
})
def get_BINARIES() -> Dict[str, 'BaseBinary']:
return benedict({
binary_id: binary
for plugin_binaries in pm.hook.get_BINARIES()
for binary_id, binary in plugin_binaries.items()
})
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
return benedict({
extractor_id: extractor
for plugin_extractors in pm.hook.get_EXTRACTORS()
for extractor_id, extractor in plugin_extractors.items()
})
# def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
# return benedict({
# replayer.id: replayer
# for plugin_replayers in pm.hook.get_REPLAYERS()
# for replayer in plugin_replayers
# })
# def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
# return benedict({
# admin_dataview.id: admin_dataview
# for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
# for admin_dataview in plugin_admin_dataviews
# })
# def get_QUEUES() -> Dict[str, 'BaseQueue']:
# return benedict({
# queue.id: queue
# for plugin_queues in pm.hook.get_QUEUES()
# for queue in plugin_queues
# })
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
return benedict({
searchbackend_id: searchbackend
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
for searchbackend_id,searchbackend in plugin_searchbackends.items()
})
def get_scope_config(defaults: benedict | None = None, persona=None, seed=None, crawl=None, snapshot=None, archiveresult=None, extra_config=None):
"""Get all the relevant config for the given scope, in correct precedence order"""
from django.conf import settings
default_config: benedict = defaults or settings.CONFIG
snapshot = snapshot or (archiveresult and archiveresult.snapshot)
crawl = crawl or (snapshot and snapshot.crawl)
seed = seed or (crawl and crawl.seed)
persona = persona or (crawl and crawl.persona)
persona_config = persona.config if persona else {}
seed_config = seed.config if seed else {}
crawl_config = crawl.config if crawl else {}
snapshot_config = snapshot.config if snapshot else {}
archiveresult_config = archiveresult.config if archiveresult else {}
extra_config = extra_config or {}
return {
**default_config, # defaults / config file / environment variables
**persona_config, # lowest precedence
**seed_config,
**crawl_config,
**snapshot_config,
**archiveresult_config,
**extra_config, # highest precedence
}
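A usage sketch of the precedence merge (the query and the TIMEOUT key are hypothetical):

```python
import abx.archivebox.reads as reads
from core.models import ArchiveResult  # real model; the query below is illustrative

archiveresult = ArchiveResult.objects.latest('created_at')

# walks archiveresult -> snapshot -> crawl -> seed/persona and merges their
# .config dicts over the global defaults, lowest to highest precedence:
config = reads.get_scope_config(
    archiveresult=archiveresult,
    extra_config={'TIMEOUT': 120},   # hypothetical key; highest precedence
)
timeout = config['TIMEOUT']
```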

View file

@ -1,130 +0,0 @@
__package__ = 'abx.archivebox'
from typing import Dict, Any, TYPE_CHECKING
from django.utils import timezone
from benedict import benedict
from .. import pm
if TYPE_CHECKING:
from .base_hook import BaseHook
from .base_configset import BaseConfigSet
from .base_binary import BaseBinary, BaseBinProvider
from .base_extractor import BaseExtractor
from .base_replayer import BaseReplayer
from .base_queue import BaseQueue
from .base_admindataview import BaseAdminDataView
from .base_searchbackend import BaseSearchBackend
# API exposed to ArchiveBox code
def get_PLUGINS():
return benedict({
plugin.PLUGIN.id: plugin.PLUGIN
for plugin in pm.get_plugins()
})
def get_HOOKS(PLUGINS) -> Dict[str, 'BaseHook']:
return benedict({
hook.id: hook
for plugin in PLUGINS.values()
for hook in plugin.hooks
})
def get_CONFIGS() -> Dict[str, 'BaseConfigSet']:
return benedict({
config_id: config
for plugin_configs in pm.hook.get_CONFIGS()
for config_id, config in plugin_configs.items()
})
def get_FLAT_CONFIG() -> Dict[str, Any]:
return benedict({
key: value
for plugin_config_dict in pm.hook.get_FLAT_CONFIG()
for key, value in plugin_config_dict.items()
})
def get_BINPROVIDERS() -> Dict[str, 'BaseBinProvider']:
# TODO: move these to plugins
from abx.archivebox.base_binary import apt, brew, env
builtin_binproviders = [apt, brew, env]
return benedict({
binprovider.id: binprovider
for plugin_binproviders in [builtin_binproviders, *pm.hook.get_BINPROVIDERS()]
for binprovider in plugin_binproviders
})
def get_BINARIES() -> Dict[str, 'BaseBinary']:
return benedict({
binary.id: binary
for plugin_binaries in pm.hook.get_BINARIES()
for binary in plugin_binaries
})
def get_EXTRACTORS() -> Dict[str, 'BaseExtractor']:
return benedict({
extractor.id: extractor
for plugin_extractors in pm.hook.get_EXTRACTORS()
for extractor in plugin_extractors
})
def get_REPLAYERS() -> Dict[str, 'BaseReplayer']:
return benedict({
replayer.id: replayer
for plugin_replayers in pm.hook.get_REPLAYERS()
for replayer in plugin_replayers
})
def get_ADMINDATAVIEWS() -> Dict[str, 'BaseAdminDataView']:
return benedict({
admin_dataview.id: admin_dataview
for plugin_admin_dataviews in pm.hook.get_ADMINDATAVIEWS()
for admin_dataview in plugin_admin_dataviews
})
def get_QUEUES() -> Dict[str, 'BaseQueue']:
return benedict({
queue.id: queue
for plugin_queues in pm.hook.get_QUEUES()
for queue in plugin_queues
})
def get_SEARCHBACKENDS() -> Dict[str, 'BaseSearchBackend']:
return benedict({
searchbackend.id: searchbackend
for plugin_searchbackends in pm.hook.get_SEARCHBACKENDS()
for searchbackend in plugin_searchbackends
})
###########################
def register_all_hooks(settings):
pm.hook.register(settings=settings)
def extract(url_or_snapshot_id):
from core.models import Snapshot
url, snapshot_abid, snapshot_id = None, None, None
snapshot = None
if '://' in url_or_snapshot_id:
url = url_or_snapshot_id
try:
snapshot = Snapshot.objects.get(url=url)
except Snapshot.DoesNotExist:
snapshot = Snapshot(url=url_or_snapshot_id, timestamp=str(timezone.now().timestamp()), bookmarked_at=timezone.now())
snapshot.save()
elif '-' in url_or_snapshot_id:
snapshot_id = url_or_snapshot_id
snapshot = Snapshot.objects.get(id=snapshot_id)
else:
snapshot_abid = url_or_snapshot_id
snapshot = Snapshot.objects.get(abid=snapshot_abid)
return pm.hook.extract(snapshot_id=snapshot.id)

View file

@ -0,0 +1,133 @@
__package__ = 'abx.archivebox'
import importlib
from typing import Dict, Set, Any, TYPE_CHECKING
from benedict import benedict
from django.conf import settings
from django.utils import timezone
from .reads import get_scope_config
import abx
from .. import pm
@abx.hookimpl
def get_or_create_snapshot(crawl, url, config):
pass
@abx.hookimpl
def update_crawl_schedule_next_run_at(crawl_schedule, next_run_at):
pass
@abx.hookimpl
def create_crawl_copy(crawl_to_copy, schedule):
pass
@abx.hookimpl
def create_crawl(seed, depth, tags_str, persona, created_by, config, schedule):
pass
def create_crawl_from_ui_action(urls, extractor, credentials, depth, tags_str, persona, created_by, crawl_config):
if seed_is_remote(urls, extractor, credentials):
# user's seed is a remote source that will provide the urls (e.g. RSS feed URL, Pocket API, etc.)
uri, extractor, credentials = abx.archivebox.effects.check_remote_seed_connection(urls, extractor, credentials, created_by)
else:
# user's seed is some raw text they provided to parse for urls, save it to a file then load the file as a Seed
uri = abx.archivebox.writes.write_raw_urls_to_local_file(urls, extractor, tags_str, created_by) # file:///data/sources/some_import.txt
seed = abx.archivebox.writes.get_or_create_seed(uri=uri, extractor=extractor, credentials=credentials, created_by=created_by)
# abx.archivebox.events.on_seed_created(seed)
crawl = abx.archivebox.writes.create_crawl(seed=seed, depth=depth, tags_str=tags_str, persona=persona, created_by=created_by, config=crawl_config, schedule=None)
abx.archivebox.events.on_crawl_created(crawl)
@abx.hookimpl(specname='on_crawl_schedule_tick')
def create_crawl_from_crawlschedule_if_due(crawl_schedule):
# make sure it's not too early to run this scheduled import (makes this function idempotent / safe to call multiple times / every second)
if timezone.now() < crawl_schedule.next_run_at:
# it's not time to run it yet, wait for the next tick
return
else:
# we're going to run it now, bump the next run time so that no one else runs it at the same time as us
abx.archivebox.writes.update_crawl_schedule_next_run_at(crawl_schedule, next_run_at=crawl_schedule.next_run_at + crawl_schedule.interval)
crawl_to_copy = None
try:
crawl_to_copy = crawl_schedule.crawl_set.first() # alternatively use .last() to copy most recent crawl instead of very first crawl
except Crawl.DoesNotExist:
# there is no template crawl to base the next one off of
# user must add at least one crawl to a schedule that serves as the template for all future repeated crawls
return
new_crawl = abx.archivebox.writes.create_crawl_copy(crawl_to_copy=crawl_to_copy, schedule=crawl_schedule)
abx.archivebox.events.on_crawl_created(new_crawl)
@abx.hookimpl(specname='on_crawl_post_save')
def create_root_snapshot_from_seed(crawl):
# create a snapshot for the seed URI which kicks off the crawl
# only a single extractor will run on it, which will produce outlinks which get added back to the crawl
root_snapshot, created = abx.archivebox.writes.get_or_create_snapshot(crawl=crawl, url=crawl.seed.uri, config={
'extractors': (
abx.archivebox.reads.get_extractors_that_produce_outlinks()
if crawl.seed.extractor == 'auto' else
[crawl.seed.extractor]
),
**crawl.seed.config,
})
if created:
abx.archivebox.events.on_snapshot_created(root_snapshot)
abx.archivebox.writes.update_crawl_stats(started_at=timezone.now())
@abx.hookimpl(specname='on_snapshot_created')
def create_archiveresults_pending_from_snapshot(snapshot, config):
config = get_scope_config(
# defaults=settings.CONFIG_FROM_DEFAULTS,
# configfile=settings.CONFIG_FROM_FILE,
# environment=settings.CONFIG_FROM_ENVIRONMENT,
persona=snapshot.crawl.persona,
seed=snapshot.crawl.seed,
crawl=snapshot.crawl,
snapshot=snapshot,
# extra_config=extra_config,
)
extractors = abx.archivebox.reads.get_extractors_for_snapshot(snapshot, config)
for extractor in extractors:
archiveresult, created = abx.archivebox.writes.get_or_create_archiveresult_pending(
snapshot=snapshot,
extractor=extractor,
status='pending'
)
if created:
abx.archivebox.events.on_archiveresult_created(archiveresult)
@abx.hookimpl(specname='on_archiveresult_updated')
def create_snapshots_pending_from_archiveresult_outlinks(archiveresult):
config = get_scope_config(...)
# check if the extractor finished successfully; if not, don't bother checking for outlinks
if archiveresult.status != 'succeeded':
return
# check if we have already reached the maximum recursion depth
hops_to_here = abx.archivebox.reads.get_outlink_parents(crawl_pk=archiveresult.snapshot.crawl_id, url=archiveresult.url, config=config)
if len(hops_to_here) >= archiveresult.snapshot.crawl.max_depth + 1:
return
# parse the output to get outlink url_entries
discovered_urls = abx.archivebox.reads.get_archiveresult_discovered_url_entries(archiveresult, config=config)
for url_entry in discovered_urls:
abx.archivebox.writes.create_outlink_record(src=archiveresult.snapshot.url, dst=url_entry.url, via=archiveresult)
abx.archivebox.writes.create_snapshot(crawl=archiveresult.snapshot.crawl, url_entry=url_entry)
# abx.archivebox.events.on_crawl_updated(archiveresult.snapshot.crawl)

View file

@ -110,6 +110,11 @@ def register_checks():
"""Register django checks with django system checks system"""
pass
@hookspec
def register_admin(admin_site):
"""Register django admin views/models with the main django admin site instance"""
pass
###########################################################################################

View file

@ -96,3 +96,6 @@ def register_checks():
"""register any django system checks"""
pm.hook.register_checks()
def register_admin(admin_site):
"""register any django admin models/views with the main django admin site instance"""
pm.hook.register_admin(admin_site=admin_site)

archivebox/api/admin.py (new file, 31 lines)
View file

@ -0,0 +1,31 @@
__package__ = 'archivebox.api'
from signal_webhooks.admin import WebhookAdmin
from signal_webhooks.utils import get_webhook_model
from abid_utils.admin import ABIDModelAdmin
from api.models import APIToken
class APITokenAdmin(ABIDModelAdmin):
list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires')
sort_fields = ('abid', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at', 'abid_info')
search_fields = ('id', 'abid', 'created_by__username', 'token')
fields = ('created_by', 'token', 'expires', *readonly_fields)
list_filter = ('created_by',)
ordering = ['-created_at']
list_per_page = 100
class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin):
list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display)
sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields)
def register_admin(admin_site):
admin_site.register(APIToken, APITokenAdmin)
admin_site.register(get_webhook_model(), CustomWebhookAdmin)

View file

@ -2,10 +2,14 @@ __package__ = 'archivebox.api'
from django.apps import AppConfig
import abx
class APIConfig(AppConfig):
name = 'api'
def ready(self):
pass
@abx.hookimpl
def register_admin(admin_site):
from api.admin import register_admin
register_admin(admin_site)

View file

@ -6,7 +6,6 @@ from typing import List, Optional, Union, Any
from datetime import datetime
from django.db.models import Q
from django.shortcuts import get_object_or_404
from django.core.exceptions import ValidationError
from django.contrib.auth import get_user_model
@ -16,7 +15,6 @@ from ninja.errors import HttpError
from core.models import Snapshot, ArchiveResult, Tag
from api.models import APIToken, OutboundWebhook
from abid_utils.abid import ABID
from .auth import API_AUTH_METHODS
@ -397,11 +395,70 @@ def get_tag(request, tag_id: str, with_snapshots: bool=True):
# class CrawlSchema(Schema):
# TYPE: str = 'core.models.Crawl'
# id: UUID
# abid: str
# modified_at: datetime
# created_at: datetime
# created_by_id: str
# created_by_username: str
# urls: str
# depth: int
# parser: str
# # snapshots: List[SnapshotSchema]
# @staticmethod
# def resolve_created_by_id(obj):
# return str(obj.created_by_id)
# @staticmethod
# def resolve_created_by_username(obj):
# User = get_user_model()
# return User.objects.get(id=obj.created_by_id).username
# @staticmethod
# def resolve_snapshots(obj, context):
# if context['request'].with_snapshots:
# return obj.snapshot_set.all().distinct()
# return Snapshot.objects.none()
# @router.get("/crawl/{crawl_id}", response=CrawlSchema, url_name="get_crawl")
# def get_crawl(request, crawl_id: str, with_snapshots: bool=False, with_archiveresults: bool=False):
# """Get a specific Crawl by id or abid."""
# crawl = None
# request.with_snapshots = with_snapshots
# request.with_archiveresults = with_archiveresults
# try:
# crawl = Crawl.objects.get(abid__icontains=crawl_id)
# except Exception:
# pass
# try:
# crawl = crawl or Crawl.objects.get(id__icontains=crawl_id)
# except Exception:
# pass
# return crawl
# [..., CrawlSchema]
@router.get("/any/{abid}", response=Union[SnapshotSchema, ArchiveResultSchema, TagSchema], url_name="get_any")
def get_any(request, abid: str):
request.with_snapshots = False
request.with_archiveresults = False
if abid.startswith(APIToken.abid_prefix):
raise HttpError(403, 'APIToken objects are not accessible via REST API')
if abid.startswith(OutboundWebhook.abid_prefix):
raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API')
response = None
try:
response = response or get_snapshot(request, abid)
@@ -418,10 +475,12 @@ def get_any(request, abid: str):
except Exception:
pass
if abid.startswith(APIToken.abid_prefix):
raise HttpError(403, 'APIToken objects are not accessible via REST API')
if abid.startswith(OutboundWebhook.abid_prefix):
raise HttpError(403, 'OutboundWebhook objects are not accessible via REST API')
# try:
# response = response or get_crawl(request, abid)
# except Exception:
# pass
if not response:
raise HttpError(404, 'Object with given ABID not found')
return response

View file

@@ -164,13 +164,18 @@ def run_subcommand(subcommand: str,
# print('DATA_DIR is', DATA_DIR)
# print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds
cmd_requires_db = (subcommand in archive_cmds)
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
check_db = cmd_requires_db and not init_pending
setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
for ignore_pattern in ('help', '-h', '--help', 'version', '--version'):
if ignore_pattern in sys.argv[:4]:
cmd_requires_db = False
break
if subcommand in archive_cmds:
if cmd_requires_db:
check_migrations()

View file

@@ -5,5 +5,34 @@ from .paths import (
DATA_DIR, # noqa
ARCHIVE_DIR, # noqa
)
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
import abx
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return ['config']
@abx.hookimpl
def get_CONFIG():
from .common import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
return {
'SHELL_CONFIG': SHELL_CONFIG,
'STORAGE_CONFIG': STORAGE_CONFIG,
'GENERAL_CONFIG': GENERAL_CONFIG,
'SERVER_CONFIG': SERVER_CONFIG,
'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
}
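For context, a hedged sketch of how these per-plugin get_CONFIG() hookimpls are typically merged into one flat mapping (illustrative only; the real abx read helpers may differ):
def collect_config(pm) -> dict:
    # pm.hook.get_CONFIG() returns one dict per registered plugin (pluggy semantics);
    # in this naive merge, later plugins win on key collisions
    merged = {}
    for plugin_config in pm.hook.get_CONFIG():
        merged.update(plugin_config)
    return merged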

View file

@@ -1,57 +0,0 @@
__package__ = 'archivebox.config'
from typing import List
from pydantic import InstanceOf
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401
ServerConfig, # noqa: F401
ArchivingConfig, # noqa: F401
SearchBackendConfig, # noqa: F401
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
)
###################### Config ##########################
class ConfigPlugin(BasePlugin):
app_label: str = 'CONFIG'
verbose_name: str = 'Configuration'
hooks: List[InstanceOf[BaseHook]] = [
SHELL_CONFIG,
GENERAL_CONFIG,
STORAGE_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
]
PLUGIN = ConfigPlugin()
DJANGO_APP = PLUGIN.AppConfig
# # register django apps
# @abx.hookimpl
# def get_INSTALLED_APPS():
# return [DJANGO_APP.name]
# # register configs
# @abx.hookimpl
# def register_CONFIG():
# return PLUGIN.HOOKS_BY_TYPE['CONFIG'].values()

View file

@@ -1,18 +1,18 @@
__package__ = 'archivebox.config'
import os
import sys
import shutil
import tempfile
from typing import Dict, Optional
from pathlib import Path
from rich import print
from pydantic import Field, field_validator, computed_field
from pydantic import Field, field_validator, computed_field, model_validator
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME
from .permissions import IN_DOCKER
@@ -35,7 +35,6 @@ class ShellConfig(BaseConfigSet):
VERSIONS_AVAILABLE: bool = False # .check_for_update.get_versions_available_on_github(c)},
CAN_UPGRADE: bool = False # .check_for_update.can_upgrade(c)},
@computed_field
@property
def TERM_WIDTH(self) -> int:
@@ -57,6 +56,16 @@ SHELL_CONFIG = ShellConfig()
class StorageConfig(BaseConfigSet):
# TMP_DIR must be a local, fast dir that is readable/writable by the archivebox user,
# must be a short path due to unix path length restrictions for socket files (<100 chars)
# must be a local SSD/tmpfs for speed, and because bind mounts/network mounts/FUSE don't support unix sockets
TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
# LIB_DIR must be a local, fast dir that is readable/writable by the archivebox user,
# must be able to contain executable binaries (up to 5GB in size)
# should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
OUTPUT_PERMISSIONS: str = Field(default='644')
RESTRICT_FILE_NAMES: str = Field(default='windows')
ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
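The "<100 chars" comment stems from the AF_UNIX limit: sockaddr_un.sun_path is only ~104-108 bytes on common platforms, so socket files under TMP_DIR must stay well below that. A hedged sanity check (illustrative, not the actual validator used in this codebase):
from pathlib import Path

def tmp_dir_fits_unix_sockets(tmp_dir: Path, socket_name: str = 'supervisord.sock') -> bool:
    # leave headroom under the ~104-108 byte sockaddr_un.sun_path limit
    return len(str(tmp_dir / socket_name).encode()) < 100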

View file

@@ -1,6 +1,5 @@
__package__ = 'archivebox.config'
import os
import re
import sys
@@ -97,14 +96,10 @@ class ConstantsDict(Mapping):
# Runtime dirs
TMP_DIR_NAME: str = 'tmp'
TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID
DEFAULT_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / MACHINE_ID # ./data/tmp/abc3244323
LIB_DIR_NAME: str = 'lib'
LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
DEFAULT_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / MACHINE_TYPE # ./data/lib/arm64-linux-docker
# Config constants
TIMEZONE: str = 'UTC'
@@ -199,90 +194,6 @@ class ConstantsDict(Mapping):
"Dockerfile",
))
CODE_LOCATIONS = benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
},
'TEMPLATES_DIR': {
'path': TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK), # read + list
},
'CUSTOM_TEMPLATES_DIR': {
'path': CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': os.path.isdir(CUSTOM_TEMPLATES_DIR),
'is_valid': os.path.isdir(CUSTOM_TEMPLATES_DIR) and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
'USER_PLUGINS_DIR': {
'path': USER_PLUGINS_DIR.resolve(),
'enabled': os.path.isdir(USER_PLUGINS_DIR),
'is_valid': os.path.isdir(USER_PLUGINS_DIR) and os.access(USER_PLUGINS_DIR, os.R_OK), # read
},
'LIB_DIR': {
'path': LIB_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(LIB_DIR) and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.W_OK), # read + write
},
})
DATA_LOCATIONS = benedict({
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONFIG_FILE) and os.access(CONFIG_FILE, os.R_OK) and os.access(CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": QUEUE_DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(QUEUE_DATABASE_FILE) and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(SOURCES_DIR) and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(PERSONAS_DIR),
"is_valid": os.path.isdir(PERSONAS_DIR) and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(LOGS_DIR) and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': TMP_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(TMP_DIR) and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
})
@classmethod
def __getitem__(cls, key: str):

View file

@@ -50,13 +50,11 @@ from ..misc.logging import (
)
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
ANSI = SHELL_CONFIG.ANSI
LDAP = LDAP_CONFIG.LDAP_ENABLED
############################### Config Schema ##################################
@@ -73,8 +71,6 @@ CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
'STORAGE_CONFIG': STORAGE_CONFIG.as_legacy_config_schema(),
'LDAP_CONFIG': LDAP_CONFIG.as_legacy_config_schema(),
# 'FAVICON_CONFIG': FAVICON_CONFIG.as_legacy_config_schema(),
# 'WGET_CONFIG': WGET_CONFIG.as_legacy_config_schema(),
@@ -263,6 +259,9 @@ def load_config_val(key: str,
elif type is list or type is dict:
return json.loads(val)
elif type is Path:
return Path(val)
raise Exception('Config values can only be str, bool, int, Path, or json')
@@ -578,7 +577,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
with SudoPermission(uid=0):
# running as root is a special case where it's ok to be a bit slower
# make sure data dir is always owned by the correct user
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{CONSTANTS.DATA_DIR}"/* 2>/dev/null')
bump_startup_progress_bar()

View file

@@ -1,12 +1,16 @@
__package__ = 'archivebox.config'
import os
import socket
import hashlib
import tempfile
import platform
from pathlib import Path
from functools import cache
from datetime import datetime
from benedict import benedict
from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
#############################################################################################
@@ -41,7 +45,8 @@ def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
try:
# only persist collection_id file if we already have an index.sqlite3 file present
# otherwise we might be running in a directory that is not a collection, no point creating cruft files
if os.path.isfile(DATABASE_FILE) and os.access(DATA_DIR, os.W_OK) or force_create:
collection_is_active = os.path.isfile(DATABASE_FILE) and os.path.isdir(ARCHIVE_DIR) and os.access(DATA_DIR, os.W_OK)
if collection_is_active or force_create:
collection_id_file.write_text(collection_id)
# if we're running as root right now, make sure the collection_id file is owned by the archivebox user
@@ -87,7 +92,7 @@ def get_machine_type() -> str:
return LIB_DIR_SCOPE
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
"""Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
current_uid, current_gid = os.geteuid(), os.getegid()
uid, gid = uid or current_uid, gid or current_gid
@@ -100,10 +105,197 @@ def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = No
test_file.unlink()
return True
except (IOError, OSError, PermissionError):
pass
if chown:
# try fixing it using sudo permissions
with SudoPermission(uid=uid, fallback=fallback):
os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
return False
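Note the chown=False on the recursive retry above: the sudo chown fix-up runs at most once before giving up. The same retry-once-after-fixup guard in isolation (an illustrative sketch, not ArchiveBox code):
def retry_once_after_fixup(action, fixup, allow_fixup=True):
    try:
        return action()
    except PermissionError:
        if allow_fixup:
            fixup()  # e.g. chown the directory, then retry exactly once
            return retry_once_after_fixup(action, fixup, allow_fixup=False)
        raise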
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
"""Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)"""
from archivebox.logging_util import pretty_path
try:
socket_path = str(dir_path / '.test_socket.sock')
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
try:
os.remove(socket_path)
except OSError:
pass
s.bind(socket_path)
s.close()
try:
os.remove(socket_path)
except OSError:
pass
except Exception as e:
raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e
return True
def create_and_chown_dir(dir_path: Path) -> None:
with SudoPermission(uid=0, fallback=True):
dir_path.mkdir(parents=True, exist_ok=True)
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_tmp_dir
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.TMP_DIR, # <user-specified>
CONSTANTS.DEFAULT_TMP_DIR, # ./data/tmp/<machine_id>
Path('/var/run/archivebox') / get_collection_id(), # /var/run/archivebox/abc5d8512
Path('/tmp') / 'archivebox' / get_collection_id(), # /tmp/archivebox/abc5d8512
Path('~/.tmp/archivebox').expanduser() / get_collection_id(), # ~/.tmp/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(), # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d8512
Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d
Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4], # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
]
for candidate in CANDIDATES:
try:
create_and_chown_dir(candidate)
except Exception:
pass
if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
return candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
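A hedged usage sketch (assuming the module-level names above): callers resolve the runtime dir once at startup and reuse it for sockets and pid files.
tmp_dir = get_or_create_working_tmp_dir(autofix=True, quiet=False)  # first writable CANDIDATE wins
supervisord_sock = tmp_dir / 'supervisord.sock'  # hypothetical consumer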
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
from archivebox import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.checks import check_lib_dir
# try a few potential directories in order of preference
CANDIDATES = [
STORAGE_CONFIG.LIB_DIR, # <user-specified>
CONSTANTS.DEFAULT_LIB_DIR, # ./data/lib/arm64-linux-docker
Path('/usr/local/share/archivebox') / get_collection_id(), # /usr/local/share/archivebox/abc5
*([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []), # /opt/homebrew/share/archivebox/abc5
Path('~/.local/share/archivebox').expanduser() / get_collection_id(), # ~/.local/share/archivebox/abc5
]
for candidate in CANDIDATES:
try:
create_and_chown_dir(candidate)
except Exception:
pass
if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
return candidate
if not quiet:
raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
@cache
def get_data_locations():
from archivebox.config import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
return benedict({
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONSTANTS.CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": CONSTANTS.QUEUE_DATABASE_FILE,
"enabled": True,
"is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": CONSTANTS.SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
},
"PERSONAS_DIR": {
"path": CONSTANTS.PERSONAS_DIR.resolve(),
"enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
"is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK), # read + write
},
"LOGS_DIR": {
"path": CONSTANTS.LOGS_DIR.resolve(),
"enabled": True,
"is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': STORAGE_CONFIG.TMP_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK), # read + write
# },
})
@cache
def get_code_locations():
from archivebox.config import CONSTANTS
from archivebox.config.common import STORAGE_CONFIG
return benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK), # executable
},
'TEMPLATES_DIR': {
'path': CONSTANTS.TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK), # read + list
},
'CUSTOM_TEMPLATES_DIR': {
'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK), # read
},
'USER_PLUGINS_DIR': {
'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK), # read
},
'LIB_DIR': {
'path': STORAGE_CONFIG.LIB_DIR.resolve(),
'enabled': True,
'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK), # read + write
},
})
# @cache

View file

@@ -2,6 +2,7 @@ __package__ = 'abx.archivebox'
import os
import inspect
from pathlib import Path
from typing import Any, List, Dict, cast
from benedict import benedict
@@ -13,6 +14,8 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import abx.archivebox.reads
from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date
@@ -82,8 +85,12 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
if '_BINARY' in key or '_VERSION' in key
}
for plugin in settings.PLUGINS.values():
for binary in plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
for plugin_id, plugin in abx.archivebox.reads.get_PLUGINS().items():
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
if not plugin.hooks.get('get_BINARIES'):
continue
for binary in plugin.hooks.get_BINARIES().values():
try:
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
binary = installed_binary.load_from_db()
@@ -92,7 +99,7 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
rows['Found Version'].append(f'{binary.loaded_version}' if binary.loaded_version else '❌ missing')
rows['From Plugin'].append(plugin.plugin_module)
rows['From Plugin'].append(plugin.package)
rows['Provided By'].append(
', '.join(
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
@@ -128,11 +135,16 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
binary = None
plugin = None
for loaded_plugin in settings.PLUGINS.values():
for loaded_binary in loaded_plugin.HOOKS_BY_TYPE.get('BINARY', {}).values():
for plugin_id in abx.archivebox.reads.get_PLUGINS().keys():
loaded_plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
try:
for loaded_binary in loaded_plugin.hooks.get_BINARIES().values():
if loaded_binary.name == key:
binary = loaded_binary
plugin = loaded_plugin
# break # last write wins
except Exception as e:
print(e)
assert plugin and binary, f'Could not find a binary matching the specified name: {key}'
@@ -149,7 +161,7 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"name": binary.name,
"description": binary.abspath,
"fields": {
'plugin': plugin.name,
'plugin': plugin.package,
'binprovider': binary.loaded_binprovider,
'abspath': binary.loaded_abspath,
'version': binary.loaded_version,
@@ -170,28 +182,68 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Name": [],
"verbose_name": [],
"module": [],
"source_code": [],
"hooks": [],
"Label": [],
"Version": [],
"Author": [],
"Package": [],
"Source Code": [],
"Config": [],
"Binaries": [],
"Package Managers": [],
# "Search Backends": [],
}
config_colors = {
'_BINARY': '#339',
'USE_': 'green',
'SAVE_': 'green',
'_ARGS': '#33e',
'KEY': 'red',
'COOKIES': 'red',
'AUTH': 'red',
'SECRET': 'red',
'TOKEN': 'red',
'PASSWORD': 'red',
'TIMEOUT': '#533',
'RETRIES': '#533',
'MAX': '#533',
'MIN': '#533',
}
def get_color(key):
for pattern, color in config_colors.items():
if pattern in key:
return color
return 'black'
for plugin in settings.PLUGINS.values():
# try:
# plugin.load_binaries()
# except Exception as e:
# print(e)
for plugin_id in settings.PLUGINS.keys():
rows['Name'].append(ItemLink(plugin.id, key=plugin.id))
rows['verbose_name'].append(mark_safe(f'<a href="{plugin.docs_url}" target="_blank">{plugin.verbose_name}</a>'))
rows['module'].append(str(plugin.plugin_module))
rows['source_code'].append(str(plugin.plugin_dir))
rows['hooks'].append(mark_safe(', '.join(
f'<a href="{hook.admin_url}">{hook.id}</a>'
for hook in plugin.hooks
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
plugin.hooks.get_BINPROVIDERS = plugin.hooks.get('get_BINPROVIDERS', lambda: {})
plugin.hooks.get_BINARIES = plugin.hooks.get('get_BINARIES', lambda: {})
plugin.hooks.get_CONFIG = plugin.hooks.get('get_CONFIG', lambda: {})
rows['Label'].append(ItemLink(plugin.label, key=plugin.package))
rows['Version'].append(str(plugin.version))
rows['Author'].append(mark_safe(f'<a href="{plugin.homepage}" target="_blank">{plugin.author}</a>'))
rows['Package'].append(ItemLink(plugin.package, key=plugin.package))
rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.source_code).replace(str(Path('~').expanduser()), '~')))
rows['Config'].append(mark_safe(''.join(
f'<a href="/admin/environment/config/{key}/"><b><code style="color: {get_color(key)};">{key}</code></b>=<code>{value}</code></a><br/>'
for configdict in plugin.hooks.get_CONFIG().values()
for key, value in benedict(configdict).items()
)))
rows['Binaries'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
for binary in plugin.hooks.get_BINARIES().values()
)))
rows['Package Managers'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
for binprovider in plugin.hooks.get_BINPROVIDERS().values()
)))
# rows['Search Backends'].append(mark_safe(', '.join(
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
# for searchbackend in plugin.SEARCHBACKENDS.values()
# )))
return TableContext(
title="Installed plugins",
@@ -203,28 +255,33 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
plugin = None
for loaded_plugin in settings.PLUGINS.values():
if loaded_plugin.id == key:
plugin = loaded_plugin
plugin_id = None
for check_plugin_id, loaded_plugin in settings.PLUGINS.items():
if check_plugin_id.split('.')[-1] == key.split('.')[-1]:
plugin_id = check_plugin_id
break
assert plugin, f'Could not find a plugin matching the specified name: {key}'
assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
try:
plugin = plugin.load_binaries()
except Exception as e:
print(e)
plugin = abx.archivebox.reads.get_PLUGIN(plugin_id)
return ItemContext(
slug=key,
title=key,
data=[
{
"name": plugin.id,
"description": plugin.verbose_name,
"name": plugin.package,
"description": plugin.label,
"fields": {
"id": plugin.id,
"package": plugin.package,
"label": plugin.label,
"version": plugin.version,
"author": plugin.author,
"homepage": plugin.homepage,
"dependencies": getattr(plugin, 'DEPENDENCIES', []),
"source_code": plugin.source_code,
"hooks": plugin.hooks,
"schema": obj_to_yaml(plugin.model_dump(include=("name", "verbose_name", "app_label", "hooks"))),
},
"help_texts": {
# TODO

View file

@@ -1,859 +1,20 @@
__package__ = 'archivebox.core'
import os
from django.contrib.auth import get_user_model
from pathlib import Path
from django.contrib import admin, messages
from django.urls import path, reverse, resolve
from django.utils import timezone
from django.utils.functional import cached_property
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.contrib.auth import get_user_model, get_permission_codename
from django.contrib.auth.admin import UserAdmin
from django.core.paginator import Paginator
from django.core.exceptions import ValidationError
from django.template import Template, RequestContext
from django.conf import settings
from django import forms
from signal_webhooks.admin import WebhookAdmin
from signal_webhooks.utils import get_webhook_model
from archivebox.config import VERSION, DATA_DIR
from archivebox.misc.util import htmldecode, urldecode
from core.models import Snapshot, ArchiveResult, Tag
from core.mixins import SearchResultsAdminMixin
from api.models import APIToken
from abid_utils.admin import ABIDModelAdmin
from queues.tasks import bg_archive_links, bg_add
from machine.models import Machine, NetworkInterface, InstalledBinary
from core.admin_tags import TagAdmin
from core.admin_snapshots import SnapshotAdmin
from core.admin_archiveresults import ArchiveResultAdmin
from core.admin_users import UserAdmin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import remove
from extractors import archive_links
import abx
CONFIG = settings.FLAT_CONFIG
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
# Admin URLs
# /admin/
# /admin/login/
# /admin/core/
# /admin/core/snapshot/
# /admin/core/snapshot/:uuid/
# /admin/core/tag/
# /admin/core/tag/:uuid/
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
namespace = 'admin'
class CustomUserAdmin(UserAdmin):
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined']
readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set')
fieldsets = [*UserAdmin.fieldsets, ('Data', {'fields': readonly_fields})]
@admin.display(description='Snapshots')
def snapshot_set(self, obj):
total_count = obj.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
snap.pk,
snap.abid,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in obj.snapshot_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='Archive Result Logs')
def archiveresult_set(self, obj):
total_count = obj.archiveresult_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
result.pk,
result.abid,
result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
result.extractor,
result.snapshot.url[:64],
)
for result in obj.archiveresult_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='Tags')
def tag_set(self, obj):
total_count = obj.tag_set.count()
return mark_safe(', '.join(
format_html(
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
tag.pk,
tag.name,
)
for tag in obj.tag_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='API Tokens')
def apitoken_set(self, obj):
total_count = obj.apitoken_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
apitoken.pk,
apitoken.abid,
apitoken.token_redacted[:64],
apitoken.expires,
)
for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='API Outbound Webhooks')
def outboundwebhook_set(self, obj):
total_count = obj.outboundwebhook_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
outboundwebhook.pk,
outboundwebhook.abid,
outboundwebhook.referenced_model,
outboundwebhook.endpoint,
)
for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.register(get_user_model(), CustomUserAdmin)
archivebox_admin.disable_action('delete_selected')
# archivebox_admin.register(CustomPlugin)
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
############### Additional sections are defined in settings.ADMIN_DATA_VIEWS #########
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
from huey_monitor.apps import HueyMonitorConfig
HueyMonitorConfig.verbose_name = 'Background Workers'
from huey_monitor.admin import TaskModel, TaskModelAdmin, SignalInfoModel, SignalInfoModelAdmin
archivebox_admin.register(SignalInfoModel, SignalInfoModelAdmin)
class CustomTaskModelAdmin(TaskModelAdmin):
actions = ["delete_selected"]
def has_delete_permission(self, request, obj=None):
codename = get_permission_codename("delete", self.opts)
return request.user.has_perm("%s.%s" % (self.opts.app_label, codename))
archivebox_admin.register(TaskModel, CustomTaskModelAdmin)
def result_url(result: TaskModel) -> str:
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
class AccelleratedPaginator(Paginator):
"""
Accelerated Paginator ignores DISTINCT when counting the total number of rows.
Speeds up SELECT Count(*) on Admin views by >20x.
https://hakibenita.com/optimizing-the-django-admin-paginator
"""
@cached_property
def count(self):
if self.object_list._has_filters(): # type: ignore
# fallback to normal count method on filtered queryset
return super().count
else:
# otherwise count total rows in a separate fast query
return self.object_list.model.objects.count()
# Alternative approach for PostgreSQL: fallback count takes > 200ms
# from django.db import connection, transaction, OperationalError
# with transaction.atomic(), connection.cursor() as cursor:
# cursor.execute('SET LOCAL statement_timeout TO 200;')
# try:
# return super().count
# except OperationalError:
# return 9999999999999
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
parent_model = Snapshot
# fk_name = 'snapshot'
extra = 0
sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
# # classes = ['collapse']
# # list_display_links = ['abid']
def get_parent_object_from_request(self, request):
resolved = resolve(request.path_info)
try:
return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
except (self.parent_model.DoesNotExist, ValidationError):
return self.parent_model.objects.get(pk=self.parent_model.id_from_abid(resolved.kwargs['object_id']))
@admin.display(
description='Completed',
ordering='end_ts',
)
def completed(self, obj):
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
def result_id(self, obj):
return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
def command(self, obj):
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
def version(self, obj):
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
def get_formset(self, request, obj=None, **kwargs):
formset = super().get_formset(request, obj, **kwargs)
snapshot = self.get_parent_object_from_request(request)
# import ipdb; ipdb.set_trace()
# formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
# default values for new entries
formset.form.base_fields['status'].initial = 'succeeded'
formset.form.base_fields['start_ts'].initial = timezone.now()
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
def get_readonly_fields(self, request, obj=None):
if obj is not None:
return self.readonly_fields
else:
return []
class TagInline(admin.TabularInline):
model = Tag.snapshot_set.through # type: ignore
# fk_name = 'snapshot'
fields = ('id', 'tag')
extra = 1
# min_num = 1
max_num = 1000
autocomplete_fields = (
'tag',
)
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import FilteredSelectMultiple
# class AutocompleteTags:
# model = Tag
# search_fields = ['name']
# name = 'name'
# # source_field = 'name'
# remote_field = Tag._meta.get_field('name')
# class AutocompleteTagsAdminStub:
# name = 'admin'
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
label='Edit tags',
queryset=Tag.objects.all(),
required=False,
widget=FilteredSelectMultiple(
'core_tag__name',
False,
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# extractor = forms.ChoiceField(
# choices=ArchiveResult.EXTRACTOR_CHOICES,
# required=False,
# widget=forms.MultileChoiceField(attrs={'class': "form-control"})
# )
@admin.register(Snapshot, site=archivebox_admin)
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
list_display = ('created_at', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir')
search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
list_per_page = min(max(5, CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
paginator = AccelleratedPaginator
save_on_top = True
show_full_result_count = False
def changelist_view(self, request, extra_context=None):
self.request = request
extra_context = extra_context or {}
try:
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
except Exception as e:
self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
return super().changelist_view(request, GLOBAL_CONTEXT)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
]
return custom_urls + urls
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.action(
description="Imported Timestamp"
)
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
'bookmarked_date': obj.bookmarked,
'timestamp': obj.timestamp,
})
html = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""")
return mark_safe(html.render(context))
# pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S')
# return f'{pretty_time} ({obj.timestamp})'
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.pk,
# )
def admin_actions(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page </a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions </a>
''',
obj.timestamp,
obj.timestamp,
obj.pk,
)
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'✅' if obj.is_archived else '❌',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
for tag in obj.tags.all()
if str(tag.name).strip()
)
return format_html(
'<a href="/{}">'
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
# ordering='archiveresult_count',
)
def files(self, obj):
# return '-'
return snapshot_icons(obj)
@admin.display(
# ordering='archiveresult_count'
)
def size(self, obj):
archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
size_txt = mark_safe(f'<b>{size_txt}</b>')
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
size_txt,
)
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
obj.url,
obj.url[:128],
)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)
# Save before monkey patching to restore for changelist list view
saved_change_list_template = self.change_list_template
saved_list_per_page = self.list_per_page
saved_list_max_show_all = self.list_max_show_all
# Monkey patch here plus core_tags.py
self.change_list_template = 'private_index_grid.html'
self.list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
self.list_max_show_all = self.list_per_page
# Call monkey patched view
rendered_response = self.changelist_view(request, extra_context=extra_context)
# Restore values
self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
# for debugging, uncomment this to print all requests:
# def changelist_view(self, request, extra_context=None):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description=" Get Title"
)
def update_titles(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
if len(links) < 3:
# run synchronously if there are only 1 or 2 links
archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
else:
# otherwise run in a background worker
result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
)
@admin.action(
description="⬇️ Get Missing"
)
def update_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="🆕 Archive Again"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = timezone.now().isoformat('T', 'seconds')
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
messages.success(
request,
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
)
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
messages.success(
request,
f"Added {len(tags)} tags to {queryset.count()} Snapshots.",
)
@admin.action(
description=""
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
messages.success(
request,
f"Removed {len(tags)} tags from {queryset.count()} Snapshots.",
)
# @admin.register(SnapshotTag, site=archivebox_admin)
# class SnapshotTagAdmin(ABIDModelAdmin):
# list_display = ('id', 'snapshot', 'tag')
# sort_fields = ('id', 'snapshot', 'tag')
# search_fields = ('id', 'snapshot_id', 'tag_id')
# fields = ('snapshot', 'id')
# actions = ['delete_selected']
# ordering = ['-id']
@admin.register(Tag, site=archivebox_admin)
class TagAdmin(ABIDModelAdmin):
list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots')
list_filter = ('created_at', 'created_by')
sort_fields = ('name', 'slug', 'abid', 'created_by', 'created_at')
readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots')
search_fields = ('abid', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected']
ordering = ['-created_at']
paginator = AccelleratedPaginator
def num_snapshots(self, tag):
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,
tag.snapshot_set.count(),
)
def snapshots(self, tag):
total_count = tag.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
snap.pk,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...</a>'))
@admin.register(ArchiveResult, site=archivebox_admin)
class ArchiveResultAdmin(ABIDModelAdmin):
list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary')
search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields)
autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'
def change_view(self, request, object_id, form_url="", extra_context=None):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
@admin.display(
description='Snapshot Info'
)
def snapshot_info(self, result):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.timestamp,
result.snapshot.abid,
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
@admin.display(
description='Snapshot Tags'
)
def tags_str(self, result):
return result.snapshot.tags_str()
def cmd_str(self, result):
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
def output_str(self, result):
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
result.output,
)
def output_summary(self, result):
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_str = format_html(
'<pre style="display: inline-block">{}</pre><br/>',
result.output,
)
output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
path_from_output_str = (snapshot_dir / result.output)
output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
if os.access(path_from_output_str, os.R_OK):
root_dir = str(path_from_output_str)
else:
root_dir = str(snapshot_dir)
# print(root_dir, str(list(os.walk(root_dir))))
for root, dirs, files in os.walk(root_dir):
depth = root.replace(root_dir, '').count(os.sep) + 1
if depth > 2:
continue
indent = ' ' * 4 * (depth)
output_str += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
indentation_str = ' ' * 4 * (depth + 1)
for filename in sorted(files):
is_hidden = filename.startswith('.')
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_str + format_html('</code></pre>')
@admin.register(APIToken, site=archivebox_admin)
class APITokenAdmin(ABIDModelAdmin):
list_display = ('created_at', 'abid', 'created_by', 'token_redacted', 'expires')
sort_fields = ('abid', 'created_at', 'created_by', 'expires')
readonly_fields = ('created_at', 'modified_at', 'abid_info')
search_fields = ('id', 'abid', 'created_by__username', 'token')
fields = ('created_by', 'token', 'expires', *readonly_fields)
list_filter = ('created_by',)
ordering = ['-created_at']
list_per_page = 100
@admin.register(get_webhook_model(), site=archivebox_admin)
class CustomWebhookAdmin(WebhookAdmin, ABIDModelAdmin):
list_display = ('created_at', 'created_by', 'abid', *WebhookAdmin.list_display)
sort_fields = ('created_at', 'created_by', 'abid', 'referenced_model', 'endpoint', 'last_success', 'last_error')
readonly_fields = ('created_at', 'modified_at', 'abid_info', *WebhookAdmin.readonly_fields)
@admin.register(Machine, site=archivebox_admin)
class MachineAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
sort_fields = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
# search_fields = ('id', 'abid', 'guid', 'hostname', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release')
readonly_fields = ('guid', 'created_at', 'modified_at', 'abid_info', 'ips')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Public IP',
ordering='networkinterface__ip_public',
)
def ips(self, machine):
return format_html(
'<a href="/admin/machine/networkinterface/?q={}"><b><code>{}</code></b></a>',
machine.abid,
', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
)
@admin.register(NetworkInterface, site=archivebox_admin)
class NetworkInterfaceAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
sort_fields = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
search_fields = ('abid', 'machine__abid', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
readonly_fields = ('machine', 'created_at', 'modified_at', 'abid_info', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('isp', 'country', 'region')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Machine',
ordering='machine__abid',
)
def machine_info(self, iface):
return format_html(
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> &nbsp; {}</a>',
iface.machine.id,
iface.machine.abid,
iface.machine.hostname,
)
@admin.register(InstalledBinary, site=archivebox_admin)
class InstalledBinaryAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256')
readonly_fields = ('created_at', 'modified_at', 'abid_info')
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('name', 'binprovider', 'machine_id')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Machine',
ordering='machine__abid',
)
def machine_info(self, installed_binary):
return format_html(
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> &nbsp; {}</a>',
installed_binary.machine.id,
installed_binary.machine.abid,
installed_binary.machine.hostname,
)
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(get_user_model(), UserAdmin)
admin_site.register(ArchiveResult, ArchiveResultAdmin)
admin_site.register(Snapshot, SnapshotAdmin)
admin_site.register(Tag, TagAdmin)

View file

@ -0,0 +1,199 @@
__package__ = 'archivebox.core'
import os
from pathlib import Path
from django.contrib import admin
from django.utils.html import format_html, mark_safe
from django.core.exceptions import ValidationError
from django.urls import reverse, resolve
from django.utils import timezone
from django import forms
from huey_monitor.admin import TaskModel
import abx
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.paginators import AccelleratedPaginator
from abid_utils.admin import ABIDModelAdmin
from core.models import ArchiveResult, Snapshot
def result_url(result: TaskModel) -> str:
url = reverse("admin:huey_monitor_taskmodel_change", args=[str(result.id)])
return format_html('<a href="{url}" class="fade-in-progress-url">See progress...</a>'.format(url=url))
class ArchiveResultInline(admin.TabularInline):
name = 'Archive Results Log'
model = ArchiveResult
parent_model = Snapshot
# fk_name = 'snapshot'
extra = 0
sort_fields = ('end_ts', 'extractor', 'output', 'status', 'cmd_version')
readonly_fields = ('id', 'result_id', 'completed', 'command', 'version')
fields = ('start_ts', 'end_ts', *readonly_fields, 'extractor', 'cmd', 'cmd_version', 'pwd', 'created_by', 'status', 'output')
# exclude = ('id',)
ordering = ('end_ts',)
show_change_link = True
# # classes = ['collapse']
# # list_display_links = ['abid']
def get_parent_object_from_request(self, request):
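# the URL's object_id may be either a raw primary key or an ABID:
# try the pk lookup first, then fall back to decoding it as an ABID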
resolved = resolve(request.path_info)
try:
return self.parent_model.objects.get(pk=resolved.kwargs['object_id'])
except (self.parent_model.DoesNotExist, ValidationError):
return self.parent_model.objects.get(pk=self.parent_model.id_from_abid(resolved.kwargs['object_id']))
@admin.display(
description='Completed',
ordering='end_ts',
)
def completed(self, obj):
return format_html('<p style="white-space: nowrap">{}</p>', obj.end_ts.strftime('%Y-%m-%d %H:%M:%S'))
def result_id(self, obj):
return format_html('<a href="{}"><code style="font-size: 10px">[{}]</code></a>', reverse('admin:core_archiveresult_change', args=(obj.id,)), obj.abid)
def command(self, obj):
return format_html('<small><code>{}</code></small>', " ".join(obj.cmd or []))
def version(self, obj):
return format_html('<small><code>{}</code></small>', obj.cmd_version or '-')
def get_formset(self, request, obj=None, **kwargs):
formset = super().get_formset(request, obj, **kwargs)
snapshot = self.get_parent_object_from_request(request)
# import ipdb; ipdb.set_trace()
# formset.form.base_fields['id'].widget = formset.form.base_fields['id'].hidden_widget()
# default values for new entries
formset.form.base_fields['status'].initial = 'succeeded'
formset.form.base_fields['start_ts'].initial = timezone.now()
formset.form.base_fields['end_ts'].initial = timezone.now()
formset.form.base_fields['cmd_version'].initial = '-'
formset.form.base_fields['pwd'].initial = str(snapshot.link_dir)
formset.form.base_fields['created_by'].initial = request.user
formset.form.base_fields['cmd'] = forms.JSONField(initial=['-'])
formset.form.base_fields['output'].initial = 'Manually recorded cmd output...'
if obj is not None:
# hidden values for existing entries and new entries
formset.form.base_fields['start_ts'].widget = formset.form.base_fields['start_ts'].hidden_widget()
formset.form.base_fields['end_ts'].widget = formset.form.base_fields['end_ts'].hidden_widget()
formset.form.base_fields['cmd'].widget = formset.form.base_fields['cmd'].hidden_widget()
formset.form.base_fields['pwd'].widget = formset.form.base_fields['pwd'].hidden_widget()
formset.form.base_fields['created_by'].widget = formset.form.base_fields['created_by'].hidden_widget()
formset.form.base_fields['cmd_version'].widget = formset.form.base_fields['cmd_version'].hidden_widget()
return formset
def get_readonly_fields(self, request, obj=None):
if obj is not None:
return self.readonly_fields
else:
return []
class ArchiveResultAdmin(ABIDModelAdmin):
list_display = ('start_ts', 'snapshot_info', 'tags_str', 'extractor', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('cmd_str', 'snapshot_info', 'tags_str', 'created_at', 'modified_at', 'abid_info', 'output_summary')
search_fields = ('id', 'abid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = ('snapshot', 'extractor', 'status', 'output', 'pwd', 'start_ts', 'end_ts', 'created_by', 'cmd_version', 'cmd', *readonly_fields)
autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
paginator = AccelleratedPaginator
save_on_top = True
actions = ['delete_selected']
class Meta:
verbose_name = 'Archive Result'
verbose_name_plural = 'Archive Results'
def change_view(self, request, object_id, form_url="", extra_context=None):
self.request = request
return super().change_view(request, object_id, form_url, extra_context)
@admin.display(
description='Snapshot Info'
)
def snapshot_info(self, result):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b> &nbsp; {} &nbsp; {}</a><br/>',
result.snapshot.timestamp,
result.snapshot.abid,
result.snapshot.bookmarked_at.strftime('%Y-%m-%d %H:%M'),
result.snapshot.url[:128],
)
@admin.display(
description='Snapshot Tags'
)
def tags_str(self, result):
return result.snapshot.tags_str()
def cmd_str(self, result):
return format_html(
'<pre>{}</pre>',
' '.join(result.cmd) if isinstance(result.cmd, list) else str(result.cmd),
)
def output_str(self, result):
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
result.snapshot.timestamp,
result.output if (result.status == 'succeeded') and result.extractor not in ('title', 'archive_org') else 'index.html',
result.output,
)
def output_summary(self, result):
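# re-anchor the recorded pwd under the current DATA_DIR, in case the data dir moved since this result was saved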
snapshot_dir = Path(DATA_DIR) / str(result.pwd).split('data/', 1)[-1]
output_str = format_html(
'<pre style="display: inline-block">{}</pre><br/>',
result.output,
)
output_str += format_html('<a href="/archive/{}/index.html#all">See result files ...</a><br/><pre><code>', str(result.snapshot.timestamp))
path_from_output_str = (snapshot_dir / result.output)
output_str += format_html('<i style="padding: 1px">{}</i><b style="padding-right: 20px">/</b><i>{}</i><br/><hr/>', str(snapshot_dir), str(result.output))
if os.access(path_from_output_str, os.R_OK):
root_dir = str(path_from_output_str)
else:
root_dir = str(snapshot_dir)
# print(root_dir, str(list(os.walk(root_dir))))
for root, dirs, files in os.walk(root_dir):
depth = root.replace(root_dir, '').count(os.sep) + 1
if depth > 2:
continue
indent = ' ' * 4 * (depth)
output_str += format_html('<b style="padding: 1px">{}{}/</b><br/>', indent, os.path.basename(root))
indentation_str = ' ' * 4 * (depth + 1)
for filename in sorted(files):
is_hidden = filename.startswith('.')
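# dim hidden files: int(not is_hidden) yields 'opacity: 0.2' for dotfiles and 'opacity: 1.2' (clamped to 1 by the browser) otherwise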
output_str += format_html('<span style="opacity: {}.2">{}{}</span><br/>', int(not is_hidden), indentation_str, filename.strip())
return output_str + format_html('</code></pre>')
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(ArchiveResult, ArchiveResultAdmin)

View file

@ -0,0 +1,42 @@
__package__ = 'archivebox.core'
from django.contrib import admin
import abx.django.use
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Admin Views'
site_title = 'Admin'
namespace = 'admin'
archivebox_admin = ArchiveBoxAdmin()
archivebox_admin.disable_action('delete_selected')
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
# patch admin with methods to add data views (implemented by admin_data_views package)
# https://github.com/MrThearMan/django-admin-data-views
# https://mrthearman.github.io/django-admin-data-views/setup/
from admin_data_views.admin import get_app_list, admin_data_index_view, get_admin_data_urls, get_urls
archivebox_admin.get_app_list = get_app_list.__get__(archivebox_admin, ArchiveBoxAdmin)
archivebox_admin.admin_data_index_view = admin_data_index_view.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_admin_data_urls = get_admin_data_urls.__get__(archivebox_admin, ArchiveBoxAdmin) # type: ignore
archivebox_admin.get_urls = get_urls(archivebox_admin.get_urls).__get__(archivebox_admin, ArchiveBoxAdmin)
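# Note: `fn.__get__(instance, cls)` binds a plain function as a method of an existing
# instance, which is how the admin_data_views helpers above are grafted onto the site.
# A minimal sketch of the same binding technique (hypothetical names):
#
#   def example_view(self):
#       return self.site_header
#
#   archivebox_admin.example_view = example_view.__get__(archivebox_admin, ArchiveBoxAdmin)
#   assert archivebox_admin.example_view() == 'ArchiveBox'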
############### Admin Data View sections are defined in settings.ADMIN_DATA_VIEWS #########
def register_admin_site():
"""Replace the default admin site with our custom ArchiveBox admin site."""
from django.contrib import admin
from django.contrib.admin import sites
admin.site = archivebox_admin
sites.site = archivebox_admin
# register all plugins admin classes
abx.django.use.register_admin(archivebox_admin)
return archivebox_admin

View file

@ -0,0 +1,357 @@
__package__ = 'archivebox.core'
import os
from pathlib import Path
from django.contrib import admin, messages
from django.urls import path
from django.utils.html import format_html, mark_safe
from django.utils import timezone
from django import forms
from django.template import Template, RequestContext
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import FilteredSelectMultiple
from archivebox.config import DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.util import htmldecode, urldecode
from archivebox.misc.paginators import AccelleratedPaginator
from archivebox.search.admin import SearchResultsAdminMixin
from archivebox.logging_util import printable_filesize
from archivebox.index.html import snapshot_icons
from archivebox.extractors import archive_links
from archivebox.main import remove
from archivebox.abid_utils.admin import ABIDModelAdmin
from archivebox.queues.tasks import bg_archive_links, bg_add
from core.models import Tag
from core.admin_tags import TagInline
from core.admin_archiveresults import ArchiveResultInline, result_url
GLOBAL_CONTEXT = {'VERSION': VERSION, 'VERSIONS_AVAILABLE': [], 'CAN_UPGRADE': False}
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
label='Edit tags',
queryset=Tag.objects.all(),
required=False,
widget=FilteredSelectMultiple(
'core_tag__name',
False,
),
)
# TODO: allow selecting actions for specific extractors? is this useful?
# extractor = forms.ChoiceField(
# choices=ArchiveResult.EXTRACTOR_CHOICES,
# required=False,
# widget=forms.MultipleChoiceField(attrs={'class': "form-control"})
# )
class SnapshotAdmin(SearchResultsAdminMixin, ABIDModelAdmin):
list_display = ('created_at', 'title_str', 'files', 'size', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at')
readonly_fields = ('admin_actions', 'status_info', 'tags_str', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'abid_info', 'link_dir')
search_fields = ('id', 'url', 'abid', 'timestamp', 'title', 'tags__name')
list_filter = ('created_at', 'downloaded_at', 'archiveresult__status', 'created_by', 'tags__name')
fields = ('url', 'title', 'created_by', 'bookmarked_at', *readonly_fields)
ordering = ['-created_at']
actions = ['add_tags', 'remove_tags', 'update_titles', 'update_snapshots', 'resnapshot_snapshot', 'overwrite_snapshots', 'delete_snapshots']
inlines = [TagInline, ArchiveResultInline]
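# clamp the configured page size to a sane range (5..5000)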
list_per_page = min(max(5, SERVER_CONFIG.SNAPSHOTS_PER_PAGE), 5000)
action_form = SnapshotActionForm
paginator = AccelleratedPaginator
save_on_top = True
show_full_result_count = False
def changelist_view(self, request, extra_context=None):
self.request = request
extra_context = extra_context or {}
try:
return super().changelist_view(request, extra_context | GLOBAL_CONTEXT)
except Exception as e:
self.message_user(request, f'Error occurred while loading the page: {str(e)} {request.GET} {request.POST}')
return super().changelist_view(request, GLOBAL_CONTEXT)
def get_urls(self):
urls = super().get_urls()
custom_urls = [
path('grid/', self.admin_site.admin_view(self.grid_view), name='grid')
]
return custom_urls + urls
# def get_queryset(self, request):
# # tags_qs = SnapshotTag.objects.all().select_related('tag')
# # prefetch = Prefetch('snapshottag_set', queryset=tags_qs)
# self.request = request
# return super().get_queryset(request).prefetch_related('archiveresult_set').distinct() # .annotate(archiveresult_count=Count('archiveresult'))
@admin.display(
description="Imported Timestamp"
)
def imported_timestamp(self, obj):
context = RequestContext(self.request, {
'bookmarked_date': obj.bookmarked,
'timestamp': obj.timestamp,
})
html = Template("""{{bookmarked_date}} (<code>{{timestamp}}</code>)""")
return mark_safe(html.render(context))
# pretty_time = obj.bookmarked.strftime('%Y-%m-%d %H:%M:%S')
# return f'{pretty_time} ({obj.timestamp})'
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.pk,
# )
def admin_actions(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}">Summary page </a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/archive/{}/index.html#all">Result files 📑</a> &nbsp; &nbsp;
<a class="btn" style="font-size: 18px; display: inline-block; border-radius: 10px; border: 3px solid #eee; padding: 4px 8px" href="/admin/core/snapshot/?id__exact={}">Admin actions </a>
''',
obj.timestamp,
obj.timestamp,
obj.pk,
)
def status_info(self, obj):
return format_html(
# URL Hash: <code style="font-size: 10px; user-select: all">{}</code><br/>
'''
Archived: {} ({} files {}) &nbsp; &nbsp;
Favicon: <img src="{}" style="height: 20px"/> &nbsp; &nbsp;
Status code: {} &nbsp; &nbsp;<br/>
Server: {} &nbsp; &nbsp;
Content type: {} &nbsp; &nbsp;
Extension: {} &nbsp; &nbsp;
''',
'✅' if obj.is_archived else '❌',
obj.num_outputs,
self.size(obj) or '0kb',
f'/archive/{obj.timestamp}/favicon.ico',
obj.status_code or '-',
obj.headers and obj.headers.get('Server') or '-',
obj.headers and obj.headers.get('Content-Type') or '-',
obj.extension or '-',
)
@admin.display(
description='Title',
ordering='title',
)
def title_str(self, obj):
tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.pk, tag.name)
for tag in obj.tags.all()
if str(tag.name).strip()
)
return format_html(
'<a href="/{}">'
'<img src="/{}/favicon.ico" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/index.html">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path,
obj.archive_path,
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f' <span class="tags">{tags}</span>')
@admin.display(
description='Files Saved',
# ordering='archiveresult_count',
)
def files(self, obj):
# return '-'
return snapshot_icons(obj)
@admin.display(
# ordering='archiveresult_count'
)
def size(self, obj):
archive_size = os.access(Path(obj.link_dir) / 'index.html', os.F_OK) and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
size_txt = mark_safe(f'<b>{size_txt}</b>')
else:
size_txt = mark_safe('<span style="opacity: 0.3">...</span>')
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
size_txt,
)
@admin.display(
description='Original URL',
ordering='url',
)
def url_str(self, obj):
return format_html(
'<a href="{}"><code style="user-select: all;">{}</code></a>',
obj.url,
obj.url[:128],
)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)
# Save before monkey patching to restore for changelist list view
saved_change_list_template = self.change_list_template
saved_list_per_page = self.list_per_page
saved_list_max_show_all = self.list_max_show_all
# Monkey patch here plus core_tags.py
self.change_list_template = 'private_index_grid.html'
self.list_per_page = SERVER_CONFIG.SNAPSHOTS_PER_PAGE
self.list_max_show_all = self.list_per_page
# Call monkey patched view
rendered_response = self.changelist_view(request, extra_context=extra_context)
# Restore values
self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
# for debugging, uncomment this to print all requests:
# def changelist_view(self, request, extra_context=None):
# print('[*] Got request', request.method, request.POST)
# return super().changelist_view(request, extra_context=None)
@admin.action(
description=" Get Title"
)
def update_titles(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
if len(links) < 3:
# run synchronously if there are only 1 or 2 links
archive_links(links, overwrite=True, methods=('title','favicon'), out_dir=DATA_DIR)
messages.success(request, f"Title and favicon have been fetched and saved for {len(links)} URLs.")
else:
# otherwise run in a background worker
result = bg_archive_links((links,), kwargs={"overwrite": True, "methods": ["title", "favicon"], "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Title and favicon are updating in the background for {len(links)} URLs. {result_url(result)}"),
)
@admin.action(
description="⬇️ Get Missing"
)
def update_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
result = bg_archive_links((links,), kwargs={"overwrite": False, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Re-trying any previously failed methods for {len(links)} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="🆕 Archive Again"
)
def resnapshot_snapshot(self, request, queryset):
for snapshot in queryset:
timestamp = timezone.now().isoformat('T', 'seconds')
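# append a unique #fragment so the re-added URL doesn't collide with Snapshot.url's unique constraint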
new_url = snapshot.url.split('#')[0] + f'#{timestamp}'
result = bg_add({'urls': new_url, 'tag': snapshot.tags_str()})
messages.success(
request,
mark_safe(f"Creating new fresh snapshots for {queryset.count()} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="🔄 Redo"
)
def overwrite_snapshots(self, request, queryset):
links = [snapshot.as_link() for snapshot in queryset]
result = bg_archive_links((links,), kwargs={"overwrite": True, "out_dir": DATA_DIR})
messages.success(
request,
mark_safe(f"Clearing all previous results and re-downloading {len(links)} URLs in the background. {result_url(result)}"),
)
@admin.action(
description="☠️ Delete"
)
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=DATA_DIR)
messages.success(
request,
mark_safe(f"Succesfully deleted {queryset.count()} Snapshots. Don't forget to scrub URLs from import logs (data/sources) and error logs (data/logs) if needed."),
)
@admin.action(
description="+"
)
def add_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[+] Adding tags', tags, 'to Snapshots', queryset)
for obj in queryset:
obj.tags.add(*tags)
messages.success(
request,
f"Added {len(tags)} tags to {queryset.count()} Snapshots.",
)
@admin.action(
description=""
)
def remove_tags(self, request, queryset):
tags = request.POST.getlist('tags')
print('[-] Removing tags', tags, 'from Snapshots', queryset)
for obj in queryset:
obj.tags.remove(*tags)
messages.success(
request,
f"Removed {len(tags)} tags from {queryset.count()} Snapshots.",
)

View file

@ -0,0 +1,165 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from django.utils.html import format_html, mark_safe
import abx
from abid_utils.admin import ABIDModelAdmin
from archivebox.misc.paginators import AccelleratedPaginator
from core.models import Tag
class TagInline(admin.TabularInline):
model = Tag.snapshot_set.through # type: ignore
# fk_name = 'snapshot'
fields = ('id', 'tag')
extra = 1
# min_num = 1
max_num = 1000
autocomplete_fields = (
'tag',
)
# class AutocompleteTags:
# model = Tag
# search_fields = ['name']
# name = 'name'
# # source_field = 'name'
# remote_field = Tag._meta.get_field('name')
# class AutocompleteTagsAdminStub:
# name = 'admin'
# class TaggedItemInline(admin.TabularInline):
# readonly_fields = ('object_link',)
# fields = ('id', 'tag', 'content_type', 'object_id', *readonly_fields)
# model = TaggedItem
# extra = 1
# show_change_link = True
# @admin.display(description='object')
# def object_link(self, obj):
# obj = obj.content_type.get_object_for_this_type(pk=obj.object_id)
# return format_html('<a href="/admin/{}/{}/{}/change"><b>[{}]</b></a>', obj._meta.app_label, obj._meta.model_name, obj.pk, str(obj))
class TagAdmin(ABIDModelAdmin):
list_display = ('created_at', 'created_by', 'abid', 'name', 'num_snapshots', 'snapshots')
list_filter = ('created_at', 'created_by')
sort_fields = ('name', 'slug', 'abid', 'created_by', 'created_at')
readonly_fields = ('slug', 'abid', 'created_at', 'modified_at', 'abid_info', 'snapshots')
search_fields = ('abid', 'name', 'slug')
fields = ('name', 'created_by', *readonly_fields)
actions = ['delete_selected', 'merge_tags']
ordering = ['-created_at']
# inlines = [TaggedItemInline]
paginator = AccelleratedPaginator
def num_snapshots(self, tag):
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
tag.id,
tag.snapshot_set.count(),
)
def snapshots(self, tag):
total_count = tag.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> {}',
snap.pk,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in tag.snapshot_set.order_by('-downloaded_at')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={tag.id}">{total_count} total snapshots...</a>'))
# def get_urls(self):
# urls = super().get_urls()
# custom_urls = [
# path(
# "merge-tags/",
# self.admin_site.admin_view(self.merge_tags_view),
# name="taggit_tag_merge_tags",
# ),
# ]
# return custom_urls + urls
# @admin.action(description="Merge selected tags")
# def merge_tags(self, request, queryset):
# selected = request.POST.getlist(admin.helpers.ACTION_CHECKBOX_NAME)
# if not selected:
# self.message_user(request, "Please select at least one tag.")
# return redirect(request.get_full_path())
# selected_tag_ids = ",".join(selected)
# redirect_url = f"{request.get_full_path()}merge-tags/"
# request.session["selected_tag_ids"] = selected_tag_ids
# return redirect(redirect_url)
# def merge_tags_view(self, request):
# selected_tag_ids = request.session.get("selected_tag_ids", "").split(",")
# if request.method == "POST":
# form = MergeTagsForm(request.POST)
# if form.is_valid():
# new_tag_name = form.cleaned_data["new_tag_name"]
# new_tag, created = Tag.objects.get_or_create(name=new_tag_name)
# with transaction.atomic():
# for tag_id in selected_tag_ids:
# tag = Tag.objects.get(id=tag_id)
# tagged_items = TaggedItem.objects.filter(tag=tag)
# for tagged_item in tagged_items:
# if TaggedItem.objects.filter(
# tag=new_tag,
# content_type=tagged_item.content_type,
# object_id=tagged_item.object_id,
# ).exists():
# # we have the new tag as well, so we can just
# # remove the tag association
# tagged_item.delete()
# else:
# # point this taggedItem to the new one
# tagged_item.tag = new_tag
# tagged_item.save()
# # delete the old tag
# if tag.id != new_tag.id:
# tag.delete()
# self.message_user(request, "Tags have been merged", level="success")
# # clear the selected_tag_ids from session after merge is complete
# request.session.pop("selected_tag_ids", None)
# return redirect("..")
# else:
# self.message_user(request, "Form is invalid.", level="error")
# context = {
# "form": MergeTagsForm(),
# "selected_tag_ids": selected_tag_ids,
# }
# return render(request, "admin/taggit/merge_tags_form.html", context)
# @admin.register(SnapshotTag, site=archivebox_admin)
# class SnapshotTagAdmin(ABIDModelAdmin):
# list_display = ('id', 'snapshot', 'tag')
# sort_fields = ('id', 'snapshot', 'tag')
# search_fields = ('id', 'snapshot_id', 'tag_id')
# fields = ('snapshot', 'id')
# actions = ['delete_selected']
# ordering = ['-id']
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(Tag, TagAdmin)

View file

@ -0,0 +1,91 @@
__package__ = 'archivebox.core'
from django.contrib import admin
from django.contrib.auth.admin import UserAdmin
from django.utils.html import format_html, mark_safe
from django.contrib.auth import get_user_model
import abx
class CustomUserAdmin(UserAdmin):
sort_fields = ['id', 'email', 'username', 'is_superuser', 'last_login', 'date_joined']
list_display = ['username', 'id', 'email', 'is_superuser', 'last_login', 'date_joined']
readonly_fields = ('snapshot_set', 'archiveresult_set', 'tag_set', 'apitoken_set', 'outboundwebhook_set')
fieldsets = [*UserAdmin.fieldsets, ('Data', {'fields': readonly_fields})]
@admin.display(description='Snapshots')
def snapshot_set(self, obj):
total_count = obj.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> {}',
snap.pk,
snap.abid,
snap.downloaded_at.strftime('%Y-%m-%d %H:%M') if snap.downloaded_at else 'pending...',
snap.url[:64],
)
for snap in obj.snapshot_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/snapshot/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='Archive Result Logs')
def archiveresult_set(self, obj):
total_count = obj.archiveresult_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/core/archiveresult/{}/change"><b>[{}]</b></a></code> <b>📅 {}</b> <b>📄 {}</b> {}',
result.pk,
result.abid,
result.snapshot.downloaded_at.strftime('%Y-%m-%d %H:%M') if result.snapshot.downloaded_at else 'pending...',
result.extractor,
result.snapshot.url[:64],
)
for result in obj.archiveresult_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/archiveresult/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='Tags')
def tag_set(self, obj):
total_count = obj.tag_set.count()
return mark_safe(', '.join(
format_html(
'<code><a href="/admin/core/tag/{}/change"><b>{}</b></a></code>',
tag.pk,
tag.name,
)
for tag in obj.tag_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/core/tag/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='API Tokens')
def apitoken_set(self, obj):
total_count = obj.apitoken_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/apitoken/{}/change"><b>[{}]</b></a></code> {} (expires {})',
apitoken.pk,
apitoken.abid,
apitoken.token_redacted[:64],
apitoken.expires,
)
for apitoken in obj.apitoken_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/apitoken/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@admin.display(description='API Outbound Webhooks')
def outboundwebhook_set(self, obj):
total_count = obj.outboundwebhook_set.count()
return mark_safe('<br/>'.join(
format_html(
'<code><a href="/admin/api/outboundwebhook/{}/change"><b>[{}]</b></a></code> {} -> {}',
outboundwebhook.pk,
outboundwebhook.abid,
outboundwebhook.referenced_model,
outboundwebhook.endpoint,
)
for outboundwebhook in obj.outboundwebhook_set.order_by('-modified_at')[:10]
) + f'<br/><a href="/admin/api/outboundwebhook/?created_by__id__exact={obj.pk}">{total_count} total records...</a>')
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(get_user_model(), CustomUserAdmin)

View file

@ -2,27 +2,22 @@ __package__ = 'archivebox.core'
from django.apps import AppConfig
import abx
class CoreConfig(AppConfig):
name = 'core'
def ready(self):
# register our custom admin as the primary django admin
from django.contrib import admin
from django.contrib.admin import sites
from core.admin import archivebox_admin
admin.site = archivebox_admin
sites.site = archivebox_admin
# register signal handlers
from .auth import register_signals
register_signals()
"""Register the archivebox.core.admin_site as the main django admin site"""
from core.admin_site import register_admin_site
register_admin_site()
# from django.contrib.admin.apps import AdminConfig
# class CoreAdminConfig(AdminConfig):
# default_site = "core.admin.get_admin_site"
@abx.hookimpl
def register_admin(admin_site):
"""Register the core.models views (Snapshot, ArchiveResult, Tag, etc.) with the admin site"""
from core.admin import register_admin
register_admin(admin_site)

View file

@ -1,12 +0,0 @@
__package__ = 'archivebox.core'
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
def register_signals():
if LDAP_CONFIG.LDAP_ENABLED:
import django_auth_ldap.backend
from .auth_ldap import create_user
django_auth_ldap.backend.populate_user.connect(create_user)

View file

@ -1,8 +0,0 @@
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
def create_user(sender, user=None, ldap_user=None, **kwargs):
if not user.id and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
user.is_superuser = True
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')

View file

@ -1,101 +0,0 @@
# Generated by Django 5.1.1 on 2024-10-01 02:10
import abid_utils.models
import charidfield.fields
import django.core.validators
import django.db.models.deletion
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
("core", "0074_alter_snapshot_downloaded_at"),
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name="Crawl",
fields=[
(
"id",
models.UUIDField(
default=None,
editable=False,
primary_key=True,
serialize=False,
unique=True,
verbose_name="ID",
),
),
(
"abid",
charidfield.fields.CharIDField(
blank=True,
db_index=True,
default=None,
help_text="ABID-format identifier for this entity (e.g. snp_01BJQMF54D093DXEAWZ6JYRPAQ)",
max_length=30,
null=True,
prefix="crl_",
unique=True,
),
),
(
"created_at",
abid_utils.models.AutoDateTimeField(db_index=True, default=None),
),
("modified_at", models.DateTimeField(auto_now=True)),
("urls", models.TextField()),
(
"depth",
models.PositiveSmallIntegerField(
default=1,
validators=[
django.core.validators.MinValueValidator(0),
django.core.validators.MaxValueValidator(2),
],
),
),
(
"parser",
models.CharField(
choices=[
("auto", "auto"),
("pocket_api", "Pocket API"),
("readwise_reader_api", "Readwise Reader API"),
("wallabag_atom", "Wallabag Atom"),
("pocket_html", "Pocket HTML"),
("pinboard_rss", "Pinboard RSS"),
("shaarli_rss", "Shaarli RSS"),
("medium_rss", "Medium RSS"),
("netscape_html", "Netscape HTML"),
("rss", "Generic RSS"),
("json", "Generic JSON"),
("jsonl", "Generic JSONL"),
("html", "Generic HTML"),
("txt", "Generic TXT"),
("url_list", "URL List"),
],
default="auto",
max_length=32,
),
),
(
"created_by",
models.ForeignKey(
default=None,
on_delete=django.db.models.deletion.CASCADE,
related_name="crawl_set",
to=settings.AUTH_USER_MODEL,
),
),
],
options={
"verbose_name": "Crawl",
"verbose_name_plural": "Crawls",
},
),
]

View file

@ -15,7 +15,6 @@ from django.utils.text import slugify
from django.core.cache import cache
from django.urls import reverse, reverse_lazy
from django.db.models import Case, When, Value, IntegerField
from django.core.validators import MaxValueValidator, MinValueValidator
from django.contrib import admin
from django.conf import settings
@ -23,6 +22,7 @@ from archivebox.config import CONSTANTS
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from queues.tasks import bg_archive_snapshot
# from crawls.models import Crawl
# from machine.models import Machine, NetworkInterface
from archivebox.misc.system import get_dir_size
@ -30,7 +30,6 @@ from archivebox.misc.util import parse_date, base_url
from ..index.schema import Link
from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
from ..parsers import PARSERS
# class BaseModel(models.Model):
@ -45,9 +44,11 @@ from ..parsers import PARSERS
class Tag(ABIDModel):
"""
Based on django-taggit model + ABID base.
Loosely based on django-taggit model + ABID base.
"""
abid_prefix = 'tag_'
abid_ts_src = 'self.created_at'
@ -68,7 +69,7 @@ class Tag(ABIDModel):
# slug is autoset on save from name, never set it manually
snapshot_set: models.Manager['Snapshot']
crawl_set: models.Manager['Crawl']
# crawl_set: models.Manager['Crawl']
class Meta(TypedModelMeta):
verbose_name = "Tag"
@ -83,8 +84,12 @@ class Tag(ABIDModel):
slug += "_%d" % i
return slug
def clean(self, *args, **kwargs):
self.slug = self.slug or self.slugify(self.name)
super().clean(*args, **kwargs)
def save(self, *args, **kwargs):
if self._state.adding and not self.slug:
if self._state.adding:
self.slug = self.slugify(self.name)
# if name is different but slug conflicts with another tag's slug, append a counter
@ -114,6 +119,8 @@ class Tag(ABIDModel):
def api_docs_url(self) -> str:
return '/api/v1/docs#/Core%20Models/api_v1_core_get_tag'
class SnapshotTag(models.Model):
id = models.AutoField(primary_key=True)
@ -136,69 +143,6 @@ class SnapshotTag(models.Model):
# unique_together = [('crawl', 'tag')]
class Crawl(ABIDModel):
abid_prefix = 'crl_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.urls'
abid_subtype_src = 'self.crawler'
abid_rand_src = 'self.id'
abid_drift_allowed = True
# CRAWLER_CHOICES = (
# ('breadth_first', 'Breadth-First'),
# ('depth_first', 'Depth-First'),
# )
PARSER_CHOICES = (
('auto', 'auto'),
*((parser_key, value[0]) for parser_key, value in PARSERS.items()),
)
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
urls = models.TextField(blank=False, null=False)
depth = models.PositiveSmallIntegerField(default=1, validators=[MinValueValidator(0), MaxValueValidator(2)])
parser = models.CharField(choices=PARSER_CHOICES, default='auto', max_length=32)
# crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
# tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
# schedule = models.JSONField()
# config = models.JSONField()
class Meta(TypedModelMeta):
verbose_name = 'Crawl'
verbose_name_plural = 'Crawls'
def __str__(self):
return self.parser
@cached_property
def crawl_dir(self):
return Path()
@property
def api_url(self) -> str:
# /api/v1/core/crawl/{abid}
return reverse_lazy('api-1:get_crawl', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
@property
def api_docs_url(self) -> str:
return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
# def get_absolute_url(self):
# return f'/crawls/{self.abid}'
def crawl(self):
# write self.urls to sources/crawl__<user>__YYYYMMDDHHMMSS.txt
# run parse_links(sources/crawl__<user>__YYYYMMDDHHMMSS.txt, parser=self.parser) and for each resulting link:
# create a Snapshot
# enqueue task bg_archive_snapshot(snapshot)
pass
@ -227,6 +171,8 @@ class Snapshot(ABIDModel):
bookmarked_at = AutoDateTimeField(default=None, null=False, editable=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
# crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set')
url = models.URLField(unique=True, db_index=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
@ -561,9 +507,10 @@ class ArchiveResult(ABIDModel):
# return f'[{self.abid}] 📅 {self.start_ts.strftime("%Y-%m-%d %H:%M")} 📄 {self.extractor} {self.snapshot.url}'
return self.extractor
@cached_property
def machine(self):
return self.iface.machine if self.iface else None
# TODO: finish connecting machine.models
# @cached_property
# def machine(self):
# return self.iface.machine if self.iface else None
@cached_property
def snapshot_dir(self):

View file

@ -10,7 +10,7 @@ from django.utils.crypto import get_random_string
import abx
import abx.archivebox
import abx.archivebox.use
import abx.archivebox.reads
import abx.django.use
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
@ -19,8 +19,7 @@ from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
IS_GETTING_VERSION_OR_HELP = 'version' in sys.argv or 'help' in sys.argv or '--version' in sys.argv or '--help' in sys.argv
################################################################################
### ArchiveBox Plugin Settings
@ -41,7 +40,7 @@ BUILTIN_PLUGIN_DIRS = {
'plugins_extractor': PACKAGE_DIR / 'plugins_extractor',
}
USER_PLUGIN_DIRS = {
'user_plugins': DATA_DIR / 'user_plugins',
# 'user_plugins': DATA_DIR / 'user_plugins',
}
# Discover ArchiveBox plugins
@ -52,19 +51,18 @@ ALL_PLUGINS = {**BUILTIN_PLUGINS, **PIP_PLUGINS, **USER_PLUGINS}
# Load ArchiveBox plugins
PLUGIN_MANAGER = abx.pm
PLUGINS = abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
HOOKS = abx.archivebox.use.get_HOOKS(PLUGINS)
abx.archivebox.load_archivebox_plugins(PLUGIN_MANAGER, ALL_PLUGINS)
PLUGINS = abx.archivebox.reads.get_PLUGINS()
# Load ArchiveBox config from plugins
CONFIGS = abx.archivebox.use.get_CONFIGS()
FLAT_CONFIG = abx.archivebox.use.get_FLAT_CONFIG()
BINPROVIDERS = abx.archivebox.use.get_BINPROVIDERS()
BINARIES = abx.archivebox.use.get_BINARIES()
EXTRACTORS = abx.archivebox.use.get_EXTRACTORS()
REPLAYERS = abx.archivebox.use.get_REPLAYERS()
ADMINDATAVIEWS = abx.archivebox.use.get_ADMINDATAVIEWS()
QUEUES = abx.archivebox.use.get_QUEUES()
SEARCHBACKENDS = abx.archivebox.use.get_SEARCHBACKENDS()
CONFIGS = abx.archivebox.reads.get_CONFIGS()
CONFIG = FLAT_CONFIG = abx.archivebox.reads.get_FLAT_CONFIG()
BINPROVIDERS = abx.archivebox.reads.get_BINPROVIDERS()
BINARIES = abx.archivebox.reads.get_BINARIES()
EXTRACTORS = abx.archivebox.reads.get_EXTRACTORS()
SEARCHBACKENDS = abx.archivebox.reads.get_SEARCHBACKENDS()
# REPLAYERS = abx.archivebox.reads.get_REPLAYERS()
# ADMINDATAVIEWS = abx.archivebox.reads.get_ADMINDATAVIEWS()
################################################################################
@ -101,10 +99,13 @@ INSTALLED_APPS = [
'django_object_actions', # provides easy Django Admin action buttons on change views https://github.com/crccheck/django-object-actions
# Our ArchiveBox-provided apps
#'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
# 'abid_utils', # handles ABID ID creation, handling, and models
'config', # ArchiveBox config settings (loaded as a plugin, don't need to add it here)
'machine', # handles collecting and storing information about the host machine, network interfaces, installed binaries, etc.
'queues', # handles starting and managing background workers and processes
'abid_utils', # handles ABID ID creation, handling, and models
'seeds', # handles Seed model and URL source management
'crawls', # handles Crawl and CrawlSchedule models and management
'personas', # handles Persona and session management
'core', # core django model with Snapshot, ArchiveResult, etc.
'api', # Django-Ninja-based Rest API interfaces, config, APIToken model, etc.
@ -262,7 +263,8 @@ MIGRATION_MODULES = {'signal_webhooks': None}
DEFAULT_AUTO_FIELD = 'django.db.models.BigAutoField'
HUEY = {
if not IS_GETTING_VERSION_OR_HELP: # don't create queue.sqlite3 file if we're just running to get --version or --help
HUEY = {
"huey_class": "huey.SqliteHuey",
"filename": CONSTANTS.QUEUE_DATABASE_FILENAME,
"name": "system_tasks",
@ -281,18 +283,18 @@ HUEY = {
"check_worker_health": True, # Enable worker health checks.
"health_check_interval": 1, # Check worker health every second.
},
}
}
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
# https://huey.readthedocs.io/en/latest/contrib.html#setting-things-up
# https://github.com/gaiacoop/django-huey
DJANGO_HUEY = {
"default": "system_tasks",
"queues": {
HUEY["name"]: HUEY.copy(),
# more registered here at plugin import-time by BaseQueue.register()
**abx.django.use.get_DJANGO_HUEY_QUEUES(QUEUE_DATABASE_NAME=CONSTANTS.QUEUE_DATABASE_FILENAME),
},
}
}
class HueyDBRouter:
"""
@ -410,7 +412,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'core' / 'shell_welcome_message.py')
os.environ['PYTHONSTARTUP'] = str(PACKAGE_DIR / 'misc' / 'shell_welcome_message.py')
################################################################################
@ -610,6 +612,6 @@ if DEBUG_REQUESTS_TRACKER:
abx.django.use.register_checks()
abx.archivebox.use.register_all_hooks(globals())
# abx.archivebox.reads.register_all_hooks(globals())
# import ipdb; ipdb.set_trace()

View file

@ -5,9 +5,10 @@ from django.views import static
from django.conf import settings
from django.views.generic.base import RedirectView
from .admin import archivebox_admin
from .views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
from .serve_static import serve_static
from archivebox.misc.serve_static import serve_static
from core.admin_site import archivebox_admin
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
# from archivebox.config import VERSION, VERSIONS_AVAILABLE, CAN_UPGRADE

View file

@ -24,16 +24,15 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
from core.models import Snapshot
from core.forms import AddLinkForm
from core.admin import result_url
from queues.tasks import bg_add
from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.serve_static import serve_static_with_byterange_support
from .serve_static import serve_static_with_byterange_support
from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from ..plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from ..logging_util import printable_filesize
from ..search import query_search_index
@ -452,6 +451,8 @@ class AddView(UserPassesTestMixin, FormView):
}
def form_valid(self, form):
from core.admin_archiveresults import result_url
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
parser = form.cleaned_data["parser"]
@ -502,7 +503,7 @@ def find_config_section(key: str) -> str:
if key in CONSTANTS_CONFIG:
return 'CONSTANT'
matching_sections = [
section.id for section in settings.CONFIGS.values() if key in section.model_fields
section_id for section_id, section in settings.CONFIGS.items() if key in section.model_fields
]
section = matching_sections[0] if matching_sections else 'DYNAMIC'
return section
@ -559,9 +560,9 @@ def live_config_list_view(request: HttpRequest, **kwargs) -> TableContext:
# "Aliases": [],
}
for section in reversed(list(settings.CONFIGS.values())):
for section_id, section in reversed(list(settings.CONFIGS.items())):
for key, field in section.model_fields.items():
rows['Section'].append(section.id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Section'].append(section_id) # section.replace('_', ' ').title().replace(' Config', '')
rows['Key'].append(ItemLink(key, key=key))
rows['Type'].append(format_html('<code>{}</code>', find_config_type(key)))
rows['Value'].append(mark_safe(f'<code>{getattr(section, key)}</code>') if key_is_safe(key) else '******** (redacted)')
@ -612,7 +613,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
"fields": {
'Key': key,
'Type': find_config_type(key),
'Value': settings.FLAT_CONFIG[key] if key_is_safe(key) else '********',
'Value': settings.FLAT_CONFIG.get(key, settings.CONFIGS.get(key, None)) if key_is_safe(key) else '********',
},
"help_texts": {
'Key': mark_safe(f'''

View file

@ -0,0 +1,28 @@
__package__ = 'archivebox.crawls'
import abx
from abid_utils.admin import ABIDModelAdmin
from crawls.models import Crawl
class CrawlAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
sort_fields = ('abid', 'created_at', 'created_by', 'depth', 'parser', 'urls')
search_fields = ('abid', 'created_by__username', 'depth', 'parser', 'urls')
readonly_fields = ('created_at', 'modified_at', 'abid_info')
fields = ('urls', 'depth', 'parser', 'created_by', *readonly_fields)
list_filter = ('depth', 'parser', 'created_by')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(Crawl, CrawlAdmin)

View file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class CrawlsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "crawls"

archivebox/crawls/models.py Normal file (164 lines added)
View file

@ -0,0 +1,164 @@
__package__ = 'archivebox.crawls'
from django_stubs_ext.db.models import TypedModelMeta
from django.db import models
from django.db.models import Q
from django.core.validators import MaxValueValidator, MinValueValidator
from django.conf import settings
from django.utils import timezone
from django.urls import reverse_lazy
from seeds.models import Seed
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
class CrawlSchedule(ABIDModel, ModelWithHealthStats):
"""
A record for a job that should run repeatedly on a given schedule.
It pulls from a given Seed and creates a new Crawl for each scheduled run.
The new Crawl will inherit all the properties of the template Crawl (see .template below).
"""
abid_prefix = 'sch_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.created_by_id'
abid_subtype_src = 'self.schedule'
abid_rand_src = 'self.id'
schedule = models.CharField(max_length=64, blank=False, null=False)
is_enabled = models.BooleanField(default=True)
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
crawl_set: models.Manager['Crawl']
@property
def template(self):
"""The base crawl that each new scheduled job should copy as a template"""
return self.crawl_set.first()
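# A minimal sketch (hypothetical helper, not part of this commit) of how a scheduled
# run could copy the template into a fresh Crawl, using the fields defined below:
#
#   def create_crawl(self):
#       template = self.template
#       assert template is not None, 'a CrawlSchedule needs at least one Crawl to copy'
#       return Crawl.objects.create(
#           seed=template.seed,
#           max_depth=template.max_depth,
#           tags_str=template.tags_str,
#           persona=template.persona,
#           config=template.config,
#           schedule=self,
#           created_by_id=template.created_by_id,
#       )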
class Crawl(ABIDModel, ModelWithHealthStats):
"""
A single session of URLs to archive starting from a given Seed and expanding outwards. An "archiving session" so to speak.
A new Crawl should be created for each load from a Seed (because a Seed can produce a different set of URLs every time it's loaded).
E.g. every scheduled import from an RSS feed should create a new Crawl, and each subsequent load from the same Seed creates a new Crawl as well.
Every "Add" task triggered from the Web UI, CLI, or Scheduled Crawl should create a new Crawl with the seed set to a
file URI e.g. file:///sources/<date>_{ui,cli}_add.txt containing the user's input.
"""
abid_prefix = 'crl_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.seed.uri'
abid_subtype_src = 'self.persona_id'
abid_rand_src = 'self.id'
abid_drift_allowed = True
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
abid = ABIDField(prefix=abid_prefix)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False, related_name='crawl_set')
created_at = AutoDateTimeField(default=None, null=False, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
seed = models.ForeignKey(Seed, on_delete=models.PROTECT, related_name='crawl_set', null=False, blank=False)
max_depth = models.PositiveSmallIntegerField(default=0, validators=[MinValueValidator(0), MaxValueValidator(4)])
tags_str = models.CharField(max_length=1024, blank=True, null=False, default='')
persona = models.CharField(max_length=32, blank=True, null=False, default='auto')
config = models.JSONField(default=dict)
schedule = models.ForeignKey(CrawlSchedule, on_delete=models.SET_NULL, null=True, blank=True, editable=True)
# crawler = models.CharField(choices=CRAWLER_CHOICES, default='breadth_first', max_length=32)
# tags = models.ManyToManyField(Tag, blank=True, related_name='crawl_set', through='CrawlTag')
# schedule = models.JSONField()
# config = models.JSONField()
# snapshot_set: models.Manager['Snapshot']
class Meta(TypedModelMeta):
verbose_name = 'Crawl'
verbose_name_plural = 'Crawls'
@property
def template(self):
"""If this crawl was created under a ScheduledCrawl, returns the original template Crawl it was based off"""
if not self.schedule:
return None
return self.schedule.template
@property
def api_url(self) -> str:
# /api/v1/core/crawl/{abid}
# TODO: implement get_crawl
return reverse_lazy('api-1:get_crawl', args=[self.abid]) # + f'?api_key={get_or_create_api_token(request.user)}'
@property
def api_docs_url(self) -> str:
return '/api/v1/docs#/Core%20Models/api_v1_core_get_crawl'
class Outlink(models.Model):
"""A record of a link found on a page, pointing to another page."""
id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
src = models.URLField() # parent page where the outlink/href was found e.g. https://example.com/downloads
dst = models.URLField() # remote location the child outlink/href points to e.g. https://example.com/downloads/some_file.pdf
crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, null=False, blank=False, related_name='outlink_set')
via = models.ForeignKey('core.ArchiveResult', on_delete=models.SET_NULL, null=True, blank=True, related_name='outlink_set')
class Meta:
unique_together = (('src', 'dst', 'via'),)
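# A minimal sketch (hypothetical usage) of recording a discovered link edge without
# duplicating it; the explicit uuid is assumed necessary since this pk has no default:
#
#   import uuid
#   outlink, created = Outlink.objects.get_or_create(
#       src='https://example.com/downloads',
#       dst='https://example.com/downloads/some_file.pdf',
#       crawl=crawl,
#       via=archiveresult,
#       defaults={'id': uuid.uuid4()},
#   )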
# @abx.hookimpl.on_archiveresult_created
# def exec_archiveresult_extractor_effects(archiveresult):
# config = get_scope_config(...)
# # abx.archivebox.writes.update_archiveresult_started(archiveresult, start_ts=timezone.now())
# # abx.archivebox.events.on_archiveresult_updated(archiveresult)
# # check if it should be skipped
# if not abx.archivebox.reads.get_archiveresult_should_run(archiveresult, config):
# abx.archivebox.writes.update_archiveresult_skipped(archiveresult, status='skipped')
# abx.archivebox.events.on_archiveresult_skipped(archiveresult, config)
# return
# # run the extractor method and save the output back to the archiveresult
# try:
# output = abx.archivebox.effects.exec_archiveresult_extractor(archiveresult, config)
# abx.archivebox.writes.update_archiveresult_succeeded(archiveresult, output=output, error=None, end_ts=timezone.now())
# except Exception as e:
# abx.archivebox.writes.update_archiveresult_failed(archiveresult, error=e, end_ts=timezone.now())
# # bump the modified time on the archiveresult and Snapshot
# abx.archivebox.events.on_archiveresult_updated(archiveresult)
# abx.archivebox.events.on_snapshot_updated(archiveresult.snapshot)
# @abx.hookimpl.reads.get_outlink_parents
# def get_outlink_parents(url, crawl_pk=None, config=None):
# scope = Q(dst=url)
# if crawl_pk:
# scope = scope | Q(via__snapshot__crawl_id=crawl_pk)
# parent = list(Outlink.objects.filter(scope))
# if not parent:
# # base case: we reached the top of the chain, no more parents left
# return []
# # recursive case: there is another parent above us, get its parents
# yield parent[0]
# yield from get_outlink_parents(parent[0].src, crawl_pk=crawl_pk, config=config)

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

@ -8,8 +8,9 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..logging_util import TimedProgress

View file

@ -11,6 +11,9 @@ from archivebox.misc.util import (
)
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'output.html'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -34,8 +36,6 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -4,8 +4,9 @@ from pathlib import Path
from archivebox.misc.system import chmod_file, run
from archivebox.misc.util import enforce_types, domain, dedupe
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress

View file

@ -13,10 +13,12 @@ from archivebox.misc.util import (
without_query,
without_fragment,
)
from archivebox.plugins_extractor.git.apps import GIT_CONFIG, GIT_BINARY
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.plugins_extractor.git.config import GIT_CONFIG
from archivebox.plugins_extractor.git.binaries import GIT_BINARY
def get_output_path():
return 'git/'

View file

@ -10,7 +10,8 @@ from archivebox.misc.util import (
get_headers,
dedupe,
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..logging_util import TimedProgress

View file

@ -3,11 +3,13 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.ytdlp.config import YTDLP_CONFIG
from plugins_extractor.ytdlp.binaries import YTDLP_BINARY
def get_output_path():
return 'media/'
@ -25,7 +27,6 @@ def get_embed_path(archiveresult=None):
@enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
if is_static_file(link.url):
return False
@ -40,10 +41,6 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
# from plugins_extractor.chrome.apps import CHROME_CONFIG
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
YTDLP_BIN = YTDLP_BINARY.load()
assert YTDLP_BIN.abspath and YTDLP_BIN.version

View file

@ -12,7 +12,8 @@ from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from archivebox.plugins_extractor.mercury.apps import MERCURY_CONFIG, MERCURY_BINARY
from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
from ..logging_util import TimedProgress

View file

@ -3,14 +3,17 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'output.pdf'
@ -18,7 +21,6 @@ def get_output_path():
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -34,8 +36,6 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -6,12 +6,16 @@ from tempfile import NamedTemporaryFile
from typing import Optional
import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write
from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress
from .title import get_html
from plugins_extractor.readability.config import READABILITY_CONFIG
from plugins_extractor.readability.binaries import READABILITY_BINARY
def get_output_path():
return 'readability/'
@ -21,7 +25,6 @@ def get_embed_path(archiveresult=None):
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.readability.apps import READABILITY_CONFIG
if is_static_file(link.url):
return False
@ -37,8 +40,6 @@ def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite:
def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
"""download reader friendly version using @mozilla/readability"""
from plugins_extractor.readability.apps import READABILITY_CONFIG, READABILITY_BINARY
READABILITY_BIN = READABILITY_BINARY.load()
assert READABILITY_BIN.abspath and READABILITY_BIN.version

View file

@ -3,11 +3,14 @@ __package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
return 'screenshot.png'
@ -15,7 +18,6 @@ def get_output_path():
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -30,7 +32,6 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version

View file

@ -10,6 +10,11 @@ from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
from plugins_extractor.singlefile.config import SINGLEFILE_CONFIG
from plugins_extractor.singlefile.binaries import SINGLEFILE_BINARY
def get_output_path():
return 'singlefile.html'
@ -17,7 +22,6 @@ def get_output_path():
@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
if is_static_file(link.url):
return False
@ -26,16 +30,13 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
return CHROME_CONFIG.USE_CHROME and SINGLEFILE_CONFIG.SAVE_SINGLEFILE
@enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""download full site using single-file"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version
SINGLEFILE_BIN = SINGLEFILE_BINARY.load()

View file

@ -11,7 +11,9 @@ from archivebox.misc.util import (
htmldecode,
dedupe,
)
from archivebox.plugins_extractor.curl.apps import CURL_CONFIG, CURL_BINARY
from archivebox.plugins_extractor.curl.config import CURL_CONFIG
from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress

View file

@ -17,8 +17,8 @@ from archivebox.misc.util import (
urldecode,
dedupe,
)
from archivebox.plugins_extractor.wget.apps import WGET_BINARY, WGET_CONFIG
from archivebox.plugins_extractor.wget.config import WGET_CONFIG
from archivebox.plugins_extractor.wget.binaries import WGET_BINARY
from ..logging_util import TimedProgress
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError

View file

@ -19,7 +19,7 @@ from archivebox.misc.util import (
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
from .schema import Link
from ..logging_util import printable_filesize

View file

@ -19,7 +19,7 @@ from django.utils.functional import cached_property
from archivebox.config import ARCHIVE_DIR, CONSTANTS
from plugins_extractor.favicon.apps import FAVICON_CONFIG
from plugins_extractor.favicon.config import FAVICON_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import ts_to_date_str, parse_date

View file

@ -160,4 +160,4 @@ def apply_migrations(out_dir: Path=DATA_DIR) -> List[str]:
@enforce_types
def get_admins(out_dir: Path=DATA_DIR) -> List[str]:
from django.contrib.auth.models import User
return User.objects.filter(is_superuser=True)
return User.objects.filter(is_superuser=True).exclude(username='system')

View file

@ -510,7 +510,7 @@ def log_removal_finished(all_links: int, to_remove: int):
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = str(Path(pwd)) # .resolve()
path = str(path)
@ -520,7 +520,10 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR) -> str:
# replace long absolute paths with ./ relative ones to save on terminal output width
if path.startswith(pwd) and (pwd != '/') and path != pwd:
if color:
path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
else:
path = path.replace(pwd, '.', 1)
# quote paths containing spaces
if ' ' in path:
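With a hypothetical DATA_DIR of /data, the new color flag behaves roughly like this:

pretty_path('/data/archive/1729/index.json', pwd='/data', color=False)
# -> './archive/1729/index.json'
pretty_path('/data/archive/1729/index.json', pwd='/data')
# -> '[light_slate_blue].[/light_slate_blue]/archive/1729/index.json' (rich markup for the terminal)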

View file

@ -0,0 +1,94 @@
__package__ = 'archivebox.machine'
import abx
from django.contrib import admin
from django.utils.html import format_html
from abid_utils.admin import ABIDModelAdmin
from machine.models import Machine, NetworkInterface, InstalledBinary
class MachineAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
sort_fields = ('abid', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
# search_fields = ('id', 'abid', 'guid', 'hostname', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release')
readonly_fields = ('guid', 'created_at', 'modified_at', 'abid_info', 'ips')
fields = (*readonly_fields, 'hostname', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'hw_uuid', 'os_arch', 'os_family', 'os_platform', 'os_kernel', 'os_release', 'stats', 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('hw_in_docker', 'hw_in_vm', 'os_arch', 'os_family', 'os_platform')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Public IP',
ordering='networkinterface__ip_public',
)
def ips(self, machine):
return format_html(
'<a href="/admin/machine/networkinterface/?q={}"><b><code>{}</code></b></a>',
machine.abid,
', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
)
class NetworkInterfaceAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
sort_fields = ('abid', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
search_fields = ('abid', 'machine__abid', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
readonly_fields = ('machine', 'created_at', 'modified_at', 'abid_info', 'mac_address', 'ip_public', 'ip_local', 'dns_server')
fields = (*readonly_fields, 'iface', 'hostname', 'isp', 'city', 'region', 'country', 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('isp', 'country', 'region')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Machine',
ordering='machine__abid',
)
def machine_info(self, iface):
return format_html(
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> &nbsp; {}</a>',
iface.machine.id,
iface.machine.abid,
iface.machine.hostname,
)
class InstalledBinaryAdmin(ABIDModelAdmin):
list_display = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'health')
sort_fields = ('abid', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
search_fields = ('abid', 'machine__abid', 'name', 'binprovider', 'version', 'abspath', 'sha256')
readonly_fields = ('created_at', 'modified_at', 'abid_info')
fields = ('machine', 'name', 'binprovider', 'abspath', 'version', 'sha256', *readonly_fields, 'num_uses_succeeded', 'num_uses_failed')
list_filter = ('name', 'binprovider', 'machine_id')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(
description='Machine',
ordering='machine__abid',
)
def machine_info(self, installed_binary):
return format_html(
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> &nbsp; {}</a>',
installed_binary.machine.id,
installed_binary.machine.abid,
installed_binary.machine.hostname,
)
@abx.hookimpl
def register_admin(admin_site):
admin_site.register(Machine, MachineAdmin)
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
admin_site.register(InstalledBinary, InstalledBinaryAdmin)

View file

@ -2,9 +2,17 @@ __package__ = 'archivebox.machine'
from django.apps import AppConfig
import abx
class MachineConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'machine'
verbose_name = 'Machine Info'
@abx.hookimpl
def register_admin(admin_site):
from machine.admin import register_admin
register_admin(admin_site)

View file

@ -8,66 +8,41 @@ from django.db import models
from django.utils import timezone
from django.utils.functional import cached_property
import abx.archivebox.reads
import abx.archivebox.use
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider
from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from archivebox.abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
CURRENT_MACHINE = None # global cache for the current machine
CURRENT_INTERFACE = None # global cache for the current network interface
CURRENT_BINARIES = {} # global cache for the currently installed binaries
_CURRENT_MACHINE = None # global cache for the current machine
_CURRENT_INTERFACE = None # global cache for the current network interface
_CURRENT_BINARIES = {} # global cache for the currently installed binaries
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60 # 1 week (how often should we check for OS/hardware changes?)
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60 # 1 hour (how often should we check for public IP/private IP/DNS changes?)
INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60 # 30min (how often should we check for changes to locally installed binaries?)
class ModelWithHealthStats(models.Model):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
class Meta:
abstract = True
def record_health_failure(self) -> None:
self.num_uses_failed += 1
self.save()
def record_health_success(self) -> None:
self.num_uses_succeeded += 1
self.save()
def reset_health(self) -> None:
# move all the failures to successes when resetting so we dont lose track of the total count
self.num_uses_succeeded = self.num_uses_failed + self.num_uses_succeeded
self.num_uses_failed = 0
self.save()
@property
def health(self) -> int:
total_uses = max(self.num_uses_failed + self.num_uses_succeeded, 1)
success_pct = (self.num_uses_succeeded / total_uses) * 100
return round(success_pct)
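(This mixin moves into abid_utils.models, per the import change above.) The health score is simply the success percentage with a nonzero denominator; a worked example with hypothetical counters:

num_uses_failed, num_uses_succeeded = 3, 9
total_uses = max(num_uses_failed + num_uses_succeeded, 1)  # 12 (the max() avoids divide-by-zero)
health = round((num_uses_succeeded / total_uses) * 100)    # round(75.0) -> 75
# after reset_health(): num_uses_succeeded=12, num_uses_failed=0, health=100
# (failures are folded into successes so the lifetime use count is preserved)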
class MachineManager(models.Manager):
def current(self) -> 'Machine':
"""Get the current machine that ArchiveBox is running on."""
global CURRENT_MACHINE
if CURRENT_MACHINE:
expires_at = CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
global _CURRENT_MACHINE
if _CURRENT_MACHINE:
expires_at = _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
if timezone.now() < expires_at:
# assume the current machine can't change *while archivebox is actively running on it*
# it's not strictly impossible to swap hardware while code is running,
# but it's rare and unusual so we check only once per week
# (e.g. VMware can live-migrate a VM to a new host while it's running)
return CURRENT_MACHINE
return _CURRENT_MACHINE
else:
CURRENT_MACHINE = None
_CURRENT_MACHINE = None
CURRENT_MACHINE, _created = self.update_or_create(
_CURRENT_MACHINE, _created = self.update_or_create(
guid=get_host_guid(),
defaults={
'hostname': socket.gethostname(),
@ -76,11 +51,14 @@ class MachineManager(models.Manager):
'stats': get_host_stats(),
},
)
CURRENT_MACHINE.save() # populate ABID
_CURRENT_MACHINE.save() # populate ABID
return _CURRENT_MACHINE
return CURRENT_MACHINE
class Machine(ABIDModel, ModelWithHealthStats):
"""Audit log entry for a physical machine that was used to do archiving."""
abid_prefix = 'mxn_'
abid_ts_src = 'self.created_at'
abid_uri_src = 'self.guid'
@ -113,6 +91,7 @@ class Machine(ABIDModel, ModelWithHealthStats):
# STATS COUNTERS
stats = models.JSONField(default=dict, null=False) # e.g. {"cpu_load": [1.25, 2.4, 1.4], "mem_swap_used_pct": 56, ...}
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
# num_uses_succeeded = models.PositiveIntegerField(default=0)
@ -127,18 +106,18 @@ class NetworkInterfaceManager(models.Manager):
def current(self) -> 'NetworkInterface':
"""Get the current network interface for the current machine."""
global CURRENT_INTERFACE
if CURRENT_INTERFACE:
global _CURRENT_INTERFACE
if _CURRENT_INTERFACE:
# assume the current network interface (public IP, DNS servers, etc.) wont change more than once per hour
expires_at = CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
expires_at = _CURRENT_INTERFACE.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
if timezone.now() < expires_at:
return CURRENT_INTERFACE
return _CURRENT_INTERFACE
else:
CURRENT_INTERFACE = None
_CURRENT_INTERFACE = None
machine = Machine.objects.current()
net_info = get_host_network()
CURRENT_INTERFACE, _created = self.update_or_create(
_CURRENT_INTERFACE, _created = self.update_or_create(
machine=machine,
ip_public=net_info.pop('ip_public'),
ip_local=net_info.pop('ip_local'),
@ -146,14 +125,16 @@ class NetworkInterfaceManager(models.Manager):
dns_server=net_info.pop('dns_server'),
defaults=net_info,
)
CURRENT_INTERFACE.save() # populate ABID
_CURRENT_INTERFACE.save() # populate ABID
return CURRENT_INTERFACE
return _CURRENT_INTERFACE
class NetworkInterface(ABIDModel, ModelWithHealthStats):
"""Audit log entry for a physical network interface / internet connection that was used to do archiving."""
abid_prefix = 'ixf_'
abid_ts_src = 'self.machine.created_at'
abid_uri_src = 'self.machine.guid'
@ -183,7 +164,7 @@ class NetworkInterface(ABIDModel, ModelWithHealthStats):
region = models.CharField(max_length=63, default=None, null=False) # e.g. California
country = models.CharField(max_length=63, default=None, null=False) # e.g. United States
# STATS COUNTERS (from ModelWithHealthStats)
# STATS COUNTERS (inherited from ModelWithHealthStats)
# num_uses_failed = models.PositiveIntegerField(default=0)
# num_uses_succeeded = models.PositiveIntegerField(default=0)
@ -202,8 +183,8 @@ class InstalledBinaryManager(models.Manager):
def get_from_db_or_cache(self, binary: BaseBinary) -> 'InstalledBinary':
"""Get or create an InstalledBinary record for a Binary on the local machine"""
global CURRENT_BINARIES
cached_binary = CURRENT_BINARIES.get(binary.id)
global _CURRENT_BINARIES
cached_binary = _CURRENT_BINARIES.get(binary.name)
if cached_binary:
expires_at = cached_binary.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
if timezone.now() < expires_at:
@ -218,7 +199,7 @@ class InstalledBinaryManager(models.Manager):
or binary.sha256 != cached_binary.sha256
)
if is_different_from_cache:
CURRENT_BINARIES.pop(binary.id)
_CURRENT_BINARIES.pop(binary.name)
else:
return cached_binary
else:
@ -229,7 +210,7 @@ class InstalledBinaryManager(models.Manager):
return cached_binary
else:
# cached binary is too old, reload it from scratch
CURRENT_BINARIES.pop(binary.id)
_CURRENT_BINARIES.pop(binary.name)
if not binary.abspath or not binary.version or not binary.sha256:
# if binary was not yet loaded from filesystem, do it now
@ -239,7 +220,7 @@ class InstalledBinaryManager(models.Manager):
assert binary.loaded_binprovider and binary.loaded_abspath and binary.loaded_version and binary.loaded_sha256, f'Failed to load binary {binary.name} abspath, version, and sha256'
CURRENT_BINARIES[binary.id], _created = self.update_or_create(
_CURRENT_BINARIES[binary.name], _created = self.update_or_create(
machine=Machine.objects.current(),
name=binary.name,
binprovider=binary.loaded_binprovider.name,
@ -247,7 +228,7 @@ class InstalledBinaryManager(models.Manager):
abspath=str(binary.loaded_abspath),
sha256=str(binary.loaded_sha256),
)
cached_binary = CURRENT_BINARIES[binary.id]
cached_binary = _CURRENT_BINARIES[binary.name]
cached_binary.save() # populate ABID
# if we get this far, make sure the DB record matches the in-memory cache
@ -282,11 +263,11 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
version = models.CharField(max_length=32, default=None, null=False, blank=True)
sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)
# MUTABLE PROPERTIES
# MUTABLE PROPERTIES (TODO)
# is_pinned = models.BooleanField(default=False) # i.e. should this binary superceede other binaries with the same name on the host?
# is_valid = models.BooleanField(default=True) # i.e. is this binary still available on the host?
# STATS COUNTERS (from ModelWithHealthStats)
# STATS COUNTERS (inherited from ModelWithHealthStats)
# num_uses_failed = models.PositiveIntegerField(default=0)
# num_uses_succeeded = models.PositiveIntegerField(default=0)
@ -310,7 +291,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
if not hasattr(self, 'machine'):
self.machine = Machine.objects.current()
if not self.binprovider:
all_known_binproviders = list(abx.archivebox.use.get_BINPROVIDERS().values())
all_known_binproviders = list(abx.archivebox.reads.get_BINPROVIDERS().values())
binary = BaseBinary(name=self.name, binproviders=all_known_binproviders).load(fresh=True)
self.binprovider = binary.loaded_binprovider.name if binary.loaded_binprovider else None
if not self.abspath:
@ -324,7 +305,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
@cached_property
def BINARY(self) -> BaseBinary:
for binary in abx.archivebox.use.get_BINARIES().values():
for binary in abx.archivebox.reads.get_BINARIES().values():
if binary.name == self.name:
return binary
raise Exception(f'Orphaned InstalledBinary {self.name} {self.binprovider} was found in DB, could not find any plugin that defines it')
@ -332,7 +313,7 @@ class InstalledBinary(ABIDModel, ModelWithHealthStats):
@cached_property
def BINPROVIDER(self) -> BaseBinProvider:
for binprovider in abx.archivebox.use.get_BINPROVIDERS().values():
for binprovider in abx.archivebox.reads.get_BINPROVIDERS().values():
if binprovider.name == self.binprovider:
return binprovider
raise Exception(f'Orphaned InstalledBinary(name={self.name}) was found in DB, could not find any plugin that defines BinProvider(name={self.binprovider})')
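All three managers (Machine, NetworkInterface, InstalledBinary) share the same module-level TTL-cache shape; a condensed sketch of the pattern, with refresh_from_host() standing in for the real update_or_create logic:

from datetime import timedelta
from django.utils import timezone

_CACHE = None
RECHECK_INTERVAL = 60 * 60  # seconds before the cached row is considered stale

def current():
    global _CACHE
    if _CACHE and timezone.now() < _CACHE.modified_at + timedelta(seconds=RECHECK_INTERVAL):
        return _CACHE             # fresh enough: trust the in-memory row
    _CACHE = refresh_from_host()  # stale or empty: re-detect host state and update_or_create
    return _CACHE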

View file

@ -189,14 +189,16 @@ def version(quiet: bool=False,
if quiet or '--version' in sys.argv:
return
from rich.panel import Panel
from rich.console import Console
console = Console()
prnt = console.print
from plugins_auth.ldap.apps import LDAP_CONFIG
from plugins_auth.ldap.config import LDAP_CONFIG
from django.conf import settings
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
from archivebox.config.paths import get_data_locations, get_code_locations
from abx.archivebox.base_binary import BaseBinary, apt, brew, env
@ -221,7 +223,7 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
)
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
@ -241,6 +243,21 @@ def version(quiet: bool=False,
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
# '',
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
'',
' [i][grey53](this is [red]REQUIRED[/red] if you are opening a Github Issue to get help)[/grey53][/i]',
'',
))
prnt(Panel(PANEL_TEXT, expand=False, border_style='grey53', title='[red]:exclamation: No collection [blue]DATA_DIR[/blue] is currently active[/red]', subtitle='Full version info is only available when inside a collection [light_slate_blue]DATA DIR[/light_slate_blue]'))
prnt()
return
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
failures = []
for name, binary in reversed(list(settings.BINARIES.items())):
@ -299,13 +316,13 @@ def version(quiet: bool=False,
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
for name, path in CONSTANTS.CODE_LOCATIONS.items():
for name, path in get_code_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in CONSTANTS.DATA_LOCATIONS.items():
for name, path in get_data_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
from archivebox.misc.checks import check_data_dir_permissions
@ -395,7 +412,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
print(f' √ ./{CONSTANTS.DATABASE_FILE.relative_to(DATA_DIR)}')
# from django.contrib.auth.models import User
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exists():
# if SHELL_CONFIG.IS_TTY and not User.objects.filter(is_superuser=True).exclude(username='system').exists():
# print('{green}[+] Creating admin user account...{reset}'.format(**SHELL_CONFIG.ANSI))
# call_command("createsuperuser", interactive=True)
@ -486,8 +503,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False, out_dir: Pat
html_index.rename(f"{index_name}.html")
CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
from archivebox.config.common import STORAGE_CONFIG
STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
if install:
run_subcommand('install', pwd=out_dir)
@ -1115,14 +1136,14 @@ def install(out_dir: Path=DATA_DIR, binproviders: Optional[List[str]]=None, bina
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exists():
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
@ -1253,7 +1274,7 @@ def schedule(add: bool=False,
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from archivebox.plugins_pkg.pip.binaries import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@ -1399,23 +1420,14 @@ def server(runserver_args: Optional[List[str]]=None,
from django.core.management import call_command
from django.contrib.auth.models import User
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(' > Logging errors to ./logs/errors.log')
if not User.objects.filter(is_superuser=True).exists():
print('[yellow][!] No admin users exist yet, you will not be able to edit links in the UI.[/yellow]')
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
print()
print(' [violet]Hint:[/violet] To create an admin user, run:')
print(' archivebox manage createsuperuser')
# print('[yellow][!] No admin accounts exist, you must create one to be able to log in to the Admin UI![/yellow]')
print('[violet]Hint:[/violet] To create an [bold]admin username & password[/bold] for the [deep_sky_blue3][underline][link=http://{host}:{port}/admin]Admin UI[/link][/underline][/deep_sky_blue3], run:')
print(' [green]archivebox manage createsuperuser[/green]')
print()
if SHELL_CONFIG.DEBUG:
if not reload:
runserver_args.append('--noreload') # '--insecure'
call_command("runserver", *runserver_args)
else:
host = '127.0.0.1'
port = '8000'
@ -1431,14 +1443,20 @@ def server(runserver_args: Optional[List[str]]=None,
except IndexError:
pass
print('[green][+] Starting ArchiveBox webserver...[/green]')
print(f' [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
print(f' [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
print(' > Writing ArchiveBox error log to ./logs/errors.log')
if SHELL_CONFIG.DEBUG:
if not reload:
runserver_args.append('--noreload') # '--insecure'
call_command("runserver", *runserver_args)
else:
from queues.supervisor_util import start_server_workers
print()
start_server_workers(host=host, port=port, daemonize=False)
print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")

View file

@ -5,16 +5,24 @@ import sys
from pathlib import Path
from rich import print
from rich.panel import Panel
# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE other than builtin python libraries
# this file is imported by archivebox/__init__.py
# and any imports here will be imported by EVERYTHING else
# so this file should only be used for pure python checks
# that don't need to import other parts of ArchiveBox
# if a check needs to import other parts of ArchiveBox,
# the imports should be done inside the check function,
# and if it needs to import any django stuff,
# make sure the check is only called after django.setup() has been called
def check_data_folder() -> None:
from archivebox import DATA_DIR, ARCHIVE_DIR
from archivebox.config import CONSTANTS
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
archive_dir_exists = os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()
if not archive_dir_exists:
@ -30,13 +38,27 @@ def check_data_folder() -> None:
raise SystemExit(2)
# Create data dir subdirs
create_and_chown_dir(CONSTANTS.SOURCES_DIR)
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
create_and_chown_dir(CONSTANTS.LOGS_DIR)
# create_and_chown_dir(CONSTANTS.CACHE_DIR)
# Create /tmp and /lib dirs if they don't exist
get_or_create_working_tmp_dir(autofix=True, quiet=False)
get_or_create_working_lib_dir(autofix=True, quiet=False)
# Check data dir permissions, /tmp, and /lib permissions
check_data_dir_permissions()
def check_migrations():
from archivebox import DATA_DIR, CONSTANTS
from archivebox import DATA_DIR
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
if pending_migrations:
if pending_migrations and not is_migrating:
print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
print(f' {DATA_DIR}', file=sys.stderr)
print(file=sys.stderr)
@ -44,13 +66,6 @@ def check_migrations():
print(' archivebox init', file=sys.stderr)
raise SystemExit(3)
CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
CONSTANTS.LOGS_DIR.mkdir(exist_ok=True)
# CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
(CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
(CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
def check_io_encoding():
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
@ -127,3 +142,98 @@ def check_data_dir_permissions():
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
from archivebox.config.common import STORAGE_CONFIG
# Check /tmp dir permissions
check_tmp_dir(STORAGE_CONFIG.TMP_DIR, throw=False, must_exist=True)
# Check /lib dir permissions
check_lib_dir(STORAGE_CONFIG.LIB_DIR, throw=False, must_exist=True)
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
from archivebox.config.paths import assert_dir_can_contain_unix_sockets, dir_is_writable, get_or_create_working_tmp_dir
from archivebox.misc.logging import STDERR
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.common import STORAGE_CONFIG
from archivebox.logging_util import pretty_path
tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
if not must_exist and not os.path.isdir(tmp_dir):
# just check that it's viable based on its length (because the dir may not exist yet, we can't check if it's writable)
return len(f'file://{socket_file}') <= 96
tmp_is_valid = False
try:
tmp_is_valid = dir_is_writable(tmp_dir)
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
STDERR.print()
if throw:
raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
return False
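The 96-character budget exists because UNIX domain socket paths are capped by the kernel (sun_path is ~108 bytes on Linux, 104 on macOS), and the check leaves some headroom. A standalone pre-check mirroring the logic above (helper name hypothetical):

from pathlib import Path

def tmp_dir_socket_ok(tmp_dir: str, limit: int = 96) -> bool:
    # mirrors check_tmp_dir(): the supervisord socket URL must stay short
    socket_file = Path(tmp_dir).absolute().resolve() / 'supervisord.sock'
    return len(f'file://{socket_file}') <= limit

assert tmp_dir_socket_ok('/tmp/archivebox')
assert not tmp_dir_socket_ok('/data/' + 'x' * 100)  # nested too deep for a socket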
def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_exist=True):
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.misc.logging import STDERR
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
from archivebox.config.common import STORAGE_CONFIG
from archivebox.logging_util import pretty_path
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
if not must_exist and not os.path.isdir(lib_dir):
return True
lib_is_valid = False
try:
lib_is_valid = dir_is_writable(lib_dir)
assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
STDERR.print()
if throw:
raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
return False

View file

@ -0,0 +1,30 @@
__package__ = 'archivebox.misc'
from django.core.paginator import Paginator
from django.utils.functional import cached_property
class AccelleratedPaginator(Paginator):
"""
Accelerated Paginator that ignores DISTINCT when counting the total number of rows.
Speeds up SELECT Count(*) on Admin views by >20x.
https://hakibenita.com/optimizing-the-django-admin-paginator
"""
@cached_property
def count(self):
if self.object_list._has_filters(): # type: ignore
# fallback to normal count method on filtered queryset
return super().count
else:
# otherwise count total rows in a separate fast query
return self.object_list.model.objects.count()
# Alternative approach for PostgreSQL: fallback count takes > 200ms
# from django.db import connection, transaction, OperationalError
# with transaction.atomic(), connection.cursor() as cursor:
# cursor.execute('SET LOCAL statement_timeout TO 200;')
# try:
# return super().count
# except OperationalError:
# return 9999999999999
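Hooking it into an admin class is one line via Django's ModelAdmin.paginator attribute; a sketch (the admin class is hypothetical and the paginators module path is assumed):

from django.contrib import admin
from core.models import Snapshot
from archivebox.misc.paginators import AccelleratedPaginator  # module path assumed

class SnapshotAdmin(admin.ModelAdmin):
    paginator = AccelleratedPaginator  # fast COUNT(*) for unfiltered changelists
    show_full_result_count = False     # skip the extra unfiltered total count when filters are active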

View file

@ -49,7 +49,7 @@ if __name__ == '__main__':
prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
prnt(' [link=https://docs.archivebox.io/en/latest/modules.html]https://docs.archivebox.io/en/latest/modules.html[/link]')
prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
prnt()
prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')

View file

View file

@ -0,0 +1,3 @@
from django.contrib import admin
# Register your models here.

View file

@ -0,0 +1,6 @@
from django.apps import AppConfig
class SessionsConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "personas"

View file

@ -0,0 +1,67 @@
from django.db import models
from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField, ModelWithHealthStats
from django.conf import settings
# class Persona(ABIDModel, ModelWithHealthStats):
# """Aka a "SessionType", its a template for a crawler browsing session containing some config."""
# abid_prefix = 'prs_'
# abid_ts_src = 'self.created_at'
# abid_uri_src = 'self.name'
# abid_subtype_src = 'self.created_by'
# abid_rand_src = 'self.id'
# id = models.UUIDField(primary_key=True, default=None, null=False, editable=False, unique=True, verbose_name='ID')
# abid = ABIDField(prefix=abid_prefix)
# created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=None, null=False)
# created_at = AutoDateTimeField(default=None, null=False, db_index=True)
# modified_at = models.DateTimeField(auto_now=True)
# name = models.CharField(max_length=100, blank=False, null=False, editable=False)
# persona_dir = models.FilePathField(path=settings.PERSONAS_DIR, allow_files=False, allow_folders=True, blank=True, null=False, editable=False)
# config = models.JSONField(default=dict)
# # e.g. {
# # USER_AGENT: 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36',
# # COOKIES_TXT_FILE: '/path/to/cookies.txt',
# # CHROME_USER_DATA_DIR: '/path/to/chrome/user/data/dir',
# # CHECK_SSL_VALIDITY: False,
# # SAVE_ARCHIVE_DOT_ORG: True,
# # CHROME_BINARY: 'chromium'
# # ...
# # }
# # domain_allowlist = models.CharField(max_length=1024, blank=True, null=False, default='')
# # domain_denylist = models.CharField(max_length=1024, blank=True, null=False, default='')
# class Meta:
# verbose_name = 'Session Type'
# verbose_name_plural = 'Session Types'
# unique_together = (('created_by', 'name'),)
# def clean(self):
# self.persona_dir = settings.PERSONAS_DIR / self.name
# assert self.persona_dir == settings.PERSONAS_DIR / self.name, f'Persona dir {self.persona_dir} must match settings.PERSONAS_DIR / self.name'
# # make sure config keys all exist in FLAT_CONFIG
# # make sure config values all match expected types
# pass
# def save(self, *args, **kwargs):
# self.full_clean()
# # make sure basic file structure is present in persona_dir:
# # - PERSONAS_DIR / self.name /
# # - chrome_profile/
# # - chrome_downloads/
# # - chrome_extensions/
# # - cookies.txt
# # - auth.json
# # - config.json # json dump of the model
# super().save(*args, **kwargs)

View file

@ -0,0 +1,3 @@
from django.test import TestCase
# Create your tests here.

View file

@ -0,0 +1,3 @@
from django.shortcuts import render
# Create your views here.

View file

@ -0,0 +1,72 @@
__package__ = 'plugins_auth.ldap'
__id__ = 'ldap'
__label__ = 'LDAP'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
__dependencies__ = ['pip']
import abx
@abx.hookimpl
def get_PLUGIN():
return {
__id__: {
'id': __id__,
'package': __package__,
'label': __label__,
'version': __version__,
'author': __author__,
'homepage': __homepage__,
'dependencies': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import LDAP_CONFIG
return {
__id__: LDAP_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import LDAP_BINARY
return {
'ldap': LDAP_BINARY,
}
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
"""
Invoked after LDAP authenticates a user, but before they have a local User account created.
ArchiveBox requires staff/superuser status to view the admin at all, so we must create a user
and set the staff and superuser flags when LDAP authenticates a new person.
"""
from django.conf import settings
if user is None:
return # not authenticated at all
if not user.id and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER:
user.is_superuser = True # authenticated via LDAP, but user is not set up in DB yet
user.is_staff = True
print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
@abx.hookimpl
def ready():
"""
Called at AppConfig.ready() time (settings + models are all loaded)
"""
from django.conf import settings
if settings.CONFIGS.ldap.LDAP_ENABLED:
# tell django-auth-ldap to call our function when a user is authenticated via LDAP
import django_auth_ldap.backend
django_auth_ldap.backend.populate_user.connect(create_superuser_from_ldap_user)

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap'
__package__ = 'plugins_auth.ldap'
import inspect
@ -9,15 +9,14 @@ from pydantic import InstanceOf
from pydantic_pkgr import BinaryOverrides, SemVer
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
from plugins_pkg.pip.apps import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
from .settings import LDAP_CONFIG, get_ldap_lib
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
from .config import get_ldap_lib
###################### Config ##########################
def get_LDAP_LIB_path(paths=()):
LDAP_LIB = get_ldap_lib()[0]
@ -34,10 +33,12 @@ def get_LDAP_LIB_path(paths=()):
return lib_path
return None
def get_LDAP_LIB_version():
LDAP_LIB = get_ldap_lib()[0]
return LDAP_LIB and SemVer(LDAP_LIB.__version__)
class LdapBinary(BaseBinary):
name: str = 'ldap'
description: str = 'LDAP Authentication'
@ -67,17 +68,3 @@ class LdapBinary(BaseBinary):
}
LDAP_BINARY = LdapBinary()
class LdapAuthPlugin(BasePlugin):
app_label: str = 'ldap'
verbose_name: str = 'LDAP Authentication'
hooks: List[InstanceOf[BaseHook]] = [
LDAP_CONFIG,
*([LDAP_BINARY] if LDAP_CONFIG.LDAP_ENABLED else []),
]
PLUGIN = LdapAuthPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox.plugins_auth.ldap'
__package__ = 'plugins_auth.ldap'
import sys

View file

@ -0,0 +1,39 @@
__package__ = 'plugins_extractor.archivedotorg'
__label__ = 'archivedotorg'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://archive.org'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'archivedotorg': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import ARCHIVEDOTORG_CONFIG
return {
'archivedotorg': ARCHIVEDOTORG_CONFIG
}
# @abx.hookimpl
# def get_EXTRACTORS():
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
#
# return {
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
# }
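Every plugin converted in this diff exposes the same hookimpl trio (get_PLUGIN, get_CONFIG, and optionally get_BINARIES/get_EXTRACTORS); a minimal sketch of a new plugin following the shape above (all names hypothetical):

__package__ = 'plugins_extractor.example'
__label__ = 'example'
__version__ = '2024.10.14'

import abx

@abx.hookimpl
def get_PLUGIN():
    return {'example': {'PACKAGE': __package__, 'LABEL': __label__, 'VERSION': __version__}}

@abx.hookimpl
def get_CONFIG():
    from .config import EXAMPLE_CONFIG  # a BaseConfigSet subclass, like ArchivedotorgConfig above
    return {'example': EXAMPLE_CONFIG}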

View file

@ -1,28 +0,0 @@
__package__ = 'archivebox.plugins_extractor.archivedotorg'
from typing import List
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_hook import BaseHook
###################### Config ##########################
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
class ArchivedotorgPlugin(BasePlugin):
app_label: str = 'archivedotorg'
verbose_name: str = 'Archive.org'
hooks: List[BaseHook] = [
ARCHIVEDOTORG_CONFIG
]
PLUGIN = ArchivedotorgPlugin()
DJANGO_APP = PLUGIN.AppConfig

View file

@ -0,0 +1,11 @@
__package__ = 'plugins_extractor.archivedotorg'
from abx.archivebox.base_configset import BaseConfigSet
class ArchivedotorgConfig(BaseConfigSet):
SAVE_ARCHIVE_DOT_ORG: bool = True
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()

View file

@ -0,0 +1,65 @@
__package__ = 'plugins_extractor.chrome'
__label__ = 'chrome'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'chrome': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CHROME_CONFIG
return {
'chrome': CHROME_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CHROME_BINARY
return {
'chrome': CHROME_BINARY,
}
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }
# Hooks Available:
# Events:
# on_crawl_schedule_tick
# on_seed_post_save
# on_crawl_post_save
# on_snapshot_post_save
# on_archiveresult_post_save
# create_root_snapshot_from_seed
# create_archiveresults_pending_from_snapshot
# create_crawl_from_crawlschedule_if_due
# create_crawl_copy_from_template
#
# create_crawl_from_crawlschedule_if_due

View file

@ -0,0 +1,148 @@
__package__ = 'plugins_extractor.chrome'
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import SHELL_CONFIG
from plugins_pkg.puppeteer.binproviders import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
"chromium-browser",
"chromium-browser-beta",
"chromium-browser-unstable",
"chromium-browser-canary",
"chromium-browser-dev",
]
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
CHROME_BINARY_NAMES_LINUX = [
"google-chrome",
"google-chrome-stable",
"google-chrome-beta",
"google-chrome-canary",
"google-chrome-unstable",
"google-chrome-dev",
"chrome"
]
CHROME_BINARY_NAMES_MACOS = [
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
APT_DEPENDENCIES = [
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
abspath = bin_abspath(bin_name, PATH=env.PATH)
if abspath:
return abspath
return None
def create_macos_app_symlink(target: Path, shortcut: Path):
"""
on macOS, some binaries live inside a .app bundle, so we need to
create a tiny bash script instead of a symlink
(so that ../ parent relationships resolve relative to the original .app instead of the callsite dir)
"""
# TODO: should we enforce this? is it useful in any other situation?
# if platform.system().lower() != 'darwin':
# raise Exception(...)
shortcut.unlink(missing_ok=True)
shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
shortcut.chmod(0o777) # make sure it's executable by everyone
###################### Config ##########################
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=None) -> None:
from archivebox.config.common import STORAGE_CONFIG
bin_dir = bin_dir or STORAGE_CONFIG.LIB_DIR / 'bin'
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
            # check & remove the lockfile inside the configured user data dir, not the default path
            lock_file = CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock'
            if os.access(lock_file, os.F_OK):
                lock_file.unlink()
CHROME_BINARY = ChromeBinary()
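# A minimal usage sketch (assuming pydantic_pkgr's usual Binary API; load_or_install(),
# .abspath, and .version are not defined in this diff):
#
#   chrome = CHROME_BINARY.load_or_install()  # tries puppeteer, env, playwright, apt, brew in order
#   print(chrome.abspath, chrome.version)     # e.g. /usr/bin/chromium-browser
#   ChromeBinary.symlink_to_lib(chrome)       # expose it under LIB_DIR/bin for other plugins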


@@ -1,35 +1,18 @@
__package__ = 'archivebox.plugins_extractor.chrome'
__package__ = 'plugins_extractor.chrome'
import os
import sys
import platform
from pathlib import Path
from typing import List, Optional
# Depends on other PyPI/vendor packages:
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
from pydantic import Field, model_validator
from pydantic_pkgr import bin_abspath
# Depends on other Django apps:
from abx.archivebox.base_plugin import BasePlugin
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor
# from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_binary import env
# Depends on Other Plugins:
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from archivebox.misc.logging import STDERR
from archivebox.misc.util import dedupe
@@ -129,33 +112,34 @@ class ChromeConfig(BaseConfigSet):
@model_validator(mode='after')
def validate_use_chrome(self):
if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' Chrome will fail to archive all sites if set to less than ~15 seconds.', file=sys.stderr)
print(' (Setting it to somewhere between 30 and 300 seconds is recommended)', file=sys.stderr)
print(file=sys.stderr)
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
print(file=sys.stderr)
STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
STDERR.print()
STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
STDERR.print()
# if user has specified a user data dir, make sure its valid
if self.CHROME_USER_DATA_DIR and os.access(self.CHROME_USER_DATA_DIR, os.R_OK):
# check to make sure user_data_dir/<profile_name> exists
if not (self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME).is_dir():
print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]', file=sys.stderr)
print(f' {self.CHROME_USER_DATA_DIR}', file=sys.stderr)
print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.', file=sys.stderr)
print(' For more info see:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR', file=sys.stderr)
STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
STDERR.print(' For more info see:')
STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if '/Default' in str(self.CHROME_USER_DATA_DIR):
print(file=sys.stderr)
print(' Try removing /Default from the end e.g.:', file=sys.stderr)
print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]), file=sys.stderr)
STDERR.print()
STDERR.print(' Try removing /Default from the end e.g.:')
STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).split('/Default')[0]))
# hard error is too annoying here, instead just set it to nothing
# raise SystemExit(2)
self.CHROME_USER_DATA_DIR = None
self.update_in_place(CHROME_USER_DATA_DIR=None)
else:
self.CHROME_USER_DATA_DIR = None
if self.CHROME_USER_DATA_DIR is not None:
self.update_in_place(CHROME_USER_DATA_DIR=None)
return self
@@ -206,81 +190,3 @@ class ChromeConfig(BaseConfigSet):
CHROME_CONFIG = ChromeConfig()
class ChromeBinary(BaseBinary):
name: BinName = CHROME_CONFIG.CHROME_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
overrides: BinaryOverrides = {
env.name: {
'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
},
PUPPETEER_BINPROVIDER.name: {
'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
},
PLAYWRIGHT_BINPROVIDER.name: {
'packages': ['chromium'], # playwright install chromium
},
apt.name: {
'packages': APT_DEPENDENCIES,
},
brew.name: {
'packages': ['--cask', 'chromium'],
},
}
@staticmethod
def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
if not (binary.abspath and os.access(binary.abspath, os.F_OK)):
return
bin_dir.mkdir(parents=True, exist_ok=True)
symlink = bin_dir / binary.name
try:
if platform.system().lower() == 'darwin':
# if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
create_macos_app_symlink(binary.abspath, symlink)
else:
# otherwise on linux we can symlink directly to binary executable
symlink.unlink(missing_ok=True)
symlink.symlink_to(binary.abspath)
except Exception as err:
# print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
# not actually needed, we can just run without it
pass
@staticmethod
def chrome_cleanup_lockfile():
"""
Cleans up any state or runtime files that chrome leaves behind when killed by
a timeout or other error
"""
lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
if SHELL_CONFIG.IN_DOCKER and os.access(lock_file, os.F_OK):
lock_file.unlink()
if CHROME_CONFIG.CHROME_USER_DATA_DIR:
if os.access(CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock', os.F_OK):
lock_file.unlink()
CHROME_BINARY = ChromeBinary()
class ChromePlugin(BasePlugin):
app_label: str = 'chrome'
verbose_name: str = 'Chrome Browser'
hooks: List[InstanceOf[BaseHook]] = [
CHROME_CONFIG,
CHROME_BINARY,
]
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig


@@ -0,0 +1,38 @@
__package__ = 'plugins_extractor.curl'
__label__ = 'curl'
__version__ = '2024.10.14'
__author__ = 'ArchiveBox'
__homepage__ = 'https://github.com/curl/curl'
__dependencies__ = []
import abx
@abx.hookimpl
def get_PLUGIN():
return {
'curl': {
'PACKAGE': __package__,
'LABEL': __label__,
'VERSION': __version__,
'AUTHOR': __author__,
'HOMEPAGE': __homepage__,
'DEPENDENCIES': __dependencies__,
}
}
@abx.hookimpl
def get_CONFIG():
from .config import CURL_CONFIG
return {
'curl': CURL_CONFIG
}
@abx.hookimpl
def get_BINARIES():
from .binaries import CURL_BINARY
return {
'curl': CURL_BINARY,
}
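# Hypothetical skeleton (not in this diff) showing how another extractor plugin would
# register itself using the same abx hook pattern:
#
#   __package__ = 'plugins_extractor.example'  # hypothetical plugin package
#
#   import abx
#
#   @abx.hookimpl
#   def get_PLUGIN():
#       return {'example': {'PACKAGE': __package__, 'LABEL': 'example'}}
#
#   @abx.hookimpl
#   def get_CONFIG():
#       from .config import EXAMPLE_CONFIG      # hypothetical config module
#       return {'example': EXAMPLE_CONFIG}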


@@ -1,79 +0,0 @@
__package__ = 'plugins_extractor.curl'
from typing import List, Optional
from pathlib import Path
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
class CurlConfig(BaseConfigSet):
SAVE_TITLE: bool = Field(default=True)
SAVE_HEADERS: bool = Field(default=True)
USE_CURL: bool = Field(default=lambda c:
ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
or FAVICON_CONFIG.SAVE_FAVICON
or c.SAVE_HEADERS
or c.SAVE_TITLE
)
CURL_BINARY: str = Field(default='curl')
CURL_ARGS: List[str] = [
'--silent',
'--location',
'--compressed',
]
CURL_EXTRA_ARGS: List[str] = []
CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
CURL_CONFIG = CurlConfig()
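# Sketch of the computed-default behavior above (assuming BaseConfigSet evaluates callable
# Field defaults against the resolved config instance `c`):
#
#   cfg = CurlConfig(SAVE_TITLE=False, SAVE_HEADERS=False)
#   # cfg.USE_CURL still resolves True while SAVE_ARCHIVE_DOT_ORG or SAVE_FAVICON
#   # is enabled elsewhere, since any curl-backed method keeps curl enabled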
class CurlBinary(BaseBinary):
name: BinName = CURL_CONFIG.CURL_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
CURL_BINARY = CurlBinary()
# class CurlExtractor(BaseExtractor):
# name: ExtractorName = 'curl'
# binary: str = CURL_BINARY.name
# def get_output_path(self, snapshot) -> Path | None:
# curl_index_path = curl_output_path(snapshot.as_link())
# if curl_index_path:
# return Path(curl_index_path)
# return None
# CURL_EXTRACTOR = CurlExtractor()
class CurlPlugin(BasePlugin):
app_label: str = 'curl'
verbose_name: str = 'CURL'
hooks: List[InstanceOf[BaseHook]] = [
CURL_CONFIG,
CURL_BINARY,
# CURL_EXTRACTOR,
]
PLUGIN = CurlPlugin()
DJANGO_APP = PLUGIN.AppConfig
