diff --git a/Dockerfile b/Dockerfile
index dafb8845..24a1a7ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR"
RUN openssl rand -hex 16 > /etc/machine-id \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
ENV IN_DOCKER=True \
- SYSTEM_LIB_DIR=/app/lib \
- SYSTEM_TMP_DIR=/tmp \
+ SYSTEM_LIB_DIR=/usr/share/archivebox \
+ SYSTEM_TMP_DIR=/tmp/archivebox \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
- ## No need to set explicitly, these values will be autodetected by archivebox in docker:
- # WGET_BINARY="wget" \
- # YOUTUBEDL_BINARY="yt-dlp" \
- # CHROME_BINARY="/usr/bin/chromium-browser" \
- # USE_SINGLEFILE=True \
- # SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
- # USE_READABILITY=True \
- # READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
- # USE_MERCURY=True \
- # MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
# Print version for nice docker finish summary
RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
diff --git a/archivebox/__init__.py b/archivebox/__init__.py
index eab371e2..9bff245a 100755
--- a/archivebox/__init__.py
+++ b/archivebox/__init__.py
@@ -13,7 +13,7 @@ __package__ = 'archivebox'
import os
import sys
-import tempfile
+
from pathlib import Path
ASCII_LOGO = """
@@ -25,37 +25,36 @@ ASCII_LOGO = """
╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝
"""
-SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox'
-SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True)
-os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR)
-os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
+# detect ArchiveBox user's UID/GID based on data dir ownership
+from archivebox.config.permissions import drop_privileges # noqa
+drop_privileges()
-# if we are outside a data dir, cd into an ephemeral tmp dir so that
-# we can run version/help without polluting cwd with an index.sqlite3
-if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
- current_dir = Path(os.getcwd()).resolve()
- if not (current_dir / 'index.sqlite3').exists():
- os.chdir(SYSTEM_TMP_DIR)
+from archivebox.misc.checks import check_not_root, check_io_encoding # noqa
+check_not_root()
+check_io_encoding()
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them thorugh INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR))
+os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# print('INSTALLING MONKEY PATCHES')
-from .monkey_patches import * # noqa
+from archivebox.monkey_patches import * # noqa
# print('DONE INSTALLING MONKEY PATCHES')
# print('LOADING VENDORED LIBRARIES')
-from .vendor import load_vendored_libs # noqa
+from archivebox.vendor import load_vendored_libs # noqa
load_vendored_libs()
# print('DONE LOADING VENDORED LIBRARIES')
-from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa
+from archivebox.config.constants import CONSTANTS # noqa
+from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
+from archivebox.config.version import VERSION # noqa
__version__ = VERSION
__author__ = 'Nick Sweeting'
diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py
index b71ceb3d..7076f5d1 100644
--- a/archivebox/api/v1_api.py
+++ b/archivebox/api/v1_api.py
@@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger
# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
-from archivebox.config import SHELL_CONFIG, VERSION
+from archivebox.config import VERSION
+from archivebox.config.version import get_COMMIT_HASH
from api.auth import API_AUTH_METHODS
-COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown'
+COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
html_description=f'''
Welcome to your ArchiveBox server's REST API [v1 ALPHA]
homepage!
diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py
index 9db7bcad..fe78f8c4 100644
--- a/archivebox/api/v1_cli.py
+++ b/archivebox/api/v1_cli.py
@@ -13,7 +13,7 @@ from ..main import (
schedule,
)
from archivebox.misc.util import ansi_to_html
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index 1ac7a9f9..ab532a04 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -1,6 +1,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
+import os
import sys
import argparse
import threading
@@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup':
print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
sys.argv[1] = 'install'
+if '--debug' in sys.argv:
+ os.environ['DEBUG'] = 'True'
+ sys.argv.remove('--debug')
+
# def list_subcommands() -> Dict[str, str]:
# """find and import all valid archivebox_.py files in CLI_DIR"""
@@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = {
'init': 'archivebox_init',
'install': 'archivebox_install',
+ ##############################################
'config': 'archivebox_config',
-
'add': 'archivebox_add',
'remove': 'archivebox_remove',
'update': 'archivebox_update',
@@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = {
'shell': 'archivebox_shell',
'manage': 'archivebox_manage',
- 'oneshot': 'archivebox_oneshot',
+ # 'oneshot': 'archivebox_oneshot',
}
# every imported command module must have these properties in order to be valid
@@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands()
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version') # dont require valid data folder at all
-main_cmds = ('init', 'config', 'setup', 'install') # dont require existing db present
-archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present
+setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet
+archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present
fake_db = ("oneshot",) # use fake in-memory db
-display_first = (*meta_cmds, *main_cmds, *archive_cmds)
+display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting
@@ -157,14 +162,16 @@ def run_subcommand(subcommand: str,
from archivebox.config.legacy import setup_django
# print('DATA_DIR is', DATA_DIR)
- # print('pwd is', os.getcwd())
+ # print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
- setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
+ check_db = cmd_requires_db and not init_pending
- if subcommand not in meta_cmds:
+ setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
+
+ if subcommand in archive_cmds:
if cmd_requires_db:
check_migrations()
diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py
index 8c44b18b..64a9c54c 100644
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -9,7 +9,8 @@ import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring
-from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import ARCHIVING_CONFIG
from ..main import add
from ..parsers import PARSERS
diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py
index f25cc0c4..3c57bf43 100644
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -9,7 +9,8 @@ from pathlib import Path
from typing import Optional, List, IO
from archivebox.misc.util import docstring
-from archivebox.config import DATA_DIR, SERVER_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import SERVER_CONFIG
from ..logging_util import SmartFormatter, reject_stdin
from ..main import server
diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index 7eb3d52c..d70352e0 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -1,27 +1,9 @@
__package__ = 'archivebox.config'
-from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
-from .defaults import (
- SHELL_CONFIG,
- STORAGE_CONFIG,
- GENERAL_CONFIG,
- SERVER_CONFIG,
- ARCHIVING_CONFIG,
- SEARCH_BACKEND_CONFIG,
+from .paths import (
+ PACKAGE_DIR, # noqa
+ DATA_DIR, # noqa
+ ARCHIVE_DIR, # noqa
)
-
-
-__all__ = [
- 'CONSTANTS',
- 'PACKAGE_DIR',
- 'DATA_DIR',
- 'ARCHIVE_DIR',
- 'VERSION',
- 'SHELL_CONFIG',
- 'STORAGE_CONFIG',
- 'GENERAL_CONFIG',
- 'SERVER_CONFIG',
- 'ARCHIVING_CONFIG',
- 'SEARCH_BACKEND_CONFIG',
- 'CONSTANTS_CONFIG',
-]
+from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
+from .version import VERSION # noqa
diff --git a/archivebox/config/apps.py b/archivebox/config/apps.py
index 88c94f8f..e56a9179 100644
--- a/archivebox/config/apps.py
+++ b/archivebox/config/apps.py
@@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
-from .defaults import (
+from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401
diff --git a/archivebox/config/check_for_update.py b/archivebox/config/check_for_update.py
deleted file mode 100644
index a725522a..00000000
--- a/archivebox/config/check_for_update.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# def get_versions_available_on_github(config):
-# """
-# returns a dictionary containing the ArchiveBox GitHub release info for
-# the recommended upgrade version and the currently installed version
-# """
-
-# # we only want to perform the (relatively expensive) check for new versions
-# # when its most relevant, e.g. when the user runs a long-running command
-# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
-# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
-# if subcommand_run_by_user not in long_running_commands:
-# return None
-
-# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
-# response = requests.get(github_releases_api)
-# if response.status_code != 200:
-# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
-# return None
-# all_releases = response.json()
-
-# installed_version = parse_version_string(config['VERSION'])
-
-# # find current version or nearest older version (to link to)
-# current_version = None
-# for idx, release in enumerate(all_releases):
-# release_version = parse_version_string(release['tag_name'])
-# if release_version <= installed_version:
-# current_version = release
-# break
-
-# current_version = current_version or all_releases[-1]
-
-# # recommended version is whatever comes after current_version in the release list
-# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
-# try:
-# recommended_version = all_releases[idx+1]
-# except IndexError:
-# recommended_version = None
-
-# return {'recommended_version': recommended_version, 'current_version': current_version}
-
-# def can_upgrade(config):
-# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
-# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
-# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
-# return recommended_version > current_version
-# return False
diff --git a/archivebox/config/defaults.py b/archivebox/config/common.py
similarity index 63%
rename from archivebox/config/defaults.py
rename to archivebox/config/common.py
index e4146f25..b17fde09 100644
--- a/archivebox/config/defaults.py
+++ b/archivebox/config/common.py
@@ -1,21 +1,21 @@
__package__ = 'archivebox.config'
-import os
import sys
import shutil
from typing import Dict, Optional
-from datetime import datetime
from pathlib import Path
from rich import print
-from pydantic import Field, field_validator, model_validator, computed_field
+from pydantic import Field, field_validator, computed_field
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
-from .constants import CONSTANTS, PACKAGE_DIR
+from .constants import CONSTANTS
+from .version import get_COMMIT_HASH, get_BUILD_TIME
+from .permissions import IN_DOCKER
###################### Config ##########################
@@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet):
USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
- IN_DOCKER: bool = Field(default=False)
+ IN_DOCKER: bool = Field(default=IN_DOCKER)
IN_QEMU: bool = Field(default=False)
-
- USER: str = Field(default=Path('~').expanduser().resolve().name)
- PUID: int = Field(default=os.getuid())
- PGID: int = Field(default=os.getgid())
-
- PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
@@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet):
@computed_field
@property
def COMMIT_HASH(self) -> Optional[str]:
- try:
- git_dir = PACKAGE_DIR / '../.git'
- ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
- commit_hash = git_dir.joinpath(ref).read_text().strip()
- return commit_hash
- except Exception:
- pass
-
- try:
- return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
- except Exception:
- pass
-
- return None
+ return get_COMMIT_HASH()
@computed_field
@property
def BUILD_TIME(self) -> str:
- if self.IN_DOCKER:
- docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
- return docker_build_end_time
-
- src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
- return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
-
-
- @model_validator(mode='after')
- def validate_not_running_as_root(self):
- attempted_command = ' '.join(sys.argv[:3])
- if self.PUID == 0 and attempted_command not in ('setup', 'install'):
- # stderr('[!] ArchiveBox should never be run as root!', color='red')
- # stderr(' For more information, see the security overview documentation:')
- # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
- print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
- print(' For more information, see the security overview documentation:', file=sys.stderr)
- print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
-
- if self.IN_DOCKER:
- print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
- print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
- print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
- print(' or:', file=sys.stderr)
- print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
- print(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
- raise SystemExit(2)
-
- # check python locale
- if self.PYTHON_ENCODING != 'UTF-8':
- print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
- print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
- print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
- print('')
- print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
- print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
- raise SystemExit(2)
-
- return self
+ return get_BUILD_TIME()
SHELL_CONFIG = ShellConfig()
diff --git a/archivebox/config/config_stubs.py b/archivebox/config/config_stubs.py
deleted file mode 100644
index 20c803bb..00000000
--- a/archivebox/config/config_stubs.py
+++ /dev/null
@@ -1,115 +0,0 @@
-from pathlib import Path
-from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List
-from mypy_extensions import TypedDict
-
-from benedict import benedict
-
-SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
-SimpleConfigValueDict = Dict[str, SimpleConfigValue]
-SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
-ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
-
-
-
-class BaseConfig(TypedDict):
- pass
-
-class ConfigDict(BaseConfig, benedict, total=False):
- """
- # Regenerate by pasting this quine into `archivebox shell` 🥚
- from archivebox.config import ConfigDict, CONFIG_DEFAULTS
- print('class ConfigDict(BaseConfig, total=False):')
- print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
- for section, configs in CONFIG_DEFAULTS.items():
- for key, attrs in configs.items():
- Type, default = attrs['type'], attrs['default']
- if default is None:
- print(f' {key}: Optional[{Type.__name__}]')
- else:
- print(f' {key}: {Type.__name__}')
- print()
- """
-
- IS_TTY: bool
- USE_COLOR: bool
- SHOW_PROGRESS: bool
- IN_DOCKER: bool
-
- PACKAGE_DIR: Path
- CONFIG_FILE: Path
- ONLY_NEW: bool
- TIMEOUT: int
- MEDIA_TIMEOUT: int
- OUTPUT_PERMISSIONS: str
- RESTRICT_FILE_NAMES: str
- URL_DENYLIST: str
-
- SECRET_KEY: Optional[str]
- BIND_ADDR: str
- ALLOWED_HOSTS: str
- DEBUG: bool
- PUBLIC_INDEX: bool
- PUBLIC_SNAPSHOTS: bool
- FOOTER_INFO: str
-
- SAVE_TITLE: bool
- SAVE_FAVICON: bool
- SAVE_WGET: bool
- SAVE_WGET_REQUISITES: bool
- SAVE_SINGLEFILE: bool
- SAVE_READABILITY: bool
- SAVE_MERCURY: bool
- SAVE_PDF: bool
- SAVE_SCREENSHOT: bool
- SAVE_DOM: bool
- SAVE_WARC: bool
- SAVE_GIT: bool
- SAVE_MEDIA: bool
- SAVE_ARCHIVE_DOT_ORG: bool
-
- RESOLUTION: str
- GIT_DOMAINS: str
- CHECK_SSL_VALIDITY: bool
- CURL_USER_AGENT: str
- WGET_USER_AGENT: str
- CHROME_USER_AGENT: str
- COOKIES_FILE: Union[str, Path, None]
- CHROME_USER_DATA_DIR: Union[str, Path, None]
- CHROME_TIMEOUT: int
- CHROME_HEADLESS: bool
- CHROME_SANDBOX: bool
-
- USE_CURL: bool
- USE_WGET: bool
- USE_SINGLEFILE: bool
- USE_READABILITY: bool
- USE_MERCURY: bool
- USE_GIT: bool
- USE_CHROME: bool
- USE_YOUTUBEDL: bool
- CURL_BINARY: str
- GIT_BINARY: str
- WGET_BINARY: str
- SINGLEFILE_BINARY: str
- READABILITY_BINARY: str
- MERCURY_BINARY: str
- YOUTUBEDL_BINARY: str
- CHROME_BINARY: Optional[str]
-
- YOUTUBEDL_ARGS: List[str]
- WGET_ARGS: List[str]
- CURL_ARGS: List[str]
- GIT_ARGS: List[str]
- TAG_SEPARATOR_PATTERN: str
-
-
-ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
-ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
-
-ConfigDefault = TypedDict('ConfigDefault', {
- 'default': ConfigDefaultValue,
- 'type': Optional[Type],
- 'aliases': Optional[Tuple[str, ...]],
-}, total=False)
-
-ConfigDefaultDict = Dict[str, ConfigDefault]
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index 25082fa8..5e646e58 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -1,118 +1,115 @@
__package__ = 'archivebox.config'
-
import os
import re
import platform
-import tempfile
from typing import Dict
from pathlib import Path
-import importlib.metadata
from collections.abc import Mapping
from benedict import benedict
from ..misc.logging import DEFAULT_CLI_COLORS
+from .paths import (
+ PACKAGE_DIR,
+ DATA_DIR,
+ ARCHIVE_DIR,
+ get_collection_id,
+ get_LIB_DIR,
+ get_TMP_DIR,
+)
+from .permissions import (
+ IS_ROOT,
+ IN_DOCKER,
+ RUNNING_AS_UID,
+ RUNNING_AS_GID,
+ DEFAULT_PUID,
+ DEFAULT_PGID,
+ ARCHIVEBOX_USER,
+ ARCHIVEBOX_GROUP,
+)
+from .version import detect_installed_version
+
###################### Config ##########################
-PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
-DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
-ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
-
-def _detect_installed_version(PACKAGE_DIR: Path):
- """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
- try:
- # if in production install, use pip-installed package metadata
- return importlib.metadata.version(__package__ or 'archivebox').strip()
- except importlib.metadata.PackageNotFoundError:
- pass
-
- try:
- # if in dev Git repo dir, use pyproject.toml file
- pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
- for line in pyproject_config:
- if line.startswith('version = '):
- return line.split(' = ', 1)[-1].strip('"').strip()
- except FileNotFoundError:
- # building docs, pyproject.toml is not available
- pass
-
- # raise Exception('Failed to detect installed archivebox version!')
- return 'dev'
-
-VERSION: str = _detect_installed_version(PACKAGE_DIR)
-
-
-
class ConstantsDict(Mapping):
- IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
- OS = platform.system().lower() # darwin, linux, etc.
- ARCH = platform.machine().lower() # arm64, x86_64, etc.
- LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')
-
- PACKAGE_DIR: Path = PACKAGE_DIR # archivebox source code dir
- DATA_DIR: Path = DATA_DIR # archivebox user data dir
- ARCHIVE_DIR: Path = ARCHIVE_DIR # archivebox snapshot data dir
- VERSION: str = VERSION
+ PACKAGE_DIR: Path = PACKAGE_DIR
+ DATA_DIR: Path = DATA_DIR
+ ARCHIVE_DIR: Path = ARCHIVE_DIR
+ COLLECTION_ID: str = get_collection_id(DATA_DIR)
+ # Host system
+ VERSION: str = detect_installed_version(PACKAGE_DIR)
+ OS: str = platform.system().lower() # darwin, linux, etc.
+ ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
+ IN_DOCKER: bool = IN_DOCKER
+
+ # Permissions
+ IS_ROOT: bool = IS_ROOT
+ ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
+ ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
+ RUNNING_AS_UID: int = RUNNING_AS_UID
+ RUNNING_AS_GID: int = RUNNING_AS_GID
+ DEFAULT_PUID: int = DEFAULT_PUID
+ DEFAULT_PGID: int = DEFAULT_PGID
+
+ # Source code dirs
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = 'templates'
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
- STATIC_DIR: Path = TEMPLATES_DIR / 'static'
+ STATIC_DIR_NAME: str = 'static'
+ STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
+
+ # Data dirs
+ ARCHIVE_DIR_NAME: str = 'archive'
+ SOURCES_DIR_NAME: str = 'sources'
+ PERSONAS_DIR_NAME: str = 'personas'
+ CRONTABS_DIR_NAME: str = 'crontabs'
+ CACHE_DIR_NAME: str = 'cache'
+ LOGS_DIR_NAME: str = 'logs'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
-
- ARCHIVE_DIR_NAME: str = 'archive'
- SOURCES_DIR_NAME: str = 'sources'
- PERSONAS_DIR_NAME: str = 'personas'
- CRONTABS_DIR_NAME: str = 'crontabs'
- CACHE_DIR_NAME: str = 'cache'
- LOGS_DIR_NAME: str = 'logs'
- LIB_DIR_NAME: str = 'lib'
- TMP_DIR_NAME: str = 'tmp'
-
- SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
- # DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16] # cant be used because of socket path length restrictions break too often if data dir is in some deep subdir: ocket.error reported AF_UNIX path too long
- SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
- DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE
-
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
- CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
- LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR # e.g. /app/lib or ./data/lib/arm64-darwin-docker
- TMP_DIR: Path = SYSTEM_TMP_DIR
+ CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
+ # Data dir files
+ CONFIG_FILENAME: str = 'ArchiveBox.conf'
+ SQL_INDEX_FILENAME: str = 'index.sqlite3'
+ QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
+ CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
+ DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
+ QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
+
+ JSON_INDEX_FILENAME: str = 'index.json'
+ HTML_INDEX_FILENAME: str = 'index.html'
+ ROBOTS_TXT_FILENAME: str = 'robots.txt'
+ FAVICON_FILENAME: str = 'favicon.ico'
+
+ # Runtime dirs
+ TMP_DIR_NAME: str = 'tmp'
+ TMP_DIR: Path = get_TMP_DIR()
+ LIB_DIR_NAME: str = 'lib'
+ LIB_DIR: Path = get_LIB_DIR()
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
- CONFIG_FILENAME: str = 'ArchiveBox.conf'
- SQL_INDEX_FILENAME: str = 'index.sqlite3'
- QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
+ # Config constants
+ TIMEZONE: str = 'UTC'
+ DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
+ DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
- CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
- DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
- QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
-
- JSON_INDEX_FILENAME: str = 'index.json'
- HTML_INDEX_FILENAME: str = 'index.html'
- ROBOTS_TXT_FILENAME: str = 'robots.txt'
- FAVICON_FILENAME: str = 'favicon.ico'
-
- TIMEZONE: str = 'UTC'
- DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
- DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
-
- ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
+ ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
# 99.999% of the time, URLs ending in these extensions are static files
@@ -136,17 +133,6 @@ class ConstantsDict(Mapping):
# html, htm, shtml, xhtml, xml, aspx, php, cgi
))
- INGORED_PATHS: frozenset[str] = frozenset((
- ".git",
- ".svn",
- ".DS_Store",
- ".gitignore",
- "lost+found",
- ".DS_Store",
- ".env",
- "Dockerfile",
- ".ArchiveBox.conf.bak",
- ))
PIP_RELATED_NAMES: frozenset[str] = frozenset((
".venv",
"venv",
@@ -160,7 +146,15 @@ class ConstantsDict(Mapping):
"yarn.lock",
))
- DATA_DIR_NAMES: frozenset[str] = frozenset((
+ # When initializing archivebox in a new directory, we check to make sure the dir is
+ # actually empty so that we don't clobber someone's home directory or desktop by accident.
+ # These files are exceptions to the is_empty check when we're trying to init a new dir,
+ # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
+ ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
+ *PIP_RELATED_NAMES,
+ *NPM_RELATED_NAMES,
+
+ ### Dirs:
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -171,9 +165,12 @@ class ConstantsDict(Mapping):
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
- ))
- DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
- DATA_FILE_NAMES: frozenset[str] = frozenset((
+ "static", # created by old static exports str:
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
-DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
+DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
@@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
def load_config_val(key: str,
- default: ConfigDefaultValue=None,
+ default: Any=None,
type: Optional[Type]=None,
aliases: Optional[Tuple[str, ...]]=None,
config: Optional[benedict]=None,
env_vars: Optional[os._Environ]=None,
- config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
+ config_file_vars: Optional[Dict[str, str]]=None) -> Any:
"""parse bool, int, and str key=value pairs from env"""
assert isinstance(config, dict)
@@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
-def load_config(defaults: ConfigDefaultDict,
+def load_config(defaults: Dict[str, Any],
config: Optional[benedict]=None,
out_dir: Optional[str]=None,
env_vars: Optional[os._Environ]=None,
@@ -505,7 +498,7 @@ def load_all_config():
# add all final config values in CONFIG to globals in this file
CONFIG: benedict = load_all_config()
globals().update(CONFIG)
-# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
+
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
@@ -521,8 +514,8 @@ globals().update(CONFIG)
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})' # noqa: F821
-os.environ["TZ"] = TIMEZONE # noqa: F821
+assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})' # noqa: F821
+os.environ["TZ"] = CONSTANTS.TIMEZONE # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
########################### Config Validity Checkers ###########################
@@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS:
os.environ['TERM'] = 'dumb'
# recreate rich console obj based on new config values
-CONSOLE = Console()
+STDOUT = CONSOLE = Console()
+STDERR = Console(stderr=True)
from ..misc import logging
logging.CONSOLE = CONSOLE
@@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
-def bump_startup_progress_bar():
+def bump_startup_progress_bar(advance=1):
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
- INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
+ INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore
def setup_django_minimal():
@@ -559,6 +553,8 @@ DJANGO_SET_UP = False
def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
+ from rich.panel import Panel
+
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
global DJANGO_SET_UP
@@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
- with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
+ with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or CONSTANTS.DATA_DIR
@@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
- django.setup()
+ try:
+ django.setup()
+ except Exception as e:
+ bump_startup_progress_bar(advance=1000)
+ STDERR.print()
+ STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red'))
+ STDERR.print()
+ return
bump_startup_progress_bar()
@@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
+ # make sure the data dir is owned by a non-root user
+ if CONSTANTS.DATA_DIR.stat().st_uid == 0:
+ STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
+ STDERR.print(f' {CONSTANTS.DATA_DIR}')
+ STDERR.print()
+ STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
+ STDERR.print(' cd path/to/your/archive/data')
+ STDERR.print(' archivebox [command]')
+ STDERR.print()
+ raise SystemExit(9)
+
# Create cache table in DB if needed
try:
from django.core.cache import cache
diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py
new file mode 100644
index 00000000..a0bc69a9
--- /dev/null
+++ b/archivebox/config/paths.py
@@ -0,0 +1,152 @@
+__package__ = 'archivebox.config'
+
+import os
+import tempfile
+import hashlib
+from pathlib import Path
+
+from functools import cache
+from platformdirs import PlatformDirs
+
+from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
+
+#############################################################################################
+
+PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
+DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
+ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
+
+#############################################################################################
+
+@cache
+def get_collection_id(DATA_DIR=DATA_DIR):
+ """Get a short, stable, unique ID for the current collection"""
+ collection_id_file = DATA_DIR / '.collection_id'
+
+ try:
+ return collection_id_file.read_text().strip()
+ except (OSError, FileNotFoundError, PermissionError):
+ pass
+
+ hash_key = str(DATA_DIR.resolve()).encode()
+ collection_id = hashlib.sha256(hash_key).hexdigest()[:8]
+ try:
+ collection_id_file.write_text(collection_id)
+ except (OSError, FileNotFoundError, PermissionError):
+ pass
+ return collection_id
+
+
+def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
+ """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
+ current_uid, current_gid = os.geteuid(), os.getegid()
+ uid, gid = uid or current_uid, gid or current_gid
+
+ test_file = dir_path / '.permissions_test'
+ try:
+ with SudoPermission(uid=uid, fallback=fallback):
+ test_file.exists()
+ test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
+ test_file.unlink()
+ return True
+ except (IOError, OSError, PermissionError):
+ pass
+
+ return False
+
+
+
+@cache
+def get_LIB_DIR():
+ """
+ - should be shared with other collections on the same host
+ - must be scoped by CPU architecture, OS family, and archivebox version
+ - should not be shared with other hosts/archivebox versions
+ - must be writable by any archivebox user
+ - should be persistent across reboots
+    - can be on a docker bind mount but probably shouldn't be
+    - ok to have a long path (doesn't contain SOCKETS)
+ """
+ from .version import detect_installed_version
+
+ HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
+
+ if 'SYSTEM_LIB_DIR' in os.environ:
+ lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
+ else:
+ with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
+ lib_dir = HOST_DIRS.site_data_path
+
+ # Docker: /usr/local/share/archivebox/0.8.5
+ # Ubuntu: /usr/local/share/archivebox/0.8.5
+ # macOS: /Library/Application Support/archivebox
+ try:
+ with SudoPermission(uid=0, fallback=True):
+ lib_dir.mkdir(parents=True, exist_ok=True)
+ except PermissionError:
+        # our user cannot create the system-wide dir, fall back to a per-user data dir
+ lib_dir = HOST_DIRS.user_data_path
+ lib_dir.mkdir(parents=True, exist_ok=True)
+
+ if not dir_is_writable(lib_dir):
+ if IS_ROOT:
+ # make sure lib dir is owned by the archivebox user, not root
+ with SudoPermission(uid=0):
+ os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
+ else:
+ raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
+
+ return lib_dir
+
+@cache
+def get_TMP_DIR():
+ """
+ - must NOT be inside DATA_DIR / inside a docker volume bind mount
+ - must NOT have a long PATH (UNIX socket path length restrictions)
+ - must NOT be shared with other collections/hosts
+ - must be writable by archivebox user & root
+ - must be cleared on every boot / not persisted
+ - must be cleared on every archivebox version upgrade
+ """
+ from .version import detect_installed_version
+
+ HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
+
+ # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
+ # print('RUNNING AS:', self.PUID, self.PGID)
+
+ if 'SYSTEM_TMP_DIR' in os.environ:
+ run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
+ with SudoPermission(uid=0, fallback=True):
+ run_dir.mkdir(parents=True, exist_ok=True)
+ if not dir_is_writable(run_dir):
+ if IS_ROOT:
+ with SudoPermission(uid=0, fallback=False):
+ os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
+ else:
+ raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
+ assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
+ return run_dir
+
+ run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
+ try:
+ assert len(str(run_dir)) + len('/supervisord.sock') < 95
+ except AssertionError:
+ run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
+ assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
+
+ with SudoPermission(uid=0, fallback=True):
+ run_dir.mkdir(parents=True, exist_ok=True)
+
+ if not dir_is_writable(run_dir):
+ if IS_ROOT:
+ with SudoPermission(uid=0):
+ os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
+ else:
+ raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
+
+ # Docker: /tmp/archivebox/0.8.5/abc324235
+ # Ubuntu: /tmp/archivebox/0.8.5/abc324235
+ # macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235
+ return run_dir
+
diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py
new file mode 100644
index 00000000..46e9c3f5
--- /dev/null
+++ b/archivebox/config/permissions.py
@@ -0,0 +1,70 @@
+__package__ = 'archivebox.config'
+
+import os
+from pathlib import Path
+from contextlib import contextmanager
+
+#############################################################################################
+
+DATA_DIR = Path(os.getcwd())
+
+DATA_DIR_STAT = Path(DATA_DIR).stat()
+DATA_DIR_UID = DATA_DIR_STAT.st_uid
+DATA_DIR_GID = DATA_DIR_STAT.st_gid
+DEFAULT_PUID = 911
+DEFAULT_PGID = 911
+RUNNING_AS_UID = os.getuid()
+RUNNING_AS_GID = os.getgid()
+EUID = os.geteuid()
+EGID = os.getegid()
+USER: str = Path('~').expanduser().resolve().name
+
+IS_ROOT = RUNNING_AS_UID == 0
+IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
+
+os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID))
+os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID))
+
+ARCHIVEBOX_USER = int(os.environ['PUID'])
+ARCHIVEBOX_GROUP = int(os.environ['PGID'])
+
+#############################################################################################
+
+def drop_privileges():
+ """If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)"""
+
+ # always run archivebox as the user that owns the data dir, never as root
+ if os.getuid() == 0:
+ # drop permissions to the user that owns the data dir / provided PUID
+ if os.geteuid() != ARCHIVEBOX_USER:
+ os.seteuid(ARCHIVEBOX_USER)
+    # if we need sudo (e.g. for installing dependencies) code should use the SudoPermission() context manager to regain root
+
+
+@contextmanager
+def SudoPermission(uid=0, fallback=False):
+ """Attempt to run code with sudo permissions for a given user (or root)"""
+
+ if os.geteuid() == uid:
+ # no need to change effective UID, we are already that user
+ yield
+ return
+
+ try:
+ # change our effective UID to the given UID
+ os.seteuid(uid)
+ except PermissionError as err:
+ if not fallback:
+ raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
+ try:
+ # yield back to the caller so they can run code inside context as root
+ yield
+ finally:
+ # then set effective UID back to DATA_DIR owner
+ DATA_DIR_OWNER = DATA_DIR.stat().st_uid
+ try:
+ os.seteuid(DATA_DIR_OWNER)
+ except PermissionError as err:
+ if not fallback:
+ raise PermissionError(f'Failed to revert uid={uid} back to {DATA_DIR_OWNER} after running code with sudo') from err
+
diff --git a/archivebox/config/version.py b/archivebox/config/version.py
new file mode 100644
index 00000000..26df4592
--- /dev/null
+++ b/archivebox/config/version.py
@@ -0,0 +1,121 @@
+__package__ = 'archivebox.config'
+
+import os
+import importlib.metadata
+
+from pathlib import Path
+from functools import cache
+from datetime import datetime
+from typing import Optional
+
+#############################################################################################
+
+IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
+
+PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
+DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
+ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
+
+#############################################################################################
+
+
+@cache
+def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
+ """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
+ try:
+ # if in production install, use pip-installed package metadata
+ return importlib.metadata.version('archivebox').strip()
+ except importlib.metadata.PackageNotFoundError:
+ pass
+
+ try:
+ # if in dev Git repo dir, use pyproject.toml file
+ pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
+ for line in pyproject_config:
+ if line.startswith('version = '):
+ return line.split(' = ', 1)[-1].strip('"').strip()
+ except FileNotFoundError:
+ # building docs, pyproject.toml is not available
+ pass
+
+ # raise Exception('Failed to detect installed archivebox version!')
+ return 'dev'
+
+
+@cache
+def get_COMMIT_HASH() -> Optional[str]:
+ try:
+ git_dir = PACKAGE_DIR / '../.git'
+ ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
+ commit_hash = git_dir.joinpath(ref).read_text().strip()
+ return commit_hash
+ except Exception:
+ pass
+
+ try:
+ return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+ except Exception:
+ pass
+
+ return None
+
+@cache
+def get_BUILD_TIME() -> str:
+ if IN_DOCKER:
+ docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
+ return docker_build_end_time
+
+ src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
+ return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
+
+
+# def get_versions_available_on_github(config):
+# """
+# returns a dictionary containing the ArchiveBox GitHub release info for
+# the recommended upgrade version and the currently installed version
+# """
+
+# # we only want to perform the (relatively expensive) check for new versions
+# # when its most relevant, e.g. when the user runs a long-running command
+# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
+# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
+# if subcommand_run_by_user not in long_running_commands:
+# return None
+
+# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
+# response = requests.get(github_releases_api)
+# if response.status_code != 200:
+# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
+# return None
+# all_releases = response.json()
+
+# installed_version = parse_version_string(config['VERSION'])
+
+# # find current version or nearest older version (to link to)
+# current_version = None
+# for idx, release in enumerate(all_releases):
+# release_version = parse_version_string(release['tag_name'])
+# if release_version <= installed_version:
+# current_version = release
+# break
+
+# current_version = current_version or all_releases[-1]
+
+# # recommended version is whatever comes after current_version in the release list
+# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
+# try:
+# recommended_version = all_releases[idx+1]
+# except IndexError:
+# recommended_version = None
+
+# return {'recommended_version': recommended_version, 'current_version': current_version}
+
+# def can_upgrade(config):
+# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
+# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
+# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
+# return recommended_version > current_version
+# return False
+
+
+VERSION: str = detect_installed_version()
diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py
index 181d67f0..1cbe540e 100644
--- a/archivebox/core/middleware.py
+++ b/archivebox/core/middleware.py
@@ -5,7 +5,7 @@ from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.core.exceptions import ImproperlyConfigured
-from archivebox.config import SERVER_CONFIG
+from archivebox.config.common import SERVER_CONFIG
def detect_timezone(request, activate: bool=True):
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 77806188..e374ff4f 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -13,7 +13,8 @@ import abx.archivebox
import abx.archivebox.use
import abx.django.use
-from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG # noqa
+from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 7dbbf110..f3d7ef93 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -27,7 +27,8 @@ from core.admin import result_url
from queues.tasks import bg_add
-from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
+from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from .serve_static import serve_static_with_byterange_support
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
index 423f1601..16536d1f 100644
--- a/archivebox/extractors/htmltotext.py
+++ b/archivebox/extractors/htmltotext.py
@@ -5,7 +5,8 @@ import io
from pathlib import Path
from typing import Optional
-from archivebox.config import VERSION, ARCHIVING_CONFIG
+from archivebox.config import VERSION
+from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.legacy import SAVE_HTMLTOTEXT
from archivebox.misc.system import atomic_write
from archivebox.misc.util import enforce_types, is_static_file
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index bff099cd..e2000a68 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -12,9 +12,11 @@ from urllib.parse import urlparse
from django.db.models import QuerySet, Q
-from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
-from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
from archivebox.misc.logging import stderr
+from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
+
+from archivebox.config import DATA_DIR, CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
from ..logging_util import (
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index 307add0d..b46e9911 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -16,7 +16,9 @@ from archivebox.misc.util import (
htmlencode,
urldecode,
)
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
+from archivebox.config import CONSTANTS, DATA_DIR, VERSION
+from archivebox.config.common import SERVER_CONFIG
+from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from .schema import Link
@@ -56,7 +58,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
return render_django_template(template, {
'version': VERSION,
- 'git_sha': SHELL_CONFIG.COMMIT_HASH or VERSION,
+ 'git_sha': get_COMMIT_HASH() or VERSION,
'num_links': str(len(links)),
'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 945f73d1..d666b4b1 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -8,7 +8,8 @@ from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional, Iterator, Any, Union
-from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL_CONFIG
+from archivebox.config import VERSION, DATA_DIR, CONSTANTS
+from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
from .schema import Link
from archivebox.misc.system import atomic_write
diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py
index 892f11b7..cb07d546 100644
--- a/archivebox/index/sql.py
+++ b/archivebox/index/sql.py
@@ -9,7 +9,8 @@ from django.db.models import QuerySet
from django.db import transaction
from archivebox.misc.util import enforce_types, parse_date
-from archivebox.config import DATA_DIR, GENERAL_CONFIG
+from archivebox.config import DATA_DIR
+from archivebox.config.common import GENERAL_CONFIG
from .schema import Link
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index d0de496d..b2ef9a8a 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -22,7 +22,8 @@ from rich.panel import Panel
from rich_argparse import RichHelpFormatter
from django.core.management.base import DjangoHelpFormatter
-from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
+from archivebox.config import CONSTANTS, DATA_DIR, VERSION
+from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr
diff --git a/archivebox/main.py b/archivebox/main.py
index 8a8fc59a..e1779b8b 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -14,13 +14,15 @@ from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from django.utils import timezone
-from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR, SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
+from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
+from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
+from archivebox.config.permissions import SudoPermission, IN_DOCKER
from .cli import (
CLI_SUBCOMMANDS,
run_subcommand,
display_first,
meta_cmds,
- main_cmds,
+ setup_cmds,
archive_cmds,
)
from .parsers import (
@@ -101,7 +103,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
for cmd, func in all_subcommands.items()
- if cmd in main_cmds
+ if cmd in setup_cmds
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
for cmd, func in all_subcommands.items()
@@ -119,10 +121,10 @@ def help(out_dir: Path=DATA_DIR) -> None:
[grey53]# using Docker:[/grey53]
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
-''' if SHELL_CONFIG.IN_DOCKER else ''
- DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if SHELL_CONFIG.IN_DOCKER else ''
- DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if SHELL_CONFIG.IN_DOCKER else ''
- DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if SHELL_CONFIG.IN_DOCKER else ''
+''' if IN_DOCKER else ''
+ DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
+ DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
+ DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
print(f'''{DOCKER_USAGE}
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
@@ -158,7 +160,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
else:
DATA_SETUP_HELP = '\n'
- if SHELL_CONFIG.IN_DOCKER:
+ if IN_DOCKER:
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
@@ -190,6 +192,8 @@ def version(quiet: bool=False,
from plugins_auth.ldap.apps import LDAP_CONFIG
from django.conf import settings
+ from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
+ from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
@@ -198,13 +202,14 @@ def version(quiet: bool=False,
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
+ COMMIT_HASH = get_COMMIT_HASH()
prnt(
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
- f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
- f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
+ f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
+ f'BUILD_TIME={get_BUILD_TIME()}',
)
prnt(
- f'IN_DOCKER={SHELL_CONFIG.IN_DOCKER}',
+ f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
@@ -212,11 +217,13 @@ def version(quiet: bool=False,
f'PYTHON={sys.implementation.name.title()}',
)
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
+ DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
+ f'EUID={os.geteuid()} UID={RUNNING_AS_UID} PUID={ARCHIVEBOX_USER} FS_UID={DATA_DIR_STAT.st_uid}',
+ f'EGID={os.getegid()} GID={RUNNING_AS_GID} PGID={ARCHIVEBOX_GROUP} FS_GID={DATA_DIR_STAT.st_gid}',
+ f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
- f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
- f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
@@ -261,8 +268,36 @@ def version(quiet: bool=False,
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
-
+
prnt()
+
+ from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
+
+ data_dir_stat = Path(DATA_DIR).stat()
+ data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
+ data_owned_by_root = data_dir_uid == 0 or data_dir_gid == 0
+
+ data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
+ data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) and not IS_ROOT
+ data_not_writable = not (os.access(DATA_DIR, os.W_OK) and os.access(CONSTANTS.LIB_DIR, os.W_OK) and os.access(CONSTANTS.TMP_DIR, os.W_OK))
+ if data_owned_by_root:
+ prnt('[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], ArchiveBox will refuse to run![/yellow]')
+ elif data_owner_doesnt_match or data_not_writable:
+ prnt(f'[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
+ else:
+ prnt(f':information: [blue]DATA_DIR[/blue] is currently owned by [blue]{data_dir_uid}:{data_dir_gid}[/blue] (PUID:PGID)')
+
+ if data_owned_by_root or data_owner_doesnt_match or data_owned_by_default_user or data_not_writable:
+ prnt(f'[violet]Hint:[/violet] If you encounter permissions errors, change [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to match the user that will run ArchiveBox, e.g.:')
+ prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
+ prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}')
+ prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}')
+ prnt()
+ prnt('[blue]More info:[/blue]')
+ prnt(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
+ prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
+ prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
+ prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
@enforce_types
@@ -948,23 +983,56 @@ def list_folders(links: List[Link],
@enforce_types
def install(out_dir: Path=DATA_DIR) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
+
+ # if running as root:
+ # - run init to create index + lib dir
+ # - chown -R 911 DATA_DIR
+ # - install all binaries as root
+ # - chown -R 911 LIB_DIR
+ # else:
+ # - run init to create index + lib dir as current user
+ # - install all binaries as current user
+ # - recommend user re-run with sudo if any deps need to be installed as root
from rich import print
from django.conf import settings
+
+ from archivebox import CONSTANTS
+ from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
if not ARCHIVE_DIR.exists():
- run_subcommand('init', stdin=None, pwd=out_dir)
-
- stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
+ run_subcommand('init', stdin=None, pwd=out_dir) # must init full index because we need a db to store InstalledBinary entries in
+ print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
+
+    # we never want the data dir to be owned by root; detect the existing owner of DATA_DIR to guess the desired non-root UID
+ if IS_ROOT:
+ # if we have sudo/root permissions, take advantage of them just while installing dependencies
+ print()
+ print('[yellow]:warning: Using [red]root[/red] privileges only to install dependencies that need it, all other operations should be done as a [blue]non-root[/blue] user.[/yellow]')
+ print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
+ print()
+
for binary in reversed(list(settings.BINARIES.values())):
providers = ' [grey53]or[/grey53] '.join(provider.name for provider in binary.binproviders_supported)
print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
try:
print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+ if IS_ROOT:
+ with SudoPermission(uid=0):
+ os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
except Exception as e:
- print(f'[X] Failed to install {binary.name}: {e}')
-
+ if IS_ROOT:
+ print(f'[yellow]:warning: Retrying {binary.name} installation with [red]sudo[/red]...[/yellow]')
+ with SudoPermission(uid=0):
+ try:
+ print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+ os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
+ except Exception as e:
+ print(f'[red]:cross_mark: Failed to install {binary.name} as root: {e}[/red]')
+ else:
+ print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
+
from django.contrib.auth import get_user_model
User = get_user_model()
@@ -974,12 +1042,13 @@ def install(out_dir: Path=DATA_DIR) -> None:
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
- stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
+ print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version'], capture_output=False, cwd=out_dir)
+
# backwards-compatibility:
setup = install
@@ -1100,6 +1169,7 @@ def schedule(add: bool=False,
check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+ from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -1156,7 +1226,7 @@ def schedule(add: bool=False,
existing_jobs = list(cron.find_comment(CRON_COMMENT))
print()
- print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
+ print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not quiet:
stderr()
@@ -1170,7 +1240,7 @@ def schedule(add: bool=False,
if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs))
else:
- stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
+ stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(0)
@@ -1294,7 +1364,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
check_data_folder()
from django.core.management import execute_from_command_line
- if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
+ if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index 5e324cbb..bee8dcb2 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -1,37 +1,44 @@
__package__ = 'archivebox.misc'
-from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
+import sys
+from rich import print
-from .logging import stderr
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# this file is imported by archivebox/__init__.py
+# and any imports here will be imported by EVERYTHING else
+# so this file should only be used for pure python checks
+# that don't need to import other parts of ArchiveBox
def check_data_folder() -> None:
-
+ from archivebox import DATA_DIR, ARCHIVE_DIR
+
archive_dir_exists = ARCHIVE_DIR.exists()
if not archive_dir_exists:
- stderr('[X] No archivebox index found in the current directory.', color='red')
- stderr(f' {DATA_DIR}', color='lightyellow')
- stderr()
- stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**SHELL_CONFIG.ANSI))
- stderr(' cd path/to/your/archive/folder')
- stderr(' archivebox [command]')
- stderr()
- stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**SHELL_CONFIG.ANSI))
- stderr(' archivebox init')
+ print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
+ print(f' {DATA_DIR}', file=sys.stderr)
+ print(file=sys.stderr)
+ print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
+ print(' cd path/to/your/archive/folder', file=sys.stderr)
+ print(' archivebox [command]', file=sys.stderr)
+ print(file=sys.stderr)
+ print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
+ print(' archivebox init', file=sys.stderr)
raise SystemExit(2)
-
-
+
+
def check_migrations():
+ from archivebox import DATA_DIR, CONSTANTS
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
if pending_migrations:
- stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
- stderr(f' {DATA_DIR}')
- stderr()
- stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
- stderr(' archivebox init')
+ print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]', file=sys.stderr)
+ print(f' {DATA_DIR}', file=sys.stderr)
+ print(file=sys.stderr)
+ print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
+ print(' archivebox init', file=sys.stderr)
raise SystemExit(3)
CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
@@ -39,3 +46,39 @@ def check_migrations():
# CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
(CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
(CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
+
+
+def check_io_encoding():
+ PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
+
+ if PYTHON_ENCODING != 'UTF-8':
+ print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
+ print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
+ print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
+ print('', file=sys.stderr)
+ print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
+ print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
+ raise SystemExit(2)
+
+
+def check_not_root():
+ from archivebox.config.permissions import IS_ROOT, IN_DOCKER
+
+ attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
+ is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv[:2]
+ is_getting_version = '--version' in sys.argv or 'version' in sys.argv[:2]
+ is_installing = 'setup' in sys.argv[:2] or 'install' in sys.argv[:2]
+
+ if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
+ print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
+ print(' For more information, see the security overview documentation:', file=sys.stderr)
+ print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
+
+ if IN_DOCKER:
+ print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
+ print(f' docker compose run archivebox {attempted_command}', file=sys.stderr)
+ print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
+ print(' or:', file=sys.stderr)
+ print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+ print(f' docker exec -it --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+ raise SystemExit(2)
diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py
index 44789cda..86983176 100644
--- a/archivebox/misc/logging.py
+++ b/archivebox/misc/logging.py
@@ -13,6 +13,7 @@ from rich.highlighter import Highlighter
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
CONSOLE = Console()
+STDERR = Console(stderr=True)
IS_TTY = CONSOLE.is_interactive
@@ -51,7 +52,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'37': [(255, 255, 255), (255, 255, 255)],
})
-# Logging Helpers
+# Logging Helpers (DEPRECATED, use rich.print instead going forward)
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py
index 4ae24d7e..f6814f8f 100644
--- a/archivebox/misc/system.py
+++ b/archivebox/misc/system.py
@@ -4,7 +4,6 @@ __package__ = 'archivebox.misc'
import os
import signal
import shutil
-import getpass
from json import dump
from pathlib import Path
@@ -14,7 +13,7 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write
-from archivebox.config import STORAGE_CONFIG
+from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.util import enforce_types, ExtendedEncoder
diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py
index eaf0bd75..a856fe64 100644
--- a/archivebox/misc/util.py
+++ b/archivebox/misc/util.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox'
+__package__ = 'archivebox.misc'
import re
import requests
@@ -25,10 +25,10 @@ except ImportError:
detect_encoding = lambda rawdata: "utf-8"
-from archivebox.config.constants import STATICFILE_EXTENSIONS
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG
-from .misc.logging import COLOR_DICT
+from .logging import COLOR_DICT
### Parsing Helpers
@@ -120,7 +120,7 @@ def find_all_urls(urls_str: str):
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
- return extension(url).lower() in STATICFILE_EXTENSIONS
+ return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
def enforce_types(func):
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index e89bf155..1abcd1d4 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -13,7 +13,8 @@ from typing import IO, Tuple, List, Optional
from datetime import datetime, timezone
from pathlib import Path
-from archivebox.config import DATA_DIR, CONSTANTS, SHELL_CONFIG, ARCHIVING_CONFIG
+from archivebox.config import DATA_DIR, CONSTANTS
+from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG
from archivebox.misc.system import atomic_write
from archivebox.misc.logging import stderr, hint
from archivebox.misc.util import (
diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py
index 1222a1b2..fee4762c 100644
--- a/archivebox/plugins_extractor/chrome/apps.py
+++ b/archivebox/plugins_extractor/chrome/apps.py
@@ -25,7 +25,8 @@ from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
-from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
diff --git a/archivebox/plugins_extractor/curl/apps.py b/archivebox/plugins_extractor/curl/apps.py
index cab683b5..c496611b 100644
--- a/archivebox/plugins_extractor/curl/apps.py
+++ b/archivebox/plugins_extractor/curl/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
diff --git a/archivebox/plugins_extractor/git/apps.py b/archivebox/plugins_extractor/git/apps.py
index ff7146b2..ebdc9e9f 100644
--- a/archivebox/plugins_extractor/git/apps.py
+++ b/archivebox/plugins_extractor/git/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):
diff --git a/archivebox/plugins_extractor/mercury/apps.py b/archivebox/plugins_extractor/mercury/apps.py
index 78d505b2..58b8c249 100644
--- a/archivebox/plugins_extractor/mercury/apps.py
+++ b/archivebox/plugins_extractor/mercury/apps.py
@@ -5,14 +5,14 @@ from pathlib import Path
from subprocess import run
from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath
+from pydantic_pkgr import BinProvider, BinName, BinProviderName, ProviderLookupDict, bin_abspath
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
+from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
class MercuryConfig(BaseConfigSet):
diff --git a/archivebox/plugins_extractor/readability/apps.py b/archivebox/plugins_extractor/readability/apps.py
index c7a84009..c61efb21 100644
--- a/archivebox/plugins_extractor/readability/apps.py
+++ b/archivebox/plugins_extractor/readability/apps.py
@@ -16,7 +16,7 @@ from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py
index e3535ded..8ebbc41c 100644
--- a/archivebox/plugins_extractor/singlefile/apps.py
+++ b/archivebox/plugins_extractor/singlefile/apps.py
@@ -1,11 +1,11 @@
__package__ = 'archivebox.plugins_extractor.singlefile'
from pathlib import Path
-from typing import List, Dict, Optional, ClassVar
+from typing import List, Dict, Optional
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field, validate_call
+from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, bin_abspath, ShallowBinary
# Depends on other Django apps:
@@ -17,7 +17,7 @@ from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py
index 171bebc4..1e54376b 100644
--- a/archivebox/plugins_extractor/wget/apps.py
+++ b/archivebox/plugins_extractor/wget/apps.py
@@ -14,7 +14,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from .wget_util import wget_output_path
diff --git a/archivebox/plugins_extractor/ytdlp/apps.py b/archivebox/plugins_extractor/ytdlp/apps.py
index 21dfa0bc..2c935797 100644
--- a/archivebox/plugins_extractor/ytdlp/apps.py
+++ b/archivebox/plugins_extractor/ytdlp/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_hook import BaseHook
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################
diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py
index f7a1b986..cc94a807 100644
--- a/archivebox/plugins_search/ripgrep/apps.py
+++ b/archivebox/plugins_search/ripgrep/apps.py
@@ -18,7 +18,8 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
-from archivebox.config import CONSTANTS, SEARCH_BACKEND_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
###################### Config ##########################
diff --git a/archivebox/plugins_search/sonic/apps.py b/archivebox/plugins_search/sonic/apps.py
index efc47ceb..c7342853 100644
--- a/archivebox/plugins_search/sonic/apps.py
+++ b/archivebox/plugins_search/sonic/apps.py
@@ -15,7 +15,7 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
SONIC_LIB = None
try:
diff --git a/archivebox/plugins_search/sqlite/apps.py b/archivebox/plugins_search/sqlite/apps.py
index 98db5363..9f34bfd8 100644
--- a/archivebox/plugins_search/sqlite/apps.py
+++ b/archivebox/plugins_search/sqlite/apps.py
@@ -17,7 +17,7 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
diff --git a/archivebox/queues/supervisor_util.py b/archivebox/queues/supervisor_util.py
index 4e3d749b..035f1e40 100644
--- a/archivebox/queues/supervisor_util.py
+++ b/archivebox/queues/supervisor_util.py
@@ -1,5 +1,6 @@
__package__ = 'archivebox.queues'
+import os
import time
import signal
import psutil
@@ -12,6 +13,8 @@ from typing import Dict, cast
from supervisor.xmlrpc import SupervisorTransport
from xmlrpc.client import ServerProxy
+from archivebox.config.permissions import ARCHIVEBOX_USER
+
from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, SOCK_FILE, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
from typing import Iterator
@@ -42,6 +45,7 @@ childlogdir = {LOGS_DIR}
directory = {DATA_DIR}
strip_ansi = true
nocleanup = true
+user = {ARCHIVEBOX_USER}
[unix_http_server]
file = {TMP_DIR}/{SOCK_FILE.name}
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 81ae87e7..f7394171 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -11,7 +11,7 @@ import abx.archivebox.use
from archivebox.index.schema import Link
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
def log_index_started(url):
diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh
index 9dfacdc9..287a2702 100755
--- a/bin/docker_entrypoint.sh
+++ b/bin/docker_entrypoint.sh
@@ -110,12 +110,11 @@ if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
fi
-# also chown tmp dir
-mkdir -p /tmp/archivebox
-chmod 777 /tmp
-chown $PUID:$PGID /tmp/archivebox
-mkdir -p /app/lib
-chown $PUID:$PGID /app/lib /app/lib/*
+# also chown tmp dir and lib dir
+mkdir -p "$SYSTEM_TMP_DIR"
+chown $PUID:$PGID "$SYSTEM_TMP_DIR"
+mkdir -p "$SYSTEM_LIB_DIR"
+chown $PUID:$PGID "$SYSTEM_LIB_DIR" "$SYSTEM_LIB_DIR"/*
# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"
diff --git a/pyproject.toml b/pyproject.toml
index 599e796e..1a7c60af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
[project]
name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
requires-python = ">=3.10"
description = "Self-hosted internet archiving solution."
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
@@ -77,6 +77,7 @@ dependencies = [
"atomicwrites==1.4.1",
"django-taggit==1.3.0",
"base32-crockford==0.3.0",
+ "platformdirs>=4.3.6",
# "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
# "pydantic-pkgr>=0.4.7",
############# Plugin Dependencies ################
@@ -133,7 +134,6 @@ dev-dependencies = [
"django-autotyping>=0.5.1",
]
-
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
diff --git a/tests/test_init.py b/tests/test_init.py
index 156d1907..e3e2c852 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -7,11 +7,11 @@ from pathlib import Path
import json, shutil
import sqlite3
-from archivebox.config import OUTPUT_PERMISSIONS
+from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
-DIR_PERMISSIONS = OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
+DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
def test_init(tmp_path, process):
assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
@@ -57,7 +57,7 @@ def test_correct_permissions_output_folder(tmp_path, process):
index_files = ['index.sqlite3', 'archive']
for file in index_files:
file_path = tmp_path / file
- assert oct(file_path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+ assert oct(file_path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
@@ -65,7 +65,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
for path in archived_item_path.iterdir():
- assert oct(path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+ assert oct(path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
diff --git a/uv.lock b/uv.lock
index ffb1c7ce..bce8f7ad 100644
--- a/uv.lock
+++ b/uv.lock
@@ -41,7 +41,7 @@ wheels = [
[[package]]
name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
source = { editable = "." }
dependencies = [
{ name = "atomicwrites" },