diff --git a/Dockerfile b/Dockerfile index dafb8845..24a1a7ae 100644 --- a/Dockerfile +++ b/Dockerfile @@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR" RUN openssl rand -hex 16 > /etc/machine-id \ && chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp" ENV IN_DOCKER=True \ - SYSTEM_LIB_DIR=/app/lib \ - SYSTEM_TMP_DIR=/tmp \ + SYSTEM_LIB_DIR=/usr/share/archivebox \ + SYSTEM_TMP_DIR=/tmp/archivebox \ GOOGLE_API_KEY=no \ GOOGLE_DEFAULT_CLIENT_ID=no \ GOOGLE_DEFAULT_CLIENT_SECRET=no \ ALLOWED_HOSTS=* - ## No need to set explicitly, these values will be autodetected by archivebox in docker: - # WGET_BINARY="wget" \ - # YOUTUBEDL_BINARY="yt-dlp" \ - # CHROME_BINARY="/usr/bin/chromium-browser" \ - # USE_SINGLEFILE=True \ - # SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \ - # USE_READABILITY=True \ - # READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \ - # USE_MERCURY=True \ - # MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser" # Print version for nice docker finish summary RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \ diff --git a/archivebox/__init__.py b/archivebox/__init__.py index eab371e2..9bff245a 100755 --- a/archivebox/__init__.py +++ b/archivebox/__init__.py @@ -13,7 +13,7 @@ __package__ = 'archivebox' import os import sys -import tempfile + from pathlib import Path ASCII_LOGO = """ @@ -25,37 +25,36 @@ ASCII_LOGO = """ ╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝╚═╝ ╚═╝╚═╝ ╚═══╝ ╚══════╝ ╚═════╝ ╚═════╝ ╚═╝ ╚═╝ """ -SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox' -SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True) -os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR) -os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings' +# detect ArchiveBox user's UID/GID based on data dir ownership +from archivebox.config.permissions import drop_privileges # noqa +drop_privileges() -# if we are outside a data dir, cd into an ephemeral tmp dir so that -# we can run version/help without polluting cwd with an index.sqlite3 -if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'): - current_dir = Path(os.getcwd()).resolve() - if not (current_dir / 'index.sqlite3').exists(): - os.chdir(SYSTEM_TMP_DIR) +from archivebox.misc.checks import check_not_root, check_io_encoding # noqa +check_not_root() +check_io_encoding() # make sure PACKAGE_DIR is in sys.path so we can import all subfolders # without necessarily waiting for django to load them thorugh INSTALLED_APPS PACKAGE_DIR = Path(__file__).resolve().parent if str(PACKAGE_DIR) not in sys.path: sys.path.append(str(PACKAGE_DIR)) +os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings' # print('INSTALLING MONKEY PATCHES') -from .monkey_patches import * # noqa +from archivebox.monkey_patches import * # noqa # print('DONE INSTALLING MONKEY PATCHES') # print('LOADING VENDORED LIBRARIES') -from .vendor import load_vendored_libs # noqa +from archivebox.vendor import load_vendored_libs # noqa load_vendored_libs() # print('DONE LOADING VENDORED LIBRARIES') -from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa +from archivebox.config.constants import CONSTANTS # noqa +from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa +from archivebox.config.version import VERSION # noqa __version__ = VERSION __author__ = 'Nick Sweeting' diff --git a/archivebox/api/v1_api.py b/archivebox/api/v1_api.py index b71ceb3d..7076f5d1 100644 --- a/archivebox/api/v1_api.py +++ b/archivebox/api/v1_api.py @@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger # TODO: explore adding 
https://eadwincode.github.io/django-ninja-extra/ -from archivebox.config import SHELL_CONFIG, VERSION +from archivebox.config import VERSION +from archivebox.config.version import get_COMMIT_HASH from api.auth import API_AUTH_METHODS -COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown' +COMMIT_HASH = get_COMMIT_HASH() or 'unknown' html_description=f'''

Welcome to your ArchiveBox server's REST API [v1 ALPHA] homepage!

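The import changes in the API hunks above follow a pattern repeated across this diff: path and version constants stay importable from archivebox.config, the Pydantic config sets move to archivebox.config.common, and git/build metadata moves behind helpers in archivebox.config.version. A hedged summary sketch of the new convention (illustrative only, not an exhaustive symbol list):

# Old style (pre-refactor): everything was imported from the archivebox.config package root
# from archivebox.config import VERSION, DATA_DIR, SHELL_CONFIG, ARCHIVING_CONFIG

# New style introduced by this diff:
from archivebox.config import VERSION, DATA_DIR, CONSTANTS              # paths, constants, version string
from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG     # runtime config sets
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME   # lazily-computed build metadata

COMMIT_HASH = get_COMMIT_HASH() or 'unknown'   # e.g. how api/v1_api.py now derives its commit hash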
diff --git a/archivebox/api/v1_cli.py b/archivebox/api/v1_cli.py index 9db7bcad..fe78f8c4 100644 --- a/archivebox/api/v1_cli.py +++ b/archivebox/api/v1_cli.py @@ -13,7 +13,7 @@ from ..main import ( schedule, ) from archivebox.misc.util import ansi_to_html -from archivebox.config import ARCHIVING_CONFIG +from archivebox.config.common import ARCHIVING_CONFIG from .auth import API_AUTH_METHODS diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 1ac7a9f9..ab532a04 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.cli' __command__ = 'archivebox' +import os import sys import argparse import threading @@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup': print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead') sys.argv[1] = 'install' +if '--debug' in sys.argv: + os.environ['DEBUG'] = 'True' + sys.argv.remove('--debug') + # def list_subcommands() -> Dict[str, str]: # """find and import all valid archivebox_.py files in CLI_DIR""" @@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = { 'init': 'archivebox_init', 'install': 'archivebox_install', + ############################################## 'config': 'archivebox_config', - 'add': 'archivebox_add', 'remove': 'archivebox_remove', 'update': 'archivebox_update', @@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = { 'shell': 'archivebox_shell', 'manage': 'archivebox_manage', - 'oneshot': 'archivebox_oneshot', + # 'oneshot': 'archivebox_oneshot', } # every imported command module must have these properties in order to be valid @@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands() # these common commands will appear sorted before any others for ease-of-use meta_cmds = ('help', 'version') # dont require valid data folder at all -main_cmds = ('init', 'config', 'setup', 'install') # dont require existing db present -archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present +setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet +archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present fake_db = ("oneshot",) # use fake in-memory db -display_first = (*meta_cmds, *main_cmds, *archive_cmds) +display_first = (*meta_cmds, *setup_cmds, *archive_cmds) IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting @@ -157,14 +162,16 @@ def run_subcommand(subcommand: str, from archivebox.config.legacy import setup_django # print('DATA_DIR is', DATA_DIR) - # print('pwd is', os.getcwd()) + # print('pwd is', os.getcwd()) cmd_requires_db = subcommand in archive_cmds init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args - setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending) + check_db = cmd_requires_db and not init_pending - if subcommand not in meta_cmds: + setup_django(in_memory_db=subcommand in fake_db, check_db=check_db) + + if subcommand in archive_cmds: if cmd_requires_db: check_migrations() diff --git a/archivebox/cli/archivebox_add.py b/archivebox/cli/archivebox_add.py index 8c44b18b..64a9c54c 100644 --- a/archivebox/cli/archivebox_add.py +++ b/archivebox/cli/archivebox_add.py @@ -9,7 +9,8 @@ import argparse from typing import List, Optional, IO from archivebox.misc.util import docstring -from 
archivebox.config import DATA_DIR, ARCHIVING_CONFIG +from archivebox.config import DATA_DIR +from archivebox.config.common import ARCHIVING_CONFIG from ..main import add from ..parsers import PARSERS diff --git a/archivebox/cli/archivebox_server.py b/archivebox/cli/archivebox_server.py index f25cc0c4..3c57bf43 100644 --- a/archivebox/cli/archivebox_server.py +++ b/archivebox/cli/archivebox_server.py @@ -9,7 +9,8 @@ from pathlib import Path from typing import Optional, List, IO from archivebox.misc.util import docstring -from archivebox.config import DATA_DIR, SERVER_CONFIG +from archivebox.config import DATA_DIR +from archivebox.config.common import SERVER_CONFIG from ..logging_util import SmartFormatter, reject_stdin from ..main import server diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py index 7eb3d52c..d70352e0 100644 --- a/archivebox/config/__init__.py +++ b/archivebox/config/__init__.py @@ -1,27 +1,9 @@ __package__ = 'archivebox.config' -from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION -from .defaults import ( - SHELL_CONFIG, - STORAGE_CONFIG, - GENERAL_CONFIG, - SERVER_CONFIG, - ARCHIVING_CONFIG, - SEARCH_BACKEND_CONFIG, +from .paths import ( + PACKAGE_DIR, # noqa + DATA_DIR, # noqa + ARCHIVE_DIR, # noqa ) - - -__all__ = [ - 'CONSTANTS', - 'PACKAGE_DIR', - 'DATA_DIR', - 'ARCHIVE_DIR', - 'VERSION', - 'SHELL_CONFIG', - 'STORAGE_CONFIG', - 'GENERAL_CONFIG', - 'SERVER_CONFIG', - 'ARCHIVING_CONFIG', - 'SEARCH_BACKEND_CONFIG', - 'CONSTANTS_CONFIG', -] +from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa +from .version import VERSION # noqa diff --git a/archivebox/config/apps.py b/archivebox/config/apps.py index 88c94f8f..e56a9179 100644 --- a/archivebox/config/apps.py +++ b/archivebox/config/apps.py @@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa -from .defaults import ( +from .common import ( ShellConfig, # noqa: F401 StorageConfig, # noqa: F401 GeneralConfig, # noqa: F401 diff --git a/archivebox/config/check_for_update.py b/archivebox/config/check_for_update.py deleted file mode 100644 index a725522a..00000000 --- a/archivebox/config/check_for_update.py +++ /dev/null @@ -1,47 +0,0 @@ -# def get_versions_available_on_github(config): -# """ -# returns a dictionary containing the ArchiveBox GitHub release info for -# the recommended upgrade version and the currently installed version -# """ - -# # we only want to perform the (relatively expensive) check for new versions -# # when its most relevant, e.g. when the user runs a long-running command -# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' -# long_running_commands = ('add', 'schedule', 'update', 'status', 'server') -# if subcommand_run_by_user not in long_running_commands: -# return None - -# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" -# response = requests.get(github_releases_api) -# if response.status_code != 200: -# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! 
(status={response.status_code})', color='lightyellow', config=config) -# return None -# all_releases = response.json() - -# installed_version = parse_version_string(config['VERSION']) - -# # find current version or nearest older version (to link to) -# current_version = None -# for idx, release in enumerate(all_releases): -# release_version = parse_version_string(release['tag_name']) -# if release_version <= installed_version: -# current_version = release -# break - -# current_version = current_version or all_releases[-1] - -# # recommended version is whatever comes after current_version in the release list -# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) -# try: -# recommended_version = all_releases[idx+1] -# except IndexError: -# recommended_version = None - -# return {'recommended_version': recommended_version, 'current_version': current_version} - -# def can_upgrade(config): -# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']: -# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name']) -# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name']) -# return recommended_version > current_version -# return False diff --git a/archivebox/config/defaults.py b/archivebox/config/common.py similarity index 63% rename from archivebox/config/defaults.py rename to archivebox/config/common.py index e4146f25..b17fde09 100644 --- a/archivebox/config/defaults.py +++ b/archivebox/config/common.py @@ -1,21 +1,21 @@ __package__ = 'archivebox.config' -import os import sys import shutil from typing import Dict, Optional -from datetime import datetime from pathlib import Path from rich import print -from pydantic import Field, field_validator, model_validator, computed_field +from pydantic import Field, field_validator, computed_field from django.utils.crypto import get_random_string from abx.archivebox.base_configset import BaseConfigSet -from .constants import CONSTANTS, PACKAGE_DIR +from .constants import CONSTANTS +from .version import get_COMMIT_HASH, get_BUILD_TIME +from .permissions import IN_DOCKER ###################### Config ########################## @@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet): USE_COLOR: bool = Field(default=lambda c: c.IS_TTY) SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY) - IN_DOCKER: bool = Field(default=False) + IN_DOCKER: bool = Field(default=IN_DOCKER) IN_QEMU: bool = Field(default=False) - - USER: str = Field(default=Path('~').expanduser().resolve().name) - PUID: int = Field(default=os.getuid()) - PGID: int = Field(default=os.getgid()) - - PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')) ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS) @@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet): @computed_field @property def COMMIT_HASH(self) -> Optional[str]: - try: - git_dir = PACKAGE_DIR / '../.git' - ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1] - commit_hash = git_dir.joinpath(ref).read_text().strip() - return commit_hash - except Exception: - pass - - try: - return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip() - except Exception: - pass - - return None + return get_COMMIT_HASH() @computed_field @property def BUILD_TIME(self) -> str: - if self.IN_DOCKER: - 
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0] - return docker_build_end_time - - src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime - return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s') - - - @model_validator(mode='after') - def validate_not_running_as_root(self): - attempted_command = ' '.join(sys.argv[:3]) - if self.PUID == 0 and attempted_command not in ('setup', 'install'): - # stderr('[!] ArchiveBox should never be run as root!', color='red') - # stderr(' For more information, see the security overview documentation:') - # stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root') - print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr) - print(' For more information, see the security overview documentation:', file=sys.stderr) - print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr) - - if self.IN_DOCKER: - print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr) - print(' docker compose run archivebox {attempted_command}', file=sys.stderr) - print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr) - print(' or:', file=sys.stderr) - print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr) - print(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr) - raise SystemExit(2) - - # check python locale - if self.PYTHON_ENCODING != 'UTF-8': - print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr) - print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr) - print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr) - print('') - print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr) - print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr) - raise SystemExit(2) - - return self + return get_BUILD_TIME() SHELL_CONFIG = ShellConfig() diff --git a/archivebox/config/config_stubs.py b/archivebox/config/config_stubs.py deleted file mode 100644 index 20c803bb..00000000 --- a/archivebox/config/config_stubs.py +++ /dev/null @@ -1,115 +0,0 @@ -from pathlib import Path -from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List -from mypy_extensions import TypedDict - -from benedict import benedict - -SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]] -SimpleConfigValueDict = Dict[str, SimpleConfigValue] -SimpleConfigValueGetter = Callable[[], SimpleConfigValue] -ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter] - - - -class BaseConfig(TypedDict): - pass - -class ConfigDict(BaseConfig, benedict, total=False): - """ - # Regenerate by pasting this quine into `archivebox shell` 🥚 - from archivebox.config import ConfigDict, CONFIG_DEFAULTS - print('class ConfigDict(BaseConfig, total=False):') - print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3) - for section, configs in CONFIG_DEFAULTS.items(): - for key, attrs in configs.items(): - Type, default = 
attrs['type'], attrs['default'] - if default is None: - print(f' {key}: Optional[{Type.__name__}]') - else: - print(f' {key}: {Type.__name__}') - print() - """ - - IS_TTY: bool - USE_COLOR: bool - SHOW_PROGRESS: bool - IN_DOCKER: bool - - PACKAGE_DIR: Path - CONFIG_FILE: Path - ONLY_NEW: bool - TIMEOUT: int - MEDIA_TIMEOUT: int - OUTPUT_PERMISSIONS: str - RESTRICT_FILE_NAMES: str - URL_DENYLIST: str - - SECRET_KEY: Optional[str] - BIND_ADDR: str - ALLOWED_HOSTS: str - DEBUG: bool - PUBLIC_INDEX: bool - PUBLIC_SNAPSHOTS: bool - FOOTER_INFO: str - - SAVE_TITLE: bool - SAVE_FAVICON: bool - SAVE_WGET: bool - SAVE_WGET_REQUISITES: bool - SAVE_SINGLEFILE: bool - SAVE_READABILITY: bool - SAVE_MERCURY: bool - SAVE_PDF: bool - SAVE_SCREENSHOT: bool - SAVE_DOM: bool - SAVE_WARC: bool - SAVE_GIT: bool - SAVE_MEDIA: bool - SAVE_ARCHIVE_DOT_ORG: bool - - RESOLUTION: str - GIT_DOMAINS: str - CHECK_SSL_VALIDITY: bool - CURL_USER_AGENT: str - WGET_USER_AGENT: str - CHROME_USER_AGENT: str - COOKIES_FILE: Union[str, Path, None] - CHROME_USER_DATA_DIR: Union[str, Path, None] - CHROME_TIMEOUT: int - CHROME_HEADLESS: bool - CHROME_SANDBOX: bool - - USE_CURL: bool - USE_WGET: bool - USE_SINGLEFILE: bool - USE_READABILITY: bool - USE_MERCURY: bool - USE_GIT: bool - USE_CHROME: bool - USE_YOUTUBEDL: bool - CURL_BINARY: str - GIT_BINARY: str - WGET_BINARY: str - SINGLEFILE_BINARY: str - READABILITY_BINARY: str - MERCURY_BINARY: str - YOUTUBEDL_BINARY: str - CHROME_BINARY: Optional[str] - - YOUTUBEDL_ARGS: List[str] - WGET_ARGS: List[str] - CURL_ARGS: List[str] - GIT_ARGS: List[str] - TAG_SEPARATOR_PATTERN: str - - -ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue] -ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter] - -ConfigDefault = TypedDict('ConfigDefault', { - 'default': ConfigDefaultValue, - 'type': Optional[Type], - 'aliases': Optional[Tuple[str, ...]], -}, total=False) - -ConfigDefaultDict = Dict[str, ConfigDefault] diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py index 25082fa8..5e646e58 100644 --- a/archivebox/config/constants.py +++ b/archivebox/config/constants.py @@ -1,118 +1,115 @@ __package__ = 'archivebox.config' - import os import re import platform -import tempfile from typing import Dict from pathlib import Path -import importlib.metadata from collections.abc import Mapping from benedict import benedict from ..misc.logging import DEFAULT_CLI_COLORS +from .paths import ( + PACKAGE_DIR, + DATA_DIR, + ARCHIVE_DIR, + get_collection_id, + get_LIB_DIR, + get_TMP_DIR, +) +from .permissions import ( + IS_ROOT, + IN_DOCKER, + RUNNING_AS_UID, + RUNNING_AS_GID, + DEFAULT_PUID, + DEFAULT_PGID, + ARCHIVEBOX_USER, + ARCHIVEBOX_GROUP, +) +from .version import detect_installed_version + ###################### Config ########################## -PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir -DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir -ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir - -def _detect_installed_version(PACKAGE_DIR: Path): - """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file""" - try: - # if in production install, use pip-installed package metadata - return importlib.metadata.version(__package__ or 'archivebox').strip() - except importlib.metadata.PackageNotFoundError: - pass - - try: - # if in dev Git repo dir, use pyproject.toml file - pyproject_config = (PACKAGE_DIR.parent / 
'pyproject.toml').read_text().split('\n') - for line in pyproject_config: - if line.startswith('version = '): - return line.split(' = ', 1)[-1].strip('"').strip() - except FileNotFoundError: - # building docs, pyproject.toml is not available - pass - - # raise Exception('Failed to detect installed archivebox version!') - return 'dev' - -VERSION: str = _detect_installed_version(PACKAGE_DIR) - - - class ConstantsDict(Mapping): - IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes') - OS = platform.system().lower() # darwin, linux, etc. - ARCH = platform.machine().lower() # arm64, x86_64, etc. - LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '') - - PACKAGE_DIR: Path = PACKAGE_DIR # archivebox source code dir - DATA_DIR: Path = DATA_DIR # archivebox user data dir - ARCHIVE_DIR: Path = ARCHIVE_DIR # archivebox snapshot data dir - VERSION: str = VERSION + PACKAGE_DIR: Path = PACKAGE_DIR + DATA_DIR: Path = DATA_DIR + ARCHIVE_DIR: Path = ARCHIVE_DIR + COLLECTION_ID: str = get_collection_id(DATA_DIR) + # Host system + VERSION: str = detect_installed_version(PACKAGE_DIR) + OS: str = platform.system().lower() # darwin, linux, etc. + ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc. + IN_DOCKER: bool = IN_DOCKER + + # Permissions + IS_ROOT: bool = IS_ROOT + ARCHIVEBOX_USER: int = ARCHIVEBOX_USER + ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP + RUNNING_AS_UID: int = RUNNING_AS_UID + RUNNING_AS_GID: int = RUNNING_AS_GID + DEFAULT_PUID: int = DEFAULT_PUID + DEFAULT_PGID: int = DEFAULT_PGID + + # Source code dirs PACKAGE_DIR_NAME: str = PACKAGE_DIR.name TEMPLATES_DIR_NAME: str = 'templates' TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME - STATIC_DIR: Path = TEMPLATES_DIR / 'static' + STATIC_DIR_NAME: str = 'static' + STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME + + # Data dirs + ARCHIVE_DIR_NAME: str = 'archive' + SOURCES_DIR_NAME: str = 'sources' + PERSONAS_DIR_NAME: str = 'personas' + CRONTABS_DIR_NAME: str = 'crontabs' + CACHE_DIR_NAME: str = 'cache' + LOGS_DIR_NAME: str = 'logs' USER_PLUGINS_DIR_NAME: str = 'user_plugins' CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates' - - ARCHIVE_DIR_NAME: str = 'archive' - SOURCES_DIR_NAME: str = 'sources' - PERSONAS_DIR_NAME: str = 'personas' - CRONTABS_DIR_NAME: str = 'crontabs' - CACHE_DIR_NAME: str = 'cache' - LOGS_DIR_NAME: str = 'logs' - LIB_DIR_NAME: str = 'lib' - TMP_DIR_NAME: str = 'tmp' - - SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox') - # DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16] # cant be used because of socket path length restrictions break too often if data dir is in some deep subdir: ocket.error reported AF_UNIX path too long - SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME) - DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE - ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME - CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME - LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR # e.g. 
/app/lib or ./data/lib/arm64-darwin-docker - TMP_DIR: Path = SYSTEM_TMP_DIR + CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME + # Data dir files + CONFIG_FILENAME: str = 'ArchiveBox.conf' + SQL_INDEX_FILENAME: str = 'index.sqlite3' + QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3' + CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME + DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME + QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME + + JSON_INDEX_FILENAME: str = 'index.json' + HTML_INDEX_FILENAME: str = 'index.html' + ROBOTS_TXT_FILENAME: str = 'robots.txt' + FAVICON_FILENAME: str = 'favicon.ico' + + # Runtime dirs + TMP_DIR_NAME: str = 'tmp' + TMP_DIR: Path = get_TMP_DIR() + LIB_DIR_NAME: str = 'lib' + LIB_DIR: Path = get_LIB_DIR() LIB_PIP_DIR: Path = LIB_DIR / 'pip' LIB_NPM_DIR: Path = LIB_DIR / 'npm' LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers' LIB_BIN_DIR: Path = LIB_DIR / 'bin' BIN_DIR: Path = LIB_BIN_DIR - CONFIG_FILENAME: str = 'ArchiveBox.conf' - SQL_INDEX_FILENAME: str = 'index.sqlite3' - QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3' + # Config constants + TIMEZONE: str = 'UTC' + DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS + DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS}) - CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME - DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME - QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME - - JSON_INDEX_FILENAME: str = 'index.json' - HTML_INDEX_FILENAME: str = 'index.html' - ROBOTS_TXT_FILENAME: str = 'robots.txt' - FAVICON_FILENAME: str = 'favicon.ico' - - TIMEZONE: str = 'UTC' - DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS - DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS}) - - ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE + ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE STATICFILE_EXTENSIONS: frozenset[str] = frozenset(( # 99.999% of the time, URLs ending in these extensions are static files @@ -136,17 +133,6 @@ class ConstantsDict(Mapping): # html, htm, shtml, xhtml, xml, aspx, php, cgi )) - INGORED_PATHS: frozenset[str] = frozenset(( - ".git", - ".svn", - ".DS_Store", - ".gitignore", - "lost+found", - ".DS_Store", - ".env", - "Dockerfile", - ".ArchiveBox.conf.bak", - )) PIP_RELATED_NAMES: frozenset[str] = frozenset(( ".venv", "venv", @@ -160,7 +146,15 @@ class ConstantsDict(Mapping): "yarn.lock", )) - DATA_DIR_NAMES: frozenset[str] = frozenset(( + # When initializing archivebox in a new directory, we check to make sure the dir is + # actually empty so that we dont clobber someone's home directory or desktop by accident. + # These files are exceptions to the is_empty check when we're trying to init a new dir, + # as they could be from a previous archivebox version, system artifacts, dependencies, etc. 
+ ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset(( + *PIP_RELATED_NAMES, + *NPM_RELATED_NAMES, + + ### Dirs: ARCHIVE_DIR_NAME, SOURCES_DIR_NAME, LOGS_DIR_NAME, @@ -171,9 +165,12 @@ class ConstantsDict(Mapping): CUSTOM_TEMPLATES_DIR_NAME, USER_PLUGINS_DIR_NAME, CRONTABS_DIR_NAME, - )) - DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES) - DATA_FILE_NAMES: frozenset[str] = frozenset(( + "static", # created by old static exports str: # These are derived/computed values calculated *after* all user-provided config values are ingested # they appear in `archivebox config` output and are intended to be read-only for the user -DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { +DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = { 'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)}, 'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)}, @@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = { def load_config_val(key: str, - default: ConfigDefaultValue=None, + default: Any=None, type: Optional[Type]=None, aliases: Optional[Tuple[str, ...]]=None, config: Optional[benedict]=None, env_vars: Optional[os._Environ]=None, - config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue: + config_file_vars: Optional[Dict[str, str]]=None) -> Any: """parse bool, int, and str key=value pairs from env""" assert isinstance(config, dict) @@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA -def load_config(defaults: ConfigDefaultDict, +def load_config(defaults: Dict[str, Any], config: Optional[benedict]=None, out_dir: Optional[str]=None, env_vars: Optional[os._Environ]=None, @@ -505,7 +498,7 @@ def load_all_config(): # add all final config values in CONFIG to globals in this file CONFIG: benedict = load_all_config() globals().update(CONFIG) -# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ... 
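The ALLOWED_IN_DATA_DIR comment added to constants.py above explains why the frozenset exists (refusing to init inside a non-empty home directory or desktop by accident). A minimal sketch of the kind of emptiness check it enables; the helper name and call site here are illustrative assumptions, not part of this diff:

from pathlib import Path
from archivebox.config import CONSTANTS

def dir_is_safe_to_init(data_dir: Path) -> bool:
    # Hypothetical helper: True if data_dir only contains files/dirs ArchiveBox recognizes
    unexpected = [
        entry.name for entry in data_dir.iterdir()
        if entry.name not in CONSTANTS.ALLOWED_IN_DATA_DIR
    ]
    return not unexpected

# e.g. refuse to run `archivebox init` inside ~/ or ~/Desktop:
# if not dir_is_safe_to_init(Path.cwd()):
#     raise SystemExit('Refusing to init: directory contains unrecognized files')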
+ # print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV") @@ -521,8 +514,8 @@ globals().update(CONFIG) # Set timezone to UTC and umask to OUTPUT_PERMISSIONS -assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})' # noqa: F821 -os.environ["TZ"] = TIMEZONE # noqa: F821 +assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})' # noqa: F821 +os.environ["TZ"] = CONSTANTS.TIMEZONE # noqa: F821 os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821 ########################### Config Validity Checkers ########################### @@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS: os.environ['TERM'] = 'dumb' # recreate rich console obj based on new config values -CONSOLE = Console() +STDOUT = CONSOLE = Console() +STDERR = Console(stderr=True) from ..misc import logging logging.CONSOLE = CONSOLE @@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE INITIAL_STARTUP_PROGRESS = None INITIAL_STARTUP_PROGRESS_TASK = 0 -def bump_startup_progress_bar(): +def bump_startup_progress_bar(advance=1): global INITIAL_STARTUP_PROGRESS global INITIAL_STARTUP_PROGRESS_TASK if INITIAL_STARTUP_PROGRESS: - INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore + INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore def setup_django_minimal(): @@ -559,6 +553,8 @@ DJANGO_SET_UP = False def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None: + from rich.panel import Panel + global INITIAL_STARTUP_PROGRESS global INITIAL_STARTUP_PROGRESS_TASK global DJANGO_SET_UP @@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON # TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes return - with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS: + with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS: INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25) output_dir = out_dir or CONSTANTS.DATA_DIR @@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON else: # Otherwise use default sqlite3 file-based database and initialize django # without running migrations automatically (user runs them manually by calling init) - django.setup() + try: + django.setup() + except Exception as e: + bump_startup_progress_bar(advance=1000) + STDERR.print() + STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red')) + STDERR.print() + return bump_startup_progress_bar() @@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n") if check_db: + # make sure the data dir is owned by a non-root user + if CONSTANTS.DATA_DIR.stat().st_uid == 0: + STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]') + STDERR.print(f' {CONSTANTS.DATA_DIR}') + STDERR.print() + STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right 
folder? (and as a non-root user?)') + STDERR.print(' cd path/to/your/archive/data') + STDERR.print(' archivebox [command]') + STDERR.print() + raise SystemExit(9) + # Create cache table in DB if needed try: from django.core.cache import cache diff --git a/archivebox/config/paths.py b/archivebox/config/paths.py new file mode 100644 index 00000000..a0bc69a9 --- /dev/null +++ b/archivebox/config/paths.py @@ -0,0 +1,152 @@ +__package__ = 'archivebox.config' + +import os +import tempfile +import hashlib +from pathlib import Path + +from functools import cache +from platformdirs import PlatformDirs + +from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP + +############################################################################################# + +PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir +DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir +ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir + +############################################################################################# + +@cache +def get_collection_id(DATA_DIR=DATA_DIR): + """Get a short, stable, unique ID for the current collection""" + collection_id_file = DATA_DIR / '.collection_id' + + try: + return collection_id_file.read_text().strip() + except (OSError, FileNotFoundError, PermissionError): + pass + + hash_key = str(DATA_DIR.resolve()).encode() + collection_id = hashlib.sha256(hash_key).hexdigest()[:8] + try: + collection_id_file.write_text(collection_id) + except (OSError, FileNotFoundError, PermissionError): + pass + return collection_id + + +def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool: + """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)""" + current_uid, current_gid = os.geteuid(), os.getegid() + uid, gid = uid or current_uid, gid or current_gid + + test_file = dir_path / '.permissions_test' + try: + with SudoPermission(uid=uid, fallback=fallback): + test_file.exists() + test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir') + test_file.unlink() + return True + except (IOError, OSError, PermissionError): + pass + + return False + + + +@cache +def get_LIB_DIR(): + """ + - should be shared with other collections on the same host + - must be scoped by CPU architecture, OS family, and archivebox version + - should not be shared with other hosts/archivebox versions + - must be writable by any archivebox user + - should be persistent across reboots + - can be on a docker bin mount but probably shouldnt be + - ok to have a long path (doesnt contain SOCKETS) + """ + from .version import detect_installed_version + + HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) + + if 'SYSTEM_LIB_DIR' in os.environ: + lib_dir = Path(os.environ['SYSTEM_LIB_DIR']) + else: + with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True): + lib_dir = HOST_DIRS.site_data_path + + # Docker: /usr/local/share/archivebox/0.8.5 + # Ubuntu: /usr/local/share/archivebox/0.8.5 + # macOS: /Library/Application Support/archivebox + try: + with SudoPermission(uid=0, fallback=True): + lib_dir.mkdir(parents=True, exist_ok=True) + except PermissionError: + # our user cannot + lib_dir = HOST_DIRS.user_data_path + lib_dir.mkdir(parents=True, exist_ok=True) + + if not 
dir_is_writable(lib_dir): + if IS_ROOT: + # make sure lib dir is owned by the archivebox user, not root + with SudoPermission(uid=0): + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"') + else: + raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') + + return lib_dir + +@cache +def get_TMP_DIR(): + """ + - must NOT be inside DATA_DIR / inside a docker volume bind mount + - must NOT have a long PATH (UNIX socket path length restrictions) + - must NOT be shared with other collections/hosts + - must be writable by archivebox user & root + - must be cleared on every boot / not persisted + - must be cleared on every archivebox version upgrade + """ + from .version import detect_installed_version + + HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False) + + # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP) + # print('RUNNING AS:', self.PUID, self.PGID) + + if 'SYSTEM_TMP_DIR' in os.environ: + run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR) + with SudoPermission(uid=0, fallback=True): + run_dir.mkdir(parents=True, exist_ok=True) + if not dir_is_writable(run_dir): + if IS_ROOT: + with SudoPermission(uid=0, fallback=False): + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') + else: + raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') + assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' + return run_dir + + run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve() + try: + assert len(str(run_dir)) + len('/supervisord.sock') < 95 + except AssertionError: + run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR) + assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)' + + with SudoPermission(uid=0, fallback=True): + run_dir.mkdir(parents=True, exist_ok=True) + + if not dir_is_writable(run_dir): + if IS_ROOT: + with SudoPermission(uid=0): + os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"') + else: + raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}') + + # Docker: /tmp/archivebox/0.8.5/abc324235 + # Ubuntu: /tmp/archivebox/0.8.5/abc324235 + # macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235 + return run_dir + diff --git a/archivebox/config/permissions.py b/archivebox/config/permissions.py new file mode 100644 index 00000000..46e9c3f5 --- /dev/null +++ b/archivebox/config/permissions.py @@ -0,0 +1,70 @@ +__package__ = 'archivebox.config' + +import os +from pathlib import Path +from contextlib import contextmanager + +############################################################################################# + +DATA_DIR = Path(os.getcwd()) + +DATA_DIR_STAT = Path(DATA_DIR).stat() +DATA_DIR_UID = DATA_DIR_STAT.st_uid +DATA_DIR_GID = DATA_DIR_STAT.st_gid +DEFAULT_PUID = 911 +DEFAULT_PGID = 911 +RUNNING_AS_UID = os.getuid() +RUNNING_AS_GID = os.getgid() +EUID = os.geteuid() +EGID = 
os.getegid() +USER: str = Path('~').expanduser().resolve().name + +IS_ROOT = RUNNING_AS_UID == 0 +IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') + +os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID)) +os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID)) + +ARCHIVEBOX_USER = int(os.environ['PUID']) +ARCHIVEBOX_GROUP = int(os.environ['PGID']) + +############################################################################################# + +def drop_privileges(): + """If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)""" + + # always run archivebox as the user that owns the data dir, never as root + if os.getuid() == 0: + # drop permissions to the user that owns the data dir / provided PUID + if os.geteuid() != ARCHIVEBOX_USER: + os.seteuid(ARCHIVEBOX_USER) + # if we need sudo (e.g. for installing dependencies) code should use SudoPermissions() context manager to regain root + + +@contextmanager +def SudoPermission(uid=0, fallback=False): + """Attempt to run code with sudo permissions for a given user (or root)""" + + if os.geteuid() == uid: + # no need to change effective UID, we are already that user + yield + return + + try: + # change our effective UID to the given UID + os.seteuid(uid) + except PermissionError as err: + if not fallback: + raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err + try: + # yield back to the caller so they can run code inside context as root + yield + finally: + # then set effective UID back to DATA_DIR owner + DATA_DIR_OWNER = DATA_DIR.stat().st_uid + try: + os.seteuid(DATA_DIR_OWNER) + except PermissionError as err: + if not fallback: + raise PermissionError(f'Failed to revert uid={uid} back to {DATA_DIR_OWNER} after running code with sudo') from err + diff --git a/archivebox/config/version.py b/archivebox/config/version.py new file mode 100644 index 00000000..26df4592 --- /dev/null +++ b/archivebox/config/version.py @@ -0,0 +1,121 @@ +__package__ = 'archivebox.config' + +import os +import importlib.metadata + +from pathlib import Path +from functools import cache +from datetime import datetime +from typing import Optional + +############################################################################################# + +IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes') + +PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir +DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir +ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir + +############################################################################################# + + +@cache +def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR): + """Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file""" + try: + # if in production install, use pip-installed package metadata + return importlib.metadata.version('archivebox').strip() + except importlib.metadata.PackageNotFoundError: + pass + + try: + # if in dev Git repo dir, use pyproject.toml file + pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n') + for line in pyproject_config: + if line.startswith('version = '): + return line.split(' = ', 1)[-1].strip('"').strip() + except FileNotFoundError: + # building docs, pyproject.toml is not available + pass + 
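Two constraints encoded by the new paths.py above are worth spelling out: get_collection_id() derives a stable 8-character ID from a SHA-256 of the resolved DATA_DIR path (cached in .collection_id), and get_TMP_DIR() keeps the full supervisord socket path under 95 characters to stay safely inside the ~108-character UNIX socket path limit. A small worked sketch of both calculations, using an example /data path and an example version segment (neither taken from this diff):

import hashlib
from pathlib import Path

# Collection ID: first 8 hex chars of sha256(resolved data dir path), same scheme as get_collection_id()
data_dir = Path('/data').resolve()                                    # example path
collection_id = hashlib.sha256(str(data_dir).encode()).hexdigest()[:8]

# Runtime dir: the socket path must stay short (< 95 chars budgeted, UNIX limit is ~108)
run_dir = Path('/tmp/archivebox') / '0.8.5' / collection_id           # '0.8.5' is an example version
socket_path = run_dir / 'supervisord.sock'
assert len(str(socket_path)) < 95, 'set SYSTEM_TMP_DIR to a shorter path'
print(collection_id, socket_path)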
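The new permissions.py above resolves the effective ArchiveBox user in order of preference (the data-dir owner, then the invoking UID/GID, then the 911 default) and provides SudoPermission() for the few steps that genuinely need root. A hedged usage sketch of the intended pattern (the chown mirrors the one get_LIB_DIR() performs; the path is an example based on the Dockerfile's SYSTEM_LIB_DIR):

import os
from archivebox.config.permissions import (
    drop_privileges, SudoPermission,
    ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, IS_ROOT,
)

drop_privileges()   # if started as root, seteuid() down to the data-dir owner / PUID

if IS_ROOT:
    # temporarily regain root for a privileged step; on exit the context manager
    # restores the effective UID to the data-dir owner
    with SudoPermission(uid=0):
        os.system(f'chown -R {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "/usr/share/archivebox"')  # example dir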
+ # raise Exception('Failed to detect installed archivebox version!') + return 'dev' + + +@cache +def get_COMMIT_HASH() -> Optional[str]: + try: + git_dir = PACKAGE_DIR / '../.git' + ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1] + commit_hash = git_dir.joinpath(ref).read_text().strip() + return commit_hash + except Exception: + pass + + try: + return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip() + except Exception: + pass + + return None + +@cache +def get_BUILD_TIME() -> str: + if IN_DOCKER: + docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0] + return docker_build_end_time + + src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime + return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s') + + +# def get_versions_available_on_github(config): +# """ +# returns a dictionary containing the ArchiveBox GitHub release info for +# the recommended upgrade version and the currently installed version +# """ + +# # we only want to perform the (relatively expensive) check for new versions +# # when its most relevant, e.g. when the user runs a long-running command +# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help' +# long_running_commands = ('add', 'schedule', 'update', 'status', 'server') +# if subcommand_run_by_user not in long_running_commands: +# return None + +# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases" +# response = requests.get(github_releases_api) +# if response.status_code != 200: +# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config) +# return None +# all_releases = response.json() + +# installed_version = parse_version_string(config['VERSION']) + +# # find current version or nearest older version (to link to) +# current_version = None +# for idx, release in enumerate(all_releases): +# release_version = parse_version_string(release['tag_name']) +# if release_version <= installed_version: +# current_version = release +# break + +# current_version = current_version or all_releases[-1] + +# # recommended version is whatever comes after current_version in the release list +# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest) +# try: +# recommended_version = all_releases[idx+1] +# except IndexError: +# recommended_version = None + +# return {'recommended_version': recommended_version, 'current_version': current_version} + +# def can_upgrade(config): +# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']: +# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name']) +# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name']) +# return recommended_version > current_version +# return False + + +VERSION: str = detect_installed_version() diff --git a/archivebox/core/middleware.py b/archivebox/core/middleware.py index 181d67f0..1cbe540e 100644 --- a/archivebox/core/middleware.py +++ b/archivebox/core/middleware.py @@ -5,7 +5,7 @@ from django.utils import timezone from django.contrib.auth.middleware import RemoteUserMiddleware from django.core.exceptions import ImproperlyConfigured -from archivebox.config import SERVER_CONFIG +from archivebox.config.common import SERVER_CONFIG def detect_timezone(request, 
activate: bool=True): diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 77806188..e374ff4f 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -13,7 +13,8 @@ import abx.archivebox import abx.archivebox.use import abx.django.use -from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG # noqa +from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3] IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 7dbbf110..f3d7ef93 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -27,7 +27,8 @@ from core.admin import result_url from queues.tasks import bg_add -from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG +from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION +from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG from archivebox.misc.util import base_url, htmlencode, ts_to_date_str from .serve_static import serve_static_with_byterange_support diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py index 423f1601..16536d1f 100644 --- a/archivebox/extractors/htmltotext.py +++ b/archivebox/extractors/htmltotext.py @@ -5,7 +5,8 @@ import io from pathlib import Path from typing import Optional -from archivebox.config import VERSION, ARCHIVING_CONFIG +from archivebox.config import VERSION +from archivebox.config.common import ARCHIVING_CONFIG from archivebox.config.legacy import SAVE_HTMLTOTEXT from archivebox.misc.system import atomic_write from archivebox.misc.util import enforce_types, is_static_file diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index bff099cd..e2000a68 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -12,9 +12,11 @@ from urllib.parse import urlparse from django.db.models import QuerySet, Q -from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG -from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder from archivebox.misc.logging import stderr +from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder + +from archivebox.config import DATA_DIR, CONSTANTS +from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN from ..logging_util import ( diff --git a/archivebox/index/html.py b/archivebox/index/html.py index 307add0d..b46e9911 100644 --- a/archivebox/index/html.py +++ b/archivebox/index/html.py @@ -16,7 +16,9 @@ from archivebox.misc.util import ( htmlencode, urldecode, ) -from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG +from archivebox.config import CONSTANTS, DATA_DIR, VERSION +from archivebox.config.common import SERVER_CONFIG +from archivebox.config.version import get_COMMIT_HASH from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG from .schema import Link @@ -56,7 +58,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) -> return render_django_template(template, { 'version': VERSION, - 'git_sha': SHELL_CONFIG.COMMIT_HASH or VERSION, + 'git_sha': 
get_COMMIT_HASH() or VERSION, 'num_links': str(len(links)), 'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'), 'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'), diff --git a/archivebox/index/json.py b/archivebox/index/json.py index 945f73d1..d666b4b1 100644 --- a/archivebox/index/json.py +++ b/archivebox/index/json.py @@ -8,7 +8,8 @@ from pathlib import Path from datetime import datetime, timezone from typing import List, Optional, Iterator, Any, Union -from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL_CONFIG +from archivebox.config import VERSION, DATA_DIR, CONSTANTS +from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG from .schema import Link from archivebox.misc.system import atomic_write diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index 892f11b7..cb07d546 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -9,7 +9,8 @@ from django.db.models import QuerySet from django.db import transaction from archivebox.misc.util import enforce_types, parse_date -from archivebox.config import DATA_DIR, GENERAL_CONFIG +from archivebox.config import DATA_DIR +from archivebox.config.common import GENERAL_CONFIG from .schema import Link diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py index d0de496d..b2ef9a8a 100644 --- a/archivebox/logging_util.py +++ b/archivebox/logging_util.py @@ -22,7 +22,8 @@ from rich.panel import Panel from rich_argparse import RichHelpFormatter from django.core.management.base import DjangoHelpFormatter -from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG +from archivebox.config import CONSTANTS, DATA_DIR, VERSION +from archivebox.config.common import SHELL_CONFIG from archivebox.misc.system import get_dir_size from archivebox.misc.util import enforce_types from archivebox.misc.logging import ANSI, stderr diff --git a/archivebox/main.py b/archivebox/main.py index 8a8fc59a..e1779b8b 100755 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -14,13 +14,15 @@ from crontab import CronTab, CronSlices from django.db.models import QuerySet from django.utils import timezone -from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR, SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG +from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR +from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG +from archivebox.config.permissions import SudoPermission, IN_DOCKER from .cli import ( CLI_SUBCOMMANDS, run_subcommand, display_first, meta_cmds, - main_cmds, + setup_cmds, archive_cmds, ) from .parsers import ( @@ -101,7 +103,7 @@ def help(out_dir: Path=DATA_DIR) -> None: ) + '\n\n ' + '\n '.join( f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' for cmd, func in all_subcommands.items() - if cmd in main_cmds + if cmd in setup_cmds ) + '\n\n ' + '\n '.join( f'[green]{cmd.ljust(20)}[/green] {func.__doc__}' for cmd, func in all_subcommands.items() @@ -119,10 +121,10 @@ def help(out_dir: Path=DATA_DIR) -> None: [grey53]# using Docker:[/grey53] [blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53] -''' if SHELL_CONFIG.IN_DOCKER else '' - DOCKER_DOCS = '\n 
[link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if SHELL_CONFIG.IN_DOCKER else '' - DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if SHELL_CONFIG.IN_DOCKER else '' - DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if SHELL_CONFIG.IN_DOCKER else '' +''' if IN_DOCKER else '' + DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else '' + DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else '' + DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else '' print(f'''{DOCKER_USAGE} [deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT} @@ -158,7 +160,7 @@ def help(out_dir: Path=DATA_DIR) -> None: print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.')) else: DATA_SETUP_HELP = '\n' - if SHELL_CONFIG.IN_DOCKER: + if IN_DOCKER: DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n' DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n' DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n' @@ -190,6 +192,8 @@ def version(quiet: bool=False, from plugins_auth.ldap.apps import LDAP_CONFIG from django.conf import settings + from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID # 0.7.1 # ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365 @@ -198,13 +202,14 @@ def version(quiet: bool=False, # DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False p = platform.uname() + COMMIT_HASH = get_COMMIT_HASH() prnt( '[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION), - f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}', - f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}', + f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}', + f'BUILD_TIME={get_BUILD_TIME()}', ) prnt( - f'IN_DOCKER={SHELL_CONFIG.IN_DOCKER}', + f'IN_DOCKER={IN_DOCKER}', f'IN_QEMU={SHELL_CONFIG.IN_QEMU}', f'ARCH={p.machine}', f'OS={p.system}', @@ -212,11 +217,13 @@ def version(quiet: bool=False, f'PYTHON={sys.implementation.name.title()}', ) OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount + DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat() prnt( + f'EUID={os.geteuid()} UID={RUNNING_AS_UID} PUID={ARCHIVEBOX_USER} FS_UID={DATA_DIR_STAT.st_uid}', + f'EGID={os.getegid()} GID={RUNNING_AS_GID} PGID={ARCHIVEBOX_GROUP} FS_GID={DATA_DIR_STAT.st_gid}', + f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}', f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}', f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}', - f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}', - f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}', ) prnt( f'DEBUG={SHELL_CONFIG.DEBUG}', @@ -261,8 +268,36 @@ def version(quiet: bool=False, else: prnt() prnt('[red][i] Data locations:[/red] (not in a data directory)') - + prnt() + + from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, 
+    from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
+
+    data_dir_stat = Path(DATA_DIR).stat()
+    data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
+    data_owned_by_root = data_dir_uid == 0 or data_dir_gid == 0
+
+    data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
+    data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) and not IS_ROOT
+    data_not_writable = not (os.access(DATA_DIR, os.W_OK) and os.access(CONSTANTS.LIB_DIR, os.W_OK) and os.access(CONSTANTS.TMP_DIR, os.W_OK))
+    if data_owned_by_root:
+        prnt('[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], ArchiveBox will refuse to run![/yellow]')
+    elif data_owner_doesnt_match or data_not_writable:
+        prnt(f'[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
+    else:
+        prnt(f':information: [blue]DATA_DIR[/blue] is currently owned by [blue]{data_dir_uid}:{data_dir_gid}[/blue] (PUID:PGID)')
+
+    if data_owned_by_root or data_owner_doesnt_match or data_owned_by_default_user or data_not_writable:
+        prnt(f'[violet]Hint:[/violet] If you encounter permissions errors, change [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to match the user that will run ArchiveBox, e.g.:')
+        prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
+        prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}')
+        prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}')
+        prnt()
+        prnt('[blue]More info:[/blue]')
+        prnt(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
+        prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
+        prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
+        prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
 @enforce_types
@@ -948,23 +983,56 @@ def list_folders(links: List[Link],
 @enforce_types
 def install(out_dir: Path=DATA_DIR) -> None:
     """Automatically install all ArchiveBox dependencies and extras"""
+
+    # if running as root:
+    #     - run init to create index + lib dir
+    #     - chown -R 911 DATA_DIR
+    #     - install all binaries as root
+    #     - chown -R 911 LIB_DIR
+    # else:
+    #     - run init to create index + lib dir as current user
+    #     - install all binaries as current user
+    #     - recommend user re-run with sudo if any deps need to be installed as root
     from rich import print
     from django.conf import settings
+
+    from archivebox import CONSTANTS
+    from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
     if not ARCHIVE_DIR.exists():
-        run_subcommand('init', stdin=None, pwd=out_dir)
-
-    stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
+        run_subcommand('init', stdin=None, pwd=out_dir) # must init full index because we need a db to store InstalledBinary entries in
+    print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
+
+    # we never want the data dir to be owned by root; detect the existing owner of DATA_DIR to guess the desired non-root UID
+    if IS_ROOT:
+        # if we have sudo/root permissions, take advantage of them just while installing dependencies
+        print()
+        print('[yellow]:warning: Using [red]root[/red] privileges only to install dependencies that need it, all other operations should be done as a [blue]non-root[/blue] user.[/yellow]')
+        print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
+        print()
+
     for binary in reversed(list(settings.BINARIES.values())):
         providers = ' [grey53]or[/grey53] '.join(provider.name for provider in binary.binproviders_supported)
         print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
         try:
             print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+            if IS_ROOT:
+                with SudoPermission(uid=0):
+                    os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
         except Exception as e:
-            print(f'[X] Failed to install {binary.name}: {e}')
-
+            if IS_ROOT:
+                print(f'[yellow]:warning: Retrying {binary.name} installation with [red]sudo[/red]...[/yellow]')
+                with SudoPermission(uid=0):
+                    try:
+                        print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
+                        os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
+                    except Exception as e:
+                        print(f'[red]:cross_mark: Failed to install {binary.name} as root: {e}[/red]')
+            else:
+                print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
+
     from django.contrib.auth import get_user_model
     User = get_user_model()
@@ -974,12 +1042,13 @@ def install(out_dir: Path=DATA_DIR) -> None:
         stderr(' archivebox manage createsuperuser')
     # run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
-    stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
+    print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
     from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
     run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version'], capture_output=False, cwd=out_dir)
+
 # backwards-compatibility:
 setup = install
@@ -1100,6 +1169,7 @@ def schedule(add: bool=False,
     check_data_folder()
     from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
+    from archivebox.config.permissions import USER
     Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -1156,7 +1226,7 @@ def schedule(add: bool=False,
         existing_jobs = list(cron.find_comment(CRON_COMMENT))
         print()
-        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
+        print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
         print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
         if total_runs > 60 and not quiet:
             stderr()
@@ -1170,7 +1240,7 @@ def schedule(add: bool=False,
         if existing_jobs:
             print('\n'.join(str(cmd) for cmd in existing_jobs))
         else:
-            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
+            stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
             stderr(' To schedule a new job, run:')
             stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
         raise SystemExit(0)
@@ -1294,7 +1364,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
     check_data_folder()
     from django.core.management import execute_from_command_line
-    if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
+    if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
         stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
         stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
         stderr('')
diff --git a/archivebox/misc/checks.py b/archivebox/misc/checks.py
index 5e324cbb..bee8dcb2 100644
--- a/archivebox/misc/checks.py
+++ b/archivebox/misc/checks.py
@@ -1,37 +1,44 @@
 __package__ = 'archivebox.misc'
-from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
+import sys
+from rich import print
-from .logging import stderr
+# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
+# this file is imported by archivebox/__init__.py
+# and any imports here will be imported by EVERYTHING else
+# so this file should only be used for pure python checks
+# that don't need to import other parts of ArchiveBox
 def check_data_folder() -> None:
-
+    from archivebox import DATA_DIR, ARCHIVE_DIR
+
     archive_dir_exists = ARCHIVE_DIR.exists()
     if not archive_dir_exists:
-        stderr('[X] No archivebox index found in the current directory.', color='red')
-        stderr(f' {DATA_DIR}', color='lightyellow')
-        stderr()
-        stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**SHELL_CONFIG.ANSI))
-        stderr(' cd path/to/your/archive/folder')
-        stderr(' archivebox [command]')
-        stderr()
-        stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**SHELL_CONFIG.ANSI))
-        stderr(' archivebox init')
+        print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
+        print(f' {DATA_DIR}', file=sys.stderr)
+        print(file=sys.stderr)
+        print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
+        print(' cd path/to/your/archive/folder', file=sys.stderr)
+        print(' archivebox [command]', file=sys.stderr)
+        print(file=sys.stderr)
+        print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
+        print(' archivebox init', file=sys.stderr)
         raise SystemExit(2)
-
-
+
+
 def check_migrations():
+    from archivebox import DATA_DIR, CONSTANTS
     from ..index.sql import list_migrations
     pending_migrations = [name for status, name in list_migrations() if not status]
     if pending_migrations:
-        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
-        stderr(f' {DATA_DIR}')
-        stderr()
-        stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
-        stderr(' archivebox init')
+        print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
+        print(f' {DATA_DIR}', file=sys.stderr)
+        print(file=sys.stderr)
+        print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
+        print(' archivebox init', file=sys.stderr)
         raise SystemExit(3)
     CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
@@ -39,3 +46,39 @@ def check_migrations():
     # CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
     (CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
     (CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
+
+
+def check_io_encoding():
+    PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
+
+    if PYTHON_ENCODING != 'UTF-8':
+        print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
+        print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
+        print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
+        print('')
+        print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
+        print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
+        raise SystemExit(2)
+
+
+def check_not_root():
+    from archivebox.config.permissions import IS_ROOT, IN_DOCKER
+
+    attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
+    is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv[:2]
+    is_getting_version = '--version' in sys.argv or 'version' in sys.argv[:2]
+    is_installing = 'setup' in sys.argv[:2] or 'install' in sys.argv[:2]
+
+    if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
+        print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
+        print(' For more information, see the security overview documentation:', file=sys.stderr)
+        print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
+
+        if IN_DOCKER:
+            print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
+            print(f' docker compose run archivebox {attempted_command}', file=sys.stderr)
+            print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
+            print(' or:', file=sys.stderr)
+            print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+            print(f' docker exec -it --user=archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
+        raise SystemExit(2)
diff --git a/archivebox/misc/logging.py b/archivebox/misc/logging.py
index 44789cda..86983176 100644
--- a/archivebox/misc/logging.py
+++ b/archivebox/misc/logging.py
@@ -13,6 +13,7 @@ from rich.highlighter import Highlighter
 # SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
 CONSOLE = Console()
+STDERR = Console(stderr=True)
 IS_TTY = CONSOLE.is_interactive
@@ -51,7 +52,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
     '37': [(255, 255, 255), (255, 255, 255)],
 })
-# Logging Helpers
+# Logging Helpers (DEPRECATED, use rich.print instead going forward)
 def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
     ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
diff --git a/archivebox/misc/system.py b/archivebox/misc/system.py
index 4ae24d7e..f6814f8f 100644
--- a/archivebox/misc/system.py
+++ b/archivebox/misc/system.py
@@ -4,7 +4,6 @@ __package__ = 'archivebox.misc'
 import os
 import signal
 import shutil
-import getpass
 from json import dump
 from pathlib import Path
@@ -14,7 +13,7 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
 from crontab import CronTab
 from atomicwrites import atomic_write as lib_atomic_write
-from archivebox.config import STORAGE_CONFIG
+from archivebox.config.common import STORAGE_CONFIG
 from archivebox.misc.util import enforce_types, ExtendedEncoder
diff --git a/archivebox/misc/util.py b/archivebox/misc/util.py
index eaf0bd75..a856fe64 100644
--- a/archivebox/misc/util.py
+++ b/archivebox/misc/util.py
@@ -1,4 +1,4 @@
-__package__ = 'archivebox'
+__package__ = 'archivebox.misc'
 import re
 import requests
@@ -25,10 +25,10 @@ except ImportError:
     detect_encoding = lambda rawdata: "utf-8"
-from archivebox.config.constants import STATICFILE_EXTENSIONS
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG
-from .misc.logging import COLOR_DICT
+from .logging import COLOR_DICT
 ### Parsing Helpers
@@ -120,7 +120,7 @@ def find_all_urls(urls_str: str):
 def is_static_file(url: str):
     # TODO: the proper way is with MIME type detection + ext, not only extension
-    return extension(url).lower() in STATICFILE_EXTENSIONS
+    return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
 def enforce_types(func):
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index e89bf155..1abcd1d4 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -13,7 +13,8 @@ from typing import IO, Tuple, List, Optional
 from datetime import datetime, timezone
 from pathlib import Path
-from archivebox.config import DATA_DIR, CONSTANTS, SHELL_CONFIG, ARCHIVING_CONFIG
+from archivebox.config import DATA_DIR, CONSTANTS
+from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG
 from archivebox.misc.system import atomic_write
 from archivebox.misc.logging import stderr, hint
 from archivebox.misc.util import (
diff --git a/archivebox/plugins_extractor/chrome/apps.py b/archivebox/plugins_extractor/chrome/apps.py
index 1222a1b2..fee4762c 100644
--- a/archivebox/plugins_extractor/chrome/apps.py
+++ b/archivebox/plugins_extractor/chrome/apps.py
@@ -25,7 +25,8 @@ from abx.archivebox.base_binary import BaseBinary, env
 from abx.archivebox.base_hook import BaseHook
 # Depends on Other Plugins:
-from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
 from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
 from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
diff --git a/archivebox/plugins_extractor/curl/apps.py b/archivebox/plugins_extractor/curl/apps.py
index cab683b5..c496611b 100644
--- a/archivebox/plugins_extractor/curl/apps.py
+++ b/archivebox/plugins_extractor/curl/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 # from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
 from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
diff --git a/archivebox/plugins_extractor/git/apps.py b/archivebox/plugins_extractor/git/apps.py
index ff7146b2..ebdc9e9f 100644
--- a/archivebox/plugins_extractor/git/apps.py
+++ b/archivebox/plugins_extractor/git/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 class GitConfig(BaseConfigSet):
diff --git a/archivebox/plugins_extractor/mercury/apps.py b/archivebox/plugins_extractor/mercury/apps.py
index 78d505b2..58b8c249 100644
--- a/archivebox/plugins_extractor/mercury/apps.py
+++ b/archivebox/plugins_extractor/mercury/apps.py
@@ -5,14 +5,14 @@ from pathlib import Path
 from subprocess import run
 from pydantic import InstanceOf, Field
-from pydantic_pkgr import BinProvider, BinName, bin_abspath
+from pydantic_pkgr import BinProvider, BinName, BinProviderName, ProviderLookupDict, bin_abspath
 from abx.archivebox.base_plugin import BasePlugin, BaseHook
 from abx.archivebox.base_configset import BaseConfigSet
-from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
+from abx.archivebox.base_binary import BaseBinary, env
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
 from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 class MercuryConfig(BaseConfigSet):
diff --git a/archivebox/plugins_extractor/readability/apps.py b/archivebox/plugins_extractor/readability/apps.py
index c7a84009..c61efb21 100644
--- a/archivebox/plugins_extractor/readability/apps.py
+++ b/archivebox/plugins_extractor/readability/apps.py
@@ -16,7 +16,7 @@ from abx.archivebox.base_extractor import BaseExtractor
 from abx.archivebox.base_hook import BaseHook
 # Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 ###################### Config ##########################
diff --git a/archivebox/plugins_extractor/singlefile/apps.py b/archivebox/plugins_extractor/singlefile/apps.py
index e3535ded..8ebbc41c 100644
--- a/archivebox/plugins_extractor/singlefile/apps.py
+++ b/archivebox/plugins_extractor/singlefile/apps.py
@@ -1,11 +1,11 @@
 __package__ = 'archivebox.plugins_extractor.singlefile'
 from pathlib import Path
-from typing import List, Dict, Optional, ClassVar
+from typing import List, Dict, Optional
 # from typing_extensions import Self
 # Depends on other PyPI/vendor packages:
-from pydantic import InstanceOf, Field, validate_call
+from pydantic import InstanceOf, Field
 from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, bin_abspath, ShallowBinary
 # Depends on other Django apps:
@@ -17,7 +17,7 @@ from abx.archivebox.base_queue import BaseQueue
 from abx.archivebox.base_hook import BaseHook
 # Depends on Other Plugins:
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
 ###################### Config ##########################
diff --git a/archivebox/plugins_extractor/wget/apps.py b/archivebox/plugins_extractor/wget/apps.py
index 171bebc4..1e54376b 100644
--- a/archivebox/plugins_extractor/wget/apps.py
+++ b/archivebox/plugins_extractor/wget/apps.py
@@ -14,7 +14,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
-from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
 from .wget_util import wget_output_path
diff --git a/archivebox/plugins_extractor/ytdlp/apps.py b/archivebox/plugins_extractor/ytdlp/apps.py
index 21dfa0bc..2c935797 100644
--- a/archivebox/plugins_extractor/ytdlp/apps.py
+++ b/archivebox/plugins_extractor/ytdlp/apps.py
@@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
 from abx.archivebox.base_binary import BaseBinary, env, apt, brew
 from abx.archivebox.base_hook import BaseHook
-from archivebox.config import ARCHIVING_CONFIG
+from archivebox.config.common import ARCHIVING_CONFIG
 from plugins_pkg.pip.apps import pip
 ###################### Config ##########################
diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py
index f7a1b986..cc94a807 100644
--- a/archivebox/plugins_search/ripgrep/apps.py
+++ b/archivebox/plugins_search/ripgrep/apps.py
@@ -18,7 +18,8 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 # Depends on Other Plugins:
-from archivebox.config import CONSTANTS, SEARCH_BACKEND_CONFIG
+from archivebox.config import CONSTANTS
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 ###################### Config ##########################
diff --git a/archivebox/plugins_search/sonic/apps.py b/archivebox/plugins_search/sonic/apps.py
index efc47ceb..c7342853 100644
--- a/archivebox/plugins_search/sonic/apps.py
+++ b/archivebox/plugins_search/sonic/apps.py
@@ -15,7 +15,7 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 # Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 SONIC_LIB = None
 try:
diff --git a/archivebox/plugins_search/sqlite/apps.py b/archivebox/plugins_search/sqlite/apps.py
index 98db5363..9f34bfd8 100644
--- a/archivebox/plugins_search/sqlite/apps.py
+++ b/archivebox/plugins_search/sqlite/apps.py
@@ -17,7 +17,7 @@ from abx.archivebox.base_hook import BaseHook
 from abx.archivebox.base_searchbackend import BaseSearchBackend
 # Depends on Other Plugins:
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
diff --git a/archivebox/queues/supervisor_util.py b/archivebox/queues/supervisor_util.py
index 4e3d749b..035f1e40 100644
--- a/archivebox/queues/supervisor_util.py
+++ b/archivebox/queues/supervisor_util.py
@@ -1,5 +1,6 @@
 __package__ = 'archivebox.queues'
+import os
 import time
 import signal
 import psutil
@@ -12,6 +13,8 @@ from typing import Dict, cast
 from supervisor.xmlrpc import SupervisorTransport
 from xmlrpc.client import ServerProxy
+from archivebox.config.permissions import ARCHIVEBOX_USER
+
 from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, SOCK_FILE, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
 from typing import Iterator
@@ -42,6 +45,7 @@ childlogdir = {LOGS_DIR}
 directory = {DATA_DIR}
 strip_ansi = true
 nocleanup = true
+user = {ARCHIVEBOX_USER}
 [unix_http_server]
 file = {TMP_DIR}/{SOCK_FILE.name}
diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py
index 81ae87e7..f7394171 100644
--- a/archivebox/search/__init__.py
+++ b/archivebox/search/__init__.py
@@ -11,7 +11,7 @@ import abx.archivebox.use
 from archivebox.index.schema import Link
 from archivebox.misc.util import enforce_types
 from archivebox.misc.logging import stderr
-from archivebox.config import SEARCH_BACKEND_CONFIG
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
 def log_index_started(url):
diff --git a/bin/docker_entrypoint.sh b/bin/docker_entrypoint.sh
index 9dfacdc9..287a2702 100755
--- a/bin/docker_entrypoint.sh
+++ b/bin/docker_entrypoint.sh
@@ -110,12 +110,11 @@ if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
     chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
 fi
-# also chown tmp dir
-mkdir -p /tmp/archivebox
-chmod 777 /tmp
-chown $PUID:$PGID /tmp/archivebox
-mkdir -p /app/lib
-chown $PUID:$PGID /app/lib /app/lib/*
+# also chown tmp dir and lib dir
+mkdir -p "$SYSTEM_TMP_DIR"
+chown $PUID:$PGID "$SYSTEM_TMP_DIR"
+mkdir -p "$SYSTEM_LIB_DIR"
+chown $PUID:$PGID "$SYSTEM_LIB_DIR" "$SYSTEM_LIB_DIR"/*
 # (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
 export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"
diff --git a/pyproject.toml b/pyproject.toml
index 599e796e..1a7c60af 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [project]
 name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
 requires-python = ">=3.10"
 description = "Self-hosted internet archiving solution."
 authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
@@ -77,6 +77,7 @@ dependencies = [
     "atomicwrites==1.4.1",
     "django-taggit==1.3.0",
     "base32-crockford==0.3.0",
+    "platformdirs>=4.3.6",
     # "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
     # "pydantic-pkgr>=0.4.7",
     ############# Plugin Dependencies ################
@@ -133,7 +134,6 @@ dev-dependencies = [
     "django-autotyping>=0.5.1",
 ]
-
 [build-system]
 requires = ["pdm-backend"]
 build-backend = "pdm.backend"
diff --git a/tests/test_init.py b/tests/test_init.py
index 156d1907..e3e2c852 100644
--- a/tests/test_init.py
+++ b/tests/test_init.py
@@ -7,11 +7,11 @@ from pathlib import Path
 import json, shutil
 import sqlite3
-from archivebox.config import OUTPUT_PERMISSIONS
+from archivebox.config.common import STORAGE_CONFIG
 from .fixtures import *
-DIR_PERMISSIONS = OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
+DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
 def test_init(tmp_path, process):
     assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
@@ -57,7 +57,7 @@ def test_correct_permissions_output_folder(tmp_path, process):
     index_files = ['index.sqlite3', 'archive']
     for file in index_files:
         file_path = tmp_path / file
-        assert oct(file_path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+        assert oct(file_path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
 def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
@@ -65,7 +65,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
                 env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
     for path in archived_item_path.iterdir():
-        assert oct(path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
+        assert oct(path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
 def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
     os.chdir(tmp_path)
diff --git a/uv.lock b/uv.lock
index ffb1c7ce..bce8f7ad 100644
--- a/uv.lock
+++ b/uv.lock
@@ -41,7 +41,7 @@ wheels = [
 [[package]]
 name = "archivebox"
-version = "0.8.5rc2"
+version = "0.8.5rc3"
 source = { editable = "." }
 dependencies = [
     { name = "atomicwrites" },