improve config loading of TMP_DIR, LIB_DIR, move to separate files

Nick Sweeting 2024-10-07 23:45:11 -07:00
parent 7a895d9285
commit cf1ea8f80f
49 changed files with 767 additions and 527 deletions

View file

@@ -287,22 +287,12 @@ WORKDIR "$DATA_DIR"
RUN openssl rand -hex 16 > /etc/machine-id \
&& chown -R "$DEFAULT_PUID:$DEFAULT_PGID" "/tmp"
ENV IN_DOCKER=True \
SYSTEM_LIB_DIR=/app/lib \
SYSTEM_TMP_DIR=/tmp \
SYSTEM_LIB_DIR=/usr/share/archivebox \
SYSTEM_TMP_DIR=/tmp/archivebox \
GOOGLE_API_KEY=no \
GOOGLE_DEFAULT_CLIENT_ID=no \
GOOGLE_DEFAULT_CLIENT_SECRET=no \
ALLOWED_HOSTS=*
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# WGET_BINARY="wget" \
# YOUTUBEDL_BINARY="yt-dlp" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
# USE_SINGLEFILE=True \
# SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
# USE_READABILITY=True \
# READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
# USE_MERCURY=True \
# MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
# Print version for nice docker finish summary
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
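For reference, a hedged sketch of how these SYSTEM_* variables are consumed on the Python side (mirroring the fallbacks added in archivebox/config/paths.py later in this commit; stdlib only, the fallback values are illustrative):

import os
import tempfile
from pathlib import Path

# env override wins, otherwise fall back to a per-app subdir of the OS tmp dir
SYSTEM_TMP_DIR = Path(os.environ.get('SYSTEM_TMP_DIR', os.path.join(tempfile.gettempdir(), 'archivebox')))
# unset means the lib dir gets autodetected via platformdirs instead
SYSTEM_LIB_DIR = os.environ.get('SYSTEM_LIB_DIR')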

View file

@@ -13,7 +13,7 @@ __package__ = 'archivebox'
import os
import sys
import tempfile
from pathlib import Path
ASCII_LOGO = """
@@ -25,37 +25,36 @@ ASCII_LOGO = """
"""
SYSTEM_TMP_DIR = Path(tempfile.gettempdir()) / 'archivebox'
SYSTEM_TMP_DIR.mkdir(parents=True, exist_ok=True)
os.environ['SYSTEM_TMP_DIR'] = str(SYSTEM_TMP_DIR)
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# detect ArchiveBox user's UID/GID based on data dir ownership
from archivebox.config.permissions import drop_privileges # noqa
drop_privileges()
# if we are outside a data dir, cd into an ephemeral tmp dir so that
# we can run version/help without polluting cwd with an index.sqlite3
if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
current_dir = Path(os.getcwd()).resolve()
if not (current_dir / 'index.sqlite3').exists():
os.chdir(SYSTEM_TMP_DIR)
from archivebox.misc.checks import check_not_root, check_io_encoding # noqa
check_not_root()
check_io_encoding()
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them through INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR))
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# print('INSTALLING MONKEY PATCHES')
from .monkey_patches import * # noqa
from archivebox.monkey_patches import * # noqa
# print('DONE INSTALLING MONKEY PATCHES')
# print('LOADING VENDORED LIBRARIES')
from .vendor import load_vendored_libs # noqa
from archivebox.vendor import load_vendored_libs # noqa
load_vendored_libs()
# print('DONE LOADING VENDORED LIBRARIES')
from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa
from archivebox.config.constants import CONSTANTS # noqa
from archivebox.config.paths import PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from archivebox.config.version import VERSION # noqa
__version__ = VERSION
__author__ = 'Nick Sweeting'

View file

@@ -12,12 +12,13 @@ from ninja import NinjaAPI, Swagger
# TODO: explore adding https://eadwincode.github.io/django-ninja-extra/
from archivebox.config import SHELL_CONFIG, VERSION
from archivebox.config import VERSION
from archivebox.config.version import get_COMMIT_HASH
from api.auth import API_AUTH_METHODS
COMMIT_HASH = SHELL_CONFIG.COMMIT_HASH or 'unknown'
COMMIT_HASH = get_COMMIT_HASH() or 'unknown'
html_description=f'''
<h3>Welcome to your ArchiveBox server's REST API <code>[v1 ALPHA]</code> homepage!</h3>

View file

@@ -13,7 +13,7 @@ from ..main import (
schedule,
)
from archivebox.misc.util import ansi_to_html
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from .auth import API_AUTH_METHODS

View file

@@ -1,6 +1,7 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
import os
import sys
import argparse
import threading
@@ -25,6 +26,10 @@ if len(sys.argv) > 1 and sys.argv[1] == 'setup':
print(':warning: [bold red]DEPRECATED[/bold red] `archivebox setup` is deprecated, use `archivebox install` instead')
sys.argv[1] = 'install'
if '--debug' in sys.argv:
os.environ['DEBUG'] = 'True'
sys.argv.remove('--debug')
# def list_subcommands() -> Dict[str, str]:
# """find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
@@ -50,8 +55,8 @@ SUBCOMMAND_MODULES = {
'init': 'archivebox_init',
'install': 'archivebox_install',
##############################################
'config': 'archivebox_config',
'add': 'archivebox_add',
'remove': 'archivebox_remove',
'update': 'archivebox_update',
@@ -63,7 +68,7 @@ SUBCOMMAND_MODULES = {
'shell': 'archivebox_shell',
'manage': 'archivebox_manage',
'oneshot': 'archivebox_oneshot',
# 'oneshot': 'archivebox_oneshot',
}
# every imported command module must have these properties in order to be valid
@@ -102,11 +107,11 @@ CLI_SUBCOMMANDS = LazySubcommands()
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version') # dont require valid data folder at all
main_cmds = ('init', 'config', 'setup', 'install') # dont require existing db present
archive_cmds = ('add', 'remove', 'update', 'list', 'status') # require existing db present
setup_cmds = ('init', 'setup', 'install') # require valid data folder, but dont require DB present in it yet
archive_cmds = ('add', 'remove', 'update', 'list', 'status', 'schedule', 'server', 'shell', 'manage') # require valid data folder + existing db present
fake_db = ("oneshot",) # use fake in-memory db
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
display_first = (*meta_cmds, *setup_cmds, *archive_cmds)
IGNORED_BG_THREADS = ('MainThread', 'ThreadPoolExecutor', 'IPythonHistorySavingThread', 'Scheduler') # threads we dont have to wait for before exiting
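As a hedged illustration of how an ignore list like this is typically consumed at shutdown (wait_for_bg_threads is a hypothetical helper, not part of this commit):

import threading

def wait_for_bg_threads(ignored=IGNORED_BG_THREADS):
    # join every background thread that isn't on the ignore list before exiting
    for thread in threading.enumerate():
        if thread is threading.main_thread():
            continue
        if any(name in thread.name for name in ignored):
            continue
        thread.join(timeout=5)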
@@ -157,14 +162,16 @@ def run_subcommand(subcommand: str,
from archivebox.config.legacy import setup_django
# print('DATA_DIR is', DATA_DIR)
# print('pwd is', os.getcwd())
# print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
check_db = cmd_requires_db and not init_pending
if subcommand not in meta_cmds:
setup_django(in_memory_db=subcommand in fake_db, check_db=check_db)
if subcommand in archive_cmds:
if cmd_requires_db:
check_migrations()

View file

@@ -9,7 +9,8 @@ import argparse
from typing import List, Optional, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, ARCHIVING_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import ARCHIVING_CONFIG
from ..main import add
from ..parsers import PARSERS

View file

@@ -9,7 +9,8 @@ from pathlib import Path
from typing import Optional, List, IO
from archivebox.misc.util import docstring
from archivebox.config import DATA_DIR, SERVER_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from ..logging_util import SmartFormatter, reject_stdin
from ..main import server

View file

@@ -1,27 +1,9 @@
__package__ = 'archivebox.config'
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR, VERSION
from .defaults import (
SHELL_CONFIG,
STORAGE_CONFIG,
GENERAL_CONFIG,
SERVER_CONFIG,
ARCHIVING_CONFIG,
SEARCH_BACKEND_CONFIG,
from .paths import (
PACKAGE_DIR, # noqa
DATA_DIR, # noqa
ARCHIVE_DIR, # noqa
)
__all__ = [
'CONSTANTS',
'PACKAGE_DIR',
'DATA_DIR',
'ARCHIVE_DIR',
'VERSION',
'SHELL_CONFIG',
'STORAGE_CONFIG',
'GENERAL_CONFIG',
'SERVER_CONFIG',
'ARCHIVING_CONFIG',
'SEARCH_BACKEND_CONFIG',
'CONSTANTS_CONFIG',
]
from .constants import CONSTANTS, CONSTANTS_CONFIG # noqa
from .version import VERSION # noqa

View file

@@ -8,7 +8,7 @@ from abx.archivebox.base_hook import BaseHook
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .defaults import (
from .common import (
ShellConfig, # noqa: F401
StorageConfig, # noqa: F401
GeneralConfig, # noqa: F401

View file

@@ -1,47 +0,0 @@
# def get_versions_available_on_github(config):
# """
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """
# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
# return None
# all_releases = response.json()
# installed_version = parse_version_string(config['VERSION'])
# # find current version or nearest older version (to link to)
# current_version = None
# for idx, release in enumerate(all_releases):
# release_version = parse_version_string(release['tag_name'])
# if release_version <= installed_version:
# current_version = release
# break
# current_version = current_version or all_releases[-1]
# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:
# recommended_version = all_releases[idx+1]
# except IndexError:
# recommended_version = None
# return {'recommended_version': recommended_version, 'current_version': current_version}
# def can_upgrade(config):
# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
# return recommended_version > current_version
# return False

View file

@@ -1,21 +1,21 @@
__package__ = 'archivebox.config'
import os
import sys
import shutil
from typing import Dict, Optional
from datetime import datetime
from pathlib import Path
from rich import print
from pydantic import Field, field_validator, model_validator, computed_field
from pydantic import Field, field_validator, computed_field
from django.utils.crypto import get_random_string
from abx.archivebox.base_configset import BaseConfigSet
from .constants import CONSTANTS, PACKAGE_DIR
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME
from .permissions import IN_DOCKER
###################### Config ##########################
@@ -27,14 +27,8 @@ class ShellConfig(BaseConfigSet):
USE_COLOR: bool = Field(default=lambda c: c.IS_TTY)
SHOW_PROGRESS: bool = Field(default=lambda c: c.IS_TTY)
IN_DOCKER: bool = Field(default=False)
IN_DOCKER: bool = Field(default=IN_DOCKER)
IN_QEMU: bool = Field(default=False)
USER: str = Field(default=Path('~').expanduser().resolve().name)
PUID: int = Field(default=os.getuid())
PGID: int = Field(default=os.getgid())
PYTHON_ENCODING: str = Field(default=(sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8'))
ANSI: Dict[str, str] = Field(default=lambda c: CONSTANTS.DEFAULT_CLI_COLORS if c.USE_COLOR else CONSTANTS.DISABLED_CLI_COLORS)
@@ -52,63 +46,12 @@ class ShellConfig(BaseConfigSet):
@computed_field
@property
def COMMIT_HASH(self) -> Optional[str]:
try:
git_dir = PACKAGE_DIR / '../.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
return None
return get_COMMIT_HASH()
@computed_field
@property
def BUILD_TIME(self) -> str:
if self.IN_DOCKER:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
@model_validator(mode='after')
def validate_not_running_as_root(self):
attempted_command = ' '.join(sys.argv[:3])
if self.PUID == 0 and attempted_command not in ('setup', 'install'):
# stderr('[!] ArchiveBox should never be run as root!', color='red')
# stderr(' For more information, see the security overview documentation:')
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
print(' For more information, see the security overview documentation:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
if self.IN_DOCKER:
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
print(' or:', file=sys.stderr)
print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
raise SystemExit(2)
# check python locale
if self.PYTHON_ENCODING != 'UTF-8':
print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {self.PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
raise SystemExit(2)
return self
return get_BUILD_TIME()
SHELL_CONFIG = ShellConfig()

View file

@@ -1,115 +0,0 @@
from pathlib import Path
from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any, List
from mypy_extensions import TypedDict
from benedict import benedict
SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
class BaseConfig(TypedDict):
pass
class ConfigDict(BaseConfig, benedict, total=False):
"""
# Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
print('class ConfigDict(BaseConfig, total=False):')
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
for section, configs in CONFIG_DEFAULTS.items():
for key, attrs in configs.items():
Type, default = attrs['type'], attrs['default']
if default is None:
print(f' {key}: Optional[{Type.__name__}]')
else:
print(f' {key}: {Type.__name__}')
print()
"""
IS_TTY: bool
USE_COLOR: bool
SHOW_PROGRESS: bool
IN_DOCKER: bool
PACKAGE_DIR: Path
CONFIG_FILE: Path
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
OUTPUT_PERMISSIONS: str
RESTRICT_FILE_NAMES: str
URL_DENYLIST: str
SECRET_KEY: Optional[str]
BIND_ADDR: str
ALLOWED_HOSTS: str
DEBUG: bool
PUBLIC_INDEX: bool
PUBLIC_SNAPSHOTS: bool
FOOTER_INFO: str
SAVE_TITLE: bool
SAVE_FAVICON: bool
SAVE_WGET: bool
SAVE_WGET_REQUISITES: bool
SAVE_SINGLEFILE: bool
SAVE_READABILITY: bool
SAVE_MERCURY: bool
SAVE_PDF: bool
SAVE_SCREENSHOT: bool
SAVE_DOM: bool
SAVE_WARC: bool
SAVE_GIT: bool
SAVE_MEDIA: bool
SAVE_ARCHIVE_DOT_ORG: bool
RESOLUTION: str
GIT_DOMAINS: str
CHECK_SSL_VALIDITY: bool
CURL_USER_AGENT: str
WGET_USER_AGENT: str
CHROME_USER_AGENT: str
COOKIES_FILE: Union[str, Path, None]
CHROME_USER_DATA_DIR: Union[str, Path, None]
CHROME_TIMEOUT: int
CHROME_HEADLESS: bool
CHROME_SANDBOX: bool
USE_CURL: bool
USE_WGET: bool
USE_SINGLEFILE: bool
USE_READABILITY: bool
USE_MERCURY: bool
USE_GIT: bool
USE_CHROME: bool
USE_YOUTUBEDL: bool
CURL_BINARY: str
GIT_BINARY: str
WGET_BINARY: str
SINGLEFILE_BINARY: str
READABILITY_BINARY: str
MERCURY_BINARY: str
YOUTUBEDL_BINARY: str
CHROME_BINARY: Optional[str]
YOUTUBEDL_ARGS: List[str]
WGET_ARGS: List[str]
CURL_ARGS: List[str]
GIT_ARGS: List[str]
TAG_SEPARATOR_PATTERN: str
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
ConfigDefault = TypedDict('ConfigDefault', {
'default': ConfigDefaultValue,
'type': Optional[Type],
'aliases': Optional[Tuple[str, ...]],
}, total=False)
ConfigDefaultDict = Dict[str, ConfigDefault]

View file

@@ -1,118 +1,115 @@
__package__ = 'archivebox.config'
import os
import re
import platform
import tempfile
from typing import Dict
from pathlib import Path
import importlib.metadata
from collections.abc import Mapping
from benedict import benedict
from ..misc.logging import DEFAULT_CLI_COLORS
from .paths import (
PACKAGE_DIR,
DATA_DIR,
ARCHIVE_DIR,
get_collection_id,
get_LIB_DIR,
get_TMP_DIR,
)
from .permissions import (
IS_ROOT,
IN_DOCKER,
RUNNING_AS_UID,
RUNNING_AS_GID,
DEFAULT_PUID,
DEFAULT_PGID,
ARCHIVEBOX_USER,
ARCHIVEBOX_GROUP,
)
from .version import detect_installed_version
###################### Config ##########################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
def _detect_installed_version(PACKAGE_DIR: Path):
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
try:
# if in production install, use pip-installed package metadata
return importlib.metadata.version(__package__ or 'archivebox').strip()
except importlib.metadata.PackageNotFoundError:
pass
try:
# if in dev Git repo dir, use pyproject.toml file
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
for line in pyproject_config:
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"').strip()
except FileNotFoundError:
# building docs, pyproject.toml is not available
pass
# raise Exception('Failed to detect installed archivebox version!')
return 'dev'
VERSION: str = _detect_installed_version(PACKAGE_DIR)
class ConstantsDict(Mapping):
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
OS = platform.system().lower() # darwin, linux, etc.
ARCH = platform.machine().lower() # arm64, x86_64, etc.
LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')
PACKAGE_DIR: Path = PACKAGE_DIR # archivebox source code dir
DATA_DIR: Path = DATA_DIR # archivebox user data dir
ARCHIVE_DIR: Path = ARCHIVE_DIR # archivebox snapshot data dir
VERSION: str = VERSION
PACKAGE_DIR: Path = PACKAGE_DIR
DATA_DIR: Path = DATA_DIR
ARCHIVE_DIR: Path = ARCHIVE_DIR
COLLECTION_ID: str = get_collection_id(DATA_DIR)
# Host system
VERSION: str = detect_installed_version(PACKAGE_DIR)
OS: str = platform.system().lower() # darwin, linux, etc.
ARCH: str = platform.machine().lower() # arm64, x86_64, aarch64, etc.
IN_DOCKER: bool = IN_DOCKER
# Permissions
IS_ROOT: bool = IS_ROOT
ARCHIVEBOX_USER: int = ARCHIVEBOX_USER
ARCHIVEBOX_GROUP: int = ARCHIVEBOX_GROUP
RUNNING_AS_UID: int = RUNNING_AS_UID
RUNNING_AS_GID: int = RUNNING_AS_GID
DEFAULT_PUID: int = DEFAULT_PUID
DEFAULT_PGID: int = DEFAULT_PGID
# Source code dirs
PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
TEMPLATES_DIR_NAME: str = 'templates'
TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
STATIC_DIR: Path = TEMPLATES_DIR / 'static'
STATIC_DIR_NAME: str = 'static'
STATIC_DIR: Path = TEMPLATES_DIR / STATIC_DIR_NAME
# Data dirs
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
ARCHIVE_DIR_NAME: str = 'archive'
SOURCES_DIR_NAME: str = 'sources'
PERSONAS_DIR_NAME: str = 'personas'
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
LIB_DIR_NAME: str = 'lib'
TMP_DIR_NAME: str = 'tmp'
SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
# DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16] # cant be used because of socket path length restrictions, breaks too often if data dir is in some deep subdir: socket.error reported AF_UNIX path too long
SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR # e.g. /app/lib or ./data/lib/arm64-darwin-docker
TMP_DIR: Path = SYSTEM_TMP_DIR
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
# Runtime dirs
TMP_DIR_NAME: str = 'tmp'
TMP_DIR: Path = get_TMP_DIR()
LIB_DIR_NAME: str = 'lib'
LIB_DIR: Path = get_LIB_DIR()
LIB_PIP_DIR: Path = LIB_DIR / 'pip'
LIB_NPM_DIR: Path = LIB_DIR / 'npm'
LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
LIB_BIN_DIR: Path = LIB_DIR / 'bin'
BIN_DIR: Path = LIB_BIN_DIR
CONFIG_FILENAME: str = 'ArchiveBox.conf'
SQL_INDEX_FILENAME: str = 'index.sqlite3'
QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'
# Config constants
TIMEZONE: str = 'UTC'
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME
JSON_INDEX_FILENAME: str = 'index.json'
HTML_INDEX_FILENAME: str = 'index.html'
ROBOTS_TXT_FILENAME: str = 'robots.txt'
FAVICON_FILENAME: str = 'favicon.ico'
TIMEZONE: str = 'UTC'
DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
# 99.999% of the time, URLs ending in these extensions are static files
@@ -136,17 +133,6 @@ class ConstantsDict(Mapping):
# html, htm, shtml, xhtml, xml, aspx, php, cgi
))
INGORED_PATHS: frozenset[str] = frozenset((
".git",
".svn",
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
"Dockerfile",
".ArchiveBox.conf.bak",
))
PIP_RELATED_NAMES: frozenset[str] = frozenset((
".venv",
"venv",
@@ -160,7 +146,15 @@ class ConstantsDict(Mapping):
"yarn.lock",
))
DATA_DIR_NAMES: frozenset[str] = frozenset((
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
### Dirs:
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@@ -171,9 +165,12 @@ class ConstantsDict(Mapping):
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
))
DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
DATA_FILE_NAMES: frozenset[str] = frozenset((
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",
".svn",
### Files:
CONFIG_FILENAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
@@ -188,43 +185,37 @@ class ConstantsDict(Mapping):
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
f".{CONFIG_FILENAME}.bak",
"static_index.json",
))
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*INGORED_PATHS,
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
*DATA_DIR_NAMES,
*DATA_FILE_NAMES,
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount
".DS_Store",
".gitignore",
"lost+found",
".DS_Store",
".env",
".collection_id",
"Dockerfile",
))
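# Hedged sketch of the is-empty check described above (dir_is_effectively_empty
# is hypothetical, not part of this commit; Path is imported at the top of this file):
def dir_is_effectively_empty(path: Path, allowed: frozenset = frozenset()) -> bool:
    # a dir is safe to init into only if every existing entry is on the allowlist
    return all(entry.name in allowed for entry in path.iterdir())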
CODE_LOCATIONS = benedict({
'PACKAGE_DIR': {
'path': (PACKAGE_DIR).resolve(),
'enabled': True,
'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
'is_valid': (PACKAGE_DIR / '__main__.py').exists(), # read + list
},
'TEMPLATES_DIR': {
'path': TEMPLATES_DIR.resolve(),
'enabled': True,
'is_valid': STATIC_DIR.exists(),
'is_valid': STATIC_DIR.exists() and os.access(STATIC_DIR, os.R_OK) and os.access(STATIC_DIR, os.X_OK), # read + list
},
'LIB_DIR': {
'path': LIB_DIR.resolve(),
'enabled': True,
'is_valid': LIB_DIR.is_dir(),
'is_valid': LIB_DIR.is_dir() and os.access(LIB_DIR, os.R_OK) and os.access(LIB_DIR, os.X_OK) and os.access(LIB_DIR, os.W_OK), # read + write
},
'TMP_DIR': {
'path': TMP_DIR.resolve(),
'enabled': True,
'is_valid': TMP_DIR.is_dir(),
'is_valid': TMP_DIR.is_dir() and os.access(TMP_DIR, os.R_OK) and os.access(TMP_DIR, os.X_OK) and os.access(TMP_DIR, os.W_OK), # read + write
},
})
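The os.access() checks added above all follow one pattern; a hedged consolidation (dir_access_ok is hypothetical, not part of this commit):

import os
from pathlib import Path

def dir_access_ok(path: Path, write: bool = False) -> bool:
    # read + list access is always required; write only for dirs archivebox mutates
    flags = os.R_OK | os.X_OK | (os.W_OK if write else 0)
    return path.is_dir() and os.access(path, flags)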
@@ -232,61 +223,61 @@ class ConstantsDict(Mapping):
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_valid": DATABASE_FILE.exists() and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK) and os.access(DATA_DIR, os.X_OK),
"is_mount": os.path.ismount(DATA_DIR.resolve()),
},
"CONFIG_FILE": {
"path": CONFIG_FILE.resolve(),
"enabled": True,
"is_valid": CONFIG_FILE.exists(),
"is_valid": CONFIG_FILE.exists() and os.access(CONFIG_FILE, os.W_OK),
},
"SQL_INDEX": {
"path": DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),
"is_valid": DATABASE_FILE.exists() and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(DATABASE_FILE.resolve()),
},
"QUEUE_DATABASE": {
"path": QUEUE_DATABASE_FILE.resolve(),
"enabled": True,
"is_valid": QUEUE_DATABASE_FILE.exists(),
"is_valid": QUEUE_DATABASE_FILE.exists() and os.access(QUEUE_DATABASE_FILE, os.R_OK) and os.access(QUEUE_DATABASE_FILE, os.W_OK),
"is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
},
"ARCHIVE_DIR": {
"path": ARCHIVE_DIR.resolve(),
"enabled": True,
"is_valid": ARCHIVE_DIR.exists(),
"is_valid": ARCHIVE_DIR.exists() and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK) and os.access(ARCHIVE_DIR, os.X_OK),
"is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
},
"SOURCES_DIR": {
"path": SOURCES_DIR.resolve(),
"enabled": True,
"is_valid": SOURCES_DIR.exists(),
"is_valid": SOURCES_DIR.exists() and os.access(SOURCES_DIR, os.R_OK) and os.access(SOURCES_DIR, os.W_OK) and os.access(SOURCES_DIR, os.X_OK),
},
"LOGS_DIR": {
"path": LOGS_DIR.resolve(),
"enabled": True,
"is_valid": LOGS_DIR.is_dir(),
"is_valid": LOGS_DIR.is_dir() and os.access(LOGS_DIR, os.R_OK) and os.access(LOGS_DIR, os.W_OK) and os.access(LOGS_DIR, os.X_OK), # read + write
},
# "CACHE_DIR": {
# "path": CACHE_DIR.resolve(),
# "enabled": True,
# "is_valid": CACHE_DIR.is_dir(),
# "is_valid": CACHE_DIR.is_dir() and os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK) and os.access(CACHE_DIR, os.X_OK), # read + write
# },
"PERSONAS_DIR": {
"path": PERSONAS_DIR.resolve(),
"enabled": PERSONAS_DIR.exists(),
"is_valid": PERSONAS_DIR.is_dir(),
"is_valid": PERSONAS_DIR.is_dir() and os.access(PERSONAS_DIR, os.R_OK) and os.access(PERSONAS_DIR, os.W_OK) and os.access(PERSONAS_DIR, os.X_OK), # read + write
},
'CUSTOM_TEMPLATES_DIR': {
'path': CUSTOM_TEMPLATES_DIR.resolve(),
'enabled': CUSTOM_TEMPLATES_DIR.exists(),
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
'is_valid': CUSTOM_TEMPLATES_DIR.is_dir() and os.access(CUSTOM_TEMPLATES_DIR, os.R_OK) and os.access(CUSTOM_TEMPLATES_DIR, os.X_OK), # read
},
'USER_PLUGINS_DIR': {
'path': USER_PLUGINS_DIR.resolve(),
'enabled': USER_PLUGINS_DIR.exists(),
'is_valid': USER_PLUGINS_DIR.is_dir(),
'is_valid': USER_PLUGINS_DIR.is_dir() and os.access(USER_PLUGINS_DIR, os.R_OK) and os.access(USER_PLUGINS_DIR, os.X_OK), # read
},
})
@@ -314,5 +305,6 @@ globals().update(CONSTANTS)
# these need to always exist as we need them to run almost everything
# TODO: figure out a better time to make these than import-time
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)

View file

@@ -22,41 +22,34 @@ Documentation:
__package__ = 'archivebox.config'
import os
import io
import re
import sys
import json
import shutil
from hashlib import md5
from pathlib import Path
from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from typing import Optional, Type, Tuple, Dict, Any
from subprocess import run, DEVNULL
from configparser import ConfigParser
from rich.progress import Progress
from rich.console import Console
from benedict import benedict
from pydantic_pkgr import SemVer
import django
from django.db.backends.sqlite3.base import Database as sqlite3
from .constants import CONSTANTS, TIMEZONE
from .constants import CONSTANTS
from .constants import *
from .config_stubs import (
ConfigValue,
ConfigDefaultValue,
ConfigDefaultDict,
)
from ..misc.logging import (
stderr,
hint, # noqa
)
from .defaults import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from .common import SHELL_CONFIG, GENERAL_CONFIG, ARCHIVING_CONFIG, SERVER_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG
from archivebox.plugins_auth.ldap.apps import LDAP_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.wget.apps import WGET_CONFIG
@@ -67,7 +60,7 @@ LDAP = LDAP_CONFIG.LDAP_ENABLED
############################### Config Schema ##################################
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
CONFIG_SCHEMA: Dict[str, Dict[str, Any]] = {
'SHELL_CONFIG': SHELL_CONFIG.as_legacy_config_schema(),
'SERVER_CONFIG': SERVER_CONFIG.as_legacy_config_schema(),
@@ -194,7 +187,7 @@ def get_real_name(key: str) -> str:
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
DYNAMIC_CONFIG_SCHEMA: Dict[str, Any] = {
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)},
@@ -209,12 +202,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
def load_config_val(key: str,
default: ConfigDefaultValue=None,
default: Any=None,
type: Optional[Type]=None,
aliases: Optional[Tuple[str, ...]]=None,
config: Optional[benedict]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
config_file_vars: Optional[Dict[str, str]]=None) -> Any:
"""parse bool, int, and str key=value pairs from env"""
assert isinstance(config, dict)
@@ -372,7 +365,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=CONSTANTS.DATA
def load_config(defaults: ConfigDefaultDict,
def load_config(defaults: Dict[str, Any],
config: Optional[benedict]=None,
out_dir: Optional[str]=None,
env_vars: Optional[os._Environ]=None,
@@ -505,7 +498,7 @@ def load_all_config():
# add all final config values in CONFIG to globals in this file
CONFIG: benedict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...
# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")
@@ -521,8 +514,8 @@ globals().update(CONFIG)
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
assert TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {TIMEZONE})' # noqa: F821
os.environ["TZ"] = TIMEZONE # noqa: F821
assert CONSTANTS.TIMEZONE == 'UTC', f'The server timezone should always be set to UTC (got {CONSTANTS.TIMEZONE})' # noqa: F821
os.environ["TZ"] = CONSTANTS.TIMEZONE # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
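# Hedged worked check of the umask math above (the '755' value is an example):
# 0o777 - 0o755 == 0o022, the standard umask that yields rwxr-xr-x directories
assert 0o777 - int('755', base=8) == 0o022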
########################### Config Validity Checkers ###########################
@@ -533,7 +526,8 @@ if not SHELL_CONFIG.SHOW_PROGRESS:
os.environ['TERM'] = 'dumb'
# recreate rich console obj based on new config values
CONSOLE = Console()
STDOUT = CONSOLE = Console()
STDERR = Console(stderr=True)
from ..misc import logging
logging.CONSOLE = CONSOLE
@@ -541,11 +535,11 @@ logging.CONSOLE = CONSOLE
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
def bump_startup_progress_bar():
def bump_startup_progress_bar(advance=1):
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=advance) # type: ignore
def setup_django_minimal():
@@ -559,6 +553,8 @@ DJANGO_SET_UP = False
def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CONFIG, in_memory_db=False) -> None:
from rich.panel import Panel
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
global DJANGO_SET_UP
@@ -568,7 +564,7 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
# TODO: figure out why CLI entrypoints with init_pending are running this twice sometimes
return
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
with Progress(transient=True, expand=True, console=STDERR) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or CONSTANTS.DATA_DIR
@@ -595,7 +591,14 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
try:
django.setup()
except Exception as e:
bump_startup_progress_bar(advance=1000)
STDERR.print()
STDERR.print(Panel(f'\n[red]{e.__class__.__name__}[/red]: [yellow]{e}[/yellow]\nPlease check your config and [blue]DATA_DIR[/blue] permissions.\n', title='\n\n[red][X] Error while trying to load database!', subtitle='[grey53]NO WRITES CAN BE PERFORMED[/grey53]', expand=False, style='bold red'))
STDERR.print()
return
bump_startup_progress_bar()
@@ -608,6 +611,17 @@ def setup_django(out_dir: Path | None=None, check_db=False, config: benedict=CON
f.write(f"\n> {command}; TS={ts} VERSION={CONSTANTS.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# make sure the data dir is owned by a non-root user
if CONSTANTS.DATA_DIR.stat().st_uid == 0:
STDERR.print('[red][X] Error: ArchiveBox DATA_DIR cannot be owned by root![/red]')
STDERR.print(f' {CONSTANTS.DATA_DIR}')
STDERR.print()
STDERR.print('[violet]Hint:[/violet] Are you running archivebox in the right folder? (and as a non-root user?)')
STDERR.print(' cd path/to/your/archive/data')
STDERR.print(' archivebox [command]')
STDERR.print()
raise SystemExit(9)
# Create cache table in DB if needed
try:
from django.core.cache import cache

archivebox/config/paths.py (new file, 152 lines)
View file

@@ -0,0 +1,152 @@
__package__ = 'archivebox.config'
import os
import tempfile
import hashlib
from pathlib import Path
from functools import cache
from platformdirs import PlatformDirs
from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
#############################################################################################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
#############################################################################################
@cache
def get_collection_id(DATA_DIR=DATA_DIR):
"""Get a short, stable, unique ID for the current collection"""
collection_id_file = DATA_DIR / '.collection_id'
try:
return collection_id_file.read_text().strip()
except (OSError, FileNotFoundError, PermissionError):
pass
hash_key = str(DATA_DIR.resolve()).encode()
collection_id = hashlib.sha256(hash_key).hexdigest()[:8]
try:
collection_id_file.write_text(collection_id)
except (OSError, FileNotFoundError, PermissionError):
pass
return collection_id
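# Hedged worked example of the derivation above ('/data' is just an example path):
example_id = hashlib.sha256(str(Path('/data').resolve()).encode()).hexdigest()[:8]
assert len(example_id) == 8  # short, stable, and unique enough per data dir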
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True) -> bool:
"""Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)"""
current_uid, current_gid = os.geteuid(), os.getegid()
uid, gid = uid or current_uid, gid or current_gid
test_file = dir_path / '.permissions_test'
try:
with SudoPermission(uid=uid, fallback=fallback):
test_file.exists()
test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
test_file.unlink()
return True
except (IOError, OSError, PermissionError):
pass
return False
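# Hedged usage sketch of dir_is_writable ('archive' is an illustrative path):
# verify the detected archivebox user could write snapshots before archiving
if not dir_is_writable(Path('archive'), uid=ARCHIVEBOX_USER, fallback=True):
    print(f'archive/ is not writable by uid={ARCHIVEBOX_USER}')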
@cache
def get_LIB_DIR():
"""
- should be shared with other collections on the same host
- must be scoped by CPU architecture, OS family, and archivebox version
- should not be shared with other hosts/archivebox versions
- must be writable by any archivebox user
- should be persistent across reboots
- can be on a docker bind mount but probably shouldnt be
- ok to have a long path (doesnt contain SOCKETS)
"""
from .version import detect_installed_version
HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
if 'SYSTEM_LIB_DIR' in os.environ:
lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
else:
with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
lib_dir = HOST_DIRS.site_data_path
# Docker: /usr/local/share/archivebox/0.8.5
# Ubuntu: /usr/local/share/archivebox/0.8.5
# macOS: /Library/Application Support/archivebox
try:
with SudoPermission(uid=0, fallback=True):
lib_dir.mkdir(parents=True, exist_ok=True)
except PermissionError:
# our user cannot create the system-wide lib dir, fall back to the per-user data dir
lib_dir = HOST_DIRS.user_data_path
lib_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(lib_dir):
if IS_ROOT:
# make sure lib dir is owned by the archivebox user, not root
with SudoPermission(uid=0):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
else:
raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
return lib_dir
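# Hedged demo of the PlatformDirs lookup used above (the version string is an example):
dirs = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version='0.8.5')
print(dirs.site_data_path)   # e.g. /usr/local/share/archivebox/0.8.5 on Linux
print(dirs.user_data_path)   # per-user fallback when the system path isn't writable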
@cache
def get_TMP_DIR():
"""
- must NOT be inside DATA_DIR / inside a docker volume bind mount
- must NOT have a long PATH (UNIX socket path length restrictions)
- must NOT be shared with other collections/hosts
- must be writable by archivebox user & root
- must be cleared on every boot / not persisted
- must be cleared on every archivebox version upgrade
"""
from .version import detect_installed_version
HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
# print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
# print('RUNNING AS:', self.PUID, self.PGID)
if 'SYSTEM_TMP_DIR' in os.environ:
run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
with SudoPermission(uid=0, fallback=True):
run_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(run_dir):
if IS_ROOT:
with SudoPermission(uid=0, fallback=False):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
else:
raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
return run_dir
run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
try:
assert len(str(run_dir)) + len('/supervisord.sock') < 95
except AssertionError:
run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
with SudoPermission(uid=0, fallback=True):
run_dir.mkdir(parents=True, exist_ok=True)
if not dir_is_writable(run_dir):
if IS_ROOT:
with SudoPermission(uid=0):
os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
else:
raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
# Docker: /tmp/archivebox/0.8.5/abc324235
# Ubuntu: /tmp/archivebox/0.8.5/abc324235
# macOS: /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/0.8.5/abc324235
return run_dir
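A hedged check of the socket-path budget enforced above, using the example path from the comments (AF_UNIX socket paths must stay well under ~108 chars, hence the < 95 margin):

candidate = '/tmp/archivebox/0.8.5/abc324235'
assert len(candidate + '/supervisord.sock') < 95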

View file

@@ -0,0 +1,70 @@
__package__ = 'archivebox.config'
import os
from pathlib import Path
from contextlib import contextmanager
#############################################################################################
DATA_DIR = Path(os.getcwd())
DATA_DIR_STAT = Path(DATA_DIR).stat()
DATA_DIR_UID = DATA_DIR_STAT.st_uid
DATA_DIR_GID = DATA_DIR_STAT.st_gid
DEFAULT_PUID = 911
DEFAULT_PGID = 911
RUNNING_AS_UID = os.getuid()
RUNNING_AS_GID = os.getgid()
EUID = os.geteuid()
EGID = os.getegid()
USER: str = Path('~').expanduser().resolve().name
IS_ROOT = RUNNING_AS_UID == 0
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
os.environ.setdefault('PUID', str(DATA_DIR_UID or RUNNING_AS_UID or DEFAULT_PUID))
os.environ.setdefault('PGID', str(DATA_DIR_GID or RUNNING_AS_GID or DEFAULT_PGID))
ARCHIVEBOX_USER = int(os.environ['PUID'])
ARCHIVEBOX_GROUP = int(os.environ['PGID'])
#############################################################################################
def drop_privileges():
"""If running as root, drop privileges to the user that owns the data dir (or PUID, or default=911)"""
# always run archivebox as the user that owns the data dir, never as root
if os.getuid() == 0:
# drop permissions to the user that owns the data dir / provided PUID
if os.geteuid() != ARCHIVEBOX_USER:
os.seteuid(ARCHIVEBOX_USER)
# if we need sudo (e.g. for installing dependencies) code should use SudoPermissions() context manager to regain root
@contextmanager
def SudoPermission(uid=0, fallback=False):
"""Attempt to run code with sudo permissions for a given user (or root)"""
if os.geteuid() == uid:
# no need to change effective UID, we are already that user
yield
return
try:
# change our effective UID to the given UID
os.seteuid(uid)
except PermissionError as err:
if not fallback:
raise PermissionError(f'Not enough permissions to run code as uid={uid}, please retry with sudo') from err
try:
# yield back to the caller so they can run code inside context as root
yield
finally:
# then set effective UID back to DATA_DIR owner
DATA_DIR_OWNER = DATA_DIR.stat().st_uid
try:
os.seteuid(DATA_DIR_OWNER)
except PermissionError as err:
if not fallback:
raise PermissionError(f'Failed to revert uid={uid} back to {DATA_DIR_OWNER} after running code with sudo') from err
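A hedged usage sketch of the context manager above (the chown target is an example path): escalate to root just long enough to fix ownership, then revert to the data-dir owner automatically on exit:

with SudoPermission(uid=0, fallback=True):
    os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "/some/dir"')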

View file

@@ -0,0 +1,121 @@
__package__ = 'archivebox.config'
import os
import importlib.metadata
from pathlib import Path
from functools import cache
from datetime import datetime
from typing import Optional
#############################################################################################
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
#############################################################################################
@cache
def detect_installed_version(PACKAGE_DIR: Path=PACKAGE_DIR):
"""Autodetect the installed archivebox version by using pip package metadata, pyproject.toml file, or package.json file"""
try:
# if in production install, use pip-installed package metadata
return importlib.metadata.version('archivebox').strip()
except importlib.metadata.PackageNotFoundError:
pass
try:
# if in dev Git repo dir, use pyproject.toml file
pyproject_config = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().split('\n')
for line in pyproject_config:
if line.startswith('version = '):
return line.split(' = ', 1)[-1].strip('"').strip()
except FileNotFoundError:
# building docs, pyproject.toml is not available
pass
# raise Exception('Failed to detect installed archivebox version!')
return 'dev'
@cache
def get_COMMIT_HASH() -> Optional[str]:
try:
git_dir = PACKAGE_DIR / '../.git'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((PACKAGE_DIR / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
pass
return None
@cache
def get_BUILD_TIME() -> str:
if IN_DOCKER:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (PACKAGE_DIR / 'README.md').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
# def get_versions_available_on_github(config):
# """
# returns a dictionary containing the ArchiveBox GitHub release info for
# the recommended upgrade version and the currently installed version
# """
# # we only want to perform the (relatively expensive) check for new versions
# # when its most relevant, e.g. when the user runs a long-running command
# subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
# long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
# if subcommand_run_by_user not in long_running_commands:
# return None
# github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
# response = requests.get(github_releases_api)
# if response.status_code != 200:
# stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
# return None
# all_releases = response.json()
# installed_version = parse_version_string(config['VERSION'])
# # find current version or nearest older version (to link to)
# current_version = None
# for idx, release in enumerate(all_releases):
# release_version = parse_version_string(release['tag_name'])
# if release_version <= installed_version:
# current_version = release
# break
# current_version = current_version or all_releases[-1]
# # recommended version is whatever comes after current_version in the release list
# # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
# try:
# recommended_version = all_releases[idx+1]
# except IndexError:
# recommended_version = None
# return {'recommended_version': recommended_version, 'current_version': current_version}
# def can_upgrade(config):
# if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
# recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
# current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
# return recommended_version > current_version
# return False
VERSION: str = detect_installed_version()

View file

@@ -5,7 +5,7 @@ from django.utils import timezone
from django.contrib.auth.middleware import RemoteUserMiddleware
from django.core.exceptions import ImproperlyConfigured
from archivebox.config import SERVER_CONFIG
from archivebox.config.common import SERVER_CONFIG
def detect_timezone(request, activate: bool=True):

View file

@@ -13,7 +13,8 @@ import abx.archivebox
import abx.archivebox.use
import abx.django.use
from archivebox.config import VERSION, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG, SERVER_CONFIG # noqa
from archivebox.config import DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, CONSTANTS
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG # noqa
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
IS_TESTING = 'test' in sys.argv[:3] or 'PYTEST_CURRENT_TEST' in os.environ

View file

@@ -27,7 +27,8 @@ from core.admin import result_url
from queues.tasks import bg_add
from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
from archivebox.config import CONSTANTS_CONFIG, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG, SERVER_CONFIG
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from .serve_static import serve_static_with_byterange_support

View file

@@ -5,7 +5,8 @@ import io
from pathlib import Path
from typing import Optional
from archivebox.config import VERSION, ARCHIVING_CONFIG
from archivebox.config import VERSION
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.legacy import SAVE_HTMLTOTEXT
from archivebox.misc.system import atomic_write
from archivebox.misc.util import enforce_types, is_static_file

View file

@@ -12,9 +12,11 @@ from urllib.parse import urlparse
from django.db.models import QuerySet, Q
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
from archivebox.misc.logging import stderr
from archivebox.misc.util import scheme, enforce_types, ExtendedEncoder
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.config.legacy import URL_DENYLIST_PTN, URL_ALLOWLIST_PTN
from ..logging_util import (

View file

@@ -16,7 +16,9 @@ from archivebox.misc.util import (
htmlencode,
urldecode,
)
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG, SERVER_CONFIG
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.version import get_COMMIT_HASH
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG
from .schema import Link
@@ -56,7 +58,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
return render_django_template(template, {
'version': VERSION,
'git_sha': SHELL_CONFIG.COMMIT_HASH or VERSION,
'git_sha': get_COMMIT_HASH() or VERSION,
'num_links': str(len(links)),
'date_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d'),
'time_updated': datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M'),

View file

@@ -8,7 +8,8 @@ from pathlib import Path
from datetime import datetime, timezone
from typing import List, Optional, Iterator, Any, Union
from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL_CONFIG
from archivebox.config import VERSION, DATA_DIR, CONSTANTS
from archivebox.config.common import SERVER_CONFIG, SHELL_CONFIG
from .schema import Link
from archivebox.misc.system import atomic_write

View file

@@ -9,7 +9,8 @@ from django.db.models import QuerySet
from django.db import transaction
from archivebox.misc.util import enforce_types, parse_date
from archivebox.config import DATA_DIR, GENERAL_CONFIG
from archivebox.config import DATA_DIR
from archivebox.config.common import GENERAL_CONFIG
from .schema import Link

View file

@@ -22,7 +22,8 @@ from rich.panel import Panel
from rich_argparse import RichHelpFormatter
from django.core.management.base import DjangoHelpFormatter
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
from archivebox.config import CONSTANTS, DATA_DIR, VERSION
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr

View file

@@ -14,13 +14,15 @@ from crontab import CronTab, CronSlices
from django.db.models import QuerySet
from django.utils import timezone
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR, SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config import CONSTANTS, VERSION, DATA_DIR, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG, SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SERVER_CONFIG, ARCHIVING_CONFIG
from archivebox.config.permissions import SudoPermission, IN_DOCKER
from .cli import (
CLI_SUBCOMMANDS,
run_subcommand,
display_first,
meta_cmds,
main_cmds,
setup_cmds,
archive_cmds,
)
from .parsers import (
@@ -101,7 +103,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
for cmd, func in all_subcommands.items()
if cmd in main_cmds
if cmd in setup_cmds
) + '\n\n ' + '\n '.join(
f'[green]{cmd.ljust(20)}[/green] {func.__doc__}'
for cmd, func in all_subcommands.items()
@@ -119,10 +121,10 @@ def help(out_dir: Path=DATA_DIR) -> None:
[grey53]# using Docker:[/grey53]
[blue]docker run[/blue] -v [light_slate_blue]$PWD:/data[/light_slate_blue] [grey53]-p 8000:8000[/grey53] -it [dark_green]archivebox/archivebox[/dark_green] [green]\\[command][/green] [green3][...args][/green3] [violet][--help][/violet] [grey53][--version][/grey53]
''' if SHELL_CONFIG.IN_DOCKER else ''
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if SHELL_CONFIG.IN_DOCKER else ''
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if SHELL_CONFIG.IN_DOCKER else ''
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if SHELL_CONFIG.IN_DOCKER else ''
''' if IN_DOCKER else ''
DOCKER_DOCS = '\n [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Docker[/link]' if IN_DOCKER else ''
DOCKER_OUTSIDE_HINT = "\n [grey53]# outside of Docker:[/grey53]" if IN_DOCKER else ''
DOCKER_CMD_PREFIX = "[blue]docker ... [/blue]" if IN_DOCKER else ''
print(f'''{DOCKER_USAGE}
[deep_sky_blue4]Usage:[/deep_sky_blue4]{DOCKER_OUTSIDE_HINT}
@@ -158,7 +160,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
print(Panel(EXAMPLE_USAGE, expand=False, border_style='grey53', title='[green3]:white_check_mark: A collection [light_slate_blue]DATA DIR[/light_slate_blue] is currently active[/green3]', subtitle='Commands run inside this dir will only apply to this collection.'))
else:
DATA_SETUP_HELP = '\n'
if SHELL_CONFIG.IN_DOCKER:
if IN_DOCKER:
DATA_SETUP_HELP += '[violet]Hint:[/violet] When using Docker, you need to mount a volume to use as your data dir:\n'
DATA_SETUP_HELP += ' docker run [violet]-v /some/path/data:/data[/violet] archivebox/archivebox ...\n\n'
DATA_SETUP_HELP += 'To load an [dark_blue]existing[/dark_blue] collection:\n'
@ -190,6 +192,8 @@ def version(quiet: bool=False,
from plugins_auth.ldap.apps import LDAP_CONFIG
from django.conf import settings
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
@ -198,13 +202,14 @@ def version(quiet: bool=False,
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
'[dark_green]ArchiveBox[/dark_green] [dark_goldenrod]v{}[/dark_goldenrod]'.format(CONSTANTS.VERSION),
f'COMMIT_HASH={SHELL_CONFIG.COMMIT_HASH[:7] if SHELL_CONFIG.COMMIT_HASH else "unknown"}',
f'BUILD_TIME={SHELL_CONFIG.BUILD_TIME}',
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={get_BUILD_TIME()}',
)
prnt(
f'IN_DOCKER={SHELL_CONFIG.IN_DOCKER}',
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={SHELL_CONFIG.IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
@ -212,11 +217,13 @@ def version(quiet: bool=False,
f'PYTHON={sys.implementation.name.title()}',
)
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()} UID={RUNNING_AS_UID} PUID={ARCHIVEBOX_USER} FS_UID={DATA_DIR_STAT.st_uid}',
f'EGID={os.getegid()} GID={RUNNING_AS_GID} PGID={ARCHIVEBOX_GROUP} FS_GID={DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
@ -261,8 +268,36 @@ def version(quiet: bool=False,
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
prnt()
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
data_dir_stat = Path(DATA_DIR).stat()
data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
data_owned_by_root = data_dir_uid == 0 or data_dir_gid == 0
data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) and not IS_ROOT
data_not_writable = not (os.access(DATA_DIR, os.W_OK) and os.access(CONSTANTS.LIB_DIR, os.W_OK) and os.access(CONSTANTS.TMP_DIR, os.W_OK))
if data_owned_by_root:
prnt('[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red]; ArchiveBox will refuse to run![/yellow]')
elif data_owner_doesnt_match or data_not_writable:
prnt(f'[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but the ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
else:
prnt(f':information: [blue]DATA_DIR[/blue] is currently owned by [blue]{data_dir_uid}:{data_dir_gid}[/blue] (PUID:PGID)')
if data_owned_by_root or data_owner_doesnt_match or data_owned_by_default_user or data_not_writable:
prnt(f'[violet]Hint:[/violet] If you encounter permissions errors, change [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to match the user that will run ArchiveBox, e.g.:')
prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.LIB_DIR.resolve()}')
prnt(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {CONSTANTS.TMP_DIR.resolve()}')
prnt()
prnt('[blue]More info:[/blue]')
prnt(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
prnt(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
@enforce_types
@ -948,23 +983,56 @@ def list_folders(links: List[Link],
@enforce_types
def install(out_dir: Path=DATA_DIR) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
# if running as root:
# - run init to create index + lib dir
# - chown -R 911 DATA_DIR
# - install all binaries as root
# - chown -R 911 LIB_DIR
# else:
# - run init to create index + lib dir as current user
# - install all binaries as current user
# - recommend user re-run with sudo if any deps need to be installed as root
from rich import print
from django.conf import settings
from archivebox import CONSTANTS
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
if not ARCHIVE_DIR.exists():
run_subcommand('init', stdin=None, pwd=out_dir)
stderr('\n[+] Installing ArchiveBox dependencies automatically...', color='green')
run_subcommand('init', stdin=None, pwd=out_dir)  # must init the full index first, because we need a DB to store InstalledBinary entries in
print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
# we never want the data dir to be owned by root; detect the owner of the existing DATA_DIR to guess the desired non-root UID
if IS_ROOT:
# if we have sudo/root permissions, take advantage of them just while installing dependencies
print()
print('[yellow]:warning: Using [red]root[/red] privileges only to install dependencies that need it, all other operations should be done as a [blue]non-root[/blue] user.[/yellow]')
print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print()
for binary in reversed(list(settings.BINARIES.values())):
providers = ' [grey53]or[/grey53] '.join(provider.name for provider in binary.binproviders_supported)
print(f'[+] Locating / Installing [yellow]{binary.name}[/yellow] using [red]{providers}[/red]...')
try:
print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
if IS_ROOT:
with SudoPermission(uid=0):
os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
except Exception as e:
print(f'[X] Failed to install {binary.name}: {e}')
if IS_ROOT:
print(f'[yellow]:warning: Retrying {binary.name} installation with [red]sudo[/red]...[/yellow]')
with SudoPermission(uid=0):
try:
print(binary.load_or_install(fresh=True).model_dump(exclude={'provider_overrides', 'bin_dir', 'hook_type'}))
os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as root: {e}[/red]')
else:
print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
from django.contrib.auth import get_user_model
User = get_user_model()
@ -974,12 +1042,13 @@ def install(out_dir: Path=DATA_DIR) -> None:
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version'], capture_output=False, cwd=out_dir)
# backwards-compatibility:
setup = install
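
When run as root, install() only uses its elevated privileges for the post-install ownership fix described in the comments above; a condensed sketch of that step, built from the calls in this hunk:

import os

from archivebox.config import CONSTANTS
from archivebox.config.permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER

if IS_ROOT:
    # escalate just long enough to hand LIB_DIR back to the non-root user
    with SudoPermission(uid=0):
        os.system(f'chown -R {ARCHIVEBOX_USER} "{CONSTANTS.LIB_DIR.resolve()}"')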
@ -1100,6 +1169,7 @@ def schedule(add: bool=False,
check_data_folder()
from archivebox.plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
from archivebox.config.permissions import USER
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@ -1156,7 +1226,7 @@ def schedule(add: bool=False,
existing_jobs = list(cron.find_comment(CRON_COMMENT))
print()
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(SHELL_CONFIG.USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
print('{green}[√] Scheduled new ArchiveBox cron job for user: {} ({} jobs are active).{reset}'.format(USER, len(existing_jobs), **SHELL_CONFIG.ANSI))
print('\n'.join(f' > {cmd}' if str(cmd) == str(new_job) else f' {cmd}' for cmd in existing_jobs))
if total_runs > 60 and not quiet:
stderr()
@ -1170,7 +1240,7 @@ def schedule(add: bool=False,
if existing_jobs:
print('\n'.join(str(cmd) for cmd in existing_jobs))
else:
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(SHELL_CONFIG.USER, **SHELL_CONFIG.ANSI))
stderr('{red}[X] There are no ArchiveBox cron jobs scheduled for your user ({}).{reset}'.format(USER, **SHELL_CONFIG.ANSI))
stderr(' To schedule a new job, run:')
stderr(' archivebox schedule --every=[timeperiod] --depth=1 https://example.com/some/rss/feed.xml')
raise SystemExit(0)
@ -1294,7 +1364,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=DATA_DIR) -> None:
check_data_folder()
from django.core.management import execute_from_command_line
if (args and "createsuperuser" in args) and (SHELL_CONFIG.IN_DOCKER and not SHELL_CONFIG.IS_TTY):
if (args and "createsuperuser" in args) and (IN_DOCKER and not SHELL_CONFIG.IS_TTY):
stderr('[!] Warning: you need to pass -it to use interactive commands in docker', color='lightyellow')
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')

View file

@ -1,37 +1,44 @@
__package__ = 'archivebox.misc'
from archivebox.config import DATA_DIR, ARCHIVE_DIR, CONSTANTS, SHELL_CONFIG
import sys
from rich import print
from .logging import stderr
# DO NOT ADD ANY TOP-LEVEL IMPORTS HERE
# this file is imported by archivebox/__init__.py
# and any imports here will be imported by EVERYTHING else
# so this file should only be used for pure python checks
# that don't need to import other parts of ArchiveBox
def check_data_folder() -> None:
from archivebox import DATA_DIR, ARCHIVE_DIR
archive_dir_exists = ARCHIVE_DIR.exists()
if not archive_dir_exists:
stderr('[X] No archivebox index found in the current directory.', color='red')
stderr(f' {DATA_DIR}', color='lightyellow')
stderr()
stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**SHELL_CONFIG.ANSI))
stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]')
stderr()
stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**SHELL_CONFIG.ANSI))
stderr(' archivebox init')
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
print(' cd path/to/your/archive/folder', file=sys.stderr)
print(' archivebox [command]', file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
raise SystemExit(2)
def check_migrations():
from archivebox import DATA_DIR, CONSTANTS
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
if pending_migrations:
stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
stderr(f' {DATA_DIR}')
stderr()
stderr(f' To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:')
stderr(' archivebox init')
print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)
print(file=sys.stderr)
print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
raise SystemExit(3)
CONSTANTS.SOURCES_DIR.mkdir(exist_ok=True)
@ -39,3 +46,39 @@ def check_migrations():
# CONSTANTS.CACHE_DIR.mkdir(exist_ok=True)
(CONSTANTS.LIB_DIR / 'bin').mkdir(exist_ok=True, parents=True)
(CONSTANTS.PERSONAS_DIR / 'Default').mkdir(exist_ok=True, parents=True)
def check_io_encoding():
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
if PYTHON_ENCODING != 'UTF-8':
print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
print(' Or if you\'re using Ubuntu/Debian, run "dpkg-reconfigure locales"', file=sys.stderr)
print(file=sys.stderr)
print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
raise SystemExit(2)
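
The stream-fallback expression in check_io_encoding() can be run standalone to see what the check will report (a verification sketch, not part of the diff):

import sys

# same fallback chain as check_io_encoding(): prefer the original stdout, then stderr
encoding = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding
print(encoding.upper().replace('UTF8', 'UTF-8'))  # prints UTF-8 on a correctly-configured system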
def check_not_root():
from archivebox.config.permissions import IS_ROOT, IN_DOCKER
attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv[:2]
is_getting_version = '--version' in sys.argv or 'version' in sys.argv[:2]
is_installing = 'setup' in sys.argv[:2] or 'install' in sys.argv[:2]
if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
print(' For more information, see the security overview documentation:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
if IN_DOCKER:
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
print(f' docker compose run archivebox {attempted_command}', file=sys.stderr)
print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
print(' or:', file=sys.stderr)
print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
raise SystemExit(2)

View file

@ -13,6 +13,7 @@ from rich.highlighter import Highlighter
# SETUP RICH CONSOLE / TTY detection / COLOR / PROGRESS BARS
CONSOLE = Console()
STDERR = Console(stderr=True)
IS_TTY = CONSOLE.is_interactive
@ -51,7 +52,7 @@ COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'37': [(255, 255, 255), (255, 255, 255)],
})
# Logging Helpers
# Logging Helpers (DEPRECATED, use rich.print instead going forward)
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
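
Per the deprecation note above, new call sites should use rich directly instead of the legacy stdout()/stderr() helpers; a minimal sketch of the replacement pattern, matching what misc/checks.py now does:

import sys

from rich import print

# rich markup instead of ANSI dicts, routed to stderr via the file argument
print('[red][X] Something went wrong[/red]', file=sys.stderr)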

View file

@ -4,7 +4,6 @@ __package__ = 'archivebox.misc'
import os
import signal
import shutil
import getpass
from json import dump
from pathlib import Path
@ -14,7 +13,7 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write
from archivebox.config import STORAGE_CONFIG
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.util import enforce_types, ExtendedEncoder

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = 'archivebox.misc'
import re
import requests
@ -25,10 +25,10 @@ except ImportError:
detect_encoding = lambda rawdata: "utf-8"
from archivebox.config.constants import STATICFILE_EXTENSIONS
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from .misc.logging import COLOR_DICT
from .logging import COLOR_DICT
### Parsing Helpers
@ -120,7 +120,7 @@ def find_all_urls(urls_str: str):
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
return extension(url).lower() in STATICFILE_EXTENSIONS
return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
def enforce_types(func):
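
A quick usage check for the updated constant lookup (it assumes common static extensions like png are in CONSTANTS.STATICFILE_EXTENSIONS):

from archivebox.misc.util import is_static_file

assert is_static_file('https://example.com/photo.png')         # extension is in STATICFILE_EXTENSIONS
assert not is_static_file('https://example.com/some/article')  # no static-file extension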

View file

@ -13,7 +13,8 @@ from typing import IO, Tuple, List, Optional
from datetime import datetime, timezone
from pathlib import Path
from archivebox.config import DATA_DIR, CONSTANTS, SHELL_CONFIG, ARCHIVING_CONFIG
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.config.common import SHELL_CONFIG, ARCHIVING_CONFIG
from archivebox.misc.system import atomic_write
from archivebox.misc.logging import stderr, hint
from archivebox.misc.util import (

View file

@ -25,7 +25,8 @@ from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER

View file

@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
# from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG

View file

@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
class GitConfig(BaseConfigSet):

View file

@ -5,14 +5,14 @@ from pathlib import Path
from subprocess import run
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinName, bin_abspath
from pydantic_pkgr import BinProvider, BinName, BinProviderName, ProviderLookupDict, bin_abspath
from abx.archivebox.base_plugin import BasePlugin, BaseHook
from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, BinProviderName,ProviderLookupDict, env
from abx.archivebox.base_binary import BaseBinary, env
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
class MercuryConfig(BaseConfigSet):

View file

@ -16,7 +16,7 @@ from abx.archivebox.base_extractor import BaseExtractor
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################

View file

@ -1,11 +1,11 @@
__package__ = 'archivebox.plugins_extractor.singlefile'
from pathlib import Path
from typing import List, Dict, Optional, ClassVar
from typing import List, Dict, Optional
# from typing_extensions import Self
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field, validate_call
from pydantic import InstanceOf, Field
from pydantic_pkgr import BinProvider, BinProviderName, ProviderLookupDict, BinName, bin_abspath, ShallowBinary
# Depends on other Django apps:
@ -17,7 +17,7 @@ from abx.archivebox.base_queue import BaseQueue
from abx.archivebox.base_hook import BaseHook
# Depends on Other Plugins:
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################

View file

@ -14,7 +14,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
from archivebox.config import ARCHIVING_CONFIG, STORAGE_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
from .wget_util import wget_output_path

View file

@ -11,7 +11,7 @@ from abx.archivebox.base_configset import BaseConfigSet
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
from abx.archivebox.base_hook import BaseHook
from archivebox.config import ARCHIVING_CONFIG
from archivebox.config.common import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################

View file

@ -18,7 +18,8 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
from archivebox.config import CONSTANTS, SEARCH_BACKEND_CONFIG
from archivebox.config import CONSTANTS
from archivebox.config.common import SEARCH_BACKEND_CONFIG
###################### Config ##########################

View file

@ -15,7 +15,7 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
from archivebox.config import SEARCH_BACKEND_CONFIG
from archivebox.config.common import SEARCH_BACKEND_CONFIG
SONIC_LIB = None
try:

View file

@ -17,7 +17,7 @@ from abx.archivebox.base_hook import BaseHook
from abx.archivebox.base_searchbackend import BaseSearchBackend
# Depends on Other Plugins:
from archivebox.config import SEARCH_BACKEND_CONFIG
from archivebox.config.common import SEARCH_BACKEND_CONFIG

View file

@ -1,5 +1,6 @@
__package__ = 'archivebox.queues'
import os
import time
import signal
import psutil
@ -12,6 +13,8 @@ from typing import Dict, cast
from supervisor.xmlrpc import SupervisorTransport
from xmlrpc.client import ServerProxy
from archivebox.config.permissions import ARCHIVEBOX_USER
from .settings import SUPERVISORD_CONFIG_FILE, DATA_DIR, PID_FILE, SOCK_FILE, LOG_FILE, WORKERS_DIR, TMP_DIR, LOGS_DIR
from typing import Iterator
@ -42,6 +45,7 @@ childlogdir = {LOGS_DIR}
directory = {DATA_DIR}
strip_ansi = true
nocleanup = true
user = {ARCHIVEBOX_USER}
[unix_http_server]
file = {TMP_DIR}/{SOCK_FILE.name}

View file

@ -11,7 +11,7 @@ import abx.archivebox.use
from archivebox.index.schema import Link
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config import SEARCH_BACKEND_CONFIG
from archivebox.config.common import SEARCH_BACKEND_CONFIG
def log_index_started(url):

View file

@ -110,12 +110,11 @@ if [[ -d "$PLAYWRIGHT_BROWSERS_PATH/.links" ]]; then
chown -h $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/.links/*
fi
# also chown tmp dir
mkdir -p /tmp/archivebox
chmod 777 /tmp
chown $PUID:$PGID /tmp/archivebox
mkdir -p /app/lib
chown $PUID:$PGID /app/lib /app/lib/*
# also chown tmp dir and lib dir
mkdir -p "$SYSTEM_TMP_DIR"
chown $PUID:$PGID "$SYSTEM_TMP_DIR"
mkdir -p "$SYSTEM_LIB_DIR"
chown $PUID:$PGID "$SYSTEM_LIB_DIR" "$SYSTEM_LIB_DIR"/*
# (this check is written in blood in 2023, QEMU silently breaks things in ways that are not obvious)
export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"

View file

@ -1,6 +1,6 @@
[project]
name = "archivebox"
version = "0.8.5rc2"
version = "0.8.5rc3"
requires-python = ">=3.10"
description = "Self-hosted internet archiving solution."
authors = [{name = "Nick Sweeting", email = "pyproject.toml@archivebox.io"}]
@ -77,6 +77,7 @@ dependencies = [
"atomicwrites==1.4.1",
"django-taggit==1.3.0",
"base32-crockford==0.3.0",
"platformdirs>=4.3.6",
# "pocket@git+https://github.com/tapanpandita/pocket.git@v0.3.7",
# "pydantic-pkgr>=0.4.7",
############# Plugin Dependencies ################
@ -133,7 +134,6 @@ dev-dependencies = [
"django-autotyping>=0.5.1",
]
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"

View file

@ -7,11 +7,11 @@ from pathlib import Path
import json, shutil
import sqlite3
from archivebox.config import OUTPUT_PERMISSIONS
from archivebox.config.common import STORAGE_CONFIG
from .fixtures import *
DIR_PERMISSIONS = OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
DIR_PERMISSIONS = STORAGE_CONFIG.OUTPUT_PERMISSIONS.replace('6', '7').replace('4', '5')
def test_init(tmp_path, process):
assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
@ -57,7 +57,7 @@ def test_correct_permissions_output_folder(tmp_path, process):
index_files = ['index.sqlite3', 'archive']
for file in index_files:
file_path = tmp_path / file
assert oct(file_path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
assert oct(file_path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
def test_correct_permissions_add_command_results(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)
@ -65,7 +65,7 @@ def test_correct_permissions_add_command_results(tmp_path, process, disable_extr
env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
for path in archived_item_path.iterdir():
assert oct(path.stat().st_mode)[-3:] in (OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
assert oct(path.stat().st_mode)[-3:] in (STORAGE_CONFIG.OUTPUT_PERMISSIONS, DIR_PERMISSIONS)
def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):
os.chdir(tmp_path)

View file

@ -41,7 +41,7 @@ wheels = [
[[package]]
name = "archivebox"
version = "0.8.5rc2"
version = "0.8.5rc3"
source = { editable = "." }
dependencies = [
{ name = "atomicwrites" },