"""Filesystem locations used by ArchiveBox and helpers to discover/create them.

Defines the package/data/archive directory constants, short stable IDs for the
current collection and machine, permission/unix-socket checks for candidate
directories, and lookup tables describing the data and code locations.
"""

__package__ = 'archivebox.config'

import os
import socket
import hashlib
import tempfile
import platform
from pathlib import Path
from functools import cache
from datetime import datetime

from benedict import benedict

from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER

#############################################################################################

PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent    # archivebox source code dir
DATA_DIR: Path = Path(os.getcwd()).resolve()                  # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive'                      # archivebox snapshot data dir

# True only when the IN_DOCKER env var is set to one of the accepted truthy strings
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')

DATABASE_FILE = DATA_DIR / 'index.sqlite3'

#############################################################################################


def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
    """Read the collection ID stored in DATA_DIR/.archivebox_id, or generate a new one.

    Args:
        DATA_DIR: collection directory to read/write the ``.archivebox_id`` file in.
        force_create: persist a newly generated ID even if DATA_DIR does not
            look like an active collection.

    Returns:
        The stored ID if the file is readable and non-empty, otherwise the first
        8 hex chars of sha256(machine_id + resolved dir path + dir ctime).
    """
    collection_id_file = DATA_DIR / '.archivebox_id'

    try:
        existing_id = collection_id_file.read_text().strip()
        # an empty/whitespace-only file is not a usable ID, fall through and regenerate it
        if existing_id:
            return existing_id
    except (OSError, UnicodeDecodeError):
        # missing, unreadable, or corrupt ID file: generate a fresh ID below
        pass

    # hash the machine_id + collection dir path + creation time to get a unique collection_id
    machine_id = get_machine_id()
    collection_path = DATA_DIR.resolve()
    try:
        creation_date = DATA_DIR.stat().st_ctime
    except Exception:
        creation_date = datetime.now().isoformat()
    collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]

    try:
        # only persist collection_id file if we already have an index.sqlite3 file present
        # otherwise we might be running in a directory that is not a collection, no point creating cruft files
        # (look inside the DATA_DIR we were given, not the module-level default dir)
        database_file = DATA_DIR / DATABASE_FILE.name
        archive_dir = DATA_DIR / ARCHIVE_DIR.name
        collection_is_active = os.path.isfile(database_file) and os.path.isdir(archive_dir) and os.access(DATA_DIR, os.W_OK)
        if collection_is_active or force_create:
            collection_id_file.write_text(collection_id)

            # if we're running as root right now, make sure the collection_id file is owned by the archivebox user
            if IS_ROOT:
                with SudoPermission(uid=0):
                    if ARCHIVEBOX_USER == 0:
                        os.system(f'chmod 777 "{collection_id_file}"')
                    else:
                        os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
    except OSError:
        # best-effort: failing to persist the ID must not prevent returning it
        pass
    return collection_id


@cache
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
    """Get a short, stable, unique ID for the current collection (e.g. abc45678)"""
    return _get_collection_id(DATA_DIR=DATA_DIR)


@cache
def get_machine_id() -> str:
    """Get a short, stable, unique ID for the current machine (e.g. abc45678)

    Uses ``machineid.hashed_id`` when that package is importable, falls back to
    a sha256 of ``uuid.getnode()``, and returns 'unknown' if both fail.
    """

    MACHINE_ID = 'unknown'
    try:
        import machineid
        MACHINE_ID = machineid.hashed_id('archivebox')[:8]
    except Exception:
        try:
            import uuid
            MACHINE_ID = hashlib.sha256(str(uuid.getnode()).encode()).hexdigest()[:8]
        except Exception:
            # best-effort: keep the 'unknown' placeholder
            pass
    return MACHINE_ID


@cache
def get_machine_type() -> str:
    """Get a short, stable, unique type identifier for the current machine (e.g. x86_64-linux-docker)"""

    OS: str = platform.system().lower()      # darwin, linux, etc.
    ARCH: str = platform.machine().lower()   # arm64, x86_64, aarch64, etc.
    LIB_DIR_SCOPE: str = f'{ARCH}-{OS}-docker' if IN_DOCKER else f'{ARCH}-{OS}'
    return LIB_DIR_SCOPE


def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
    """Check if a given directory is writable by a specific user and group (fallback=try as current user is unable to check with provided uid)

    Args:
        dir_path: directory to test by writing and deleting a ``.permissions_test`` file.
        uid: user ID to test as, defaults to the current effective UID when None.
        gid: group ID used in the chown attempt, defaults to the current effective GID when None.
        fallback: passed through to SudoPermission.
        chown: if the first attempt fails, try to chown the dir to uid:gid and re-check once.

    Returns:
        True if the test file could be written and removed, otherwise False.
    """
    current_uid, current_gid = os.geteuid(), os.getegid()
    # compare against None explicitly so that uid=0 / gid=0 (root) is honored instead of being treated as unset
    uid = current_uid if uid is None else uid
    gid = current_gid if gid is None else gid

    test_file = dir_path / '.permissions_test'
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()
            test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
            test_file.unlink()
            return True
    except (IOError, OSError, PermissionError):
        if chown:
            # try fixing it using sudo permissions
            with SudoPermission(uid=uid, fallback=fallback):
                os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
            # re-check once, with chown disabled so we cannot recurse forever
            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
    return False


def _remove_if_present(path: str) -> None:
    """Delete the file at path, ignoring any OSError (e.g. the file does not exist)."""
    try:
        os.remove(path)
    except OSError:
        pass


def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock)

    Binds a throwaway AF_UNIX socket at ``dir_path/.test_socket.sock`` and removes it again.

    Returns:
        True if the socket could be bound.

    Raises:
        Exception: if the test socket could not be created (original error is chained as the cause).
    """
    from archivebox.logging_util import pretty_path

    try:
        socket_path = str(dir_path / '.test_socket.sock')
        # clear any stale socket file left by a previous run, bind() fails if the path exists
        _remove_if_present(socket_path)
        try:
            # the with-block closes the socket even when bind() raises
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as test_socket:
                test_socket.bind(socket_path)
        finally:
            _remove_if_present(socket_path)
    except Exception as e:
        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e

    return True


def create_and_chown_dir(dir_path: Path) -> None:
    """Create dir_path (and parents) and chown it and its direct children to ARCHIVEBOX_USER.

    Runs inside SudoPermission(uid=0, fallback=True). The chown commands discard
    their stderr and their exit codes are not checked.
    """
    with SudoPermission(uid=0, fallback=True):
        dir_path.mkdir(parents=True, exist_ok=True)
        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null')
        # the children are chowned in a background shell job (trailing &), we do not wait for it
        os.system(f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &')


@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
    """Return the first candidate TMP_DIR that passes check_tmp_dir, creating it if needed.

    Args:
        autofix: if the chosen dir differs from STORAGE_CONFIG.TMP_DIR, update the config in place.
        quiet: suppress the config-update warning, and return None instead of raising when no candidate works.

    Raises:
        OSError: if no candidate is usable and quiet is False.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_tmp_dir

    # try a few potential directories in order of preference
    CANDIDATES = [
        STORAGE_CONFIG.TMP_DIR,                                                # currently configured value
        CONSTANTS.DEFAULT_TMP_DIR,                                             # ./data/tmp/
        Path('/var/run/archivebox') / get_collection_id(),                     # /var/run/archivebox/abc5d851
        Path('/tmp') / 'archivebox' / get_collection_id(),                     # /tmp/archivebox/abc5d851
        Path('~/.tmp/archivebox').expanduser() / get_collection_id(),          # ~/.tmp/archivebox/abc5d851
        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id(),      # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5d851
        Path(tempfile.gettempdir()) / 'archivebox' / get_collection_id()[:4],  # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/archivebox/abc5
        Path(tempfile.gettempdir()) / 'abx' / get_collection_id()[:4],         # /var/folders/qy/6tpfrpx100j1t4l312nz683m0000gn/T/abx/abc5
    ]
    for candidate in CANDIDATES:
        try:
            create_and_chown_dir(candidate)
        except Exception:
            # best-effort: check_tmp_dir below decides whether the candidate is usable
            pass
        if check_tmp_dir(candidate, throw=False, quiet=True, must_exist=True):
            if autofix and STORAGE_CONFIG.TMP_DIR != candidate:
                STORAGE_CONFIG.update_in_place(TMP_DIR=candidate, warn=not quiet)
            return candidate

    if not quiet:
        raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {CANDIDATES}!')
    return None


@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
    """Return the first candidate LIB_DIR that passes check_lib_dir, creating it if needed.

    Args:
        autofix: if the chosen dir differs from STORAGE_CONFIG.LIB_DIR, update the config in place.
        quiet: suppress the config-update warning, and return None instead of raising when no candidate works.

    Raises:
        OSError: if no candidate is usable and quiet is False.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_lib_dir

    # try a few potential directories in order of preference
    CANDIDATES = [
        STORAGE_CONFIG.LIB_DIR,                                                # currently configured value
        CONSTANTS.DEFAULT_LIB_DIR,                                             # ./data/lib/arm64-linux-docker
        Path('/usr/local/share/archivebox') / get_collection_id(),             # /usr/local/share/archivebox/abc5d851
        # the homebrew location is only considered when archivebox is installed under /opt/homebrew
        *([Path('/opt/homebrew/share/archivebox') / get_collection_id()] if os.path.isfile('/opt/homebrew/bin/archivebox') else []),  # /opt/homebrew/share/archivebox/abc5d851
        Path('~/.local/share/archivebox').expanduser() / get_collection_id(),  # ~/.local/share/archivebox/abc5d851
    ]

    for candidate in CANDIDATES:
        try:
            create_and_chown_dir(candidate)
        except Exception:
            # best-effort: check_lib_dir below decides whether the candidate is usable
            pass
        if check_lib_dir(candidate, throw=False, quiet=True, must_exist=True):
            if autofix and STORAGE_CONFIG.LIB_DIR != candidate:
                STORAGE_CONFIG.update_in_place(LIB_DIR=candidate, warn=not quiet)
            return candidate

    if not quiet:
        raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {CANDIDATES}!')
    return None


@cache
def get_data_locations():
    """Describe each data location: its path, whether it is enabled, and whether it is valid.

    Returns:
        A benedict keyed by location name. Every entry has "path", "enabled" and
        "is_valid" (exists with read + write access); some also have "is_mount".
        The result is cached, so the checks reflect the state at the first call.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    return benedict({
        "DATA_DIR": {
            "path": DATA_DIR.resolve(),
            "enabled": True,
            "is_valid": os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.R_OK) and os.access(DATA_DIR, os.W_OK),
            "is_mount": os.path.ismount(DATA_DIR.resolve()),
        },
        "CONFIG_FILE": {
            "path": CONSTANTS.CONFIG_FILE.resolve(),
            "enabled": True,
            "is_valid": os.path.isfile(CONSTANTS.CONFIG_FILE) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.W_OK),
        },
        "SQL_INDEX": {
            "path": DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": os.path.isfile(DATABASE_FILE) and os.access(DATABASE_FILE, os.R_OK) and os.access(DATABASE_FILE, os.W_OK),
            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
        },
        "QUEUE_DATABASE": {
            "path": CONSTANTS.QUEUE_DATABASE_FILE,
            "enabled": True,
            "is_valid": os.path.isfile(CONSTANTS.QUEUE_DATABASE_FILE) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.R_OK) and os.access(CONSTANTS.QUEUE_DATABASE_FILE, os.W_OK),
            "is_mount": os.path.ismount(CONSTANTS.QUEUE_DATABASE_FILE),
        },
        "ARCHIVE_DIR": {
            "path": ARCHIVE_DIR.resolve(),
            "enabled": True,
            "is_valid": os.path.isdir(ARCHIVE_DIR) and os.access(ARCHIVE_DIR, os.R_OK) and os.access(ARCHIVE_DIR, os.W_OK),
            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        "SOURCES_DIR": {
            "path": CONSTANTS.SOURCES_DIR.resolve(),
            "enabled": True,
            "is_valid": os.path.isdir(CONSTANTS.SOURCES_DIR) and os.access(CONSTANTS.SOURCES_DIR, os.R_OK) and os.access(CONSTANTS.SOURCES_DIR, os.W_OK),
        },
        "PERSONAS_DIR": {
            "path": CONSTANTS.PERSONAS_DIR.resolve(),
            "enabled": os.path.isdir(CONSTANTS.PERSONAS_DIR),
            "is_valid": os.path.isdir(CONSTANTS.PERSONAS_DIR) and os.access(CONSTANTS.PERSONAS_DIR, os.R_OK) and os.access(CONSTANTS.PERSONAS_DIR, os.W_OK),                       # read + write
        },
        "LOGS_DIR": {
            "path": CONSTANTS.LOGS_DIR.resolve(),
            "enabled": True,
            "is_valid": os.path.isdir(CONSTANTS.LOGS_DIR) and os.access(CONSTANTS.LOGS_DIR, os.R_OK) and os.access(CONSTANTS.LOGS_DIR, os.W_OK),                                   # read + write
        },
        'TMP_DIR': {
            'path': STORAGE_CONFIG.TMP_DIR.resolve(),
            'enabled': True,
            'is_valid': os.path.isdir(STORAGE_CONFIG.TMP_DIR) and os.access(STORAGE_CONFIG.TMP_DIR, os.R_OK) and os.access(STORAGE_CONFIG.TMP_DIR, os.W_OK),                       # read + write
        },
        # "CACHE_DIR": {
        #     "path": CACHE_DIR.resolve(),
        #     "enabled": True,
        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),                        # read + write
        # },
    })


@cache
def get_code_locations():
    """Describe each code location: its path, whether it is enabled, and whether it is valid.

    Returns:
        A benedict keyed by location name, each entry having 'path', 'enabled'
        and 'is_valid'. The result is cached, so the checks reflect the state
        at the first call.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    return benedict({
        'PACKAGE_DIR': {
            'path': (PACKAGE_DIR).resolve(),
            'enabled': True,
            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),                                                                  # executable
        },
        'TEMPLATES_DIR': {
            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
            'enabled': True,
            # NOTE(review): validity is checked against STATIC_DIR rather than TEMPLATES_DIR, confirm this is intentional
            'is_valid': os.access(CONSTANTS.STATIC_DIR, os.R_OK) and os.access(CONSTANTS.STATIC_DIR, os.X_OK),                            # read + list
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': CONSTANTS.CUSTOM_TEMPLATES_DIR.resolve(),
            'enabled': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR),
            'is_valid': os.path.isdir(CONSTANTS.CUSTOM_TEMPLATES_DIR) and os.access(CONSTANTS.CUSTOM_TEMPLATES_DIR, os.R_OK),             # read
        },
        'USER_PLUGINS_DIR': {
            'path': CONSTANTS.USER_PLUGINS_DIR.resolve(),
            'enabled': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR),
            'is_valid': os.path.isdir(CONSTANTS.USER_PLUGINS_DIR) and os.access(CONSTANTS.USER_PLUGINS_DIR, os.R_OK),                     # read
        },
        'LIB_DIR': {
            'path': STORAGE_CONFIG.LIB_DIR.resolve(),
            'enabled': True,
            'is_valid': os.path.isdir(STORAGE_CONFIG.LIB_DIR) and os.access(STORAGE_CONFIG.LIB_DIR, os.R_OK) and os.access(STORAGE_CONFIG.LIB_DIR, os.W_OK),   # read + write
        },
    })


# NOTE: the two functions below are disabled legacy implementations, kept for reference only.

# @cache
# def get_LIB_DIR():
#     """
#     - should be shared with other collections on the same host
#     - must be scoped by CPU architecture, OS family, and archivebox version
#     - should not be shared with other hosts/archivebox versions
#     - must be writable by any archivebox user
#     - should be persistent across reboots
#     - can be on a docker bin mount but probably shouldnt be
#     - ok to have a long path (doesnt contain SOCKETS)
#     """
#     from .version import detect_installed_version

#     HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

#     lib_dir = tempfile.gettempdir()
#     try:
#         if 'SYSTEM_LIB_DIR' in os.environ:
#             lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
#         else:
#             with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
#                 lib_dir = HOST_DIRS.site_data_path

#         # Docker: /usr/local/share/archivebox/0.8.5
#         # Ubuntu: /usr/local/share/archivebox/0.8.5
#         # macOS: /Library/Application Support/archivebox
#         try:
#             with SudoPermission(uid=0, fallback=True):
#                 lib_dir.mkdir(parents=True, exist_ok=True)
#         except PermissionError:
#             # our user cannot
#             lib_dir = HOST_DIRS.user_data_path
#             lib_dir.mkdir(parents=True, exist_ok=True)

#         if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
#             if IS_ROOT:
#                 # make sure lib dir is owned by the archivebox user, not root
#                 with SudoPermission(uid=0):
#                     if ARCHIVEBOX_USER == 0:
#                         # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
#                         os.system(f'chmod -R 777 "{lib_dir}"')
#                     else:
#                         os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
#             else:
#                 raise PermissionError()
#     except (PermissionError, AssertionError):
#         # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
#         print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)

#     return lib_dir

# @cache
# def get_TMP_DIR():
#     """
#     - must NOT be inside DATA_DIR / inside a docker volume bind mount
#     - must NOT have a long PATH (UNIX socket path length restrictions)
#     - must NOT be shared with other collections/hosts
#     - must be writable by archivebox user & root
#     - must be cleared on every boot / not persisted
#     - must be cleared on every archivebox version upgrade
#     """
#     from .version import detect_installed_version

#     HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)

#     # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
#     # print('RUNNING AS:', self.PUID, self.PGID)
#     run_dir = tempfile.gettempdir()
#     try:
#         if 'SYSTEM_TMP_DIR' in os.environ:
#             run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
#             with SudoPermission(uid=0, fallback=True):
#                 run_dir.mkdir(parents=True, exist_ok=True)
#             if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
#                 if IS_ROOT:
#                     with SudoPermission(uid=0, fallback=False):
#                         if ARCHIVEBOX_USER == 0:
#                             # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
#                             os.system(f'chmod -R 777 "{run_dir}"')
#                         else:
#                             os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
#                 else:
#                     raise PermissionError()
#             assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
#             return run_dir

#         run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
#         try:
#             assert len(str(run_dir)) + len('/supervisord.sock') < 95
#         except AssertionError:
#             run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
#             assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'

#         with SudoPermission(uid=0, fallback=True):
#             run_dir.mkdir(parents=True, exist_ok=True)

#         if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
#             if IS_ROOT:
#                 with SudoPermission(uid=0):
#                     if ARCHIVEBOX_USER == 0:
#                         # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
#                         os.system(f'chmod -R 777 "{run_dir}"')
#                     else:
#                         os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
#             else:
#                 raise PermissionError()

#     except (PermissionError, AssertionError):
#         # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
#         print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)

#     return run_dir