mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-22 20:23:12 +00:00
413 lines
20 KiB
Python
413 lines
20 KiB
Python
__package__ = 'archivebox.config'
|
|
|
|
import os
|
|
import socket
|
|
import hashlib
|
|
import tempfile
|
|
import platform
|
|
from pathlib import Path
|
|
from functools import cache
|
|
from datetime import datetime
|
|
|
|
from benedict import benedict
|
|
|
|
from .permissions import SudoPermission, IS_ROOT, ARCHIVEBOX_USER
|
|
|
|
#############################################################################################
|
|
|
|
# Directory containing the archivebox package source (two levels above this file).
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent

# The user's collection is whatever directory the process was started in.
DATA_DIR: Path = Path.cwd().resolve()

# Snapshot data lives in the "archive" subfolder of the collection.
ARCHIVE_DIR: Path = DATA_DIR / 'archive'

# True only when the IN_DOCKER env var is set to one of the accepted truthy spellings.
IN_DOCKER = os.getenv('IN_DOCKER', '') in ('1', 'true', 'True', 'TRUE', 'yes')

# Main SQLite index of the collection.
DATABASE_FILE = DATA_DIR / 'index.sqlite3'
|
|
|
|
#############################################################################################
|
|
|
|
def _get_collection_id(DATA_DIR=DATA_DIR, force_create=False) -> str:
    """Return the short ID of the collection at DATA_DIR, deriving one if none is stored.

    The ID is read from ``DATA_DIR/.archivebox_id`` when that file is readable
    and non-empty. Otherwise it is derived as the first 8 hex characters of a
    SHA-256 over the machine ID, the resolved collection path, and the
    directory's ctime (or the current time if the directory cannot be stat'ed).

    The derived ID is only written back to disk when DATA_DIR already looks
    like an initialized collection (index.sqlite3 + archive/ present and the
    directory is writable) or when ``force_create`` is set.

    Args:
        DATA_DIR: collection directory to identify (defaults to the module-level DATA_DIR).
        force_create: persist the derived ID even if DATA_DIR does not look
            like an initialized collection.

    Returns:
        The stored or newly derived collection ID.
    """
    collection_id_file = DATA_DIR / '.archivebox_id'

    try:
        stored_id = collection_id_file.read_text().strip()
        if stored_id:
            return stored_id
        # an empty ID file is treated like a missing one and regenerated below
    except OSError:
        # missing or unreadable ID file (FileNotFoundError/PermissionError are OSError subclasses)
        pass

    # hash the machine_id + collection dir path + creation time to get a unique collection_id
    machine_id = get_machine_id()
    collection_path = DATA_DIR.resolve()
    try:
        creation_date = DATA_DIR.stat().st_ctime
    except Exception:
        creation_date = datetime.now().isoformat()
    collection_id = hashlib.sha256(f'{machine_id}:{collection_path}@{creation_date}'.encode()).hexdigest()[:8]

    try:
        # only persist collection_id file if we already have an index.sqlite3 file present
        # otherwise we might be running in a directory that is not a collection, no point creating cruft files.
        # These paths are derived from the DATA_DIR *argument* (same filenames as the module-level
        # DATABASE_FILE / ARCHIVE_DIR) so that a non-default DATA_DIR is checked against itself.
        database_file = DATA_DIR / 'index.sqlite3'
        archive_dir = DATA_DIR / 'archive'
        collection_is_active = (
            os.path.isfile(database_file)
            and os.path.isdir(archive_dir)
            and os.access(DATA_DIR, os.W_OK)
        )
        if collection_is_active or force_create:
            collection_id_file.write_text(collection_id)

            # if we're running as root right now, make sure the collection_id file is owned by the archivebox user
            if IS_ROOT:
                with SudoPermission(uid=0):
                    if ARCHIVEBOX_USER == 0:
                        os.system(f'chmod 777 "{collection_id_file}"')
                    else:
                        os.system(f'chown {ARCHIVEBOX_USER} "{collection_id_file}"')
    except OSError:
        # persisting the ID is best-effort; the derived ID is still returned
        pass
    return collection_id
|
|
|
|
@cache
def get_collection_id(DATA_DIR=DATA_DIR) -> str:
    """Memoized lookup of the short collection ID (e.g. abc45678) for DATA_DIR.

    Delegates to _get_collection_id() with its default force_create; the
    result is cached per DATA_DIR argument for the life of the process.
    """
    collection_id = _get_collection_id(DATA_DIR=DATA_DIR)
    return collection_id
|
|
|
|
@cache
def get_machine_id() -> str:
    """Return a short ID for the current machine (e.g. abc45678), or 'unknown'.

    Strategy, in order:
      1. the optional ``machineid`` package (hashed with the 'archivebox' app key),
      2. a SHA-256 of ``uuid.getnode()``,
      3. the literal string 'unknown' if both of the above fail.

    The result is memoized for the life of the process.
    """
    try:
        import machineid
        return machineid.hashed_id('archivebox')[:8]
    except Exception:
        # machineid is not installed or could not produce an ID; try the fallback
        pass

    try:
        import uuid
        import hashlib
        node = str(uuid.getnode())
        return hashlib.sha256(node.encode()).hexdigest()[:8]
    except Exception:
        return 'unknown'
|
|
|
|
@cache
def get_machine_type() -> str:
    """Return a short platform tag for this machine: <arch>-<os>[-docker] (e.g. x86_64-linux-docker).

    Both parts are lower-cased values from the ``platform`` module; the
    '-docker' suffix is appended when the module-level IN_DOCKER flag is set.
    The result is memoized.
    """
    os_name = platform.system().lower()      # darwin, linux, etc.
    cpu_arch = platform.machine().lower()    # arm64, x86_64, aarch64, etc.

    scope = f'{cpu_arch}-{os_name}'
    if IN_DOCKER:
        scope = f'{scope}-docker'
    return scope
|
|
|
|
|
|
def dir_is_writable(dir_path: Path, uid: int | None = None, gid: int | None = None, fallback=True, chown=True) -> bool:
    """Check if a given directory is writable by a specific user and group.

    Writability is probed by creating and deleting a ``.permissions_test``
    file inside ``dir_path`` within a ``SudoPermission(uid=...)`` context
    (presumably this switches the effective user; see .permissions).

    Args:
        dir_path: directory to probe.
        uid: user ID to check as; ``None`` means the current effective UID.
            An explicit ``0`` (root) is honored.
        gid: group ID used for the chown repair and the probe message;
            ``None`` means the current effective GID.
        fallback: passed through to SudoPermission (try as the current user
            if unable to check with the provided uid).
        chown: if the probe fails, attempt ``chown uid:gid dir_path`` once and
            re-check (the retry is made with chown=False, so it never loops).

    Returns:
        True if the probe file could be written and removed, else False.
    """
    # Compare against None explicitly: `uid or current_uid` would silently replace
    # an explicit uid=0 / gid=0 (root) with the current user's IDs.
    if uid is None:
        uid = os.geteuid()
    if gid is None:
        gid = os.getegid()

    test_file = dir_path / '.permissions_test'
    try:
        with SudoPermission(uid=uid, fallback=fallback):
            test_file.exists()
            test_file.write_text(f'Checking if PUID={uid} PGID={gid} can write to dir')
            test_file.unlink()
            return True
    except OSError:
        # IOError is an alias of OSError and PermissionError is a subclass of it
        if chown:
            # try fixing it using sudo permissions
            with SudoPermission(uid=uid, fallback=fallback):
                os.system(f'chown {uid}:{gid} "{dir_path}" 2>/dev/null')
            return dir_is_writable(dir_path, uid=uid, gid=gid, fallback=fallback, chown=False)
    return False
|
|
|
|
def assert_dir_can_contain_unix_sockets(dir_path: Path) -> bool:
    """Check if a given directory can contain unix sockets (e.g. /tmp/supervisord.sock).

    Binds a throwaway AF_UNIX stream socket at ``dir_path/.test_socket.sock``.
    Any stale test socket file is removed first, and the socket object and
    its file are always cleaned up afterwards, including when bind() fails.

    Args:
        dir_path: directory in which to create the test socket.

    Returns:
        True when the socket could be bound.

    Raises:
        Exception: if the test socket could not be created; the underlying
            error is chained as ``__cause__``.
    """

    def _remove_quietly(path: str) -> None:
        # best-effort delete: the file may legitimately not exist
        try:
            os.remove(path)
        except OSError:
            pass

    try:
        socket_path = str(dir_path / '.test_socket.sock')
        _remove_quietly(socket_path)  # clear any stale socket left by a previous run
        try:
            # the context manager closes the socket even if bind() raises
            with socket.socket(socket.AF_UNIX, socket.SOCK_STREAM) as s:
                s.bind(socket_path)
        finally:
            _remove_quietly(socket_path)
    except Exception as e:
        # only needed for the error message, so import it on the failure path
        from archivebox.logging_util import pretty_path
        raise Exception(f'ArchiveBox failed to create a test UNIX socket file in {pretty_path(dir_path, color=False)}') from e

    return True
|
|
|
|
|
|
def create_and_chown_dir(dir_path: Path) -> None:
    """Create dir_path (including parents) and chown it to ARCHIVEBOX_USER.

    Runs inside ``SudoPermission(uid=0, fallback=True)``. The directory itself
    is chowned synchronously; its direct children are chowned by a shell
    command launched in the background (trailing ``&``). chown errors are
    discarded (stderr goes to /dev/null and exit codes are ignored).
    """
    chown_dir_cmd = f'chown {ARCHIVEBOX_USER} "{dir_path}" 2>/dev/null'
    chown_children_cmd = f'chown {ARCHIVEBOX_USER} "{dir_path}"/* 2>/dev/null &'

    with SudoPermission(uid=0, fallback=True):
        dir_path.mkdir(parents=True, exist_ok=True)
        for shell_cmd in (chown_dir_cmd, chown_children_cmd):
            os.system(shell_cmd)
|
|
|
|
@cache
def get_or_create_working_tmp_dir(autofix=True, quiet=False):
    """Find (creating it if needed) the first TMP_DIR candidate that passes check_tmp_dir().

    Candidates are tried in order of preference, starting with the configured
    STORAGE_CONFIG.TMP_DIR. When ``autofix`` is set and the winning directory
    differs from the configured one, STORAGE_CONFIG is updated in place.

    Returns:
        The first working candidate path, or None when nothing works and
        ``quiet`` is set.

    Raises:
        OSError: when no candidate works and ``quiet`` is False.

    The result is memoized per (autofix, quiet) argument combination.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_tmp_dir

    # try a few potential directories in order of preference
    candidates = [
        STORAGE_CONFIG.TMP_DIR,                                   # <user-specified>
        CONSTANTS.DEFAULT_TMP_DIR,                                # ./data/tmp/<machine_id>
    ]
    collection_id = get_collection_id()
    system_tmp = Path(tempfile.gettempdir())
    candidates += [
        Path('/var/run/archivebox') / collection_id,              # /var/run/archivebox/abc5d851
        Path('/tmp') / 'archivebox' / collection_id,              # /tmp/archivebox/abc5d851
        Path('~/.tmp/archivebox').expanduser() / collection_id,   # ~/.tmp/archivebox/abc5d851
        system_tmp / 'archivebox' / collection_id,                # <system tmp>/archivebox/abc5d851
        system_tmp / 'archivebox' / collection_id[:4],            # <system tmp>/archivebox/abc5  (shorter path)
        system_tmp / 'abx' / collection_id[:4],                   # <system tmp>/abx/abc5         (shortest path)
    ]

    for tmp_dir in candidates:
        try:
            create_and_chown_dir(tmp_dir)
        except Exception:
            # creation is best-effort; check_tmp_dir() below decides whether the dir is usable
            pass

        if not check_tmp_dir(tmp_dir, throw=False, quiet=True, must_exist=True):
            continue

        if autofix and STORAGE_CONFIG.TMP_DIR != tmp_dir:
            STORAGE_CONFIG.update_in_place(TMP_DIR=tmp_dir, warn=not quiet)
        return tmp_dir

    if quiet:
        return None
    raise OSError(f'ArchiveBox is unable to find a writable TMP_DIR, tried {candidates}!')
|
|
|
|
@cache
def get_or_create_working_lib_dir(autofix=True, quiet=False):
    """Find (creating it if needed) the first LIB_DIR candidate that passes check_lib_dir().

    Candidates are tried in order of preference, starting with the configured
    STORAGE_CONFIG.LIB_DIR. The Homebrew location is only considered when
    /opt/homebrew/bin/archivebox exists. When ``autofix`` is set and the
    winning directory differs from the configured one, STORAGE_CONFIG is
    updated in place.

    Returns:
        The first working candidate path, or None when nothing works and
        ``quiet`` is set.

    Raises:
        OSError: when no candidate works and ``quiet`` is False.

    The result is memoized per (autofix, quiet) argument combination.
    """
    from archivebox import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG
    from archivebox.misc.checks import check_lib_dir

    # try a few potential directories in order of preference
    candidates = [
        STORAGE_CONFIG.LIB_DIR,        # <user-specified>
        CONSTANTS.DEFAULT_LIB_DIR,     # ./data/lib/arm64-linux-docker
    ]
    collection_id = get_collection_id()
    candidates.append(Path('/usr/local/share/archivebox') / collection_id)
    if os.path.isfile('/opt/homebrew/bin/archivebox'):
        candidates.append(Path('/opt/homebrew/share/archivebox') / collection_id)
    candidates.append(Path('~/.local/share/archivebox').expanduser() / collection_id)

    for lib_dir in candidates:
        try:
            create_and_chown_dir(lib_dir)
        except Exception:
            # creation is best-effort; check_lib_dir() below decides whether the dir is usable
            pass

        if not check_lib_dir(lib_dir, throw=False, quiet=True, must_exist=True):
            continue

        if autofix and STORAGE_CONFIG.LIB_DIR != lib_dir:
            STORAGE_CONFIG.update_in_place(LIB_DIR=lib_dir, warn=not quiet)
        return lib_dir

    if quiet:
        return None
    raise OSError(f'ArchiveBox is unable to find a writable LIB_DIR, tried {candidates}!')
|
|
|
|
|
|
|
|
@cache
def get_data_locations():
    """Return a benedict describing each user-data location of the collection.

    Every entry carries the location's ``path``, whether it is ``enabled``,
    and whether it ``is_valid`` (exists with the expected type and is both
    readable and writable). Some entries also report ``is_mount``.
    The result is memoized.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    def _can_read_write(path) -> bool:
        return os.access(path, os.R_OK) and os.access(path, os.W_OK)

    def _rw_dir(path) -> bool:
        return os.path.isdir(path) and _can_read_write(path)

    def _rw_file(path) -> bool:
        return os.path.isfile(path) and _can_read_write(path)

    config_file = CONSTANTS.CONFIG_FILE
    queue_db_file = CONSTANTS.QUEUE_DATABASE_FILE
    sources_dir = CONSTANTS.SOURCES_DIR
    personas_dir = CONSTANTS.PERSONAS_DIR
    logs_dir = CONSTANTS.LOGS_DIR
    tmp_dir = STORAGE_CONFIG.TMP_DIR

    locations = {
        'DATA_DIR': {
            'path': DATA_DIR.resolve(),
            'enabled': True,
            'is_valid': _rw_dir(DATA_DIR),
            'is_mount': os.path.ismount(DATA_DIR.resolve()),
        },
        'CONFIG_FILE': {
            'path': config_file.resolve(),
            'enabled': True,
            'is_valid': _rw_file(config_file),
        },
        'SQL_INDEX': {
            'path': DATABASE_FILE.resolve(),
            'enabled': True,
            'is_valid': _rw_file(DATABASE_FILE),
            'is_mount': os.path.ismount(DATABASE_FILE.resolve()),
        },
        'QUEUE_DATABASE': {
            'path': queue_db_file,  # reported as-is, not resolved
            'enabled': True,
            'is_valid': _rw_file(queue_db_file),
            'is_mount': os.path.ismount(queue_db_file),
        },
        'ARCHIVE_DIR': {
            'path': ARCHIVE_DIR.resolve(),
            'enabled': True,
            'is_valid': _rw_dir(ARCHIVE_DIR),
            'is_mount': os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        'SOURCES_DIR': {
            'path': sources_dir.resolve(),
            'enabled': True,
            'is_valid': _rw_dir(sources_dir),
        },
        'PERSONAS_DIR': {
            'path': personas_dir.resolve(),
            'enabled': os.path.isdir(personas_dir),  # only enabled if the dir exists
            'is_valid': _rw_dir(personas_dir),
        },
        'LOGS_DIR': {
            'path': logs_dir.resolve(),
            'enabled': True,
            'is_valid': _rw_dir(logs_dir),
        },
        'TMP_DIR': {
            'path': tmp_dir.resolve(),
            'enabled': True,
            'is_valid': _rw_dir(tmp_dir),
        },
        # "CACHE_DIR": {
        #     "path": CACHE_DIR.resolve(),
        #     "enabled": True,
        #     "is_valid": os.access(CACHE_DIR, os.R_OK) and os.access(CACHE_DIR, os.W_OK),  # read + write
        # },
    }
    return benedict(locations)
|
|
|
|
@cache
def get_code_locations():
    """Return a benedict describing each code/library location used by ArchiveBox.

    Every entry carries the location's ``path``, whether it is ``enabled``,
    and whether it ``is_valid`` (per-entry access checks, see inline notes).
    The result is memoized.
    """
    from archivebox.config import CONSTANTS
    from archivebox.config.common import STORAGE_CONFIG

    def _readable_dir(path) -> bool:
        return os.path.isdir(path) and os.access(path, os.R_OK)

    static_dir = CONSTANTS.STATIC_DIR
    custom_templates_dir = CONSTANTS.CUSTOM_TEMPLATES_DIR
    user_plugins_dir = CONSTANTS.USER_PLUGINS_DIR
    lib_dir = STORAGE_CONFIG.LIB_DIR

    locations = {
        'PACKAGE_DIR': {
            'path': PACKAGE_DIR.resolve(),
            'enabled': True,
            'is_valid': os.access(PACKAGE_DIR / '__main__.py', os.X_OK),  # executable
        },
        'TEMPLATES_DIR': {
            'path': CONSTANTS.TEMPLATES_DIR.resolve(),
            'enabled': True,
            # NOTE(review): validity is checked against STATIC_DIR, not TEMPLATES_DIR -- confirm this is intended
            'is_valid': os.access(static_dir, os.R_OK) and os.access(static_dir, os.X_OK),  # read + list
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': custom_templates_dir.resolve(),
            'enabled': os.path.isdir(custom_templates_dir),
            'is_valid': _readable_dir(custom_templates_dir),  # read
        },
        'USER_PLUGINS_DIR': {
            'path': user_plugins_dir.resolve(),
            'enabled': os.path.isdir(user_plugins_dir),
            'is_valid': _readable_dir(user_plugins_dir),  # read
        },
        'LIB_DIR': {
            'path': lib_dir.resolve(),
            'enabled': True,
            'is_valid': _readable_dir(lib_dir) and os.access(lib_dir, os.W_OK),  # read + write
        },
    }
    return benedict(locations)
|
|
|
|
|
|
|
|
# @cache
|
|
# def get_LIB_DIR():
|
|
# """
|
|
# - should be shared with other collections on the same host
|
|
# - must be scoped by CPU architecture, OS family, and archivebox version
|
|
# - should not be shared with other hosts/archivebox versions
|
|
# - must be writable by any archivebox user
|
|
# - should be persistent across reboots
|
|
# - can be on a docker bind mount but probably shouldn't be
|
|
# - ok to have a long path (doesn't contain SOCKETS)
|
|
# """
|
|
# from .version import detect_installed_version
|
|
|
|
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
|
|
|
# lib_dir = tempfile.gettempdir()
|
|
# try:
|
|
# if 'SYSTEM_LIB_DIR' in os.environ:
|
|
# lib_dir = Path(os.environ['SYSTEM_LIB_DIR'])
|
|
# else:
|
|
# with SudoPermission(uid=ARCHIVEBOX_USER, fallback=True):
|
|
# lib_dir = HOST_DIRS.site_data_path
|
|
|
|
# # Docker: /usr/local/share/archivebox/0.8.5
|
|
# # Ubuntu: /usr/local/share/archivebox/0.8.5
|
|
# # macOS: /Library/Application Support/archivebox
|
|
# try:
|
|
# with SudoPermission(uid=0, fallback=True):
|
|
# lib_dir.mkdir(parents=True, exist_ok=True)
|
|
# except PermissionError:
|
|
# # our user cannot
|
|
# lib_dir = HOST_DIRS.user_data_path
|
|
# lib_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# if IS_ROOT or not dir_is_writable(lib_dir, uid=ARCHIVEBOX_USER):
|
|
# if IS_ROOT:
|
|
# # make sure lib dir is owned by the archivebox user, not root
|
|
# with SudoPermission(uid=0):
|
|
# if ARCHIVEBOX_USER == 0:
|
|
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_LIB_DIR {lib_dir} with mode 777 so that non-root archivebox users can share it.[/yellow] (caches shared libs used by archivebox for performance)', file=sys.stderr)
|
|
# os.system(f'chmod -R 777 "{lib_dir}"')
|
|
# else:
|
|
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{lib_dir}"')
|
|
# else:
|
|
# raise PermissionError()
|
|
# except (PermissionError, AssertionError):
|
|
# # raise PermissionError(f'SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
|
# print(f'[red]:cross_mark: ERROR: SYSTEM_LIB_DIR {lib_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
|
|
|
# return lib_dir
|
|
|
|
# @cache
|
|
# def get_TMP_DIR():
|
|
# """
|
|
# - must NOT be inside DATA_DIR / inside a docker volume bind mount
|
|
# - must NOT have a long PATH (UNIX socket path length restrictions)
|
|
# - must NOT be shared with other collections/hosts
|
|
# - must be writable by archivebox user & root
|
|
# - must be cleared on every boot / not persisted
|
|
# - must be cleared on every archivebox version upgrade
|
|
# """
|
|
# from .version import detect_installed_version
|
|
|
|
# HOST_DIRS = PlatformDirs(appname='archivebox', appauthor='ArchiveBox', version=detect_installed_version(), opinion=True, ensure_exists=False)
|
|
|
|
# # print('DATA_DIR OWNED BY:', ARCHIVEBOX_USER, ARCHIVEBOX_GROUP)
|
|
# # print('RUNNING AS:', self.PUID, self.PGID)
|
|
# run_dir = tempfile.gettempdir()
|
|
# try:
|
|
# if 'SYSTEM_TMP_DIR' in os.environ:
|
|
# run_dir = Path(os.environ['SYSTEM_TMP_DIR']).resolve() / get_collection_id(DATA_DIR=DATA_DIR)
|
|
# with SudoPermission(uid=0, fallback=True):
|
|
# run_dir.mkdir(parents=True, exist_ok=True)
|
|
# if not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
|
|
# if IS_ROOT:
|
|
# with SudoPermission(uid=0, fallback=False):
|
|
# if ARCHIVEBOX_USER == 0:
|
|
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
|
# os.system(f'chmod -R 777 "{run_dir}"')
|
|
# else:
|
|
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
|
# else:
|
|
# raise PermissionError()
|
|
# assert len(str(run_dir / 'supervisord.conf')) < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
|
# return run_dir
|
|
|
|
# run_dir = (HOST_DIRS.site_runtime_path / get_collection_id(DATA_DIR=DATA_DIR)).resolve()
|
|
# try:
|
|
# assert len(str(run_dir)) + len('/supervisord.sock') < 95
|
|
# except AssertionError:
|
|
# run_dir = Path(tempfile.gettempdir()).resolve() / 'archivebox' / get_collection_id(DATA_DIR=DATA_DIR)
|
|
# assert len(str(run_dir)) + len('/supervisord.sock') < 95, 'SYSTEM_TMP_DIR path is too long, please set SYSTEM_TMP_DIR env variable to a shorter path (unfortunately unix requires socket paths be < 108 chars)'
|
|
|
|
# with SudoPermission(uid=0, fallback=True):
|
|
# run_dir.mkdir(parents=True, exist_ok=True)
|
|
|
|
# if IS_ROOT or not dir_is_writable(run_dir, uid=ARCHIVEBOX_USER):
|
|
# if IS_ROOT:
|
|
# with SudoPermission(uid=0):
|
|
# if ARCHIVEBOX_USER == 0:
|
|
# # print(f'[yellow]:warning: Waring: Creating SYSTEM_TMP_DIR {run_dir} with mode 777 so that non-root archivebox users can access it.[/yellow]', file=sys.stderr)
|
|
# os.system(f'chmod -R 777 "{run_dir}"')
|
|
# else:
|
|
# os.system(f'chown {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP} "{run_dir}"')
|
|
# else:
|
|
# raise PermissionError()
|
|
|
|
# except (PermissionError, AssertionError):
|
|
# # raise PermissionError(f'SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}')
|
|
# print(f'[red]:cross_mark: ERROR: SYSTEM_TMP_DIR {run_dir} is not writable by archivebox user {ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/red]', file=sys.stderr)
|
|
|
|
# return run_dir
|
|
|