fix tmp data dir resolution when running help or version outside a data dir

Nick Sweeting 2024-10-04 01:40:41 -07:00
parent f321d25f4c
commit 12f32c4690
No known key found for this signature in database
12 changed files with 30 additions and 208 deletions


@@ -20,21 +20,26 @@ __package__ = 'archivebox'
import os
import sys
import tempfile
from pathlib import Path
PACKAGE_DIR = Path(__file__).resolve().parent # archivebox source code dir
DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir
USING_TMP_DATA_DIR = None
if len(sys.argv) > 1 and sys.argv[1] in ('version', 'help'):
current_dir = Path(os.getcwd()).resolve()
if not (current_dir / 'index.sqlite3').exists():
USING_TMP_DATA_DIR = Path(tempfile.gettempdir()) / 'archivebox'
USING_TMP_DATA_DIR.mkdir(parents=True, exist_ok=True)
os.chdir(USING_TMP_DATA_DIR)
# make sure PACKAGE_DIR is in sys.path so we can import all subfolders
# without necessarily waiting for django to load them through INSTALLED_APPS
PACKAGE_DIR = Path(__file__).resolve().parent
if str(PACKAGE_DIR) not in sys.path:
sys.path.append(str(PACKAGE_DIR))
- from .config.constants import CONSTANTS, VERSION # noqa
+ from .config.constants import CONSTANTS, DATA_DIR, PACKAGE_DIR, ARCHIVE_DIR, VERSION # noqa
os.environ['ARCHIVEBOX_PACKAGE_DIR'] = str(PACKAGE_DIR)
os.environ['ARCHIVEBOX_DATA_DIR'] = str(DATA_DIR)
os.environ['DJANGO_SETTINGS_MODULE'] = 'core.settings'
# print('INSTALLING MONKEY PATCHES')
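Taken together, the new block means read-only commands like `archivebox version` no longer fail when run outside a collection; they fall back to a throwaway data dir under the system tmp dir. A standalone sketch of the same fallback logic (the function name is illustrative):

    import os
    import sys
    import tempfile
    from pathlib import Path

    def resolve_data_dir(argv: list) -> Path:
        """Use the cwd if it is a data dir, else fall back to a tmp dir for read-only commands."""
        cwd = Path(os.getcwd()).resolve()
        is_readonly_cmd = len(argv) > 1 and argv[1] in ('version', 'help')
        if is_readonly_cmd and not (cwd / 'index.sqlite3').exists():
            tmp_dir = Path(tempfile.gettempdir()) / 'archivebox'
            tmp_dir.mkdir(parents=True, exist_ok=True)
            os.chdir(tmp_dir)   # downstream code derives DATA_DIR from the cwd
            return tmp_dir
        return cwd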


@@ -2,9 +2,7 @@
"""This is the main entry point for the ArchiveBox CLI."""
__package__ = 'archivebox'
import archivebox # noqa # import archivebox/__init__.py to apply monkey patches, load vendored libs, etc.
import sys
from .cli import main
ASCII_LOGO_MINI = r"""


@@ -18,7 +18,7 @@ from . import toml_util
PACKAGE_DIR = Path(__file__).resolve().parent.parent
- DATA_DIR = Path(os.curdir).resolve()
+ DATA_DIR = Path(os.getcwd()).resolve()
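`os.curdir` is just the literal string `'.'`, so both spellings resolve to the same absolute path here; the switch makes it explicit that DATA_DIR snapshots the cwd at import time, after the `os.chdir()` in `__init__.py` has had a chance to run. A quick check:

    import os
    import tempfile
    from pathlib import Path

    os.chdir(tempfile.gettempdir())
    assert Path(os.curdir).resolve() == Path(os.getcwd()).resolve()  # identical snapshots of the cwd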


@@ -1,6 +1,7 @@
__package__ = 'abx.archivebox'
import json
+ import os
from typing import Optional, List, Literal, Annotated, Dict, Any, Tuple
from typing_extensions import Self
@@ -189,7 +190,7 @@ class BaseExtractor(BaseHook):
# TODO: move this to a hookimpl
def exec(self, args: CmdArgsList=(), cwd: Optional[Path]=None, installed_binary=None):
- cwd = cwd or Path('.')
+ cwd = cwd or Path(os.getcwd())
binary = self.load_binary(installed_binary=installed_binary)
return binary.exec(cmd=args, cwd=cwd)
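Unlike the DATA_DIR change above (where both sides are immediately `.resolve()`d), this one changes behavior: `Path('.')` stays relative and drifts with any later `os.chdir()`, while `Path(os.getcwd())` pins an absolute directory at call time. A small demonstration:

    import os
    import tempfile
    from pathlib import Path

    rel = Path('.')             # relative: follows whatever the cwd is later
    pinned = Path(os.getcwd())  # absolute: fixed at call time
    os.chdir(tempfile.gettempdir())
    print(rel.resolve())        # the new cwd (the tmp dir)
    print(pinned)               # still the original directory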


@@ -1,9 +1,11 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
import os
import sys
import argparse
import threading
import tempfile
from time import sleep
from collections.abc import Mapping
@@ -11,10 +13,6 @@ from collections.abc import Mapping
from typing import Optional, List, IO, Union, Iterable
from pathlib import Path
from archivebox.config import DATA_DIR
from archivebox.misc.checks import check_migrations
from archivebox.misc.logging import stderr
from importlib import import_module
BUILTIN_LIST = list
@@ -135,9 +133,10 @@ def wait_for_bg_threads_to_exit(thread_names: Iterable[str]=(), ignore_names: It
if blocking_threads:
sleep(1)
if tries == 5: # only show stderr message if we need to wait more than 5s
- stderr(
+ print(
f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
threads_summary,
+ file=sys.stderr,
)
else:
return tries
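For context, a hedged reconstruction of the surrounding polling loop, assembled from the visible lines (the thread-filtering predicate and timeout handling are assumptions; the real signature also takes thread-name filters):

    import sys
    import threading
    from time import sleep

    def wait_for_bg_threads_to_exit(timeout: int = 60) -> int:
        for tries in range(timeout):
            blocking_threads = [
                t for t in threading.enumerate()
                if t is not threading.main_thread() and not t.daemon  # assumed filter
            ]
            if not blocking_threads:
                return tries
            if tries == 5:  # only show stderr message if we need to wait more than 5s
                print(
                    f'[…] Waiting up to {timeout}s for background jobs (e.g. webhooks) to finish...',
                    file=sys.stderr,
                )
            sleep(1)
        raise Exception(f'Background threads failed to exit within {timeout}s')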
@@ -154,7 +153,11 @@ def run_subcommand(subcommand: str,
subcommand_args = subcommand_args or []
from archivebox.misc.checks import check_migrations
from archivebox.config.legacy import setup_django
# print('DATA_DIR is', DATA_DIR)
# print('pwd is', os.getcwd())
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
@@ -237,12 +240,10 @@ def main(args: List[str] | Omitted=OMITTED, stdin: IO | Omitted=OMITTED, pwd: st
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin or None,
pwd=pwd or DATA_DIR,
)
run_subcommand(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin or None,
pwd=pwd or DATA_DIR,
)
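For reference, subcommands are dispatched by importing a module by name (hence the `import_module` import earlier in this file); a hypothetical minimal version of the pattern, with the module layout assumed:

    from importlib import import_module

    def run_subcommand(subcommand: str, subcommand_args=None, stdin=None, pwd=None):
        # assumed layout: each subcommand lives in archivebox/cli/archivebox_<subcommand>.py
        module = import_module(f'.archivebox_{subcommand}', package='archivebox.cli')
        module.main(args=subcommand_args or [], stdin=stdin, pwd=pwd)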


@@ -17,7 +17,7 @@ from ..misc.logging import DEFAULT_CLI_COLORS
###################### Config ##########################
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent # archivebox source code dir
- DATA_DIR: Path = Path(os.curdir).resolve() # archivebox user data dir
+ DATA_DIR: Path = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR: Path = DATA_DIR / 'archive' # archivebox snapshot data dir
def _detect_installed_version(PACKAGE_DIR: Path):


@@ -207,11 +207,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
# 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
# 'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
# 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
# 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
# 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@@ -427,74 +422,6 @@ def load_config(defaults: ConfigDefaultDict,
# Dependency Metadata Helpers
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
abspath = bin_path(binary)
if not binary or not abspath:
return None
return '999.999.999'
# Now handled by new BinProvider plugin system, no longer needed:
try:
bin_env = os.environ | {'LANG': 'C'}
is_cmd_str = cmd and isinstance(cmd, str)
version_str = (
run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT, env=bin_env)
.stdout.strip()
.decode()
)
if not version_str:
version_str = (
run(cmd or [abspath, "--version"], timeout=timeout, shell=is_cmd_str, stdout=PIPE, stderr=STDOUT)
.stdout.strip()
.decode()
)
# take first 3 columns of first line of version info
semver = SemVer.parse(version_str)
if semver:
return str(semver)
except (OSError, TimeoutExpired):
pass
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
# stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
# stderr(f' {binary} --version')
# stderr()
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
return None
def bin_path(binary: Optional[str]) -> Optional[str]:
if binary is None:
return None
node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
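Worth noting: the lookup order in `bin_path` prefers a project-local `./node_modules/.bin` shim over anything on `$PATH`, falling back to the bare name so callers can still surface a useful error. For example (binary name illustrative):

    bin_path('single-file')  # -> ./node_modules/.bin/single-file if it exists,
                             #    else the first shutil.which() match (~-expanded form tried first),
                             #    else the literal string 'single-file'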
def bin_hash(binary: Optional[str]) -> Optional[str]:
return 'UNUSED'
# DEPRECATED: now handled by new BinProvider plugin system, no longer needed:
if binary is None:
return None
abs_path = bin_path(binary)
if abs_path is None or not Path(abs_path).exists():
return None
file_hash = md5()
with io.open(abs_path, mode='rb') as f:
for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
file_hash.update(chunk)
return f'md5:{file_hash.hexdigest()}'
def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
@@ -567,116 +494,6 @@ def wget_supports_compression(config):
return False
def get_dependency_info(config: benedict) -> ConfigValue:
return {
# 'PYTHON_BINARY': {
# 'path': bin_path(config['PYTHON_BINARY']),
# 'version': config['PYTHON_VERSION'],
# 'hash': bin_hash(config['PYTHON_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['PYTHON_VERSION']),
# },
# 'SQLITE_BINARY': {
# 'path': bin_path(config['SQLITE_BINARY']),
# 'version': config['SQLITE_VERSION'],
# 'hash': bin_hash(config['SQLITE_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['SQLITE_VERSION']),
# },
# 'DJANGO_BINARY': {
# 'path': bin_path(config['DJANGO_BINARY']),
# 'version': config['DJANGO_VERSION'],
# 'hash': bin_hash(config['DJANGO_BINARY']),
# 'enabled': True,
# 'is_valid': bool(config['DJANGO_VERSION']),
# },
# 'ARCHIVEBOX_BINARY': {
# 'path': bin_path(config['ARCHIVEBOX_BINARY']),
# 'version': config['VERSION'],
# 'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
# 'enabled': True,
# 'is_valid': True,
# },
# 'CURL_BINARY': {
# 'path': bin_path(config['CURL_BINARY']),
# 'version': config['CURL_VERSION'],
# 'hash': bin_hash(config['CURL_BINARY']),
# 'enabled': config['USE_CURL'],
# 'is_valid': bool(config['CURL_VERSION']),
# },
# 'WGET_BINARY': {
# 'path': bin_path(config['WGET_BINARY']),
# 'version': config['WGET_VERSION'],
# 'hash': bin_hash(config['WGET_BINARY']),
# 'enabled': config['USE_WGET'],
# 'is_valid': bool(config['WGET_VERSION']),
# },
# 'NODE_BINARY': {
# 'path': bin_path(config['NODE_BINARY']),
# 'version': config['NODE_VERSION'],
# 'hash': bin_hash(config['NODE_BINARY']),
# 'enabled': config['USE_NODE'],
# 'is_valid': bool(config['NODE_VERSION']),
# },
# 'MERCURY_BINARY': {
# 'path': bin_path(config['MERCURY_BINARY']),
# 'version': config['MERCURY_VERSION'],
# 'hash': bin_hash(config['MERCURY_BINARY']),
# 'enabled': config['USE_MERCURY'],
# 'is_valid': bool(config['MERCURY_VERSION']),
# },
# 'GIT_BINARY': {
# 'path': bin_path(config['GIT_BINARY']),
# 'version': config['GIT_VERSION'],
# 'hash': bin_hash(config['GIT_BINARY']),
# 'enabled': config['USE_GIT'],
# 'is_valid': bool(config['GIT_VERSION']),
# },
# 'SINGLEFILE_BINARY': {
# 'path': bin_path(config['SINGLEFILE_BINARY']),
# 'version': config['SINGLEFILE_VERSION'],
# 'hash': bin_hash(config['SINGLEFILE_BINARY']),
# 'enabled': config['USE_SINGLEFILE'],
# 'is_valid': bool(config['SINGLEFILE_VERSION']),
# },
# 'READABILITY_BINARY': {
# 'path': bin_path(config['READABILITY_BINARY']),
# 'version': config['READABILITY_VERSION'],
# 'hash': bin_hash(config['READABILITY_BINARY']),
# 'enabled': config['USE_READABILITY'],
# 'is_valid': bool(config['READABILITY_VERSION']),
# },
# 'YOUTUBEDL_BINARY': {
# 'path': bin_path(config['YOUTUBEDL_BINARY']),
# 'version': config['YOUTUBEDL_VERSION'],
# 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
# 'enabled': config['USE_YOUTUBEDL'],
# 'is_valid': bool(config['YOUTUBEDL_VERSION']),
# },
# 'CHROME_BINARY': {
# 'path': bin_path(config['CHROME_BINARY']),
# 'version': config['CHROME_VERSION'],
# 'hash': bin_hash(config['CHROME_BINARY']),
# 'enabled': config['USE_CHROME'],
# 'is_valid': bool(config['CHROME_VERSION']),
# },
# 'RIPGREP_BINARY': {
# 'path': bin_path(config['RIPGREP_BINARY']),
# 'version': config['RIPGREP_VERSION'],
# 'hash': bin_hash(config['RIPGREP_BINARY']),
# 'enabled': config['USE_RIPGREP'],
# 'is_valid': bool(config['RIPGREP_VERSION']),
# },
# 'SONIC_BINARY': {
# 'path': bin_path(config['SONIC_BINARY']),
# 'version': config['SONIC_VERSION'],
# 'hash': bin_hash(config['SONIC_BINARY']),
# 'enabled': config['USE_SONIC'],
# 'is_valid': bool(config['SONIC_VERSION']),
# },
}
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************


@@ -9,7 +9,7 @@ import django.db.models.deletion
from index.json import to_json
- DATA_DIR = Path(os.curdir).resolve() # archivebox user data dir
+ DATA_DIR = Path(os.getcwd()).resolve() # archivebox user data dir
ARCHIVE_DIR = DATA_DIR / 'archive' # archivebox snapshot data dir


@@ -227,7 +227,7 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print()
- def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str):
+ def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str | IO], pwd: str='.'):
args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),


@@ -15,7 +15,7 @@ import machineid # https://github.com/keygen-sh/py-machineid
from rich import print
PACKAGE_DIR = Path(__file__).parent
- DATA_DIR = Path('.').resolve()
+ DATA_DIR = Path(os.getcwd()).resolve()
def get_vm_info():
hw_in_docker = bool(os.getenv('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE'))


@@ -138,7 +138,7 @@ def help(out_dir: Path=DATA_DIR) -> None:
''')
- if CONSTANTS.DATABASE_FILE.exists():
+ if CONSTANTS.ARCHIVE_DIR.exists():
pretty_out_dir = str(out_dir).replace(str(Path('~').expanduser()), '~')
EXAMPLE_USAGE = f'''
[light_slate_blue]DATA DIR[/light_slate_blue]: [yellow]{pretty_out_dir}[/yellow]
@@ -254,7 +254,7 @@ def version(quiet: bool=False,
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
prnt()
- if CONSTANTS.DATABASE_FILE.exists() or CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
+ if CONSTANTS.ARCHIVE_DIR.exists() or CONSTANTS.CONFIG_FILE.exists():
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in CONSTANTS.DATA_LOCATIONS.items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)


@@ -111,10 +111,10 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
@enforce_types
- def chmod_file(path: str, cwd: str='.') -> None:
+ def chmod_file(path: str, cwd: str='') -> None:
"""chmod -R <permissions> <cwd>/<path>"""
- root = Path(cwd) / path
+ root = Path(cwd or os.getcwd()) / path
if not root.exists():
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
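With the empty-string sentinel, the base directory is looked up when `chmod_file` actually runs instead of being frozen as the relative `'.'` default. Example calls (paths illustrative):

    chmod_file('archive/1696200000')               # resolved against os.getcwd() at call time
    chmod_file('archive/1696200000', cwd='/data')  # or against an explicit absolute base dir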