2020-12-20 01:11:19 +00:00
"""
ArchiveBox config definitions (including defaults and dynamic config options).
Config Usage Example :
archivebox config - - set MEDIA_TIMEOUT = 600
env MEDIA_TIMEOUT = 600 USE_COLOR = False . . . archivebox [ subcommand ] . . .
Config Precedence Order :
1. cli args ( - - update - all / - - index - only / etc . )
2. shell environment vars ( env USE_COLOR = False archivebox add ' ... ' )
3. config file ( echo " SAVE_FAVICON=False " >> ArchiveBox . conf )
4. defaults ( defined below in Python )
Documentation :
https : / / github . com / ArchiveBox / ArchiveBox / wiki / Configuration
"""
2020-10-31 07:08:03 +00:00
__package__ = ' archivebox '
2019-04-17 07:49:18 +00:00
2017-07-04 10:38:07 +00:00
import os
2019-04-22 18:34:12 +00:00
import io
2019-03-23 02:05:45 +00:00
import re
2017-07-04 10:48:12 +00:00
import sys
2020-08-18 19:00:00 +00:00
import json
2022-06-09 01:35:31 +00:00
import inspect
2019-04-17 07:49:18 +00:00
import getpass
2017-07-05 21:33:51 +00:00
import shutil
2023-11-28 04:56:30 +00:00
import requests
2017-07-04 10:48:12 +00:00
2019-04-22 18:34:12 +00:00
from hashlib import md5
2020-07-28 11:20:57 +00:00
from pathlib import Path
2021-04-10 08:19:30 +00:00
from datetime import datetime , timezone
2024-09-25 02:04:38 +00:00
from typing import Optional , Type , Tuple , Dict
2024-09-21 08:53:59 +00:00
from subprocess import run , PIPE , DEVNULL , STDOUT , TimeoutExpired
2019-04-25 23:03:38 +00:00
from configparser import ConfigParser
2023-11-04 03:07:39 +00:00
import importlib . metadata
2017-07-04 10:38:07 +00:00
2024-08-23 09:01:24 +00:00
from pydantic_pkgr import SemVer
2024-09-25 02:04:38 +00:00
from rich . progress import Progress
2024-09-25 02:37:29 +00:00
from rich . console import Console
2024-08-23 09:01:24 +00:00
import django
from django . db . backends . sqlite3 . base import Database as sqlite3
2020-10-31 07:08:03 +00:00
from . config_stubs import (
2024-08-21 01:31:21 +00:00
AttrDict ,
2019-04-24 15:36:14 +00:00
SimpleConfigValueDict ,
ConfigValue ,
ConfigDict ,
ConfigDefaultValue ,
ConfigDefaultDict ,
)
2024-09-25 02:04:38 +00:00
from . misc . logging import (
DEFAULT_CLI_COLORS ,
ANSI ,
COLOR_DICT ,
stderr ,
hint ,
)
from . misc . checks import check_system_config
2024-09-21 08:53:59 +00:00
# print('STARTING CONFIG LOADING')
2024-08-23 09:01:02 +00:00
# load fallback libraries from vendor dir
from . vendor import load_vendored_libs
load_vendored_libs ( )
2024-09-21 08:53:59 +00:00
# print("LOADED VENDOR LIBS")
2022-06-09 03:04:01 +00:00
2020-12-20 01:11:19 +00:00
############################### Config Schema ##################################
2019-04-24 08:09:25 +00:00
2020-12-20 01:11:19 +00:00
# Schema of the user-settable config options, grouped by section name.
# Every option maps to a dict with:
#   'type':    the Python type declared for the value
#   'default': either a literal value, or a callable that receives a mapping `c`
#              and computes the default from other option values (e.g. c['IS_TTY'])
#   'aliases': (optional) tuple of older names for the same option
CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        # original note: "progress bars are buggy on mac, disable for now"
        # NOTE(review): the default below follows IS_TTY rather than being disabled, so that note looks stale
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'IN_DOCKER': {'type': bool, 'default': False},
        'IN_QEMU': {'type': bool, 'default': False},
        # NOTE: os.getuid() / os.getgid() are called once, when this dict literal is evaluated
        'PUID': {'type': int, 'default': os.getuid()},
        'PGID': {'type': int, 'default': os.getgid()},
    },

    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': True},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},  # TODO: move this to be a default WGET_ARGS

        'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)},  # to avoid downloading code assets as their own pages
        'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},

        'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
        'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
    },

    'SERVER_CONFIG': {
        'ADMIN_USERNAME': {'type': str, 'default': None},
        'ADMIN_PASSWORD': {'type': str, 'default': None},

        'SECRET_KEY': {'type': str, 'default': None},
        # the IN_DOCKER value indexes the list: falsy -> '127.0.0.1:8000', truthy -> '0.0.0.0:8000'
        'BIND_ADDR': {'type': str, 'default': lambda c: ['127.0.0.1:8000', '0.0.0.0:8000'][c['IN_DOCKER']]},
        'ALLOWED_HOSTS': {'type': str, 'default': '*'},  # e.g. archivebox.example.com,archivebox2.example.com
        'CSRF_TRUSTED_ORIGINS': {'type': str, 'default': lambda c: 'http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000,http://{}'.format(c['BIND_ADDR'])},  # e.g. https://archivebox.example.com,https://archivebox2.example.com:8080
        'DEBUG': {'type': bool, 'default': False},
        'PUBLIC_INDEX': {'type': bool, 'default': True},
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.'},
        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
        'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
        # NOTE(review): both TIME_ZONE and TIMEZONE are declared with the same default; confirm which one callers read
        'TIME_ZONE': {'type': str, 'default': 'UTC'},
        'TIMEZONE': {'type': str, 'default': 'UTC'},
        'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
        'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
        'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
        'PREVIEW_ORIGINALS': {'type': bool, 'default': True},

        # LDAP options: disabled by default, with no connection/attribute settings preset
        'LDAP': {'type': bool, 'default': False},
        'LDAP_SERVER_URI': {'type': str, 'default': None},
        'LDAP_BIND_DN': {'type': str, 'default': None},
        'LDAP_BIND_PASSWORD': {'type': str, 'default': None},
        'LDAP_USER_BASE': {'type': str, 'default': None},
        'LDAP_USER_FILTER': {'type': str, 'default': None},
        'LDAP_USERNAME_ATTR': {'type': str, 'default': None},
        'LDAP_FIRSTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_LASTNAME_ATTR': {'type': str, 'default': None},
        'LDAP_EMAIL_ATTR': {'type': str, 'default': None},
        'LDAP_CREATE_SUPERUSER': {'type': bool, 'default': False},
    },

    # One on/off toggle per archive method; each SAVE_* option also lists an older alias name
    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_SINGLEFILE': {'type': bool, 'default': True, 'aliases': ('FETCH_SINGLEFILE',)},
        'SAVE_READABILITY': {'type': bool, 'default': True, 'aliases': ('FETCH_READABILITY',)},
        'SAVE_MERCURY': {'type': bool, 'default': True, 'aliases': ('FETCH_MERCURY',)},
        'SAVE_HTMLTOTEXT': {'type': bool, 'default': True, 'aliases': ('FETCH_HTMLTOTEXT',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_HEADERS': {'type': bool, 'default': True, 'aliases': ('FETCH_HEADERS',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
        'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
        'SAVE_DENYLIST': {'type': dict, 'default': {},},
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION', 'WINDOW_SIZE')},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        # the {VERSION} / {CURL_VERSION} / {WGET_VERSION} placeholders are left unformatted here
        # (presumably filled in later, once those values are known -- TODO confirm)
        'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
        'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},

        'CHROME_TIMEOUT': {'type': int, 'default': 0},
        'CHROME_HEADLESS': {'type': bool, 'default': True},
        # sandbox defaults to off when IN_DOCKER is truthy
        'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
        'CHROME_EXTRA_ARGS': {'type': list, 'default': None},

        'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
            '--restrict-filenames',
            '--trim-filenames', '128',
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            # This flag doesn't exist in youtube-dl
            # only in yt-dlp
            '--no-abort-on-error',
            # --ignore-errors must come AFTER
            # --no-abort-on-error
            # https://github.com/yt-dlp/yt-dlp/issues/4914
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            # MEDIA_MAX_SIZE is substituted into both the filesize and filesize_approx filters
            '--format=(bv*+ba/b)[filesize<={}][filesize_approx<=?{}]/(bv*+ba/b)'.format(c['MEDIA_MAX_SIZE'], c['MEDIA_MAX_SIZE']),
        ]},
        'YOUTUBEDL_EXTRA_ARGS': {'type': list, 'default': None},

        'WGET_ARGS': {'type': list, 'default': [
            '--no-verbose',
            '--adjust-extension',
            '--convert-links',
            '--force-directories',
            '--backup-converted',
            '--span-hosts',
            '--no-parent',
            '-e', 'robots=off',
        ]},
        'WGET_EXTRA_ARGS': {'type': list, 'default': None},
        'CURL_ARGS': {'type': list, 'default': [
            '--silent',
            '--location',
            '--compressed'
        ]},
        'CURL_EXTRA_ARGS': {'type': list, 'default': None},
        'GIT_ARGS': {'type': list, 'default': ['--recursive']},
        'SINGLEFILE_ARGS': {'type': list, 'default': None},
        'SINGLEFILE_EXTRA_ARGS': {'type': list, 'default': None},
        'MERCURY_ARGS': {'type': list, 'default': ['--format=text']},
        'MERCURY_EXTRA_ARGS': {'type': list, 'default': None},
        # '{}' is left as a placeholder (presumably for the domain being archived -- TODO confirm)
        'FAVICON_PROVIDER': {'type': str, 'default': 'https://www.google.com/s2/favicons?domain={}'},
    },

    'SEARCH_BACKEND_CONFIG': {
        'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
        'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
        'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
        'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
        'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
        'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},
        'SEARCH_PROCESS_HTML': {'type': bool, 'default': True},
        # SONIC
        'SONIC_COLLECTION': {'type': str, 'default': 'archivebox'},
        'SONIC_BUCKET': {'type': str, 'default': 'snapshots'},
        'SEARCH_BACKEND_TIMEOUT': {'type': int, 'default': 90},
        # SQLite3 FTS5
        'FTS_SEPARATE_DATABASE': {'type': bool, 'default': True},
        'FTS_TOKENIZERS': {'type': str, 'default': 'porter unicode61 remove_diacritics 2'},
        # Default from https://www.sqlite.org/limits.html#max_length
        'FTS_SQLITE_MAX_LENGTH': {'type': int, 'default': int(1e9)},
    },

    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_SINGLEFILE': {'type': bool, 'default': True},
        'USE_READABILITY': {'type': bool, 'default': True},
        'USE_MERCURY': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_NODE': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},
        'USE_RIPGREP': {'type': bool, 'default': True},

        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},  # also can accept wget2
        # bin_path is not defined in this part of the file; wrapping the call in a lambda
        # defers the name lookup until the default is actually evaluated
        'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
        'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
        'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},  # also can accept youtube-dl
        'NODE_BINARY': {'type': str, 'default': 'node'},
        'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
        'CHROME_BINARY': {'type': str, 'default': None},

        'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
        'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},

        'READWISE_READER_TOKENS': {'type': dict, 'default': {}},
    },
}
2020-12-20 01:11:19 +00:00
########################## Backwards-Compatibility #############################
2020-10-31 07:08:03 +00:00
# for backwards compatibility with old config files, check old/deprecated names for each key
2019-04-26 18:43:13 +00:00
# Lookup table of deprecated option name -> current option name, collected from
# the optional 'aliases' tuple on every option in every CONFIG_SCHEMA section.
CONFIG_ALIASES = dict(
    (deprecated_name, current_name)
    for schema_section in CONFIG_SCHEMA.values()
    for current_name, option_schema in schema_section.items()
    for deprecated_name in option_schema.get('aliases', ())
)

# Flattened view of CONFIG_SCHEMA: option name -> its schema dict, with the
# section level removed.
USER_CONFIG = {
    option_name: option_schema
    for schema_section in CONFIG_SCHEMA.values()
    for option_name, option_schema in schema_section.items()
}
2020-10-31 07:08:03 +00:00
2019-04-26 18:43:13 +00:00
def get_real_name(key: str) -> str:
    """Resolve a (possibly deprecated) config key to its current name.

    The key is upper-cased and stripped of surrounding whitespace, then looked
    up in CONFIG_ALIASES; keys that are not aliases are returned normalized.
    """
    normalized_key = key.upper().strip()
    return CONFIG_ALIASES.get(normalized_key, normalized_key)
2019-04-24 08:09:25 +00:00
2020-12-20 01:11:19 +00:00
################################ Constants #####################################
# Names of directories (relative names only, not full paths)
PACKAGE_DIR_NAME = 'archivebox'
TEMPLATES_DIR_NAME = 'templates'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
CACHE_DIR_NAME = 'cache'
LIB_DIR_NAME = 'lib'
PERSONAS_DIR_NAME = 'personas'
CRONTABS_DIR_NAME = 'crontabs'

# Names of individual files (relative names only, not full paths)
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'
2019-03-27 20:44:00 +00:00
2024-09-25 02:04:38 +00:00
2020-07-01 17:23:59 +00:00
2019-05-01 03:13:04 +00:00
# Set of file extensions (lowercase, without the leading dot) treated as static files.
STATICFILE_EXTENSIONS = {
    # 99.999% of the time, URLs ending in these extensions are static files
    # that can be downloaded as-is, not html pages that need to be rendered
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    'atom', 'rss', 'css', 'js', 'json',
    'dmg', 'iso', 'img',
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later
    # (note: hqx and rar appear in this list but are already included above)
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}
2019-04-17 03:18:42 +00:00
2021-04-12 21:06:32 +00:00
# When initializing archivebox in a new directory, we check to make sure the dir is
# actually empty so that we don't clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
# Set of file/directory names; mixes literal names with the *_DIR_NAME / *_FILENAME
# constants defined above and a few names derived from them.
ALLOWED_IN_OUTPUT_DIR = {
    # version-control, filesystem and python/node tooling artifacts
    ".gitignore",
    "lost+found",
    ".DS_Store",
    ".venv",
    "venv",
    "virtualenv",
    ".virtualenv",
    "node_modules",
    "package.json",
    "package-lock.json",
    "yarn.lock",
    "static",
    "sonic",
    "search.sqlite3",
    # directory-name constants defined above
    CRONTABS_DIR_NAME,
    ARCHIVE_DIR_NAME,
    SOURCES_DIR_NAME,
    LOGS_DIR_NAME,
    CACHE_DIR_NAME,
    LIB_DIR_NAME,
    PERSONAS_DIR_NAME,
    # sqlite index file plus same-named files with -wal / -shm suffixes
    SQL_INDEX_FILENAME,
    f"{SQL_INDEX_FILENAME}-wal",
    f"{SQL_INDEX_FILENAME}-shm",
    "queue.sqlite3",
    "queue.sqlite3-wal",
    "queue.sqlite3-shm",
    # remaining filename constants defined above
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
    ROBOTS_TXT_FILENAME,
    FAVICON_FILENAME,
    CONFIG_FILENAME,
    f"{CONFIG_FILENAME}.bak",
    "static_index.json",
}
2019-04-26 22:31:50 +00:00
2024-01-03 01:09:31 +00:00
# Combined `re` flags (case-insensitive, unicode, multiline) kept as a single int
# (presumably used when compiling URL_ALLOWLIST / URL_DENYLIST patterns -- TODO confirm against callers)
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
2024-08-21 01:31:21 +00:00
# Wraps each module-level constant in the same {'default': callable} shape used by the
# config schemas. Each lambda ignores its argument and returns the module global of the
# same name, looked up when the lambda is called (not when this dict is built).
CONSTANTS = {
    "PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
    "LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
    "TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
    "ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
    "SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
    "LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
    "CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
    "PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
    "CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
    "SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
    "JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
    "HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
    "ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
    "FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
    "CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
    "DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
    "ANSI": {'default': lambda c: ANSI},
    "COLOR_DICT": {'default': lambda c: COLOR_DICT},
    "STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
    "ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
    # ALLOWDENYLIST_REGEX_FLAGS is deliberately left out of the dict for now (entry kept commented out)
    # "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
}
2024-01-03 01:09:31 +00:00
############################## Version Config ##################################
2024-02-09 02:58:12 +00:00
def get_system_user() -> str:
    """Return the name of the OS user running this process (best-effort).

    Three lookups are tried in order: the passwd database (by effective uid),
    getpass.getuser(), and os.getlogin(). Any failure in one lookup is ignored
    and the next is tried. If none yields a non-empty name, a placeholder of
    the form 'user_<uid>' is returned instead, so this never raises.
    """
    # some host OS's are unable to provide a username (k3s, Windows), making this complicated
    # uid 999 is especially problematic and breaks many attempts

    # Option 1: passwd database (pwd is unavailable on some platforms, hence the local import)
    try:
        import pwd
        username = pwd.getpwuid(os.geteuid()).pw_name
        if username:
            return username
    except Exception:
        pass

    # Option 2: environment variables / passwd lookup done by getpass
    try:
        username = getpass.getuser()
        if username:
            return username
    except Exception:
        pass

    # Option 3: owner of the controlling terminal
    try:
        username = os.getlogin()
        if username:
            return username
    except Exception:
        pass

    # Fallback: built lazily, because os.getuid() does not exist on every platform
    # and calling it eagerly would crash before any of the lookups above could run
    getuid = getattr(os, 'getuid', None)
    uid = getuid() if getuid is not None else 'unknown'
    return f'user_{uid}'
2024-01-03 01:09:31 +00:00
def get_version(config):
    """Return the installed ArchiveBox version string.

    The version is read from the installed package metadata when available.
    Otherwise it is parsed from the `version = "..."` line of the
    pyproject.toml inside config['PACKAGE_DIR']; if that file does not exist
    (e.g. while building docs), 'dev' is returned.

    Raises:
        Exception: if pyproject.toml exists but contains no `version = ` line.
    """
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
    except FileNotFoundError:
        # building docs, pyproject.toml is not available
        return 'dev'

    # iterate over lines: iterating the string directly would yield single characters
    for line in pyproject_config.splitlines():
        if line.startswith('version = '):
            return line.split(' = ', 1)[-1].strip().strip('"')

    raise Exception('Failed to detect installed archivebox version!')
2022-06-09 08:04:55 +00:00
2023-12-18 00:57:02 +00:00
def get_commit_hash(config) -> Optional[str]:
    """Return the git commit hash of the source checkout, or None.

    Looks for a `.git` directory one level above config['PACKAGE_DIR'].
    First follows the ref named in `.git/HEAD`; if that fails for any reason,
    reads the first file found under `.git/refs/heads/`. All errors are
    swallowed, so a missing or unreadable checkout yields None.
    """
    # Attempt 1: resolve the ref that HEAD points at
    try:
        dot_git = config['PACKAGE_DIR'] / '../.git'
        head_contents = (dot_git / 'HEAD').read_text().strip()
        ref_path = head_contents.split(' ')[-1]
        return dot_git.joinpath(ref_path).read_text().strip()
    except Exception:
        pass

    # Attempt 2: take whichever branch ref the glob lists first
    try:
        heads_dir = config['PACKAGE_DIR'] / '../.git/refs/heads/'
        branch_refs = list(heads_dir.glob('*'))
        return branch_refs[0].read_text().strip()
    except Exception:
        pass

    return None
2022-06-09 08:04:55 +00:00
2023-12-18 00:57:02 +00:00
def get_build_time(config) -> str:
    """Return a string describing when this build was produced.

    Outside Docker, this is the modification time of config.py inside
    config['PACKAGE_DIR'], formatted with '%Y-%m-%d %H:%M:%S %s'.
    Inside Docker (config['IN_DOCKER'] truthy), it is whatever follows the
    last 'BUILD_END_TIME=' in /VERSION.txt, up to the end of that line.
    """
    if not config['IN_DOCKER']:
        config_py_mtime = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
        modified_at = datetime.fromtimestamp(config_py_mtime)
        return modified_at.strftime('%Y-%m-%d %H:%M:%S %s')

    version_txt = Path('/VERSION.txt').read_text()
    after_marker = version_txt.rsplit('BUILD_END_TIME=')[-1]
    return after_marker.split('\n', 1)[0]
2022-06-09 08:04:55 +00:00
2023-12-19 17:57:08 +00:00
def get_versions_available_on_github(config):
    """
    returns a dictionary containing the ArchiveBox GitHub release info for
    the recommended upgrade version and the currently installed version

    Returns None (without raising) when the check is skipped because the
    current subcommand is not a long-running one, when the GitHub API cannot
    be reached or answers with a non-200 status, or when it returns no releases.
    Otherwise returns {'recommended_version': <release dict or None>,
    'current_version': <release dict>}.
    """
    # we only want to perform the (relatively expensive) check for new versions
    # when it's most relevant, e.g. when the user runs a long-running command
    # NOTE(review): the subcommand is read from sys.argv[3]; confirm that index matches how the CLI is invoked
    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
    if subcommand_run_by_user not in long_running_commands:
        return None

    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
    request_timeout_seconds = 10

    # a version check must never hang or crash the command: bound the request time
    # and treat network failures the same way as a non-200 response
    try:
        response = requests.get(github_releases_api, timeout=request_timeout_seconds)
    except requests.RequestException as err:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (error={type(err).__name__})', color='lightyellow', config=config)
        return None

    if response.status_code != 200:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
        return None

    try:
        all_releases = response.json()
    except ValueError:
        # body was not valid JSON
        return None

    if not all_releases:
        # nothing to compare against (avoids indexing into an empty list below)
        return None

    installed_version = parse_version_string(config['VERSION'])

    # find current version or nearest older version (to link to)
    current_version = None
    idx = 0
    for idx, release in enumerate(all_releases):
        release_version = parse_version_string(release['tag_name'])
        if release_version <= installed_version:
            current_version = release
            break

    # no release at or below the installed version: fall back to the last entry in the list
    current_version = current_version or all_releases[-1]

    # recommended version is whatever comes after current_version in the release list
    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
    # NOTE(review): if the API lists newest releases first, idx + 1 is an *older* release; confirm intended direction
    try:
        recommended_version = all_releases[idx + 1]
    except IndexError:
        recommended_version = None

    return {'recommended_version': recommended_version, 'current_version': current_version}
2023-11-28 04:56:30 +00:00
def can_upgrade(config):
    """Return True when the recommended release is newer than the current one.

    Reads config['VERSIONS_AVAILABLE'], which is either falsy (no release info
    available) or a dict holding 'recommended_version' and 'current_version'
    release records, each carrying a 'tag_name'.
    """
    available = config['VERSIONS_AVAILABLE']

    # no release info at all, or nothing newer to recommend
    if not available or not available['recommended_version']:
        return False

    recommended = parse_version_string(available['recommended_version']['tag_name'])
    current = parse_version_string(available['current_version']['tag_name'])
    return recommended > current
2020-12-20 01:11:19 +00:00
############################## Derived Config ##################################
2019-04-24 08:09:25 +00:00
2024-08-21 01:31:21 +00:00
2023-12-19 06:04:11 +00:00
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
#
# Every entry maps a config key to {'default': <callable>}. The callable receives the config
# dict built so far (named `c` below) and returns the derived value for that key.
# No entry declares a 'type'; load_config_val() treats a missing type as "read-only" and
# simply calls default(c) without consulting env vars or the config file.
#
# NOTE(review): order matters, assuming this schema is loaded through load_config() like a
# regular schema (confirm in load_all_config). load_config() fills keys in dict order, so a
# lambda sees the already-derived value of any key listed above it, and the not-yet-derived
# (user-provided) value of any key listed below it. Do not reorder entries casually.
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
    **CONSTANTS,

    # evaluates to a zero-argument callable (not an int): the inner lambda is what gets stored
    'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    'USER': {'default': lambda c: get_system_user()},
    # when colors are disabled every color code is replaced by an empty string
    'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},

    # --- filesystem locations (user-provided strings are normalized to Path objects) ---
    'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
    'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
    'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},

    # falls back to the current working directory when OUTPUT_DIR is unset
    'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
    'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
    'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
    'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
    'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
    'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
    'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
    'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
    'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
    'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},

    # pre-compiled URL filters; a falsy list value is passed through unchanged instead of compiled
    'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
    'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},  # exec is always needed to list directories

    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
    'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},

    # --- version / build metadata ---
    'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]},  # remove +editable from user-displayed version string
    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},  # short git commit hash of codebase HEAD commit
    'BUILD_TIME': {'default': lambda c: get_build_time(c)},  # docker build completed time or python src last modified time

    # upgrade check is currently hard-coded off; the original computation is kept in the trailing comments
    'VERSIONS_AVAILABLE': {'default': lambda c: False},  # get_versions_available_on_github(c)},
    'CAN_UPGRADE': {'default': lambda c: False},  # can_upgrade(c)},

    # --- interpreter / framework / database info ---
    'PYTHON_BINARY': {'default': lambda c: sys.executable},
    'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
    'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},

    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'},  # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']},  # set at runtime below

    # --- per-extractor toggles ---
    # Pattern used below: USE_<TOOL> is narrowed to "enabled AND at least one SAVE_* method needs it",
    # then each SAVE_* flag is narrowed to "enabled AND its tool is usable".
    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    # user agent templates may reference other config keys, e.g. {VERSION}-style placeholders
    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
    'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
    'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
    'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
    'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},

    'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},

    'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
    'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
    'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},

    'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
    'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

    'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
    'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None},  # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
    'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
    'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

    'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
    'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},

    # an explicitly configured CHROME_BINARY wins; otherwise search the well-known binary names
    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
    'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},

    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
    # the three entries below read c['USE_NODE'] *before* USE_NODE is re-derived further down,
    # so they see whatever USE_NODE value was loaded earlier
    'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
    'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
    'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},

    'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
    'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},

    # --- aggregate info dicts built from everything derived above ---
    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
    'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
    # {regex string: value} mappings are converted to {compiled pattern: value}
    'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
    'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
2019-03-27 20:44:00 +00:00
2019-04-24 08:09:25 +00:00
2024-09-21 08:53:59 +00:00
# print("FINISHED DEFINING SCHEMAS")
2019-04-24 08:09:25 +00:00
################################### Helpers ####################################
2020-12-20 01:11:19 +00:00
2019-04-24 15:36:14 +00:00
def load_config_val(key: str,
                    default: ConfigDefaultValue=None,
                    type: Optional[Type]=None,
                    aliases: Optional[Tuple[str, ...]]=None,
                    config: Optional[ConfigDict]=None,
                    env_vars: Optional[os._Environ]=None,
                    config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
    """Resolve one config option from env vars, the config file, or its default.

    Args:
        key: canonical name of the option.
        default: fallback value, or a callable taking the config-so-far and
            returning the fallback value.
        type: expected type (bool, str, int, list or dict). None marks the
            option as read-only: only `default` is used, user input is ignored.
        aliases: alternative (e.g. legacy) names accepted for the same option.
        config: config dict built so far, passed to callable defaults.
        env_vars: mapping of environment variables (highest precedence).
        config_file_vars: mapping of values loaded from the config file.

    Returns:
        The parsed value, or the (possibly computed) default when unset.

    Raises:
        ValueError: the raw string cannot be parsed as the expected type.
        Exception: `type` is not one of the supported types.
    """
    assert isinstance(config, dict)

    is_read_only = type is None
    if is_read_only:
        if callable(default):
            return default(config)
        return default

    # get value from environment variables or config files.
    # Every name is looked up in the environment first and only then in the
    # config file, so an env var set under an alias still overrides a
    # config-file entry under the canonical key (env > config file).
    config_keys_to_check = (key, *(aliases or ()))
    matched_key, val = _first_config_value(config_keys_to_check, (env_vars, config_file_vars))

    is_unset = val is None
    if is_unset:
        if callable(default):
            return default(config)
        return default

    # calculate value based on expected type
    # (error messages report the name the user actually set, which may be an alias)
    BOOL_TRUEIES = ('true', 'yes', '1')
    BOOL_FALSEIES = ('false', 'no', '0')

    if type is bool:
        normalized = val.strip().lower()   # tolerate stray whitespace around env values
        if normalized in BOOL_TRUEIES:
            return True
        elif normalized in BOOL_FALSEIES:
            return False
        else:
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected a boolean: True/False)')

    elif type is str:
        if val.lower() in (*BOOL_TRUEIES, *BOOL_FALSEIES):
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected a string, but value looks like a boolean)')
        return val.strip()

    elif type is int:
        if not val.strip().isdigit():
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected an integer)')
        return int(val.strip())

    elif type is list or type is dict:
        return json.loads(val)

    raise Exception('Config values can only be str, bool, int, or json')


def _first_config_value(keys, sources):
    """Return (name, value) of the first non-empty value found, searching source by source.

    Falls back to the first set-but-empty value when no non-empty one exists,
    and to (None, None) when none of the names is set in any source.
    """
    fallback = (None, None)
    for source in sources:
        if not source:
            continue
        for candidate in keys:
            found = source.get(candidate)
            if found:
                return candidate, found
            if found is not None and fallback[1] is None:
                # remember an explicitly empty value, but keep looking for a real one
                fallback = (candidate, found)
    return fallback
2019-04-24 08:09:25 +00:00
2020-06-30 05:08:14 +00:00
2024-08-21 01:31:21 +00:00
def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
    """Load the ini-formatted config file from OUTPUT_DIR/ArchiveBox.conf.

    Args:
        out_dir: data directory containing the config file (str or Path).
            Defaults to the OUTPUT_DIR environment variable, or the current
            directory when that is unset.

    Returns:
        A flat mapping of UPPERCASED option name -> raw string value, merged
        across all ini sections, or None when the config file does not exist.
    """
    # normalize to a Path up front: callers may pass a plain string (as the
    # annotation allows), which has no .is_dir()
    out_dir = Path(out_dir or os.getenv('OUTPUT_DIR', '.')).resolve()
    assert out_dir.is_dir()

    config_path = out_dir / CONFIG_FILENAME
    if config_path.exists():
        config_file = ConfigParser()
        config_file.optionxform = str   # keep option names case-sensitive while parsing
        # the file is always written as utf-8, so read it back the same way
        config_file.read(config_path, encoding='utf-8')
        # flatten into one namespace
        config_file_vars = ConfigDict({
            key.upper(): val
            for options in config_file.values()
            for key, val in options.items()
        })
        # print('[i] Loaded config file', os.path.abspath(config_path))
        # print(config_file_vars)
        return config_file_vars
    return None
2020-06-30 05:12:06 +00:00
2024-08-21 01:31:21 +00:00
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
    """Write the given options into the ini-formatted OUTPUT_DIR/ArchiveBox.conf.

    The existing file is backed up to `<config>.bak`, the new values are merged
    into their schema sections, a Django SECRET_KEY is generated if missing,
    and the result is validated by re-loading the whole config. If validation
    fails the previous file content is restored and the error is re-raised.

    Args:
        config: mapping of option name -> string value to persist.
        out_dir: data directory holding the config file. Defaults to the
            OUTPUT_DIR environment variable, or the current directory.

    Returns:
        The re-loaded values of the keys that were written (UPPERCASED names).

    Raises:
        IndexError: a key is not defined in any CONFIG_SCHEMA section.
    """

    from .system import atomic_write

    CONFIG_HEADER = (
"""# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
# archivebox init
#
# A list of all possible config with documentation and examples can be found here:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
""")

    out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
    config_path = Path(out_dir) / CONFIG_FILENAME

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    config_file.optionxform = str   # keep option names case-sensitive
    # read with the same encoding the file is written and backed up with
    config_file.read(config_path, encoding='utf-8')

    # keep a backup so a broken write can be rolled back below
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(f'{config_path}.bak', old.read())

    def find_section(option: str) -> str:
        """Return the name of the CONFIG_SCHEMA section that defines `option`."""
        for section_name, section_options in CONFIG_SCHEMA.items():
            if option in section_options:
                return section_name
        # same exception type as the previous `[...][0]` lookup, with a usable message
        raise IndexError(f'Unknown config option {option!r}: not defined in any CONFIG_SCHEMA section')

    # Set up sections in empty config file
    for key, val in config.items():
        section = find_section(key)
        if section in config_file:
            existing_config = dict(config_file[section])
        else:
            existing_config = {}
        config_file[section] = ConfigDict({**existing_config, key: val})

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
    except BaseException:                                                       # lgtm [py/catch-base-exception]
        # something went horribly wrong, revert to the previous version
        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())

        raise

    if Path(f'{config_path}.bak').exists():
        os.remove(f'{config_path}.bak')

    return {
        key.upper(): CONFIG.get(key.upper())
        for key in config.keys()
    }
2019-04-25 23:03:38 +00:00
2021-09-30 15:40:13 +00:00
2019-04-25 23:03:38 +00:00
def load_config(defaults: ConfigDefaultDict,
                config: Optional[ConfigDict]=None,
                out_dir: Optional[str]=None,
                env_vars: Optional[os._Environ]=None,
                config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
    """Resolve every option of a schema on top of an existing config.

    Options are resolved in schema order via load_config_val(), each one seeing
    the values resolved before it. Any failure prints a help message to stderr
    and exits with status 2; a KeyboardInterrupt exits with status 0.

    Args:
        defaults: schema mapping option name -> {'default', 'type', 'aliases'}.
        config: already-resolved config to extend (it is copied, not mutated).
        out_dir: data directory used to locate the config file when
            `config_file_vars` is not given.
        env_vars: environment mapping, defaults to os.environ.
        config_file_vars: values from the config file, loaded on demand.

    Returns:
        An AttrDict holding `config` plus every option from `defaults`.
    """
    if not env_vars:
        env_vars = os.environ
    if not config_file_vars:
        config_file_vars = load_config_file(out_dir=out_dir)

    resolved: ConfigDict = config.copy() if config else {}

    for option, spec in defaults.items():
        try:
            # print('LOADING CONFIG KEY:', option, 'DEFAULT=', spec)
            resolved[option] = load_config_val(
                option,
                default=spec['default'],
                type=spec.get('type'),
                aliases=spec.get('aliases'),
                config=resolved,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Exception as err:
            stderr()
            stderr(f'[X] Error while loading configuration value: {option}', color='red', config=resolved)
            stderr(f'{err.__class__.__name__}: {err}')
            stderr()
            stderr('Check your config for mistakes and try again (your archive data is unaffected).')
            stderr()
            stderr('For config documentation and examples see:')
            stderr('https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
            stderr()
            # raise
            raise SystemExit(2)

    return AttrDict(resolved)
2019-04-24 08:09:25 +00:00
2019-04-25 23:03:38 +00:00
2024-01-03 01:17:35 +00:00
def parse_version_string(version: str) -> Tuple[int, int, int]:
    """Parse a version string like 'vX.Y.Z' into a (major, minor, patch) tuple of ints.

    A leading 'v', a '+local' suffix (e.g. '+editable') and any pre-release or
    dev suffix (e.g. 'rc50', '-rc1', '.dev1') are ignored. Missing minor or
    patch components are filled with 0 so the result always has three items.

    Raises:
        ValueError: no numeric version could be found in `version`.
    """
    base = version.split('+', 1)[0]   # drop '+editable' style local suffix

    # take the first dotted run of up to three numbers, whatever surrounds it
    match = re.search(r'\d+(?:\.\d+){0,2}', base)
    if match is None:
        raise ValueError(f'Invalid version string: {version!r}')

    parts = [int(part) for part in match.group(0).split('.')]
    parts += [0] * (3 - len(parts))   # pad 'v1.2' -> (1, 2, 0)
    return (parts[0], parts[1], parts[2])
2023-11-28 04:56:30 +00:00
2020-12-20 01:11:19 +00:00
2019-03-26 09:31:27 +00:00
2020-12-20 01:11:19 +00:00
# Dependency Metadata Helpers
2024-09-21 08:55:09 +00:00
def bin_version(binary: Optional[str], cmd: Optional[str]=None, timeout: int=3) -> Optional[str]:
    """Return a placeholder version for a configured binary, or None if there is none.

    Real version detection is now handled by the BinProvider plugin system, so
    this no longer runs `<binary> --version`; it only reports whether a binary
    is configured at all. The previous subprocess-based implementation sat
    unreachable behind the early return and has been removed.

    Args:
        binary: name or path of the binary; None/'' means "not configured".
        cmd: unused, kept for backward compatibility with existing callers.
        timeout: unused, kept for backward compatibility with existing callers.

    Returns:
        '999.999.999' when a binary is configured, otherwise None.
    """
    # check the cheap condition first so an unset binary never triggers a PATH lookup
    if not binary:
        return None

    if not bin_path(binary):
        return None

    return '999.999.999'
2019-04-11 07:42:35 +00:00
2019-04-24 15:36:14 +00:00
def bin_path(binary: Optional[str]) -> Optional[str]:
    """Locate a binary, preferring ./node_modules/.bin over the system PATH.

    Returns None for None, the resolved node_modules path if one exists, the
    PATH lookup result otherwise, and finally `binary` itself unchanged when
    nothing could be found.
    """
    if binary is None:
        return None

    # a locally installed npm binary takes priority over anything on PATH
    local_candidate = Path('.').joinpath('node_modules', '.bin', binary)
    if local_candidate.exists():
        return str(local_candidate.resolve())

    # try the ~-expanded form first, then the name exactly as given
    for candidate in (str(Path(binary).expanduser()), str(binary)):
        located = shutil.which(candidate)
        if located:
            return located

    return binary
2019-04-24 15:36:14 +00:00
def bin_hash(binary: Optional[str]) -> Optional[str]:
    """Deprecated stub: always returns the placeholder string 'UNUSED'.

    Binary hashing is now handled by the BinProvider plugin system. The old
    md5-of-file implementation sat unreachable behind the early return and has
    been removed; the function is kept so existing callers keep working.
    """
    return 'UNUSED'
2019-04-11 07:42:35 +00:00
def find_chrome_binary() -> Optional[str]:
    """Return the first well-known chrome/chromium binary name found on this system.

    The candidate is returned exactly as listed below (not as the resolved
    absolute path); None is returned when no candidate is found.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    candidates = (
        # '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )
    return next((candidate for candidate in candidates if shutil.which(candidate)), None)
2019-03-26 09:31:27 +00:00
def find_chrome_data_dir() -> Optional[str]:
    """Deprecated: always returns None (chrome profile auto-detection is disabled)."""
    # Auto-detecting the user's chrome profile was removed because it is DANGEROUS;
    # do not re-implement or uncomment the lookup below.
    # We want to discourage archiving with a main chrome profile: session tokens,
    # personal data, and cookies are often echoed back in server responses, and once
    # those responses are archived the credentials are effectively burned, because anyone
    # who can view the archive can use them to impersonate the logged-in user.
    # Users should always create dedicated burner profiles for archiving instead of
    # using their daily-driver accounts.

    # # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # # make sure data dir finding precedence order always matches binary finding order
    # default_profile_paths = (
    #     '~/.config/chromium',
    #     '~/Library/Application Support/Chromium',
    #     '~/AppData/Local/Chromium/User Data',
    #     '~/.config/chrome',
    #     '~/.config/google-chrome',
    #     '~/Library/Application Support/Google/Chrome',
    #     '~/AppData/Local/Google/Chrome/User Data',
    #     '~/.config/google-chrome-stable',
    #     '~/.config/google-chrome-beta',
    #     '~/Library/Application Support/Google/Chrome Canary',
    #     '~/AppData/Local/Google/Chrome SxS/User Data',
    #     '~/.config/google-chrome-unstable',
    #     '~/.config/google-chrome-dev',
    # )
    # for path in default_profile_paths:
    #     full_path = Path(path).resolve()
    #     if full_path.exists():
    #         return full_path
    return None
2019-04-24 08:09:25 +00:00
def wget_supports_compression(config, timeout: int=3):
    """Check whether the configured wget accepts the `--compression=auto` flag.

    Runs `<WGET_BINARY> --compression=auto --help` and treats a zero exit code
    as support for the flag.

    Args:
        config: config mapping providing 'WGET_BINARY'.
        timeout: seconds to wait for wget before giving up, so a hung or
            misbehaving binary cannot block config loading indefinitely.

    Returns:
        True if the flag is accepted; False if it is rejected, the binary is
        unset/missing/not executable, or the probe timed out.
    """
    wget_binary = config['WGET_BINARY']
    if not wget_binary:
        # no binary configured: nothing to probe
        return False

    cmd = [
        wget_binary,
        "--compression=auto",
        "--help",
    ]
    try:
        return run(cmd, stdout=DEVNULL, stderr=DEVNULL, timeout=timeout).returncode == 0
    except (OSError, TimeoutExpired):
        # OSError also covers FileNotFoundError and PermissionError
        return False
2019-03-26 23:21:34 +00:00
2019-04-24 15:36:14 +00:00
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
    """Describe the directories that hold ArchiveBox's code and libraries.

    Returns a dict keyed by location name; each entry holds the resolved
    'path', an 'enabled' flag (always True) and an 'is_valid' flag.
    """
    package_dir = config['PACKAGE_DIR']
    templates_dir = config['TEMPLATES_DIR']
    lib_dir = config['LIB_DIR']

    def describe(location, is_valid):
        return {
            'path': location.resolve(),
            'enabled': True,
            'is_valid': is_valid,
        }

    return {
        # valid when the package entry point is present
        'PACKAGE_DIR': describe(package_dir, (package_dir / '__main__.py').exists()),
        # valid when the bundled static assets are present
        'TEMPLATES_DIR': describe(templates_dir, (templates_dir / 'static').exists()),
        'LIB_DIR': describe(lib_dir, lib_dir.is_dir()),
        # 'NODE_MODULES_DIR': {
        #     'path': ,
        #     'enabled': ,
        #     'is_valid': (...).exists(),
        # },
    }
2019-04-24 15:36:14 +00:00
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """Describe the on-disk locations that make up a collection's data directory.

    Returns a dict keyed by location name. Every entry has:
      - ``path``: the resolved path (for CUSTOM_TEMPLATES_DIR, the raw falsy
        config value is passed through when the option is unset)
      - ``enabled``: True, except CUSTOM_TEMPLATES_DIR which is enabled only when set
      - ``is_valid``: always a bool; whether the location currently exists
        (for OUTPUT_DIR: whether the SQL index file exists inside it)
    OUTPUT_DIR, SQL_INDEX and ARCHIVE_DIR additionally report ``is_mount``.
    """
    output_dir = config["OUTPUT_DIR"]
    sql_index = output_dir / SQL_INDEX_FILENAME
    custom_templates_dir = config["CUSTOM_TEMPLATES_DIR"]

    def location_info(path, check_mount=False):
        # shared shape for locations that are always enabled and valid when they exist
        info = {
            "path": path.resolve(),
            "enabled": True,
            "is_valid": path.exists(),
        }
        if check_mount:
            info["is_mount"] = os.path.ismount(path.resolve())
        return info

    return {
        # OLD: migrating to personas
        # 'CHROME_USER_DATA_DIR': {
        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
        # },
        # 'COOKIES_FILE': {
        #     'path': os.path.abspath(config['COOKIES_FILE']),
        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
        # },
        "OUTPUT_DIR": {
            "path": output_dir.resolve(),
            "enabled": True,
            # the data dir only counts as valid once it contains the SQL index
            "is_valid": sql_index.exists(),
            "is_mount": os.path.ismount(output_dir.resolve()),
        },
        "CONFIG_FILE": location_info(config["CONFIG_FILE"]),
        "SQL_INDEX": location_info(sql_index, check_mount=True),
        "ARCHIVE_DIR": location_info(config["ARCHIVE_DIR"], check_mount=True),
        "SOURCES_DIR": location_info(config["SOURCES_DIR"]),
        "PERSONAS_DIR": location_info(config["PERSONAS_DIR"]),
        "LOGS_DIR": location_info(config["LOGS_DIR"]),
        "CACHE_DIR": location_info(config["CACHE_DIR"]),
        "CUSTOM_TEMPLATES_DIR": {
            "path": custom_templates_dir and Path(custom_templates_dir).resolve(),
            "enabled": bool(custom_templates_dir),
            # coerce to bool so an unset option reports False instead of leaking None/''
            "is_valid": bool(custom_templates_dir) and Path(custom_templates_dir).exists(),
        },
        # managed by bin/docker_entrypoint.sh and python-crontab:
        # 'CRONTABS_DIR': {
        #     'path': config['CRONTABS_DIR'].resolve(),
        #     'enabled': True,
        #     'is_valid': config['CRONTABS_DIR'].exists(),
        # },
    }
2019-04-24 15:36:14 +00:00
def get_dependency_info(config: ConfigDict) -> ConfigValue:
    """Summarize the external binaries ArchiveBox can use.

    Returns a dict keyed by ``<NAME>_BINARY``; each value holds the binary's
    ``path`` (via bin_path), its configured ``version``, its ``hash`` (via
    bin_hash), whether it is ``enabled`` (the matching ``USE_<NAME>`` option),
    and ``is_valid`` (True when a version was detected).
    """
    # NOTE: PYTHON_BINARY, SQLITE_BINARY, DJANGO_BINARY and ARCHIVEBOX_BINARY used to be
    # reported here as well (always enabled); those entries are currently disabled.
    # TODO: add an entry for the sonic search backend?
    #       (SONIC_BINARY / SONIC_VERSION / USE_SONIC)
    dependencies = (
        # (binary option,       version option,        enabled option)
        ('CURL_BINARY',         'CURL_VERSION',        'USE_CURL'),
        ('WGET_BINARY',         'WGET_VERSION',        'USE_WGET'),
        ('NODE_BINARY',         'NODE_VERSION',        'USE_NODE'),
        ('SINGLEFILE_BINARY',   'SINGLEFILE_VERSION',  'USE_SINGLEFILE'),
        ('READABILITY_BINARY',  'READABILITY_VERSION', 'USE_READABILITY'),
        ('MERCURY_BINARY',      'MERCURY_VERSION',     'USE_MERCURY'),
        ('GIT_BINARY',          'GIT_VERSION',         'USE_GIT'),
        ('YOUTUBEDL_BINARY',    'YOUTUBEDL_VERSION',   'USE_YOUTUBEDL'),
        ('CHROME_BINARY',       'CHROME_VERSION',      'USE_CHROME'),
        ('RIPGREP_BINARY',      'RIPGREP_VERSION',     'USE_RIPGREP'),
    )
    return {
        binary_key: {
            'path': bin_path(config[binary_key]),
            'version': config[version_key],
            'hash': bin_hash(config[binary_key]),
            'enabled': config[enabled_key],
            'is_valid': bool(config[version_key]),
        }
        for binary_key, version_key, enabled_key in dependencies
    }
2019-03-23 02:05:45 +00:00
2019-04-24 15:36:14 +00:00
def get_chrome_info(config: ConfigDict) -> ConfigValue:
    """Collect the Chrome-related settings into a single dict.

    Every value is copied straight from ``config`` except CHROME_BINARY,
    which is passed through bin_path() first.
    """
    reported_options = (
        'TIMEOUT',
        'RESOLUTION',
        'CHECK_SSL_VALIDITY',
        'CHROME_BINARY',
        'CHROME_TIMEOUT',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )
    return {
        option: bin_path(config[option]) if option == 'CHROME_BINARY' else config[option]
        for option in reported_options
    }
2019-04-11 07:42:35 +00:00
2019-03-23 03:00:53 +00:00
2020-12-20 01:11:19 +00:00
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************
# ******* (compile the defaults, configs, and metadata all into CONFIG) ********
# ******************************************************************************
# ******************************************************************************
2019-04-24 08:09:25 +00:00
2019-04-25 23:00:25 +00:00
def load_all_config():
    """Build the full config by loading every schema section in order.

    Each section of CONFIG_SCHEMA is loaded on top of the config accumulated
    so far, and DYNAMIC_CONFIG_SCHEMA is applied last to the result.
    """
    merged: ConfigDict = ConfigDict()
    for schema_section in CONFIG_SCHEMA.values():
        # print('LOADING CONFIG SECTION:', section_name)
        merged = load_config(schema_section, merged)

    # print("LOADING CONFIG SECTION:", 'DYNAMIC')
    return load_config(DYNAMIC_CONFIG_SCHEMA, merged)
2019-04-25 23:00:25 +00:00
2020-12-20 01:11:19 +00:00
# add all final config values in CONFIG to globals in this file
CONFIG: ConfigDict = load_all_config()
globals().update(CONFIG)
# this lets us do: from .config import DEBUG, MEDIA_TIMEOUT, ...

# print("FINISHED LOADING CONFIG USING SCHEMAS + FILE + ENV")

# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************
# ******************************************************************************


########################### System Environment Setup ###########################

# Set timezone to UTC and umask to the complement of DIR_OUTPUT_PERMISSIONS
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # noqa: F821
os.environ["TZ"] = TIMEZONE                                                  # noqa: F821
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))                        # noqa: F821

# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
# NOTE: sys.path is only the Python *module* search path and has no effect on
# executable lookup, so the directory is also added to the PATH env variable.
# The sys.path entry is kept only for backwards compatibility.
sys.path.append(CONFIG.NODE_BIN_PATH)
_node_bin_path = str(CONFIG.NODE_BIN_PATH)
_existing_path = os.environ.get('PATH', '')
if _node_bin_path not in _existing_path.split(os.pathsep):
    # appended last so binaries that already resolve via PATH keep their priority
    os.environ['PATH'] = f'{_existing_path}{os.pathsep}{_node_bin_path}' if _existing_path else _node_bin_path

# OPTIONAL: also look around the host system for node modules to use
#   avoid enabling this unless absolutely needed,
#   having overlapping potential sources of libs is a big source of bugs/confusing to users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)

# disable stderr "you really shouldnt disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
    import urllib3
    # silence both the copy of urllib3 reachable through requests and the top-level package
    requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# get SQLite database version, compile options, and runtime options
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()

########################### Config Validity Checkers ###########################

# export the color/progress preferences through the environment before the console is rebuilt
if not CONFIG.USE_COLOR:
    os.environ['NO_COLOR'] = '1'
if not CONFIG.SHOW_PROGRESS:
    os.environ['TERM'] = 'dumb'

# recreate rich console obj based on new config values
CONSOLE = Console()
from .misc import logging
logging.CONSOLE = CONSOLE

# progress bar state shared between setup_django() and bump_startup_progress_bar();
# INITIAL_STARTUP_PROGRESS stays None whenever no startup progress bar is active
INITIAL_STARTUP_PROGRESS = None
INITIAL_STARTUP_PROGRESS_TASK = 0
2019-04-11 07:42:35 +00:00
2024-09-25 02:04:38 +00:00
def bump_startup_progress_bar():
    """Advance the startup progress bar by one step; do nothing if none is active."""
    active_progress = INITIAL_STARTUP_PROGRESS
    if not active_progress:
        return
    active_progress.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1)   # type: ignore
2021-09-30 15:40:13 +00:00
2024-09-25 02:04:38 +00:00
def _warn_if_sqlite_json1_missing() -> None:
    """Print an error and a hint if this SQLite build cannot run the JSON() function."""
    probe_conn = None
    try:
        probe_conn = sqlite3.connect(':memory:')
        probe_conn.cursor().execute('SELECT JSON(\'{"a": "b"}\')')
    except sqlite3.OperationalError as exc:
        stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
        hint([
            'Upgrade your Python version or install the extension manually:',
            'https://code.djangoproject.com/wiki/JSON1Extension'
        ])
    finally:
        # the throwaway in-memory connection is only needed for the probe
        if probe_conn is not None:
            probe_conn.close()


def _configure_sqlite_connection() -> None:
    """Put Django's default DB connection into WAL mode and set the write-concurrency pragmas."""
    from django.db import connection
    with connection.cursor() as cursor:
        # Set Journal mode to WAL to allow for multiple writers.
        # execute() does not return the pragma value, so the row has to be fetched
        # before it can be compared against 'wal'.
        cursor.execute("PRAGMA journal_mode")
        row = cursor.fetchone()
        current_mode = str(row[0]).lower() if row else None
        if current_mode != 'wal':
            cursor.execute("PRAGMA journal_mode=wal;")

        # Set max blocking delay for concurrent writes and write sync mode
        # https://litestream.io/tips/#busy-timeout
        cursor.execute("PRAGMA busy_timeout = 5000;")
        cursor.execute("PRAGMA synchronous = NORMAL;")


def setup_django(out_dir: Optional[Path] = None, check_db=False, config: ConfigDict = CONFIG, in_memory_db=False) -> None:
    """Initialize Django for an ArchiveBox collection while showing a startup progress bar.

    Args:
        out_dir: collection directory to use; falls back to config['OUTPUT_DIR'] when falsy.
        check_db: when True, also configure the SQLite connection (WAL journal mode,
            busy timeout, synchronous mode), create the cache table if reading the
            cache fails, close unusable/obsolete connections, and assert that the
            SQL index file exists in the collection directory.
        config: loaded config to read PACKAGE_DIR, OUTPUT_DIR, VERSION, etc. from.
        in_memory_db: when True, default ARCHIVEBOX_DATABASE_NAME to ':memory:' and run
            the migrations immediately after django.setup().

    Raises:
        AssertionError: if the directories are not Path objects, core/settings.py is
            missing, or (with check_db) the SQL index file does not exist.
        SystemExit: with exit code 2 if setup is interrupted with Ctrl+C.
    """
    global INITIAL_STARTUP_PROGRESS
    global INITIAL_STARTUP_PROGRESS_TASK

    try:
        with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
            INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
            check_system_config(config)

            output_dir = out_dir or Path(config['OUTPUT_DIR'])

            assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)

            bump_startup_progress_bar()
            try:
                from django.core.management import call_command

                sys.path.append(str(config['PACKAGE_DIR']))
                os.environ.setdefault('OUTPUT_DIR', str(output_dir))
                assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
                os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

                # Check to make sure JSON extension is available in our Sqlite3 instance
                # (a missing extension is reported but does not abort startup)
                _warn_if_sqlite_json1_missing()

                bump_startup_progress_bar()

                if in_memory_db:
                    # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
                    # in those cases we create a temporary in-memory db and run the migrations
                    # immediately to get a usable in-memory-database at startup
                    os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
                    django.setup()

                    bump_startup_progress_bar()

                    call_command("migrate", interactive=False, verbosity=0)
                else:
                    # Otherwise use default sqlite3 file-based database and initialize django
                    # without running migrations automatically (user runs them manually by calling init)
                    django.setup()

                bump_startup_progress_bar()

                from django.conf import settings

                # log startup message to the error log
                with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
                    command = ' '.join(sys.argv)
                    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
                    f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")

                if check_db:
                    # Enable WAL mode in sqlite3
                    _configure_sqlite_connection()

                    # Create cache table in DB if needed
                    try:
                        from django.core.cache import cache
                        cache.get('test', None)
                    except django.db.utils.OperationalError:
                        call_command("createcachetable", verbosity=0)

                    bump_startup_progress_bar()

                    # if archivebox gets imported multiple times, we have to close
                    # the sqlite3 whenever we init from scratch to avoid multiple threads
                    # sharing the same connection by accident
                    from django.db import connections
                    for conn in connections.all():
                        conn.close_if_unusable_or_obsolete()

                    sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
                    assert sql_index_path.exists(), (
                        f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')

                bump_startup_progress_bar()

                # https://docs.pydantic.dev/logfire/integrations/django/ Logfire Debugging
                if settings.DEBUG_LOGFIRE:
                    from opentelemetry.instrumentation.sqlite3 import SQLite3Instrumentor
                    SQLite3Instrumentor().instrument()

                    import logfire

                    logfire.configure()
                    logfire.instrument_django(is_sql_commentor_enabled=True)
                    logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)

            except KeyboardInterrupt:
                raise SystemExit(2)
    finally:
        # always drop the progress bar references, even when setup fails, so that
        # later bump_startup_progress_bar() calls never touch a finished progress bar
        INITIAL_STARTUP_PROGRESS = None
        INITIAL_STARTUP_PROGRESS_TASK = None