2024-09-30 22:59:05 +00:00
__package__ = ' archivebox.config '
2024-09-25 12:10:09 +00:00
import os
2024-09-26 09:36:59 +00:00
import re
2024-10-04 07:08:44 +00:00
import platform
2024-10-04 10:24:15 +00:00
import tempfile
2024-10-04 07:08:44 +00:00
2024-09-26 09:36:59 +00:00
from typing import Dict
2024-09-25 12:10:09 +00:00
from pathlib import Path
2024-09-30 22:59:05 +00:00
import importlib . metadata
2024-10-01 00:13:55 +00:00
from collections . abc import Mapping
2024-09-25 12:10:09 +00:00
from benedict import benedict
2024-09-30 22:59:05 +00:00
from . . misc . logging import DEFAULT_CLI_COLORS
2024-09-25 12:10:09 +00:00
###################### Config ##########################
2024-09-30 23:50:36 +00:00
# Filesystem anchors, computed once at import time.
# - PACKAGE_DIR: two levels above this file (the archivebox source code dir)
# - DATA_DIR:    the process's current working directory (the user data dir)
# - ARCHIVE_DIR: the 'archive' folder inside DATA_DIR (snapshot data dir)
PACKAGE_DIR: Path = Path(__file__).resolve().parent.parent
DATA_DIR: Path = Path.cwd().resolve()
ARCHIVE_DIR: Path = DATA_DIR.joinpath('archive')
2024-09-30 22:59:05 +00:00
2024-09-30 23:50:36 +00:00
def _detect_installed_version(PACKAGE_DIR: Path):
    """Autodetect the installed archivebox version.

    Sources are tried in order:
      1. pip-installed distribution metadata (production installs)
      2. the top-level ``version = "..."`` line of the ``pyproject.toml``
         located in the parent of PACKAGE_DIR (dev checkout of the git repo)

    Args:
        PACKAGE_DIR: the archivebox source code dir; ``pyproject.toml`` is
            looked up in its parent directory.

    Returns:
        The detected version string, or ``'dev'`` when neither source yields
        one (e.g. while building docs, where pyproject.toml is not available).
    """
    try:
        # importlib.metadata expects the *distribution* name. The module's
        # __package__ is the dotted import package ('archivebox.config'), which
        # is never a distribution name, so it must not be used for this lookup.
        return importlib.metadata.version('archivebox').strip()
    except importlib.metadata.PackageNotFoundError:
        pass

    try:
        # if in dev Git repo dir, use pyproject.toml file
        pyproject_lines = (PACKAGE_DIR.parent / 'pyproject.toml').read_text().splitlines()
    except FileNotFoundError:
        # building docs, pyproject.toml is not available
        pyproject_lines = []

    for line in pyproject_lines:
        key, sep, value = line.partition('=')
        # only an unindented `version` key counts; tolerate any spacing around '='
        if sep and key.rstrip() == 'version':
            value = value.split('#', 1)[0]          # drop a trailing TOML comment
            # strip whitespace first, then the surrounding quotes
            return value.strip().strip('"\'').strip()

    # nothing usable found: fall back to a placeholder instead of raising
    return 'dev'
2024-09-30 22:59:05 +00:00
2024-09-30 23:50:36 +00:00
# Resolved once, when this module is imported.
VERSION: str = _detect_installed_version(PACKAGE_DIR)
2024-09-30 22:59:05 +00:00
2024-09-26 09:36:59 +00:00
2024-10-04 10:24:15 +00:00
2024-10-01 00:13:55 +00:00
class ConstantsDict(Mapping):
    """Class-level registry of ArchiveBox constants, usable as a read-only Mapping.

    All values are plain class attributes computed once, when the class body
    runs (i.e. at import time), including the filesystem checks inside
    CODE_LOCATIONS / DATA_LOCATIONS.

    Mapping behaviour:
      - iteration / len() cover the UPPERCASE class attributes (see __benedict__)
      - item access resolves any attribute via getattr() and raises KeyError
        for unknown names, so Mapping.get() and ``in`` work as expected
    """

    IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'yes')
    OS = platform.system().lower()       # darwin, linux, etc.
    ARCH = platform.machine().lower()    # arm64, x86_64, etc.
    LIB_DIR_SCOPE = f'{ARCH}-{OS}' + ('-docker' if IN_DOCKER else '')

    PACKAGE_DIR: Path = PACKAGE_DIR      # archivebox source code dir
    DATA_DIR: Path = DATA_DIR            # archivebox user data dir
    ARCHIVE_DIR: Path = ARCHIVE_DIR      # archivebox snapshot data dir
    VERSION: str = VERSION
    PACKAGE_DIR_NAME: str = PACKAGE_DIR.name
    TEMPLATES_DIR_NAME: str = 'templates'
    TEMPLATES_DIR: Path = PACKAGE_DIR / TEMPLATES_DIR_NAME
    STATIC_DIR: Path = TEMPLATES_DIR / 'static'
    USER_PLUGINS_DIR_NAME: str = 'user_plugins'
    CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'

    ARCHIVE_DIR_NAME: str = 'archive'
    SOURCES_DIR_NAME: str = 'sources'
    PERSONAS_DIR_NAME: str = 'personas'
    CRONTABS_DIR_NAME: str = 'crontabs'
    CACHE_DIR_NAME: str = 'cache'
    LOGS_DIR_NAME: str = 'logs'
    LIB_DIR_NAME: str = 'lib'
    TMP_DIR_NAME: str = 'tmp'

    # env var override wins, otherwise <system tempdir>/archivebox
    SYSTEM_TMP_DIR: Path = Path(os.environ['SYSTEM_TMP_DIR']) if 'SYSTEM_TMP_DIR' in os.environ else (Path(tempfile.gettempdir()) / 'archivebox')
    # DATA_DIR_TMP_DIR: Path = DATA_DIR / TMP_DIR_NAME / machineid.hashed_id('archivebox')[:16]
    #   ^ can't be used: socket path length restrictions break too often when the data dir
    #     is in some deep subdir (socket.error: AF_UNIX path too long)
    SYSTEM_LIB_DIR: Path = Path(os.environ['SYSTEM_LIB_DIR']) if 'SYSTEM_LIB_DIR' in os.environ else (PACKAGE_DIR / LIB_DIR_NAME)
    DATA_DIR_LIB_DIR: Path = DATA_DIR / LIB_DIR_NAME / LIB_DIR_SCOPE

    ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
    SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
    PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
    CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
    LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
    LIB_DIR: Path = SYSTEM_LIB_DIR if IN_DOCKER else DATA_DIR_LIB_DIR    # e.g. /app/lib or ./data/lib/arm64-darwin-docker
    TMP_DIR: Path = SYSTEM_TMP_DIR
    CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
    USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME

    LIB_PIP_DIR: Path = LIB_DIR / 'pip'
    LIB_NPM_DIR: Path = LIB_DIR / 'npm'
    LIB_BROWSERS_DIR: Path = LIB_DIR / 'browsers'
    LIB_BIN_DIR: Path = LIB_DIR / 'bin'
    BIN_DIR: Path = LIB_BIN_DIR

    CONFIG_FILENAME: str = 'ArchiveBox.conf'
    SQL_INDEX_FILENAME: str = 'index.sqlite3'
    QUEUE_DATABASE_FILENAME: str = 'queue.sqlite3'

    CONFIG_FILE: Path = DATA_DIR / CONFIG_FILENAME
    DATABASE_FILE: Path = DATA_DIR / SQL_INDEX_FILENAME
    QUEUE_DATABASE_FILE: Path = DATA_DIR / QUEUE_DATABASE_FILENAME

    JSON_INDEX_FILENAME: str = 'index.json'
    HTML_INDEX_FILENAME: str = 'index.html'
    ROBOTS_TXT_FILENAME: str = 'robots.txt'
    FAVICON_FILENAME: str = 'favicon.ico'

    TIMEZONE: str = 'UTC'
    DEFAULT_CLI_COLORS: Dict[str, str] = DEFAULT_CLI_COLORS
    # same keys as DEFAULT_CLI_COLORS, every value replaced by an empty string
    DISABLED_CLI_COLORS: Dict[str, str] = benedict({k: '' for k in DEFAULT_CLI_COLORS})

    ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE

    STATICFILE_EXTENSIONS: frozenset[str] = frozenset((
        # 99.999% of the time, URLs ending in these extensions are static files
        # that can be downloaded as-is, not html pages that need to be rendered
        'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
        'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
        'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
        'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
        'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
        'atom', 'rss', 'css', 'js', 'json',
        'dmg', 'iso', 'img',
        'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

        # Less common extensions to consider adding later
        # jar, swf, bin, com, exe, dll, deb
        # ear, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
        # pl pm, prc pdb, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
        # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

        # These are always treated as pages, not as static files, never add them:
        # html, htm, shtml, xhtml, xml, aspx, php, cgi
    ))

    # NOTE(review): the name is misspelled ("INGORED"); it is left unchanged here
    # because it is one of the keys this mapping exports.
    INGORED_PATHS: frozenset[str] = frozenset((
        ".git",
        ".svn",
        ".DS_Store",
        ".gitignore",
        "lost+found",
        ".env",
        "Dockerfile",
        ".ArchiveBox.conf.bak",
    ))
    PIP_RELATED_NAMES: frozenset[str] = frozenset((
        ".venv",
        "venv",
        "virtualenv",
        ".virtualenv",
    ))
    NPM_RELATED_NAMES: frozenset[str] = frozenset((
        "node_modules",
        "package.json",
        "package-lock.json",
        "yarn.lock",
    ))

    DATA_DIR_NAMES: frozenset[str] = frozenset((
        ARCHIVE_DIR_NAME,
        SOURCES_DIR_NAME,
        LOGS_DIR_NAME,
        CACHE_DIR_NAME,
        LIB_DIR_NAME,
        TMP_DIR_NAME,
        PERSONAS_DIR_NAME,
        CUSTOM_TEMPLATES_DIR_NAME,
        USER_PLUGINS_DIR_NAME,
        CRONTABS_DIR_NAME,
    ))
    # Inside the generator body, class-level names are not visible, so DATA_DIR
    # here is the module-level DATA_DIR (only DATA_DIR_NAMES, the outermost
    # iterable, is evaluated in class scope).
    DATA_DIRS: frozenset[Path] = frozenset(DATA_DIR / dirname for dirname in DATA_DIR_NAMES)
    DATA_FILE_NAMES: frozenset[str] = frozenset((
        CONFIG_FILENAME,
        SQL_INDEX_FILENAME,
        f"{SQL_INDEX_FILENAME}-wal",
        f"{SQL_INDEX_FILENAME}-shm",
        QUEUE_DATABASE_FILENAME,
        f"{QUEUE_DATABASE_FILENAME}-wal",
        f"{QUEUE_DATABASE_FILENAME}-shm",
        "search.sqlite3",
        JSON_INDEX_FILENAME,
        HTML_INDEX_FILENAME,
        ROBOTS_TXT_FILENAME,
        FAVICON_FILENAME,
        f"{CONFIG_FILENAME}.bak",
        "static_index.json",
    ))

    # When initializing archivebox in a new directory, we check to make sure the dir is
    # actually empty so that we dont clobber someone's home directory or desktop by accident.
    # These files are exceptions to the is_empty check when we're trying to init a new dir,
    # as they could be from a previous archivebox version, system artifacts, dependencies, etc.
    ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
        *INGORED_PATHS,
        *PIP_RELATED_NAMES,
        *NPM_RELATED_NAMES,
        *DATA_DIR_NAMES,
        *DATA_FILE_NAMES,
        "static",                # created by old static exports <v0.6.0
        "sonic",                 # created by docker bind mount
    ))

    # Each entry: resolved path + flags, evaluated against the filesystem once,
    # when this class body runs.
    CODE_LOCATIONS = benedict({
        'PACKAGE_DIR': {
            'path': (PACKAGE_DIR).resolve(),
            'enabled': True,
            'is_valid': (PACKAGE_DIR / '__main__.py').exists(),
        },
        'TEMPLATES_DIR': {
            'path': TEMPLATES_DIR.resolve(),
            'enabled': True,
            'is_valid': STATIC_DIR.exists(),
        },
        'LIB_DIR': {
            'path': LIB_DIR.resolve(),
            'enabled': True,
            'is_valid': LIB_DIR.is_dir(),
        },
        'TMP_DIR': {
            'path': TMP_DIR.resolve(),
            'enabled': True,
            'is_valid': TMP_DIR.is_dir(),
        },
    })
    DATA_LOCATIONS = benedict({
        "DATA_DIR": {
            "path": DATA_DIR.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(DATA_DIR.resolve()),
        },
        "CONFIG_FILE": {
            "path": CONFIG_FILE.resolve(),
            "enabled": True,
            "is_valid": CONFIG_FILE.exists(),
        },
        "SQL_INDEX": {
            "path": DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(DATABASE_FILE.resolve()),
        },
        "QUEUE_DATABASE": {
            "path": QUEUE_DATABASE_FILE.resolve(),
            "enabled": True,
            "is_valid": QUEUE_DATABASE_FILE.exists(),
            "is_mount": os.path.ismount(QUEUE_DATABASE_FILE.resolve()),
        },
        "ARCHIVE_DIR": {
            "path": ARCHIVE_DIR.resolve(),
            "enabled": True,
            "is_valid": ARCHIVE_DIR.exists(),
            "is_mount": os.path.ismount(ARCHIVE_DIR.resolve()),
        },
        "SOURCES_DIR": {
            "path": SOURCES_DIR.resolve(),
            "enabled": True,
            "is_valid": SOURCES_DIR.exists(),
        },
        "LOGS_DIR": {
            "path": LOGS_DIR.resolve(),
            "enabled": True,
            "is_valid": LOGS_DIR.is_dir(),
        },
        # "CACHE_DIR": {
        #     "path": CACHE_DIR.resolve(),
        #     "enabled": True,
        #     "is_valid": CACHE_DIR.is_dir(),
        # },
        "PERSONAS_DIR": {
            "path": PERSONAS_DIR.resolve(),
            "enabled": PERSONAS_DIR.exists(),
            "is_valid": PERSONAS_DIR.is_dir(),
        },
        'CUSTOM_TEMPLATES_DIR': {
            'path': CUSTOM_TEMPLATES_DIR.resolve(),
            'enabled': CUSTOM_TEMPLATES_DIR.exists(),
            'is_valid': CUSTOM_TEMPLATES_DIR.is_dir(),
        },
        'USER_PLUGINS_DIR': {
            'path': USER_PLUGINS_DIR.resolve(),
            'enabled': USER_PLUGINS_DIR.exists(),
            'is_valid': USER_PLUGINS_DIR.is_dir(),
        },
    })

    @classmethod
    def __getitem__(cls, key: str):
        """Return the class attribute named `key`.

        Raises:
            KeyError: if no such attribute exists. The Mapping mixins
                (`get()`, `in`) only catch KeyError, so getattr()'s
                AttributeError must not escape from here.
        """
        try:
            return getattr(cls, key)
        except AttributeError as err:
            raise KeyError(key) from err

    @classmethod
    def __benedict__(cls):
        """Return a new benedict of the UPPERCASE, non-underscore-prefixed attributes defined directly on this class."""
        return benedict({key: value for key, value in cls.__dict__.items() if key.isupper() and not key.startswith('_')})

    @classmethod
    def __len__(cls):
        """Return the number of constants (same key set as __benedict__)."""
        return len(cls.__benedict__())

    @classmethod
    def __iter__(cls):
        """Iterate over the constant names (same key set as __benedict__)."""
        return iter(cls.__benedict__())
2024-09-25 12:10:09 +00:00
2024-10-01 00:13:55 +00:00
# Module-level instance; used below as a Mapping over the UPPERCASE class attributes.
CONSTANTS = ConstantsDict()
# The same constants as a benedict, built once at import time.
CONSTANTS_CONFIG = CONSTANTS.__benedict__()

# add all key: values to globals() for easier importing
globals().update(CONSTANTS)

# these need to always exist as we need them to run almost everything
# NOTE: this creates the directories on disk as a side effect of importing this module.
CONSTANTS.LIB_DIR.mkdir(parents=True, exist_ok=True)
CONSTANTS.TMP_DIR.mkdir(parents=True, exist_ok=True)