2020-10-31 07:08:03 +00:00
# Explicit package name for this module; presumably set so that the relative
# imports used in this file (e.g. the config_stubs import) resolve -- TODO confirm.
__package__ = ' archivebox '
2019-04-17 07:49:18 +00:00
2017-07-04 10:38:07 +00:00
import os
2019-04-22 18:34:12 +00:00
import io
2019-03-23 02:05:45 +00:00
import re
2017-07-04 10:48:12 +00:00
import sys
2020-08-18 19:00:00 +00:00
import json
2019-04-17 07:49:18 +00:00
import getpass
2017-07-05 21:33:51 +00:00
import shutil
2020-08-18 19:00:00 +00:00
import django
2017-07-04 10:48:12 +00:00
2019-04-22 18:34:12 +00:00
from hashlib import md5
2020-07-28 11:20:57 +00:00
from pathlib import Path
2020-08-18 12:21:36 +00:00
from typing import Optional , Type , Tuple , Dict , Union , List
2019-03-23 02:05:45 +00:00
from subprocess import run , PIPE , DEVNULL
2019-04-25 23:03:38 +00:00
from configparser import ConfigParser
2020-07-23 17:02:17 +00:00
from collections import defaultdict
2017-07-04 10:38:07 +00:00
2020-10-31 07:08:03 +00:00
from . config_stubs import (
2019-04-24 15:36:14 +00:00
SimpleConfigValueDict ,
ConfigValue ,
ConfigDict ,
ConfigDefaultValue ,
ConfigDefaultDict ,
)
2020-07-02 07:12:30 +00:00
# precedence order for config:
2020-10-30 08:50:08 +00:00
# 1. cli args (options passed directly on the archivebox command line)
# 2. shell environment vars (env USE_COLOR=False archivebox add '...')
# 3. config file (echo "SAVE_FAVICON=False" >> ArchiveBox.conf)
# 4. defaults (defined below in Python)
2020-07-02 07:12:30 +00:00
2020-10-30 08:50:08 +00:00
#
2020-07-02 07:12:30 +00:00
# env SHOW_PROGRESS=1 archivebox add '...'
2020-10-30 08:50:08 +00:00
# archivebox config --set TIMEOUT=600
#
2019-04-24 08:09:25 +00:00
2017-10-23 09:57:34 +00:00
# ******************************************************************************
2020-11-23 07:04:39 +00:00
# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
2019-03-12 16:48:46 +00:00
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
2019-04-17 03:18:42 +00:00
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
2017-10-23 09:57:34 +00:00
# ******************************************************************************
2017-07-05 21:33:51 +00:00
2019-04-24 08:09:25 +00:00
################################# User Config ##################################
2017-07-04 10:38:07 +00:00
2019-04-25 23:03:38 +00:00
# Declarative table of every user-settable option, grouped by config section.
# Each option maps to a spec dict with:
#   ' type '    - the Python type load_config_val() uses to parse the raw string
#   ' default ' - a literal value, or a callable that is handed the config dict
#   ' aliases ' - (optional) tuple of older names that are still accepted
CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
    ' SHELL_CONFIG ': {
        ' IS_TTY ': {' type ': bool, ' default ': lambda _: sys.stdout.isatty()},
        ' USE_COLOR ': {' type ': bool, ' default ': lambda c: c[' IS_TTY ']},
        ' SHOW_PROGRESS ': {' type ': bool, ' default ': lambda c: c[' IS_TTY ']},
        ' IN_DOCKER ': {' type ': bool, ' default ': False},
        # TODO: 'SHOW_HINTS': {'type': bool, 'default': True},
    },

    ' GENERAL_CONFIG ': {
        ' OUTPUT_DIR ': {' type ': str, ' default ': None},
        ' CONFIG_FILE ': {' type ': str, ' default ': None},
        ' ONLY_NEW ': {' type ': bool, ' default ': True},
        ' TIMEOUT ': {' type ': int, ' default ': 60},
        ' MEDIA_TIMEOUT ': {' type ': int, ' default ': 3600},
        ' OUTPUT_PERMISSIONS ': {' type ': str, ' default ': ' 755 '},
        ' RESTRICT_FILE_NAMES ': {' type ': str, ' default ': ' windows '},
        # to avoid downloading code assets as their own pages
        ' URL_BLACKLIST ': {
            ' type ': str,
            ' default ': r' \ .(css|js|otf|ttf|woff|woff2|gstatic \ .com|googleapis \ .com/css)( \ ?.*)?$ ',
        },
    },

    ' SERVER_CONFIG ': {
        ' SECRET_KEY ': {' type ': str, ' default ': None},
        # the bool value of IN_DOCKER is used as the list index (False -> 0, True -> 1)
        ' BIND_ADDR ': {
            ' type ': str,
            ' default ': lambda c: [' 127.0.0.1:8000 ', ' 0.0.0.0:8000 '][c[' IN_DOCKER ']],
        },
        ' ALLOWED_HOSTS ': {' type ': str, ' default ': ' * '},
        ' DEBUG ': {' type ': bool, ' default ': False},
        ' PUBLIC_INDEX ': {' type ': bool, ' default ': True},
        ' PUBLIC_SNAPSHOTS ': {' type ': bool, ' default ': True},
        ' PUBLIC_ADD_VIEW ': {' type ': bool, ' default ': False},
        ' FOOTER_INFO ': {
            ' type ': str,
            ' default ': ' Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests. ',
        },
        ' ACTIVE_THEME ': {' type ': str, ' default ': ' default '},
    },

    ' ARCHIVE_METHOD_TOGGLES ': {
        ' SAVE_TITLE ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_TITLE ',)},
        ' SAVE_FAVICON ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_FAVICON ',)},
        ' SAVE_WGET ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_WGET ',)},
        ' SAVE_WGET_REQUISITES ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_WGET_REQUISITES ',)},
        ' SAVE_SINGLEFILE ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_SINGLEFILE ',)},
        ' SAVE_READABILITY ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_READABILITY ',)},
        ' SAVE_MERCURY ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_MERCURY ',)},
        ' SAVE_PDF ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_PDF ',)},
        ' SAVE_SCREENSHOT ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_SCREENSHOT ',)},
        ' SAVE_DOM ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_DOM ',)},
        ' SAVE_HEADERS ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_HEADERS ',)},
        ' SAVE_WARC ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_WARC ',)},
        ' SAVE_GIT ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_GIT ',)},
        ' SAVE_MEDIA ': {' type ': bool, ' default ': True, ' aliases ': (' FETCH_MEDIA ',)},
        ' SAVE_ARCHIVE_DOT_ORG ': {' type ': bool, ' default ': True, ' aliases ': (' SUBMIT_ARCHIVE_DOT_ORG ',)},
    },

    ' ARCHIVE_METHOD_OPTIONS ': {
        ' RESOLUTION ': {' type ': str, ' default ': ' 1440,2000 ', ' aliases ': (' SCREENSHOT_RESOLUTION ',)},
        ' GIT_DOMAINS ': {' type ': str, ' default ': ' github.com,bitbucket.org,gitlab.com '},
        ' CHECK_SSL_VALIDITY ': {' type ': bool, ' default ': True},

        # the {PLACEHOLDER} fields are filled in later via str.format(**config)
        ' CURL_USER_AGENT ': {
            ' type ': str,
            ' default ': ' ArchiveBox/ {VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/ {CURL_VERSION} ',
        },
        ' WGET_USER_AGENT ': {
            ' type ': str,
            ' default ': ' ArchiveBox/ {VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/ {WGET_VERSION} ',
        },
        ' CHROME_USER_AGENT ': {
            ' type ': str,
            ' default ': ' Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36 ',
        },

        ' COOKIES_FILE ': {' type ': str, ' default ': None},
        ' CHROME_USER_DATA_DIR ': {' type ': str, ' default ': None},

        ' CHROME_HEADLESS ': {' type ': bool, ' default ': True},
        ' CHROME_SANDBOX ': {' type ': bool, ' default ': lambda c: not c[' IN_DOCKER ']},

        # NOTE(review): ' --user-agent ' is listed without a value of its own -- confirm
        # whether the following flag is meant to be consumed as its argument.
        ' YOUTUBEDL_ARGS ': {
            ' type ': list,
            ' default ': [
                ' --write-description ',
                ' --write-info-json ',
                ' --write-annotations ',
                ' --write-thumbnail ',
                ' --no-call-home ',
                ' --user-agent ',
                ' --all-subs ',
                ' --extract-audio ',
                ' --keep-video ',
                ' --ignore-errors ',
                ' --geo-bypass ',
                ' --audio-format ', ' mp3 ',
                ' --audio-quality ', ' 320K ',
                ' --embed-thumbnail ',
                ' --add-metadata ',
            ],
        },
        ' WGET_ARGS ': {
            ' type ': list,
            ' default ': [
                ' --no-verbose ',
                ' --adjust-extension ',
                ' --convert-links ',
                ' --force-directories ',
                ' --backup-converted ',
                ' --span-hosts ',
                ' --no-parent ',
                ' -e ', ' robots=off ',
            ],
        },
        ' CURL_ARGS ': {
            ' type ': list,
            ' default ': [
                ' --silent ',
                ' --location ',
                ' --compressed ',
            ],
        },
        ' GIT_ARGS ': {' type ': list, ' default ': [' --recursive ']},
    },

    ' SEARCH_BACKEND_CONFIG ': {
        ' USE_INDEXING_BACKEND ': {' type ': bool, ' default ': True},
        ' USE_SEARCHING_BACKEND ': {' type ': bool, ' default ': True},
        ' SEARCH_BACKEND_ENGINE ': {' type ': str, ' default ': ' sonic '},
        ' SEARCH_BACKEND_HOST_NAME ': {' type ': str, ' default ': ' localhost '},
        ' SEARCH_BACKEND_PORT ': {' type ': int, ' default ': 1491},
        ' SEARCH_BACKEND_PASSWORD ': {' type ': str, ' default ': ' SecretPassword '},
        # SONIC
        ' SONIC_COLLECTION ': {' type ': str, ' default ': ' archivebox '},
        ' SONIC_BUCKET ': {' type ': str, ' default ': ' snapshots '},
    },

    ' DEPENDENCY_CONFIG ': {
        ' USE_CURL ': {' type ': bool, ' default ': True},
        ' USE_WGET ': {' type ': bool, ' default ': True},
        ' USE_SINGLEFILE ': {' type ': bool, ' default ': True},
        ' USE_READABILITY ': {' type ': bool, ' default ': True},
        ' USE_MERCURY ': {' type ': bool, ' default ': True},
        ' USE_GIT ': {' type ': bool, ' default ': True},
        ' USE_CHROME ': {' type ': bool, ' default ': True},
        ' USE_NODE ': {' type ': bool, ' default ': True},
        ' USE_YOUTUBEDL ': {' type ': bool, ' default ': True},

        ' CURL_BINARY ': {' type ': str, ' default ': ' curl '},
        ' GIT_BINARY ': {' type ': str, ' default ': ' git '},
        ' WGET_BINARY ': {' type ': str, ' default ': ' wget '},
        ' SINGLEFILE_BINARY ': {' type ': str, ' default ': ' single-file '},
        ' READABILITY_BINARY ': {' type ': str, ' default ': ' readability-extractor '},
        ' MERCURY_BINARY ': {' type ': str, ' default ': ' mercury-parser '},
        ' YOUTUBEDL_BINARY ': {' type ': str, ' default ': ' youtube-dl '},
        ' NODE_BINARY ': {' type ': str, ' default ': ' node '},
        ' CHROME_BINARY ': {' type ': str, ' default ': None},

        ' POCKET_CONSUMER_KEY ': {' type ': str, ' default ': None},
        ' POCKET_ACCESS_TOKENS ': {' type ': dict, ' default ': {}},
    },
}
2020-10-31 07:08:03 +00:00
# for backwards compatibility with old config files, check old/deprecated names for each key
2019-04-26 18:43:13 +00:00
def _iter_alias_pairs(sections):
    """Yield an (alias, canonical_key) pair for every alias declared in the given sections."""
    for options in sections.values():
        for canonical_key, spec in options.items():
            for alias in spec.get(' aliases ', ()):
                yield alias, canonical_key


# deprecated option name -> current option name
CONFIG_ALIASES = dict(_iter_alias_pairs(CONFIG_DEFAULTS))

# flat set of every option name declared in any CONFIG_DEFAULTS section
# (iterating a section dict yields its keys, so union() collects the option names)
USER_CONFIG = set().union(*CONFIG_DEFAULTS.values())
2020-10-31 07:08:03 +00:00
2019-04-26 18:43:13 +00:00
def get_real_name(key: str) -> str:
    """Return the canonical option name for `key`.

    The key is upper-cased and stripped of surrounding whitespace first; if the
    result is a known alias in CONFIG_ALIASES the name it maps to is returned,
    otherwise the normalised key itself is returned.
    """
    normalized = key.upper().strip()
    return CONFIG_ALIASES.get(normalized, normalized)
2019-04-24 08:09:25 +00:00
############################## Derived Config ##############################
# Constants
2019-03-27 20:44:00 +00:00
2019-04-24 08:09:25 +00:00
# Escape sequences used to colour terminal output, keyed by colour name.
DEFAULT_CLI_COLORS = {
    ' reset ': ' \033 [00;00m ',
    ' lightblue ': ' \033 [01;30m ',
    ' lightyellow ': ' \033 [01;33m ',
    ' lightred ': ' \033 [01;35m ',
    ' red ': ' \033 [01;31m ',
    ' green ': ' \033 [01;32m ',
    ' blue ': ' \033 [01;34m ',
    ' white ': ' \033 [01;37m ',
    ' black ': ' \033 [01;30m ',
}

# Same keys as DEFAULT_CLI_COLORS but without any escape codes; the output
# helpers fall back to this palette when colour is not enabled.
ANSI = dict.fromkeys(DEFAULT_CLI_COLORS, ' ')
2019-03-27 20:44:00 +00:00
2020-07-23 17:02:17 +00:00
# Lookup of colour code -> pair of 3-tuples (presumably RGB values for two
# intensities of the same colour -- TODO confirm against the caller).
# Unknown codes fall back to a fresh pair of black tuples.
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)])
COLOR_DICT.update({
    ' 00 ': [(0, 0, 0), (0, 0, 0)],
    ' 30 ': [(0, 0, 0), (0, 0, 0)],
    ' 31 ': [(255, 0, 0), (128, 0, 0)],
    ' 32 ': [(0, 200, 0), (0, 128, 0)],
    ' 33 ': [(255, 255, 0), (128, 128, 0)],
    ' 34 ': [(0, 0, 255), (0, 0, 128)],
    ' 35 ': [(255, 0, 255), (128, 0, 128)],
    ' 36 ': [(0, 255, 255), (0, 128, 128)],
    ' 37 ': [(255, 255, 255), (255, 255, 255)],
})
2020-07-01 17:23:59 +00:00
2019-05-01 03:13:04 +00:00
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
STATICFILE_EXTENSIONS = {
    ' gif ', ' jpeg ', ' jpg ', ' png ', ' tif ', ' tiff ', ' wbmp ', ' ico ', ' jng ', ' bmp ',
    ' svg ', ' svgz ', ' webp ', ' ps ', ' eps ', ' ai ',
    ' mp3 ', ' mp4 ', ' m4a ', ' mpeg ', ' mpg ', ' mkv ', ' mov ', ' webm ', ' m4v ',
    ' flv ', ' wmv ', ' avi ', ' ogg ', ' ts ', ' m3u8 ',
    ' pdf ', ' txt ', ' rtf ', ' rtfd ', ' doc ', ' docx ', ' ppt ', ' pptx ', ' xls ', ' xlsx ',
    ' atom ', ' rss ', ' css ', ' js ', ' json ',
    ' dmg ', ' iso ', ' img ',
    ' rar ', ' war ', ' hqx ', ' zip ', ' gz ', ' bz2 ', ' 7z ',

    # Less common extensions to consider adding later
    # jar, swf, bin, com, exe, dll, deb
    # ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    # pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    # ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # These are always treated as pages, not as static files, never add them:
    # html, htm, shtml, xhtml, xml, aspx, php, cgi
}
2019-04-17 03:18:42 +00:00
2020-10-31 07:08:03 +00:00
# Directory names (joined onto the package dir / output dir elsewhere in this file)
PACKAGE_DIR_NAME = ' archivebox '
TEMPLATES_DIR_NAME = ' themes '
ARCHIVE_DIR_NAME = ' archive '
SOURCES_DIR_NAME = ' sources '
LOGS_DIR_NAME = ' logs '
STATIC_DIR_NAME = ' static '

# Well-known file names
SQL_INDEX_FILENAME = ' index.sqlite3 '
JSON_INDEX_FILENAME = ' index.json '
HTML_INDEX_FILENAME = ' index.html '
ROBOTS_TXT_FILENAME = ' robots.txt '
FAVICON_FILENAME = ' favicon.ico '
CONFIG_FILENAME = ' ArchiveBox.conf '
2019-04-26 22:31:50 +00:00
# Comment banner that write_config_file() writes into a brand-new config file.
CONFIG_HEADER = (
    ' # This is the config file for your ArchiveBox collection.\n'
    '#\n'
    '# You can add options here manually in INI format, or automatically by running:\n'
    '# archivebox config --set KEY=VALUE\n'
    '#\n'
    '# If you modify this file manually, make sure to update your archive after by running:\n'
    '# archivebox init\n'
    '#\n'
    '# A list of all possible config with documentation and examples can be found here:\n'
    '# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration\n'
)
2019-04-24 08:09:25 +00:00
2019-04-24 15:36:14 +00:00
# Options computed from other options rather than read directly from the user.
# Every ' default ' is a callable taking the config dict `c`. Entry order matters:
# later callables read keys that earlier entries (or CONFIG_DEFAULTS) provide,
# and several entries deliberately re-derive a key of the same name.
DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
    # the inner lambda is stored as the value, so the width is re-measured on each call
    ' TERM_WIDTH ': {' default ': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    ' USER ': {' default ': lambda c: getpass.getuser() or os.getlogin()},
    ' ANSI ': {' default ': lambda c: DEFAULT_CLI_COLORS if c[' USE_COLOR '] else {k: ' ' for k in DEFAULT_CLI_COLORS.keys()}},

    ' PACKAGE_DIR ': {' default ': lambda c: Path(__file__).resolve().parent},
    ' TEMPLATES_DIR ': {' default ': lambda c: c[' PACKAGE_DIR '] / TEMPLATES_DIR_NAME / ' legacy '},

    ' OUTPUT_DIR ': {' default ': lambda c: Path(c[' OUTPUT_DIR ']).resolve() if c[' OUTPUT_DIR '] else Path(os.curdir).resolve()},
    ' ARCHIVE_DIR ': {' default ': lambda c: c[' OUTPUT_DIR '] / ARCHIVE_DIR_NAME},
    ' SOURCES_DIR ': {' default ': lambda c: c[' OUTPUT_DIR '] / SOURCES_DIR_NAME},
    ' LOGS_DIR ': {' default ': lambda c: c[' OUTPUT_DIR '] / LOGS_DIR_NAME},
    ' CONFIG_FILE ': {' default ': lambda c: Path(c[' CONFIG_FILE ']).resolve() if c[' CONFIG_FILE '] else c[' OUTPUT_DIR '] / CONFIG_FILENAME},
    ' COOKIES_FILE ': {' default ': lambda c: c[' COOKIES_FILE '] and Path(c[' COOKIES_FILE ']).resolve()},
    # None means unset, so we autodetect it with find_chrome_data_dir(), but emptystring ''
    # means user manually set it to '', and we should store it as None
    ' CHROME_USER_DATA_DIR ': {' default ': lambda c: find_chrome_data_dir() if c[' CHROME_USER_DATA_DIR '] is None else (Path(c[' CHROME_USER_DATA_DIR ']).resolve() if c[' CHROME_USER_DATA_DIR '] else None)},
    ' URL_BLACKLIST_PTN ': {' default ': lambda c: c[' URL_BLACKLIST '] and re.compile(c[' URL_BLACKLIST '] or ' ', re.IGNORECASE | re.UNICODE | re.MULTILINE)},

    ' ARCHIVEBOX_BINARY ': {' default ': lambda c: sys.argv[0]},
    ' VERSION ': {' default ': lambda c: json.loads((Path(c[' PACKAGE_DIR ']) / ' package.json ').read_text().strip())[' version ']},
    ' GIT_SHA ': {' default ': lambda c: c[' VERSION '].split(' + ')[-1] or ' unknown '},

    ' PYTHON_BINARY ': {' default ': lambda c: sys.executable},
    ' PYTHON_ENCODING ': {' default ': lambda c: sys.stdout.encoding.upper()},
    ' PYTHON_VERSION ': {' default ': lambda c: ' {} . {} . {} '.format(*sys.version_info[:3])},

    ' DJANGO_BINARY ': {' default ': lambda c: django.__file__.replace(' __init__.py ', ' bin/django-admin.py ')},
    ' DJANGO_VERSION ': {' default ': lambda c: ' {} . {} . {} {} ( {} ) '.format(*django.VERSION)},

    # curl
    ' USE_CURL ': {' default ': lambda c: c[' USE_CURL '] and (c[' SAVE_FAVICON '] or c[' SAVE_TITLE '] or c[' SAVE_ARCHIVE_DOT_ORG '])},
    ' CURL_VERSION ': {' default ': lambda c: bin_version(c[' CURL_BINARY ']) if c[' USE_CURL '] else None},
    ' CURL_USER_AGENT ': {' default ': lambda c: c[' CURL_USER_AGENT '].format(**c)},
    ' CURL_ARGS ': {' default ': lambda c: c[' CURL_ARGS '] or []},
    ' SAVE_FAVICON ': {' default ': lambda c: c[' USE_CURL '] and c[' SAVE_FAVICON ']},
    ' SAVE_ARCHIVE_DOT_ORG ': {' default ': lambda c: c[' USE_CURL '] and c[' SAVE_ARCHIVE_DOT_ORG ']},

    # wget
    ' USE_WGET ': {' default ': lambda c: c[' USE_WGET '] and (c[' SAVE_WGET '] or c[' SAVE_WARC '])},
    ' WGET_VERSION ': {' default ': lambda c: bin_version(c[' WGET_BINARY ']) if c[' USE_WGET '] else None},
    ' WGET_AUTO_COMPRESSION ': {' default ': lambda c: wget_supports_compression(c) if c[' USE_WGET '] else False},
    ' WGET_USER_AGENT ': {' default ': lambda c: c[' WGET_USER_AGENT '].format(**c)},
    ' SAVE_WGET ': {' default ': lambda c: c[' USE_WGET '] and c[' SAVE_WGET ']},
    ' SAVE_WARC ': {' default ': lambda c: c[' USE_WGET '] and c[' SAVE_WARC ']},
    ' WGET_ARGS ': {' default ': lambda c: c[' WGET_ARGS '] or []},

    # singlefile / readability / mercury
    ' USE_SINGLEFILE ': {' default ': lambda c: c[' USE_SINGLEFILE '] and c[' SAVE_SINGLEFILE ']},
    ' SINGLEFILE_VERSION ': {' default ': lambda c: bin_version(c[' SINGLEFILE_BINARY ']) if c[' USE_SINGLEFILE '] else None},

    ' USE_READABILITY ': {' default ': lambda c: c[' USE_READABILITY '] and c[' SAVE_READABILITY ']},
    ' READABILITY_VERSION ': {' default ': lambda c: bin_version(c[' READABILITY_BINARY ']) if c[' USE_READABILITY '] else None},

    ' USE_MERCURY ': {' default ': lambda c: c[' USE_MERCURY '] and c[' SAVE_MERCURY ']},
    # mercury is unversioned
    ' MERCURY_VERSION ': {' default ': lambda c: ' 1.0.0 ' if (c[' USE_MERCURY '] and c[' MERCURY_BINARY ']) else None},

    # git
    ' USE_GIT ': {' default ': lambda c: c[' USE_GIT '] and c[' SAVE_GIT ']},
    ' GIT_VERSION ': {' default ': lambda c: bin_version(c[' GIT_BINARY ']) if c[' USE_GIT '] else None},
    ' SAVE_GIT ': {' default ': lambda c: c[' USE_GIT '] and c[' SAVE_GIT ']},

    # youtube-dl
    ' USE_YOUTUBEDL ': {' default ': lambda c: c[' USE_YOUTUBEDL '] and c[' SAVE_MEDIA ']},
    ' YOUTUBEDL_VERSION ': {' default ': lambda c: bin_version(c[' YOUTUBEDL_BINARY ']) if c[' USE_YOUTUBEDL '] else None},
    ' SAVE_MEDIA ': {' default ': lambda c: c[' USE_YOUTUBEDL '] and c[' SAVE_MEDIA ']},
    ' YOUTUBEDL_ARGS ': {' default ': lambda c: c[' YOUTUBEDL_ARGS '] or []},

    # chrome / node
    ' USE_CHROME ': {' default ': lambda c: c[' USE_CHROME '] and (c[' SAVE_PDF '] or c[' SAVE_SCREENSHOT '] or c[' SAVE_DOM '] or c[' SAVE_SINGLEFILE '])},
    ' CHROME_BINARY ': {' default ': lambda c: c[' CHROME_BINARY '] if c[' CHROME_BINARY '] else find_chrome_binary()},
    ' CHROME_VERSION ': {' default ': lambda c: bin_version(c[' CHROME_BINARY ']) if c[' USE_CHROME '] else None},

    ' USE_NODE ': {' default ': lambda c: c[' USE_NODE '] and (c[' SAVE_READABILITY '] or c[' SAVE_SINGLEFILE '])},
    ' NODE_VERSION ': {' default ': lambda c: bin_version(c[' NODE_BINARY ']) if c[' USE_NODE '] else None},

    ' SAVE_PDF ': {' default ': lambda c: c[' USE_CHROME '] and c[' SAVE_PDF ']},
    ' SAVE_SCREENSHOT ': {' default ': lambda c: c[' USE_CHROME '] and c[' SAVE_SCREENSHOT ']},
    ' SAVE_DOM ': {' default ': lambda c: c[' USE_CHROME '] and c[' SAVE_DOM ']},
    ' SAVE_SINGLEFILE ': {' default ': lambda c: c[' USE_CHROME '] and c[' SAVE_SINGLEFILE '] and c[' USE_NODE ']},
    ' SAVE_READABILITY ': {' default ': lambda c: c[' USE_READABILITY '] and c[' USE_NODE ']},
    ' SAVE_MERCURY ': {' default ': lambda c: c[' USE_MERCURY '] and c[' USE_NODE ']},

    # summaries built by helper functions that receive the whole config
    ' DEPENDENCIES ': {' default ': lambda c: get_dependency_info(c)},
    ' CODE_LOCATIONS ': {' default ': lambda c: get_code_locations(c)},
    ' EXTERNAL_LOCATIONS ': {' default ': lambda c: get_external_locations(c)},
    ' DATA_LOCATIONS ': {' default ': lambda c: get_data_locations(c)},
    ' CHROME_OPTIONS ': {' default ': lambda c: get_chrome_info(c)},
}
2019-03-27 20:44:00 +00:00
2019-04-24 08:09:25 +00:00
################################### Helpers ####################################
2019-04-24 15:36:14 +00:00
def load_config_val(key: str,
                    default: ConfigDefaultValue = None,
                    type: Optional[Type] = None,
                    aliases: Optional[Tuple[str, ...]] = None,
                    config: Optional[ConfigDict] = None,
                    env_vars: Optional[os._Environ] = None,
                    config_file_vars: Optional[Dict[str, str]] = None) -> ConfigValue:
    """Look up one option by name (then by its aliases) and parse it to `type`.

    For each candidate name, `env_vars` is consulted before `config_file_vars`;
    the first truthy value found wins. If no value is found, or no `type` is
    given, the default is returned instead (a callable default is invoked with
    `config`, which must then be a dict).

    Returns the parsed bool / str / int value, the JSON-decoded value for
    list and dict options, or the (possibly computed) default.

    Raises ValueError when the raw value does not parse as the requested type,
    and Exception for an unsupported `type`.
    """
    # Start from "not found": without this, `val` was never bound when a type was
    # given and neither env_vars nor config_file_vars was supplied (or both were
    # empty), and the check below raised UnboundLocalError instead of using the default.
    val = None

    # NOTE: `key` is rebound by the loop, so the error messages below name the
    # key or alias that was checked last (i.e. the one the value was found under).
    config_keys_to_check = (key, *(aliases or ()))
    for key in config_keys_to_check:
        if env_vars:
            val = env_vars.get(key)
            if val:
                break

        if config_file_vars:
            val = config_file_vars.get(key)
            if val:
                break

    if type is None or val is None:
        if callable(default):
            assert isinstance(config, dict)
            return default(config)

        return default

    elif type is bool:
        if val.lower() in (' true ', ' yes ', ' 1 '):
            return True
        elif val.lower() in (' false ', ' no ', ' 0 '):
            return False
        else:
            raise ValueError(f' Invalid configuration option { key } = { val } (expected a boolean: True/False) ')

    elif type is str:
        # boolean-looking values are rejected for string options
        if val.lower() in (' true ', ' false ', ' yes ', ' no ', ' 1 ', ' 0 '):
            raise ValueError(f' Invalid configuration option { key } = { val } (expected a string) ')
        return val.strip()

    elif type is int:
        if not val.isdigit():
            raise ValueError(f' Invalid configuration option { key } = { val } (expected an integer) ')
        return int(val)

    elif type is list or type is dict:
        # list and dict options are stored as JSON text
        return json.loads(val)

    raise Exception(' Config values can only be str, bool, int or json ')
2019-04-24 08:09:25 +00:00
2020-06-30 05:08:14 +00:00
2019-04-25 23:03:38 +00:00
def load_config_file(out_dir: str = None) -> Optional[Dict[str, str]]:
    """Read the ini-formatted config file found in `out_dir`.

    When `out_dir` is not given, the directory comes from the OUTPUT_DIR
    environment variable lookup below (with its fallback value), resolved to an
    absolute path.

    Returns a single flat dict of upper-cased option name -> raw string value
    covering every section, or None when the config file does not exist.
    """
    base_dir = out_dir or Path(os.getenv(' OUTPUT_DIR ', ' . ')).resolve()
    conf_path = Path(base_dir) / CONFIG_FILENAME
    if not conf_path.exists():
        return None

    parser = ConfigParser()
    parser.optionxform = str  # stop ConfigParser from lower-casing option names
    parser.read(conf_path)

    # flatten into one namespace (an option repeated in a later section overrides an earlier one)
    flattened = {}
    for _section_name, section_options in parser.items():
        for option, raw_value in section_options.items():
            flattened[option.upper()] = raw_value
    return flattened
2020-06-30 05:12:06 +00:00
2019-04-26 18:43:13 +00:00
def write_config_file(config: Dict[str, str], out_dir: str = None) -> ConfigDict:
    """Merge `config` into the ini-formatted config file in `out_dir` and save it.

    Steps, in order:
      1. create the file with CONFIG_HEADER if it does not exist yet
      2. copy the current contents to a `.bak` file next to it
      3. place every key of `config` into the CONFIG_DEFAULTS section that declares it
      4. make sure SERVER_CONFIG has a SECRET_KEY, generating a random one if needed
      5. write the file, then re-load the whole config to validate it

    Returns a dict of upper-cased key -> re-loaded value for the keys in `config`.
    If re-loading fails for any reason, the backup is restored, the backup file
    is removed and an empty dict is returned.

    Raises IndexError if a key of `config` is not declared in any CONFIG_DEFAULTS section.
    """
    from .system import atomic_write

    base_dir = out_dir or Path(os.getenv(' OUTPUT_DIR ', ' . ')).resolve()
    config_path = Path(base_dir) / CONFIG_FILENAME
    backup_path = f' { config_path } .bak '

    if not config_path.exists():
        atomic_write(config_path, CONFIG_HEADER)

    parser = ConfigParser()
    parser.optionxform = str  # stop ConfigParser from lower-casing option names
    parser.read(config_path)

    # keep a copy of the current file so it can be restored if validation fails
    with open(config_path, ' r ') as current:
        atomic_write(backup_path, current.read())

    def section_for(option):
        # name of the first CONFIG_DEFAULTS section that declares this option
        return [name for name, opts in CONFIG_DEFAULTS.items() if option in opts][0]

    # merge each new value into its section, creating the section when it is missing
    for option, value in config.items():
        section_name = section_for(option)
        already_set = dict(parser[section_name]) if section_name in parser else {}
        parser[section_name] = {**already_set, option: value}

    # always make sure there's a SECRET_KEY defined for Django
    has_server_section = ' SERVER_CONFIG ' in parser
    current_secret = None
    if has_server_section and ' SECRET_KEY ' in parser[' SERVER_CONFIG ']:
        current_secret = parser[' SERVER_CONFIG '][' SECRET_KEY ']

    if (not current_secret) or (' not a valid secret ' in current_secret):
        from django.utils.crypto import get_random_string
        alphabet = ' abcdefghijklmnopqrstuvwxyz0123456789-_+!. '
        new_secret = get_random_string(50, alphabet)
        if has_server_section:
            parser[' SERVER_CONFIG '][' SECRET_KEY '] = new_secret
        else:
            parser[' SERVER_CONFIG '] = {' SECRET_KEY ': new_secret}

    with open(config_path, ' w+ ') as destination:
        parser.write(destination)

    try:
        # validate the config by attempting to re-parse it
        reloaded = load_all_config()
        # NOTE(review): this success path returns before the backup cleanup below,
        # so the .bak file is left in place -- confirm whether that is intended.
        return {
            option.upper(): reloaded.get(option.upper())
            for option in config.keys()
        }
    except BaseException:
        # something went horribly wrong, revert to the previous version
        # (catches everything, including SystemExit, exactly like a bare `except:`)
        with open(backup_path, ' r ') as previous:
            atomic_write(config_path, previous.read())

    if Path(backup_path).exists():
        os.remove(backup_path)

    return {}
def load_config(defaults: ConfigDefaultDict,
                config: Optional[ConfigDict] = None,
                out_dir: Optional[str] = None,
                env_vars: Optional[os._Environ] = None,
                config_file_vars: Optional[Dict[str, str]] = None) -> ConfigDict:
    """Resolve every option declared in `defaults` and return the resulting config.

    Options are resolved in the iteration order of `defaults`. Each one is added
    to a copy of `config` as soon as it is loaded, so callable defaults can read
    the options resolved before them. `env_vars` falls back to os.environ and
    `config_file_vars` falls back to load_config_file(out_dir=out_dir).

    Raises SystemExit(0) if interrupted with Ctrl-C. Any other error is reported
    on stderr together with the offending key and then re-raised unchanged.
    """
    env_vars = env_vars or os.environ
    config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)

    extended_config: ConfigDict = config.copy() if config else {}
    for key, default in defaults.items():
        try:
            extended_config[key] = load_config_val(
                key,
                default=default[' default '],
                type=default.get(' type '),
                aliases=default.get(' aliases '),
                config=extended_config,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Exception as e:
            stderr()
            stderr(f' [X] Error while loading configuration value: { key } ', color=' red ', config=extended_config)
            stderr(' {} : {} '.format(e.__class__.__name__, e))
            stderr()
            stderr(' Check your config for mistakes and try again (your archive data is unaffected). ')
            stderr()
            stderr(' For config documentation and examples see: ')
            stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration ')
            stderr()
            # Re-raise the original error. The `raise SystemExit(2)` that used to
            # follow this statement could never execute and has been removed.
            raise

    return extended_config
2019-04-24 08:09:25 +00:00
2019-04-25 23:03:38 +00:00
# def write_config(config: ConfigDict):
# with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
2020-08-18 12:21:36 +00:00
def stdout(*args, color: Optional[str] = None, prefix: str = ' ', config: Optional[ConfigDict] = None) -> None:
    """Write `args` to sys.stdout as one line, optionally wrapped in a colour.

    The real escape codes are used only when `config` has a truthy USE_COLOR
    entry; otherwise the colourless ANSI palette is used. `color` must be a key
    of that palette. `prefix` is written before the message.
    """
    palette = DEFAULT_CLI_COLORS if (config or {}).get(' USE_COLOR ') else ANSI

    if color:
        opening = palette[color]
        message = ' '.join(map(str, args))
        pieces = [opening, message, palette[' reset '], ' \n ']
    else:
        pieces = [' '.join(map(str, args)), ' \n ']

    sys.stdout.write(prefix + ' '.join(pieces))
2019-04-25 23:03:38 +00:00
2020-08-18 12:21:36 +00:00
def stderr(*args, color: Optional[str] = None, prefix: str = '', config: Optional[ConfigDict] = None) -> None:
    """Write all positional args to sys.stderr as one newline-terminated line.

    Mirrors stdout(): args are str()-converted and space-joined, optionally
    wrapped in the escape code for `color` plus the 'reset' code, and
    preceded by `prefix`. The real color table is only used when `config` is
    provided and has a truthy USE_COLOR; otherwise the ANSI table is used.
    """
    colorize = (config or {}).get('USE_COLOR')
    codes = DEFAULT_CLI_COLORS if colorize else ANSI

    pieces = [' '.join(str(arg) for arg in args), '\n']
    if color:
        pieces = [codes[color], pieces[0], codes['reset'], pieces[1]]

    sys.stderr.write(prefix + ''.join(pieces))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix='    ', config: Optional[ConfigDict] = None) -> None:
    """Print a highlighted "Hint:" message to stderr.

    `text` is either a single string or a sequence of lines. The first line
    is printed after the colored "Hint:" label; any remaining lines are
    printed underneath it, indented past the label. Every line starts with
    `prefix`.
    """
    palette = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI

    if isinstance(text, str):
        headline, continuation = text, ()
    else:
        headline, continuation = text[0], text[1:]

    stderr('{}{lightred}Hint:{reset} {}'.format(prefix, headline, **palette))
    for extra_line in continuation:
        stderr('{}      {}'.format(prefix, extra_line))
2019-03-26 09:31:27 +00:00
2019-04-24 15:36:14 +00:00
def bin_version(binary: Optional[str]) -> Optional[str]:
    """check the presence and return valid version line of a specified binary

    Runs `<binary> --version` and returns the first three whitespace-separated
    columns of the first output line, joined by single spaces. Returns None
    when no binary is given, when it cannot be located, or when launching it
    raises OSError.
    """
    resolved = bin_path(binary)
    if not (binary and resolved):
        return None

    try:
        completed = run([resolved, "--version"], stdout=PIPE)
    except OSError:
        # binary is missing or not executable: report "no version" instead of crashing
        return None

    first_line = completed.stdout.strip().decode().split('\n')[0].strip()
    # take first 3 columns of first line of version info
    return ' '.join(first_line.split()[:3])
2019-04-11 07:42:35 +00:00
2019-04-24 15:36:14 +00:00
def bin_path(binary: Optional[str]) -> Optional[str]:
    """Return the best-known filesystem path for `binary`.

    Lookup order:
      1. ./node_modules/.bin/<binary> relative to the current working
         directory (returned as a resolved absolute path when it exists),
      2. the result of searching $PATH with shutil.which(),
      3. the unchanged `binary` string as a last resort.

    Returns None only when `binary` is None.
    """
    if binary is None:
        return None

    node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
    if node_modules_bin.exists():
        return str(node_modules_bin.resolve())

    # shutil.which() is only guaranteed to handle str commands on every
    # supported Python/OS combination, so convert the expanded Path first.
    return shutil.which(str(Path(binary).expanduser())) or binary
2019-04-24 15:36:14 +00:00
def bin_hash(binary: Optional[str]) -> Optional[str]:
    """Return the md5 digest of the binary's file, formatted as 'md5:<hex>'.

    Returns None when no binary is given or when the path returned by
    bin_path() does not exist on disk.
    """
    if binary is None:
        return None

    location = bin_path(binary)
    if location is None:
        return None
    if not Path(location).exists():
        return None

    digest = md5()
    with io.open(location, mode='rb') as handle:
        # hash in buffer-sized pieces so large binaries are never fully loaded
        while True:
            piece = handle.read(io.DEFAULT_BUFFER_SIZE)
            if piece == b'':
                break
            digest.update(piece)

    return f'md5:{digest.hexdigest()}'
2019-04-11 07:42:35 +00:00
def find_chrome_binary() -> Optional[str]:
    """find any installed chrome binaries in the default locations

    Returns the first candidate name (not the resolved path) that
    shutil.which() can locate, or None when none of them is found.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    candidates = (
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )
    return next((candidate for candidate in candidates if shutil.which(candidate)), None)
2019-03-26 09:31:27 +00:00
def find_chrome_data_dir() -> Optional[str]:
    """find any installed chrome user data directories in the default locations

    Returns the absolute path (as a str, matching the annotation) of the first
    candidate directory that exists, or None when none of them exists.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # make sure data dir finding precedence order always matches binary finding order
    default_profile_paths = (
        '~/.config/chromium',
        '~/Library/Application Support/Chromium',
        '~/AppData/Local/Chromium/User Data',
        '~/.config/chrome',
        '~/.config/google-chrome',
        '~/Library/Application Support/Google/Chrome',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/.config/google-chrome-stable',
        '~/.config/google-chrome-beta',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Google/Chrome SxS/User Data',
        '~/.config/google-chrome-unstable',
        '~/.config/google-chrome-dev',
    )
    for path in default_profile_paths:
        # Path.resolve() does not expand "~"; without expanduser() every
        # candidate would be looked up as "<cwd>/~/..." and never be found.
        full_path = Path(path).expanduser().resolve()
        if full_path.exists():
            return str(full_path)
    return None
2019-04-24 08:09:25 +00:00
def wget_supports_compression(config):
    """Return True when the configured wget accepts the --compression flag.

    Runs `<WGET_BINARY> --compression=auto --help` with output discarded and
    reports whether it exited with status 0. Returns False when the binary
    cannot be launched at all (OSError, which includes FileNotFoundError).
    """
    probe = [config['WGET_BINARY'], "--compression=auto", "--help"]
    try:
        exit_status = run(probe, stdout=DEVNULL, stderr=DEVNULL).returncode
    except OSError:
        return False
    return exit_status == 0
2019-03-26 23:21:34 +00:00
2019-04-24 15:36:14 +00:00
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
    """Describe the code directories ArchiveBox runs from.

    For each of PACKAGE_DIR and TEMPLATES_DIR, returns its resolved path, an
    always-True 'enabled' flag, and an 'is_valid' flag that is True when the
    expected marker entry ('__main__.py' / 'static') exists inside it.
    """
    # (config key, entry that must exist inside the directory for it to be valid)
    expected_markers = (
        ('PACKAGE_DIR', '__main__.py'),
        ('TEMPLATES_DIR', 'static'),
        # NODE_MODULES_DIR is not reported yet
    )

    locations = {}
    for key, marker in expected_markers:
        directory = config[key]
        locations[key] = {
            'path': directory.resolve(),
            'enabled': True,
            'is_valid': (directory / marker).exists(),
        }
    return locations
2019-04-27 21:26:24 +00:00
def get_external_locations(config: ConfigDict) -> ConfigValue:
    """Describe optional user-supplied paths that live outside the archive.

    Covers CHROME_USER_DATA_DIR (valid when it contains a 'Default' entry)
    and COOKIES_FILE (valid when the file exists). An unset (None) path is
    reported with path None and is_valid False. 'enabled' is the raw result
    of `<USE_* flag> and <path>`, so it is not necessarily a bool.
    """
    def resolved_or_none(path):
        if path is None:
            return None
        return Path(path).resolve()

    chrome_data_dir = config['CHROME_USER_DATA_DIR']
    cookies_file = config['COOKIES_FILE']

    if chrome_data_dir is None:
        chrome_dir_valid = False
    else:
        chrome_dir_valid = (Path(chrome_data_dir) / 'Default').exists()

    if cookies_file is None:
        cookies_valid = False
    else:
        cookies_valid = Path(cookies_file).exists()

    return {
        'CHROME_USER_DATA_DIR': {
            'path': resolved_or_none(chrome_data_dir),
            'enabled': config['USE_CHROME'] and chrome_data_dir,
            'is_valid': chrome_dir_valid,
        },
        'COOKIES_FILE': {
            'path': resolved_or_none(cookies_file),
            'enabled': config['USE_WGET'] and cookies_file,
            'is_valid': cookies_valid,
        },
    }
2019-04-24 15:36:14 +00:00
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """Describe the files and folders that make up an archive collection.

    Every entry carries its resolved 'path', an always-True 'enabled' flag
    and an 'is_valid' flag. OUTPUT_DIR counts as valid when it contains the
    SQL index file; every other entry is valid when its own path exists.
    """
    output_dir = config['OUTPUT_DIR']
    sql_index = output_dir / SQL_INDEX_FILENAME

    locations = {
        'OUTPUT_DIR': {
            'path': output_dir.resolve(),
            'enabled': True,
            # a data folder is only usable once it holds the SQL index
            'is_valid': sql_index.exists(),
        },
    }

    for key in ('SOURCES_DIR', 'LOGS_DIR', 'ARCHIVE_DIR', 'CONFIG_FILE'):
        location = config[key]
        locations[key] = {
            'path': location.resolve(),
            'enabled': True,
            'is_valid': location.exists(),
        }

    locations['SQL_INDEX'] = {
        'path': sql_index.resolve(),
        'enabled': True,
        'is_valid': sql_index.exists(),
    }
    return locations
2019-04-24 15:36:14 +00:00
def get_dependency_info(config: ConfigDict) -> ConfigValue:
    """Describe every external binary ArchiveBox can use.

    Returns a dict keyed by the binary's config name. Each value holds:
      'path'     - bin_path() of the configured binary,
      'version'  - the detected version string stored in config,
      'hash'     - bin_hash() of the configured binary,
      'enabled'  - whether the matching USE_* toggle is on (always True for
                   ArchiveBox, Python and Django),
      'is_valid' - whether a version was detected for that same dependency
                   (always True for ArchiveBox itself).
    """

    def describe(binary_key: str, version_key: str, enabled, is_valid=None):
        # Build every field from one binary/version key pair so an entry can
        # never report the hash or validity of a different dependency.
        version = config[version_key]
        return {
            'path': bin_path(config[binary_key]),
            'version': version,
            'hash': bin_hash(config[binary_key]),
            'enabled': enabled,
            'is_valid': bool(version) if is_valid is None else is_valid,
        }

    return {
        'ARCHIVEBOX_BINARY': describe('ARCHIVEBOX_BINARY', 'VERSION', True, is_valid=True),
        'PYTHON_BINARY': describe('PYTHON_BINARY', 'PYTHON_VERSION', True),
        'DJANGO_BINARY': describe('DJANGO_BINARY', 'DJANGO_VERSION', True),
        'CURL_BINARY': describe('CURL_BINARY', 'CURL_VERSION', config['USE_CURL']),
        'WGET_BINARY': describe('WGET_BINARY', 'WGET_VERSION', config['USE_WGET']),
        'NODE_BINARY': describe('NODE_BINARY', 'NODE_VERSION', config['USE_NODE']),
        'SINGLEFILE_BINARY': describe('SINGLEFILE_BINARY', 'SINGLEFILE_VERSION', config['USE_SINGLEFILE']),
        'READABILITY_BINARY': describe('READABILITY_BINARY', 'READABILITY_VERSION', config['USE_READABILITY']),
        'MERCURY_BINARY': describe('MERCURY_BINARY', 'MERCURY_VERSION', config['USE_MERCURY']),
        'GIT_BINARY': describe('GIT_BINARY', 'GIT_VERSION', config['USE_GIT']),
        'YOUTUBEDL_BINARY': describe('YOUTUBEDL_BINARY', 'YOUTUBEDL_VERSION', config['USE_YOUTUBEDL']),
        'CHROME_BINARY': describe('CHROME_BINARY', 'CHROME_VERSION', config['USE_CHROME']),
    }
2019-03-23 02:05:45 +00:00
2019-04-24 15:36:14 +00:00
def get_chrome_info(config: ConfigDict) -> ConfigValue:
    """Return the subset of config options that affect how Chrome is run.

    The values are copied unchanged from `config`; a missing key raises
    KeyError.
    """
    chrome_option_names = (
        'TIMEOUT',
        'RESOLUTION',
        'CHECK_SSL_VALIDITY',
        'CHROME_BINARY',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )
    return {option: config[option] for option in chrome_option_names}
2019-04-11 07:42:35 +00:00
2019-03-23 03:00:53 +00:00
2019-04-24 08:09:25 +00:00
################################## Load Config #################################
2019-04-25 23:00:25 +00:00
def load_all_config():
    """Load every config section in order, then the derived values.

    Each section of CONFIG_DEFAULTS is loaded on top of the values loaded so
    far; DERIVED_CONFIG_DEFAULTS is loaded last, on top of all of them.
    """
    loaded: ConfigDict = {}
    for section_defaults in CONFIG_DEFAULTS.values():
        loaded = load_config(section_defaults, loaded)
    return load_config(DERIVED_CONFIG_DEFAULTS, loaded)
# Load the full config once at import time and expose every option as a
# module-level global (so e.g. `from .config import TIMEOUT` works).
CONFIG = load_all_config()
globals().update(CONFIG)

# Timezone set as UTC
os.environ["TZ"] = 'UTC'

# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
# kept for backwards compatibility: this only extends Python's import path
sys.path.append(NODE_BIN_PATH)
# subprocesses locate executables through the PATH environment variable, not
# sys.path, so the directory has to be added there as well (appended last so
# it never shadows a system-wide binary)
if NODE_BIN_PATH not in os.environ.get('PATH', '').split(os.pathsep):
    os.environ['PATH'] = os.pathsep.join(
        entry for entry in (os.environ.get('PATH', ''), NODE_BIN_PATH) if entry
    )
2019-04-25 23:00:25 +00:00
2019-04-24 08:09:25 +00:00
############################## Importable Checkers #############################
2019-04-11 07:42:35 +00:00
2019-04-24 15:36:14 +00:00
def check_system_config(config: ConfigDict = CONFIG) -> None:
    """Abort with exit code 2 when the host environment is unusable.

    Checks, in order: not running as root, Python >= 3.6, a UTF-8 Python
    encoding, and (when CHROME_USER_DATA_DIR is set) that it contains a
    'Default' profile entry. Each failed check prints an explanation to
    stderr and raises SystemExit(2).
    """
    ### Check system environment
    if config['USER'] == 'root':
        stderr('[!] ArchiveBox should never be run as root!', color='red')
        stderr('    For more information, see the security overview documentation:')
        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
        raise SystemExit(2)

    ### Check Python environment
    if sys.version_info[:3] < (3, 6, 0):
        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
        stderr('    See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
        raise SystemExit(2)

    if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
        stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
        stderr('    To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
        stderr('    Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
        stderr('')
        stderr('    Confirm that it\'s fixed by opening a new shell and running:')
        stderr('        python3 -c "import sys; print(sys.stdout.encoding)"   # should output UTF-8')
        raise SystemExit(2)

    chrome_user_data_dir = config['CHROME_USER_DATA_DIR']
    if chrome_user_data_dir is not None:
        if not (Path(chrome_user_data_dir) / 'Default').exists():
            stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
            stderr(f'    {config["CHROME_USER_DATA_DIR"]}')
            stderr('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
            stderr('    For more info see:')
            stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
            # The value is not guaranteed to be a str (a pathlib.Path has no
            # .split()), so normalize once and use the string for both the
            # membership test and the suggested replacement value.
            data_dir_text = str(chrome_user_data_dir)
            if '/Default' in data_dir_text:
                stderr()
                stderr('    Try removing /Default from the end e.g.:')
                stderr('        CHROME_USER_DATA_DIR="{}"'.format(data_dir_text.split('/Default')[0]))
            raise SystemExit(2)
2019-04-24 08:09:25 +00:00
2020-08-10 14:00:10 +00:00
2019-04-24 15:36:14 +00:00
def check_dependencies(config: ConfigDict = CONFIG, show_help: bool = True) -> None:
    """Print warnings about missing dependencies and unsafe timeouts.

    A dependency is reported when it is enabled but not valid; the list is
    only printed when `show_help` is true. Timeout warnings are printed
    regardless of `show_help`. Nothing is raised: these are warnings only.
    """
    node_installable = ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY')

    missing = []
    for dep_name, dep_info in config['DEPENDENCIES'].items():
        if dep_info['enabled'] and not dep_info['is_valid']:
            missing.append((dep_name, dep_info))

    if missing and show_help:
        stderr(f'[!] Warning: Missing {len(missing)} recommended dependencies', color='lightyellow')
        for dep_name, dep_info in missing:
            shown_path = dep_info['path'] or 'unable to find binary'
            shown_version = dep_info['version'] or 'unable to detect version'
            stderr('    ! {}: {} ({})'.format(dep_name, shown_path, shown_version))
            if dep_name in node_installable:
                hint(
                    (
                        'npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"',
                        f'or archivebox config --set SAVE_{dep_name.rsplit("_", 1)[0]}=False to silence this warning',
                        '',
                    ),
                    prefix='      ',
                )
        stderr('')

    # Pick the explanation that matches how low TIMEOUT is; both cases share
    # the same headline and the same closing advice.
    timeout_details = None
    if config['TIMEOUT'] < 5:
        timeout_details = (
            '    You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.',
            '    (Setting it to somewhere between 30 and 3000 seconds is recommended)',
        )
    elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
        timeout_details = (
            '    Chrome will fail to archive all sites if set to less than ~15 seconds.',
            '    (Setting it to somewhere between 30 and 300 seconds is recommended)',
        )

    if timeout_details is not None:
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        for detail in timeout_details:
            stderr(detail)
        stderr()
        stderr('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
        stderr()

    if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
        stderr('    Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
        stderr('    (Setting it somewhere over 60 seconds is recommended)')
        stderr()
        stderr('    If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
        stderr()
2019-04-11 07:42:35 +00:00
2020-11-28 06:05:35 +00:00
def check_data_folder(out_dir: Union[str, Path, None] = None, config: ConfigDict = CONFIG) -> None:
    """Verify that `out_dir` (default: config OUTPUT_DIR) is a usable archive.

    Raises SystemExit(2) when the folder has no SQL index and SystemExit(3)
    when the index has unapplied migrations, printing instructions to stderr
    in both cases. When the checks pass, the sources sub-folder is created if
    it does not exist yet.
    """
    output_dir = out_dir or config['OUTPUT_DIR']
    assert isinstance(output_dir, (str, Path))

    sql_index_exists = (Path(output_dir) / SQL_INDEX_FILENAME).exists()
    if not sql_index_exists:
        stderr('[X] No archivebox index found in the current directory.', color='red')
        stderr(f'    {output_dir}', color='lightyellow')
        stderr()
        stderr('    {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
        stderr('        cd path/to/your/archive/folder')
        stderr('        archivebox [command]')
        stderr()
        stderr('    {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
        stderr('        archivebox init')
        raise SystemExit(2)

    # imported here rather than at module level
    from .index.sql import list_migrations

    pending_migrations = [name for status, name in list_migrations() if not status]

    # The index is known to exist at this point (the missing-index case exited
    # above), so pending migrations are the only remaining reason to upgrade.
    if pending_migrations:
        pending_operation = f'apply the {len(pending_migrations)} pending migrations'

        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
        stderr(f'    {output_dir}')
        stderr()
        stderr(f'    To upgrade it to the latest version and {pending_operation} run:')
        stderr('        archivebox init')
        raise SystemExit(3)

    sources_dir = Path(output_dir) / SOURCES_DIR_NAME
    if not sources_dir.exists():
        sources_dir.mkdir()
2020-07-13 15:24:36 +00:00
2019-04-24 08:09:25 +00:00
2020-09-15 19:05:48 +00:00
def setup_django(out_dir: Optional[Path] = None, check_db=False, config: ConfigDict = CONFIG) -> None:
    """Configure and initialize Django for the given archive folder.

    Runs the system checks against `config`, puts PACKAGE_DIR on sys.path,
    sets the OUTPUT_DIR and DJANGO_SETTINGS_MODULE environment variables
    (only if they are not already set) and calls django.setup().

    Args:
        out_dir: archive folder to use; defaults to config['OUTPUT_DIR'].
        check_db: when true, assert that the SQL index file exists in the
            archive folder after Django is set up.
        config: configuration to read PACKAGE_DIR / OUTPUT_DIR from.

    Raises:
        AssertionError: on a non-Path directory, a missing settings.py, or
            (with check_db) a missing database file.
        SystemExit: with code 2 on KeyboardInterrupt or a failed system check.
    """
    # validate the same config this call was asked to use, not the global default
    check_system_config(config)

    output_dir = out_dir or Path(config['OUTPUT_DIR'])

    assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)

    try:
        import django

        package_dir = str(config['PACKAGE_DIR'])
        # avoid stacking a duplicate entry every time setup_django() is called
        if package_dir not in sys.path:
            sys.path.append(package_dir)

        os.environ.setdefault('OUTPUT_DIR', str(output_dir))
        assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
        django.setup()

        if check_db:
            sql_index_path = output_dir / SQL_INDEX_FILENAME
            # report the directory that was actually checked, which differs
            # from config["OUTPUT_DIR"] whenever out_dir is passed explicitly
            assert sql_index_path.exists(), (
                f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {output_dir}')
    except KeyboardInterrupt:
        raise SystemExit(2)
2020-07-23 15:50:42 +00:00
2020-07-28 09:51:02 +00:00
# Set the process umask to the complement of OUTPUT_PERMISSIONS (parsed as an
# octal string), so newly created files default to those permissions.
# NOTE(review): OUTPUT_PERMISSIONS is presumably one of the names injected by
# globals().update(CONFIG), hence the F821 suppression - confirm.
os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8))  # noqa: F821