2
0
Fork 0
mirror of https://github.com/ArchiveBox/ArchiveBox synced 2025-02-23 00:38:27 +00:00

move pdf, screenshot, dom, singlefile, and ytdlp extractor config to new plugin system

This commit is contained in:
Nick Sweeting 2024-09-25 00:42:26 -07:00
parent a2a586e369
commit a5ffd4e9d3
No known key found for this signature in database
11 changed files with 333 additions and 353 deletions

View file

@ -30,6 +30,7 @@ import inspect
import getpass
import shutil
import requests
import archivebox
from hashlib import md5
from pathlib import Path
@ -62,7 +63,6 @@ from .misc.logging import (
stderr,
hint,
)
from .misc.checks import check_system_config
# print('STARTING CONFIG LOADING')
@ -167,15 +167,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
'CHROME_TIMEOUT': {'type': int, 'default': 0},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'CHROME_EXTRA_ARGS': {'type': list, 'default': None},
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: [
'--restrict-filenames',
@ -267,7 +260,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
@ -551,7 +543,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
@ -595,7 +587,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
@ -620,15 +612,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] or find_chrome_binary()},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and c['CHROME_BINARY'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
'CHROME_USER_AGENT': {'default': lambda c: c['CHROME_USER_AGENT'].format(**c)},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
@ -638,8 +621,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'CHROME_EXTRA_ARGS': {'default': lambda c: c['CHROME_EXTRA_ARGS'] or []},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
@ -1183,21 +1165,20 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_YOUTUBEDL'],
'is_valid': bool(config['YOUTUBEDL_VERSION']),
},
'CHROME_BINARY': {
'path': bin_path(config['CHROME_BINARY']),
'version': config['CHROME_VERSION'],
'hash': bin_hash(config['CHROME_BINARY']),
'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']),
},
'RIPGREP_BINARY': {
'path': bin_path(config['RIPGREP_BINARY']),
'version': config['RIPGREP_VERSION'],
'hash': bin_hash(config['RIPGREP_BINARY']),
'enabled': config['USE_RIPGREP'],
'is_valid': bool(config['RIPGREP_VERSION']),
},
# TODO: add an entry for the sonic search backend?
# 'CHROME_BINARY': {
# 'path': bin_path(config['CHROME_BINARY']),
# 'version': config['CHROME_VERSION'],
# 'hash': bin_hash(config['CHROME_BINARY']),
# 'enabled': config['USE_CHROME'],
# 'is_valid': bool(config['CHROME_VERSION']),
# },
# 'RIPGREP_BINARY': {
# 'path': bin_path(config['RIPGREP_BINARY']),
# 'version': config['RIPGREP_VERSION'],
# 'hash': bin_hash(config['RIPGREP_BINARY']),
# 'enabled': config['USE_RIPGREP'],
# 'is_valid': bool(config['RIPGREP_VERSION']),
# },
# 'SONIC_BINARY': {
# 'path': bin_path(config['SONIC_BINARY']),
# 'version': config['SONIC_VERSION'],
@ -1207,20 +1188,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
# },
}
def get_chrome_info(config: ConfigDict) -> ConfigValue:
    """Gather the Chrome-related settings from ``config`` into one dict.

    Most values are copied straight from ``config`` under the same key; the
    ``CHROME_BINARY`` entry is the configured value passed through ``bin_path``.
    Keys are inserted in a fixed order: general settings first, then the
    binary, then the remaining ``CHROME_*`` options.
    """
    general_keys = ('TIMEOUT', 'RESOLUTION', 'CHECK_SSL_VALIDITY')
    chrome_keys = (
        'CHROME_TIMEOUT',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )

    chrome_info = {key: config[key] for key in general_keys}
    # the binary is the only entry that is transformed rather than copied
    chrome_info['CHROME_BINARY'] = bin_path(config['CHROME_BINARY'])
    for key in chrome_keys:
        chrome_info[key] = config[key]
    return chrome_info
# ******************************************************************************
# ******************************************************************************
# ******************************** Load Config *********************************
@ -1264,27 +1231,6 @@ os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # n
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
sys.path.append(CONFIG.NODE_BIN_PATH)
# OPTIONAL: also look around the host system for node modules to use
# avoid enabling this unless absolutely needed:
# having overlapping potential sources of libs is a big source of bugs and confusion for users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)
# disable stderr "you really shouldn't disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
import urllib3
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# get SQLite database version, compile options, and runtime options
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()
########################### Config Validity Checkers ###########################
@ -1308,13 +1254,19 @@ def bump_startup_progress_bar():
if INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS.update(INITIAL_STARTUP_PROGRESS_TASK, advance=1) # type: ignore
def setup_django_minimal():
    """Run the bare-minimum Django initialization.

    Appends the ArchiveBox package directory to ``sys.path``, sets
    ``DJANGO_SETTINGS_MODULE`` to ``core.settings`` unless the environment
    already defines it, and then calls ``django.setup()``.
    """
    package_dir = str(archivebox.PACKAGE_DIR)
    sys.path.append(package_dir)

    # setdefault: an existing DJANGO_SETTINGS_MODULE value is left untouched
    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

    django.setup()
def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG, in_memory_db=False) -> None:
global INITIAL_STARTUP_PROGRESS
global INITIAL_STARTUP_PROGRESS_TASK
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
check_system_config(config)
output_dir = out_dir or Path(config['OUTPUT_DIR'])

View file

@ -8,13 +8,6 @@ from ..system import run, chmod_file, atomic_write
from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..logging_util import TimedProgress
@ -25,6 +18,8 @@ def get_output_path():
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -33,42 +28,48 @@ def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
if (out_dir / get_output_path()).stat().st_size > 1:
return False
return SAVE_DOM
return CHROME_CONFIG.SAVE_DOM
@enforce_types
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output_path = out_dir / output
cmd = [
*chrome_args(),
str(CHROME_BIN.abspath),
*CHROME_CONFIG.chrome_args(),
'--dump-dom',
link.url
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
atomic_write(output_path, result.stdout)
if result.returncode:
hints = result.stderr.decode()
hints = result.stderr
raise ArchiveError('Failed to save DOM', hints)
chmod_file(output, cwd=str(out_dir))
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
CHROME_BINARY.chrome_cleanup_lockfile()
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CHROME_VERSION,
cmd_version=str(CHROME_BIN.version),
output=output,
status=status,
**timer.stats,

View file

@ -5,20 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
dedupe,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
YOUTUBEDL_ARGS,
YOUTUBEDL_EXTRA_ARGS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress
@ -38,6 +25,8 @@ def get_embed_path(archiveresult=None):
@enforce_types
def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.ytdlp.apps import YTDLP_CONFIG
if is_static_file(link.url):
return False
@ -45,45 +34,52 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SAVE_MEDIA
return YTDLP_CONFIG.USE_YTDLP
@enforce_types
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=0) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
# from plugins_extractor.chrome.apps import CHROME_CONFIG
from plugins_extractor.ytdlp.apps import YTDLP_BINARY, YTDLP_CONFIG
YTDLP_BIN = YTDLP_BINARY.load()
assert YTDLP_BIN.abspath and YTDLP_BIN.version
timeout = timeout or YTDLP_CONFIG.YTDLP_TIMEOUT
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
output_path = out_dir / output
output_path.mkdir(exist_ok=True)
# later options take precedence
options = [
*YOUTUBEDL_ARGS,
*YOUTUBEDL_EXTRA_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
*YTDLP_CONFIG.YTDLP_EXTRA_ARGS,
*([] if YTDLP_CONFIG.YTDLP_CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
]
cmd = [
YOUTUBEDL_BINARY,
str(YTDLP_BIN.abspath),
*dedupe(options),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
result = run(cmd, cwd=str(output_path), timeout=timeout + 1, text=True)
chmod_file(output, cwd=str(out_dir))
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
if ('ERROR: Unsupported URL' in result.stderr
or 'HTTP Error 404' in result.stderr
or 'HTTP Error 403' in result.stderr
or 'URL could be a direct video link' in result.stderr
or 'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
'Got yt-dlp response code: {}.'.format(result.returncode),
*result.stderr.split('\n'),
)
raise ArchiveError('Failed to save media', hints)
except Exception as err:
@ -117,7 +113,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=YOUTUBEDL_VERSION,
cmd_version=str(YTDLP_BIN.version),
output=output,
status=status,
index_texts=index_texts,

View file

@ -8,13 +8,6 @@ from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..logging_util import TimedProgress
@ -25,6 +18,8 @@ def get_output_path():
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -32,34 +27,40 @@ def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optiona
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SAVE_PDF
return CHROME_CONFIG.SAVE_PDF
@enforce_types
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
cmd = [
*chrome_args(),
str(CHROME_BIN.abspath),
*CHROME_CONFIG.chrome_args(),
'--print-to-pdf',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
hints = (result.stderr or result.stdout)
raise ArchiveError('Failed to save PDF', hints)
chmod_file(get_output_path(), cwd=str(out_dir))
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
CHROME_BINARY.chrome_cleanup_lockfile()
finally:
timer.end()
@ -67,7 +68,7 @@ def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) ->
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CHROME_VERSION,
cmd_version=str(CHROME_BINARY.version),
output=output,
status=status,
**timer.stats,

View file

@ -5,17 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
chrome_args,
chrome_cleanup,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..util import enforce_types, is_static_file
from ..logging_util import TimedProgress
@ -25,6 +15,8 @@ def get_output_path():
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.chrome.apps import CHROME_CONFIG
if is_static_file(link.url):
return False
@ -32,40 +24,45 @@ def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SAVE_SCREENSHOT
return CHROME_CONFIG.SAVE_SCREENSHOT
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = get_output_path()
cmd = [
*chrome_args(),
str(CHROME_BIN.abspath),
*CHROME_CONFIG.chrome_args(),
'--screenshot',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
hints = (result.stderr or result.stdout)
raise ArchiveError('Failed to save screenshot', hints)
chmod_file(output, cwd=str(out_dir))
except Exception as err:
status = 'failed'
output = err
chrome_cleanup()
CHROME_BINARY.chrome_cleanup_lockfile()
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=CHROME_VERSION,
cmd_version=str(CHROME_BIN.version),
output=output,
status=status,
**timer.stats,

View file

@ -7,22 +7,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
chrome_args,
dedupe,
)
from ..config import (
TIMEOUT,
SAVE_SINGLEFILE,
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
SINGLEFILE_EXTRA_ARGS,
CHROME_BINARY,
COOKIES_FILE,
)
from ..util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress
@ -32,6 +17,8 @@ def get_output_path():
@enforce_types
def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG
if is_static_file(link.url):
return False
@ -39,30 +26,35 @@ def should_save_singlefile(link: Link, out_dir: Optional[Path]=None, overwrite:
if not overwrite and (out_dir / get_output_path()).exists():
return False
return SAVE_SINGLEFILE
return SINGLEFILE_CONFIG.SAVE_SINGLEFILE
@enforce_types
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
"""download full site using single-file"""
from plugins_extractor.chrome.apps import CHROME_CONFIG, CHROME_BINARY
from plugins_extractor.singlefile.apps import SINGLEFILE_CONFIG, SINGLEFILE_BINARY
CHROME_BIN = CHROME_BINARY.load()
assert CHROME_BIN.abspath and CHROME_BIN.version
SINGLEFILE_BIN = SINGLEFILE_BINARY.load()
assert SINGLEFILE_BIN.abspath and SINGLEFILE_BIN.version
out_dir = out_dir or Path(link.link_dir)
output = get_output_path()
browser_args = chrome_args(CHROME_TIMEOUT=0)
browser_args = CHROME_CONFIG.chrome_args(CHROME_TIMEOUT=0)
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
# later options take precedence
options = [
'--browser-executable-path={}'.format(CHROME_BINARY),
*(["--browser-cookies-file={}".format(COOKIES_FILE)] if COOKIES_FILE else []),
browser_args,
*SINGLEFILE_ARGS,
*SINGLEFILE_EXTRA_ARGS,
'--browser-executable-path={}'.format(CHROME_BIN.abspath),
*(["--browser-cookies-file={}".format(SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE)] if SINGLEFILE_CONFIG.SINGLEFILE_COOKIES_FILE else []),
'--browser-args={}'.format(json.dumps(browser_args)),
*SINGLEFILE_CONFIG.SINGLEFILE_EXTRA_ARGS,
]
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
str(SINGLEFILE_BIN.abspath),
*dedupe(options),
link.url,
output,
@ -72,13 +64,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
timer = TimedProgress(timeout, prefix=' ')
result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True, capture_output=True)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (
@ -93,9 +85,9 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
cmd[2] = cmd[2].replace('"', "\\\"")
if result:
err.hints = (result.stdout + result.stderr).decode().split('\n')
err.hints = (result.stdout + result.stderr).split('\n')
output = err
finally:
timer.end()
@ -103,7 +95,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
return ArchiveResult(
cmd=cmd,
pwd=str(out_dir),
cmd_version=SINGLEFILE_VERSION,
cmd_version=str(SINGLEFILE_BIN.version),
output=output,
status=status,
**timer.stats,

View file

@ -1,10 +1,10 @@
__package__ = 'archivebox'
import os
import time
import sys
import shutil
import platform
import archivebox
from typing import Dict, List, Optional, Iterable, IO, Union
from pathlib import Path
@ -69,6 +69,7 @@ from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint
from .misc.checks import check_data_folder, check_dependencies
from .config import (
setup_django_minimal,
ConfigDict,
ANSI,
IS_TTY,
@ -81,8 +82,6 @@ from .config import (
TIMEZONE,
ENFORCE_ATOMIC_WRITES,
OUTPUT_PERMISSIONS,
PYTHON_BINARY,
ARCHIVEBOX_BINARY,
ONLY_NEW,
OUTPUT_DIR,
SOURCES_DIR,
@ -95,31 +94,22 @@ from .config import (
HTML_INDEX_FILENAME,
SQL_INDEX_FILENAME,
ALLOWED_IN_OUTPUT_DIR,
SEARCH_BACKEND_ENGINE,
LDAP,
get_version,
write_config_file,
VERSION,
VERSIONS_AVAILABLE,
CAN_UPGRADE,
COMMIT_HASH,
BUILD_TIME,
CODE_LOCATIONS,
DATA_LOCATIONS,
DEPENDENCIES,
CHROME_BINARY,
CHROME_VERSION,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
SINGLEFILE_VERSION,
READABILITY_VERSION,
MERCURY_VERSION,
NODE_VERSION,
load_all_config,
CONFIG,
USER_CONFIG,
ADMIN_USERNAME,
ADMIN_PASSWORD,
get_real_name,
setup_django,
)
@ -216,6 +206,11 @@ def version(quiet: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Print the ArchiveBox version and dependency information"""
setup_django_minimal()
from plugins_sys.config.apps import SEARCH_BACKEND_CONFIG, STORAGE_CONFIG, SHELL_CONFIG
from plugins_auth.ldap.apps import LDAP_CONFIG
from django.conf import settings
print(VERSION)
if not quiet:
@ -227,7 +222,7 @@ def version(quiet: bool=False,
p = platform.uname()
print(
'ArchiveBox v{}'.format(get_version(CONFIG)),
'ArchiveBox v{}'.format(archivebox.__version__),
f'COMMIT_HASH={COMMIT_HASH[:7] if COMMIT_HASH else "unknown"}',
f'BUILD_TIME={BUILD_TIME}',
)
@ -241,29 +236,35 @@ def version(quiet: bool=False,
)
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
print(
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_USER={PUID}:{PGID}',
f'FS_PERMS={OUTPUT_PERMISSIONS}',
f'FS_USER={SHELL_CONFIG.PUID}:{SHELL_CONFIG.PGID}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
)
print(
f'DEBUG={DEBUG}',
f'IS_TTY={IS_TTY}',
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
f'TZ={TIMEZONE}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_CONFIG.LDAP_ENABLED}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
print('{white}[i] Old dependency versions:{reset}'.format(**ANSI))
for name, dependency in DEPENDENCIES.items():
print(printable_dependency_version(name, dependency))
# add a newline between core dependencies and extractor dependencies for easier reading
if name == 'ARCHIVEBOX_BINARY':
print()
print()
print('{white}[i] New dependency versions:{reset}'.format(**ANSI))
for name, binary in settings.BINARIES.items():
loaded_bin = binary.load()
print('', '' if loaded_bin.is_valid else 'X', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(15), loaded_bin.abspath)
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
for name, path in CODE_LOCATIONS.items():
@ -431,10 +432,11 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
from django.contrib.auth.models import User
from plugins_sys.config.apps import SERVER_CONFIG
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
if (SERVER_CONFIG.ADMIN_USERNAME and SERVER_CONFIG.ADMIN_PASSWORD) and not User.objects.filter(username=SERVER_CONFIG.ADMIN_USERNAME).exists():
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
User.objects.create_superuser(username=SERVER_CONFIG.ADMIN_USERNAME, password=SERVER_CONFIG.ADMIN_PASSWORD)
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
@ -693,8 +695,8 @@ def add(urls: Union[str, List[str]],
# tail_worker_logs(worker['stdout_logfile'])
if CAN_UPGRADE:
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
# if CAN_UPGRADE:
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
return new_links
@ -967,6 +969,8 @@ def list_folders(links: List[Link],
def setup(out_dir: Path=OUTPUT_DIR) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
if not (out_dir / ARCHIVE_DIR_NAME).exists():
run_subcommand('init', stdin=None, pwd=out_dir)
@ -980,24 +984,26 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
stderr('\n[+] Installing enabled ArchiveBox dependencies automatically...', color='green')
from plugins_pkg.pip.apps import PYTHON_BINARY
stderr('\n Installing YOUTUBEDL_BINARY automatically using pip...')
if YOUTUBEDL_VERSION:
print(f'{YOUTUBEDL_VERSION} is already installed', YOUTUBEDL_BINARY)
else:
try:
run_shell([
PYTHON_BINARY, '-m', 'pip',
PYTHON_BINARY.load().abspath, '-m', 'pip',
'install',
'--upgrade',
'--no-cache-dir',
'--no-warn-script-location',
'yt-dlp',
], capture_output=False, cwd=out_dir)
], capture_output=False, cwd=out_dir, text=True)
pkg_path = run_shell([
PYTHON_BINARY, '-m', 'pip',
PYTHON_BINARY.load().abspath, '-m', 'pip',
'show',
'yt-dlp',
], capture_output=True, text=True, cwd=out_dir).stdout.decode().split('Location: ')[-1].split('\n', 1)[0]
], capture_output=True, text=True, cwd=out_dir).stdout.split('Location: ')[-1].split('\n', 1)[0]
NEW_YOUTUBEDL_BINARY = Path(pkg_path) / 'yt-dlp' / '__main__.py'
os.chmod(NEW_YOUTUBEDL_BINARY, 0o777)
assert NEW_YOUTUBEDL_BINARY.exists(), f'yt-dlp must exist inside {pkg_path}'
@ -1006,33 +1012,18 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
stderr(f'[X] Failed to install python packages: {e}', color='red')
raise SystemExit(1)
if platform.machine() == 'armv7l':
stderr('\n Skip the automatic installation of CHROME_BINARY because playwright is not available on armv7.')
else:
stderr('\n Installing CHROME_BINARY automatically using playwright...')
if CHROME_VERSION:
print(f'{CHROME_VERSION} is already installed', CHROME_BINARY)
else:
try:
run_shell([
PYTHON_BINARY, '-m', 'pip',
'install',
'--upgrade',
'--no-cache-dir',
'--no-warn-script-location',
'playwright',
], capture_output=False, cwd=out_dir)
run_shell([PYTHON_BINARY, '-m', 'playwright', 'install', 'chromium'], capture_output=False, cwd=out_dir)
proc = run_shell([PYTHON_BINARY, '-c', 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)'], capture_output=True, text=True, cwd=out_dir)
NEW_CHROME_BINARY = proc.stdout.decode().strip() if isinstance(proc.stdout, bytes) else proc.stdout.strip()
assert NEW_CHROME_BINARY and len(NEW_CHROME_BINARY), 'CHROME_BINARY must contain a path'
config(f'CHROME_BINARY={NEW_CHROME_BINARY}', set=True, out_dir=out_dir)
except BaseException as e: # lgtm [py/catch-base-exception]
stderr(f'[X] Failed to install chromium using playwright: {e.__class__.__name__} {e}', color='red')
raise SystemExit(1)
from plugins_extractor.chrome.apps import CHROME_BINARY
CHROME_BINARY.load_or_install()
from plugins_pkg.npm.apps import NPM_BINARY
from plugins_extractor.singlefile.apps import SINGLEFILE_BINARY
SINGLEFILE_BINARY.load_or_install()
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
if not NODE_VERSION:
if not NPM_BINARY.load().version:
stderr('[X] You must first install node & npm using your system package manager', color='red')
hint([
'https://github.com/nodesource/distributions#table-of-contents',
@ -1077,7 +1068,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
stderr('\n[√] Set up ArchiveBox and its dependencies successfully.', color='green')
run_shell([PYTHON_BINARY, ARCHIVEBOX_BINARY, '--version'], capture_output=False, cwd=out_dir)
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
run_shell([ARCHIVEBOX_BINARY.load().abspath, '--version'], capture_output=False, cwd=out_dir)
@enforce_types
def config(config_options_str: Optional[str]=None,
@ -1192,6 +1185,8 @@ def schedule(add: bool=False,
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
check_data_folder(CONFIG)
setup_django_minimal()
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
Path(LOGS_DIR).mkdir(exist_ok=True)
@ -1212,7 +1207,7 @@ def schedule(add: bool=False,
'cd',
quoted(out_dir),
'&&',
quoted(ARCHIVEBOX_BINARY),
quoted(ARCHIVEBOX_BINARY.load().abspath),
*([
'add',
*(['--overwrite'] if overwrite else []),
@ -1300,8 +1295,8 @@ def schedule(add: bool=False,
print('\n{green}[√] Stopped.{reset}'.format(**ANSI))
raise SystemExit(1)
if CAN_UPGRADE:
hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
# if CAN_UPGRADE:
# hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
@enforce_types
@ -1386,6 +1381,7 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
"""Run an ArchiveBox Django management command"""
check_data_folder(CONFIG)
setup_django_minimal()
from django.core.management import execute_from_command_line
if (args and "createsuperuser" in args) and (IN_DOCKER and not IS_TTY):
@ -1393,7 +1389,9 @@ def manage(args: Optional[List[str]]=None, out_dir: Path=OUTPUT_DIR) -> None:
stderr(' docker run -it archivebox manage {}'.format(' '.join(args or ['...'])), color='lightyellow')
stderr('')
execute_from_command_line([f'{ARCHIVEBOX_BINARY} manage', *(args or ['help'])])
from plugins_pkg.pip.apps import ARCHIVEBOX_BINARY
execute_from_command_line([ARCHIVEBOX_BINARY.load().abspath, 'manage', *(args or ['help'])])
@enforce_types

View file

@ -1,5 +1,6 @@
__package__ = 'archivebox.plugins_extractor.chrome'
import sys
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar
@ -7,7 +8,8 @@ from typing import List, Optional, Dict, ClassVar
from django.conf import settings
# Depends on other PyPI/vendor packages:
from pydantic import InstanceOf, Field
from rich import print
from pydantic import InstanceOf, Field, model_validator
from pydantic_pkgr import (
BinProvider,
BinName,
@ -25,9 +27,12 @@ from plugantic.base_binary import BaseBinary, env
from plugantic.base_hook import BaseHook
# Depends on Other Plugins:
from plugins_sys.config.apps import ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from ...util import dedupe
CHROMIUM_BINARY_NAMES_LINUX = [
"chromium",
@ -82,11 +87,113 @@ def create_macos_app_symlink(target: Path, shortcut: Path):
class ChromeConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
CHROME_BINARY: str = Field(default='chrome')
CHROME_ARGS: List[str] | None = Field(default=None)
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
CHROME_DEFAULT_ARGS: List[str] = Field(default=lambda: ['--timeout={TIMEOUT-10}'])
USE_CHROME: bool = Field(default=True)
# Chrome Binary
CHROME_BINARY: str = Field(default='chrome')
CHROME_EXTRA_ARGS: List[str] = Field(default=[])
# Chrome Options Tuning
CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
CHROME_HEADLESS: bool = Field(default=True)
CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
# Cookies & Auth
CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
CHROME_USER_DATA_DIR: Path | None = Field(default=None)
CHROME_PROFILE_NAME: str = Field(default='Default')
# Extractor Toggles
SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')
@model_validator(mode='after')
def validate_use_chrome(self):
    """Warn about a too-low Chrome timeout and discard an unusable user data dir.

    Registered as a pydantic ``after`` model validator; always returns ``self``.
    All messages are printed to stderr. CHROME_USER_DATA_DIR is reset to None
    when it is unset, does not exist on disk, or does not contain the folder
    named by CHROME_PROFILE_NAME (a warning is printed only in the last case).
    """
    def warn(*parts):
        # every message emitted by this validator goes to stderr
        print(*parts, file=sys.stderr)

    if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
        warn(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
        warn(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
        warn(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
        warn()
        warn(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        warn(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
        warn()

    # if the user has specified a user data dir, make sure it's valid
    data_dir = self.CHROME_USER_DATA_DIR
    if not (data_dir and data_dir.exists()):
        # unset or missing on disk: silently fall back to no user data dir
        self.CHROME_USER_DATA_DIR = None
        return self

    # the directory exists; it is only usable if <user_data_dir>/<profile_name> exists too
    if (data_dir / self.CHROME_PROFILE_NAME).exists():
        return self

    warn(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
    warn(f' {self.CHROME_USER_DATA_DIR}')
    warn(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
    warn(' For more info see:')
    warn(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')

    data_dir_str = str(self.CHROME_USER_DATA_DIR)
    if '/Default' in data_dir_str:
        # the user most likely pointed at the profile folder instead of its parent
        warn()
        warn(' Try removing /Default from the end e.g.:')
        warn(' CHROME_USER_DATA_DIR="{}"'.format(data_dir_str.split('/Default')[0]))

    # a hard error is too annoying here, instead just set it to nothing
    # raise SystemExit(2)
    self.CHROME_USER_DATA_DIR = None
    return self
def chrome_args(self, **options) -> List[str]:
    """Build the list of Chrome command-line flags for this config.

    Keyword arguments override config fields by name; they are applied to a
    copy obtained from ``model_copy(update=...)``. The returned list holds
    flags only (the binary path is not included) and is passed through
    ``dedupe`` before being returned.
    """
    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
    cfg = self.model_copy(update=options)

    # start from a copy of the user-supplied extra args so the config's list is not mutated
    flags = list(cfg.CHROME_EXTRA_ARGS)

    if cfg.CHROME_HEADLESS:
        flags.append("--headless=new")  # expects chrome version >= 111

    if not cfg.CHROME_SANDBOX:
        # assume this means we are running inside a docker container:
        # in docker, GPU support is limited, sandboxing is unnecessary,
        # and SHM is limited to 64MB by default (which is too low to be usable).
        flags.extend([
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
            # "--password-store=basic",
        ])

    # disable automatic updating when running headless, as there's no user to see the upgrade prompts
    # NOTE(review): the single quotes are part of the argument value as written - confirm Chrome accepts them
    flags.append("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'")

    # set window size for screenshot/pdf/etc. rendering
    flags.append(f'--window-size={cfg.CHROME_RESOLUTION}')

    if not cfg.CHROME_CHECK_SSL_VALIDITY:
        flags.extend(['--disable-web-security', '--ignore-certificate-errors'])

    if cfg.CHROME_USER_AGENT:
        flags.append(f'--user-agent={cfg.CHROME_USER_AGENT}')

    if cfg.CHROME_TIMEOUT:
        # the config value is multiplied by 1000 before being passed to --timeout
        flags.append(f'--timeout={cfg.CHROME_TIMEOUT * 1000}')

    if cfg.CHROME_USER_DATA_DIR:
        flags.append(f'--user-data-dir={cfg.CHROME_USER_DATA_DIR}')
        flags.append(f'--profile-directory={cfg.CHROME_PROFILE_NAME}')

    return dedupe(flags)
CHROME_CONFIG = ChromeConfig()
@ -122,6 +229,18 @@ class ChromeBinary(BaseBinary):
# otherwise on linux we can symlink directly to binary executable
symlink.symlink_to(binary.abspath)
@staticmethod
def chrome_cleanup_lockfile():
    """
    Cleans up any state or runtime files that chrome leaves behind when killed by
    a timeout or other error.

    Only acts when SHELL_CONFIG.IN_DOCKER is true; otherwise it is a no-op.
    Removes ~/.config/chromium/SingletonLock for the current user if present.
    """
    if not SHELL_CONFIG.IN_DOCKER:
        return

    # Path() does not expand "~" by itself: without expanduser() this would point at a
    # literal "~" directory relative to the working directory and never match the real lock.
    lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

    # exists() follows symlinks, so a lock that is a dangling symlink would be reported
    # as absent and left in place; unlink(missing_ok=True) removes the entry itself when
    # it is there and quietly does nothing when it is not.
    lock_file.unlink(missing_ok=True)
CHROME_BINARY = ChromeBinary()

View file

@ -24,40 +24,21 @@ from plugins_pkg.npm.apps import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
###################### Config ##########################
class SinglefileToggleConfigs(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_TOGGLES'
class SinglefileConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'ARCHIVING_CONFIG'
SAVE_SINGLEFILE: bool = True
class SinglefileOptionsConfigs(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'ARCHIVE_METHOD_OPTIONS'
SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
class SinglefileDependencyConfigs(BaseConfigSet):
section: ClassVar[ConfigSectionName] = 'DEPENDENCY_CONFIG'
SINGLEFILE_BINARY: str = Field(default='wget')
SINGLEFILE_ARGS: Optional[List[str]] = Field(default=None)
SINGLEFILE_EXTRA_ARGS: List[str] = []
SINGLEFILE_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
class SinglefileConfigs(SinglefileToggleConfigs, SinglefileOptionsConfigs, SinglefileDependencyConfigs):
# section: ClassVar[ConfigSectionName] = 'ALL_CONFIGS'
pass
DEFAULT_GLOBAL_CONFIG = {
'CHECK_SSL_VALIDITY': False,
'SAVE_SINGLEFILE': True,
'TIMEOUT': 120,
}
SINGLEFILE_CONFIG = SinglefileConfigs(**DEFAULT_GLOBAL_CONFIG)
SINGLEFILE_CONFIG = SinglefileConfig()
SINGLEFILE_MIN_VERSION = '1.1.54'

View file

@ -1,6 +1,8 @@
import sys
from typing import List, Dict, ClassVar
from subprocess import run, PIPE
from pydantic import InstanceOf, Field
from pydantic import InstanceOf, Field, model_validator, AliasChoices
from django.conf import settings
@ -10,20 +12,37 @@ from plugantic.base_configset import BaseConfigSet, ConfigSectionName
from plugantic.base_binary import BaseBinary, env, apt, brew
from plugantic.base_hook import BaseHook
from plugins_sys.config.apps import ARCHIVING_CONFIG
from plugins_pkg.pip.apps import pip
###################### Config ##########################
class YtdlpDependencyConfigs(BaseConfigSet):
class YtdlpConfig(BaseConfigSet):
section: ClassVar[ConfigSectionName] = "DEPENDENCY_CONFIG"
USE_YTDLP: bool = True
USE_YTDLP: bool = Field(default=True, validation_alias=AliasChoices('USE_YOUTUBEDL', 'SAVE_MEDIA'))
YTDLP_BINARY: str = Field(default='yt-dlp')
YTDLP_BINARY: str = Field(default='yt-dlp', alias='YOUTUBEDL_BINARY')
YTDLP_EXTRA_ARGS: List[str] = Field(default=[], alias='YOUTUBEDL_EXTRA_ARGS')
YTDLP_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
YTDLP_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.MEDIA_TIMEOUT)
@model_validator(mode='after')
def validate_use_ytdlp(self):
    """Print a warning to stderr when yt-dlp is enabled with a timeout under 20 seconds.

    Registered as a pydantic ``after`` model validator; it never modifies the
    config and always returns ``self``.
    """
    timeout_too_low = self.USE_YTDLP and self.YTDLP_TIMEOUT < 20
    if not timeout_too_low:
        return self

    # one tuple of print() arguments per output line; an empty tuple prints a blank line
    warning_lines = (
        (f'[red][!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={self.YTDLP_TIMEOUT} seconds)[/red]',),
        (' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.',),
        (' (Setting it somewhere over 60 seconds is recommended)',),
        (),
        (' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:',),
        (' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media',),
        (),
    )
    for parts in warning_lines:
        print(*parts, file=sys.stderr)

    return self
DEFAULT_GLOBAL_CONFIG = {}
YTDLP_CONFIG = YtdlpDependencyConfigs(**DEFAULT_GLOBAL_CONFIG)
YTDLP_CONFIG = YtdlpConfig()
@ -31,6 +50,9 @@ class YtdlpBinary(BaseBinary):
name: BinName = YTDLP_CONFIG.YTDLP_BINARY
binproviders_supported: List[InstanceOf[BinProvider]] = [pip, apt, brew, env]
YTDLP_BINARY = YtdlpBinary()
class FfmpegBinary(BaseBinary):
name: BinName = 'ffmpeg'
binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
@ -53,10 +75,9 @@ class FfmpegBinary(BaseBinary):
# def get_ffmpeg_version(self) -> Optional[str]:
# return self.exec(cmd=['-version']).stdout
YTDLP_BINARY = YtdlpBinary()
FFMPEG_BINARY = FfmpegBinary()
# class YtdlpExtractor(BaseExtractor):
# name: str = 'ytdlp'
# binary: str = 'ytdlp'

View file

@ -18,8 +18,6 @@ from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode # type: ignore
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
from os.path import lexists
from os import remove as remove_file
try:
import chardet
@ -282,82 +280,6 @@ def get_headers(url: str, timeout: int=None) -> str:
)
@enforce_types
def chrome_args(**options) -> List[str]:
    """Build a Chrome shell command (binary path followed by its flags).

    Keyword arguments are layered on top of CHROME_OPTIONS from the config
    module. Raises Exception when the merged options contain no CHROME_BINARY.
    The assembled list is passed through ``dedupe`` before being returned.
    """
    # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/

    from .config import (
        CHROME_OPTIONS,
        CHROME_VERSION,
        CHROME_EXTRA_ARGS,
    )

    merged = {**CHROME_OPTIONS, **options}

    chrome_binary = merged['CHROME_BINARY']
    if not chrome_binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    # the binary goes first, followed by the globally configured extra args
    command = [chrome_binary]
    command.extend(CHROME_EXTRA_ARGS)

    if merged['CHROME_HEADLESS']:
        command.append("--headless=new")  # expects chrome version >= 111

    if not merged['CHROME_SANDBOX']:
        # assume this means we are running inside a docker container:
        # in docker, GPU support is limited, sandboxing is unnecessary,
        # and SHM is limited to 64MB by default (which is too low to be usable).
        command.extend([
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
            # "--password-store=basic",
        ])

    # disable automatic updating when running headless, as there's no user to see the upgrade prompts
    command.append("--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'")

    # set window size for screenshot/pdf/etc. rendering
    command.append(f"--window-size={merged['RESOLUTION']}")

    if not merged['CHECK_SSL_VALIDITY']:
        command.extend(['--disable-web-security', '--ignore-certificate-errors'])

    user_agent = merged['CHROME_USER_AGENT']
    if user_agent:
        command.append(f'--user-agent={user_agent}')

    timeout = merged['CHROME_TIMEOUT']
    if timeout:
        # the config value is multiplied by 1000 before being passed to --timeout
        command.append(f'--timeout={timeout * 1000}')

    user_data_dir = merged['CHROME_USER_DATA_DIR']
    if user_data_dir:
        command.append(f'--user-data-dir={user_data_dir}')
        command.append('--profile-directory=Default')

    return dedupe(command)
def chrome_cleanup():
    """
    Remove the runtime lock file that chrome leaves behind when it is killed
    by a timeout or other error. Does nothing unless IN_DOCKER is set.
    """
    from .config import IN_DOCKER

    singleton_lock = "/home/archivebox/.config/chromium/SingletonLock"

    if not IN_DOCKER:
        return

    # lexists() is true even when the path is a symlink whose target is missing
    if lexists(singleton_lock):
        remove_file(singleton_lock)
@enforce_types
def ansi_to_html(text: str) -> str:
"""