2024-09-25 02:04:38 +00:00
|
|
|
__package__ = 'archivebox.plugins_extractor.chrome'
|
|
|
|
|
2024-09-25 07:42:26 +00:00
|
|
|
import sys
|
2024-09-21 11:00:54 +00:00
|
|
|
import platform
|
|
|
|
from pathlib import Path
|
2024-10-01 00:25:15 +00:00
|
|
|
from typing import List, Optional, Dict
|
2024-09-21 11:00:54 +00:00
|
|
|
|
|
|
|
# Depends on other PyPI/vendor packages:
|
2024-09-25 07:42:26 +00:00
|
|
|
from rich import print
|
|
|
|
from pydantic import InstanceOf, Field, model_validator
|
2024-09-21 11:00:54 +00:00
|
|
|
from pydantic_pkgr import (
|
|
|
|
BinProvider,
|
|
|
|
BinName,
|
|
|
|
BinProviderName,
|
|
|
|
ProviderLookupDict,
|
|
|
|
bin_abspath,
|
|
|
|
)
|
|
|
|
|
|
|
|
# Depends on other Django apps:
|
2024-09-27 07:41:21 +00:00
|
|
|
from abx.archivebox.base_plugin import BasePlugin
|
2024-09-30 23:50:36 +00:00
|
|
|
from abx.archivebox.base_configset import BaseConfigSet
|
2024-09-27 07:41:21 +00:00
|
|
|
from abx.archivebox.base_binary import BaseBinary, env
|
|
|
|
# from abx.archivebox.base_extractor import BaseExtractor
|
|
|
|
# from abx.archivebox.base_queue import BaseQueue
|
|
|
|
from abx.archivebox.base_hook import BaseHook
|
2024-09-21 11:00:54 +00:00
|
|
|
|
|
|
|
# Depends on Other Plugins:
|
2024-09-30 22:59:05 +00:00
|
|
|
from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
|
2024-09-24 08:34:27 +00:00
|
|
|
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
|
|
|
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
2024-09-21 11:00:54 +00:00
|
|
|
|
2024-10-01 00:25:15 +00:00
|
|
|
from archivebox.misc.util import dedupe
|
2024-09-25 07:42:26 +00:00
|
|
|
|
2024-09-21 11:00:54 +00:00
|
|
|
|
2024-09-22 20:17:10 +00:00
|
|
|
# Candidate executable names (Linux) and absolute .app paths (macOS) for Chromium.
CHROMIUM_BINARY_NAMES_LINUX = [
    'chromium',
    'chromium-browser',
    'chromium-browser-beta',
    'chromium-browser-unstable',
    'chromium-browser-canary',
    'chromium-browser-dev',
]
CHROMIUM_BINARY_NAMES_MACOS = [
    '/Applications/Chromium.app/Contents/MacOS/Chromium',
]
CHROMIUM_BINARY_NAMES = [*CHROMIUM_BINARY_NAMES_LINUX, *CHROMIUM_BINARY_NAMES_MACOS]

# Candidate executable names (Linux) and absolute .app paths (macOS) for Google Chrome.
CHROME_BINARY_NAMES_LINUX = [
    'google-chrome',
    'google-chrome-stable',
    'google-chrome-beta',
    'google-chrome-canary',
    'google-chrome-unstable',
    'google-chrome-dev',
    'chrome',
]
CHROME_BINARY_NAMES_MACOS = [
    '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
    '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
]
CHROME_BINARY_NAMES = [*CHROME_BINARY_NAMES_LINUX, *CHROME_BINARY_NAMES_MACOS]
|
2024-09-21 11:00:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """
    Return the first Chrome/Chromium binary that bin_abspath() can resolve.

    Candidates are tried in order: every entry of CHROME_BINARY_NAMES first,
    then every entry of CHROMIUM_BINARY_NAMES.

    Args:
        PATH: search path forwarded to bin_abspath(). When None, env.PATH is used.

    Returns:
        The value bin_abspath() returned for the first candidate that resolved,
        or None when no candidate resolves.
    """
    # Honor a caller-supplied PATH; only fall back to env.PATH when none was given.
    search_path = env.PATH if PATH is None else PATH

    for bin_name in (*CHROME_BINARY_NAMES, *CHROMIUM_BINARY_NAMES):
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
|
|
|
|
|
2024-09-22 20:17:10 +00:00
|
|
|
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    on macOS, some binaries are inside of .app, so we need to
    create a tiny bash script instead of a symlink
    (so that ../ parent relationships are relative to original .app instead of callsite dir)

    Args:
        target: path of the real executable the launcher script should exec.
        shortcut: path where the launcher script is written (any existing file is replaced).
    """
    import shlex  # local import: keeps this block self-contained without touching file-level imports

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)

    # shell-quote the target so paths containing spaces or quote characters
    # still yield a valid script (hard-coded '...' breaks on an embedded single quote)
    launcher_script = f'#!/usr/bin/env bash\nexec {shlex.quote(str(target))} "$@"\n'

    shortcut.unlink(missing_ok=True)
    shortcut.write_text(launcher_script)
    shortcut.chmod(0o777)  # make sure its executable by everyone
|
|
|
|
|
2024-09-21 11:00:54 +00:00
|
|
|
###################### Config ##########################
|
|
|
|
|
|
|
|
|
2024-09-25 02:04:38 +00:00
|
|
|
class ChromeConfig(BaseConfigSet):
    """
    Config options for the Chrome plugin: which binary to use, its CLI flags,
    rendering/timeout tuning, cookies & auth, and per-extractor toggles.
    """
    # NOTE(review): several defaults below are lambdas reading ARCHIVING_CONFIG / SHELL_CONFIG;
    # presumably BaseConfigSet resolves callable defaults at load time -- confirm against base_configset.

    USE_CHROME: bool = Field(default=True)

    # Chrome Binary
    CHROME_BINARY: str = Field(default='chrome')
    CHROME_DEFAULT_ARGS: List[str] = Field(default=[
        '--virtual-time-budget=15000',
        '--disable-features=DarkMode',
        "--run-all-compositor-stages-before-draw",
        "--hide-scrollbars",
        "--autoplay-policy=no-user-gesture-required",
        "--no-first-run",
        "--use-fake-ui-for-media-stream",
        "--use-fake-device-for-media-stream",
        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
    ])
    CHROME_EXTRA_ARGS: List[str] = Field(default=[])

    # Chrome Options Tuning
    CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
    CHROME_HEADLESS: bool = Field(default=True)
    CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
    CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
    CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)

    # Cookies & Auth
    CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CHROME_USER_DATA_DIR: Path | None = Field(default=None)
    CHROME_PROFILE_NAME: str = Field(default='Default')

    # Extractor Toggles
    SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
    SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
    SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')

    @model_validator(mode='after')
    def validate_use_chrome(self):
        """
        Warn on stderr about a too-low CHROME_TIMEOUT, and reset CHROME_USER_DATA_DIR
        to None unless it exists and contains a CHROME_PROFILE_NAME entry.

        Never raises for bad values; always returns self.
        """
        def warn(*objects):
            # every message in this validator goes to stderr
            print(*objects, file=sys.stderr)

        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
            warn(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
            warn(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
            warn(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
            warn()
            warn(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
            warn(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
            warn()

        # if user has specified a user data dir, make sure its valid
        data_dir = self.CHROME_USER_DATA_DIR
        if not (data_dir and data_dir.exists()):
            # unset, or pointing at a path that does not exist
            self.CHROME_USER_DATA_DIR = None
            return self

        # check to make sure user_data_dir/<profile_name> exists
        if (data_dir / self.CHROME_PROFILE_NAME).exists():
            return self

        warn(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
        warn(f' {data_dir}')
        warn(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
        warn(' For more info see:')
        warn(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
        if '/Default' in str(data_dir):
            suggested_dir = str(data_dir).split('/Default')[0]
            warn()
            warn(' Try removing /Default from the end e.g.:')
            warn(f' CHROME_USER_DATA_DIR="{suggested_dir}"')

        # hard error is too annoying here, instead just set it to nothing
        # raise SystemExit(2)
        self.CHROME_USER_DATA_DIR = None
        return self

    def chrome_args(self, **options) -> List[str]:
        """
        helper to build up a chrome shell command with arguments

        Keyword options override fields of this config on a copy (self is not
        modified). The assembled flag list is passed through dedupe() before
        being returned.
        """
        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
        cfg = self.model_copy(update=options)

        args: List[str] = list(cfg.CHROME_DEFAULT_ARGS)
        args.extend(cfg.CHROME_EXTRA_ARGS)

        if cfg.CHROME_HEADLESS:
            args.append("--headless=new")  # expects chrome version >= 111

        if not cfg.CHROME_SANDBOX:
            # assume this means we are running inside a docker container
            # in docker, GPU support is limited, sandboxing is unnecessary,
            # and SHM is limited to 64MB by default (which is too low to be usable).
            args.extend([
                "--no-sandbox",
                "--no-zygote",
                "--disable-dev-shm-usage",
                "--disable-software-rasterizer",
                "--disable-sync",
                # "--password-store=basic",
            ])

        # set window size for screenshot/pdf/etc. rendering
        args.append(f'--window-size={cfg.CHROME_RESOLUTION}')

        if not cfg.CHROME_CHECK_SSL_VALIDITY:
            args.extend(['--disable-web-security', '--ignore-certificate-errors'])

        if cfg.CHROME_USER_AGENT:
            args.append(f'--user-agent={cfg.CHROME_USER_AGENT}')

        if cfg.CHROME_TIMEOUT:
            # CHROME_TIMEOUT is multiplied by 1000 for the --timeout flag
            args.append(f'--timeout={cfg.CHROME_TIMEOUT * 1000}')

        if cfg.CHROME_USER_DATA_DIR:
            args.append(f'--user-data-dir={cfg.CHROME_USER_DATA_DIR}')
            args.append(f'--profile-directory={cfg.CHROME_PROFILE_NAME}')

        return dedupe(args)
|
2024-09-21 11:00:54 +00:00
|
|
|
|
2024-09-25 02:04:38 +00:00
|
|
|
# Module-level ChromeConfig instance; read by ChromeBinary.name and listed in ChromePlugin.hooks below.
CHROME_CONFIG = ChromeConfig()
|
2024-09-21 11:00:54 +00:00
|
|
|
|
|
|
|
|
|
|
|
class ChromeBinary(BaseBinary):
    """
    Binary definition for the Chrome/Chromium browser.

    Declares the supported bin providers (puppeteer, env, playwright), the
    per-provider overrides used to locate or install the browser, and helpers
    for linking the binary into the lib bin dir and clearing a stale lock file.
    """
    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER]

    provider_overrides: Dict[BinProviderName, ProviderLookupDict] = {
        env.name: {
            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
        },
        PUPPETEER_BINPROVIDER.name: {
            'packages': lambda: ['chrome@stable'],  # npx @puppeteer/browsers install chrome@stable
        },
        PLAYWRIGHT_BINPROVIDER.name: {
            'packages': lambda: ['chromium'],  # playwright install chromium
        },
    }

    @staticmethod
    def symlink_to_lib(binary, bin_dir=CONSTANTS.LIB_BIN_DIR) -> None:
        """
        Expose `binary` under `bin_dir / binary.name`.

        Does nothing when binary.abspath is unset or does not exist. On macOS a
        small launcher script is written instead of a symlink; elsewhere any
        existing entry is replaced with a symlink to binary.abspath.
        """
        if not (binary.abspath and binary.abspath.exists()):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)
        symlink = bin_dir / binary.name

        if platform.system().lower() == 'darwin':
            # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
            create_macos_app_symlink(binary.abspath, symlink)
        else:
            # otherwise on linux we can symlink directly to binary executable
            symlink.unlink(missing_ok=True)
            symlink.symlink_to(binary.abspath)

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error

        Only acts when SHELL_CONFIG.IN_DOCKER is truthy.
        """
        if not SHELL_CONFIG.IN_DOCKER:
            return

        # Path() does not expand "~" by itself; without expanduser() this would point
        # at a literal "~" directory relative to the current working directory.
        lock_file = Path("~/.config/chromium/SingletonLock").expanduser()

        # unlink(missing_ok=True) instead of an exists() pre-check: exists() follows
        # symlinks, so a lock that is a dangling symlink would be reported as missing
        # and left behind. NOTE(review): Chrome's SingletonLock appears to be such a
        # symlink on Linux -- confirm.
        lock_file.unlink(missing_ok=True)
|
|
|
|
|
|
|
|
|
2024-09-21 11:00:54 +00:00
|
|
|
|
|
|
|
# Module-level ChromeBinary instance; listed in ChromePlugin.hooks below.
CHROME_BINARY = ChromeBinary()
|
|
|
|
|
|
|
|
|
|
|
|
class ChromePlugin(BasePlugin):
    """Plugin definition that bundles the Chrome config set and Chrome binary as its hooks."""
    app_label: str = 'chrome'
    verbose_name: str = 'Chrome Browser'

    # hooks contributed by this plugin, in registration order
    hooks: List[InstanceOf[BaseHook]] = [CHROME_CONFIG, CHROME_BINARY]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Instantiate the plugin at import time and expose its AppConfig attribute as DJANGO_APP.
PLUGIN = ChromePlugin()
# PLUGIN.register(settings)
DJANGO_APP = PLUGIN.AppConfig
|