Merge branch 'dev' into version-banner

This commit is contained in:
Nick Sweeting 2023-12-19 09:57:08 -08:00 committed by GitHub
commit 14f10a0461
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
20 changed files with 2055 additions and 1063 deletions

View file

@ -16,6 +16,7 @@ venv/
.docker-venv/
node_modules/
docs/
build/
dist/
brew_dist/

View file

@ -11,8 +11,7 @@ on:
env:
DOCKER_IMAGE: archivebox-ci
jobs:
buildx:
runs-on: ubuntu-latest
@ -60,13 +59,11 @@ jobs:
uses: docker/metadata-action@v5
with:
images: archivebox/archivebox,nikisweeting/archivebox
flavor: |
latest=auto
tags: |
type=ref,event=branch
type=semver,pattern={{version}}
type=semver,pattern={{major}}.{{minor}}
type=sha
type=raw,value=latest,enable={{is_default_branch}}
- name: Build and push
id: docker_build
@ -78,8 +75,18 @@ jobs:
push: ${{ github.event_name != 'pull_request' }}
tags: ${{ steps.docker_meta.outputs.tags }}
cache-from: type=local,src=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache
cache-to: type=local,dest=/tmp/.buildx-cache-new
platforms: linux/amd64,linux/arm64,linux/arm/v7
- name: Image digest
run: echo ${{ steps.docker_build.outputs.digest }}
# This ugly bit is necessary if you don't want your cache to grow forever
# until it hits GitHub's limit of 5GB.
# Temp fix
# https://github.com/docker/build-push-action/issues/252
# https://github.com/moby/buildkit/issues/1896
- name: Move cache
run: |
rm -rf /tmp/.buildx-cache
mv /tmp/.buildx-cache-new /tmp/.buildx-cache

View file

@ -73,7 +73,8 @@ COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
RUN rm -f /etc/apt/apt.conf.d/docker-clean; echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache
RUN echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' > /etc/apt/apt.conf.d/keep-cache \
&& rm -f /etc/apt/apt.conf.d/docker-clean
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
RUN (echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
@ -123,7 +124,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
&& echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& curl -fsSL "https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key" | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
nodejs libatomic1 python3-minimal \
@ -202,7 +203,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
&& chown -R $ARCHIVEBOX_USER "$PLAYWRIGHT_BROWSERS_PATH" \
# Save version info
&& ( \
which chromium-browser && /usr/bin/chromium-browser --version \
which chromium-browser && /usr/bin/chromium-browser --version || /usr/lib/chromium/chromium --version \
&& echo -e '\n\n' \
) | tee -a /VERSION.txt
@ -246,15 +247,15 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$T
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
&& apt-get update -qq \
# && apt-get update -qq \
# install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
build-essential \
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# build-essential \
# INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
&& pip install -e "$CODE_DIR"[sonic,ldap] \
# save docker image size and always remove compilers / build tools after building is complete
&& apt-get purge -y build-essential \
&& apt-get autoremove -y \
# && apt-get purge -y build-essential \
# && apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
####################################################
@ -276,11 +277,10 @@ ENV IN_DOCKER=True
# Print version for nice docker finish summary
RUN (echo -e "\n\n[√] Finished Docker build succesfully. Saving build summary in: /VERSION.txt" \
&& echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
&& echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ}\n\n" \
&& "$CODE_DIR/bin/docker_entrypoint.sh" \
archivebox version 2>&1 \
&& echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})\n" \
&& echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s")\n\n" \
) | tee -a /VERSION.txt
RUN "$CODE_DIR"/bin/docker_entrypoint.sh version 2>&1 | tee -a /VERSION.txt
####################################################

View file

@ -54,6 +54,7 @@ from .config_stubs import (
### Pre-Fetch Minimal System Config
TIMEZONE = 'UTC'
SYSTEM_USER = getpass.getuser() or os.getlogin()
try:
@ -82,7 +83,6 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'IN_QEMU': {'type': bool, 'default': False},
'PUID': {'type': int, 'default': os.getuid()},
'PGID': {'type': int, 'default': os.getgid()},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
},
'GENERAL_CONFIG': {
@ -377,7 +377,7 @@ ALLOWED_IN_OUTPUT_DIR = {
'static_index.json',
}
def get_version(config):
def get_version(config) -> str:
try:
return importlib.metadata.version(__package__ or 'archivebox')
except importlib.metadata.PackageNotFoundError:
@ -392,58 +392,76 @@ def get_version(config):
raise Exception('Failed to detect installed archivebox version!')
def get_commit_hash(config):
def get_commit_hash(config) -> Optional[str]:
try:
git_dir = config['PACKAGE_DIR'] / '../'
ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
commit_hash = git_dir.joinpath(ref).read_text().strip()
return commit_hash
except Exception:
pass
try:
return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
return None
pass
return None
def get_version_releases(config):
def get_build_time(config) -> str:
if config['IN_DOCKER']:
docker_build_end_time = Path('/VERSION.txt').read_text().rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]
return docker_build_end_time
src_last_modified_unix_timestamp = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
return datetime.fromtimestamp(src_last_modified_unix_timestamp).strftime('%Y-%m-%d %H:%M:%S %s')
def get_versions_available_on_github(config):
"""
returns a dictionary containing the GitHub release data for
returns a dictionary containing the ArchiveBox GitHub release info for
the recommended upgrade version and the currently installed version
"""
# we only want to perform the (relatively expensive) check for new versions
# when its most relevant, e.g. when the user runs a long-running command
subcommand_run_by_user = sys.argv[3]
long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
if subcommand_run_by_user not in long_running_commands:
return None
github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
response = requests.get(github_releases_api)
if response.status_code != 200:
stderr('Failed to get release data from GitHub', color='lightyellow', config=config)
stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
return None
all_releases = response.json()
releases = response.json()
installed_version = config['VERSION']
installed_version_parts = parse_tag_name(installed_version)
installed_version = parse_version_string(config['VERSION'])
# find current version or nearest older version (to link to)
current_version = None
for release in releases:
release_parts = parse_tag_name(release["tag_name"])
if release_parts <= installed_version_parts :
for idx, release in enumerate(all_releases):
release_version = parse_version_string(release["tag_name"])
if release_version <= installed_version:
current_version = release
break
current_version = current_version if current_version else releases[-1]
current_version = current_version or releases[-1]
# recommended version is whatever comes after current_version in the release list
# (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
try:
recommended_version = all_releases[idx+1]
except IndexError:
recommended_version = None
# find upgrade version
upgrade_version = None
smallest_version_diff = parse_tag_name(releases[0]["tag_name"])[1]
for release in releases:
release_parts = parse_tag_name(release["tag_name"])
major_version_diff = release_parts[1] - installed_version_parts[1]
if major_version_diff < smallest_version_diff:
smallest_version_diff = major_version_diff
if smallest_version_diff < 1:
break
upgrade_version = release
upgrade_version = upgrade_version if upgrade_version else releases[0]
return {"upgrade_version": upgrade_version, "current_version": current_version}
return {"recommended_version": recommended_version, "current_version": current_version}
def can_upgrade(config):
if config['VERSION_RELEASES']:
upgrade_version = parse_tag_name(config['VERSION_RELEASES']['upgrade_version']['tag_name'])
current_version = parse_tag_name(config['VERSION_RELEASES']['current_version']['tag_name'])
return upgrade_version > current_version
if config['VERSIONS_AVAILABLE'] and config['VERSIONS_AVAILABLE']['recommended_version']:
recommended_version = parse_version_string(config['VERSIONS_AVAILABLE']['recommended_version']['tag_name'])
current_version = parse_version_string(config['VERSIONS_AVAILABLE']['current_version']['tag_name'])
return recommended_version > current_version
return False
@ -473,11 +491,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
'VERSION': {'default': lambda c: get_version(c)},
'VERSION_RELEASES': {'default': lambda c: get_version_releases(c)},
'CAN_UPGRADE': {'default': lambda c: can_upgrade(c)},
'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]},
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},
'BUILD_TIME': {'default': lambda c: get_build_time(c)},
'VERSIONS_AVAILABLE': {'default': lambda c: get_versions_available_on_github(c)},
'CAN_UPGRADE': {'default': lambda c: can_upgrade(c)},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
@ -487,7 +508,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting but unused for now
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
@ -744,14 +765,11 @@ def load_config(defaults: ConfigDefaultDict,
return extended_config
# def write_config(config: ConfigDict):
# with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
def parse_tag_name(v):
"""parses a version tag string formatted like 'vx.x.x'"""
def parse_version_string(version: str) -> Tuple[int, int int]:
"""parses a version tag string formatted like 'vx.x.x' into (major, minor, patch) ints"""
base = v.split('+')[0].split('v')[-1] # remove 'v' prefix and '+editable' suffix
return tuple(int(part) for part in base.split('.'))
return tuple(int(part) for part in base.split('.'))[:3]
# Logging Helpers
@ -840,6 +858,7 @@ def find_chrome_binary() -> Optional[str]:
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_executable_paths = (
# '~/Library/Caches/ms-playwright/chromium-*/chrome-mac/Chromium.app/Contents/MacOS/Chromium',
'chromium-browser',
'chromium',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
@ -1166,14 +1185,25 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
def check_system_config(config: ConfigDict=CONFIG) -> None:
### Check system environment
if config['USER'] == 'root':
if config['USER'] == 'root' or str(config['PUID']) == "0":
stderr('[!] ArchiveBox should never be run as root!', color='red')
stderr(' For more information, see the security overview documentation:')
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
if config['IN_DOCKER']:
attempted_command = ' '.join(sys.argv[:3])
stderr('')
stderr(' {lightred}Hint{reset}: When using Docker, you must run commands with {green}docker run{reset} instead of {lightyellow}docker exec{reset}, e.g.:'.format(**config['ANSI']))
stderr(f' docker compose run archivebox {attempted_command}')
stderr(f' docker compose exec --user=archivebox archivebox {attempted_command}')
stderr(' or')
stderr(f' docker run -it -v ... -p ... archivebox/archivebox {attempted_command}')
stderr(f' docker exec -it --user=archivebox <container id> /bin/bash')
raise SystemExit(2)
### Check Python environment
if sys.version_info[:3] < (3, 6, 0):
if sys.version_info[:3] < (3, 7, 0):
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(2)
@ -1249,7 +1279,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
stderr(' youtube-dl/yt-dlp will fail to archive any media if set to less than ~20 seconds.')
stderr(' (Setting it somewhere over 60 seconds is recommended)')
stderr()
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
@ -1337,8 +1367,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
if check_db:
# Enable WAL mode in sqlite3

View file

@ -48,22 +48,25 @@ class TagInline(admin.TabularInline):
from django.contrib.admin.helpers import ActionForm
from django.contrib.admin.widgets import AutocompleteSelectMultiple
class AutocompleteTags:
model = Tag
search_fields = ['name']
# WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
# class AutocompleteTags:
# model = Tag
# search_fields = ['name']
# name = 'tags'
class AutocompleteTagsAdminStub:
name = 'admin'
# class AutocompleteTagsAdminStub:
# name = 'admin'
class SnapshotActionForm(ActionForm):
tags = forms.ModelMultipleChoiceField(
queryset=Tag.objects.all(),
required=False,
widget=AutocompleteSelectMultiple(
AutocompleteTags(),
AutocompleteTagsAdminStub(),
),
# WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
# widget=AutocompleteSelectMultiple(
# # AutocompleteTags(),
# # AutocompleteTagsAdminStub(),
# ),
)
# TODO: allow selecting actions for specific extractors? is this useful?

View file

@ -3,4 +3,4 @@ from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'
default_auto_field = 'django.db.models.UUIDField'
# default_auto_field = 'django.db.models.UUIDField'

View file

@ -268,6 +268,8 @@ AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
# WIP: commented out because broken by Django 3.1.2 -> 4.0 migration
# DEFAULT_AUTO_FIELD = 'django.db.models.UUIDField'
################################################################################
### Shell Settings

View file

@ -184,7 +184,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link.url,
command,
ts
) + "\n"))
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(' ', stats)

View file

@ -393,7 +393,11 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
else:
_LAST_RUN_STATS.succeeded += 1
size = get_dir_size(link_dir)
try:
size = get_dir_size(link_dir)
except FileNotFoundError:
size = (0, None, '0')
end_ts = datetime.now(timezone.utc)
duration = str(end_ts - start_ts).split('.')[0]
print(' {black}{} files ({}) in {}s {reset}'.format(size[2], printable_filesize(size[0]), duration, **ANSI))

View file

@ -93,6 +93,8 @@ from .config import (
SQL_INDEX_FILENAME,
ALLOWED_IN_OUTPUT_DIR,
SEARCH_BACKEND_ENGINE,
LDAP,
get_version,
check_dependencies,
check_data_folder,
write_config_file,
@ -100,6 +102,7 @@ from .config import (
VERSION_RELEASES,
CAN_UPGRADE,
COMMIT_HASH,
BUILD_TIME,
CODE_LOCATIONS,
EXTERNAL_LOCATIONS,
DATA_LOCATIONS,
@ -220,31 +223,39 @@ def version(quiet: bool=False,
if not quiet:
# 0.7.1
# ArchiveBox v0.7.1 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IN_QEMU=False IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
print(
'ArchiveBox v{}'.format(VERSION),
*((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
sys.implementation.name.title(),
p.system,
platform.platform(),
p.machine,
'ArchiveBox v{}'.format(get_version(CONFIG)),
*((f'COMMIT_HASH={COMMIT_HASH[:7]}',) if COMMIT_HASH else ()),
f'BUILD_TIME={BUILD_TIME}',
)
print(
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={IN_QEMU}',
f'ARCH={p.machine}',
f'OS={p.system}',
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}',
)
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
print(
f'DEBUG={DEBUG}',
f'IN_DOCKER={IN_DOCKER}',
f'IN_QEMU={IN_QEMU}',
f'IS_TTY={IS_TTY}',
f'TZ={TIMEZONE}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_USER={PUID}:{PGID}',
f'FS_PERMS={OUTPUT_PERMISSIONS}',
)
print(
f'DEBUG={DEBUG}',
f'IS_TTY={IS_TTY}',
f'TZ={TIMEZONE}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
print()
@ -273,7 +284,7 @@ def version(quiet: bool=False,
print(printable_folder_status(name, path))
else:
print()
print('{white}[i] Data locations:{reset}'.format(**ANSI))
print('{white}[i] Data locations:{reset} (not in a data directory)'.format(**ANSI))
print()
check_dependencies()
@ -1010,9 +1021,9 @@ def setup(out_dir: Path=OUTPUT_DIR) -> None:
stderr('\n Installing SINGLEFILE_BINARY, READABILITY_BINARY, MERCURY_BINARY automatically using npm...')
if not NODE_VERSION:
stderr('[X] You must first install node using your system package manager', color='red')
stderr('[X] You must first install node & npm using your system package manager', color='red')
hint([
'curl -sL https://deb.nodesource.com/setup_15.x | sudo -E bash -',
'https://github.com/nodesource/distributions#table-of-contents',
'or to disable all node-based modules run: archivebox config --set USE_NODE=False',
])
raise SystemExit(1)

1
archivebox/static Symbolic link
View file

@ -0,0 +1 @@
templates/static

View file

@ -23,6 +23,7 @@ SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
GIT_SHA=sha-"$(git rev-parse --short HEAD)"
SELECTED_PLATFORMS="${2:-$SUPPORTED_PLATFORMS}"
echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$SELECTED_PLATFORMS"
@ -50,6 +51,7 @@ function create_builder() {
docker buildx use xbuilder && return 0
echo "[+] Creating new xbuilder for: $SELECTED_PLATFORMS"
echo
docker pull 'moby/buildkit:buildx-stable-1'
# Switch to buildx builder if already present / previously created
docker buildx create --name xbuilder --driver docker-container --bootstrap --use --platform "$SELECTED_PLATFORMS" || true
@ -74,6 +76,7 @@ echo "[+] Generating requirements.txt and pdm.lock from pyproject.toml..."
pdm lock --group=':all' --strategy="cross_platform" --production
pdm export --group=':all' --production --without-hashes -o requirements.txt
echo "[+] Building archivebox:$VERSION docker image..."
# docker builder prune
# docker build . --no-cache -t archivebox-dev \
@ -83,12 +86,16 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --load . \
-t archivebox/archivebox:$TAG_NAME \
-t archivebox/archivebox:$VERSION \
-t archivebox/archivebox:$SHORT_VERSION \
-t archivebox/archivebox:$GIT_SHA \
-t archivebox/archivebox:latest \
-t nikisweeting/archivebox \
-t nikisweeting/archivebox:$TAG_NAME \
-t nikisweeting/archivebox:$VERSION \
-t nikisweeting/archivebox:$SHORT_VERSION \
-t nikisweeting/archivebox:$GIT_SHA \
-t nikisweeting/archivebox:latest \
-t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
-t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION
-t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA \
-t ghcr.io/archivebox/archivebox/archivebox:latest

View file

@ -1,20 +1,55 @@
#!/bin/bash
# This Docker ENTRYPOINT script is called by `docker run archivebox ...` or `docker compose run archivebox ...`.
# It takes a CMD as $* shell arguments and runs it following these setup steps:
# - Set the archivebox user to use the correct PUID & PGID
# 1. highest precedence is for valid PUID and PGID env vars passsed in explicitly
# 2. fall back to DETECTED_PUID of files found within existing data dir
# 3. fall back to DEFAULT_PUID if no data dir or its owned by root
# - Create a new /data dir if necessary and set the correct ownership on it
# - Create a new /browsers dir if necessary and set the correct ownership on it
# - Check whether we're running inside QEMU emulation and show a warning if so.
# - Drop down to archivebox user permisisons and execute passed CMD command.
# Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
# set -o nounset
set -o errexit
set -o errtrace
set -o pipefail
# IFS=$'\n'
# Load global invariants (set by Dockerfile during image build time, not intended to be customized by users at runtime)
export DATA_DIR="${DATA_DIR:-/data}"
export ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
# default PUID and PGID if data dir is empty and no PUID+PGID is set
# Global default PUID and PGID if data dir is empty and no intended PUID+PGID is set manually by user
export DEFAULT_PUID=911
export DEFAULT_PGID=911
# if data directory already exists, autodetect detect owner by looking at files within
export DETECTED_UID="$(stat -c '%u' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PUID")"
export DETECTED_GID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PGID")"
# If user tires to set PUID and PGID to root values manually, catch and reject because root is not allowed
if [[ "$PUID" == "0" ]] || [[ "$PGID" == "0" ]]; then
echo -e "\n[X] Error: Got PUID=$PUID and PGID=$PGID but ArchiveBox is not allowed to be run as root, please change or unset PUID & PGID and try again." > /dev/stderr
echo -e " Hint: some NFS/SMB/FUSE/etc. filesystems force-remap all permissions, leave PUID/PGID blank" > /dev/stderr
echo -e " or set PUID/PGID to the same value as the user/group they remap to (e.g. $DEFAULT_PUID:$DEFAULT_PGID)." > /dev/stderr
echo -e " https://linux.die.net/man/8/mount.cifs#:~:text=does%20not%20provide%20unix%20ownership" > /dev/stderr
exit 3
fi
# Set the archivebox user to use the configured UID & GID
# prefers PUID and PGID env vars passsed in explicitly, falls back to autodetected defaults
usermod -o -u "${PUID:-$DETECTED_UID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
groupmod -o -g "${PGID:-$DETECTED_GID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
# If data directory already exists, autodetect detect owner by looking at files within
export DETECTED_PUID="$(stat -c '%u' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PUID")"
export DETECTED_PGID="$(stat -c '%g' "$DATA_DIR/logs/errors.log" 2>/dev/null || echo "$DEFAULT_PGID")"
# If data directory exists but is owned by root, use defaults instead of root because root is not allowed
[[ "$DETECTED_PUID" == "0" ]] && export DETECTED_PUID="$DEFAULT_PUID"
[[ "$DETECTED_PGID" == "0" ]] && export DETECTED_PGID="$DEFAULT_PGID"
# Set archivebox user and group ids to desired PUID/PGID
usermod -o -u "${PUID:-$DETECTED_PUID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
groupmod -o -g "${PGID:-$DETECTED_PGID}" "$ARCHIVEBOX_USER" > /dev/null 2>&1
# re-set PUID and PGID to values reported by system instead of values we tried to set,
# in case wonky filesystems or Docker setups try to play UID/GID remapping tricks on us
@ -29,12 +64,12 @@ if [[ -d "$DATA_DIR/archive" ]]; then
# echo "[√] Permissions are correct"
else
# the only time this fails is if the host filesystem doesn't allow us to write as root (e.g. some NFS mapall/maproot problems, connection issues, drive dissapeared, etc.)
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir." >&2
echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:"
echo -e " \$ chown -R $PUID:$PGID ./data\n" >&2
echo -e " Configure the PUID & PGID environment variables to change the desired owner:" >&2
echo -e " https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" >&2
exit 1
echo -e "\n[X] Error: archivebox user (PUID=$PUID) is not able to write to your ./data dir (currently owned by $(stat -c '%u' "$DATA_DIR"):$(stat -c '%g' "$DATA_DIR")." >&2
echo -e " Change ./data to be owned by PUID=$PUID PGID=$PGID on the host and retry:" > /dev/stderr
echo -e " \$ chown -R $PUID:$PGID ./data\n" > /dev/stderr
echo -e " Configure the PUID & PGID environment variables to change the desired owner:" > /dev/stderr
echo -e " https://docs.linuxserver.io/general/understanding-puid-and-pgid\n" > /dev/stderr
exit 3
fi
else
# create data directory
@ -46,29 +81,35 @@ fi
chown $PUID:$PGID "$DATA_DIR"
chown $PUID:$PGID "$DATA_DIR"/*
# also chown BROWSERS_DIR because otherwise 'archivebox setup' wont be able to install chrome
# also chown BROWSERS_DIR because otherwise 'archivebox setup' wont be able to install chrome at runtime
PLAYWRIGHT_BROWSERS_PATH="${PLAYWRIGHT_BROWSERS_PATH:-/browsers}"
mkdir -p "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"
chown $PUID:$PGID "${PLAYWRIGHT_BROWSERS_PATH}/*"
chown $PUID:$PGID "$PLAYWRIGHT_BROWSERS_PATH"/*
rm -Rf "$PLAYWRIGHT_BROWSERS_PATH/permissions_test_safe_to_delete"
# (this check is written in blood, QEMU silently breaks things in ways that are not obvious)
export IN_QEMU="$(pmap 1 | grep qemu | wc -l | grep -E '^0$' >/dev/null && echo 'False' || echo 'True')"
if [[ "$IN_QEMU" == 'True' ]]; then
echo -e "\n[!] Warning: Running $(uname -m) emulated container in QEMU, some things will break!" >&2
echo -e " chromium (screenshot, pdf, dom), singlefile, and any dependencies that rely on inotify will not run in QEMU." >&2
echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" >&2
export IN_QEMU="$(pmap 1 | grep qemu >/dev/null && echo 'True' || echo 'False')"
if [[ "$IN_QEMU" == "True" ]]; then
echo -e "\n[!] Warning: Running $(uname -m) docker image using QEMU emulation, some things will break!" > /dev/stderr
echo -e " chromium (screenshot, pdf, dom), singlefile, and any dependencies that rely on inotify will not run in QEMU." > /dev/stderr
echo -e " See here for more info: https://github.com/microsoft/playwright/issues/17395#issuecomment-1250830493\n" > /dev/stderr
fi
# Drop permissions to run commands as the archivebox user
if [[ "$1" == /* || "$1" == "bash" || "$1" == "sh" || "$1" == "echo" || "$1" == "cat" || "$1" == "archivebox" ]]; then
# handle "docker run archivebox /some/non-archivebox/command --with=some args" by passing args directly to bash -c
# e.g. "docker run archivebox /venv/bin/archivebox-alt init"
# e.g. "docker run archivebox archivebox init:
# "docker run archivebox /venv/bin/archivebox-alt init"
# "docker run archivebox /bin/bash -c '...'"
# "docker run archivebox echo test"
# "docker run archivebox cat /VERSION.txt"
exec gosu "$PUID" bash -c "$*"
else
# handle "docker run archivebox add some subcommand --with=args abc" by calling archivebox to run as args as CLI subcommand
# e.g. "docker run archivebox add --depth=1 https://example.com"
# e.g. "docker run archivebox help"
# "docker run archivebox add --depth=1 https://example.com"
# "docker run archivebox manage createsupseruser"
# "docker run archivebox server 0.0.0.0:8000"
exec gosu "$PUID" bash -c "archivebox $*"

View file

@ -18,6 +18,7 @@ SUPPORTED_PLATFORMS="linux/amd64,linux/arm64,linux/arm/v7"
TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
GIT_SHA=sha-"$(git rev-parse --short HEAD)"
SELECTED_PLATFORMS="${2:-$SUPPORTED_PLATFORMS}"
@ -34,12 +35,16 @@ docker buildx build --platform "$SELECTED_PLATFORMS" --push . \
-t archivebox/archivebox:$TAG_NAME \
-t archivebox/archivebox:$VERSION \
-t archivebox/archivebox:$SHORT_VERSION \
-t archivebox/archivebox:$GIT_SHA \
-t archivebox/archivebox:latest \
-t nikisweeting/archivebox \
-t nikisweeting/archivebox:$TAG_NAME \
-t nikisweeting/archivebox:$VERSION \
-t nikisweeting/archivebox:$SHORT_VERSION \
-t nikisweeting/archivebox:$GIT_SHA \
-t nikisweeting/archivebox:latest \
-t ghcr.io/archivebox/archivebox/archivebox:$TAG_NAME \
-t ghcr.io/archivebox/archivebox/archivebox:$VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION
-t ghcr.io/archivebox/archivebox/archivebox:$SHORT_VERSION \
-t ghcr.io/archivebox/archivebox/archivebox:$GIT_SHA

View file

@ -21,7 +21,6 @@ services:
# - ./etc/crontabs:/var/spool/cron/crontabs # uncomment this and archivebox_scheduler below to set up automatic recurring archive jobs
# - ./archivebox:/app/archivebox # uncomment this to mount the ArchiveBox source code at runtime (for developers working on archivebox)
# build: . # uncomment this to build the image from source code at buildtime (for developers working on archivebox)
environment:
- ALLOWED_HOSTS=* # restrict this to only accept incoming traffic via specific domain name
# - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list
@ -161,4 +160,4 @@ networks:
ipam:
driver: default
config:
- subnet: 172.20.0.0/24
- subnet: 172.20.0.0/24

8
etc/crontabs/archivebox Normal file
View file

@ -0,0 +1,8 @@
# DO NOT EDIT THIS FILE - edit the master and reinstall.
# (/tmp/tmpe3dawo9u installed on Tue Jun 13 23:21:48 2023)
# (Cron version -- $Id: crontab.c,v 2.13 1994/01/17 03:20:37 vixie Exp $)
@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/3" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com/2" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
@daily cd /data && /usr/local/bin/archivebox add --depth=0 "https://example.com" >> /data/logs/schedule.log 2>&1 # archivebox_schedule
@daily cd /data && /usr/local/bin/archivebox add --depth=0 "update" >> /data/logs/schedule.log 2>&1 # archivebox_schedule

1740
package-lock.json generated

File diff suppressed because it is too large Load diff

975
pdm.lock

File diff suppressed because it is too large Load diff

View file

@ -39,6 +39,9 @@ classifiers = [
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",

View file

@ -1,35 +1,41 @@
# This file is @generated by PDM.
# Please do not edit it manually.
appnope==0.1.3
asgiref==3.7.2
asttokens==2.4.1
brotli==1.1.0
certifi==2023.7.22
brotli==1.1.0; implementation_name == "cpython"
brotlicffi==1.1.0.0; implementation_name != "cpython"
certifi==2023.11.17
cffi==1.16.0; implementation_name != "cpython"
charset-normalizer==3.3.2
colorama==0.4.6; sys_platform == "win32"
croniter==2.0.1
dateparser==1.1.8
dateparser==1.2.0
decorator==5.1.1
django==3.1.14
django-auth-ldap==4.1.0
django-extensions==3.1.5
exceptiongroup==1.2.0; python_version < "3.11"
executing==2.0.1
idna==3.4
ipython==8.17.2
idna==3.6
ipython==8.18.1
jedi==0.19.1
matplotlib-inline==0.1.6
mutagen==1.47.0
mypy-extensions==1.0.0
parso==0.8.3
pexpect==4.8.0
prompt-toolkit==3.0.40
ptyprocess==0.7.0
pexpect==4.9.0; sys_platform != "win32"
prompt-toolkit==3.0.43
ptyprocess==0.7.0; sys_platform != "win32"
pure-eval==0.2.2
pyasn1==0.5.0
pyasn1==0.5.1
pyasn1-modules==0.3.0
pycparser==2.21; implementation_name != "cpython"
pycryptodomex==3.19.0
pygments==2.16.1
pygments==2.17.2
python-crontab==3.0.0
python-dateutil==2.8.2
python-ldap==3.4.4
pytz==2023.3.post1
regex==2023.10.3
requests==2.31.0
@ -37,10 +43,12 @@ six==1.16.0
sonic-client==1.0.0
sqlparse==0.4.4
stack-data==0.6.3
traitlets==5.13.0
traitlets==5.14.0
typing-extensions==4.9.0; python_version < "3.11"
tzdata==2023.3; platform_system == "Windows"
tzlocal==5.2
urllib3==2.1.0
w3lib==2.1.2
wcwidth==0.2.10
wcwidth==0.2.12
websockets==12.0
yt-dlp==2023.11.14
yt-dlp==2023.11.16