2020-08-13 22:23:27 -04:00
# This is the Dockerfile for ArchiveBox. It bundles the following dependencies:
2022-09-12 20:34:02 +00:00
# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
2019-02-28 14:04:37 -05:00
# Usage:
2022-09-11 22:11:13 +02:00
# git submodule update --init --recursive
# git pull --recurse-submodules
2020-08-13 22:23:27 -04:00
# docker build . -t archivebox --no-cache
2020-07-22 01:30:58 -04:00
# docker run -v "$PWD/data":/data archivebox init
# docker run -v "$PWD/data":/data archivebox add 'https://example.com'
2020-08-13 22:23:27 -04:00
# docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
2022-04-21 07:29:27 -07:00
# Multi-arch build:
# docker buildx create --use
2022-04-21 07:35:34 -07:00
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
2022-09-11 22:13:22 +02:00
2023-10-20 05:10:03 -07:00
# Read more about [developing ArchiveBox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
2022-04-21 07:29:27 -07:00
2019-02-28 14:04:37 -05:00
2023-10-31 03:19:35 -07:00
# Use Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
2023-10-31 03:58:24 -07:00
# Base image: Python 3.11 on the slim Debian bookworm variant.
FROM python:3.11-slim-bookworm
2019-07-09 13:05:51 -04:00
2020-06-25 17:46:11 -04:00
# Image metadata. The final label must not end with a line continuation,
# otherwise the next line of the file is swallowed into this instruction.
LABEL name="archivebox" \
    maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
    description="All-in-one personal internet archiving container" \
    homepage="https://github.com/ArchiveBox/ArchiveBox"
2018-10-13 22:47:30 -04:00
2023-10-30 23:25:51 -07:00
2023-10-31 19:43:01 -07:00
2023-10-30 23:25:51 -07:00
######### Environment Variables #################################
2023-10-20 02:47:34 -07:00
# Global system-level config
2020-06-25 21:30:29 -04:00
2020-06-25 17:46:11 -04:00
2020-06-25 21:30:29 -04:00
2020-08-13 22:23:27 -04:00
DEBIAN_FRONTEND=noninteractive \
2023-10-20 02:47:34 -07:00
2023-10-30 23:25:51 -07:00
2023-10-31 05:31:19 -07:00
2023-10-20 02:47:34 -07:00
2018-10-13 22:47:30 -04:00
2023-10-30 23:25:51 -07:00
# Version config
# User config
ENV ARCHIVEBOX_USER="archivebox"

# Global paths
ENV DATA_DIR=/data

# Application-level paths
ENV APP_VENV=/app/.venv
# NOTE(review): later instructions also read CODE_DIR, NODE_MODULES, NODE_VERSION,
# PYTHON_VERSION, PLAYWRIGHT_BROWSERS_PATH, DEFAULT_PUID and DEFAULT_PGID, none of
# which are defined in the visible part of this file -- confirm they are declared.
# Each ENV above is terminated (no trailing "\") so it cannot absorb the next line.
# Build shell config
2023-10-20 02:47:34 -07:00
2023-10-31 03:06:02 -07:00
# Run every following RUN step under bash with strict options:
#   pipefail - a pipeline fails if any command in it fails
#   errexit  - stop at the first command that exits non-zero
#   errtrace - ERR traps are inherited by functions and subshells
#   nounset  - expanding an unset variable is an error
SHELL ["/bin/bash", "-o", "pipefail", "-o", "errexit", "-o", "errtrace", "-o", "nounset", "-c"]
2023-10-30 23:25:51 -07:00
######### System Environment ####################################
2023-10-20 02:47:34 -07:00
2023-10-30 23:25:51 -07:00
# Detect ArchiveBox version number by reading package.json
# Only package.json is copied at this point; the full source tree is copied later.
COPY --chown=root:root --chmod=755 package.json "$CODE_DIR/"
# Take the line containing `"version": `, split it on double quotes and keep the
# 4th field (the version value). This creates/overwrites /VERSION.txt; later
# steps append to it with `tee -a`.
RUN grep '"version": ' "${CODE_DIR}/package.json" | awk -F'"' '{print $4}' > /VERSION.txt
2023-10-31 19:43:01 -07:00
# Force apt to leave downloaded binaries in /var/cache/apt (massively speeds up Docker builds)
2023-10-31 16:06:19 -07:00
# Remove the docker-clean apt config, then write an apt config snippet that keeps
# downloaded packages so the /var/cache/apt cache mounts used below stay populated.
RUN rm -f /etc/apt/apt.conf.d/docker-clean; \
    echo 'Binary::apt::APT::Keep-Downloaded-Packages "true";' \
        > /etc/apt/apt.conf.d/keep-cache
2023-10-31 06:28:11 -07:00
2023-11-03 21:54:17 -07:00
# Print debug info about build and save it to disk, for human eyes only, not used by anything else
2023-10-30 23:25:51 -07:00
# Everything printed inside the subshell is shown in the build log and appended
# to /VERSION.txt: build banner, platform, start time, OS release, tool versions
# and the full environment.
RUN ( \
        echo "[i] Docker build for ArchiveBox $(cat /VERSION.txt) starting..." \
        && echo "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
        && echo "BUILD_START_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ} LANG=${LANG}" \
        # two blank separator lines
        && printf '\n\n' \
        && uname -a \
        && head -n7 /etc/os-release \
        && which bash \
        && bash --version | head -n1 \
        && which dpkg \
        && dpkg --version | head -n1 \
        && echo -e '\n\n' \
        && env \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2023-10-20 02:47:34 -07:00
2020-08-13 22:23:27 -04:00
# Create non-privileged user for archivebox and chrome
2023-10-31 03:06:02 -07:00
# The whole chain is grouped in ( ... ) so that all of its output -- not just the
# output of the final command -- is piped to tee and recorded in /VERSION.txt.
# The last echo records the uid/gid the user actually ended up with.
RUN ( \
        echo "[*] Setting up $ARCHIVEBOX_USER user uid=${DEFAULT_PUID}..." \
        && groupadd --system "$ARCHIVEBOX_USER" \
        && useradd --system --create-home --gid "$ARCHIVEBOX_USER" --groups audio,video "$ARCHIVEBOX_USER" \
        # pin the uid/gid to the defaults
        && usermod -u "$DEFAULT_PUID" "$ARCHIVEBOX_USER" \
        && groupmod -g "$DEFAULT_PGID" "$ARCHIVEBOX_USER" \
        && echo -e "\nARCHIVEBOX_USER=$ARCHIVEBOX_USER PUID=$(id -u "$ARCHIVEBOX_USER") PGID=$(id -g "$ARCHIVEBOX_USER")\n\n" \
    ) | tee -a /VERSION.txt
# DEFAULT_PUID and DEFAULT_PGID are overridden by PUID and PGID in /bin/docker_entrypoint.sh at runtime
# https://docs.linuxserver.io/general/understanding-puid-and-pgid
2020-08-03 13:19:47 -05:00
2023-10-20 02:47:34 -07:00
# Install system apt dependencies (adding backports to access more recent apt updates)
2023-10-31 19:43:41 -07:00
# Registers the bookworm-backports repo, then installs the base tooling from it.
# The apt package cache lives in a per-architecture BuildKit cache mount; the
# package index lists are removed at the end of the step.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing APT base system dependencies for $TARGETPLATFORM..." \
    && echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
    # keyring directory used by later steps that add third-party apt repos
    && mkdir -p /etc/apt/keyrings \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        # 1. packaging dependencies
        apt-transport-https ca-certificates apt-utils gnupg2 curl wget \
        # 2. docker and init system dependencies
        zlib1g-dev dumb-init gosu cron unzip grep \
        # 3. frivolous CLI helpers to make debugging failed archiving easier
        # nano iputils-ping dnsutils htop procps jq yq
    && rm -rf /var/lib/apt/lists/*
2019-01-23 01:06:47 -05:00
2023-10-20 02:47:34 -07:00
######### Language Environments ####################################
2020-08-11 11:52:43 -05:00
2020-08-13 22:23:27 -04:00
# Install Node environment
2023-10-31 19:43:41 -07:00
# Adds the NodeSource apt repo for NODE_VERSION, installs nodejs from it, updates
# npm, and appends the node/npm versions to /VERSION.txt.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing Node $NODE_VERSION environment in $NODE_MODULES..." \
    && echo "deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_${NODE_VERSION}.x nodistro main" >> /etc/apt/sources.list.d/nodejs.list \
    # convert the ASCII-armored repo key to the binary keyring named in signed-by above
    # (a single --dearmor pass is sufficient)
    && curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        nodejs libatomic1 python3-minimal \
    && rm -rf /var/lib/apt/lists/* \
    # Update NPM to latest version
    && npm i -g npm --cache /root/.npm \
    # Save version info
    && ( \
        which node && node --version \
        && which npm && npm --version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2023-10-20 02:47:34 -07:00
# Install Python environment
2023-10-31 19:43:41 -07:00
# Nothing is installed here at the moment (the venv/PDM steps are commented out).
# The step verifies that python3 reports PYTHON_VERSION -- the grep makes the
# build fail on a mismatch -- and appends the python/pip versions to /VERSION.txt.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Setting up Python $PYTHON_VERSION runtime..." \
    # tell PDM to allow using global system python site packages
    # && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
    # create global virtual environment GLOBAL_VENV to use (better than using pip install --global)
    # && python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
    # && python3 -m venv --system-site-packages $GLOBAL_VENV \
    # && python3 -m venv $GLOBAL_VENV \
    # install global dependencies / python build dependencies in GLOBAL_VENV
    # && pip install --upgrade pip setuptools wheel \
    # Save version info
    && ( \
        which python3 && python3 --version | grep " $PYTHON_VERSION" \
        && which pip && pip --version \
        # && which pdm && pdm --version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2020-04-22 21:13:49 -04:00
2023-10-20 02:47:34 -07:00
######### Extractor Dependencies ##################################
2020-09-08 17:12:55 -05:00
2023-10-20 02:47:34 -07:00
# Install apt dependencies
2023-10-31 19:43:41 -07:00
# Installs the command-line tools used by the extractors and appends their
# versions to /VERSION.txt.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing APT extractor dependencies globally using apt..." \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        curl wget git yt-dlp ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
        # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
    && rm -rf /var/lib/apt/lists/* \
    # Save version info
    && ( \
        which curl && curl --version | head -n1 \
        && which wget && wget --version 2>&1 | head -n1 \
        && which yt-dlp && yt-dlp --version 2>&1 | head -n1 \
        && which git && git --version 2>&1 | head -n1 \
        && which rg && rg --version 2>&1 | head -n1 \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2018-10-13 22:47:30 -04:00
2023-10-20 02:47:34 -07:00
# Install chromium browser using playwright
2023-10-31 19:43:41 -07:00
# Installs Chromium (via playwright on amd64/arm64, via apt elsewhere), links the
# resulting binary to /usr/bin/chromium-browser, prepares the user's chromium
# config directory, and appends the browser version to /VERSION.txt.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/ms-playwright,sharing=locked,id=browsers-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing Browser binary dependencies to $PLAYWRIGHT_BROWSERS_PATH..." \
    && apt-get update -qq \
    && if [[ "$TARGETPLATFORM" == *amd64* || "$TARGETPLATFORM" == *arm64* ]]; then \
        # install Chromium using playwright
        pip install playwright \
        # copy whatever the ms-playwright cache mount already holds into the browsers path
        && cp -r /root/.cache/ms-playwright "$PLAYWRIGHT_BROWSERS_PATH" \
        && playwright install --with-deps chromium \
        # ask playwright where its chromium executable lives
        && export CHROME_BINARY="$(python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')"; \
    else \
        # fall back to installing Chromium via apt-get on platforms not supported by playwright (e.g. risc, ARMv7, etc.)
        apt-get install -qq -y -t bookworm-backports --no-install-recommends \
            chromium fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
        && export CHROME_BINARY="$(which chromium)"; \
    fi \
    && rm -rf /var/lib/apt/lists/* \
    # expose whichever binary was installed under one stable path
    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
    && chown -R "$ARCHIVEBOX_USER" "/home/${ARCHIVEBOX_USER}/.config" \
    # Save version info
    && ( \
        which chromium-browser && /usr/bin/chromium-browser --version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2023-10-20 02:47:34 -07:00
# Install Node dependencies
2023-11-13 20:57:31 -08:00
# Copy only the npm manifests so this layer is rebuilt only when they change.
COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR"/
# Install the exact dependency tree from package-lock.json and append the
# node/npm versions to /VERSION.txt.
# NOTE(review): `npm ci` reads package.json from the current working directory;
# confirm WORKDIR is $CODE_DIR at this point (no WORKDIR is visible in this view).
RUN --mount=type=cache,target=/root/.npm,sharing=locked,id=npm-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing NPM extractor dependencies from package.json into $NODE_MODULES..." \
    && npm ci --prefer-offline --no-audit --cache /root/.npm \
    && ( \
        which node && node --version \
        && which npm && npm version \
        && echo -e '\n\n' \
    ) | tee -a /VERSION.txt
2023-10-20 02:47:34 -07:00
######### Build Dependencies ####################################
2021-02-16 15:55:47 -05:00
2023-10-30 23:58:13 -07:00
# Install ArchiveBox Python dependencies
2023-11-13 20:57:31 -08:00
# Copy only the Python dependency manifests so this layer is rebuilt only when they change.
COPY --chown=root:root --chmod=755 "./pyproject.toml" "requirements.txt" "$CODE_DIR"/
# Install the pinned Python dependencies. build-essential is installed for the pip
# step and purged again in the same layer so it adds nothing to the image size.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[+] Installing PIP ArchiveBox dependencies from requirements.txt for ${TARGETPLATFORM}..." \
    && apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        build-essential \
        libssl-dev libldap2-dev libsasl2-dev \
        python3-ldap python3-msgpack python3-mutagen python3-regex python3-pycryptodome procps \
    # && ln -s "$GLOBAL_VENV" "$APP_VENV" \
    # && pdm use --venv in-project \
    # && pdm run python -m ensurepip \
    # && pdm sync --fail-fast --no-editable --group :all --no-self \
    # && pdm export -o requirements.txt --without-hashes \
    # && source $GLOBAL_VENV/bin/activate \
    # use the copied file's full path so the step does not depend on the current WORKDIR
    && pip install -r "$CODE_DIR/requirements.txt" \
    && apt-get purge -y \
        build-essential \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*
2023-10-20 02:47:34 -07:00
# Install ArchiveBox Python package from source
2023-10-20 04:08:38 -07:00
# Copy the full source tree (the manifests copied earlier are included again).
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
# Install the archivebox package in editable mode with its sonic and ldap extras.
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked,id=apt-$TARGETARCH$TARGETVARIANT --mount=type=cache,target=/root/.cache/pip,sharing=locked,id=pip-$TARGETARCH$TARGETVARIANT \
    echo "[*] Installing PIP ArchiveBox package from $CODE_DIR..." \
    && apt-get update -qq \
    # install C compiler to build deps on platforms that dont have 32-bit wheels available on pypi
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        build-essential \
    # INSTALL ARCHIVEBOX python package globally from CODE_DIR, with all optional dependencies
    # (the extras spec stays inside the quotes so bash cannot treat [..] as a glob pattern)
    && pip install -e "${CODE_DIR}[sonic,ldap]" \
    # save docker image size and always remove compilers / build tools after building is complete
    && apt-get purge -y build-essential \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*
2023-10-20 02:47:34 -07:00
2020-08-13 22:23:27 -04:00
# Setup ArchiveBox runtime config
2023-10-20 04:08:38 -07:00
2023-10-30 23:25:51 -07:00
## No need to set explicitly, these values will be autodetected by archivebox in docker:
# WGET_BINARY="wget" \
# CHROME_BINARY="/usr/bin/chromium-browser" \
# SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
# READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
# MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
2018-10-13 22:47:30 -04:00
2020-08-13 22:23:27 -04:00
# Print version for nice docker finish summary
2023-10-30 23:25:51 -07:00
# Append the closing banner, platform, end time and the output of
# `archivebox version` (run through the entrypoint script) to /VERSION.txt.
RUN (echo -e "\n\n[√] Finished Docker build successfully. Saving build summary in: /VERSION.txt" \
    && echo -e "PLATFORM=${TARGETPLATFORM} ARCH=$(uname -m) ($(uname -s) ${TARGETARCH} ${TARGETVARIANT})" \
    && echo -e "BUILD_END_TIME=$(date +"%Y-%m-%d %H:%M:%S %s") TZ=${TZ}\n\n" \
    && "$CODE_DIR/bin/docker_entrypoint.sh" \
        archivebox version 2>&1 \
    ) | tee -a /VERSION.txt
2023-10-20 02:47:34 -07:00
2018-10-13 22:47:30 -04:00
2020-08-13 22:23:27 -04:00
# Open up the interfaces to the outside world
2023-10-30 23:25:51 -07:00
2020-08-13 22:23:27 -04:00
2020-07-22 01:30:58 -04:00
2021-12-02 21:03:19 -05:00
# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
2021-02-17 18:24:38 -05:00
2020-08-10 14:15:53 -04:00
# dumb-init runs as PID 1 and launches the entrypoint script, which receives CMD
# as its arguments.
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
# Default command: start the web server on all interfaces, port 8000, so that the
# documented `docker run -p 8000:8000 archivebox server` mapping can reach it.
# An empty bind argument would leave the address to the server's own default.
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]