# ArchiveBox/Dockerfile
#
# This is the Dockerfile for ArchiveBox, it bundles the following dependencies:
#     python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
# Usage:
#     git submodule update --init --recursive
#     git pull --recurse-submodules
#     docker build . -t archivebox --no-cache
#     docker run -v "$PWD/data":/data archivebox init
#     docker run -v "$PWD/data":/data archivebox add 'https://example.com'
#     docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
#     docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
#     docker buildx create --use
#     docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing
# ArchiveBox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).

# Base image: the bookworm-backports tag of the official Debian image.
FROM debian:bookworm-backports

# Image metadata (name, maintainer, description and project links).
LABEL name="archivebox" \
      maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
      description="All-in-one personal internet archiving container" \
      homepage="https://github.com/ArchiveBox/ArchiveBox" \
      documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"

######### Base System Setup ####################################

# Global system-level config: timezone, locale, Python I/O behaviour,
# non-interactive apt and quiet npm logging. These are set with ENV, so they
# apply both to later build steps and to the running container.
ENV TZ=UTC \
    LANGUAGE=en_US:en \
    LC_ALL=C.UTF-8 \
    LANG=C.UTF-8 \
    PYTHONIOENCODING=UTF-8 \
    PYTHONUNBUFFERED=1 \
    DEBIAN_FRONTEND=noninteractive \
    APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
    npm_config_loglevel=error

# Application-level config: locations of the source tree, the data volume,
# the two Python virtualenvs and the node modules, plus the runtime user name.
ENV CODE_DIR="/app" \
    DATA_DIR="/data" \
    GLOBAL_VENV="/venv" \
    APP_VENV="/app/.venv" \
    NODE_MODULES="/app/node_modules" \
    ARCHIVEBOX_USER="archivebox"

# Append the virtualenv and node_modules bin directories to the search path.
ENV PATH="${PATH}:${GLOBAL_VENV}/bin:${APP_VENV}/bin:${NODE_MODULES}/.bin"
# Create the non-privileged system group and user (with a home directory and
# membership of the audio and video groups) used for archivebox and chrome,
# and make sure the apt keyrings directory exists.
RUN groupadd --system "${ARCHIVEBOX_USER}" \
    && useradd --system --create-home \
        --gid "${ARCHIVEBOX_USER}" \
        --groups audio,video \
        "${ARCHIVEBOX_USER}" \
    && mkdir -p /etc/apt/keyrings
# Install system apt dependencies (adding backports to access more recent apt updates).
# The backports suite has to match the bookworm base image; the previous
# bullseye-backports entry pointed apt at a different Debian release.
# NOTE(review): this source uses https before ca-certificates is installed by
# this step - confirm apt can fetch it on a clean base image.
RUN echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
        # 1. packaging dependencies
        apt-transport-https ca-certificates gnupg2 curl wget \
        # 2. docker and init system dependencies
        zlib1g-dev dumb-init gosu cron unzip \
        # 3. frivolous CLI helpers to make debugging failed archiving easier
        nano iputils-ping dnsutils \
    && mkdir -p /etc/apt/keyrings \
    # drop the package lists in the same layer so they do not end up in the image
    && rm -rf /var/lib/apt/lists/*
######### Language Environments ####################################

# Install Node environment from the NodeSource apt repository (node_20.x).
RUN echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_20.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
    && mkdir -p /etc/apt/keyrings \
    # fetch the signing key to a file first: a failed download then aborts the
    # build instead of being hidden behind a pipe, and one dearmor pass is enough
    && curl -fsSL -o /tmp/nodesource-repo.gpg.key https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key \
    && gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg /tmp/nodesource-repo.gpg.key \
    && rm -f /tmp/nodesource-repo.gpg.key \
    && apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends nodejs \
    # upgrade npm itself, then print both versions into the build log
    && npm i -g npm \
    && node --version \
    && npm --version \
    # drop the package lists in the same layer so they do not end up in the image
    && rm -rf /var/lib/apt/lists/*
# Install Python environment: system python3 from bookworm-backports plus a
# global virtualenv at $GLOBAL_VENV holding pip, pdm, setuptools and wheel.
RUN apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
    # remove Debian's EXTERNALLY-MANAGED marker file
    # (presumably so pip may also install into the system interpreter - confirm)
    && rm /usr/lib/python3*/EXTERNALLY-MANAGED \
    && python3 -m venv $GLOBAL_VENV \
    # --no-cache-dir keeps pip's download cache out of the image layer
    && $GLOBAL_VENV/bin/pip install --no-cache-dir --upgrade pip pdm setuptools wheel \
    && rm -rf /var/lib/apt/lists/*

######### Extractor Dependencies ##################################

# Install apt dependencies used by the extractors (from bookworm-backports).
RUN apt-get update -qq \
    && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
        curl wget git yt-dlp ffmpeg ripgrep \
        # Packages we have also needed in the past:
        # youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
        # fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
    && rm -rf /var/lib/apt/lists/*

# Install chromium browser using playwright.
# Browsers are downloaded into $PLAYWRIGHT_BROWSERS_PATH; the resolved chromium
# executable is then symlinked to /usr/bin/chromium-browser.
ENV PLAYWRIGHT_BROWSERS_PATH=/browsers
RUN apt-get update -qq \
    # --no-cache-dir keeps pip's download cache out of the image layer
    && $GLOBAL_VENV/bin/pip install --no-cache-dir playwright \
    && $GLOBAL_VENV/bin/playwright install --with-deps chromium \
    # ask playwright where it put the chromium executable
    && CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
    && ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
    # pre-create the chromium config dir and hand it to the runtime user
    && mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
    && chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config" \
    # drop the package lists in the same layer so they do not end up in the image
    && rm -rf /var/lib/apt/lists/*
# Install Node dependencies.
# Only the manifest and lockfile are copied here, so this layer stays cached
# until one of them changes. COPY is used because these are plain local files.
WORKDIR "$CODE_DIR"
COPY "package.json" "package-lock.json" "$CODE_DIR/"
RUN npm ci --prefer-offline --no-audit \
    # remove npm's download cache in the same layer to keep the image small
    && npm cache clean --force
# Smoke-test that the readability extractor was installed and runs.
RUN "$NODE_MODULES/.bin/readability-extractor" --version
######### Build Dependencies ####################################

# Copy the whole build context into the code directory, owned by root.
WORKDIR "${CODE_DIR}"
COPY --chown=root:root . "${CODE_DIR}/"
# Install Python Build dependencies & build ArchiveBox package
# RUN apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# && pdm venv create \
# && pdm install --fail-fast --no-lock --group :all \
# && pdm build \
# && apt-get purge -y \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# # these are only needed to build CPython libs, we discard after build phase to shrink layer size
# && apt-get autoremove -y \
# && rm -rf /var/lib/apt/lists/*
# Install ArchiveBox Python package from source into the global virtualenv
# (editable install, with the sonic and ldap extras).
# No apt packages are installed in this step, so no `apt-get update` is run:
# it would only leave package lists behind in the layer.
# NOTE(review): the ldap extra may need the build packages listed in the
# commented-out step above (build-essential, libldap2-dev, libsasl2-dev) - confirm.
RUN $GLOBAL_VENV/bin/pip install --no-cache-dir -e "$CODE_DIR"[sonic,ldap]
####################################################

# Setup ArchiveBox runtime config: which binaries to use and which
# node-based extractors are switched on.
ENV IN_DOCKER="True" \
    WGET_BINARY="wget" \
    YOUTUBEDL_BINARY="yt-dlp" \
    CHROME_SANDBOX="False" \
    CHROME_BINARY="/usr/bin/chromium-browser" \
    USE_SINGLEFILE="True" \
    SINGLEFILE_BINARY="${NODE_MODULES}/.bin/single-file" \
    USE_READABILITY="True" \
    READABILITY_BINARY="${NODE_MODULES}/.bin/readability-extractor" \
    USE_MERCURY="True" \
    MERCURY_BINARY="${NODE_MODULES}/.bin/postlight-parser"

# Print version for nice docker finish summary
# RUN archivebox version
# Each command's output is shown in the build log and appended to /version_info.txt.
# npm is run from $CODE_DIR (where package.json was copied); the previously
# used $NODE_DIR is not defined anywhere in this file.
RUN echo "[√] Finished Docker build succesfully. Saving build summary in: /version_info.txt" \
    && uname -a | tee -a /version_info.txt \
    && env --chdir="$CODE_DIR" npm version | tee -a /version_info.txt \
    && env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
    && "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt

####################################################

# Open up the interfaces to the outside world:
# the data directory as a volume and the web server port.
VOLUME "/data"
EXPOSE 8000

# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
#     CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1

# dumb-init runs the entrypoint script; the default command starts the
# ArchiveBox server on 0.0.0.0:8000.
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]