Merge branch 'dev' into search_index_extract_html_text

This commit is contained in:
Nick Sweeting 2023-10-27 23:09:28 -07:00 committed by GitHub
commit a680724367
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
29 changed files with 3230 additions and 1654 deletions

View file

@ -5,16 +5,21 @@ __pycache__/
.mypy_cache/
.pytest_cache/
.github/
.git/
.pdm-build/
.pdm-python/
.eggs/
venv/
.venv/
.docker-venv/
node_modules/
build/
dist/
pip_dist/
!pip_dist/archivebox.egg-info/requires.txt
brew_dist/
deb_dist/
pip_dist/
assets/
data/

View file

@ -7,7 +7,7 @@ on:
jobs:
build:
runs-on: ubuntu-20.04
runs-on: ubuntu-22.04
steps:
- uses: actions/checkout@v2
@ -18,7 +18,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.9
python-version: 3.11
architecture: x64
- name: Build Python Package

2
.gitignore vendored
View file

@ -13,6 +13,8 @@ venv/
node_modules/
# Packaging artifacts
.pdm-python
.pdm-build
archivebox.egg-info
archivebox-*.tar.gz
build/

View file

@ -12,19 +12,21 @@
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing
# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
# Read more about [developing Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
FROM python:3.11-slim-bullseye
FROM debian:bookworm-backports
# Debian 12 w/ faster package updates: https://packages.debian.org/bookworm-backports/
LABEL name="archivebox" \
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
maintainer="Nick Sweeting <dockerfile@archivebox.io>" \
description="All-in-one personal internet archiving container" \
homepage="https://github.com/ArchiveBox/ArchiveBox" \
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
# System-level base config
######### Base System Setup ####################################
# Global system-level config
ENV TZ=UTC \
LANGUAGE=en_US:en \
LC_ALL=C.UTF-8 \
@ -32,103 +34,156 @@ ENV TZ=UTC \
PYTHONIOENCODING=UTF-8 \
PYTHONUNBUFFERED=1 \
DEBIAN_FRONTEND=noninteractive \
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
npm_config_loglevel=error
# Application-level base config
# Application-level config
ENV CODE_DIR=/app \
VENV_PATH=/venv \
DATA_DIR=/data \
NODE_DIR=/node \
GLOBAL_VENV=/venv \
APP_VENV=/app/.venv \
NODE_MODULES=/app/node_modules \
ARCHIVEBOX_USER="archivebox"
ENV PATH="$PATH:$GLOBAL_VENV/bin:$APP_VENV/bin:$NODE_MODULES/.bin"
SHELL ["/bin/bash", "-c"]
ARG TARGETPLATFORM
ARG TARGETARCH
ARG TARGETVARIANT
RUN printf "[i] Building for TARGETPLATFORM=${TARGETPLATFORM}" \
&& printf ", TARGETARCH=${TARGETARCH}" \
&& printf ", TARGETVARIANT=${TARGETVARIANT} \n" \
&& printf "uname -a : " && uname -a
# Create non-privileged user for archivebox and chrome
RUN groupadd --system $ARCHIVEBOX_USER \
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER
RUN echo "[*] Setting up system environment..." \
&& groupadd --system $ARCHIVEBOX_USER \
&& useradd --system --create-home --gid $ARCHIVEBOX_USER --groups audio,video $ARCHIVEBOX_USER \
&& mkdir -p /etc/apt/keyrings
# Install system dependencies
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
apt-transport-https ca-certificates gnupg2 zlib1g-dev \
dumb-init gosu cron unzip curl \
# Install system apt dependencies (adding backports to access more recent apt updates)
RUN echo "[+] Installing system dependencies..." \
&& echo 'deb https://deb.debian.org/debian bookworm-backports main contrib non-free' >> /etc/apt/sources.list.d/backports.list \
&& apt-get update -qq \
&& apt-get install -qq -y \
apt-transport-https ca-certificates gnupg2 curl wget \
zlib1g-dev dumb-init gosu cron unzip \
# nano iputils-ping dnsutils htop procps \
# 1. packaging dependencies
# 2. docker and init system dependencies
# 3. frivolous CLI helpers to make debugging failed archiving easier
&& mkdir -p /etc/apt/keyrings \
&& rm -rf /var/lib/apt/lists/*
# Install apt dependencies
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
wget curl chromium git ffmpeg youtube-dl ripgrep \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& ln -s /usr/bin/chromium /usr/bin/chromium-browser \
&& rm -rf /var/lib/apt/lists/*
######### Language Environments ####################################
# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
&& echo 'deb https://deb.nodesource.com/node_18.x buster main' >> /etc/apt/sources.list \
RUN echo "[+] Installing Node environment..." \
&& echo 'deb [signed-by=/etc/apt/keyrings/nodesource.gpg] https://deb.nodesource.com/node_21.x nodistro main' >> /etc/apt/sources.list.d/nodejs.list \
&& curl -fsSL https://deb.nodesource.com/gpgkey/nodesource-repo.gpg.key | gpg --dearmor | gpg --dearmor -o /etc/apt/keyrings/nodesource.gpg \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
nodejs \
# && npm install -g npm \
&& apt-get install -qq -y nodejs libatomic1 \
&& npm i -g npm \
&& node --version \
&& npm --version
# Install Python environment
# Installs the system Python toolchain plus the LDAP/SASL/SSL headers from bookworm-backports,
# then creates the shared virtualenv at $GLOBAL_VENV that the later pip/playwright steps use.
RUN echo "[+] Installing Python environment..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
python3 python3-pip python3-venv python3-setuptools python3-wheel python-dev-is-python3 \
python3-ldap libldap2-dev libsasl2-dev libssl-dev python3-msgpack \
# NOTE(review): EXTERNALLY-MANAGED is presumably Debian's PEP 668 marker file; deleting it lets pip modify the system Python -- confirm this is still wanted
&& rm /usr/lib/python3*/EXTERNALLY-MANAGED \
# --system-site-packages makes the apt-installed python3-* packages above visible inside the venv
&& python3 -m venv --system-site-packages --symlinks $GLOBAL_VENV \
&& $GLOBAL_VENV/bin/pip install --upgrade pip pdm setuptools wheel python-ldap \
# remove the apt package lists in the same layer that fetched them
&& rm -rf /var/lib/apt/lists/*
######### Extractor Dependencies ##################################
# Install apt dependencies
RUN echo "[+] Installing extractor APT dependencies..." \
&& apt-get update -qq \
&& apt-get install -qq -y -t bookworm-backports --no-install-recommends \
curl wget git yt-dlp ffmpeg ripgrep \
# Packages we have also needed in the past:
# youtube-dl wget2 aria2 python3-pyxattr rtmpdump libfribidi-bin mpv \
# fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*
# Install chromium browser using playwright
# NOTE(review): PLAYWRIGHT_BROWSERS_PATH presumably tells playwright where to store downloaded browsers -- confirm
ENV PLAYWRIGHT_BROWSERS_PATH="/browsers"
RUN echo "[+] Installing extractor Chromium dependency..." \
&& apt-get update -qq \
&& $GLOBAL_VENV/bin/pip install playwright \
&& $GLOBAL_VENV/bin/playwright install --with-deps chromium \
# ask playwright for the path of the chromium executable and expose it as /usr/bin/chromium-browser
&& CHROME_BINARY="$($GLOBAL_VENV/bin/python -c 'from playwright.sync_api import sync_playwright; print(sync_playwright().start().chromium.executable_path)')" \
&& ln -s "$CHROME_BINARY" /usr/bin/chromium-browser \
# pre-create the crash report directory and give the unprivileged user ownership of its ~/.config
&& mkdir -p "/home/${ARCHIVEBOX_USER}/.config/chromium/Crash Reports/pending/" \
&& chown -R $ARCHIVEBOX_USER "/home/${ARCHIVEBOX_USER}/.config" \
# a failure anywhere in the && chain above falls through to this check: the build only continues on linux/arm/v7
|| if [[ "$TARGETPLATFORM" == "linux/arm/v7" ]]; then exit 0; else exit 1; fi
# ignore failure for architectures where no playwright release is available yet
# Install Node dependencies
WORKDIR "$NODE_DIR"
ENV PATH="${PATH}:$NODE_DIR/node_modules/.bin" \
npm_config_loglevel=error
ADD ./package.json ./package.json
ADD ./package-lock.json ./package-lock.json
RUN npm ci
# Install Python dependencies
WORKDIR "$CODE_DIR"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
&& pip install --upgrade --quiet pip setuptools \
&& mkdir -p "$CODE_DIR/archivebox"
ADD "./setup.py" "$CODE_DIR/"
ADD "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
&& pip install -r /tmp/requirements.txt \
&& pip install --upgrade youtube-dl yt-dlp \
&& apt-get purge -y build-essential python-dev python3-dev libldap2-dev libsasl2-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
COPY --chown=root:root --chmod=755 "package.json" "package-lock.json" "$CODE_DIR/"
RUN echo "[+] Installing extractor Node dependencies..." \
&& npm ci --prefer-offline --no-audit \
&& npm version
# Install apt development dependencies
# RUN apt-get install -qq \
# && apt-get install -qq -y --no-install-recommends \
# python3 python3-dev python3-pip python3-venv python3-all \
# dh-python debhelper devscripts dput software-properties-common \
# python3-distutils python3-setuptools python3-wheel python3-stdeb
# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
# && pip install --quiet -r /tmp/dev_requirements.txt
######### Build Dependencies ####################################
# Install ArchiveBox Python package and its dependencies
WORKDIR "$CODE_DIR"
ADD . "$CODE_DIR"
RUN chown -R root:root . && chmod a+rX -R . && pip install -e .
# # Building ArchiveBox from source with all pdm dev dependencies
# WORKDIR "$CODE_DIR"
# COPY --chown=root:root --chmod=755 "./pyproject.toml" "./pdm.lock" "$CODE_DIR/"
# RUN echo "[+] Installing project Python dependencies..." \
# && apt-get update -qq \
# && apt-get install -qq -y -t bookworm-backports --no-install-recommends \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# && pdm use -f $GLOBAL_VENV \
# && pdm install --fail-fast --no-lock --group :all --no-self \
# && pdm build \
# && apt-get purge -y \
# build-essential libssl-dev libldap2-dev libsasl2-dev \
# # these are only needed to build CPython libs, we discard after build phase to shrink layer size
# && apt-get autoremove -y \
# && rm -rf /var/lib/apt/lists/*
# Install ArchiveBox Python package from source
COPY --chown=root:root --chmod=755 "." "$CODE_DIR/"
# Editable install of the source tree copied into $CODE_DIR, with the sonic and ldap extras.
# No apt packages are installed in this step, so `apt-get update` is not run here:
# it would only leave downloaded package lists behind in this layer.
RUN echo "[*] Installing ArchiveBox package from /app..." \
    && $GLOBAL_VENV/bin/pip install -e "$CODE_DIR"[sonic,ldap]
####################################################
# Setup ArchiveBox runtime config
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True \
WGET_BINARY="wget" \
YOUTUBEDL_BINARY="yt-dlp" \
CHROME_SANDBOX=False \
CHROME_BINARY="/usr/bin/chromium-browser" \
USE_SINGLEFILE=True \
SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
SINGLEFILE_BINARY="$NODE_MODULES/.bin/single-file" \
USE_READABILITY=True \
READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
READABILITY_BINARY="$NODE_MODULES/.bin/readability-extractor" \
USE_MERCURY=True \
MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
YOUTUBEDL_BINARY="yt-dlp"
MERCURY_BINARY="$NODE_MODULES/.bin/postlight-parser"
# Print version for nice docker finish summary
# RUN archivebox version
RUN /app/bin/docker_entrypoint.sh archivebox version
# Record a build summary in /version_info.txt.
# Each command is piped through `tee -a`, so its output appears in the build log and is appended to the file.
# pipefail is not enabled by the SHELL instruction, so each pipeline's exit status is that of `tee`.
# npm is queried from $CODE_DIR because that is where package.json and node_modules are installed.
RUN echo "[√] Finished Docker build successfully. Saving build summary in: /version_info.txt" \
    && uname -a | tee -a /version_info.txt \
    && env --chdir="$CODE_DIR" npm version | tee -a /version_info.txt \
    && env --chdir="$CODE_DIR" pdm info | tee -a /version_info.txt \
    && "$CODE_DIR/bin/docker_entrypoint.sh" archivebox version 2>&1 | tee -a /version_info.txt
####################################################
# Open up the interfaces to the outside world
VOLUME "$DATA_DIR"
VOLUME "/data"
EXPOSE 8000
# Optional:

View file

@ -10,7 +10,7 @@
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
<pre lang="bash"><code style="white-space: pre-line">"Your own personal internet archive" (网站存档 / 爬虫)
<pre lang="bash" align="center"><code style="white-space: pre-line; text-align: center" align="center">"Your own personal internet archive" (网站存档 / 爬虫)
curl -sSL 'https://get.archivebox.io' | sh
</code></pre>
@ -588,7 +588,8 @@ Each snapshot subfolder `./archive/<timestamp>/` includes a static `index.json`
You can export the main index to browse it statically without needing to run a server.
*Note about large exports: These exports are not paginated, exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.*
> **Note**
> These exports are not paginated; exporting many URLs or the entire archive at once may be slow. Use the filtering CLI flags on the `archivebox list` command to export specific Snapshots or ranges.
```bash
# archivebox list --help
@ -615,7 +616,7 @@ The paths in the static exports are relative, make sure to keep them next to you
### Archiving Private Content
<a id="archiving-private-urls"/>
<a id="archiving-private-urls"></a>
If you're importing pages with private content or URLs containing secret tokens you don't want public (e.g. Google Docs, paywalled content, unlisted videos, etc.), **you may want to disable some of the extractor methods to avoid leaking that content to 3rd party APIs or the public**.
@ -985,6 +986,7 @@ archivebox init --setup
<details><summary><i>Click to expand...</i></summary>
Make sure to run this whenever you change things in `models.py`.
```bash
cd archivebox/
./manage.py makemigrations
@ -993,6 +995,7 @@ cd path/to/test/data/
archivebox shell
archivebox manage dbshell
```
(uses `pytest -s`)
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
@ -1000,7 +1003,9 @@ https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-dj
#### Contributing a new extractor
<details><summary><i>Click to expand...</i></summary><br/><br/>
<details><summary><i>Click to expand...</i></summary>
<br/><br/>
ArchiveBox [`extractors`](https://github.com/ArchiveBox/ArchiveBox/blob/dev/archivebox/extractors/media.py) are external binaries or Python/Node scripts that ArchiveBox runs to archive content on a page.

34
SECURITY.md Normal file
View file

@ -0,0 +1,34 @@
# Security Policy
---
## Security Information
Please see this wiki page for important notices about ArchiveBox security, publishing your archives securely, and the dangers of executing archived JS:
https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview
Also see this section of the README about important caveats when running ArchiveBox:
https://github.com/ArchiveBox/ArchiveBox?tab=readme-ov-file#caveats
You can also read these pages for more information about ArchiveBox's internals, development environment, DB schema, and more:
- https://github.com/ArchiveBox/ArchiveBox#archive-layout
- https://github.com/ArchiveBox/ArchiveBox#archivebox-development
- https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives
- https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting
---
## Reporting a Vulnerability
We use GitHub's built-in [Private Reporting](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability) feature to accept vulnerability reports.
1. Go to the Security tab on our GitHub repo: https://github.com/ArchiveBox/ArchiveBox/security
2. Click the ["Report a Vulnerability"](https://github.com/ArchiveBox/ArchiveBox/security/advisories/new) button
3. Fill out the form to submit the details of the report and it will be securely sent to the maintainers
You can also contact the maintainers via our public [Zulip Chat Server zulip.archivebox.io](https://zulip.archivebox.io) or [Twitter DMs @ArchiveBoxApp](https://twitter.com/ArchiveBoxApp).

View file

@ -90,8 +90,13 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '644'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
'URL_WHITELIST': {'type': str, 'default': None},
'URL_DENYLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', 'aliases': ('URL_BLACKLIST',)}, # to avoid downloading code assets as their own pages
'URL_ALLOWLIST': {'type': str, 'default': None, 'aliases': ('URL_WHITELIST',)},
'ADMIN_USERNAME': {'type': str, 'default': None},
'ADMIN_PASSWORD': {'type': str, 'default': None},
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
},
@ -143,6 +148,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
'SAVE_ALLOWLIST': {'type': dict, 'default': {},},
'SAVE_DENYLIST': {'type': dict, 'default': {},},
},
'ARCHIVE_METHOD_OPTIONS': {
@ -231,12 +238,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
@ -374,6 +380,8 @@ def get_commit_hash(config):
############################## Derived Config ##################################
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
'USER': {'default': lambda c: SYSTEM_USER},
@ -390,8 +398,8 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (Path(c['CHROME_USER_DATA_DIR']).resolve() if c['CHROME_USER_DATA_DIR'] else None)}, # None means unset, so we autodetect it with find_chrome_Data_dir(), but emptystring '' means user manually set it to '', and we should store it as None
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
'URL_WHITELIST_PTN': {'default': lambda c: c['URL_WHITELIST'] and re.compile(c['URL_WHITELIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
@ -435,7 +443,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -465,10 +473,11 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
}
################################### Helpers ####################################

View file

@ -41,7 +41,7 @@ class ConfigDict(BaseConfig, total=False):
MEDIA_TIMEOUT: int
OUTPUT_PERMISSIONS: str
RESTRICT_FILE_NAMES: str
URL_BLACKLIST: str
URL_DENYLIST: str
SECRET_KEY: Optional[str]
BIND_ADDR: str

View file

@ -41,7 +41,7 @@ class AddLinkForm(forms.Form):
# label="Exclude patterns",
# min_length='1',
# required=False,
# initial=URL_BLACKLIST,
# initial=URL_DENYLIST,
# )
# timeout = forms.IntegerField(
# initial=TIMEOUT,

View file

@ -6,9 +6,6 @@ import re
import logging
import tempfile
import ldap
from django_auth_ldap.config import LDAPSearch
from pathlib import Path
from django.utils.crypto import get_random_string
@ -97,33 +94,43 @@ AUTHENTICATION_BACKENDS = [
]
if LDAP:
global AUTH_LDAP_SERVER_URI
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
try:
import ldap
from django_auth_ldap.config import LDAPSearch
global AUTH_LDAP_BIND_DN
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
global AUTH_LDAP_SERVER_URI
global AUTH_LDAP_BIND_DN
global AUTH_LDAP_BIND_PASSWORD
global AUTH_LDAP_USER_SEARCH
global AUTH_LDAP_USER_ATTR_MAP
global AUTH_LDAP_BIND_PASSWORD
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
AUTH_LDAP_SERVER_URI = LDAP_SERVER_URI
AUTH_LDAP_BIND_DN = LDAP_BIND_DN
AUTH_LDAP_BIND_PASSWORD = LDAP_BIND_PASSWORD
global AUTH_LDAP_USER_SEARCH
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
)
assert AUTH_LDAP_SERVER_URI and LDAP_USERNAME_ATTR and LDAP_USER_FILTER, 'LDAP_* config options must all be set if LDAP=True'
global AUTH_LDAP_USER_ATTR_MAP
AUTH_LDAP_USER_ATTR_MAP = {
'username': LDAP_USERNAME_ATTR,
'first_name': LDAP_FIRSTNAME_ATTR,
'last_name': LDAP_LASTNAME_ATTR,
'email': LDAP_EMAIL_ATTR,
}
AUTH_LDAP_USER_SEARCH = LDAPSearch(
LDAP_USER_BASE,
ldap.SCOPE_SUBTREE,
'(&(' + LDAP_USERNAME_ATTR + '=%(user)s)' + LDAP_USER_FILTER + ')',
)
AUTH_LDAP_USER_ATTR_MAP = {
'username': LDAP_USERNAME_ATTR,
'first_name': LDAP_FIRSTNAME_ATTR,
'last_name': LDAP_LASTNAME_ATTR,
'email': LDAP_EMAIL_ATTR,
}
AUTHENTICATION_BACKENDS = [
'django_auth_ldap.backend.LDAPBackend',
]
except ModuleNotFoundError:
sys.stderr.write('[X] Error: Found LDAP=True config but LDAP packages not installed. You may need to run: pip install archivebox[ldap]\n\n')
# Don't hard-exit here: if the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken LDAP.
# sys.exit(1)
AUTHENTICATION_BACKENDS = [
'django_auth_ldap.backend.LDAPBackend',
]
################################################################################
### Debug Settings

View file

@ -4,12 +4,16 @@ import os
import sys
from pathlib import Path
from typing import Optional, List, Iterable, Union
from typing import Callable, Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet
from ..config import (
SAVE_ALLOWLIST_PTN,
SAVE_DENYLIST_PTN,
)
from ..core.settings import ERROR_LOG
from ..index.schema import Link
from ..index.schema import ArchiveResult, Link
from ..index.sql import write_link_to_sql_index
from ..index import (
load_link_details,
@ -43,7 +47,11 @@ from .archive_org import should_save_archive_dot_org, save_archive_dot_org
from .headers import should_save_headers, save_headers
def get_default_archive_methods():
ShouldSaveFunction = Callable[[Link, Optional[Path], Optional[bool]], bool]
SaveFunction = Callable[[Link, Optional[Path], int], ArchiveResult]
ArchiveMethodEntry = tuple[str, ShouldSaveFunction, SaveFunction]
def get_default_archive_methods() -> List[ArchiveMethodEntry]:
return [
('favicon', should_save_favicon, save_favicon),
('headers', should_save_headers, save_headers),
@ -71,12 +79,30 @@ ARCHIVE_METHODS_INDEXING_PRECEDENCE = [
('wget', 6)
]
@enforce_types
def ignore_methods(to_ignore: List[str]):
def get_archive_methods_for_link(link: Link) -> Iterable[ArchiveMethodEntry]:
    """Select the archive methods that should run for a single link.

    SAVE_ALLOWLIST_PTN and SAVE_DENYLIST_PTN map compiled URL patterns to
    collections of method names. Every allowlist pattern that matches
    ``link.url`` contributes its method names; if nothing matches, all default
    methods are allowed. Method names from matching denylist patterns are then
    removed.

    Args:
        link: the link whose ``url`` is matched against the patterns.

    Returns:
        A lazy iterable of ``(name, should_save, save)`` entries, in the same
        order as ``get_default_archive_methods()``.
    """
    DEFAULT_METHODS = get_default_archive_methods()

    def methods_matching(patterns) -> set:
        # The derived *_PTN config value is whatever falsy value the raw
        # setting held when it is unset, so it is not guaranteed to be a dict.
        return {
            name
            for pattern, names in (patterns or {}).items()
            if pattern.search(link.url)
            for name in names
        }

    allowed_methods = methods_matching(SAVE_ALLOWLIST_PTN) or {m[0] for m in DEFAULT_METHODS}
    allowed_methods -= methods_matching(SAVE_DENYLIST_PTN)

    return (m for m in DEFAULT_METHODS if m[0] in allowed_methods)
@enforce_types
def ignore_methods(to_ignore: List[str]) -> Iterable[str]:
    """Return the names of all default archive methods except those in `to_ignore`.

    Args:
        to_ignore: method names to leave out of the result.

    Returns:
        A list of method names, in the order of ``get_default_archive_methods()``.
    """
    ARCHIVE_METHODS = get_default_archive_methods()
    return [method[0] for method in ARCHIVE_METHODS if method[0] not in to_ignore]
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[Path]=None) -> Link:
@ -89,11 +115,11 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
except Snapshot.DoesNotExist:
snapshot = write_link_to_sql_index(link)
ARCHIVE_METHODS = get_default_archive_methods()
active_methods = get_archive_methods_for_link(link)
if methods:
ARCHIVE_METHODS = [
method for method in ARCHIVE_METHODS
active_methods = [
method for method in active_methods
if method[0] in methods
]
@ -110,7 +136,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
for method_name, should_run, method_function in ARCHIVE_METHODS:
for method_name, should_run, method_function in active_methods:
try:
if method_name not in link.history:
link.history[method_name] = []

View file

@ -71,7 +71,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
result = run(cmd, cwd=out_dir, timeout=timeout)
try:
result_json = json.loads(result.stdout)
assert result_json and 'content' in result_json
assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
@ -85,7 +85,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (

View file

@ -22,8 +22,8 @@ from ..config import (
JSON_INDEX_FILENAME,
OUTPUT_DIR,
TIMEOUT,
URL_BLACKLIST_PTN,
URL_WHITELIST_PTN,
URL_DENYLIST_PTN,
URL_ALLOWLIST_PTN,
stderr,
OUTPUT_PERMISSIONS
)
@ -142,9 +142,9 @@ def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
continue
if scheme(link.url) not in ('http', 'https', 'ftp'):
continue
if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.search(link.url):
if URL_DENYLIST_PTN and URL_DENYLIST_PTN.search(link.url):
continue
if URL_WHITELIST_PTN and (not URL_WHITELIST_PTN.search(link.url)):
if URL_ALLOWLIST_PTN and (not URL_ALLOWLIST_PTN.search(link.url)):
continue
yield link

View file

@ -533,11 +533,27 @@ def log_shell_welcome_msg():
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str]) -> str:
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=OUTPUT_DIR) -> str:
    """convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc

    Args:
        path: the path to shorten for display; a falsy value yields ''.
        pwd: the base directory that is abbreviated to '.' (defaults to OUTPUT_DIR).

    Returns:
        The display string: relative to `pwd` where possible, wrapped in double
        quotes if it contains a space, and the full `pwd` when `path` is `pwd` itself.
    """
    pwd = str(Path(pwd))  # .resolve()
    # a missing path (e.g. None) is shown as an empty string, not as the text 'None'
    path = str(path) if path else ''

    if not path:
        return path

    # replace long absolute paths with ./ relative ones to save on terminal output width
    # only whole path components are matched, so /data-old is not treated as being inside /data
    if pwd != '/' and (path == pwd or path.startswith(pwd + '/')):
        path = '.' + path[len(pwd):]

    # quote paths containing spaces
    if ' ' in path:
        path = f'"{path}"'

    # if path is just a plain dot, replace it back with the absolute path for clarity
    if path == '.':
        path = pwd

    return path
@enforce_types
@ -578,6 +594,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if Path(folder['path']).exists():
num_files = (
@ -592,13 +609,7 @@ def printable_folder_status(name: str, folder: Dict) -> str:
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the full path for clarity
if path == '.':
path = str(OUTPUT_DIR)
path = pretty_path(folder['path'])
return ' '.join((
ANSI[color],
@ -629,9 +640,7 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
if path and ' ' in path:
path = f'"{path}"'
path = pretty_path(dependency['path'])
return ' '.join((
ANSI[color],

View file

@ -112,6 +112,8 @@ from .config import (
load_all_config,
CONFIG,
USER_CONFIG,
ADMIN_USERNAME,
ADMIN_PASSWORD,
get_real_name,
setup_django,
)
@ -216,7 +218,7 @@ def version(quiet: bool=False,
if not quiet:
# 0.6.3
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 FS_USER=501:20 SEARCH_BACKEND=ripgrep
p = platform.uname()
print(
@ -236,7 +238,8 @@ def version(quiet: bool=False,
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
f'FS_USER={PUID}:{PGID}',
f'FS_PERMS={OUTPUT_PERMISSIONS}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
)
print()
@ -251,19 +254,19 @@ def version(quiet: bool=False,
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
for name, folder in CODE_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in CODE_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
print('{white}[i] Secrets locations:{reset}'.format(**ANSI))
for name, folder in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in EXTERNAL_LOCATIONS.items():
print(printable_folder_status(name, path))
print()
if DATA_LOCATIONS['OUTPUT_DIR']['is_valid']:
print('{white}[i] Data locations:{reset}'.format(**ANSI))
for name, folder in DATA_LOCATIONS.items():
print(printable_folder_status(name, folder))
for name, path in DATA_LOCATIONS.items():
print(printable_folder_status(name, path))
else:
print()
print('{white}[i] Data locations:{reset}'.format(**ANSI))
@ -419,14 +422,16 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
write_main_index(list(pending_links.values()), out_dir=out_dir)
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
from django.contrib.auth.models import User
if (ADMIN_USERNAME and ADMIN_PASSWORD) and not User.objects.filter(username=ADMIN_USERNAME).exists():
print('{green}[+] Found ADMIN_USERNAME and ADMIN_PASSWORD configuration options, creating new admin user.{reset}'.format(**ANSI))
User.objects.create_superuser(username=ADMIN_USERNAME, password=ADMIN_PASSWORD)
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
# TODO: allow creating new supersuer via env vars on first init
# if config.HTTP_USER and config.HTTP_PASS:
# from django.contrib.auth.models import User
# User.objects.create_superuser(HTTP_USER, '', HTTP_PASS)
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
json_index = out_dir / JSON_INDEX_FILENAME

View file

@ -1,62 +1,3 @@
{% extends "base.html" %}
{% load static %}
{% block body %}
<div id="toolbar">
<form id="changelist-search" action="{% url 'public-index' %}" method="get">
<div>
<label for="searchbar"><img src="/static/admin/img/search.svg" alt="Search"></label>
<input type="text" size="40" name="q" value="" id="searchbar" autofocus placeholder="Title, URL, tags, timestamp, or content...".>
<input type="submit" value="Search" style="height: 36px; padding-top: 6px; margin: 8px"/>
<input type="button"
value="♺"
title="Refresh..."
onclick="location.href='{% url 'public-index' %}'"
style="background-color: rgba(121, 174, 200, 0.8); height: 30px; font-size: 0.8em; margin-top: 12px; padding-top: 6px; float:right">
</input>
</div>
</form>
</div>
<table id="table-bookmarks">
<thead>
<tr>
<th style="width: 100px;">Bookmarked</th>
<th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
<th style="width: 140px">Files</th>
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
</tr>
</thead>
<tbody>
{% for link in object_list %}
{% include 'main_index_row.html' with link=link %}
{% endfor %}
</tbody>
</table>
<center>
<span class="step-links">
{% if page_obj.has_previous %}
<a href="{% url 'public-index' %}?page=1">&laquo; first</a>
<a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
{% endif %}
<span class="current">
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
<br>
</center>
{% endblock %}
{% extends "admin/base_site.html" %}
{% load i18n admin_urls static admin_list %}
{% load core_tags %}

View file

@ -33,7 +33,7 @@
<br/>
<div class="loader"></div>
<br/>
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for detailed progress...
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
@ -46,19 +46,22 @@
</form>
<br/><br/><br/>
<center id="delay-warning" style="display: none">
<small>(it's safe to leave this page, adding will continue in the background)</small>
<small>(you will be redirected to your <a href="/">Snapshot list</a> momentarily, it's safe to close this page at any time)</small>
</center>
{% if absolute_add_path %}
<center id="bookmarklet">
<!-- <center id="bookmarklet">
<p>Bookmark this link to quickly add to your archive:
<a href="javascript:void(window.open('{{ absolute_add_path }}?url='+encodeURIComponent(document.location.href)));">Add to ArchiveBox</a></p>
</center>
</center> -->
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'
document.getElementById('delay-warning').style.display = 'block'
setTimeout(function() {
window.location = '/'
}, 2000)
return true
})
</script>

35
bin/build_dev.sh Executable file
View file

@ -0,0 +1,35 @@
#!/usr/bin/env bash
# Build a local development Docker image tagged `archivebox-dev` from the repo root.
#
# NOTE(review): the usage example below names build_docker.sh, but this file is
# bin/build_dev.sh -- presumably copied over; confirm and update the example.
# ./bin/build_docker.sh dev 'linux/arm/v7'
#
# Positional args (both optional). They are currently only echoed in the log
# line below; the active `docker build` command does not use them:
#   $1  TAG_NAME            defaults to the current git branch name
#   $2  REQUIRED_PLATFORMS  defaults to linux/arm64,linux/amd64,linux/arm/v7

### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
# https://www.gnu.org/software/bash/manual/html_node/The-Set-Builtin.html
# set -o xtrace
set -o errexit
set -o errtrace
set -o nounset
set -o pipefail
IFS=$'\n'

# Resolve the repository root (parent of the directory containing this script) and work from there
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
cd "$REPO_DIR"

# Abort early if the docker CLI is not on PATH
which docker > /dev/null || exit 1

TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
# Full version string is read from package.json (requires jq)
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
# Keep only major.minor, e.g. 0.7.0 -> 0.7
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"

echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"
echo "[+] Building archivebox:$VERSION docker image..."

# docker builder prune
# Single-tag build with the layer cache disabled; the image is always tagged archivebox-dev
docker build . --no-cache -t archivebox-dev --load

# Disabled multi-platform variant, kept for reference:
# docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
# -t archivebox \
# -t archivebox:$TAG_NAME \
# -t archivebox:$VERSION \
# -t archivebox:$SHORT_VERSION

View file

@ -1,4 +1,5 @@
#!/usr/bin/env bash
# ./bin/build_docker.sh dev 'linux/arm/v7'
### Bash Environment Setup
# http://redsymbol.net/articles/unofficial-bash-strict-mode/
@ -15,10 +16,12 @@ cd "$REPO_DIR"
which docker > /dev/null || exit 1
TAG_NAME="dev"
TAG_NAME="${1:-$(git rev-parse --abbrev-ref HEAD)}"
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
SHORT_VERSION="$(echo "$VERSION" | perl -pe 's/(\d+)\.(\d+)\.(\d+)/$1.$2/g')"
REQUIRED_PLATFORMS=('linux/arm64','linux/amd64','linux/arm/v8','linux/arm/v7')
REQUIRED_PLATFORMS="${2:-"linux/arm64,linux/amd64,linux/arm/v7"}"
echo "[+] Building Docker image: tag=$TAG_NAME version=$SHORT_VERSION arch=$REQUIRED_PLATFORMS"
function check_platforms() {
INSTALLED_PLATFORMS="$(docker buildx inspect | grep 'Platforms:' )"
@ -33,44 +36,44 @@ function check_platforms() {
return 0
}
function remove_builder() {
# remove existing xbuilder
docker buildx stop xbuilder || true
docker buildx rm xbuilder || true
}
function create_builder() {
docker buildx use xbuilder && return 0
echo "[+] Creating new xbuilder for: $REQUIRED_PLATFORMS"
echo
# Switch to buildx builder if already present / previously created
docker buildx create --name xbuilder --driver docker-container --bootstrap --use --platform "$REQUIRED_PLATFORMS" || true
docker buildx inspect --bootstrap || true
echo
}
function recreate_builder() {
# Install QEMU binaries for cross-platform building if not installed
docker run --privileged --rm 'tonistiigi/binfmt' --install all
# remove existing xbuilder
docker buildx stop xbuilder || true
docker buildx rm xbuilder || true
# Create Docker builder for cross-platform building
docker buildx use xbuilder && return 0
remove_builder
create_builder
}
# Check if docker is ready for cross-platform builds, if not, recreate builder
docker buildx use xbuilder || create_builder
# Probe for an existing xbuilder quietly and create one if the probe fails.
# stdout is redirected first and stderr last, so both are discarded;
# the reverse order (2>&1 >/dev/null) would leave stderr on the terminal.
docker buildx use xbuilder >/dev/null 2>&1 || create_builder
check_platforms || (recreate_builder && check_platforms) || exit 1
echo "[+] Building archivebox:$VERSION docker image..."
#docker build . \
docker buildx build --platform "$REQUIRED_PLATFORMS" --push . \
# docker builder prune
# docker build . --no-cache -t archivebox-dev \
docker buildx build --platform "$REQUIRED_PLATFORMS" --load . \
-t archivebox \
-t archivebox:$TAG_NAME \
-t archivebox:$VERSION \
-t archivebox:$SHORT_VERSION \
-t archivebox:latest \
-t docker.io/nikisweeting/archivebox:$TAG_NAME \
-t docker.io/nikisweeting/archivebox:$VERSION \
-t docker.io/nikisweeting/archivebox:$SHORT_VERSION \

View file

@ -25,7 +25,10 @@ cd "$REPO_DIR"
rm -Rf build dist
echo "[+] Building sdist, bdist_wheel, and egg_info"
python3 setup.py \
sdist --dist-dir=./pip_dist \
bdist_wheel --dist-dir=./pip_dist \
egg_info --egg-base=./pip_dist
# python3 setup.py \
# sdist --dist-dir=./pip_dist \
# bdist_wheel --dist-dir=./pip_dist \
# egg_info --egg-base=./pip_dist
# pip install --upgrade pip setuptools build
python -m build

View file

@ -12,22 +12,26 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
fi
export PUID="$(id -u archivebox)"
export PGID="$(id -g archivebox)"
# Set the permissions of the data dir to match the archivebox user
# Check the permissions of the data dir (or create if it doesn't exist)
if [[ -d "$DATA_DIR/archive" ]]; then
# check data directory permissions
if [[ ! "$(stat -c %u $DATA_DIR/archive)" = "$(id -u archivebox)" ]]; then
echo "Change in ownership detected, please be patient while we chown existing files"
echo "This could take some time..."
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER -R "$DATA_DIR"
# Probe write access by creating and then removing a throwaway file in the archive dir
if touch "$DATA_DIR/archive/.permissions_test_safe_to_delete"; then
    # It's fine, we are able to write to the data directory
    rm "$DATA_DIR/archive/.permissions_test_safe_to_delete"
    # echo "[√] Permissions are correct"
else
    # Report the failure on stderr (>&2); a bare ">2" would write the text to a file named "2"
    echo "[X] Permissions Error: ArchiveBox is not able to write to your data dir. You need to fix the data dir ownership and retry:" >&2
    echo " chown -R $PUID:$PGID data" >&2
    echo " https://docs.linuxserver.io/general/understanding-puid-and-pgid" >&2
    exit 1
fi
else
# create data directory
mkdir -p "$DATA_DIR/logs"
chown -R $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
fi
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR"
chown $ARCHIVEBOX_USER:$ARCHIVEBOX_USER "$DATA_DIR" "$DATA_DIR"/*
# Drop permissions to run commands as the archivebox user
if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then

View file

@ -34,6 +34,8 @@ services:
# - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive
# - PUID=1000 # set to your host user's UID & GID if you encounter permissions issues
# - PGID=1000
# - ADMIN_USERNAME=admin # create an admin user on first run with the given user/pass combo
# - ADMIN_PASSWORD=SomeSecretPassword
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these and sonic container below for better full-text search
# - SEARCH_BACKEND_HOST_NAME=sonic
# - SEARCH_BACKEND_PASSWORD=SomeSecretPassword

1718
package-lock.json generated

File diff suppressed because it is too large Load diff

View file

@ -1,14 +1,13 @@
{
"name": "archivebox",
"version": "0.6.3",
"version": "0.7.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies": {
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
"playwright": "^1.37.1",
"@postlight/parser": "^2.2.3",
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
"single-file-cli": "^1.0.63"
"single-file-cli": "^1.1.12"
}
}

2077
pdm.lock Normal file

File diff suppressed because it is too large Load diff

121
pyproject.toml Normal file
View file

@ -0,0 +1,121 @@
[project]
name = "archivebox"
version = "0.7.0"
description = "Self-hosted internet archiving solution."
authors = [
{name = "Nick Sweeting", email = "setup.py@archivebox.io"},
]
dependencies = [
"setuptools>=68.2.2",
"croniter>=0.3.34",
"dateparser>=1.0.0",
"django-extensions>=3.0.3",
"django>=3.1.3,<3.2",
"ipython>5.0.0",
"mypy-extensions>=0.4.3",
"python-crontab>=2.5.1",
"requests>=2.24.0",
"w3lib>=1.22.0",
# "youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"playwright>=1.39.0",
]
requires-python = ">=3.9"
readme = "README.md"
license = {text = "MIT"}
# Trove classifiers shown on PyPI.
# The "Programming Language :: Python :: 3.x" entries are kept in line with
# requires-python (>=3.9); versions below that floor are not advertised.
classifiers = [
    "Development Status :: 4 - Beta",
    "Environment :: Console",
    "Environment :: Web Environment",
    "Framework :: Django",
    "Intended Audience :: Developers",
    "Intended Audience :: Education",
    "Intended Audience :: End Users/Desktop",
    "Intended Audience :: Information Technology",
    "Intended Audience :: Legal Industry",
    "Intended Audience :: System Administrators",
    "License :: OSI Approved :: MIT License",
    "Natural Language :: English",
    "Operating System :: OS Independent",
    "Programming Language :: Python :: 3",
    "Programming Language :: Python :: 3.9",
    "Programming Language :: Python :: 3.10",
    "Programming Language :: Python :: 3.11",
    "Topic :: Internet :: WWW/HTTP",
    "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
    "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
    "Topic :: Sociology :: History",
    "Topic :: Software Development :: Libraries :: Python Modules",
    "Topic :: System :: Archiving",
    "Topic :: System :: Archiving :: Backup",
    "Topic :: System :: Recovery Tools",
    "Topic :: Utilities",
    "Typing :: Typed",
]
# pdm lock -G:all
# pdm install -G:all
[tool.pdm.dev-dependencies]
build = [
"pdm",
"bottle",
"setuptools",
"stdeb",
"twine",
"wheel",
]
lint = [
"flake8",
"mypy",
"django-stubs",
]
test = [
"pytest",
]
debug = [
"django-debug-toolbar",
"djdt_flamegraph",
"ipdb",
]
doc = [
"recommonmark",
"sphinx",
"sphinx-rtd-theme",
]
[project.optional-dependencies]
sonic = [
# echo "deb [signed-by=/usr/share/keyrings/valeriansaliou_sonic.gpg] https://packagecloud.io/valeriansaliou/sonic/debian/ bookworm main" > /etc/apt/sources.list.d/valeriansaliou_sonic.list
# curl -fsSL https://packagecloud.io/valeriansaliou/sonic/gpgkey | gpg --dearmor -o /usr/share/keyrings/valeriansaliou_sonic.gpg
"sonic-client>=0.0.5",
]
ldap = [
# apt install libldap2-dev libsasl2-dev
"django-auth-ldap>=4.1.0",
]
[project.scripts]
archivebox = "archivebox.cli:main"
[tool.pdm.scripts]
lint = "./bin/lint.sh"
test = "./bin/test.sh"
# all = {composite = ["lint mypackage/", "test -v tests/"]}
[build-system]
requires = ["pdm-backend"]
build-backend = "pdm.backend"
[project.urls]
Homepage = "https://github.com/ArchiveBox/ArchiveBox"
Source = "https://github.com/ArchiveBox/ArchiveBox"
Documentation = "https://github.com/ArchiveBox/ArchiveBox/wiki"
"Bug Tracker" = "https://github.com/ArchiveBox/ArchiveBox/issues"
Changelog = "https://github.com/ArchiveBox/ArchiveBox/releases"
Roadmap = "https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap"
Community = "https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community"
Demo = "https://demo.archivebox.io"
Donate = "https://github.com/ArchiveBox/ArchiveBox/wiki/Donations"

266
setup.py
View file

@ -1,146 +1,150 @@
import json
import setuptools
from setuptools.command.test import test
#####################################################################################
# THIS FILE IS DEPRECATED AND WILL BE REMOVED EVENTUALLY
# ALL FUTURE CHANGES SHOULD HAPPEN IN pyproject.toml with pdm
#####################################################################################
from pathlib import Path
# import json
# import setuptools
# from setuptools.command.test import test
# from pathlib import Path
PKG_NAME = "archivebox"
DESCRIPTION = "Self-hosted internet archiving solution."
LICENSE = "MIT"
AUTHOR = "Nick Sweeting"
AUTHOR_EMAIL="git@nicksweeting.com"
REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
PROJECT_URLS = {
"Source": f"{REPO_URL}",
"Documentation": f"{REPO_URL}/wiki",
"Bug Tracker": f"{REPO_URL}/issues",
"Changelog": f"{REPO_URL}/releases",
"Roadmap": f"{REPO_URL}/wiki/Roadmap",
"Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
"Demo": f"https://demo.archivebox.io",
"Donate": f"{REPO_URL}/wiki/Donations",
}
# PKG_NAME = "archivebox"
# DESCRIPTION = "Self-hosted internet archiving solution."
# LICENSE = "MIT"
# AUTHOR = "Nick Sweeting"
# AUTHOR_EMAIL="setup.py@archivebox.io"
# REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
# PROJECT_URLS = {
# "Source": f"{REPO_URL}",
# "Documentation": f"{REPO_URL}/wiki",
# "Bug Tracker": f"{REPO_URL}/issues",
# "Changelog": f"{REPO_URL}/releases",
# "Roadmap": f"{REPO_URL}/wiki/Roadmap",
# "Community": f"{REPO_URL}/wiki/Web-Archiving-Community",
# "Demo": f"https://demo.archivebox.io",
# "Donate": f"{REPO_URL}/wiki/Donations",
# }
ROOT_DIR = Path(__file__).parent.resolve()
PACKAGE_DIR = ROOT_DIR / PKG_NAME
# ROOT_DIR = Path(__file__).parent.resolve()
# PACKAGE_DIR = ROOT_DIR / PKG_NAME
README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
# README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
# VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
PYTHON_REQUIRES = ">=3.7"
SETUP_REQUIRES = ["wheel"]
INSTALL_REQUIRES = [
# only add things here that have corresponding apt python3-packages available
# anything added here also needs to be added to our package dependencies in
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# if there is no apt python3-package equivalent, then vendor it instead in
# ./archivebox/vendor/
"requests>=2.24.0",
"mypy-extensions>=0.4.3",
"django>=3.1.3,<3.2",
"django-extensions>=3.0.3",
"dateparser>=1.0.0",
"youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"python-crontab>=2.5.1",
"croniter>=0.3.34",
"w3lib>=1.22.0",
"ipython>5.0.0",
"django-auth-ldap>=4.1.0"
]
EXTRAS_REQUIRE = {
'sonic': [
"sonic-client>=0.0.5",
],
'dev': [
"setuptools",
"twine",
"wheel",
"flake8",
"ipdb",
"mypy",
"django-stubs",
"sphinx",
"sphinx-rtd-theme",
"recommonmark",
"pytest",
"bottle",
"stdeb",
"django-debug-toolbar",
"djdt_flamegraph",
],
}
# class DisabledTestCommand(test):
# def run(self):
# # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
# print('\n[X] Running tests via setup.py test is deprecated.')
# print(' Hint: Use the ./bin/test.sh script or pytest instead')
# To see when setup.py gets called (uncomment for debugging):
# import sys
# print(PACKAGE_DIR, f" (v{VERSION})")
# print('>', sys.executable, *sys.argv)
# PYTHON_REQUIRES = ">=3.9"
# SETUP_REQUIRES = ["wheel"]
# INSTALL_REQUIRES = [
# # only add things here that have corresponding apt python3-packages available
# # anything added here also needs to be added to our package dependencies in
# # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# # if there is no apt python3-package equivalent, then vendor it instead in
# # ./archivebox/vendor/
# "requests>=2.24.0",
# "mypy-extensions>=0.4.3",
# "django>=3.1.3,<3.2",
# "django-extensions>=3.0.3",
# "dateparser>=1.0.0",
# "youtube-dl>=2021.04.17",
# "yt-dlp>=2021.4.11",
# "python-crontab>=2.5.1",
# "croniter>=0.3.34",
# "w3lib>=1.22.0",
# "ipython>5.0.0",
# ]
# EXTRAS_REQUIRE = {
# 'sonic': [
# "sonic-client>=0.0.5",
# ],
# 'ldap': [
# "django-auth-ldap>=4.1.0",
# ],
# 'dev': [
# "setuptools",
# "twine",
# "wheel",
# "flake8",
# "ipdb",
# "mypy",
# "django-stubs",
# "sphinx",
# "sphinx-rtd-theme",
# "recommonmark",
# "pytest",
# "bottle",
# "stdeb",
# "django-debug-toolbar",
# "djdt_flamegraph",
# ],
# }
#
# setuptools.setup(
# name=PKG_NAME,
# version=VERSION,
# license=LICENSE,
# author=AUTHOR,
# author_email=AUTHOR_EMAIL,
# description=DESCRIPTION,
# long_description=README,
# long_description_content_type="text/markdown",
# url=REPO_URL,
# project_urls=PROJECT_URLS,
# python_requires=PYTHON_REQUIRES,
# setup_requires=SETUP_REQUIRES,
# install_requires=INSTALL_REQUIRES,
# extras_require=EXTRAS_REQUIRE,
# packages=[PKG_NAME],
# include_package_data=True, # see MANIFEST.in
# entry_points={
# "console_scripts": [
# f"{PKG_NAME} = {PKG_NAME}.cli:main",
# ],
# },
# classifiers=[
# "License :: OSI Approved :: MIT License",
# "Natural Language :: English",
# "Operating System :: OS Independent",
# "Development Status :: 4 - Beta",
class DisabledTestCommand(test):
def run(self):
# setup.py test is deprecated, disable it here by force so stdeb doesnt run it
print()
print('[X] Running tests via setup.py test is deprecated.')
print(' Hint: Use the ./bin/test.sh script or pytest instead')
# "Topic :: Utilities",
# "Topic :: System :: Archiving",
# "Topic :: System :: Archiving :: Backup",
# "Topic :: System :: Recovery Tools",
# "Topic :: Sociology :: History",
# "Topic :: Internet :: WWW/HTTP",
# "Topic :: Internet :: WWW/HTTP :: Indexing/Search",
# "Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
# "Topic :: Software Development :: Libraries :: Python Modules",
setuptools.setup(
name=PKG_NAME,
version=VERSION,
license=LICENSE,
author=AUTHOR,
author_email=AUTHOR_EMAIL,
description=DESCRIPTION,
long_description=README,
long_description_content_type="text/markdown",
url=REPO_URL,
project_urls=PROJECT_URLS,
python_requires=PYTHON_REQUIRES,
setup_requires=SETUP_REQUIRES,
install_requires=INSTALL_REQUIRES,
extras_require=EXTRAS_REQUIRE,
packages=[PKG_NAME],
include_package_data=True, # see MANIFEST.in
entry_points={
"console_scripts": [
f"{PKG_NAME} = {PKG_NAME}.cli:main",
],
},
classifiers=[
"License :: OSI Approved :: MIT License",
"Natural Language :: English",
"Operating System :: OS Independent",
"Development Status :: 4 - Beta",
"Topic :: Utilities",
"Topic :: System :: Archiving",
"Topic :: System :: Archiving :: Backup",
"Topic :: System :: Recovery Tools",
"Topic :: Sociology :: History",
"Topic :: Internet :: WWW/HTTP",
"Topic :: Internet :: WWW/HTTP :: Indexing/Search",
"Topic :: Internet :: WWW/HTTP :: WSGI :: Application",
"Topic :: Software Development :: Libraries :: Python Modules",
"Intended Audience :: Developers",
"Intended Audience :: Education",
"Intended Audience :: End Users/Desktop",
"Intended Audience :: Information Technology",
"Intended Audience :: Legal Industry",
"Intended Audience :: System Administrators",
# "Intended Audience :: Developers",
# "Intended Audience :: Education",
# "Intended Audience :: End Users/Desktop",
# "Intended Audience :: Information Technology",
# "Intended Audience :: Legal Industry",
# "Intended Audience :: System Administrators",
"Environment :: Console",
"Environment :: Web Environment",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Framework :: Django",
"Typing :: Typed",
],
cmdclass={
"test": DisabledTestCommand,
},
)
# "Environment :: Console",
# "Environment :: Web Environment",
# "Programming Language :: Python :: 3",
# "Programming Language :: Python :: 3.7",
# "Programming Language :: Python :: 3.8",
# "Programming Language :: Python :: 3.9",
# "Framework :: Django",
# "Typing :: Typed",
# ],
# cmdclass={
# "test": DisabledTestCommand,
# },
# )

View file

@ -6,6 +6,6 @@ Suite: focal
Suite3: focal
Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
X-Python3-Version: >= 3.7
XS-Python-Version: >= 3.7
X-Python3-Version: >= 3.9
XS-Python-Version: >= 3.9
Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck

View file

@ -13,12 +13,51 @@ def test_ignore_methods():
Takes the passed method out of the default methods list and returns that value
"""
ignored = ignore_methods(['title'])
assert should_save_title not in ignored
assert "title" not in ignored
# Runs `archivebox add` with both SAVE_ALLOWLIST and SAVE_DENYLIST set and checks
# which extractor outputs end up in the snapshot folder: singlefile.html must be
# absent and headers.json must be present.
def test_save_allowdenylist_works(tmp_path, process, disable_extractors_dict):
# presumably maps URL regex patterns to the extractor names allowed for them -- verify against the config docs
allow_list = {
r'/static': ["headers", "singlefile"],
r'example\.com\.html$': ["headers"],
}
# presumably maps URL patterns to the extractor names to suppress -- verify against the config docs
deny_list = {
"/static": ["singlefile"],
}
# enable both extractors and pass the lists to the CLI as JSON-encoded env vars
disable_extractors_dict.update({
"SAVE_HEADERS": "true",
"USE_SINGLEFILE": "true",
"SAVE_ALLOWLIST": pyjson.dumps(allow_list),
"SAVE_DENYLIST": pyjson.dumps(deny_list),
})
# NOTE(review): the process result is captured but its return code/output is never checked
add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
capture_output=True, env=disable_extractors_dict)
# takes the first path matched under archive/ as the snapshot folder
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
singlefile_file = archived_item_path / "singlefile.html"
assert not singlefile_file.exists()
headers_file = archived_item_path / "headers.json"
assert headers_file.exists()
# Runs `archivebox add` with only SAVE_DENYLIST set and checks the snapshot folder:
# singlefile.html must be absent and headers.json must be present.
def test_save_denylist_works(tmp_path, process, disable_extractors_dict):
# presumably maps URL patterns to the extractor names to suppress -- verify against the config docs
deny_list = {
"/static": ["singlefile"],
}
# enable both extractors and pass the denylist to the CLI as a JSON-encoded env var
disable_extractors_dict.update({
"SAVE_HEADERS": "true",
"USE_SINGLEFILE": "true",
"SAVE_DENYLIST": pyjson.dumps(deny_list),
})
# NOTE(review): the process result is captured but its return code/output is never checked
add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
capture_output=True, env=disable_extractors_dict)
# takes the first path matched under archive/ as the snapshot folder
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
singlefile_file = archived_item_path / "singlefile.html"
assert not singlefile_file.exists()
headers_file = archived_item_path / "headers.json"
assert headers_file.exists()
def test_singlefile_works(tmp_path, process, disable_extractors_dict):
disable_extractors_dict.update({"USE_SINGLEFILE": "true"})
add_process = subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'],
capture_output=True, env=disable_extractors_dict)
capture_output=True, env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
output_file = archived_item_path / "singlefile.html"
assert output_file.exists()