Merge branch 'dev' into feat/reverse-proxy-auth

Commit 2538b170c7 by Nick Sweeting, 2023-01-09 18:20:45 -08:00 (committed by GitHub)
36 changed files with 625 additions and 292 deletions


@ -23,11 +23,12 @@ jobs:
cd brew_dist/
brew install --build-bottle ./archivebox.rb
# brew bottle archivebox
archivebox version
- name: Add some links to test
run: |
mkdir data && cd data
archivebox init
archivebox init --setup
archivebox add 'https://example.com'
archivebox version
archivebox status

.gitignore (vendored): 3 changes

@ -24,3 +24,6 @@ data1/
data2/
data3/
output/
# vim
*.sw?


@ -1,13 +1,22 @@
# This is the Dockerfile for ArchiveBox, it bundles the following dependencies:
# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, single-file
# python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
# Usage:
# git submodule update --init --recursive
# git pull --recurse-submodules
# docker build . -t archivebox --no-cache
# docker run -v "$PWD/data":/data archivebox init
# docker run -v "$PWD/data":/data archivebox add 'https://example.com'
# docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
# docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
# Multi-arch build:
# docker buildx create --use
# docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
#
# Read more about [developing
# Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).
FROM python:3.9-slim-buster
FROM python:3.10-slim-bullseye
LABEL name="archivebox" \
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
@ -48,11 +57,12 @@ RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
wget curl chromium git ffmpeg youtube-dl ripgrep \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& ln -s /usr/bin/chromium /usr/bin/chromium-browser \
&& rm -rf /var/lib/apt/lists/*
# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
&& echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
&& echo 'deb https://deb.nodesource.com/node_17.x buster main' >> /etc/apt/sources.list \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
nodejs \
@ -80,7 +90,8 @@ RUN apt-get update -qq \
build-essential python-dev python3-dev \
&& echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
&& pip install --quiet -r /tmp/requirements.txt \
&& pip install -r /tmp/requirements.txt \
&& pip install --upgrade youtube-dl yt-dlp \
&& apt-get purge -y build-essential python-dev python3-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
@ -103,13 +114,14 @@ RUN pip install -e .
WORKDIR "$DATA_DIR"
ENV IN_DOCKER=True \
CHROME_SANDBOX=False \
CHROME_BINARY="chromium" \
CHROME_BINARY="/usr/bin/chromium-browser" \
USE_SINGLEFILE=True \
SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
USE_READABILITY=True \
READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
USE_MERCURY=True \
MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser"
MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
YOUTUBEDL_BINARY="yt-dlp"
# Print version for nice docker finish summary
# RUN archivebox version
@ -119,8 +131,9 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
VOLUME "$DATA_DIR"
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
# Optional:
# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]


@ -51,10 +51,13 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
<br/>
**📦&nbsp; Get ArchiveBox with `docker-compose` / `docker` / `apt` / `brew` / `pip3` ([see Quickstart below](#quickstart)).**
**📦&nbsp; Get ArchiveBox with Docker / `apt` / `brew` / `pip3` / etc. ([see Quickstart below](#quickstart)).**
```bash
# Or use this auto setup script to install it for you (optional)
# Follow the instructions for your package manager in the quickstart, e.g.:
pip3 install archivebox
# Or use the optional auto setup script to install it for you:
curl -sSL 'https://get.archivebox.io' | sh
```
@ -81,15 +84,15 @@ ls ./archive/*/index.json # or browse directly via the filesyste
## Key Features
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up online, stores all data locally
- [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies)
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl), articles (readability), code (git), etc.](#output-formats)
- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats)
- [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
- [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) like HTML, JSON, PDF, PNG, and WARC
- [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA)
- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode)
- Planned: support for archiving [content requiring a login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (working, but ill-advised until some pending fixes are released)
- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode)
- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!)
- Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345)...
<br/><br/>
@ -165,14 +168,16 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the C
</ol>
See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.<br/>
See <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/bin/setup.sh"><code>setup.sh</code></a> for the source code of the auto-install script.
See <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/bin/setup.sh"><code>setup.sh</code></a> for the source code of the auto-install script.<br/>
See <a href="https://docs.sweeting.me/s/against-curl-sh">"Against curl | sh as an install method"</a> blog post for my thoughts on the shortcomings of this install method.
<br/><br/>
</details>
<br/>
#### 🛠&nbsp; Manual Setup
#### 🛠&nbsp; Package Manager Setup
<a name="Manual-Setup"></a>
<details>
<summary><b><img src="https://user-images.githubusercontent.com/511499/117448075-49597580-af0c-11eb-91ba-f34fff10096b.png" alt="aptitude" height="28px" align="top"/> <code>apt</code></b> (Ubuntu/Debian)</summary>
<br/>
@ -272,7 +277,7 @@ See the <a href="https://github.com/ArchiveBox/pip-archivebox"><code>pip-archive
<summary><img src="https://user-images.githubusercontent.com/511499/118077361-f0616580-b381-11eb-973c-ee894a3349fb.png" alt="Arch" height="28px" align="top"/> <code>pacman</code> / <img src="https://user-images.githubusercontent.com/511499/118077946-29e6a080-b383-11eb-94f0-d4871da08c3f.png" alt="FreeBSD" height="28px" align="top"/> <code>pkg</code> / <img src="https://user-images.githubusercontent.com/511499/118077861-002d7980-b383-11eb-86a7-5936fad9190f.png" alt="Nix" height="28px" align="top"/> <code>nix</code> (Arch/FreeBSD/NixOS/more)</summary>
<br/>
<ul>
<li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>pacman install archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
<li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
<li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -sSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li>
<li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li>
<li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>
@ -316,6 +321,7 @@ None of these hosting providers are officially endorsed:<br/>
<sub><i>(most still require manual setup or manual periodic updating using the methods above)</i></sub>
<br/><br/>
<li><a href="https://www.stellarhosted.com/archivebox/"><img src="https://img.shields.io/badge/Semi_Managed_Hosting-StellarHosted.com-%23193f7e.svg?style=flat" height="22px"/></a> (USD $29-250/mo, <a href="https://www.stellarhosted.com/archivebox/#pricing">pricing</a>)</li>
<li><a href="https://www.pikapods.com/pods?run=archivebox"><img src="https://img.shields.io/badge/Semi_Managed_Hosting-PikaPods.com-%2343a047.svg?style=flat" height="22px"/></a> (from USD $2.6/mo)</li>
<li><a href="https://m.do.co/c/cbc4c0c17840">
<img src="https://img.shields.io/badge/Unmanaged_VPS-DigitalOcean.com-%232f7cf7.svg?style=flat" height="22px"/>
</a> (USD $5-50+/mo, <a href="https://m.do.co/c/cbc4c0c17840">🎗&nbsp; referral link</a>, <a href="https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-compose-on-ubuntu-20-04">instructions</a>)</li>
@ -341,7 +347,7 @@ For more discussion on managed and paid hosting options see here: <a href="https
- Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)...
- Tweak your UI or archiving behavior [Configuration](#configuration) or read about some of the [Caveats](#caveats) and troubleshooting steps...
- Read about the [Dependencies](#dependencies) used for archiving or the [Archive Layout](#archive-layout) on disk...
- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk...
- Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)...
<br/>
@ -362,13 +368,13 @@ archivebox help
- `archivebox setup/init/config/status/manage` to administer your collection
- `archivebox add/schedule/remove/update/list/shell/oneshot` to manage Snapshots in the archive
- `archivebox schedule` to pull in fresh URLs in regularly from [boorkmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
- `archivebox schedule` to pull in fresh URLs in regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
#### 🖥&nbsp; Web UI Usage
```bash
archivebox manage createsuperuser
archivebox server 0.0.0.0:8000 # open http://127.0.0.1:8000 to view it
archivebox manage createsuperuser # set an admin password
archivebox server 0.0.0.0:8000 # open http://127.0.0.1:8000 to view it
# you can also configure whether or not login is required for most features
archivebox config --set PUBLIC_INDEX=False
@ -419,6 +425,7 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file)
- <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive))
- <img src="https://i.imgur.com/AQyHbu8.png" height="22px"/> Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox)
- <img src="https://getpocket.com/favicon.ico" height="22px"/> [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user/export), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
<img src="https://i.imgur.com/zM4z1aU.png" width="330px" align="right">
@ -462,7 +469,7 @@ Inside each Snapshot folder, ArchiveBox save these different types of extractor
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
- **Article Text:** `article.html/json` Article text extraction using Readability & Mercury
- **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp)
- **Source Code:** `git/` clone of any repository found on GitHub, Bitbucket, or GitLab links
- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._
@ -522,7 +529,7 @@ To achieve high fidelity archives in as many situations as possible, ArchiveBox
- `node` & `npm` (for readability, mercury, and singlefile)
- `wget` (for plain HTML, static files, and WARC saving)
- `curl` (for fetching headers, favicon, and posting to Archive.org)
- `youtube-dl` (for audio, video, and subtitles)
- `youtube-dl` or `yt-dlp` (for audio, video, and subtitles)
- `git` (for cloning git repos)
- and more as we grow...
@ -538,8 +545,9 @@ archivebox setup # auto install all the extractors and extras
archivebox --version # see info and check validity of installed dependencies
```
Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported**, but some advanced users have reported getting it working.
Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported** (I cannot respond to Windows support tickets), but some advanced users have reported getting it working.
For detailed information about upgrading ArchiveBox and its dependencies, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives
<br/>
@ -829,6 +837,7 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http
- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install)
- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
- [Upgrading or Merging Archives](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives)
- [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha)
- [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (alpha)
@ -895,7 +904,9 @@ archivebox --version
# if you edit e.g. ./archivebox/core/models.py on the docker host, runserver
# inside the container will reload and pick up your changes
docker build . -t archivebox
docker run -it archivebox init --setup
docker run -it \
-v $PWD/data:/data \
archivebox init --setup
docker run -it -p 8000:8000 \
-v $PWD/data:/data \
-v $PWD/archivebox:/app/archivebox \
@ -921,6 +932,8 @@ archivebox config --set DEBUG=True
archivebox server --debug ...
```
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
</details>
#### Install and run a specific GitHub branch
@ -975,7 +988,8 @@ cd path/to/test/data/
archivebox shell
archivebox manage dbshell
```
(uses `pytest -s`)
(uses `pytest -s`)
https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
</details>
@ -1067,7 +1081,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
<img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" height="40px"/>
<br/>
<i><sub>
This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous contributors and <a href="https://monadical.com">Monadical</a> (✨ <a href="https://monadical.com">hire them</a> for dev work!).
This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous <a href="https://github.com/ArchiveBox/ArchiveBox/graphs/contributors">contributors</a> and <a href="https://monadical.com">Monadical</a> (✨ <a href="https://monadical.com">hire them</a> for dev work!).
</sub>
</i>
<br/><br/>


@ -1 +1,3 @@
theme: jekyll-theme-merlot
production_url: https://archivebox.io
theme: jekyll-theme-merlot
# Github Pages static site settings for https://archivebox.io


@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
)
parser.add_argument(
'--update-all', #'-n',
'--update', #'-u',
action='store_true',
default=not ONLY_NEW, # when ONLY_NEW=True we skip updating old links
help="Also retry previously skipped/failed links when adding new links",
)
parser.add_argument(
'--update-all', #'-n',
action='store_true',
default=False,
help="Also update ALL links in index when finished adding new links",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
urls=stdin_urls or urls,
depth=command.depth,
tag=command.tag,
update=command.update,
update_all=command.update_all,
index_only=command.index_only,
overwrite=command.overwrite,


@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
)
parser.add_argument(
'--update',
action='store_true',
help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
)
group.add_argument(
'--clear', # '-c'
action='store_true',
@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
every=command.every,
depth=command.depth,
overwrite=command.overwrite,
update=command.update,
import_path=command.import_path,
out_dir=pwd or OUTPUT_DIR,
)


@ -26,11 +26,12 @@ import io
import re
import sys
import json
import inspect
import getpass
import platform
import shutil
import sqlite3
import django
from sqlite3 import dbapi2 as sqlite3
from hashlib import md5
from pathlib import Path
@ -48,6 +49,9 @@ from .config_stubs import (
ConfigDefaultDict,
)
### Pre-Fetch Minimal System Config
SYSTEM_USER = getpass.getuser() or os.getlogin()
try:
@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
'IN_DOCKER': {'type': bool, 'default': False},
'PUID': {'type': int, 'default': os.getuid()},
'PGID': {'type': int, 'default': os.getgid()},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
},
@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
'URL_WHITELIST': {'type': str, 'default': None},
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
},
'SERVER_CONFIG': {
@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
'TIME_ZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'},
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-call-home',
'--write-sub',
'--all-subs',
'--write-auto-sub',
# There are too many of these and youtube
# throttles you with HTTP error 429
#'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
# This flag doesn't exist in youtube-dl
# only in yt-dlp
'--no-abort-on-error',
# --ignore-errors must come AFTER
# --no-abort-on-error
# https://github.com/yt-dlp/yt-dlp/issues/4914
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--compressed'
]},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default' : None}
},
'SEARCH_BACKEND_CONFIG' : {
@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
'static_index.json',
}
def get_version(config):
return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
def get_commit_hash(config):
try:
return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
return None
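
> The new `COMMIT_HASH` detection just reads the first branch ref file under `.git/refs/heads/`. A standalone sketch of the same idea (path handling is hypothetical; returns None outside a git checkout, and packed refs are not handled, same as above):

```python
from pathlib import Path
from typing import Optional

def commit_hash_of(repo_dir: Path) -> Optional[str]:
    # mirrors get_commit_hash() above: grab the first branch ref's hash
    try:
        ref = next((repo_dir / '.git/refs/heads/').glob('*'))
        return ref.read_text().strip()
    except (StopIteration, OSError):
        return None

print(commit_hash_of(Path('.')))  # e.g. '2538b170c7...' inside a checkout, None elsewhere
```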
############################## Derived Config ##################################
@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
'VERSION': {'default': lambda c: get_version(c)},
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting but unused for now
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
return None
try:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
if not version_str:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
except OSError:
@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': config['OUTPUT_DIR'].resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'CONFIG_FILE': {
'path': config['CONFIG_FILE'].resolve(),
@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
},
}
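
> The new `is_mount` fields are plain `os.path.ismount()` checks; they power the `FS_REMOTE` flag added to `archivebox version` further down. A quick illustration:

```python
import os

# a mount-point check, as used by the new 'is_mount' entries above:
print(os.path.ismount('/'))           # True: the root is always a mount point
print(os.path.ismount(os.getcwd()))   # usually False, unless this dir sits on its own volume/NFS mount
```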
def get_dependency_info(config: ConfigDict) -> ConfigValue:
return {
'ARCHIVEBOX_BINARY': {
'path': bin_path(config['ARCHIVEBOX_BINARY']),
'version': config['VERSION'],
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
'enabled': True,
'is_valid': True,
},
'PYTHON_BINARY': {
'path': bin_path(config['PYTHON_BINARY']),
'version': config['PYTHON_VERSION'],
@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': bool(config['PYTHON_VERSION']),
},
'SQLITE_BINARY': {
'path': bin_path(config['SQLITE_BINARY']),
'version': config['SQLITE_VERSION'],
'hash': bin_hash(config['SQLITE_BINARY']),
'enabled': True,
'is_valid': bool(config['SQLITE_VERSION']),
},
'DJANGO_BINARY': {
'path': bin_path(config['DJANGO_BINARY']),
'version': config['DJANGO_VERSION'],
@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': bool(config['DJANGO_VERSION']),
},
'ARCHIVEBOX_BINARY': {
'path': bin_path(config['ARCHIVEBOX_BINARY']),
'version': config['VERSION'],
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
'enabled': True,
'is_valid': True,
},
'CURL_BINARY': {
'path': bin_path(config['CURL_BINARY']),
'version': config['CURL_VERSION'],
@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
'TIMEOUT': config['TIMEOUT'],
'RESOLUTION': config['RESOLUTION'],
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
'CHROME_BINARY': config['CHROME_BINARY'],
'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
@ -972,13 +1020,22 @@ globals().update(CONFIG)
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
os.environ["TZ"] = 'UTC'
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # we may allow this to change later
os.environ["TZ"] = TIMEZONE
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
sys.path.append(NODE_BIN_PATH)
# OPTIONAL: also look around the host system for node modules to use
# avoid enabling this unless absolutely needed,
# having overlapping potential sources of libs is a big source of bugs and confusion for users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)
# disable stderr "you really shouldn't disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
import urllib3
@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# get SQLite database version, compile options, and runtime options
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()
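
> The commented-out introspection above can be run standalone; a minimal sketch of the same SQLite queries:

```python
import sqlite3

cursor = sqlite3.connect(':memory:').cursor()
print(cursor.execute('SELECT sqlite_version();').fetchone()[0])                  # e.g. '3.39.2'
print(cursor.execute('PRAGMA journal_mode;').fetchone()[0])                      # 'memory' for :memory: dbs
print([opt[0] for opt in cursor.execute('PRAGMA compile_options;').fetchall()])  # e.g. ['ENABLE_FTS5', ...]
cursor.close()
```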
########################### Config Validity Checkers ###########################
@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
stderr()
def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
output_dir = out_dir or config['OUTPUT_DIR']
assert isinstance(output_dir, (str, Path))
@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
# without running migrations automatically (user runs them manually by calling init)
django.setup()
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
# Set Journal mode to WAL to allow for multiple writers
cursor.execute("PRAGMA journal_mode;")
current_mode = cursor.fetchone()[0]
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
# Create cache table in DB if needed
try:
from django.core.cache import cache
@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
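
> For the feat/reverse-proxy-auth side of this merge, the new `REVERSE_PROXY_USER_HEADER` / `REVERSE_PROXY_WHITELIST` options follow Django's trusted-header auth pattern. A hedged sketch of how such a header is typically wired up (illustrative only, not necessarily the exact middleware in this branch):

```python
from django.contrib.auth.middleware import RemoteUserMiddleware

REVERSE_PROXY_USER_HEADER = 'Remote-User'   # default from the config schema above

class ReverseProxyAuthMiddleware(RemoteUserMiddleware):
    # Django exposes the "Remote-User" request header as META['HTTP_REMOTE_USER'],
    # so the configured header name has to be translated into its META key:
    header = 'HTTP_' + REVERSE_PROXY_USER_HEADER.replace('-', '_').upper()
```

The whitelist would then gate which upstream proxy IPs are trusted to set that header, since any client that can reach Django directly must never be able to spoof it.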


@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False):
WGET_ARGS: List[str]
CURL_ARGS: List[str]
GIT_ARGS: List[str]
TAG_SEPARATOR_PATTERN: str
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]


@ -0,0 +1,18 @@
# Generated by Django 3.1.14 on 2022-09-14 09:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0020_auto_20210410_1031'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
),
]


@ -19,7 +19,7 @@ from ..config import (
SQL_INDEX_FILENAME,
OUTPUT_DIR,
LOGS_DIR,
TIME_ZONE,
TIMEZONE,
)
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@ -157,7 +157,7 @@ DATABASES = {
'timeout': 60,
'check_same_thread': False,
},
'TIME_ZONE': 'UTC',
'TIME_ZONE': TIMEZONE,
# DB setup is sometimes modified at runtime by setup_django() in config.py
}
}
@ -227,7 +227,8 @@ USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
TIME_ZONE = TIME_ZONE # noqa
TIME_ZONE = TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
from django.conf.locale.en import formats as en_formats


@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
# print('DEBUG', settings.DEBUG)
@ -24,14 +24,16 @@ urlpatterns = [
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('index.html', RedirectView.as_view(url='/')),
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
path('', HomepageView.as_view(), name='Home'),


@ -38,7 +38,7 @@ class HomepageView(View):
if PUBLIC_INDEX:
return redirect('/public')
return redirect(f'/admin/login/?next={request.path}')
@ -205,7 +205,7 @@ class SnapshotView(View):
content_type="text/html",
status=404,
)
class PublicIndexView(ListView):
template_name = 'public_index.html'
@ -220,7 +220,7 @@ class PublicIndexView(ListView):
'FOOTER_INFO': FOOTER_INFO,
}
def get_queryset(self, **kwargs):
def get_queryset(self, **kwargs):
qs = super().get_queryset(**kwargs)
query = self.request.GET.get('q')
if query and query.strip():
@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView):
url = self.request.GET.get('url', None)
if url:
return {'url': url if '://' in url else f'https://{url}'}
return super().get_initial()
def test_func(self):
@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView):
"form": AddLinkForm()
})
return render(template_name=self.template_name, request=self.request, context=context)
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
"""
def get(self, request):
"""
Handle a GET request
"""
return HttpResponse(
'OK',
content_type='text/plain',
status=200
)
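
> The new `/health/` route pairs naturally with the (now optional) Docker `HEALTHCHECK` above. A minimal external probe, assuming the server listens on localhost:8000:

```python
import urllib.request

def archivebox_is_healthy(base_url: str = 'http://localhost:8000') -> bool:
    # mirrors what a service-discovery tool would do with the new endpoint
    try:
        with urllib.request.urlopen(f'{base_url}/health/', timeout=20) as resp:
            return resp.status == 200 and resp.read().strip() == b'OK'
    except OSError:
        return False

print('OK' if archivebox_is_healthy() else 'DOWN')
```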


@ -1,12 +1,14 @@
__package__ = 'archivebox.extractors'
import os
import sys
from pathlib import Path
from typing import Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet
from ..core.settings import ERROR_LOG
from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from ..index import (
@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers
def get_default_archive_methods():
return [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
@ -50,7 +51,8 @@ def get_default_archive_methods():
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('wget', should_save_wget, save_wget),
('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them
('readability', should_save_readability, save_readability),
('mercury', should_save_mercury, save_mercury),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
method_name,
link.url,
command,
ts
) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(' ', stats)
@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
except KeyboardInterrupt:
log_archiving_paused(num_links, idx, link.timestamp)
raise SystemExit(0)
except BaseException: # lgtm [py/catch-base-exception]
except BaseException:
print()
raise


@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
@enforce_types
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'media'
@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
YOUTUBEDL_BINARY,
*YOUTUBEDL_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
link.url,
]
status = 'succeeded'
@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
timer.end()
# add video description and subtitles to full-text index
# Let's try a few different error handlers:
index_texts = [
text_file.read_text(encoding='utf-8').strip()
# errors:
# * 'strict' to raise a ValueError exception if there is an
# encoding error. The default value of None has the same effect.
# * 'ignore' ignores errors. Note that ignoring encoding errors
# can lead to data loss.
# * 'xmlcharrefreplace' is only supported when writing to a
# file. Characters not supported by the encoding are replaced with
# the appropriate XML character reference &#nnn;.
# There are a few more options described in https://docs.python.org/3/library/functions.html#open
text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
for text_file in (
*output_path.glob('*.description'),
*output_path.glob('*.srt'),


@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, atomic_write
from ..util import (
enforce_types,
download_url,
is_static_file,
)
from ..config import (
TIMEOUT,
@ -22,28 +20,8 @@ from ..config import (
READABILITY_VERSION,
)
from ..logging_util import TimedProgress
from .title import get_html
@enforce_types
def get_html(link: Link, path: Path) -> str:
"""
Try to find wget, singlefile and then dom files.
If none is found, download the url again.
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
document = None
for source in sources:
try:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url)
else:
return document
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:


@ -17,6 +17,7 @@ from ..config import (
SAVE_SINGLEFILE,
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
CHROME_BINARY,
)
from ..logging_util import TimedProgress
@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
options = [
*SINGLEFILE_ARGS,
'--browser-executable-path={}'.format(CHROME_BINARY),
browser_args,
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Option names that come first clobber conflicting names that come later
# My logic is that SINGLEFILE_ARGS affects the singlefile command with the most
# specificity, therefore the user set it with the most intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*deduped_options,
link.url,
output,
]
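
> A quick check of the first-one-wins dedup above, with hypothetical option values (`SINGLEFILE_ARGS` entries come first in `options`, so they clobber the computed defaults):

```python
options = [
    '--browser-executable-path=/usr/bin/google-chrome',  # from SINGLEFILE_ARGS (user intent)
    '--browser-executable-path=chromium',                # computed default, clobbered
    '--browser-args=["--no-sandbox"]',
]

seen_option_names = []
def test_seen(argument):
    option_name = argument.split('=')[0]
    if option_name in seen_option_names:
        return False
    seen_option_names.append(option_name)
    return True

print(list(filter(test_seen, options)))
# ['--browser-executable-path=/usr/bin/google-chrome', '--browser-args=["--no-sandbox"]']
```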


@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
if tag.lower() == "title":
self.inside_title_tag = False
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
"""
Try to find wget, singlefile and then dom files.
If none is found, download the url again.
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
document = None
for source in sources:
try:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
else:
return document
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
html = get_html(link, out_dir, timeout=timeout)
try:
# try using relatively strict html parser first
parser = TitleParser()


@ -24,6 +24,7 @@ from ..config import (
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
PREVIEW_ORIGINALS,
)
MAIN_INDEX_TEMPLATE = 'static_index.html'
@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str:
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
})
@enforce_types


@ -1,5 +1,7 @@
__package__ = 'archivebox.index'
import re
from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator
@ -8,7 +10,10 @@ from django.db import transaction
from .schema import Link
from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR
from ..config import (
OUTPUT_DIR,
TAG_SEPARATOR_PATTERN,
)
### Main Links Index
@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
def write_link_to_sql_index(link: Link):
from core.models import Snapshot, ArchiveResult
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
tags = []
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
))
info.pop('tags')
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link):
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
snapshot.save_tags(tag_list)
for extractor, entries in link.history.items():
for entry in entries:
@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
snap = write_link_to_sql_index(link)
snap.title = link.title
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
))
snap.save()
snap.save_tags(tag_list)
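
> The effect of the new TAG_SEPARATOR_PATTERN parsing, shown with a hypothetical tags string and the default separator of r'[,]' (dict.fromkeys dedupes while preserving insertion order):

```python
import re

TAG_SEPARATOR_PATTERN = r'[,]'   # default; configurable, e.g. r'[,;]' to also split on semicolons
link_tags = 'news, tech,news, archives'

tag_list = list(dict.fromkeys(
    tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link_tags or '')
))
print(tag_list)  # ['news', 'tech', 'archives']
```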


@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"):
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else hint for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
hints = hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str:
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
color, symbol, note, num_files = 'green', '', 'valid', ''
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str:
)
else:
num_files = 'missing'
if folder.get('is_mount'):
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:


@ -4,8 +4,9 @@ import os
import sys
import shutil
import platform
from django.utils import timezone
from pathlib import Path
from datetime import date
from datetime import date, datetime
from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices
@ -70,7 +71,12 @@ from .config import (
IS_TTY,
DEBUG,
IN_DOCKER,
PUID,
PGID,
USER,
TIMEZONE,
ENFORCE_ATOMIC_WRITES,
OUTPUT_PERMISSIONS,
PYTHON_BINARY,
ARCHIVEBOX_BINARY,
ONLY_NEW,
@ -90,6 +96,7 @@ from .config import (
check_data_folder,
write_config_file,
VERSION,
COMMIT_HASH,
CODE_LOCATIONS,
EXTERNAL_LOCATIONS,
DATA_LOCATIONS,
@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
def version(quiet: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Print the ArchiveBox version and dependency information"""
if quiet:
print(VERSION)
else:
# ArchiveBox v0.5.6
# Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
print('ArchiveBox v{}'.format(VERSION))
print(VERSION)
if not quiet:
# 0.6.3
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
p = platform.uname()
print(
'ArchiveBox v{}'.format(VERSION),
*((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
sys.implementation.name.title(),
p.system,
platform.platform(),
p.machine,
)
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
print(
f'IN_DOCKER={IN_DOCKER}',
f'DEBUG={DEBUG}',
f'IN_DOCKER={IN_DOCKER}',
f'IS_TTY={IS_TTY}',
f'TZ={os.environ.get("TZ", "UTC")}',
f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
f'TZ={TIMEZONE}',
#f'DB=django.db.backends.sqlite3 ({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
)
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
for name, dependency in DEPENDENCIES.items():
print(printable_dependency_version(name, dependency))
# add a newline between core dependencies and extractor dependencies for easier reading
if name == 'ARCHIVEBOX_BINARY':
print()
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
print(' archivebox server # then visit http://127.0.0.1:8000')
print()
print(' To add new links, you can run:')
print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
print(" archivebox add < ~/some/path/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
def add(urls: Union[str, List[str]],
tag: str='',
depth: int=0,
update_all: bool=not ONLY_NEW,
update: bool=not ONLY_NEW,
update_all: bool=False,
index_only: bool=False,
overwrite: bool=False,
# duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]],
# save verbatim args to sources
write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
# If we're going one level deeper, download each link and look for more links
@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]],
if new_links and depth == 1:
log_crawl_started(new_links)
for new_link in new_links:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
try:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
except Exception as err:
stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]],
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
stderr()
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
if update:
stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)} URLs from added set...', color='green')
archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
elif update_all:
stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)} URLs from entire library...', color='green')
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
elif overwrite:
stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)} URLs from added set...', color='green')
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
archive_links(new_links, overwrite=False, **archive_kwargs)
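The four branches above give `archivebox add` distinct archiving modes. A minimal sketch of the matching CLI invocations, assuming the flags mirror the function arguments (the new --update flag is wired into the generated cron command in schedule below):

    archivebox add 'https://example.com'                 # archive only URLs that are new to the index
    archivebox add --update 'https://example.com'        # re-archive every URL in the added set
    archivebox add --update-all 'https://example.com'    # re-archive the entire library after adding
    archivebox add --overwrite 'https://example.com'     # re-archive the added set, overwriting prior results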
@ -1113,6 +1147,7 @@ def schedule(add: bool=False,
every: Optional[str]=None,
depth: int=0,
overwrite: bool=False,
update: bool=not ONLY_NEW,
import_path: Optional[str]=None,
out_dir: Path=OUTPUT_DIR):
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
@ -1142,6 +1177,7 @@ def schedule(add: bool=False,
*([
'add',
*(['--overwrite'] if overwrite else []),
*(['--update'] if update else []),
f'--depth={depth}',
f'"{import_path}"',
] if import_path else ['update']),
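Since schedule now threads update through to the cron command it generates, a daily re-archiving import can be set up in one line. A hedged example, assuming the schedule CLI exposes the argument as --update just like it does --overwrite:

    archivebox schedule --every=day --depth=1 --update 'https://getpocket.com/users/USERNAME/feed/all'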

View file

@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None,
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
atomic_write(source_path, raw_text)
referenced_texts = ''
for entry in raw_text.split():
try:
if Path(entry).exists():
referenced_texts += Path(entry).read_text()
except Exception as err:
print(err)
atomic_write(source_path, raw_text + '\n' + referenced_texts)
log_source_saved(source_file=source_path)
return source_path
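With this change, save_text_as_source treats any whitespace-separated entry in the raw text that resolves to an existing local file as a reference, appending that file's contents to the saved source. A quick sketch of both entry points (the path is illustrative; note the code calls Path(entry).exists() directly, so ~ is not expanded):

    archivebox add < /path/to/list_of_links.txt          # stdin is saved verbatim as a source file
    echo '/path/to/list_of_links.txt' | archivebox add   # a bare path inside the text now gets read and inlined too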
@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
raise e
else:
# Source is a path to a local file on the filesystem

View file

@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
def link_from_article(article: dict, sources: list):
url: str = article['resolved_url'] or article['given_url']
url: str = article.get('resolved_url') or article['given_url']
broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
if broken_protocol:
url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
title = article['resolved_title'] or article['given_title'] or url
title = article.get('resolved_title') or article.get('given_title') or url
return Link(
url=url,

View file

@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
splits_fixed = leading_removed.replace('"\n href="', '" href="')
rows = splits_fixed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
def get_row(prefix):
return [
row.strip()
for row in rows
if row.strip().startswith('<{}'.format(prefix))
][0]
title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
try:
@ -49,7 +55,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags = None
yield Link(
url=htmldecode(url),
url=htmldecode(url_inside_attr or url_inside_link),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags or '',

View file

@ -197,7 +197,7 @@
// select the action button from the dropdown
container.find('select[name=action]')
.find('op:selected').removeAttr('selected').end()
.find('[selected]').removeAttr('selected').end()
.find('[value=' + action_type + ']').attr('selected', 'selected').click()
// click submit & replace the archivebox logo with a spinner

View file

@ -28,6 +28,14 @@
<a href="/add" id="submit">&nbsp; Add more URLs </a>
</center>
{% else %}
<div id="in-progress" style="display: none;">
<center><h3>Adding URLs to index and running archive methods...</h3>
<br/>
<div class="loader"></div>
<br/>
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
<h1>Add new URLs to your archive</h1>
<br/>
@ -48,10 +56,9 @@
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
setTimeout(function() {
document.getElementById('add-form').innerHTML = '<center><h3>Adding URLs to index and running archive methods...<h3><br/><div class="loader"></div><br/>Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...</center>'
document.getElementById('delay-warning').style.display = 'block'
}, 200)
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'
document.getElementById('delay-warning').style.display = 'block'
return true
})
</script>

View file

@ -414,6 +414,7 @@
</div>
</div>
{% endif %}
{% if PREVIEW_ORIGINALS %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
@ -427,6 +428,7 @@
</div>
</div>
</div>
{% endif %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>
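The iframe preview of the original live URL is now gated behind a PREVIEW_ORIGINALS flag. Presumably it can be toggled like other boolean config options (the option name is taken from the template above):

    archivebox config --set PREVIEW_ORIGINALS=False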

View file

@ -91,9 +91,9 @@ echo " This is a helper script which installs the ArchiveBox dependencies on
echo " You may be prompted for a sudo password in order to install the following:"
echo ""
echo " - archivebox"
echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)"
echo " - curl, wget, git, youtube-dl (used for extracting title, favicon, git, media, and more)"
echo " - chromium (skips this if any Chrome/Chromium version is already installed)"
echo " - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)"
echo " - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)"
echo " - chromium (skips this if any Chrome/Chromium version is already installed)"
echo ""
echo " If you'd rather install these manually as-needed, you can find detailed documentation here:"
echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install"
@ -115,13 +115,13 @@ if which apt-get > /dev/null; then
fi
echo
echo "[+] Installing ArchiveBox system dependencies using apt..."
sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl ffmpeg git nodejs npm ripgrep
sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl yt-dlp ffmpeg nodejs npm ripgrep
sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true
sudo apt-get install -y archivebox
sudo apt-get --only-upgrade install -y archivebox
echo ""
echo "[+] Installing ArchiveBox python dependencies using pip..."
sudo python3.7 -m pip install --upgrade --ignore-installed archivebox
echo "[+] Installing ArchiveBox python dependencies using pip3..."
sudo python3 -m pip install --upgrade --ignore-installed archivebox
# On Mac:
elif which brew > /dev/null; then
echo "[+] Installing ArchiveBox system dependencies using brew..."
@ -129,16 +129,16 @@ elif which brew > /dev/null; then
brew update
brew install --fetch-HEAD -f archivebox
echo ""
echo "[+] Installing ArchiveBox python dependencies using pip..."
echo "[+] Installing ArchiveBox python dependencies using pip3..."
python3 -m pip install --upgrade --ignore-installed archivebox
elif which pkg > /dev/null; then
echo "[+] Installing ArchiveBox system dependencies using pkg..."
sudo pkg install -y python37 py37-pip py37-sqlite3 node npm wget curl youtube_dl ffmpeg git ripgrep
echo "[+] Installing ArchiveBox system dependencies using pkg and pip (python3.9)..."
sudo pkg install -y python3 py39-pip py39-sqlite3 npm wget curl youtube_dl ffmpeg git ripgrep
sudo pkg install -y chromium
echo ""
echo "[+] Installing ArchiveBox python dependencies using pip..."
sudo python3.7 -m pip install --upgrade --ignore-installed archivebox
alias python3=python3.7
# don't use sudo here so that pip installs in $HOME/.local instead of into /usr/local
python3 -m pip install --upgrade --ignore-installed archivebox
else
echo "[!] Warning: Could not find aptitude/homebrew/pkg! May not be able to install all dependencies automatically."
echo ""
@ -192,7 +192,7 @@ echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized
echo " cd ~/archivebox"
echo " ps aux | grep archivebox"
echo " pkill -f archivebox"
echo " pip3 install --upgrade archivebox"
echo " python3 -m pip install --upgrade archivebox"
echo " archivebox server --quick-init 0.0.0.0:8000"
echo " archivebox manage createsuperuser"
echo " archivebox add 'https://example.com'"

@ -1 +1 @@
Subproject commit 95a1c1a0875841d076f06106bd4c2307504928c2
Subproject commit a4314719746de549f359c2fa975762fc73b62f94

View file

@ -8,7 +8,7 @@
# Documentation:
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
version: '2.4'
version: '2.4' # '3.9' or greater also works
services:
archivebox:
@ -23,15 +23,21 @@ services:
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below
# - SEARCH_BACKEND_HOST_NAME=sonic
# - SEARCH_BACKEND_PASSWORD=SecretPassword
# dns: # uncomment to use pihole below for ad/tracker blocking during archiving
# - pihole
volumes:
- ./data:/data
# - ./archivebox:/app/archivebox # for developers working on archivebox
# To run the Sonic full-text search backend, first download the config file to sonic.cfg
### Optional Addons: tweak these examples as needed for your specific use case
### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
# curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
# after starting, backfill any existing Snapshots into the index: docker-compose run archivebox update --index-only
# sonic:
# image: valeriansaliou/sonic:v1.3.0
# image: valeriansaliou/sonic:v1.3.1
# expose:
# - 1491
# environment:
@ -39,12 +45,25 @@ services:
# volumes:
# - ./sonic.cfg:/etc/sonic.cfg:ro
# - ./data/sonic:/var/lib/sonic/store
### Example: To run pihole in order to block ad/tracker requests during archiving,
# uncomment this block and set up pihole using its admin interface
# pihole:
# image: pihole/pihole:latest
# ports:
# - 80:80 # uncomment to access the admin HTTP interface on http://localhost:80
# environment:
# WEBPASSWORD: 'set a secure password here or it will be random'
# volumes:
# - ./data/pihole:/etc/pihole
# - ./data/dnsmasq:/etc/dnsmasq.d
### Optional Addons: tweak these examples as needed for your specific use case
### Example: Run scheduled imports in a docker instead of using cron on the
# host machine, add tasks and see more info with archivebox schedule --help
# Example: Run scheduled imports in a docker instead of using cron on the
# host machine, add tasks and see more info with archivebox schedule --help
# scheduler:
# image: archivebox/archivebox:latest
# command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all'
@ -54,7 +73,9 @@ services:
# volumes:
# - ./data:/data
# Example: Put Nginx in front of the ArchiveBox server for SSL termination
### Example: Put Nginx in front of the ArchiveBox server for SSL termination
# nginx:
# image: nginx:alpine
# ports:
@ -64,7 +85,9 @@ services:
# - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf
# - ./data:/var/www
# Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel
# wireguard:
# image: linuxserver/wireguard
# network_mode: 'service:archivebox'
@ -78,14 +101,16 @@ services:
# - /lib/modules:/lib/modules
# - ./wireguard.conf:/config/wg0.conf:ro
# Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox
# pywb:
# image: webrecorder/pywb:latest
# entrypoint: /bin/sh 'wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback --proxy;'
# entrypoint: /bin/sh -c '(wb-manager init default || test $$? -eq 2) && wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback;'
# environment:
# - INIT_COLLECTION=archivebox
# ports:
# - 8080:8080
# volumes:
# ./data:/archivebox
# ./data/wayback:/webarchive
# - ./data:/archivebox
# - ./data/wayback:/webarchive
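The corrected entrypoint only initializes the pywb collection when it does not already exist. The same pattern outside docker-compose, with a single $ instead of the escaped $$ (treating exit code 2 as "collection already exists" is an assumption based on the entrypoint above):

    wb-manager init default || test $? -eq 2
    wb-manager add default /archivebox/archive/*/warc/*.warc.gz
    wayback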

View file

@ -55,7 +55,7 @@
# CURL_BINARY = curl
# GIT_BINARY = git
# WGET_BINARY = wget
# YOUTUBEDL_BINARY = youtube-dl
# YOUTUBEDL_BINARY = yt-dlp
# CHROME_BINARY = chromium
# CHROME_USER_DATA_DIR="~/.config/google-chrome/Default"
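With the example config now pointing YOUTUBEDL_BINARY at yt-dlp, the same switch can be applied to an existing collection without editing ArchiveBox.conf by hand, e.g.:

    archivebox config --set YOUTUBEDL_BINARY=yt-dlp
    # or per-invocation via the environment:
    YOUTUBEDL_BINARY=yt-dlp archivebox add 'https://example.com'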

282 package-lock.json generated
View file

@ -5,11 +5,11 @@
"requires": true,
"dependencies": {
"@babel/runtime-corejs2": {
"version": "7.13.10",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.13.10.tgz",
"integrity": "sha512-rZw5P1ZewO6XZTDxtXuAuAFUqfNXyM8HO/9WiaDd34Anka0uFTpo0RvBLeV775AEE/zKw3LQB+poZw/O9lrZBg==",
"version": "7.17.11",
"resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.17.11.tgz",
"integrity": "sha512-pJe8Aerb88TGVi1Xe/AE36aRCPrg+h6ktZPGl6xaJvOfTLcMMuogQu3BYcxeXPTNHhSYbmsDVYBs8CfAxeFFTg==",
"requires": {
"core-js": "^2.6.5",
"core-js": "^2.6.12",
"regenerator-runtime": "^0.13.4"
}
},
@ -28,9 +28,8 @@
}
},
"@postlight/mercury-parser": {
"version": "2.2.0",
"resolved": "https://registry.npmjs.org/@postlight/mercury-parser/-/mercury-parser-2.2.0.tgz",
"integrity": "sha512-nz6dIvCAaiv74o1vhhp0BRsUe+ysPbZG5mVNpJmgLoI/goOBqRMM3Yg8uXtnv++e7tzKFSXdls8b2/zKk1qL0Q==",
"version": "git+https://github.com/postlight/mercury-parser.git#9cd9662bcbfea00b773fad691a4f6e53394ff543",
"from": "git+https://github.com/postlight/mercury-parser.git",
"requires": {
"@babel/runtime-corejs2": "^7.2.0",
"@postlight/ci-failed-test-reporter": "^1.0",
@ -50,35 +49,7 @@
"url": "^0.11.0",
"valid-url": "^1.0.9",
"wuzzy": "^0.1.4",
"yargs-parser": "^13.0.0"
},
"dependencies": {
"http-headers": {
"version": "3.0.2",
"bundled": true,
"requires": {
"next-line": "^1.1.0"
}
},
"jquery": {
"version": "3.4.1",
"bundled": true
},
"moment": {
"version": "2.23.0",
"bundled": true
},
"moment-timezone": {
"version": "0.5.26",
"bundled": true,
"requires": {
"moment": ">= 2.9.0"
}
},
"next-line": {
"version": "1.1.0",
"bundled": true
}
"yargs-parser": "^15.0.1"
}
},
"@postman/form-data": {
@ -105,9 +76,9 @@
"integrity": "sha512-RbzJvlNzmRq5c3O09UipeuXno4tA1FE6ikOjxZK0tuxVv3412l64l5t1W5pj4+rJq9vpkm/kwiR07aZXnsKPxw=="
},
"@types/node": {
"version": "16.0.0",
"resolved": "https://registry.npmjs.org/@types/node/-/node-16.0.0.tgz",
"integrity": "sha512-TmCW5HoZ2o2/z2EYi109jLqIaPIi9y/lc2LmDCWzuCi35bcaQ+OtUh6nwBiFK7SOu25FAU5+YKdqFZUwtqGSdg==",
"version": "17.0.4",
"resolved": "https://registry.npmjs.org/@types/node/-/node-17.0.4.tgz",
"integrity": "sha512-6xwbrW4JJiJLgF+zNypN5wr2ykM9/jHcL7rQ8fZe2vuftggjzZeRSM4OwRc6Xk8qWjwJ99qVHo/JgOGmomWRog==",
"optional": true
},
"@types/yauzl": {
@ -170,9 +141,9 @@
}
},
"ansi-regex": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz",
"integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg=="
"version": "5.0.1",
"resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
"integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="
},
"ansi-styles": {
"version": "4.3.0",
@ -188,9 +159,9 @@
"integrity": "sha1-jCpe8kcv2ep0KwTHenUJO6J1fJM="
},
"asn1": {
"version": "0.2.4",
"resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz",
"integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==",
"version": "0.2.6",
"resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz",
"integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==",
"requires": {
"safer-buffer": "~2.1.0"
}
@ -445,9 +416,9 @@
}
},
"debug": {
"version": "4.3.2",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz",
"integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==",
"version": "4.3.3",
"resolved": "https://registry.npmjs.org/debug/-/debug-4.3.3.tgz",
"integrity": "sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q==",
"requires": {
"ms": "2.1.2"
}
@ -515,9 +486,9 @@
}
},
"dompurify": {
"version": "2.3.0",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.0.tgz",
"integrity": "sha512-VV5C6Kr53YVHGOBKO/F86OYX6/iLTw2yVSI721gKetxpHCK/V5TaLEf9ODjRgl1KLSWRMY6cUhAbv/c+IUnwQw=="
"version": "2.3.4",
"resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.4.tgz",
"integrity": "sha512-6BVcgOAVFXjI0JTjEvZy901Rghm+7fDQOrNIcxB4+gdhj6Kwp6T9VBhBY/AbagKHJocRkDYGd6wvI+p4/10xtQ=="
},
"domutils": {
"version": "1.5.1",
@ -702,9 +673,9 @@
}
},
"glob": {
"version": "7.1.7",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.1.7.tgz",
"integrity": "sha512-OvD9ENzPLbegENnYP5UUfJIirTg4+XwMWGaQfQTY0JenxNvvIKP3U3/tAQSPIu/lHxXYSZmpXlUHeqAIdKzBLQ==",
"version": "7.2.0",
"resolved": "https://registry.npmjs.org/glob/-/glob-7.2.0.tgz",
"integrity": "sha512-lmLf6gtyrPq8tTjSmrO94wBeQbFR3HbLHbuyD69wuyQkImp2hWqMGB47OX65FBkPffO641IP9jWa1z4ivqG26Q==",
"requires": {
"fs.realpath": "^1.0.0",
"inflight": "^1.0.4",
@ -729,9 +700,9 @@
}
},
"heap": {
"version": "0.2.6",
"resolved": "https://registry.npmjs.org/heap/-/heap-0.2.6.tgz",
"integrity": "sha1-CH4fELBGky/IWU3Z5tN4r8nR5aw="
"version": "0.2.7",
"resolved": "https://registry.npmjs.org/heap/-/heap-0.2.7.tgz",
"integrity": "sha512-2bsegYkkHO+h/9MGbn6KWcE45cHZgPANo5LXF7EvWdT0yT2EguSVO1nDgU5c8+ZOPwp2vMNa7YFsJhVcDR9Sdg=="
},
"html-encoding-sniffer": {
"version": "1.0.2",
@ -773,12 +744,12 @@
}
},
"http-signature": {
"version": "1.3.5",
"resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.5.tgz",
"integrity": "sha512-NwoTQYSJoFt34jSBbwzDHDofoA61NGXzu6wXh95o1Ry62EnmKjXb/nR/RknLeZ3G/uGwrlKNY2z7uPt+Cdl7Tw==",
"version": "1.3.6",
"resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.6.tgz",
"integrity": "sha512-3adrsD6zqo4GsTqtO7FyrejHNv+NgiIfAfv68+jVlFmSr9OGy7zrxONceFRLKvnnZA5jbxQBX1u9PpB6Wi32Gw==",
"requires": {
"assert-plus": "^1.0.0",
"jsprim": "^1.2.2",
"jsprim": "^2.0.2",
"sshpk": "^1.14.1"
}
},
@ -848,6 +819,11 @@
"resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
"integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo="
},
"jquery": {
"version": "3.6.0",
"resolved": "https://registry.npmjs.org/jquery/-/jquery-3.6.0.tgz",
"integrity": "sha512-JVzAR/AjBvVt2BmYhxRCSYysDsPcssdmTFnzyLEts9qNwmjmu4JTAMYubEfwVOSwpQ1I1sKKFcxhZCI2buerfw=="
},
"jsbn": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
@ -887,9 +863,9 @@
}
},
"json-schema": {
"version": "0.2.3",
"resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz",
"integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM="
"version": "0.4.0",
"resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
"integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="
},
"json-schema-traverse": {
"version": "0.4.1",
@ -902,20 +878,20 @@
"integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
},
"jsprim": {
"version": "1.4.1",
"resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz",
"integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=",
"version": "2.0.2",
"resolved": "https://registry.npmjs.org/jsprim/-/jsprim-2.0.2.tgz",
"integrity": "sha512-gqXddjPqQ6G40VdnI6T6yObEC+pDNvyP95wdQhkWkg7crHH3km5qP1FsOXEkzEQwnz6gz5qGTn1c2Y52wP3OyQ==",
"requires": {
"assert-plus": "1.0.0",
"extsprintf": "1.3.0",
"json-schema": "0.2.3",
"json-schema": "0.4.0",
"verror": "1.10.0"
}
},
"jszip": {
"version": "3.6.0",
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.6.0.tgz",
"integrity": "sha512-jgnQoG9LKnWO3mnVNBnfhkh0QknICd1FGSrXcgrl67zioyJ4wgx25o9ZqwNtrROSflGBCGYnJfjrIyRIby1OoQ==",
"version": "3.7.1",
"resolved": "https://registry.npmjs.org/jszip/-/jszip-3.7.1.tgz",
"integrity": "sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg==",
"requires": {
"lie": "~3.3.0",
"pako": "~1.0.2",
@ -1078,11 +1054,24 @@
"resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
"integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
},
"moment": {
"version": "2.29.3",
"resolved": "https://registry.npmjs.org/moment/-/moment-2.29.3.tgz",
"integrity": "sha512-c6YRvhEo//6T2Jz/vVtYzqBzwvPT95JBQ+smCytzf7c50oMZRsR/a4w88aD34I+/QVSfnoAnSBFPJHItlOMJVw=="
},
"moment-parseformat": {
"version": "3.0.0",
"resolved": "https://registry.npmjs.org/moment-parseformat/-/moment-parseformat-3.0.0.tgz",
"integrity": "sha512-dVgXe6b6DLnv4CHG7a1zUe5mSXaIZ3c6lSHm/EKeVeQI2/4pwe0VRde8OyoCE1Ro2lKT5P6uT9JElF7KDLV+jw=="
},
"moment-timezone": {
"version": "0.5.26",
"resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz",
"integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==",
"requires": {
"moment": ">= 2.9.0"
}
},
"ms": {
"version": "2.1.2",
"resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
@ -1094,9 +1083,33 @@
"integrity": "sha1-/K5XhTBStqm66CCOQN19PC0wRgM="
},
"node-fetch": {
"version": "2.6.1",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz",
"integrity": "sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw=="
"version": "2.6.7",
"resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.7.tgz",
"integrity": "sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==",
"requires": {
"whatwg-url": "^5.0.0"
},
"dependencies": {
"tr46": {
"version": "0.0.3",
"resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
"integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o="
},
"webidl-conversions": {
"version": "3.0.1",
"resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
"integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
},
"whatwg-url": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
"integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=",
"requires": {
"tr46": "~0.0.3",
"webidl-conversions": "^3.0.0"
}
}
}
},
"nth-check": {
"version": "1.0.2",
@ -1207,9 +1220,9 @@
"integrity": "sha512-2qHaIQr2VLRFoxe2nASzsV6ef4yOOH+Fi9FBOVH6cqeSgUnoyySPZkxzLuzd+RYOQTRpROA0ztTMqxROKSb/nA=="
},
"postman-request": {
"version": "2.88.1-postman.29",
"resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.29.tgz",
"integrity": "sha512-QuL3+AvGlmPLb1Qf0t/rM8M4U8LCYbADZBijUNToLl6l37i65KH8wY1gTLWLxlw2I6ugxUfX2Zyyk5/J5HFZIg==",
"version": "2.88.1-postman.31",
"resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.31.tgz",
"integrity": "sha512-OJbYqP7ItxQ84yHyuNpDywCZB0HYbpHJisMQ9lb1cSL3N5H3Td6a2+3l/a74UMd3u82BiGC5yQyYmdOIETP/nQ==",
"requires": {
"@postman/form-data": "~3.1.1",
"@postman/tunnel-agent": "^0.6.3",
@ -1308,16 +1321,16 @@
}
},
"ws": {
"version": "7.5.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
"integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
"version": "7.5.6",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
"integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
}
}
},
"qs": {
"version": "6.5.2",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
"integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
"version": "6.5.3",
"resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",
"integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA=="
},
"querystring": {
"version": "0.2.0",
@ -1334,9 +1347,9 @@
},
"dependencies": {
"acorn": {
"version": "8.4.1",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz",
"integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA=="
"version": "8.6.0",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz",
"integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw=="
},
"acorn-globals": {
"version": "6.0.0",
@ -1417,9 +1430,9 @@
}
},
"estraverse": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz",
"integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ=="
"version": "5.3.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="
},
"form-data": {
"version": "3.0.1",
@ -1440,9 +1453,9 @@
}
},
"jsdom": {
"version": "16.6.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz",
"integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==",
"version": "16.7.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz",
"integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==",
"requires": {
"abab": "^2.0.5",
"acorn": "^8.2.4",
@ -1469,7 +1482,7 @@
"whatwg-encoding": "^1.0.5",
"whatwg-mimetype": "^2.3.0",
"whatwg-url": "^8.5.0",
"ws": "^7.4.5",
"ws": "^7.4.6",
"xml-name-validator": "^3.0.0"
}
},
@ -1512,9 +1525,9 @@
}
},
"ws": {
"version": "7.5.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
"integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
"version": "7.5.6",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
"integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
}
}
},
@ -1529,9 +1542,9 @@
}
},
"regenerator-runtime": {
"version": "0.13.7",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz",
"integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew=="
"version": "0.13.9",
"resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.9.tgz",
"integrity": "sha512-p3VT+cOEgxFsRRA9X4lkI1E+k2/CtnKtU4gcxyaCUreilL/vqI6CdZ3wxVUx3UOUg+gnUOQQcRI7BmSI656MYA=="
},
"request": {
"version": "2.88.2",
@ -1569,6 +1582,17 @@
"jsprim": "^1.2.2",
"sshpk": "^1.7.0"
}
},
"jsprim": {
"version": "1.4.2",
"resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz",
"integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==",
"requires": {
"assert-plus": "1.0.0",
"extsprintf": "1.3.0",
"json-schema": "0.4.0",
"verror": "1.10.0"
}
}
}
},
@ -1683,9 +1707,9 @@
},
"dependencies": {
"acorn": {
"version": "8.4.1",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz",
"integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA=="
"version": "8.6.0",
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz",
"integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw=="
},
"acorn-globals": {
"version": "6.0.0",
@ -1766,9 +1790,9 @@
}
},
"estraverse": {
"version": "5.2.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz",
"integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ=="
"version": "5.3.0",
"resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
"integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="
},
"form-data": {
"version": "3.0.1",
@ -1797,9 +1821,9 @@
}
},
"jsdom": {
"version": "16.6.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz",
"integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==",
"version": "16.7.0",
"resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz",
"integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==",
"requires": {
"abab": "^2.0.5",
"acorn": "^8.2.4",
@ -1826,7 +1850,7 @@
"whatwg-encoding": "^1.0.5",
"whatwg-mimetype": "^2.3.0",
"whatwg-url": "^8.5.0",
"ws": "^7.4.5",
"ws": "^7.4.6",
"xml-name-validator": "^3.0.0"
}
},
@ -1869,9 +1893,9 @@
}
},
"ws": {
"version": "7.5.2",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
"integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
"version": "7.5.6",
"resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
"integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
}
}
},
@ -1882,9 +1906,9 @@
"optional": true
},
"sshpk": {
"version": "1.16.1",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz",
"integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==",
"version": "1.17.0",
"resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.17.0.tgz",
"integrity": "sha512-/9HIEs1ZXGhSPE8X6Ccm7Nam1z8KcoCqPdI7ecm1N33EzAetWahvQWVqLZtaZQ+IDKX4IyA2o0gBzqIMkAagHQ==",
"requires": {
"asn1": "~0.2.3",
"assert-plus": "^1.0.0",
@ -1916,13 +1940,13 @@
"integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo="
},
"string-width": {
"version": "4.2.2",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.2.tgz",
"integrity": "sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==",
"version": "4.2.3",
"resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
"integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
"requires": {
"emoji-regex": "^8.0.0",
"is-fullwidth-code-point": "^3.0.0",
"strip-ansi": "^6.0.0"
"strip-ansi": "^6.0.1"
}
},
"string_decoder": {
@ -1934,11 +1958,11 @@
}
},
"strip-ansi": {
"version": "6.0.0",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz",
"integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==",
"version": "6.0.1",
"resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
"integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
"requires": {
"ansi-regex": "^5.0.0"
"ansi-regex": "^5.0.1"
}
},
"strong-data-uri": {
@ -2187,9 +2211,9 @@
}
},
"wuzzy": {
"version": "0.1.6",
"resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.6.tgz",
"integrity": "sha512-x1lDcj0VvzJ1ygDpd9LWMnQVei6gEkUbCcZUG8TPnXhlPbaQWQa32ab/6xbm/samxJ2T3Y2+P3xHeeQIAcEvqQ==",
"version": "0.1.8",
"resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.8.tgz",
"integrity": "sha512-FUzKQepFSTnANsDYwxpIzGJ/dIJaqxuMre6tzzbvWwFAiUHPsI1nVQVCLK4Xqr67KO7oYAK0kaCcI/+WYj/7JA==",
"requires": {
"lodash": "^4.17.15"
}
@ -2231,9 +2255,9 @@
}
},
"yargs-parser": {
"version": "13.1.2",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz",
"integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==",
"version": "15.0.3",
"resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-15.0.3.tgz",
"integrity": "sha512-/MVEVjTXy/cGAjdtQf8dW3V9b97bPN7rNn8ETj6BmAQL7ibC7O1Q9SPJbGjgh3SlwoBNXMzj/ZGIj8mBgl12YA==",
"requires": {
"camelcase": "^5.0.0",
"decamelize": "^1.2.0"

View file

@ -6,7 +6,7 @@
"repository": "github:ArchiveBox/ArchiveBox",
"license": "MIT",
"dependencies": {
"@postlight/mercury-parser": "^2.2.0",
"@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
"readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
"single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
}

View file

@ -42,6 +42,7 @@ INSTALL_REQUIRES = [
"django-extensions>=3.0.3",
"dateparser>=1.0.0",
"youtube-dl>=2021.04.17",
"yt-dlp>=2021.4.11",
"python-crontab>=2.5.1",
"croniter>=0.3.34",
"w3lib>=1.22.0",

View file

@ -5,7 +5,7 @@ Package3: archivebox
Suite: focal
Suite3: focal
Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
X-Python3-Version: >= 3.7
XS-Python-Version: >= 3.7
Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck