Mirror of https://github.com/ArchiveBox/ArchiveBox (synced 2024-11-10 06:34:16 +00:00)

Commit 2538b170c7: Merge branch 'dev' into feat/reverse-proxy-auth

36 changed files with 625 additions and 292 deletions
.github/workflows/homebrew.yml (vendored, 3 changes)

@@ -23,11 +23,12 @@ jobs:
           cd brew_dist/
           brew install --build-bottle ./archivebox.rb
           # brew bottle archivebox
           archivebox version

       - name: Add some links to test
         run: |
           mkdir data && cd data
-          archivebox init
+          archivebox init --setup
           archivebox add 'https://example.com'
           archivebox version
+          archivebox status
.gitignore (vendored, 3 changes)

@@ -24,3 +24,6 @@ data1/
 data2/
 data3/
 output/
+
+# vim
+*.sw?
Dockerfile (29 changes)

@@ -1,13 +1,22 @@
 # This is the Dockerfile for ArchiveBox, it bundles the following dependencies:
-#     python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, single-file
+#     python3, ArchiveBox, curl, wget, git, chromium, youtube-dl, yt-dlp, single-file
 # Usage:
+#     git submodule update --init --recursive
+#     git pull --recurse-submodules
 #     docker build . -t archivebox --no-cache
 #     docker run -v "$PWD/data":/data archivebox init
 #     docker run -v "$PWD/data":/data archivebox add 'https://example.com'
 #     docker run -v "$PWD/data":/data -it archivebox manage createsuperuser
 #     docker run -v "$PWD/data":/data -p 8000:8000 archivebox server
+# Multi-arch build:
+#     docker buildx create --use
+#     docker buildx build . --platform=linux/amd64,linux/arm64,linux/arm/v7 --push -t archivebox/archivebox:latest -t archivebox/archivebox:dev
+#
 # Read more about [developing
 # Archivebox](https://github.com/ArchiveBox/ArchiveBox#archivebox-development).

-FROM python:3.9-slim-buster
+FROM python:3.10-slim-bullseye

 LABEL name="archivebox" \
     maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \

@@ -48,11 +57,12 @@ RUN apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         wget curl chromium git ffmpeg youtube-dl ripgrep \
         fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
+    && ln -s /usr/bin/chromium /usr/bin/chromium-browser \
     && rm -rf /var/lib/apt/lists/*

 # Install Node environment
 RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
-    && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
+    && echo 'deb https://deb.nodesource.com/node_17.x buster main' >> /etc/apt/sources.list \
     && apt-get update -qq \
     && apt-get install -qq -y --no-install-recommends \
         nodejs \

@@ -80,7 +90,8 @@ RUN apt-get update -qq \
         build-essential python-dev python3-dev \
     && echo 'empty placeholder for setup.py to use' > "$CODE_DIR/archivebox/README.md" \
     && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
-    && pip install --quiet -r /tmp/requirements.txt \
+    && pip install -r /tmp/requirements.txt \
+    && pip install --upgrade youtube-dl yt-dlp \
    && apt-get purge -y build-essential python-dev python3-dev \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

@@ -103,13 +114,14 @@ RUN pip install -e .
 WORKDIR "$DATA_DIR"
 ENV IN_DOCKER=True \
     CHROME_SANDBOX=False \
-    CHROME_BINARY="chromium" \
+    CHROME_BINARY="/usr/bin/chromium-browser" \
     USE_SINGLEFILE=True \
     SINGLEFILE_BINARY="$NODE_DIR/node_modules/.bin/single-file" \
     USE_READABILITY=True \
     READABILITY_BINARY="$NODE_DIR/node_modules/.bin/readability-extractor" \
     USE_MERCURY=True \
-    MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser"
+    MERCURY_BINARY="$NODE_DIR/node_modules/.bin/mercury-parser" \
+    YOUTUBEDL_BINARY="yt-dlp"

 # Print version for nice docker finish summary
 # RUN archivebox version

@@ -119,8 +131,9 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
 VOLUME "$DATA_DIR"
 EXPOSE 8000

-HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
-    CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
+# Optional:
+# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
+#     CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1

 ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
 CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
README.md (52 changes)

@@ -51,10 +51,13 @@ The goal is to sleep soundly knowing the part of the internet you care about wil
 <br/>

-**📦 Get ArchiveBox with `docker-compose` / `docker` / `apt` / `brew` / `pip3` ([see Quickstart below](#quickstart)).**
+**📦 Get ArchiveBox with Docker / `apt` / `brew` / `pip3` / etc. ([see Quickstart below](#quickstart)).**

 ```bash
-# Or use this auto setup script to install it for you (optional)
+# Follow the instructions for your package manager in the quickstart, e.g.:
+pip3 install archivebox
+
+# Or use the optional auto setup script to install it for you:
 curl -sSL 'https://get.archivebox.io' | sh
 ```

@@ -81,15 +84,15 @@ ls ./archive/*/index.json    # or browse directly via the filesyste
 ## Key Features

-- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
+- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up online, stores all data locally
 - [**Powerful, intuitive command line interface**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) with [modular optional dependencies](#dependencies)
 - [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
-- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl), articles (readability), code (git), etc.](#output-formats)
+- [**Extracts a wide variety of content out-of-the-box**](https://github.com/ArchiveBox/ArchiveBox/issues/51): [media (youtube-dl or yt-dlp), articles (readability), code (git), etc.](#output-formats)
 - [**Supports scheduled/realtime importing**](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from [many types of sources](#input-formats)
 - [**Uses standard, durable, long-term formats**](#saves-lots-of-useful-stuff-for-each-imported-link) like HTML, JSON, PDF, PNG, and WARC
 - [**Usable as a oneshot CLI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage), [**self-hosted web UI**](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#UI-Usage), [Python API](https://docs.archivebox.io/en/latest/modules.html) (BETA), [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (ALPHA), or [desktop app](https://github.com/ArchiveBox/electron-archivebox) (ALPHA)
-- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode)
-- Planned: support for archiving [content requiring a login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (working, but ill-advised until some pending fixes are released)
+- [**Saves all pages to archive.org as well**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_archive_dot_org) by default for redundancy (can be [disabled](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) for local-only mode)
+- Advanced users: support for archiving [content requiring login/paywall/cookies](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir) (see wiki security caveats!)
 - Planned: support for running [JS during archiving](https://github.com/ArchiveBox/ArchiveBox/issues/51) to adblock, [autoscroll](https://github.com/ArchiveBox/ArchiveBox/issues/80), [modal-hide](https://github.com/ArchiveBox/ArchiveBox/issues/175), [thread-expand](https://github.com/ArchiveBox/ArchiveBox/issues/345)...

 <br/><br/>
@@ -165,14 +168,16 @@ See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the C
 </ol>

 See <a href="#%EF%B8%8F-cli-usage">below</a> for more usage examples using the CLI, Web UI, or filesystem/SQL/Python to manage your archive.<br/>
-See <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/bin/setup.sh"><code>setup.sh</code></a> for the source code of the auto-install script.
+See <a href="https://github.com/ArchiveBox/ArchiveBox/blob/dev/bin/setup.sh"><code>setup.sh</code></a> for the source code of the auto-install script.<br/>
+See <a href="https://docs.sweeting.me/s/against-curl-sh">"Against curl | sh as an install method"</a> blog post for my thoughts on the shortcomings of this install method.
 <br/><br/>
 </details>

 <br/>

-#### 🛠 Manual Setup
+#### 🛠 Package Manager Setup

 <a name="Manual-Setup"></a>
 <details>
 <summary><b><img src="https://user-images.githubusercontent.com/511499/117448075-49597580-af0c-11eb-91ba-f34fff10096b.png" alt="aptitude" height="28px" align="top"/> <code>apt</code></b> (Ubuntu/Debian)</summary>
 <br/>

@@ -272,7 +277,7 @@ See the <a href="https://github.com/ArchiveBox/pip-archivebox"><code>pip-archive
 <summary><img src="https://user-images.githubusercontent.com/511499/118077361-f0616580-b381-11eb-973c-ee894a3349fb.png" alt="Arch" height="28px" align="top"/> <code>pacman</code> / <img src="https://user-images.githubusercontent.com/511499/118077946-29e6a080-b383-11eb-94f0-d4871da08c3f.png" alt="FreeBSD" height="28px" align="top"/> <code>pkg</code> / <img src="https://user-images.githubusercontent.com/511499/118077861-002d7980-b383-11eb-86a7-5936fad9190f.png" alt="Nix" height="28px" align="top"/> <code>nix</code> (Arch/FreeBSD/NixOS/more)</summary>
 <br/>
 <ul>
-<li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>pacman install archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
+<li>Arch: <a href="https://aur.archlinux.org/packages/archivebox/"><code>yay -S archivebox</code></a> (contributed by <a href="https://github.com/imlonghao"><code>@imlonghao</code></a>)</li>
 <li>FreeBSD: <a href="https://github.com/ArchiveBox/ArchiveBox#%EF%B8%8F-easy-setup"><code>curl -sSL 'https://get.archivebox.io' | sh</code></a> (uses <code>pkg</code> + <code>pip3</code> under-the-hood)</li>
 <li>Nix: <a href="https://github.com/NixOS/nixpkgs/blob/master/pkgs/applications/misc/archivebox/default.nix"><code>nix-env --install archivebox</code></a> (contributed by <a href="https://github.com/siraben"><code>@siraben</code></a>)</li>
 <li>More: <a href="https://github.com/ArchiveBox/ArchiveBox/issues/new"><i>contribute another distribution...!</i></a></li>

@@ -316,6 +321,7 @@ None of these hosting providers are officially endorsed:<br/>
 <sub><i>(most still require manual setup or manual periodic updating using the methods above)</i></sub>
 <br/><br/>
 <li><a href="https://www.stellarhosted.com/archivebox/"><img src="https://img.shields.io/badge/Semi_Managed_Hosting-StellarHosted.com-%23193f7e.svg?style=flat" height="22px"/></a> (USD $29-250/mo, <a href="https://www.stellarhosted.com/archivebox/#pricing">pricing</a>)</li>
+<li><a href="https://www.pikapods.com/pods?run=archivebox"><img src="https://img.shields.io/badge/Semi_Managed_Hosting-PikaPods.com-%2343a047.svg?style=flat" height="22px"/></a> (from USD $2.6/mo)</li>
 <li><a href="https://m.do.co/c/cbc4c0c17840">
 <img src="https://img.shields.io/badge/Unmanaged_VPS-DigitalOcean.com-%232f7cf7.svg?style=flat" height="22px"/>
 </a> (USD $5-50+/mo, <a href="https://m.do.co/c/cbc4c0c17840">🎗 referral link</a>, <a href="https://www.digitalocean.com/community/tutorials/how-to-install-and-use-docker-compose-on-ubuntu-20-04">instructions</a>)</li>

@@ -341,7 +347,7 @@ For more discussion on managed and paid hosting options see here: <a href="https
 - Import URLs from some of the supported [Input Formats](#input-formats) or view the supported [Output Formats](#output-formats)...
 - Tweak your UI or archiving behavior [Configuration](#configuration) or read about some of the [Caveats](#caveats) and troubleshooting steps...
-- Read about the [Dependencies](#dependencies) used for archiving or the [Archive Layout](#archive-layout) on disk...
+- Read about the [Dependencies](#dependencies) used for archiving, the [Upgrading Process](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives), or the [Archive Layout](#archive-layout) on disk...
 - Or check out our full [Documentation](#documentation) or [Community Wiki](#internet-archiving-ecosystem)...

 <br/>
@@ -362,13 +368,13 @@ archivebox help
 - `archivebox setup/init/config/status/manage` to administer your collection
 - `archivebox add/schedule/remove/update/list/shell/oneshot` to manage Snapshots in the archive
-- `archivebox schedule` to pull in fresh URLs in regularly from [boorkmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)
+- `archivebox schedule` to pull in fresh URLs in regularly from [bookmarks/history/Pocket/Pinboard/RSS/etc.](#input-formats)

 #### 🖥 Web UI Usage

 ```bash
-archivebox manage createsuperuser
-archivebox server 0.0.0.0:8000        # open http://127.0.0.1:8000 to view it
+archivebox manage createsuperuser     # set an admin password
+archivebox server 0.0.0.0:8000        # open http://127.0.0.1:8000 to view it

 # you can also configure whether or not login is required for most features
 archivebox config --set PUBLIC_INDEX=False

@@ -419,6 +425,7 @@ ArchiveBox supports many input formats for URLs, including Pocket & Pinboard exp
 - <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> TXT, RSS, XML, JSON, CSV, SQL, HTML, Markdown, or [any other text-based format...](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Import-a-list-of-URLs-from-a-text-file)
 - <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> [Browser history](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) or [browser bookmarks](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive) (see instructions for: [Chrome](https://support.google.com/chrome/answer/96816?hl=en), [Firefox](https://support.mozilla.org/en-US/kb/export-firefox-bookmarks-to-backup-or-transfer), [Safari](http://i.imgur.com/AtcvUZA.png), [IE](https://support.microsoft.com/en-us/help/211089/how-to-import-and-export-the-internet-explorer-favorites-folder-to-a-32-bit-version-of-windows), [Opera](http://help.opera.com/Windows/12.10/en/importexport.html), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive))
+- <img src="https://i.imgur.com/AQyHbu8.png" height="22px"/> Browser extension [`archivebox-exporter`](https://github.com/tjhorner/archivebox-exporter) (realtime archiving from Chrome/Chromium/Firefox)
 - <img src="https://getpocket.com/favicon.ico" height="22px"/> [Pocket](https://getpocket.com/export), [Pinboard](https://pinboard.in/export/), [Instapaper](https://www.instapaper.com/user/export), [Shaarli](https://shaarli.readthedocs.io/en/master/Usage/#importexport), [Delicious](https://www.groovypost.com/howto/howto/export-delicious-bookmarks-xml/), [Reddit Saved](https://github.com/csu/export-saved-reddit), [Wallabag](https://doc.wallabag.org/en/user/import/wallabagv2.html), [Unmark.it](http://help.unmark.it/import-export), [OneTab](https://www.addictivetips.com/web/onetab-save-close-all-chrome-tabs-to-restore-export-or-import/), [and more...](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)

 <img src="https://i.imgur.com/zM4z1aU.png" width="330px" align="right">

@@ -462,7 +469,7 @@ Inside each Snapshot folder, ArchiveBox save these different types of extractor
 - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
 - **Article Text:** `article.html/json` Article text extraction using Readability & Mercury
 - **Archive.org Permalink:** `archive.org.txt` A link to the saved site on archive.org
-- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
+- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl (or yt-dlp)
 - **Source Code:** `git/` clone of any repository found on GitHub, Bitbucket, or GitLab links
 - _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._

@@ -522,7 +529,7 @@ To achieve high fidelity archives in as many situations as possible, ArchiveBox
 - `node` & `npm` (for readability, mercury, and singlefile)
 - `wget` (for plain HTML, static files, and WARC saving)
 - `curl` (for fetching headers, favicon, and posting to Archive.org)
-- `youtube-dl` (for audio, video, and subtitles)
+- `youtube-dl` or `yt-dlp` (for audio, video, and subtitles)
 - `git` (for cloning git repos)
 - and more as we grow...

@@ -538,8 +545,9 @@ archivebox setup    # auto install all the extractors and extras
 archivebox --version    # see info and check validity of installed dependencies
 ```

-Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported**, but some advanced users have reported getting it working.
+Installing directly on **Windows without Docker or WSL/WSL2/Cygwin is not officially supported** (I cannot respond to Windows support tickets), but some advanced users have reported getting it working.

+For detailed information about upgrading ArchiveBox and its dependencies, see: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives

 <br/>

@@ -829,6 +837,7 @@ You can also access the docs locally by looking in the [`ArchiveBox/docs/`](http
 - [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Chromium-Install)
 - [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
 - [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
+- [Upgrading or Merging Archives](https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives)
 - [Python API](https://docs.archivebox.io/en/latest/modules.html) (alpha)
 - [REST API](https://github.com/ArchiveBox/ArchiveBox/issues/496) (alpha)
@@ -895,7 +904,9 @@ archivebox --version
 # if you edit e.g. ./archivebox/core/models.py on the docker host, runserver
 # inside the container will reload and pick up your changes
 docker build . -t archivebox
-docker run -it archivebox init --setup
+docker run -it \
+    -v $PWD/data:/data \
+    archivebox init --setup
 docker run -it -p 8000:8000 \
     -v $PWD/data:/data \
     -v $PWD/archivebox:/app/archivebox \

@@ -921,6 +932,8 @@ archivebox config --set DEBUG=True
 archivebox server --debug ...
 ```

+https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
+
 </details>

 #### Install and run a specific GitHub branch

@@ -975,7 +988,8 @@ cd path/to/test/data/
 archivebox shell
 archivebox manage dbshell
 ```
 (uses `pytest -s`)
+https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running

 </details>

@@ -1067,7 +1081,7 @@ Extractors take the URL of a page to archive, write their output to the filesyst
 <img src="https://raw.githubusercontent.com/Monadical-SAS/redux-time/HEAD/examples/static/jeremy.jpg" height="40px"/>
 <br/>
 <i><sub>
-This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous contributors and <a href="https://monadical.com">Monadical</a> (✨ <a href="https://monadical.com">hire them</a> for dev work!).
+This project is maintained mostly in <a href="https://nicksweeting.com/blog#About">my spare time</a> with the help from generous <a href="https://github.com/ArchiveBox/ArchiveBox/graphs/contributors">contributors</a> and <a href="https://monadical.com">Monadical</a> (✨ <a href="https://monadical.com">hire them</a> for dev work!).
 </sub>
 </i>
 <br/><br/>
@@ -1 +1,3 @@
-theme: jekyll-theme-merlot
+production_url: https://archivebox.io
+theme: jekyll-theme-merlot
+# Github Pages static site settings for https://archivebox.io
@@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
     )
     parser.add_argument(
-        '--update-all', #'-n',
+        '--update', #'-u',
         action='store_true',
         default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
         help="Also retry previously skipped/failed links when adding new links",
     )
+    parser.add_argument(
+        '--update-all', #'-n',
+        action='store_true',
+        default=False,
+        help="Also update ALL links in index when finished adding new links",
+    )
     parser.add_argument(
         '--index-only', #'-o',
         action='store_true',

@@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         urls=stdin_urls or urls,
         depth=command.depth,
         tag=command.tag,
+        update=command.update,
         update_all=command.update_all,
         index_only=command.index_only,
         overwrite=command.overwrite,
@@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         action='store_true',
         help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
     )
+    parser.add_argument(
+        '--update',
+        action='store_true',
+        help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
+    )
     group.add_argument(
         '--clear', # '-c'
         action='store_true',

@@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         every=command.every,
         depth=command.depth,
         overwrite=command.overwrite,
+        update=command.update,
         import_path=command.import_path,
         out_dir=pwd or OUTPUT_DIR,
     )
@@ -26,11 +26,12 @@ import io
 import re
 import sys
 import json
+import inspect
 import getpass
 import platform
 import shutil
-import sqlite3
 import django
+from sqlite3 import dbapi2 as sqlite3

 from hashlib import md5
 from pathlib import Path

@@ -48,6 +49,9 @@ from .config_stubs import (
     ConfigDefaultDict,
 )

+### Pre-Fetch Minimal System Config
+
+SYSTEM_USER = getpass.getuser() or os.getlogin()

 try:

@@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
     'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
     'IN_DOCKER': {'type': bool, 'default': False},
+    'PUID': {'type': int, 'default': os.getuid()},
+    'PGID': {'type': int, 'default': os.getgid()},
     # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
 },

@@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
     'URL_WHITELIST': {'type': str, 'default': None},
     'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+    'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
 },

 'SERVER_CONFIG': {

@@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
     'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-    'TIME_ZONE': {'type': str, 'default': 'UTC'},
+    'TIMEZONE': {'type': str, 'default': 'UTC'},
+    'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
+    'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
     'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
     'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
 },

 'ARCHIVE_METHOD_TOGGLES': {
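Note on the two new `REVERSE_PROXY_*` options above (the feature this branch adds): the intent is that a trusted reverse proxy authenticates the user upstream and forwards the username in the `Remote-User` header, and the app only honors that header when the request comes from a whitelisted proxy address. A minimal sketch of the general pattern, with hypothetical names, not the branch's actual middleware (Django's built-in `django.contrib.auth.middleware.RemoteUserMiddleware` implements the same idea):

```python
# Hypothetical illustration of header-based reverse-proxy auth; NOT the
# actual middleware added by this PR.
from ipaddress import ip_address, ip_network

TRUSTED_PROXIES = ['10.0.0.0/8']     # e.g. parsed from REVERSE_PROXY_WHITELIST
USER_HEADER = 'HTTP_REMOTE_USER'     # 'Remote-User' as it appears in request.META

def get_proxied_username(request):
    """Return the proxy-asserted username, but only from a trusted source IP."""
    client_ip = ip_address(request.META['REMOTE_ADDR'])
    if any(client_ip in ip_network(net) for net in TRUSTED_PROXIES):
        return request.META.get(USER_HEADER) or None
    return None  # untrusted client: ignore the header entirely
```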
@@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
     'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

-    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
+    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

     'COOKIES_FILE': {'type': str, 'default': None},
     'CHROME_USER_DATA_DIR': {'type': str, 'default': None},

@@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     '--no-call-home',
     '--write-sub',
     '--all-subs',
-    '--write-auto-sub',
+    # There are too many of these and youtube
+    # throttles you with HTTP error 429
+    #'--write-auto-subs',
     '--convert-subs=srt',
     '--yes-playlist',
     '--continue',
+    # This flag doesn't exist in youtube-dl
+    # only in yt-dlp
+    '--no-abort-on-error',
+    # --ignore-errors must come AFTER
+    # --no-abort-on-error
+    # https://github.com/yt-dlp/yt-dlp/issues/4914
     '--ignore-errors',
     '--geo-bypass',
     '--add-metadata',
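Note on the ordering comments above: per the linked yt-dlp issue #4914, `--ignore-errors` only behaves as intended when `--no-abort-on-error` appears before it on the command line, and because `YOUTUBEDL_ARGS` is a plain ordered list, that ordering survives into the final command. A small sketch of how the list flows into the media extractor's command, with a hypothetical URL:

```python
# Sketch of how YOUTUBEDL_ARGS feeds the media extractor's command line.
# The position of --no-abort-on-error before --ignore-errors is load-bearing
# (https://github.com/yt-dlp/yt-dlp/issues/4914); a list preserves it.
YOUTUBEDL_BINARY = 'yt-dlp'            # the new default set later in this diff
YOUTUBEDL_ARGS = [
    '--write-description',             # hypothetical extra flag for illustration
    '--no-abort-on-error',             # yt-dlp only, must come first
    '--ignore-errors',
]

cmd = [YOUTUBEDL_BINARY, *YOUTUBEDL_ARGS, 'https://example.com/video']
print(' '.join(cmd))
# yt-dlp --write-description --no-abort-on-error --ignore-errors https://example.com/video
```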
@@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     '--compressed'
 ]},
 'GIT_ARGS': {'type': list, 'default': ['--recursive']},
+'SINGLEFILE_ARGS': {'type': list, 'default' : None}
 },

 'SEARCH_BACKEND_CONFIG' : {

@@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
 'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
-'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
 'NODE_BINARY': {'type': str, 'default': 'node'},
 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
 'CHROME_BINARY': {'type': str, 'default': None},

@@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static_index.json',
 }

+def get_version(config):
+    return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
+
+def get_commit_hash(config):
+    try:
+        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+    except Exception:
+        return None
+
 ############################## Derived Config ##################################

@@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
+    'VERSION': {'default': lambda c: get_version(c)},
+    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},

     'PYTHON_BINARY': {'default': lambda c: sys.executable},
     'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
     'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

-    'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
+    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
     'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},

+    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
+    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
+    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'},  # set at runtime below, interesting but unused for now
+    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']},  # set at runtime below

     'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},

@@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},

     'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},

@@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
         return None

     try:
-        version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
+        version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
+        if not version_str:
+            version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
         # take first 3 columns of first line of version info
         return ' '.join(version_str.split('\n')[0].strip().split()[:3])
     except OSError:
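Note on the `env={'LANG': 'C'}` change above: forcing the C locale keeps `--version` output unlocalized so the "first three columns of the first line" parse stays stable, with a plain re-run as a fallback for binaries that print nothing under the stripped-down environment. The same probing pattern sketched in isolation:

```python
# Standalone sketch of the version-probing pattern used above: force a
# predictable locale first, then retry without the override if the tool
# printed nothing under the minimal environment.
from subprocess import run, PIPE

def probe_version(binary: str) -> str:
    out = run([binary, '--version'], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
    if not out:
        out = run([binary, '--version'], stdout=PIPE).stdout.strip().decode()
    # keep only the first 3 whitespace-separated columns of the first line
    return ' '.join(out.split('\n')[0].strip().split()[:3])

print(probe_version('git'))  # e.g. 'git version 2.39.2'
```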
@@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
     'path': config['OUTPUT_DIR'].resolve(),
     'enabled': True,
     'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+    'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
 },
 'SOURCES_DIR': {
     'path': config['SOURCES_DIR'].resolve(),

@@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
     'path': config['ARCHIVE_DIR'].resolve(),
     'enabled': True,
     'is_valid': config['ARCHIVE_DIR'].exists(),
+    'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
 },
 'CONFIG_FILE': {
     'path': config['CONFIG_FILE'].resolve(),

@@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
     'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
     'enabled': True,
     'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+    'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
     },
 }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
-        'ARCHIVEBOX_BINARY': {
-            'path': bin_path(config['ARCHIVEBOX_BINARY']),
-            'version': config['VERSION'],
-            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
-            'enabled': True,
-            'is_valid': True,
-        },
         'PYTHON_BINARY': {
             'path': bin_path(config['PYTHON_BINARY']),
             'version': config['PYTHON_VERSION'],

@@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['PYTHON_VERSION']),
         },
+        'SQLITE_BINARY': {
+            'path': bin_path(config['SQLITE_BINARY']),
+            'version': config['SQLITE_VERSION'],
+            'hash': bin_hash(config['SQLITE_BINARY']),
+            'enabled': True,
+            'is_valid': bool(config['SQLITE_VERSION']),
+        },
         'DJANGO_BINARY': {
             'path': bin_path(config['DJANGO_BINARY']),
             'version': config['DJANGO_VERSION'],

@@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['DJANGO_VERSION']),
         },
+        'ARCHIVEBOX_BINARY': {
+            'path': bin_path(config['ARCHIVEBOX_BINARY']),
+            'version': config['VERSION'],
+            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
+            'enabled': True,
+            'is_valid': True,
+        },
+
         'CURL_BINARY': {
             'path': bin_path(config['CURL_BINARY']),
             'version': config['CURL_VERSION'],

@@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
     'TIMEOUT': config['TIMEOUT'],
     'RESOLUTION': config['RESOLUTION'],
     'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
-    'CHROME_BINARY': config['CHROME_BINARY'],
+    'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
     'CHROME_HEADLESS': config['CHROME_HEADLESS'],
     'CHROME_SANDBOX': config['CHROME_SANDBOX'],
     'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],

@@ -972,13 +1020,22 @@ globals().update(CONFIG)

 # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-os.environ["TZ"] = 'UTC'
+assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # we may allow this to change later
+os.environ["TZ"] = TIMEZONE
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # noqa: F821

 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
 NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
 sys.path.append(NODE_BIN_PATH)

+# OPTIONAL: also look around the host system for node modules to use
+# avoid enabling this unless absolutely needed,
+# having overlapping potential sources of libs is a big source of bugs/confusing to users
+# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
+# sys.path.append(DEV_NODE_BIN_PATH)
+# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
+# sys.path.append(USER_NODE_BIN_PATH)

 # disable stderr "you really shouldnt disable ssl" warnings with library config
 if not CONFIG['CHECK_SSL_VALIDITY']:
     import urllib3
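Note on the umask line above: subtracting the desired directory mode from 0o777 yields exactly the permission bits to mask off for newly created files and dirs. Worked through with a hypothetical DIR_OUTPUT_PERMISSIONS of '755':

```python
# Worked example of the umask derivation above, assuming a hypothetical
# configured DIR_OUTPUT_PERMISSIONS of '755'.
DIR_OUTPUT_PERMISSIONS = '755'

mask = 0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)
print(oct(mask))                    # 0o22, the classic umask 022
# With umask 022, new dirs get 0o777 & ~0o022 == 0o755, matching the config.
print(oct(0o777 & ~mask & 0o777))   # 0o755
```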
@@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
     requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+# get SQLite database version, compile options, and runtime options
+# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
+#cursor = sqlite3.connect(':memory:').cursor()
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
+#cursor.close()

 ########################### Config Validity Checkers ###########################

@@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
     stderr('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
     stderr()

 def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']
     assert isinstance(output_dir, (str, Path))

@@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     # without running migrations automatically (user runs them manually by calling init)
     django.setup()

     from django.conf import settings

     # log startup message to the error log
-    with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+    with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
         command = ' '.join(sys.argv)
         ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
         f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

@@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     # Enable WAL mode in sqlite3
     from django.db import connection
     with connection.cursor() as cursor:
+
+        # Set Journal mode to WAL to allow for multiple writers
+        current_mode = cursor.execute("PRAGMA journal_mode")
+        if current_mode != 'wal':
             cursor.execute("PRAGMA journal_mode=wal;")
+
+        # Set max blocking delay for concurrent writes and write sync mode
+        # https://litestream.io/tips/#busy-timeout
+        cursor.execute("PRAGMA busy_timeout = 5000;")
+        cursor.execute("PRAGMA synchronous = NORMAL;")

     # Create cache table in DB if needed
     try:
         from django.core.cache import cache
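Note on the new PRAGMAs above: they are plain SQLite features, so the effect is easy to reproduce outside Django with the stdlib driver:

```python
# Minimal standalone sketch of the SQLite tuning applied above, using the
# stdlib driver against a throwaway database file.
import sqlite3

conn = sqlite3.connect('/tmp/example.sqlite3')
cur = conn.cursor()

# WAL lets readers proceed while one writer appends to the write-ahead log
print(cur.execute('PRAGMA journal_mode=wal;').fetchone())   # ('wal',)

# wait up to 5s for a competing writer instead of failing immediately
cur.execute('PRAGMA busy_timeout = 5000;')

# NORMAL syncing is safe under WAL and much faster than FULL
cur.execute('PRAGMA synchronous = NORMAL;')
conn.close()
```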
@@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
     except django.db.utils.OperationalError:
         call_command("createcachetable", verbosity=0)

     # if archivebox gets imported multiple times, we have to close
     # the sqlite3 whenever we init from scratch to avoid multiple threads
     # sharing the same connection by accident
@@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False):
     WGET_ARGS: List[str]
     CURL_ARGS: List[str]
     GIT_ARGS: List[str]
+    TAG_SEPARATOR_PATTERN: str

 ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
archivebox/core/migrations/0021_auto_20220914_0934.py (new file, 18 lines)

@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2022-09-14 09:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0020_auto_20210410_1031'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
@@ -19,7 +19,7 @@ from ..config import (
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
     LOGS_DIR,
-    TIME_ZONE,
+    TIMEZONE,
 )

 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]

@@ -157,7 +157,7 @@ DATABASES = {
         'timeout': 60,
         'check_same_thread': False,
     },
-    'TIME_ZONE': 'UTC',
+    'TIME_ZONE': TIMEZONE,
     # DB setup is sometimes modified at runtime by setup_django() in config.py
     }
 }

@@ -227,7 +227,8 @@ USE_L10N = True
 USE_TZ = True
 DATETIME_FORMAT = 'Y-m-d g:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
-TIME_ZONE = TIME_ZONE       # noqa
+TIME_ZONE = TIMEZONE        # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent

 from django.conf.locale.en import formats as en_formats
@@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
+from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView

 # print('DEBUG', settings.DEBUG)

@@ -24,14 +24,16 @@ urlpatterns = [
     path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),

     path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
     path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),

     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', admin.site.urls),

+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+
     path('index.html', RedirectView.as_view(url='/')),
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
@@ -38,7 +38,7 @@ class HomepageView(View):
     if PUBLIC_INDEX:
         return redirect('/public')

     return redirect(f'/admin/login/?next={request.path}')

@@ -205,7 +205,7 @@ class SnapshotView(View):
         content_type="text/html",
         status=404,
     )

 class PublicIndexView(ListView):
     template_name = 'public_index.html'

@@ -220,7 +220,7 @@ class PublicIndexView(ListView):
         'FOOTER_INFO': FOOTER_INFO,
     }

-    def get_queryset(self, **kwargs):
+    def get_queryset(self, **kwargs):
         qs = super().get_queryset(**kwargs)
         query = self.request.GET.get('q')
         if query and query.strip():

@@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView):
     url = self.request.GET.get('url', None)
     if url:
         return {'url': url if '://' in url else f'https://{url}'}

     return super().get_initial()

 def test_func(self):

@@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView):
         "form": AddLinkForm()
     })
     return render(template_name=self.template_name, request=self.request, context=context)
+
+
+class HealthCheckView(View):
+    """
+    A Django view that renders plain text "OK" for service discovery tools
+    """
+    def get(self, request):
+        """
+        Handle a GET request
+        """
+        return HttpResponse(
+            'OK',
+            content_type='text/plain',
+            status=200
+        )
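Note on the new `/health/` endpoint wired up here and in urls.py above: it gives load balancers and the (now optional) Dockerfile HEALTHCHECK a cheap unauthenticated probe target. A sketch of checking it, assuming a server is listening on localhost:8000:

```python
# Sketch of probing the new /health/ endpoint, assuming an ArchiveBox
# server is running on localhost:8000.
from urllib.request import urlopen

with urlopen('http://localhost:8000/health/', timeout=5) as resp:
    assert resp.status == 200 and resp.read() == b'OK', 'server is unhealthy'
print('server is healthy')
```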
@@ -1,12 +1,14 @@
 __package__ = 'archivebox.extractors'

 import os
+import sys
 from pathlib import Path

 from typing import Optional, List, Iterable, Union
+from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..core.settings import ERROR_LOG
 from ..index.schema import Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (

@@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),

@@ -50,7 +51,8 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('readability', should_save_readability, save_readability),    # keep readability below wget and singlefile, as it depends on them
+        ('title', should_save_title, save_title),                      # keep title and readability below wget and singlefile, as it depends on them
+        ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),

@@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                 # print('{black}    X {}{reset}'.format(method_name, **ANSI))
                 stats['skipped'] += 1
             except Exception as e:
+                # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
+                # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
+                # are fixed.
+                """
                 raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                     method_name,
                     link.url,
                 )) from e
+                """
+                # Instead, use the kludgy workaround from
+                # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
+                with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                    command = ' '.join(sys.argv)
+                    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+                    f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
+                        method_name,
+                        link.url,
+                        command,
+                        ts
+                    ) + "\n"))
+                    #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

             # print('    ', stats)

@@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
     except KeyboardInterrupt:
         log_archiving_paused(num_links, idx, link.timestamp)
         raise SystemExit(0)
-    except BaseException:    # lgtm [py/catch-base-exception]
+    except BaseException:
         print()
         raise
@@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
 @enforce_types
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+    """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = 'media'

@@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         YOUTUBEDL_BINARY,
         *YOUTUBEDL_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
         link.url,
     ]
     status = 'succeeded'

@@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
             pass
         else:
             hints = (
-                'Got youtube-dl response code: {}.'.format(result.returncode),
+                'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
                 *result.stderr.decode().split('\n'),
             )
             raise ArchiveError('Failed to save media', hints)

@@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         timer.end()

     # add video description and subtitles to full-text index
+    # Let's try a few different
     index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors:
+        # * 'strict' to raise a ValueError exception if there is an
+        #   encoding error. The default value of None has the same effect.
+        # * 'ignore' ignores errors. Note that ignoring encoding errors
+        #   can lead to data loss.
+        # * 'xmlcharrefreplace' is only supported when writing to a
+        #   file. Characters not supported by the encoding are replaced with
+        #   the appropriate XML character reference &#nnn;.
+        # There are a few more options described in https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
         for text_file in (
             *output_path.glob('*.description'),
             *output_path.glob('*.srt'),
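Note on the `errors='xmlcharrefreplace'` change above: the handler decides what happens when a subtitle or description file isn't valid UTF-8, rather than crashing the whole media extractor. A quick sketch of how the common handlers behave on an invalid byte (as the quoted docs note, `xmlcharrefreplace` itself is an encode-side handler, only supported when writing):

```python
# Sketch of how the errors= handler changes decoding of a non-UTF-8 byte
# sequence (b'\xff' is never valid UTF-8).
raw = b'caf\xff subtitle line'

print(raw.decode('utf-8', errors='ignore'))    # 'caf subtitle line' (byte dropped)
print(raw.decode('utf-8', errors='replace'))   # 'caf\ufffd subtitle line'
try:
    raw.decode('utf-8', errors='strict')       # the default: raises
except UnicodeDecodeError as e:
    print('strict decoding failed:', e.reason)
```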
@@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
 from ..util import (
     enforce_types,
-    download_url,
     is_static_file,
 )
 from ..config import (
     TIMEOUT,

@@ -22,28 +20,8 @@ from ..config import (
     READABILITY_VERSION,
 )
 from ..logging_util import TimedProgress
+from .title import get_html

-@enforce_types
-def get_html(link: Link, path: Path) -> str:
-    """
-    Try to find wget, singlefile and then dom files.
-    If none is found, download the url again.
-    """
-    canonical = link.canonical_outputs()
-    abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
-    document = None
-    for source in sources:
-        try:
-            with open(abs_path / source, "r", encoding="utf-8") as f:
-                document = f.read()
-                break
-        except (FileNotFoundError, TypeError):
-            continue
-    if document is None:
-        return download_url(link.url)
-    else:
-        return document

 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -17,6 +17,7 @@ from ..config import (
     SAVE_SINGLEFILE,
     DEPENDENCIES,
     SINGLEFILE_VERSION,
+    SINGLEFILE_ARGS,
     CHROME_BINARY,
 )
 from ..logging_util import TimedProgress

@@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+    options = [
+        *SINGLEFILE_ARGS,
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
     ]
+
+    # Deduplicate options (single-file doesn't like when you use the same option two times)
+    #
+    # NOTE: Options names that come first clobber conflicting names that come later
+    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
+    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
+    # kind of like the ergonomic principle of lexical scope in programming languages.
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    deduped_options = list(filter(test_seen, options))
+
+    cmd = [
+        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        *deduped_options,
         link.url,
         output,
     ]
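Note on the dedupe block above: first-wins by option name means a user-supplied SINGLEFILE_ARGS entry such as `--browser-executable-path=...` clobbers the built-in default appended after it. The same filter logic sketched in isolation:

```python
# Demonstration of the first-wins option dedupe above, outside the extractor.
def dedupe_options(options):
    seen = set()
    deduped = []
    for arg in options:
        name = arg.split('=')[0]       # compare by option name only
        if name not in seen:
            seen.add(name)
            deduped.append(arg)        # first occurrence wins
    return deduped

opts = [
    '--browser-executable-path=/opt/chrome',   # from user SINGLEFILE_ARGS
    '--browser-executable-path=chromium',      # built-in default, clobbered
    '--compress-content',
]
print(dedupe_options(opts))
# ['--browser-executable-path=/opt/chrome', '--compress-content']
```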
@@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+@enforce_types
+def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+    """
+    Try to find wget, singlefile and then dom files.
+    If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    document = None
+    for source in sources:
+        try:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                document = f.read()
+                break
+        except (FileNotFoundError, TypeError):
+            continue
+    if document is None:
+        return download_url(link.url, timeout=timeout)
+    else:
+        return document

 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = get_html(link, out_dir, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
@@ -24,6 +24,7 @@ from ..config import (
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
 )

 MAIN_INDEX_TEMPLATE = 'static_index.html'

@@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str:
     'status_color': 'success' if link.is_archived else 'danger',
     'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
     'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+    'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
 })

 @enforce_types
@@ -1,5 +1,7 @@
 __package__ = 'archivebox.index'

+import re
+
 from io import StringIO
 from pathlib import Path
 from typing import List, Tuple, Iterator

@@ -8,7 +10,10 @@ from django.db import transaction
 from .schema import Link
 from ..util import enforce_types, parse_date
-from ..config import OUTPUT_DIR
+from ..config import (
+    OUTPUT_DIR,
+    TAG_SEPARATOR_PATTERN,
+)

 ### Main Links Index

@@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 def write_link_to_sql_index(link: Link):
     from core.models import Snapshot, ArchiveResult
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    tags = info.pop("tags")
-    if tags is None:
-        tags = []
+
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))
+    info.pop('tags')

     try:
         info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp

@@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link):
         info["timestamp"] = str(float(info["timestamp"]) + 1.0)

     snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    snapshot.save_tags(tags)
+    snapshot.save_tags(tag_list)

     for extractor, entries in link.history.items():
         for entry in entries:

@@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     snap = write_link_to_sql_index(link)
     snap.title = link.title

-    tag_set = (
-        set(tag.strip() for tag in (link.tags or '').split(','))
-    )
-    tag_list = list(tag_set) or []
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))

     snap.save()
     snap.save_tags(tag_list)
|
|
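The new TAG_SEPARATOR_PATTERN-based splitting also deduplicates tags while preserving their original order, because dict.fromkeys() keeps first-insertion order. A standalone sketch (the separator regex below is an assumption for illustration only; the real pattern comes from archivebox.config):

    import re

    TAG_SEPARATOR_PATTERN = r'[,;]'  # assumed for this example only

    raw_tags = 'news; tech,tech , python'
    tag_list = list(dict.fromkeys(
        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, raw_tags or '')
    ))
    print(tag_list)  # ['news', 'tech', 'python']
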
@@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"):
        # Prettify error output hints string and limit to five lines
        hints = getattr(result.output, 'hints', None) or ()
        if hints:
            hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
            if isinstance(hints, (list, tuple, type(_ for _ in ()))):
                hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
            else:
                if isinstance(hints, bytes):
                    hints = hints.decode()
                hints = hints.split('\n')

            hints = (
                ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
                for line in hints[:5] if line.strip()

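The `type(_ for _ in ())` expression is just a way to spell the generator type, so that branch accepts lists, tuples, and generators alike, while everything else is coerced from bytes/str into a list of lines. A standalone sketch of a similar normalization (this version keeps non-bytes items rather than filtering them, a slight generalization of the code above):

    from types import GeneratorType

    def normalize_hints(hints):
        """Coerce hints (str, bytes, list, tuple, or generator) into a list of str lines."""
        if isinstance(hints, (list, tuple, GeneratorType)):
            return [h.decode() if isinstance(h, bytes) else h for h in hints]
        if isinstance(hints, bytes):
            hints = hints.decode()
        return hints.split('\n')

    print(normalize_hints(b'line one\nline two'))  # ['line one', 'line two']
    print(normalize_hints(['a', b'b']))            # ['a', 'b']
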
@@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str:
def printable_folder_status(name: str, folder: Dict) -> str:
    if folder['enabled']:
        if folder['is_valid']:
            color, symbol, note = 'green', '√', 'valid'
            color, symbol, note, num_files = 'green', '√', 'valid', ''
        else:
            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
    else:

@@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str:
        )
    else:
        num_files = 'missing'

    if folder.get('is_mount'):
        # add symbol @ next to filecount if path is a remote filesystem mount
        num_files = f'{num_files} @' if num_files else '@'

    path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
    if path and ' ' in path:

@@ -4,8 +4,9 @@ import os
import sys
import shutil
import platform
from django.utils import timezone
from pathlib import Path
from datetime import date
from datetime import date, datetime

from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices

@@ -70,7 +71,12 @@ from .config import (
    IS_TTY,
    DEBUG,
    IN_DOCKER,
    PUID,
    PGID,
    USER,
    TIMEZONE,
    ENFORCE_ATOMIC_WRITES,
    OUTPUT_PERMISSIONS,
    PYTHON_BINARY,
    ARCHIVEBOX_BINARY,
    ONLY_NEW,

@@ -90,6 +96,7 @@ from .config import (
    check_data_folder,
    write_config_file,
    VERSION,
    COMMIT_HASH,
    CODE_LOCATIONS,
    EXTERNAL_LOCATIONS,
    DATA_LOCATIONS,

@@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
def version(quiet: bool=False,
            out_dir: Path=OUTPUT_DIR) -> None:
    """Print the ArchiveBox version and dependency information"""

    if quiet:
        print(VERSION)
    else:
        # ArchiveBox v0.5.6
        # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
        print('ArchiveBox v{}'.format(VERSION))

    print(VERSION)

    if not quiet:
        # 0.6.3
        # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep

        p = platform.uname()
        print(
            'ArchiveBox v{}'.format(VERSION),
            *((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
            sys.implementation.name.title(),
            p.system,
            platform.platform(),
            p.machine,
        )
        OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
        print(
            f'IN_DOCKER={IN_DOCKER}',
            f'DEBUG={DEBUG}',
            f'IN_DOCKER={IN_DOCKER}',
            f'IS_TTY={IS_TTY}',
            f'TZ={os.environ.get("TZ", "UTC")}',
            f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
            f'TZ={TIMEZONE}',
            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})',  # add this if we have more useful info to show eventually
            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
            f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
        )
        print()

        print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
        for name, dependency in DEPENDENCIES.items():
            print(printable_dependency_version(name, dependency))

            # add a newline between core dependencies and extractor dependencies for easier reading
            if name == 'ARCHIVEBOX_BINARY':
                print()

        print()
        print('{white}[i] Source-code locations:{reset}'.format(**ANSI))

@@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
        print('        archivebox server  # then visit http://127.0.0.1:8000')
        print()
        print('    To add new links, you can run:')
        print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
        print("        archivebox add < ~/some/path/to/list_of_links.txt")
        print()
        print('    For more usage and examples, run:')
        print('        archivebox help')

@@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
def add(urls: Union[str, List[str]],
        tag: str='',
        depth: int=0,
        update_all: bool=not ONLY_NEW,
        update: bool=not ONLY_NEW,
        update_all: bool=False,
        index_only: bool=False,
        overwrite: bool=False,
        # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically

@@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]],
    # save verbatim args to sources
    write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)


    new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

    # If we're going one level deeper, download each link and look for more links

@@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]],
    if new_links and depth == 1:
        log_crawl_started(new_links)
        for new_link in new_links:
            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
            try:
                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
            except Exception as err:
                stderr(f'[!] Failed to get contents of URL {new_link.url}', err, color='red')

    imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

@@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]],
    if extractors:
        archive_kwargs["methods"] = extractors

    if update_all:
        stderr()

    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    if update:
        stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
        archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
    elif update_all:
        stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
    elif overwrite:
        stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
        archive_links(imported_links, overwrite=True, **archive_kwargs)
    elif new_links:
        stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
        archive_links(new_links, overwrite=False, **archive_kwargs)

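The four branches are mutually exclusive and checked in priority order: --update (just-imported set), then --update-all (whole library), then --overwrite, then the default of archiving only never-seen-before links. A compact sketch of the same dispatch, under the assumption that the caller passes in the same link sets as above (the function name is a placeholder, not ArchiveBox API):

    def choose_targets(update, update_all, overwrite, new_links, imported_links, all_links):
        """Mirror the branch priority in `archivebox add` and return (links, overwrite_flag)."""
        if update:
            return imported_links, overwrite  # re-archive everything just imported
        if update_all:
            return all_links, overwrite       # re-archive the entire library
        if overwrite:
            return imported_links, True       # force overwrite of the imported set
        if new_links:
            return new_links, False           # default: only genuinely new URLs
        return [], False
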
@@ -1113,6 +1147,7 @@ def schedule(add: bool=False,
             every: Optional[str]=None,
             depth: int=0,
             overwrite: bool=False,
             update: bool=not ONLY_NEW,
             import_path: Optional[str]=None,
             out_dir: Path=OUTPUT_DIR):
    """Set ArchiveBox to regularly import URLs at specific times using cron"""

@@ -1142,6 +1177,7 @@ def schedule(add: bool=False,
        *([
            'add',
            *(['--overwrite'] if overwrite else []),
            *(['--update'] if update else []),
            f'--depth={depth}',
            f'"{import_path}"',
        ] if import_path else ['update']),

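The `*(['--flag'] if cond else [])` splat is the idiom used here to include an argument only when its flag is set. A runnable standalone sketch of how the cron command line gets assembled (the variable values are made up for illustration):

    overwrite, update, depth, import_path = False, True, 1, 'feed.xml'

    cmd = [
        'archivebox',
        *([
            'add',
            *(['--overwrite'] if overwrite else []),
            *(['--update'] if update else []),
            f'--depth={depth}',
            f'"{import_path}"',
        ] if import_path else ['update']),
    ]
    print(' '.join(cmd))  # archivebox add --update --depth=1 "feed.xml"
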
@@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None,
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
    ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
    source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
    atomic_write(source_path, raw_text)

    referenced_texts = ''

    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:
            print(err)

    atomic_write(source_path, raw_text + '\n' + referenced_texts)
    log_source_saved(source_file=source_path)
    return source_path

@@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
        ANSI['reset'],
    ))
    print('    ', e)
    raise SystemExit(1)
    raise e

else:
    # Source is a path to a local file on the filesystem

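Any whitespace-separated token in the raw input that happens to be a path to an existing local file gets its contents appended to the saved source, so piped URLs and local list files both end up in one write-ahead log. A standalone sketch of just the inlining step (the file name is illustrative):

    from pathlib import Path

    raw_text = 'https://example.com urls.txt'
    Path('urls.txt').write_text('https://example.org\n')  # pretend the user passed a local list

    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except OSError as err:  # e.g. a token that is not a valid path on this OS
            print(err)

    print(raw_text + '\n' + referenced_texts)
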
@@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0):


def link_from_article(article: dict, sources: list):
    url: str = article['resolved_url'] or article['given_url']
    url: str = article.get('resolved_url') or article['given_url']
    broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
    if broken_protocol:
        url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
    title = article['resolved_title'] or article['given_title'] or url
    title = article.get('resolved_title') or article.get('given_title') or url

    return Link(
        url=url,

@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:

        trailing_removed = entry.split('</entry>', 1)[0]
        leading_removed = trailing_removed.strip()
        rows = leading_removed.split('\n')
        splits_fixed = leading_removed.replace('"\n href="', '" href="')
        rows = splits_fixed.split('\n')

        def get_row(key):
            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
        def get_row(prefix):
            return [
                row.strip()
                for row in rows
                if row.strip().startswith('<{}'.format(prefix))
            ][0]

        title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
        url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
        url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
        url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
        ts_str = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        try:

@@ -49,7 +55,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
            tags = None

        yield Link(
            url=htmldecode(url),
            url=htmldecode(url_inside_attr or url_inside_link),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=tags or '',

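str_between() is a simple slice-between-markers helper; the parser now tries the href attribute first and falls back to the tag body, after repairing entries where the exporter wrapped the href onto its own line. A standalone sketch with a made-up str_between and a sample entry (the helper below is an assumption for illustration, not ArchiveBox's exact implementation):

    def str_between(string, start, end):
        """Return the substring of `string` between the first `start` and the next `end`."""
        return string.split(start, 1)[-1].split(end, 1)[0]

    row = '<link rel="via" href="https://example.com/article"/>'
    url_inside_attr = str_between(row, 'href="', '"/>')
    print(url_inside_attr)  # https://example.com/article

    # some exports arrive as '<link rel="via"\n href="..."/>'; the parser
    # normalizes those before splitting into rows:
    broken = '<link rel="via"\n href="https://example.com/a"/>'
    fixed = broken.replace('"\n href="', '" href="')
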
@@ -197,7 +197,7 @@

    // select the action button from the dropdown
    container.find('select[name=action]')
        .find('option:selected').removeAttr('selected').end()
        .find('[selected]').removeAttr('selected').end()
        .find('[value=' + action_type + ']').attr('selected', 'selected').click()

    // click submit & replace the archivebox logo with a spinner

@@ -28,6 +28,14 @@
    <a href="/add" id="submit"> Add more URLs ➕</a>
    </center>
{% else %}
    <div id="in-progress" style="display: none;">
        <center><h3>Adding URLs to index and running archive methods...</h3>
        <br/>
        <div class="loader"></div>
        <br/>
        Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
        </center>
    </div>
    <form id="add-form" method="POST" class="p-form">{% csrf_token %}
        <h1>Add new URLs to your archive</h1>
        <br/>

@@ -48,10 +56,9 @@
{% endif %}
<script>
    document.getElementById('add-form').addEventListener('submit', function(event) {
        setTimeout(function() {
            document.getElementById('add-form').innerHTML = '<center><h3>Adding URLs to index and running archive methods...<h3><br/><div class="loader"></div><br/>Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...</center>'
            document.getElementById('delay-warning').style.display = 'block'
        }, 200)
        document.getElementById('in-progress').style.display = 'block'
        document.getElementById('add-form').style.display = 'none'
        document.getElementById('delay-warning').style.display = 'block'
        return true
    })
</script>

@@ -414,6 +414,7 @@
        </div>
    </div>
{% endif %}
{% if PREVIEW_ORIGINALS %}
    <div class="col-lg-2">
        <div class="card">
            <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>

@@ -427,6 +428,7 @@
        </div>
    </div>
</div>
{% endif %}
<div class="col-lg-2">
    <div class="card">
        <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>

24 bin/setup.sh
@@ -91,9 +91,9 @@ echo "    This is a helper script which installs the ArchiveBox dependencies on
echo "    You may be prompted for a sudo password in order to install the following:"
echo ""
echo "        - archivebox"
echo "        - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)"
echo "        - curl, wget, git, youtube-dl (used for extracting title, favicon, git, media, and more)"
echo "        - chromium (skips this if any Chrome/Chromium version is already installed)"
echo "        - python3, pip, nodejs, npm (languages used by ArchiveBox, and its extractor modules)"
echo "        - curl, wget, git, youtube-dl, yt-dlp (used for extracting title, favicon, git, media, and more)"
echo "        - chromium (skips this if any Chrome/Chromium version is already installed)"
echo ""
echo "    If you'd rather install these manually as-needed, you can find detailed documentation here:"
echo "        https://github.com/ArchiveBox/ArchiveBox/wiki/Install"

@@ -115,13 +115,13 @@ if which apt-get > /dev/null; then
    fi
    echo
    echo "[+] Installing ArchiveBox system dependencies using apt..."
    sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl ffmpeg git nodejs npm ripgrep
    sudo apt-get install -y git python3 python3-pip python3-distutils wget curl youtube-dl yt-dlp ffmpeg git nodejs npm ripgrep
    sudo apt-get install -y libgtk2.0-0 libgtk-3-0 libnotify-dev libgconf-2-4 libnss3 libxss1 libasound2 libxtst6 xauth xvfb libgbm-dev || sudo apt-get install -y chromium || sudo apt-get install -y chromium-browser || true
    sudo apt-get install -y archivebox
    sudo apt-get --only-upgrade install -y archivebox
    echo ""
    echo "[+] Installing ArchiveBox python dependencies using pip..."
    sudo python3.7 -m pip install --upgrade --ignore-installed archivebox
    echo "[+] Installing ArchiveBox python dependencies using pip3..."
    sudo python3 -m pip install --upgrade --ignore-installed archivebox
# On Mac:
elif which brew > /dev/null; then
    echo "[+] Installing ArchiveBox system dependencies using brew..."

@@ -129,16 +129,16 @@ elif which brew > /dev/null; then
    brew update
    brew install --fetch-HEAD -f archivebox
    echo ""
    echo "[+] Installing ArchiveBox python dependencies using pip..."
    echo "[+] Installing ArchiveBox python dependencies using pip3..."
    python3 -m pip install --upgrade --ignore-installed archivebox
elif which pkg > /dev/null; then
    echo "[+] Installing ArchiveBox system dependencies using pkg..."
    sudo pkg install -y python37 py37-pip py37-sqlite3 node npm wget curl youtube_dl ffmpeg git ripgrep
    echo "[+] Installing ArchiveBox system dependencies using pkg and pip (python3.9)..."
    sudo pkg install -y python3 py39-pip py39-sqlite3 npm wget curl youtube_dl ffmpeg git ripgrep
    sudo pkg install -y chromium
    echo ""
    echo "[+] Installing ArchiveBox python dependencies using pip..."
    sudo python3.7 -m pip install --upgrade --ignore-installed archivebox
    alias python3=python3.7
    # don't use sudo here so that pip installs in $HOME/.local instead of into /usr/local
    python3 -m pip install --upgrade --ignore-installed archivebox
else
    echo "[!] Warning: Could not find aptitude/homebrew/pkg! May not be able to install all dependencies automatically."
    echo ""

@@ -192,7 +192,7 @@ echo "[√] Server started on http://0.0.0.0:8000 and data directory initialized
echo "    cd ~/archivebox"
echo "    ps aux | grep archivebox"
echo "    pkill -f archivebox"
echo "    pip3 install --upgrade archivebox"
echo "    python3 -m pip install --upgrade archivebox"
echo "    archivebox server --quick-init 0.0.0.0:8000"
echo "    archivebox manage createsuperuser"
echo "    archivebox add 'https://example.com'"

@@ -1 +1 @@
Subproject commit 95a1c1a0875841d076f06106bd4c2307504928c2
Subproject commit a4314719746de549f359c2fa975762fc73b62f94

@@ -8,7 +8,7 @@
# Documentation:
#     https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose

version: '2.4'
version: '2.4'  # '3.9' or greater also works

services:
    archivebox:

@@ -23,15 +23,21 @@ services:
        # - SEARCH_BACKEND_ENGINE=sonic    # uncomment these if you enable sonic below
        # - SEARCH_BACKEND_HOST_NAME=sonic
        # - SEARCH_BACKEND_PASSWORD=SecretPassword
        # dns:                             # uncomment to use pihole below for ad/tracker blocking during archiving
        #     - pihole
        volumes:
            - ./data:/data
            # - ./archivebox:/app/archivebox    # for developers working on archivebox

    # To run the Sonic full-text search backend, first download the config file to sonic.cfg

    ### Optional Addons: tweak these examples as needed for your specific use case

    ### Example: To run the Sonic full-text search backend, first download the config file to sonic.cfg
    # curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic.cfg
    # after starting, backfill any existing Snapshots into the index: docker-compose run archivebox update --index-only

    # sonic:
    #     image: valeriansaliou/sonic:v1.3.0
    #     image: valeriansaliou/sonic:v1.3.1
    #     expose:
    #         - 1491
    #     environment:

@@ -39,12 +45,25 @@ services:
    #     volumes:
    #         - ./sonic.cfg:/etc/sonic.cfg:ro
    #         - ./data/sonic:/var/lib/sonic/store


    ### Example: To run pihole in order to block ad/tracker requests during archiving,
    # uncomment this block and set up pihole using its admin interface

    # pihole:
    #     image: pihole/pihole:latest
    #     ports:
    #         - 80:80    # uncomment to access the admin HTTP interface on http://localhost:80
    #     environment:
    #         WEBPASSWORD: 'set a secure password here or it will be random'
    #     volumes:
    #         - ./data/pihole:/etc/pihole
    #         - ./data/dnsmasq:/etc/dnsmasq.d


    ### Optional Addons: tweak these examples as needed for your specific use case
    ### Example: Run scheduled imports in a docker instead of using cron on the
    # host machine, add tasks and see more info with archivebox schedule --help

    # Example: Run scheduled imports in a docker instead of using cron on the
    # host machine, add tasks and see more info with archivebox schedule --help
    # scheduler:
    #     image: archivebox/archivebox:latest
    #     command: schedule --foreground --every=day --depth=1 'https://getpocket.com/users/USERNAME/feed/all'

@@ -54,7 +73,9 @@ services:
    #     volumes:
    #         - ./data:/data

    # Example: Put Nginx in front of the ArchiveBox server for SSL termination

    ### Example: Put Nginx in front of the ArchiveBox server for SSL termination

    # nginx:
    #     image: nginx:alpine
    #     ports:

@@ -64,7 +85,9 @@ services:
    #         - ./etc/nginx/nginx.conf:/etc/nginx/nginx.conf
    #         - ./data:/var/www

    # Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel

    ### Example: run all your ArchiveBox traffic through a WireGuard VPN tunnel

    # wireguard:
    #     image: linuxserver/wireguard
    #     network_mode: 'service:archivebox'

@@ -78,14 +101,16 @@ services:
    #         - /lib/modules:/lib/modules
    #         - ./wireguard.conf:/config/wg0.conf:ro

    # Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox

    ### Example: Run PYWB in parallel and auto-import WARCs from ArchiveBox

    # pywb:
    #     image: webrecorder/pywb:latest
    #     entrypoint: /bin/sh 'wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback --proxy;'
    #     entrypoint: /bin/sh -c '(wb-manager init default || test $$? -eq 2) && wb-manager add default /archivebox/archive/*/warc/*.warc.gz; wayback;'
    #     environment:
    #         - INIT_COLLECTION=archivebox
    #     ports:
    #         - 8080:8080
    #     volumes:
    #         ./data:/archivebox
    #         ./data/wayback:/webarchive
    #         - ./data:/archivebox
    #         - ./data/wayback:/webarchive

@@ -55,7 +55,7 @@
# CURL_BINARY = curl
# GIT_BINARY = git
# WGET_BINARY = wget
# YOUTUBEDL_BINARY = youtube-dl
# YOUTUBEDL_BINARY = yt-dlp
# CHROME_BINARY = chromium

# CHROME_USER_DATA_DIR="~/.config/google-chrome/Default"

282 package-lock.json generated
@@ -5,11 +5,11 @@
  "requires": true,
  "dependencies": {
    "@babel/runtime-corejs2": {
      "version": "7.13.10",
      "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.13.10.tgz",
      "integrity": "sha512-rZw5P1ZewO6XZTDxtXuAuAFUqfNXyM8HO/9WiaDd34Anka0uFTpo0RvBLeV775AEE/zKw3LQB+poZw/O9lrZBg==",
      "version": "7.17.11",
      "resolved": "https://registry.npmjs.org/@babel/runtime-corejs2/-/runtime-corejs2-7.17.11.tgz",
      "integrity": "sha512-pJe8Aerb88TGVi1Xe/AE36aRCPrg+h6ktZPGl6xaJvOfTLcMMuogQu3BYcxeXPTNHhSYbmsDVYBs8CfAxeFFTg==",
      "requires": {
        "core-js": "^2.6.5",
        "core-js": "^2.6.12",
        "regenerator-runtime": "^0.13.4"
      }
    },
@@ -28,9 +28,8 @@
      }
    },
    "@postlight/mercury-parser": {
      "version": "2.2.0",
      "resolved": "https://registry.npmjs.org/@postlight/mercury-parser/-/mercury-parser-2.2.0.tgz",
      "integrity": "sha512-nz6dIvCAaiv74o1vhhp0BRsUe+ysPbZG5mVNpJmgLoI/goOBqRMM3Yg8uXtnv++e7tzKFSXdls8b2/zKk1qL0Q==",
      "version": "git+https://github.com/postlight/mercury-parser.git#9cd9662bcbfea00b773fad691a4f6e53394ff543",
      "from": "git+https://github.com/postlight/mercury-parser.git",
      "requires": {
        "@babel/runtime-corejs2": "^7.2.0",
        "@postlight/ci-failed-test-reporter": "^1.0",
@@ -50,35 +49,7 @@
        "url": "^0.11.0",
        "valid-url": "^1.0.9",
        "wuzzy": "^0.1.4",
        "yargs-parser": "^13.0.0"
      },
      "dependencies": {
        "http-headers": {
          "version": "3.0.2",
          "bundled": true,
          "requires": {
            "next-line": "^1.1.0"
          }
        },
        "jquery": {
          "version": "3.4.1",
          "bundled": true
        },
        "moment": {
          "version": "2.23.0",
          "bundled": true
        },
        "moment-timezone": {
          "version": "0.5.26",
          "bundled": true,
          "requires": {
            "moment": ">= 2.9.0"
          }
        },
        "next-line": {
          "version": "1.1.0",
          "bundled": true
        }
        "yargs-parser": "^15.0.1"
      }
    },
    "@postman/form-data": {
@@ -105,9 +76,9 @@
      "integrity": "sha512-RbzJvlNzmRq5c3O09UipeuXno4tA1FE6ikOjxZK0tuxVv3412l64l5t1W5pj4+rJq9vpkm/kwiR07aZXnsKPxw=="
    },
    "@types/node": {
      "version": "16.0.0",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-16.0.0.tgz",
      "integrity": "sha512-TmCW5HoZ2o2/z2EYi109jLqIaPIi9y/lc2LmDCWzuCi35bcaQ+OtUh6nwBiFK7SOu25FAU5+YKdqFZUwtqGSdg==",
      "version": "17.0.4",
      "resolved": "https://registry.npmjs.org/@types/node/-/node-17.0.4.tgz",
      "integrity": "sha512-6xwbrW4JJiJLgF+zNypN5wr2ykM9/jHcL7rQ8fZe2vuftggjzZeRSM4OwRc6Xk8qWjwJ99qVHo/JgOGmomWRog==",
      "optional": true
    },
    "@types/yauzl": {
@@ -170,9 +141,9 @@
      }
    },
    "ansi-regex": {
      "version": "5.0.0",
      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.0.tgz",
      "integrity": "sha512-bY6fj56OUQ0hU1KjFNDQuJFezqKdrAyFdIevADiqrWHwSlbmBNMHp5ak2f40Pm8JTFyM2mqxkG6ngkHO11f/lg=="
      "version": "5.0.1",
      "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz",
      "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ=="
    },
    "ansi-styles": {
      "version": "4.3.0",
@@ -188,9 +159,9 @@
      "integrity": "sha1-jCpe8kcv2ep0KwTHenUJO6J1fJM="
    },
    "asn1": {
      "version": "0.2.4",
      "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.4.tgz",
      "integrity": "sha512-jxwzQpLQjSmWXgwaCZE9Nz+glAG01yF1QnWgbhGwHI5A6FRIEY6IVqtHhIepHqI7/kyEyQEagBC5mBEFlIYvdg==",
      "version": "0.2.6",
      "resolved": "https://registry.npmjs.org/asn1/-/asn1-0.2.6.tgz",
      "integrity": "sha512-ix/FxPn0MDjeyJ7i/yoHGFt/EX6LyNbxSEhPPXODPL+KB0VPk86UYfL0lMdy+KCnv+fmvIzySwaK5COwqVbWTQ==",
      "requires": {
        "safer-buffer": "~2.1.0"
      }
@@ -445,9 +416,9 @@
      }
    },
    "debug": {
      "version": "4.3.2",
      "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.2.tgz",
      "integrity": "sha512-mOp8wKcvj7XxC78zLgw/ZA+6TSgkoE2C/ienthhRD298T7UNwAg9diBpLRxC0mOezLl4B0xV7M0cCO6P/O0Xhw==",
      "version": "4.3.3",
      "resolved": "https://registry.npmjs.org/debug/-/debug-4.3.3.tgz",
      "integrity": "sha512-/zxw5+vh1Tfv+4Qn7a5nsbcJKPaSvCDhojn6FEl9vupwK2VCSDtEiEtqr8DFtzYFOdz63LBkxec7DYuc2jon6Q==",
      "requires": {
        "ms": "2.1.2"
      }
@@ -515,9 +486,9 @@
      }
    },
    "dompurify": {
      "version": "2.3.0",
      "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.0.tgz",
      "integrity": "sha512-VV5C6Kr53YVHGOBKO/F86OYX6/iLTw2yVSI721gKetxpHCK/V5TaLEf9ODjRgl1KLSWRMY6cUhAbv/c+IUnwQw=="
      "version": "2.3.4",
      "resolved": "https://registry.npmjs.org/dompurify/-/dompurify-2.3.4.tgz",
      "integrity": "sha512-6BVcgOAVFXjI0JTjEvZy901Rghm+7fDQOrNIcxB4+gdhj6Kwp6T9VBhBY/AbagKHJocRkDYGd6wvI+p4/10xtQ=="
    },
    "domutils": {
      "version": "1.5.1",
@@ -702,9 +673,9 @@
      }
    },
    "glob": {
      "version": "7.1.7",
      "resolved": "https://registry.npmjs.org/glob/-/glob-7.1.7.tgz",
      "integrity": "sha512-OvD9ENzPLbegENnYP5UUfJIirTg4+XwMWGaQfQTY0JenxNvvIKP3U3/tAQSPIu/lHxXYSZmpXlUHeqAIdKzBLQ==",
      "version": "7.2.0",
      "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.0.tgz",
      "integrity": "sha512-lmLf6gtyrPq8tTjSmrO94wBeQbFR3HbLHbuyD69wuyQkImp2hWqMGB47OX65FBkPffO641IP9jWa1z4ivqG26Q==",
      "requires": {
        "fs.realpath": "^1.0.0",
        "inflight": "^1.0.4",
@@ -729,9 +700,9 @@
      }
    },
    "heap": {
      "version": "0.2.6",
      "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.6.tgz",
      "integrity": "sha1-CH4fELBGky/IWU3Z5tN4r8nR5aw="
      "version": "0.2.7",
      "resolved": "https://registry.npmjs.org/heap/-/heap-0.2.7.tgz",
      "integrity": "sha512-2bsegYkkHO+h/9MGbn6KWcE45cHZgPANo5LXF7EvWdT0yT2EguSVO1nDgU5c8+ZOPwp2vMNa7YFsJhVcDR9Sdg=="
    },
    "html-encoding-sniffer": {
      "version": "1.0.2",
@@ -773,12 +744,12 @@
      }
    },
    "http-signature": {
      "version": "1.3.5",
      "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.5.tgz",
      "integrity": "sha512-NwoTQYSJoFt34jSBbwzDHDofoA61NGXzu6wXh95o1Ry62EnmKjXb/nR/RknLeZ3G/uGwrlKNY2z7uPt+Cdl7Tw==",
      "version": "1.3.6",
      "resolved": "https://registry.npmjs.org/http-signature/-/http-signature-1.3.6.tgz",
      "integrity": "sha512-3adrsD6zqo4GsTqtO7FyrejHNv+NgiIfAfv68+jVlFmSr9OGy7zrxONceFRLKvnnZA5jbxQBX1u9PpB6Wi32Gw==",
      "requires": {
        "assert-plus": "^1.0.0",
        "jsprim": "^1.2.2",
        "jsprim": "^2.0.2",
        "sshpk": "^1.14.1"
      }
    },
@@ -848,6 +819,11 @@
      "resolved": "https://registry.npmjs.org/isstream/-/isstream-0.1.2.tgz",
      "integrity": "sha1-R+Y/evVa+m+S4VAOaQ64uFKcCZo="
    },
    "jquery": {
      "version": "3.6.0",
      "resolved": "https://registry.npmjs.org/jquery/-/jquery-3.6.0.tgz",
      "integrity": "sha512-JVzAR/AjBvVt2BmYhxRCSYysDsPcssdmTFnzyLEts9qNwmjmu4JTAMYubEfwVOSwpQ1I1sKKFcxhZCI2buerfw=="
    },
    "jsbn": {
      "version": "0.1.1",
      "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-0.1.1.tgz",
@@ -887,9 +863,9 @@
      }
    },
    "json-schema": {
      "version": "0.2.3",
      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.2.3.tgz",
      "integrity": "sha1-tIDIkuWaLwWVTOcnvT8qTogvnhM="
      "version": "0.4.0",
      "resolved": "https://registry.npmjs.org/json-schema/-/json-schema-0.4.0.tgz",
      "integrity": "sha512-es94M3nTIfsEPisRafak+HDLfHXnKBhV3vU5eqPcS3flIWqcxJWgXHXiey3YrpaNsanY5ei1VoYEbOzijuq9BA=="
    },
    "json-schema-traverse": {
      "version": "0.4.1",
@@ -902,20 +878,20 @@
      "integrity": "sha1-Epai1Y/UXxmg9s4B1lcB4sc1tus="
    },
    "jsprim": {
      "version": "1.4.1",
      "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.1.tgz",
      "integrity": "sha1-MT5mvB5cwG5Di8G3SZwuXFastqI=",
      "version": "2.0.2",
      "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-2.0.2.tgz",
      "integrity": "sha512-gqXddjPqQ6G40VdnI6T6yObEC+pDNvyP95wdQhkWkg7crHH3km5qP1FsOXEkzEQwnz6gz5qGTn1c2Y52wP3OyQ==",
      "requires": {
        "assert-plus": "1.0.0",
        "extsprintf": "1.3.0",
        "json-schema": "0.2.3",
        "json-schema": "0.4.0",
        "verror": "1.10.0"
      }
    },
    "jszip": {
      "version": "3.6.0",
      "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.6.0.tgz",
      "integrity": "sha512-jgnQoG9LKnWO3mnVNBnfhkh0QknICd1FGSrXcgrl67zioyJ4wgx25o9ZqwNtrROSflGBCGYnJfjrIyRIby1OoQ==",
      "version": "3.7.1",
      "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.7.1.tgz",
      "integrity": "sha512-ghL0tz1XG9ZEmRMcEN2vt7xabrDdqHHeykgARpmZ0BiIctWxM47Vt63ZO2dnp4QYt/xJVLLy5Zv1l/xRdh2byg==",
      "requires": {
        "lie": "~3.3.0",
        "pako": "~1.0.2",
@@ -1078,11 +1054,24 @@
      "resolved": "https://registry.npmjs.org/mkdirp-classic/-/mkdirp-classic-0.5.3.tgz",
      "integrity": "sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A=="
    },
    "moment": {
      "version": "2.29.3",
      "resolved": "https://registry.npmjs.org/moment/-/moment-2.29.3.tgz",
      "integrity": "sha512-c6YRvhEo//6T2Jz/vVtYzqBzwvPT95JBQ+smCytzf7c50oMZRsR/a4w88aD34I+/QVSfnoAnSBFPJHItlOMJVw=="
    },
    "moment-parseformat": {
      "version": "3.0.0",
      "resolved": "https://registry.npmjs.org/moment-parseformat/-/moment-parseformat-3.0.0.tgz",
      "integrity": "sha512-dVgXe6b6DLnv4CHG7a1zUe5mSXaIZ3c6lSHm/EKeVeQI2/4pwe0VRde8OyoCE1Ro2lKT5P6uT9JElF7KDLV+jw=="
    },
    "moment-timezone": {
      "version": "0.5.26",
      "resolved": "https://registry.npmjs.org/moment-timezone/-/moment-timezone-0.5.26.tgz",
      "integrity": "sha512-sFP4cgEKTCymBBKgoxZjYzlSovC20Y6J7y3nanDc5RoBIXKlZhoYwBoZGe3flwU6A372AcRwScH8KiwV6zjy1g==",
      "requires": {
        "moment": ">= 2.9.0"
      }
    },
    "ms": {
      "version": "2.1.2",
      "resolved": "https://registry.npmjs.org/ms/-/ms-2.1.2.tgz",
@@ -1094,9 +1083,33 @@
      "integrity": "sha1-/K5XhTBStqm66CCOQN19PC0wRgM="
    },
    "node-fetch": {
      "version": "2.6.1",
      "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.1.tgz",
      "integrity": "sha512-V4aYg89jEoVRxRb2fJdAg8FHvI7cEyYdVAh94HH0UIK8oJxUfkjlDQN9RbMx+bEjP7+ggMiFRprSti032Oipxw=="
      "version": "2.6.7",
      "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.6.7.tgz",
      "integrity": "sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==",
      "requires": {
        "whatwg-url": "^5.0.0"
      },
      "dependencies": {
        "tr46": {
          "version": "0.0.3",
          "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz",
          "integrity": "sha1-gYT9NH2snNwYWZLzpmIuFLnZq2o="
        },
        "webidl-conversions": {
          "version": "3.0.1",
          "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz",
          "integrity": "sha1-JFNCdeKnvGvnvIZhHMFq4KVlSHE="
        },
        "whatwg-url": {
          "version": "5.0.0",
          "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz",
          "integrity": "sha1-lmRU6HZUYuN2RNNib2dCzotwll0=",
          "requires": {
            "tr46": "~0.0.3",
            "webidl-conversions": "^3.0.0"
          }
        }
      }
    },
    "nth-check": {
      "version": "1.0.2",
@@ -1207,9 +1220,9 @@
      "integrity": "sha512-2qHaIQr2VLRFoxe2nASzsV6ef4yOOH+Fi9FBOVH6cqeSgUnoyySPZkxzLuzd+RYOQTRpROA0ztTMqxROKSb/nA=="
    },
    "postman-request": {
      "version": "2.88.1-postman.29",
      "resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.29.tgz",
      "integrity": "sha512-QuL3+AvGlmPLb1Qf0t/rM8M4U8LCYbADZBijUNToLl6l37i65KH8wY1gTLWLxlw2I6ugxUfX2Zyyk5/J5HFZIg==",
      "version": "2.88.1-postman.31",
      "resolved": "https://registry.npmjs.org/postman-request/-/postman-request-2.88.1-postman.31.tgz",
      "integrity": "sha512-OJbYqP7ItxQ84yHyuNpDywCZB0HYbpHJisMQ9lb1cSL3N5H3Td6a2+3l/a74UMd3u82BiGC5yQyYmdOIETP/nQ==",
      "requires": {
        "@postman/form-data": "~3.1.1",
        "@postman/tunnel-agent": "^0.6.3",
@@ -1308,16 +1321,16 @@
        }
      },
      "ws": {
        "version": "7.5.2",
        "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
        "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
        "version": "7.5.6",
        "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
        "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
      }
      }
    },
    "qs": {
      "version": "6.5.2",
      "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.2.tgz",
      "integrity": "sha512-N5ZAX4/LxJmF+7wN74pUD6qAh9/wnvdQcjq9TZjevvXzSUo7bfmw91saqMjzGS2xq91/odN2dW/WOl7qQHNDGA=="
      "version": "6.5.3",
      "resolved": "https://registry.npmjs.org/qs/-/qs-6.5.3.tgz",
      "integrity": "sha512-qxXIEh4pCGfHICj1mAJQ2/2XVZkjCDTcEgfoSQxc/fYivUZxTkk7L3bDBJSoNrEzXI17oUO5Dp07ktqE5KzczA=="
    },
    "querystring": {
      "version": "0.2.0",
@@ -1334,9 +1347,9 @@
      },
      "dependencies": {
        "acorn": {
          "version": "8.4.1",
          "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz",
          "integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA=="
          "version": "8.6.0",
          "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz",
          "integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw=="
        },
        "acorn-globals": {
          "version": "6.0.0",
@@ -1417,9 +1430,9 @@
          }
        },
        "estraverse": {
          "version": "5.2.0",
          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz",
          "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ=="
          "version": "5.3.0",
          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
          "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="
        },
        "form-data": {
          "version": "3.0.1",
@@ -1440,9 +1453,9 @@
          }
        },
        "jsdom": {
          "version": "16.6.0",
          "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz",
          "integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==",
          "version": "16.7.0",
          "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz",
          "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==",
          "requires": {
            "abab": "^2.0.5",
            "acorn": "^8.2.4",
@@ -1469,7 +1482,7 @@
            "whatwg-encoding": "^1.0.5",
            "whatwg-mimetype": "^2.3.0",
            "whatwg-url": "^8.5.0",
            "ws": "^7.4.5",
            "ws": "^7.4.6",
            "xml-name-validator": "^3.0.0"
          }
        },
@@ -1512,9 +1525,9 @@
          }
        },
        "ws": {
          "version": "7.5.2",
          "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
          "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
          "version": "7.5.6",
          "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
          "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
        }
      }
    },
@@ -1529,9 +1542,9 @@
      }
    },
    "regenerator-runtime": {
      "version": "0.13.7",
      "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.7.tgz",
      "integrity": "sha512-a54FxoJDIr27pgf7IgeQGxmqUNYrcV338lf/6gH456HZ/PhX+5BcwHXG9ajESmwe6WRO0tAzRUrRmNONWgkrew=="
      "version": "0.13.9",
      "resolved": "https://registry.npmjs.org/regenerator-runtime/-/regenerator-runtime-0.13.9.tgz",
      "integrity": "sha512-p3VT+cOEgxFsRRA9X4lkI1E+k2/CtnKtU4gcxyaCUreilL/vqI6CdZ3wxVUx3UOUg+gnUOQQcRI7BmSI656MYA=="
    },
    "request": {
      "version": "2.88.2",
@@ -1569,6 +1582,17 @@
            "jsprim": "^1.2.2",
            "sshpk": "^1.7.0"
          }
        },
        "jsprim": {
          "version": "1.4.2",
          "resolved": "https://registry.npmjs.org/jsprim/-/jsprim-1.4.2.tgz",
          "integrity": "sha512-P2bSOMAc/ciLz6DzgjVlGJP9+BrJWu5UDGK70C2iweC5QBIeFf0ZXRvGjEj2uYgrY2MkAAhsSWHDWlFtEroZWw==",
          "requires": {
            "assert-plus": "1.0.0",
            "extsprintf": "1.3.0",
            "json-schema": "0.4.0",
            "verror": "1.10.0"
          }
        }
      }
    },
@@ -1683,9 +1707,9 @@
      },
      "dependencies": {
        "acorn": {
          "version": "8.4.1",
          "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.4.1.tgz",
          "integrity": "sha512-asabaBSkEKosYKMITunzX177CXxQ4Q8BSSzMTKD+FefUhipQC70gfW5SiUDhYQ3vk8G+81HqQk7Fv9OXwwn9KA=="
          "version": "8.6.0",
          "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.6.0.tgz",
          "integrity": "sha512-U1riIR+lBSNi3IbxtaHOIKdH8sLFv3NYfNv8sg7ZsNhcfl4HF2++BfqqrNAxoCLQW1iiylOj76ecnaUxz+z9yw=="
        },
        "acorn-globals": {
          "version": "6.0.0",
@@ -1766,9 +1790,9 @@
          }
        },
        "estraverse": {
          "version": "5.2.0",
          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.2.0.tgz",
          "integrity": "sha512-BxbNGGNm0RyRYvUdHpIwv9IWzeM9XClbOxwoATuFdOE7ZE6wHL+HQ5T8hoPM+zHvmKzzsEqhgy0GrQ5X13afiQ=="
          "version": "5.3.0",
          "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz",
          "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA=="
        },
        "form-data": {
          "version": "3.0.1",
@@ -1797,9 +1821,9 @@
          }
        },
        "jsdom": {
          "version": "16.6.0",
          "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.6.0.tgz",
          "integrity": "sha512-Ty1vmF4NHJkolaEmdjtxTfSfkdb8Ywarwf63f+F8/mDD1uLSSWDxDuMiZxiPhwunLrn9LOSVItWj4bLYsLN3Dg==",
          "version": "16.7.0",
          "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-16.7.0.tgz",
          "integrity": "sha512-u9Smc2G1USStM+s/x1ru5Sxrl6mPYCbByG1U/hUmqaVsm4tbNyS7CicOSRyuGQYZhTu0h84qkZZQ/I+dzizSVw==",
          "requires": {
            "abab": "^2.0.5",
            "acorn": "^8.2.4",
@@ -1826,7 +1850,7 @@
            "whatwg-encoding": "^1.0.5",
            "whatwg-mimetype": "^2.3.0",
            "whatwg-url": "^8.5.0",
            "ws": "^7.4.5",
            "ws": "^7.4.6",
            "xml-name-validator": "^3.0.0"
          }
        },
@@ -1869,9 +1893,9 @@
          }
        },
        "ws": {
          "version": "7.5.2",
          "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.2.tgz",
          "integrity": "sha512-lkF7AWRicoB9mAgjeKbGqVUekLnSNO4VjKVnuPHpQeOxZOErX6BPXwJk70nFslRCEEA8EVW7ZjKwXaP9N+1sKQ=="
          "version": "7.5.6",
          "resolved": "https://registry.npmjs.org/ws/-/ws-7.5.6.tgz",
          "integrity": "sha512-6GLgCqo2cy2A2rjCNFlxQS6ZljG/coZfZXclldI8FB/1G3CCI36Zd8xy2HrFVACi8tfk5XrgLQEk+P0Tnz9UcA=="
        }
      }
    },
@@ -1882,9 +1906,9 @@
      "optional": true
    },
    "sshpk": {
      "version": "1.16.1",
      "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.16.1.tgz",
      "integrity": "sha512-HXXqVUq7+pcKeLqqZj6mHFUMvXtOJt1uoUx09pFW6011inTMxqI8BA8PM95myrIyyKwdnzjdFjLiE6KBPVtJIg==",
      "version": "1.17.0",
      "resolved": "https://registry.npmjs.org/sshpk/-/sshpk-1.17.0.tgz",
      "integrity": "sha512-/9HIEs1ZXGhSPE8X6Ccm7Nam1z8KcoCqPdI7ecm1N33EzAetWahvQWVqLZtaZQ+IDKX4IyA2o0gBzqIMkAagHQ==",
      "requires": {
        "asn1": "~0.2.3",
        "assert-plus": "^1.0.0",
@@ -1916,13 +1940,13 @@
      "integrity": "sha1-PYRT5ydKLkShQrPchEnftk2a3jo="
    },
    "string-width": {
      "version": "4.2.2",
      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.2.tgz",
      "integrity": "sha512-XBJbT3N4JhVumXE0eoLU9DCjcaF92KLNqTmFCnG1pf8duUxFGwtP6AD6nkjw9a3IdiRtL3E2w3JDiE/xi3vOeA==",
      "version": "4.2.3",
      "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz",
      "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==",
      "requires": {
        "emoji-regex": "^8.0.0",
        "is-fullwidth-code-point": "^3.0.0",
        "strip-ansi": "^6.0.0"
        "strip-ansi": "^6.0.1"
      }
    },
    "string_decoder": {
@@ -1934,11 +1958,11 @@
      }
    },
    "strip-ansi": {
      "version": "6.0.0",
      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.0.tgz",
      "integrity": "sha512-AuvKTrTfQNYNIctbR1K/YGTR1756GycPsg7b9bdV9Duqur4gv6aKqHXah67Z8ImS7WEz5QVcOtlfW2rZEugt6w==",
      "version": "6.0.1",
      "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz",
      "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==",
      "requires": {
        "ansi-regex": "^5.0.0"
        "ansi-regex": "^5.0.1"
      }
    },
    "strong-data-uri": {
@@ -2187,9 +2211,9 @@
      }
    },
    "wuzzy": {
      "version": "0.1.6",
      "resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.6.tgz",
      "integrity": "sha512-x1lDcj0VvzJ1ygDpd9LWMnQVei6gEkUbCcZUG8TPnXhlPbaQWQa32ab/6xbm/samxJ2T3Y2+P3xHeeQIAcEvqQ==",
      "version": "0.1.8",
      "resolved": "https://registry.npmjs.org/wuzzy/-/wuzzy-0.1.8.tgz",
      "integrity": "sha512-FUzKQepFSTnANsDYwxpIzGJ/dIJaqxuMre6tzzbvWwFAiUHPsI1nVQVCLK4Xqr67KO7oYAK0kaCcI/+WYj/7JA==",
      "requires": {
        "lodash": "^4.17.15"
      }
@@ -2231,9 +2255,9 @@
      }
    },
    "yargs-parser": {
      "version": "13.1.2",
      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-13.1.2.tgz",
      "integrity": "sha512-3lbsNRf/j+A4QuSZfDRA7HRSfWrzO0YjqTJd5kjAq37Zep1CEgaYmrH9Q3GwPiB9cHyd1Y1UwggGhJGoxipbzg==",
      "version": "15.0.3",
      "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-15.0.3.tgz",
      "integrity": "sha512-/MVEVjTXy/cGAjdtQf8dW3V9b97bPN7rNn8ETj6BmAQL7ibC7O1Q9SPJbGjgh3SlwoBNXMzj/ZGIj8mBgl12YA==",
      "requires": {
        "camelcase": "^5.0.0",
        "decamelize": "^1.2.0"

@@ -6,7 +6,7 @@
  "repository": "github:ArchiveBox/ArchiveBox",
  "license": "MIT",
  "dependencies": {
    "@postlight/mercury-parser": "^2.2.0",
    "@postlight/mercury-parser": "git+https://github.com/postlight/mercury-parser.git",
    "readability-extractor": "git+https://github.com/ArchiveBox/readability-extractor.git",
    "single-file": "git+https://github.com/gildas-lormeau/SingleFile.git"
  }

1 setup.py
@@ -42,6 +42,7 @@ INSTALL_REQUIRES = [
    "django-extensions>=3.0.3",
    "dateparser>=1.0.0",
    "youtube-dl>=2021.04.17",
    "yt-dlp>=2021.4.11",
    "python-crontab>=2.5.1",
    "croniter>=0.3.34",
    "w3lib>=1.22.0",

@@ -5,7 +5,7 @@ Package3: archivebox
Suite: focal
Suite3: focal
Build-Depends: debhelper, dh-python, python3-all, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
Depends3: nodejs, wget, curl, git, ffmpeg, youtube-dl, yt-dlp, python3-all, python3-pip, python3-setuptools, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
X-Python3-Version: >= 3.7
XS-Python-Version: >= 3.7
Setup-Env-Vars: DEB_BUILD_OPTIONS=nocheck