Merge branch 'v0.5.0' of github.com:ArchiveBox/ArchiveBox into feat-snapshots-grid

This commit is contained in:
jdcaballerov 2020-12-14 15:05:19 -05:00
commit 7b66e1514d
32 changed files with 401 additions and 237 deletions

View file

@ -4,10 +4,12 @@ on:
workflow_dispatch: workflow_dispatch:
push: push:
env:
DEB_BUILD_OPTIONS: nocheck
jobs: jobs:
build: build:
runs-on: ubuntu-latest runs-on: ubuntu-20.04
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
@ -15,31 +17,60 @@ jobs:
submodules: true submodules: true
fetch-depth: 1 fetch-depth: 1
- name: Set up Python - name: Install packaging dependencies
uses: actions/setup-python@v1
with:
python-version: 3.9
architecture: x64
- name: Build Debian/Apt package
run: | run: |
sudo apt install -y python3 python3-dev python3-pip python3-venv python3-all dh-python debhelper devscripts dput software-properties-common python3-setuptools python3-wheel python3-stdeb sudo apt install -y \
pip3 install --upgrade pip setuptools wheel stdeb python3 python3-dev python3-pip python3-venv python3-all \
./bin/build_deb.sh dh-python debhelper devscripts dput software-properties-common \
python3-distutils python3-setuptools python3-wheel python3-stdeb
- name: Build Debian/Apt sdist_dsc
run: |
rm -Rf deb_dist/*
python3 setup.py --command-packages=stdeb.command sdist_dsc
- name: Build Debian/Apt bdist_deb
run: |
python3 setup.py --command-packages=stdeb.command bdist_deb
- name: Install archivebox from deb - name: Install archivebox from deb
run: | run: |
apt install deb_dist/archivebox*.deb cd deb_dist/
sudo apt install ./archivebox*.deb
- name: Check ArchiveBox version
run: |
# must create dir needed for snaps to run as non-root on github actions
sudo mkdir -p /run/user/1001 && sudo chmod -R 777 /run/user/1001
mkdir "${{ github.workspace }}/data" && cd "${{ github.workspace }}/data"
archivebox init
archivebox config --set SAVE_READABILITY=False
archivebox config --set SAVE_MERCURY=False
archivebox config --set SAVE_SINGLEFILE=False
archivebox version
- name: Add some links to test - name: Add some links to test
run: | run: |
mkdir data && cd data cd "${{ github.workspace }}/data"
archivebox init
archivebox add 'https://example.com' archivebox add 'https://example.com'
archivebox version
archivebox status archivebox status
# TODO: push debian package to launchpad PPA # - name: Commit built package
# - name: Push to launchpad
# run: | # run: |
# cd deb_dist/
# git config --local user.email "action@github.com"
# git config --local user.name "GitHub Action"
# git commit -m "Debian package autobuild" -a
# - name: Push build to Github
# uses: ad-m/github-push-action@master
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# repository: ArchiveBox/debian-archivebox
# branch: ${{ github.ref }}
# directory: deb_dist
# - name: Push build to Launchpad PPA
# run: |
# debsign -k "$PGP_KEY_ID" "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"
# dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes" # dput archivebox "deb_dist/archivebox_${VERSION}-${DEBIAN_VERSION}_source.changes"

View file

@ -15,12 +15,14 @@ jobs:
submodules: true submodules: true
fetch-depth: 1 fetch-depth: 1
# TODO: modify archivebox.rb to update src url, hashes, and dependencies
- name: Build Homebrew Bottle - name: Build Homebrew Bottle
run: | run: |
pip3 install --upgrade pip setuptools wheel pip3 install --upgrade pip setuptools wheel
cd brew_dist/ cd brew_dist/
brew install --build-bottle ./archivebox.rb brew install --build-bottle ./archivebox.rb
brew bottle archivebox # brew bottle archivebox
- name: Add some links to test - name: Add some links to test
run: | run: |
@ -30,4 +32,19 @@ jobs:
archivebox version archivebox version
archivebox status archivebox status
# TODO: push bottle to Github and open homebrew core PR with latest changes # - name: Commit built package
# run: |
# cd brew_dist/
# git config --local user.email "action@github.com"
# git config --local user.name "GitHub Action"
# git commit -m "Homebrew package autobuild" -a
# - name: Push build to Github
# uses: ad-m/github-push-action@master
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# repository: ArchiveBox/homebrew-archivebox
# branch: ${{ github.ref }}
# directory: brew_dist
# TODO: push bottle homebrew core PR with latest changes

View file

@ -9,7 +9,7 @@ env:
jobs: jobs:
lint: lint:
runs-on: ubuntu-latest runs-on: ubuntu-20.04
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
with: with:

View file

@ -7,7 +7,7 @@ on:
jobs: jobs:
build: build:
runs-on: ubuntu-latest runs-on: ubuntu-20.04
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
@ -24,6 +24,7 @@ jobs:
- name: Build Python Package - name: Build Python Package
run: | run: |
pip3 install --upgrade pip setuptools wheel pip3 install --upgrade pip setuptools wheel
rm -Rf pip_dist/*.whl
python3 setup.py \ python3 setup.py \
sdist --dist-dir=./pip_dist \ sdist --dist-dir=./pip_dist \
bdist_wheel --dist-dir=./pip_dist \ bdist_wheel --dist-dir=./pip_dist \
@ -38,4 +39,23 @@ jobs:
archivebox version archivebox version
archivebox status archivebox status
# TODO: push to PyPI with twine # - name: Commit built package
# run: |
# cd pip_dist/
# git config --local user.email "action@github.com"
# git config --local user.name "GitHub Action"
# git commit -m "Pip package autobuild" -a
# - name: Push build to Github
# uses: ad-m/github-push-action@master
# with:
# github_token: ${{ secrets.GITHUB_TOKEN }}
# repository: ArchiveBox/pip-archivebox
# branch: ${{ github.ref }}
# directory: pip_dist
# - name: Push build to PyPI
# run: |
# cd pip_dist/
# python3 -m twine upload --repository testpypi pip_dist/*.{whl,tar.gz}
# python3 -m twine upload --repository pypi pip_dist/*.{whl,tar.gz}

View file

@ -3,6 +3,9 @@ on: [push]
env: env:
DOCKER_IMAGE: archivebox-ci DOCKER_IMAGE: archivebox-ci
PYTHONIOENCODING: utf-8
PYTHONLEGACYWINDOWSSTDIO: utf-8
USE_COLOR: False
jobs: jobs:
python_tests: python_tests:
@ -10,8 +13,8 @@ jobs:
strategy: strategy:
matrix: matrix:
os: [ubuntu-latest, macos-latest] os: [ubuntu-20.04, macos-latest, windows-latest]
python: [3.7, 3.8] python: [3.7]
steps: steps:
- uses: actions/checkout@v2 - uses: actions/checkout@v2
@ -77,10 +80,15 @@ jobs:
- name: Directory listing for debugging - name: Directory listing for debugging
run: | run: |
pwd pwd
ls -a ./ ls
- name: Archivebox version
run: |
archivebox version archivebox version
- name: Test built package with pytest - name: Test built package with pytest
# TODO: remove this exception for windows once we get tests passing on that platform
if: ${{ !contains(matrix.os, 'windows') }}
run: | run: |
python -m pytest -s python -m pytest -s
@ -102,8 +110,8 @@ jobs:
- name: Init data dir - name: Init data dir
run: | run: |
mkdir data mkdir "${{ github.workspace }}/data"
docker run -v "$PWD"/data:/data "$DOCKER_IMAGE" init docker run -v "${{ github.workspace }}/data":/data "$DOCKER_IMAGE" init
- name: Run test server - name: Run test server
run: | run: |

2
.gitignore vendored
View file

@ -3,6 +3,7 @@
*.pyc *.pyc
__pycache__/ __pycache__/
.mypy_cache/ .mypy_cache/
tests/out/
# Python and Node dependencies # Python and Node dependencies
venv/ venv/
@ -11,6 +12,7 @@ venv/
node_modules/ node_modules/
# Packaging artifacts # Packaging artifacts
archivebox.egg-info
archivebox-*.tar.gz archivebox-*.tar.gz
build/ build/
dist/ dist/

311
README.md
View file

@ -26,62 +26,175 @@
<hr/> <hr/>
</div> </div>
ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects. ArchiveBox can be installed via [Docker](https://docs.docker.com/get-docker/) (recommended), [`apt`](https://launchpad.net/~archivebox/+archive/ubuntu/archivebox/+packages), [`brew`](https://github.com/ArchiveBox/homebrew-archivebox), or [`pip`](https://www.python.org/downloads/). It works on macOS, Windows, and Linux/BSD (both armv7 and amd64). ArchiveBox is a powerful self-hosted internet archiving solution written in Python 3. You feed it URLs of pages you want to archive, and it saves them to disk in a variety of formats depending on the configuration and the content it detects.
Once installed, URLs can be added via the command line `archivebox add` or the built-in Web UI `archivebox server`. It can ingest bookmarks from a service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. Your archive can be managed through the command line with commands like `archivebox add`, through the built-in Web UI `archivebox server`, or via the Python library API (beta). It can ingest bookmarks from a browser or service like Pocket/Pinboard, your entire browsing history, RSS feeds, or URLs one at a time. You can also schedule regular/realtime imports with `archivebox schedule`.
The main index is a self-contained `data/index.sqlite3` file, and each snapshot is stored as a folder `data/archive/<timestamp>/`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: 3 types of HTML snapshots (wget, Chrome headless, singlefile), a PDF snapshot, a screenshot, a WARC archive, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python API. The main index is a self-contained `index.sqlite3` file, and each snapshot is stored as a folder `data/archive/<timestamp>/`, with an easy-to-read `index.html` and `index.json` within. For each page, ArchiveBox auto-extracts many types of assets/media and saves them in standard formats, with out-of-the-box support for: several types of HTML snapshots (wget, Chrome headless, singlefile), PDF snapshotting, screenshotting, WARC archiving, git repositories, images, audio, video, subtitles, article text, and more. The snapshots are browseable and manageable offline through the filesystem, the built-in webserver, or the Python library API.
#### Quickstart ### Quickstart
It works on Linux/BSD (Intel and ARM CPUs with `docker`/`apt`/`pip3`), macOS (with `docker`/`brew`/`pip3`), and Windows (beta with `docker`/`pip3`).
**First, get ArchiveBox using your system package manager, Docker, or pip:**
```bash ```bash
# You can run it with Docker or Docker Compose (recommended) pip3 install archivebox
docker pull archivebox/archivebox archivebox --version
# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml # install extras as-needed, or use one of full setup methods below to get everything out-of-the-box
# or Ubuntu/Debian mkdir ~/archivebox && cd ~/archivebox # this can be anywhere
archivebox init
archivebox add 'https://example.com'
archivebox add --depth=1 'https://example.com'
archivebox schedule --every=day https://getpocket.com/users/USERNAME/feed/all
archivebox oneshot --extract=title,favicon,media https://www.youtube.com/watch?v=dQw4w9WgXcQ
archivebox help # to see more options
```
*(click to expand the sections below for full setup instructions)*
<details>
<summary><b>Get ArchiveBox with <code>docker-compose</code> on any platform (recommended, everything included out-of-the-box)</b></summary>
First make sure you have Docker installed: https://docs.docker.com/get-docker/
<br/><br/>
This is the recommended way to run ArchiveBox because it includes *all* the extractors like chrome, wget, youtube-dl, git, etc., as well as full-text search with sonic, and many other great features.
```bash
# create a new empty directory and initialize your collection (can be anywhere)
mkdir ~/archivebox && cd ~/archivebox
curl -O https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml
docker-compose run archivebox init
docker-compose run archivebox --version
# start the webserver and open the UI (optional)
docker-compose run archivebox manage createsuperuser
docker-compose up -d
open http://127.0.0.1:8000
# you can also add links and manage your archive via the CLI:
docker-compose run archivebox add 'https://example.com'
docker-compose run archivebox status
docker-compose run archivebox help # to see more options
```
</details>
<details>
<summary><b>Get ArchiveBox with <code>docker</code> on any platform</b></summary>
First make sure you have Docker installed: https://docs.docker.com/get-docker/<br/>
```bash
# create a new empty directory and initialize your collection (can be anywhere)
mkdir ~/archivebox && cd ~/archivebox
docker run -v $PWD:/data -it archivebox/archivebox init
docker run -v $PWD:/data -it archivebox/archivebox --version
# start the webserver and open the UI (optional)
docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
docker run -v $PWD:/data -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
# you can also add links and manage your archive via the CLI:
docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
docker run -v $PWD:/data -it archivebox/archivebox status
docker run -v $PWD:/data -it archivebox/archivebox help # to see more options
```
</details>
<details>
<summary><b>Get ArchiveBox with <code>apt</code> on Ubuntu >=20.04</b></summary>
```bash
sudo add-apt-repository -u ppa:archivebox/archivebox sudo add-apt-repository -u ppa:archivebox/archivebox
apt install archivebox sudo apt install archivebox
# or macOS # create a new empty directory and initialize your collection (can be anywhere)
mkdir ~/archivebox && cd ~/archivebox
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
archivebox init
archivebox --version
# start the webserver and open the web UI (optional)
archivebox manage createsuperuser
archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
# you can also add URLs and manage the archive via the CLI and filesystem:
archivebox add 'https://example.com'
archivebox status
archivebox list --html --with-headers > index.html
archivebox list --json --with-headers > index.json
archivebox help # to see more options
```
For other Debian-based systems or older Ubuntu systems you can add these sources to `/etc/apt/sources.list`:
```bash
deb http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
deb-src http://ppa.launchpad.net/archivebox/archivebox/ubuntu focal main
```
(you may need to install some other dependencies manually however)
</details>
<details>
<summary><b>Get ArchiveBox with <code>brew</code> on macOS >=10.13</b></summary>
```bash
brew install archivebox/archivebox/archivebox brew install archivebox/archivebox/archivebox
# or for the Python version only, without wget/git/chrome/etc. included # create a new empty directory and initialize your collection (can be anywhere)
mkdir ~/archivebox && cd ~/archivebox
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
archivebox init
archivebox --version
# start the webserver and open the web UI (optional)
archivebox manage createsuperuser
archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
# you can also add URLs and manage the archive via the CLI and filesystem:
archivebox add 'https://example.com'
archivebox status
archivebox list --html --with-headers > index.html
archivebox list --json --with-headers > index.json
archivebox help # to see more options
```
</details>
<details>
<summary><b>Get ArchiveBox with <code>pip</code> on any platform</b></summary>
```bash
pip3 install archivebox pip3 install archivebox
# If you're using an apt/brew/pip install you can run archivebox commands normally # create a new empty directory and initialize your collection (can be anywhere)
# archivebox [subcommand] [...args] mkdir ~/archivebox && cd ~/archivebox
# If you're using Docker you'll have to run the commands like this npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
# docker run -v $PWD:/data -it archivebox/archivebox [subcommand] [...args]
# And the equivalent in Docker Compose:
# docker-compose run archivebox [subcommand] [...args]
```
<small>Check that everything installed correctly with `archivebox --version`</small>
**To start using archivebox, you have to create a data folder and `cd` into it:**
```bash
mkdir ~/archivebox && cd ~/archivebox # you can put the collection dir anywhere
archivebox init archivebox init
archivebox --version
# Install any missing extras like wget/git/chrome/etc. manually as needed
# start the webserver and open the web UI (optional)
archivebox manage createsuperuser
archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
# you can also add URLs and manage the archive via the CLI and filesystem:
archivebox add 'https://example.com'
archivebox status
archivebox list --html --with-headers > index.html
archivebox list --json --with-headers > index.json
archivebox help # to see more options
``` ```
**Then Add some URLs to your archive collection:** </details>
```bash
archivebox add https://github.com/ArchiveBox/ArchiveBox ---
archivebox add --depth=1 https://example.com
```
**View the snapshots of the URLs you added via the self-hosted web UI:**
```bash
archivebox manage createsuperuser # create an admin acct
archivebox server 0.0.0.0:8000 # start the web server
open http://127.0.0.1:8000/ # open the interactive admin panel
ls ~/archivebox/archive/*/index.html # or browse the snapshots on disk
```
<div align="center"> <div align="center">
<img src="https://i.imgur.com/lUuicew.png" width="400px"> <img src="https://i.imgur.com/lUuicew.png" width="400px">
<br/> <br/>
@ -97,9 +210,9 @@ For more information, see the <a href="https://github.com/ArchiveBox/ArchiveBox/
ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It can be installed on Docker, macOS, and Linux/BSD, and Windows. You can download and install it as a Debian/Ubuntu package, Homebrew package, Python3 package, or a Docker image. No matter which install method you choose, they all provide the same CLI, Web UI, and on-disk data format. ArchiveBox is a command line tool, self-hostable web-archiving server, and Python library all-in-one. It can be installed on Docker, macOS, and Linux/BSD, and Windows. You can download and install it as a Debian/Ubuntu package, Homebrew package, Python3 package, or a Docker image. No matter which install method you choose, they all provide the same CLI, Web UI, and on-disk data format.
To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/export/manage/etc using the CLI `archivebox help`, or you can run the Web UI (recommended). To use ArchiveBox you start by creating a folder for your data to live in (it can be anywhere on your system), and running `archivebox init` inside of it. That will create a sqlite3 index and an `ArchiveBox.conf` file. After that, you can continue to add/export/manage/etc using the CLI `archivebox help`, or you can run the Web UI (recommended). If you only want to archive a single site, you can run `archivebox oneshot` to avoid having to create a whole collection.
The CLI is considered "stable", the ArchiveBox Python API and REST APIs are in "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is in "alpha" stage. The CLI is considered "stable", the ArchiveBox Python API and REST APIs are "beta", and the [desktop app](https://github.com/ArchiveBox/desktop) is "alpha".
At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots. At the end of the day, the goal is to sleep soundly knowing that the part of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer). You can also self-host your archivebox server on a public domain to provide archive.org-style public access to your site snapshots.
@ -146,7 +259,7 @@ archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples. See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly. It also includes a built-in scheduled import feature with `archivebox schedule` and browser bookmarklet, so you can pull in URLs from RSS feeds, websites, or the filesystem regularly/on-demand.
## Output formats ## Output formats
@ -161,11 +274,14 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details - **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
- **Title:** `title` title of the site - **Title:** `title` title of the site
- **Favicon:** `favicon.ico` favicon of the site - **Favicon:** `favicon.ico` favicon of the site
- **Headers:** `headers.json` Any HTTP headers the site returns are saved in a json file
- **SingleFile:** `singlefile.html` HTML snapshot rendered with headless Chrome using SingleFile
- **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present - **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present
- **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving - **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving
- **PDF:** `output.pdf` Printed PDF of site using headless chrome - **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome - **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome - **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
- **Readability:** `article.html/json` Article text extraction using Readability
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org - **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl - **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links - **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
@ -191,8 +307,8 @@ archivebox add 'https://example.com/any/url/you/want/to/keep/secret/'
# without first disabling sharing the URL with 3rd party APIs: # without first disabling sharing the URL with 3rd party APIs:
archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org archivebox config --set SAVE_ARCHIVE_DOT_ORG=False # disable saving all URLs in Archive.org
archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL archivebox config --set SAVE_FAVICON=False # optional: only the domain is leaked, not full URL
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google archivebox config --set CHROME_BINARY=chromium # optional: switch to chromium to avoid Chrome phoning home to Google
``` ```
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details. Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
@ -215,95 +331,6 @@ archivebox add 'https://example.com#2020-10-25'
--- ---
# Setup
## Docker Compose
*This is the recommended way of running ArchiveBox.*
It comes with everything working out of the box, including all extractors,
a headless browser runtime, a full webserver, and CLI interface.
```bash
# docker-compose run archivebox <command> [args]
mkdir archivebox && cd archivebox
wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
docker-compose run archivebox init
docker-compose run archivebox add 'https://example.com'
docker-compose run archivebox manage createsuperuser
docker-compose up
open http://127.0.0.1:8000
```
## Docker
```bash
# docker run -v $PWD:/data -it archivebox/archivebox <command> [args]
mkdir archivebox && cd archivebox
docker run -v $PWD:/data -it archivebox/archivebox init
docker run -v $PWD:/data -it archivebox/archivebox add 'https://example.com'
docker run -v $PWD:/data -it archivebox/archivebox manage createsuperuser
# run the webserver to access the web UI
docker run -v $PWD:/data -it -p 8000:8000 archivebox/archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
# or export a static version of the index if you dont want to run a server
docker run -v $PWD:/data -it archivebox/archivebox list --html --with-headers > index.html
docker run -v $PWD:/data -it archivebox/archivebox list --json --with-headers > index.json
open ./index.html
```
## Bare Metal
```bash
# archivebox <command> [args]
# on Debian/Ubuntu
sudo add-apt-repository -u ppa:archivebox/archivebox
apt install archivebox
# on macOS
brew install archivebox/archivebox/archivebox
```
Initialize your archive in a directory somewhere and add some links:
```bash
mkdir ~/archivebox && cd archivebox
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
archivebox init
archivebox add 'https://example.com' # add URLs as args or pipe them in via stdin
archivebox add --depth=1 https://example.com/table-of-contents.html
# it can ingest links from many formats, including RSS/JSON/XML/MD/TXT and more
curl https://getpocket.com/users/USERNAME/feed/all | archivebox add
```
Start the webserver to access the web UI:
```bash
archivebox manage createsuperuser
archivebox server 0.0.0.0:8000
open http://127.0.0.1:8000
```
Or export a static HTML version of the index if you don't want to run a webserver:
```bash
archivebox list --html --with-headers > index.html
archivebox list --json --with-headers > index.json
open ./index.html
```
To view more information about your dependencies, data, or the CLI:
```bash
archivebox version
archivebox status
archivebox help
```
---
<div align="center"> <div align="center">
<img src="https://i.imgur.com/PVO88AZ.png" width="80%"/> <img src="https://i.imgur.com/PVO88AZ.png" width="80%"/>
</div> </div>
@ -418,22 +445,18 @@ All contributions to ArchiveBox are welcomed! Check our [issues](https://github.
First, install the system dependencies from the "Bare Metal" section above. First, install the system dependencies from the "Bare Metal" section above.
Then you can clone the ArchiveBox repo and install Then you can clone the ArchiveBox repo and install
```python3 ```python3
git clone https://github.com/ArchiveBox/ArchiveBox git clone https://github.com/ArchiveBox/ArchiveBox && cd ArchiveBox
cd ArchiveBox
git checkout master # or the branch you want to test git checkout master # or the branch you want to test
git pull git pull --recurse-submodules
git submodule init
git submodule update
# Install ArchiveBox + python dependencies # Install ArchiveBox + python dependencies
python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev] python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev]
# or # or with pipenv: pipenv install --dev && pipenv shell
pipenv install --dev && pipenv shell
# Install node dependencies # Install node dependencies
npm install npm install
# Optional: install the extractor dependencies # Optional: install extractor dependencies manually or with helper script
./bin/setup.sh ./bin/setup.sh
# Optional: develop via docker by mounting the code dir into the container # Optional: develop via docker by mounting the code dir into the container
@ -473,6 +496,8 @@ You can also run all these in Docker. For more examples see the Github Actions C
# or individually: # or individually:
./bin/build_docs.sh ./bin/build_docs.sh
./bin/build_pip.sh ./bin/build_pip.sh
./bin/build_deb.sh
./bin/build_brew.sh
./bin/build_docker.sh ./bin/build_docker.sh
``` ```

View file

@ -63,7 +63,7 @@ def run_subcommand(subcommand: str,
if subcommand not in meta_cmds: if subcommand not in meta_cmds:
from ..config import setup_django from ..config import setup_django
setup_django(in_memory_db=subcommand in fake_db) setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
module = import_module('.archivebox_{}'.format(subcommand), __package__) module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore

View file

@ -89,8 +89,8 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
index_only=command.index_only, index_only=command.index_only,
overwrite=command.overwrite, overwrite=command.overwrite,
init=command.init, init=command.init,
out_dir=pwd or OUTPUT_DIR,
extractors=command.extract, extractors=command.extract,
out_dir=pwd or OUTPUT_DIR,
) )

View file

@ -36,6 +36,13 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
' ~/Desktop/sites_list.csv\n' ' ~/Desktop/sites_list.csv\n'
) )
) )
parser.add_argument(
"--extract",
type=str,
help="Pass a list of the extractors to be used. If the method name is not correct, it will be ignored. \
This does not take precedence over the configuration",
default=""
)
parser.add_argument( parser.add_argument(
'--out-dir', '--out-dir',
type=str, type=str,
@ -55,6 +62,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
oneshot( oneshot(
url=stdin_url or url, url=stdin_url or url,
out_dir=Path(command.out_dir).resolve(), out_dir=Path(command.out_dir).resolve(),
extractors=command.extract,
) )

View file

@ -161,6 +161,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'USE_CHROME': {'type': bool, 'default': True}, 'USE_CHROME': {'type': bool, 'default': True},
'USE_NODE': {'type': bool, 'default': True}, 'USE_NODE': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True}, 'USE_YOUTUBEDL': {'type': bool, 'default': True},
'USE_RIPGREP': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'}, 'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'}, 'GIT_BINARY': {'type': str, 'default': 'git'},
@ -170,6 +171,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'}, 'MERCURY_BINARY': {'type': str, 'default': 'mercury-parser'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'}, 'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'NODE_BINARY': {'type': str, 'default': 'node'}, 'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None}, 'CHROME_BINARY': {'type': str, 'default': None},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None}, 'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
@ -275,7 +277,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}}, 'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent}, 'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME / 'legacy'}, 'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()}, 'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME}, 'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
@ -312,6 +314,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']}, 'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []}, 'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']}, 'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None}, 'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
@ -320,7 +323,7 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None}, 'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']}, 'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if (c['USE_MERCURY'] and c['MERCURY_BINARY']) else None}, # mercury is unversioned 'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury is unversioned
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']}, 'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None}, 'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
@ -334,8 +337,6 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])}, 'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'] or c['SAVE_SINGLEFILE'])},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()}, 'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None}, 'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'])},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']}, 'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']}, 'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
@ -343,6 +344,9 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']}, 'SAVE_SINGLEFILE': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SINGLEFILE'] and c['USE_NODE']},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']}, 'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']}, 'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)}, 'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)}, 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
@ -595,7 +599,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
if node_modules_bin.exists(): if node_modules_bin.exists():
return str(node_modules_bin.resolve()) return str(node_modules_bin.resolve())
return shutil.which(Path(binary).expanduser()) or binary return shutil.which(str(Path(binary).expanduser())) or shutil.which(str(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]: def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None: if binary is None:
@ -682,7 +686,7 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
'TEMPLATES_DIR': { 'TEMPLATES_DIR': {
'path': (config['TEMPLATES_DIR']).resolve(), 'path': (config['TEMPLATES_DIR']).resolve(),
'enabled': True, 'enabled': True,
'is_valid': (config['TEMPLATES_DIR'] / 'static').exists(), 'is_valid': (config['TEMPLATES_DIR'] / config['ACTIVE_THEME'] / 'static').exists(),
}, },
# 'NODE_MODULES_DIR': { # 'NODE_MODULES_DIR': {
# 'path': , # 'path': ,
@ -826,6 +830,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_CHROME'], 'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']), 'is_valid': bool(config['CHROME_VERSION']),
}, },
'RIPGREP_BINARY': {
'path': bin_path(config['RIPGREP_BINARY']),
'version': config['RIPGREP_VERSION'],
'hash': bin_hash(config['RIPGREP_BINARY']),
'enabled': config['USE_RIPGREP'],
'is_valid': bool(config['RIPGREP_VERSION']),
},
} }
def get_chrome_info(config: ConfigDict) -> ConfigValue: def get_chrome_info(config: ConfigDict) -> ConfigValue:

View file

@ -10,11 +10,22 @@ CHOICES = (
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'), ('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
) )
from ..extractors import get_default_archive_methods
ARCHIVE_METHODS = [
(name, name)
for name, _, _ in get_default_archive_methods()
]
class AddLinkForm(forms.Form): class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True) url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0') depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
archive_methods = forms.MultipleChoiceField(
required=False,
widget=forms.SelectMultiple,
choices=ARCHIVE_METHODS,
)
class TagWidgetMixin: class TagWidgetMixin:
def format_value(self, value): def format_value(self, value):
if value is not None and not isinstance(value, str): if value is not None and not isinstance(value, str):

View file

@ -9,6 +9,12 @@ import django.db.models.deletion
from config import CONFIG from config import CONFIG
from index.json import to_json from index.json import to_json
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
def forwards_func(apps, schema_editor): def forwards_func(apps, schema_editor):
from core.models import EXTRACTORS from core.models import EXTRACTORS
@ -76,7 +82,7 @@ class Migration(migrations.Migration):
name='ArchiveResult', name='ArchiveResult',
fields=[ fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('cmd', models.JSONField()), ('cmd', JSONField()),
('pwd', models.CharField(max_length=256)), ('pwd', models.CharField(max_length=256)),
('cmd_version', models.CharField(max_length=32)), ('cmd_version', models.CharField(max_length=32)),
('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)), ('status', models.CharField(choices=[('succeeded', 'succeeded'), ('failed', 'failed'), ('skipped', 'skipped')], max_length=16)),

View file

@ -18,6 +18,12 @@ STATUS_CHOICES = [
("skipped", "skipped") ("skipped", "skipped")
] ]
try:
JSONField = models.JSONField
except AttributeError:
import jsonfield
JSONField = jsonfield.JSONField
class Tag(models.Model): class Tag(models.Model):
""" """
@ -173,7 +179,7 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model): class ArchiveResult(models.Model):
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE) snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
cmd = models.JSONField() cmd = JSONField()
pwd = models.CharField(max_length=256) pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=32) cmd_version = models.CharField(max_length=32)
output = models.CharField(max_length=512) output = models.CharField(max_length=512)

View file

@ -12,6 +12,7 @@ from ..config import (
ALLOWED_HOSTS, ALLOWED_HOSTS,
PACKAGE_DIR, PACKAGE_DIR,
ACTIVE_THEME, ACTIVE_THEME,
TEMPLATES_DIR_NAME,
SQL_INDEX_FILENAME, SQL_INDEX_FILENAME,
OUTPUT_DIR, OUTPUT_DIR,
) )
@ -68,14 +69,14 @@ AUTHENTICATION_BACKENDS = [
STATIC_URL = '/static/' STATIC_URL = '/static/'
STATICFILES_DIRS = [ STATICFILES_DIRS = [
str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME / 'static'), str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME / 'static'),
str(Path(PACKAGE_DIR) / 'themes' / 'default' / 'static'), str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default' / 'static'),
] ]
TEMPLATE_DIRS = [ TEMPLATE_DIRS = [
str(Path(PACKAGE_DIR) / 'themes' / ACTIVE_THEME), str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / ACTIVE_THEME),
str(Path(PACKAGE_DIR) / 'themes' / 'default'), str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME / 'default'),
str(Path(PACKAGE_DIR) / 'themes'), str(Path(PACKAGE_DIR) / TEMPLATES_DIR_NAME),
] ]
TEMPLATES = [ TEMPLATES = [

View file

@ -150,12 +150,15 @@ class AddView(UserPassesTestMixin, FormView):
url = form.cleaned_data["url"] url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}') print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1 depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = { input_kwargs = {
"urls": url, "urls": url,
"depth": depth, "depth": depth,
"update_all": False, "update_all": False,
"out_dir": OUTPUT_DIR, "out_dir": OUTPUT_DIR,
} }
if extractors:
input_kwargs.update({"extractors": extractors})
add_stdout = StringIO() add_stdout = StringIO()
with redirect_stdout(add_stdout): with redirect_stdout(add_stdout):
add(**input_kwargs) add(**input_kwargs)

View file

@ -20,7 +20,6 @@ from ..config import (
CURL_ARGS, CURL_ARGS,
CURL_VERSION, CURL_VERSION,
CURL_USER_AGENT, CURL_USER_AGENT,
setup_django,
) )
from ..logging_util import TimedProgress from ..logging_util import TimedProgress
@ -81,7 +80,6 @@ def extract_title_with_regex(html):
def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult: def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content""" """try to guess the page's title from its content"""
setup_django(out_dir=out_dir)
from core.models import Snapshot from core.models import Snapshot
output: ArchiveOutput = None output: ArchiveOutput = None

View file

@ -18,7 +18,6 @@ from ..util import (
ExtendedEncoder, ExtendedEncoder,
) )
from ..config import ( from ..config import (
setup_django,
ARCHIVE_DIR_NAME, ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME, SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME, JSON_INDEX_FILENAME,
@ -243,16 +242,9 @@ def write_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
log_indexing_process_finished() log_indexing_process_finished()
@enforce_types
def get_empty_snapshot_queryset(out_dir: Path=OUTPUT_DIR):
setup_django(out_dir, check_db=True)
from core.models import Snapshot
return Snapshot.objects.none()
@enforce_types @enforce_types
def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]: def load_main_index(out_dir: Path=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in""" """parse and load existing index with any new links from import_path merged in"""
setup_django(out_dir, check_db=True)
from core.models import Snapshot from core.models import Snapshot
try: try:
return Snapshot.objects.all() return Snapshot.objects.all()
@ -390,8 +382,9 @@ def search_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type:
color='red', color='red',
) )
raise SystemExit(2) raise SystemExit(2)
from core.models import Snapshot
qsearch = get_empty_snapshot_queryset() qsearch = Snapshot.objects.none()
for pattern in filter_patterns: for pattern in filter_patterns:
try: try:
qsearch |= query_search_index(pattern) qsearch |= query_search_index(pattern)

View file

@ -23,7 +23,6 @@ from ..config import (
GIT_SHA, GIT_SHA,
FOOTER_INFO, FOOTER_INFO,
HTML_INDEX_FILENAME, HTML_INDEX_FILENAME,
setup_django,
) )
MAIN_INDEX_TEMPLATE = 'main_index.html' MAIN_INDEX_TEMPLATE = 'main_index.html'
@ -111,7 +110,6 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content""" """render a given html template string with the given template content"""
from django.template.loader import render_to_string from django.template.loader import render_to_string
setup_django(check_db=False)
return render_to_string(template, context) return render_to_string(template, context)

View file

@ -9,7 +9,6 @@ DO NOT ADD ANY NEW FEATURES TO THIS FILE, NEW CODE GOES HERE: core/models.py
__package__ = 'archivebox.index' __package__ = 'archivebox.index'
from pathlib import Path from pathlib import Path
from django.db.utils import OperationalError
from datetime import datetime, timedelta from datetime import datetime, timedelta

View file

@ -19,6 +19,7 @@ if TYPE_CHECKING:
from .util import enforce_types from .util import enforce_types
from .config import ( from .config import (
ConfigDict, ConfigDict,
OUTPUT_DIR,
PYTHON_ENCODING, PYTHON_ENCODING,
ANSI, ANSI,
IS_TTY, IS_TTY,
@ -514,19 +515,24 @@ def printable_folder_status(name: str, folder: Dict) -> str:
else: else:
num_files = 'missing' num_files = 'missing'
if ' ' in str(folder['path']): path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
folder['path'] = f'"{folder["path"]}"' if path and ' ' in path:
path = f'"{path}"'
# if path is just a plain dot, replace it back with the full path for clarity
if path == '.':
path = str(OUTPUT_DIR)
return ' '.join(( return ' '.join((
ANSI[color], ANSI[color],
symbol, symbol,
ANSI['reset'], ANSI['reset'],
name.ljust(22), name.ljust(21),
(str(folder["path"]) or '').ljust(76),
num_files.ljust(14), num_files.ljust(14),
ANSI[color], ANSI[color],
note, note.ljust(8),
ANSI['reset'], ANSI['reset'],
path.ljust(76),
)) ))
@ -546,17 +552,18 @@ def printable_dependency_version(name: str, dependency: Dict) -> str:
else: else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-' color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in (dependency["path"] or ''): path = str(dependency["path"]).replace(str(OUTPUT_DIR), '.') if dependency["path"] else ''
dependency["path"] = f'"{dependency["path"]}"' if path and ' ' in path:
path = f'"{path}"'
return ' '.join(( return ' '.join((
ANSI[color], ANSI[color],
symbol, symbol,
ANSI['reset'], ANSI['reset'],
name.ljust(22), name.ljust(21),
(dependency["path"] or '').ljust(76),
version.ljust(14), version.ljust(14),
ANSI[color], ANSI[color],
note, note.ljust(8),
ANSI['reset'], ANSI['reset'],
path.ljust(76),
)) ))

View file

@ -29,7 +29,6 @@ from .util import enforce_types # type: ignore
from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT from .system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from .index import ( from .index import (
load_main_index, load_main_index,
get_empty_snapshot_queryset,
parse_links_from_source, parse_links_from_source,
dedupe_links, dedupe_links,
write_main_index, write_main_index,
@ -218,7 +217,7 @@ def version(quiet: bool=False,
else: else:
print('ArchiveBox v{}'.format(VERSION)) print('ArchiveBox v{}'.format(VERSION))
p = platform.uname() p = platform.uname()
print(p.system, platform.platform(), p.machine) print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
print() print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI)) print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@ -265,6 +264,7 @@ def run(subcommand: str,
@enforce_types @enforce_types
def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None: def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory""" """Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot
Path(out_dir).mkdir(exist_ok=True) Path(out_dir).mkdir(exist_ok=True)
is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR) is_empty = not len(set(os.listdir(out_dir)) - ALLOWED_IN_OUTPUT_DIR)
@ -335,7 +335,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
print() print()
print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI)) print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
all_links = get_empty_snapshot_queryset() all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {} pending_links: Dict[str, Link] = {}
if existing_index: if existing_index:
@ -511,7 +511,7 @@ def status(out_dir: Path=OUTPUT_DIR) -> None:
@enforce_types @enforce_types
def oneshot(url: str, out_dir: Path=OUTPUT_DIR): def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
""" """
Create a single URL archive folder with an index.json and index.html, and all the archive method outputs. Create a single URL archive folder with an index.json and index.html, and all the archive method outputs.
You can run this to archive single pages without needing to create a whole collection with archivebox init. You can run this to archive single pages without needing to create a whole collection with archivebox init.
@ -523,7 +523,8 @@ def oneshot(url: str, out_dir: Path=OUTPUT_DIR):
color='red' color='red'
) )
raise SystemExit(2) raise SystemExit(2)
methods = ignore_methods(['title'])
methods = extractors.split(",") if extractors else ignore_methods(['title'])
archive_link(oneshot_link[0], out_dir=out_dir, methods=methods) archive_link(oneshot_link[0], out_dir=out_dir, methods=methods)
return oneshot_link return oneshot_link
@ -534,8 +535,8 @@ def add(urls: Union[str, List[str]],
index_only: bool=False, index_only: bool=False,
overwrite: bool=False, overwrite: bool=False,
init: bool=False, init: bool=False,
out_dir: Path=OUTPUT_DIR, extractors: str="",
extractors: str="") -> List[Link]: out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive""" """Add a new URL or list of URLs to your archive"""
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)' assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

View file

@ -6,7 +6,7 @@ from django.db.models import QuerySet
from archivebox.index.schema import Link from archivebox.index.schema import Link
from archivebox.util import enforce_types from archivebox.util import enforce_types
from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE from archivebox.config import stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE
from .utils import get_indexable_content, log_index_started from .utils import get_indexable_content, log_index_started
@ -49,7 +49,6 @@ def write_search_index(link: Link, texts: Union[List[str], None]=None, out_dir:
@enforce_types @enforce_types
def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet: def query_search_index(query: str, out_dir: Path=OUTPUT_DIR) -> QuerySet:
setup_django(out_dir, check_db=True)
from core.models import Snapshot from core.models import Snapshot
if search_backend_enabled(): if search_backend_enabled():
@ -107,4 +106,3 @@ def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR):
) )
else: else:
write_search_index(link, texts, out_dir=out_dir) write_search_index(link, texts, out_dir=out_dir)

View file

@ -1,8 +1,8 @@
import re import re
from subprocess import run, PIPE, DEVNULL from subprocess import run, PIPE
from typing import List, Generator from typing import List, Generator
from archivebox.config import setup_django, ARCHIVE_DIR from archivebox.config import ARCHIVE_DIR, RIPGREP_VERSION
from archivebox.util import enforce_types from archivebox.util import enforce_types
RG_IGNORE_EXTENSIONS = ('css','js','orig','svg') RG_IGNORE_EXTENSIONS = ('css','js','orig','svg')
@ -26,11 +26,9 @@ def flush(snapshot_ids: Generator[str, None, None]):
@enforce_types @enforce_types
def search(text: str) -> List[str]: def search(text: str) -> List[str]:
is_rg_installed = run(['which', 'rg'], stdout=DEVNULL, stderr=DEVNULL) if not RIPGREP_VERSION:
if is_rg_installed.returncode:
raise Exception("ripgrep binary not found, install ripgrep to use this search backend") raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
setup_django(check_db=True)
from core.models import Snapshot from core.models import Snapshot
rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)] rg_cmd = ['rg', RG_ADD_TYPE, RG_IGNORE_ARGUMENTS, RG_DEFAULT_ARGUMENTS, RG_REGEX_ARGUMENT, text, str(ARCHIVE_DIR)]
@ -45,4 +43,3 @@ def search(text: str) -> List[str]:
snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)] snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
return snap_ids return snap_ids

13
bin/build_brew.sh Normal file → Executable file
View file

@ -12,11 +12,18 @@ IFS=$'\n'
REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )" REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
CURRENT_PLAFORM="$(uname)"
REQUIRED_PLATFORM="Darwin"
if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then
echo "[!] Skipping the Homebrew package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)."
exit 0
fi
cd "$REPO_DIR/brew_dist" cd "$REPO_DIR/brew_dist"
# make sure archivebox.rb is up-to-date with the dependencies # make sure archivebox.rb is up-to-date with the dependencies
echo "[+] Building bottle" echo "[+] Building Homebrew bottle"
brew install --build-bottle ./archivebox.rb brew install --build-bottle ./archivebox.rb
brew bottle archivebox brew bottle archivebox

View file

@ -19,6 +19,13 @@ else
fi fi
cd "$REPO_DIR" cd "$REPO_DIR"
CURRENT_PLAFORM="$(uname)"
REQUIRED_PLATFORM="Linux"
if [[ "$CURRENT_PLAFORM" != "$REQUIRED_PLATFORM" ]]; then
echo "[!] Skipping the Debian package build on $CURRENT_PLAFORM (it can only be run on $REQUIRED_PLATFORM)."
exit 0
fi
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
DEBIAN_VERSION="1" DEBIAN_VERSION="1"
PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988" PGP_KEY_ID="7D5695D3B618872647861D51C38137A7C1675988"

View file

@ -14,6 +14,7 @@ REPO_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && p
VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")" VERSION="$(jq -r '.version' < "$REPO_DIR/package.json")"
cd "$REPO_DIR" cd "$REPO_DIR"
which docker > /dev/null
echo "[+] Building docker image in the background..." echo "[+] Building docker image in the background..."
docker build . -t archivebox \ docker build . -t archivebox \

View file

@ -14,4 +14,4 @@ DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && cd .. && pwd )"
source "$DIR/.venv/bin/activate" source "$DIR/.venv/bin/activate"
pytest -s pytest -s --basetemp=tests/out

View file

@ -1,6 +1,6 @@
{ {
"name": "archivebox", "name": "archivebox",
"version": "0.5.0", "version": "0.5.1",
"description": "ArchiveBox: The self-hosted internet archive", "description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>", "author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"license": "MIT", "license": "MIT",

View file

@ -23,7 +23,7 @@ PROJECT_URLS = {
ROOT_DIR = Path(__file__).parent.resolve() ROOT_DIR = Path(__file__).parent.resolve()
PACKAGE_DIR = ROOT_DIR / PKG_NAME PACKAGE_DIR = ROOT_DIR / PKG_NAME
README = (PACKAGE_DIR / "README.md").read_text() README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version'] VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
# To see when setup.py gets called (uncomment for debugging): # To see when setup.py gets called (uncomment for debugging):

View file

@ -5,5 +5,5 @@ Package3: archivebox
Suite: focal Suite: focal
Suite3: focal Suite3: focal
Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb Build-Depends: dh-python, python3-pip, python3-setuptools, python3-wheel, python3-stdeb
Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep Depends3: nodejs, chromium-browser, wget, curl, git, ffmpeg, youtube-dl, python3-atomicwrites, python3-croniter, python3-crontab, python3-dateparser, python3-django, python3-django-extensions, python3-django-jsonfield, python3-mypy-extensions, python3-requests, python3-w3lib, ripgrep
XS-Python-Version: >= 3.7 XS-Python-Version: >= 3.7

View file

@ -9,11 +9,20 @@ def test_oneshot_command_exists(tmp_path, disable_extractors_dict):
def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict): def test_oneshot_command_saves_page_in_right_folder(tmp_path, disable_extractors_dict):
disable_extractors_dict.update({"SAVE_DOM": "true"}) disable_extractors_dict.update({"SAVE_DOM": "true"})
process = subprocess.run(["archivebox", "oneshot", f"--out-dir={tmp_path}", "http://127.0.0.1:8080/static/example.com.html"], process = subprocess.run(
capture_output=True, env=disable_extractors_dict) [
"archivebox",
"oneshot",
f"--out-dir={tmp_path}",
"--extract=title,favicon,dom",
"http://127.0.0.1:8080/static/example.com.html",
],
capture_output=True,
env=disable_extractors_dict,
)
items = ' '.join([str(x) for x in tmp_path.iterdir()]) items = ' '.join([str(x) for x in tmp_path.iterdir()])
current_path = ' '.join([str(x) for x in Path.cwd().iterdir()]) current_path = ' '.join([str(x) for x in Path.cwd().iterdir()])
assert "index.json" in items assert "index.json" in items
assert not "index.sqlite3" in current_path assert not "index.sqlite3" in current_path
assert "output.html" in items assert "output.html" in items