Merge pull request #207 from pirate/django

This commit is contained in:
Nick Sweeting 2020-07-28 08:23:00 -04:00 committed by GitHub
commit 56c697948d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
159 changed files with 12366 additions and 3724 deletions

View file

@ -1,20 +1,16 @@
.DS_Store
._*
*.pyc
__pycache__/
.mypy_cache/
archivebox.egg-info/
venv/
.venv/
.docker-venv/
*.egg-info/
build/
dist/
# Dependency code
.venv # main pipenv venv path
venv # old venv path, (no longer used)
archivebox/.venv # old venv path, (no longer used)
archivebox/venv # old venv path, (no longer used)
# Stateful data folders
data # main archivebox data folder
archivebox/output # old output folder path (no longer used)
output # old output folder path (no longer used)
data/
output/

6
.flake8 Normal file
View file

@ -0,0 +1,6 @@
[flake8]
ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391
select = F,E9,W
max-line-length = 130
max-complexity = 10
exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv

145
.github/workflows/test.yml vendored Normal file
View file

@ -0,0 +1,145 @@
name: Test workflow
on: [push]
env:
MAX_LINE_LENGTH: 110
jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- name: Set up Python
uses: actions/setup-python@v1
with:
python-version: 3.8
architecture: x64
- name: Install flake8
run: |
pip install flake8
- name: Lint with flake8
run: |
# one pass for show-stopper syntax errors or undefined names
flake8 archivebox --count --show-source --statistics
# one pass for small stylistic things
flake8 archivebox --count --max-line-length="$MAX_LINE_LENGTH" --statistics
# - name: Lint with mypy
# run: |
# pip install mypy
# mypy archivebox || true
test:
runs-on: ${{ matrix.os }}
strategy:
matrix:
os: [ubuntu-latest, macos-latest]
python: [3.7, 3.8]
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v1
with:
python-version: ${{ matrix.python }}
architecture: x64
- name: Cache virtualenv
uses: actions/cache@v2
id: cache-venv
with:
path: .venv
key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }}
restore-keys: |
${{ runner.os }}-${{ matrix.python }}-venv-
- name: Create virtualenv
if: steps.cache-venv.outputs.cache-hit != 'true'
run: |
python3 -m venv .venv
source .venv/bin/activate
python3 -m pip install --upgrade pip setuptools
- name: Install dependencies
run: |
source .venv/bin/activate
python -m pip install .
python -m pip install pytest bottle
- name: Test built package with pytest
run: |
source .venv/bin/activate
python -m pytest -s
docker-test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 1
- uses: satackey/action-docker-layer-caching@v0.0.4
- name: Build image
run: |
docker build . -t archivebox
- name: Init data dir
run: |
mkdir data
docker run -v "$PWD"/data:/data archivebox init
- name: Run test server
run: |
sudo bash -c 'echo "127.0.0.1 www.test-nginx-1.local www.test-nginx-2.local" >> /etc/hosts'
docker run --name www-nginx -p 80:80 -d nginx
- name: Add link
run: |
docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local
- name: Add stdin link
run: |
echo "http://www.test-nginx-2.local" | docker run -i -v "$PWD"/data:/data archivebox add
- name: List links
run: |
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
- name: Start docker-compose stack
run: |
docker-compose run archivebox init
docker-compose up -d
sleep 4
curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'django.jQuery'
- name: Check added urls show up in index
run: |
docker-compose run archivebox add 'http://example.com/#test_docker' --index-only
curl --silent --location 'http://127.0.0.1:8000' | grep 'http://example.com/#test_docker'
- name: Curl index with PUBLIC_INDEX=False
run: |
docker-compose run archivebox config --set PUBLIC_INDEX=False
docker-compose up -d || true
sleep 8
curl --silent --location 'http://127.0.0.1:8000' | grep 'Log in'
docker-compose down
- name: Curl index with PUBLIC_INDEX=True
run: |
docker-compose run archivebox config --set PUBLIC_INDEX=True
docker-compose up -d || true
sleep 8
curl --silent --location 'http://127.0.0.1:8000' | grep 'Add Links'
docker-compose down

23
.gitignore vendored
View file

@ -1,21 +1,16 @@
.DS_Store
._*
*.pyc
__pycache__/
.mypy_cache/
archivebox.egg-info/
venv/
.venv/
.docker-venv/
*.egg-info/
build/
dist/
# Dependency code
.venv # main pipenv venv path
venv # old venv path, (no longer used)
archivebox/.venv # old venv path, (no longer used)
archivebox/venv # old venv path, (no longer used)
# Stateful data folders
data/ # main archivebox data folder
archivebox/output/ # old output folder path (no longer used)
output/ # old output folder path (no longer used)
data/
output/

View file

@ -1,74 +1,68 @@
# This Dockerfile for ArchiveBox installs the following in a container:
# - curl, wget, python3, youtube-dl, google-chrome-unstable
# - ArchiveBox
# This is the Dockerfile for ArchiveBox, it includes the following major pieces:
# git, curl, wget, python3, youtube-dl, google-chrome-stable, ArchiveBox
# Usage:
# docker build github.com/pirate/ArchiveBox -t archivebox
# echo 'https://example.com' | docker run -i -v ./data:/data archivebox /bin/archive
# docker run -v ./data:/data archivebox /bin/archive 'https://example.com/some/rss/feed.xml'
# docker build . -t archivebox
# docker run -v "$PWD/data":/data archivebox init
# docker run -v "$PWD/data":/data archivebox add 'https://example.com'
# Documentation:
# https://github.com/pirate/ArchiveBox/wiki/Docker#docker
# TODO: bump to latest chrome and node version, confirm chrome doesn't hang on simple pages
FROM python:3.8-slim-buster
FROM node:11-slim
LABEL maintainer="Nick Sweeting <archivebox-git@sweeting.me>"
LABEL name="archivebox" \
maintainer="Nick Sweeting <archivebox-git@sweeting.me>" \
description="All-in-one personal internet archiving container"
RUN apt-get update \
&& apt-get install -yq --no-install-recommends \
git wget curl youtube-dl gnupg2 libgconf-2-4 python3 python3-pip \
&& rm -rf /var/lib/apt/lists/*
# Install latest chrome package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
RUN apt-get update && apt-get install -y wget --no-install-recommends \
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
&& apt-get update \
&& apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \
--no-install-recommends \
&& rm -rf /var/lib/apt/lists/* \
&& rm -rf /src/*.deb
# It's a good idea to use dumb-init to help prevent zombie chrome processes.
ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init
RUN chmod +x /usr/local/bin/dumb-init
# Uncomment to skip the chromium download when installing puppeteer. If you do,
# you'll need to launch puppeteer with:
# browser.launch({executablePath: 'google-chrome-unstable'})
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
# Install puppeteer so it's available in the container.
RUN npm i puppeteer
# Add user so we don't need --no-sandbox.
RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
&& mkdir -p /home/pptruser/Downloads \
&& chown -R pptruser:pptruser /home/pptruser \
&& chown -R pptruser:pptruser /node_modules
# Install the ArchiveBox repository and pip requirements
COPY . /home/pptruser/app
RUN mkdir -p /data \
&& chown -R pptruser:pptruser /data \
&& ln -s /data /home/pptruser/app/archivebox/output \
&& ln -s /home/pptruser/app/bin/* /bin/ \
&& ln -s /home/pptruser/app/bin/archivebox /bin/archive \
&& chown -R pptruser:pptruser /home/pptruser/app/archivebox
# && pip3 install -r /home/pptruser/app/archivebox/requirements.txt
VOLUME /data
ENV LANG=C.UTF-8 \
ENV TZ=UTC \
LANGUAGE=en_US:en \
LC_ALL=C.UTF-8 \
LANG=C.UTF-8 \
PYTHONIOENCODING=UTF-8 \
CHROME_SANDBOX=False \
CHROME_BINARY=google-chrome-unstable \
OUTPUT_DIR=/data
PYTHONUNBUFFERED=1 \
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
CODE_PATH=/app \
VENV_PATH=/venv \
DATA_PATH=/data
# First install CLI utils and base deps, then Chrome + Fons
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
dumb-init jq git wget curl youtube-dl ffmpeg \
&& curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
&& echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
&& apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
google-chrome-stable \
fontconfig \
fonts-ipafont-gothic \
fonts-wqy-zenhei \
fonts-thai-tlwg \
fonts-kacst \
fonts-symbola \
fonts-noto \
fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*
# Run everything from here on out as non-privileged user
USER pptruser
WORKDIR /home/pptruser/app
RUN groupadd --system archivebox \
&& useradd --system --create-home --gid archivebox --groups audio,video archivebox
ENTRYPOINT ["dumb-init", "--"]
CMD ["/bin/archive"]
ADD . "$CODE_PATH"
WORKDIR "$CODE_PATH"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
&& pip install --upgrade pip setuptools \
&& pip install -e .
VOLUME "$DATA_PATH"
WORKDIR "$DATA_PATH"
EXPOSE 8000
ENV CHROME_BINARY=google-chrome \
CHROME_SANDBOX=False
RUN env ALLOW_ROOT=True archivebox version
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh", "archivebox"]
CMD ["server", "0.0.0.0:8000"]

4
MANIFEST.in Normal file
View file

@ -0,0 +1,4 @@
include LICENSE
include README.md
include archivebox/VERSION
recursive-include archivebox/themes *

12
Pipfile Normal file
View file

@ -0,0 +1,12 @@
[[source]]
name = "pypi"
url = "https://pypi.org/simple"
verify_ssl = true
[packages]
# see setup.py for package dependency list
"e1839a8" = {path = ".", editable = true}
[dev-packages]
# see setup.py for dev package dependency list
"e1839a8" = {path = ".", extras = ["dev"], editable = true}

227
README.md
View file

@ -15,6 +15,7 @@
</pre>
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
<a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
<a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
@ -24,10 +25,8 @@
<hr/>
<br/>
<i>💥 Attention: Big API changes are coming soon (including a proper config file format and <code>pip install archivebox</code>)! Check out <a href="https://github.com/pirate/ArchiveBox/pull/207">v0.4</a> and help us test it! 💥</i>
<i>💥 Attention: Big API changes are coming with the current release (including <code>pip install archivebox</code>)!
<br/><br/>
<b>Note: There are some important security design issues that need to be fixed before v0.4 can be pushed, all help is appreciated!<br/>
(This project is not abandoned, it's my primary side-project for the forseeable future, my day job is very busy right now.)<br/>
See the <a href="https://github.com/pirate/ArchiveBox/pull/207#issuecomment-494107553">v0.4 release PR</a> for more information.</b>
<br/>
<hr/>
@ -41,17 +40,23 @@ You can use it to preserve access to websites you care about by storing them loc
#### How does it work?
```bash
echo 'http://example.com' | ./archive
mkdir data && cd data
archivebox init
archivebox add 'https://example.com'
archivebox add 'https://getpocket.com/users/USERNAME/feed/all' --depth=1
archivebox server
```
After installing the dependencies, just pipe some new links into the `./archive` command to start your archive.
After installing archivebox, just pass some new links to the `archivebox add` command to start your collection.
ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl, pywb, and other common UNIX tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer).
<div align="center">
<img src="https://i.imgur.com/3tBL7PU.png" width="30%" alt="CLI Screenshot" align="top">
<img src="https://i.imgur.com/viklZNG.png" width="30%" alt="Desktop index screenshot" align="top">
<img src="https://i.imgur.com/RefWsXB.jpg" width="30%" alt="Desktop details page Screenshot"/><br/>
<img src="https://i.imgur.com/3tBL7PU.png" width="22%" alt="CLI Screenshot" align="top">
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
<br/>
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
@ -63,22 +68,49 @@ ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https:
To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
```bash
# 1. Install dependencies (use apt on ubuntu, brew on mac, or pkg on BSD)
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
# 2. Download ArchiveBox
git clone https://github.com/pirate/ArchiveBox.git && cd ArchiveBox
# 3. Add your first links to your archive
echo 'https://example.com' | ./archive # pass URLs to archive via stdin
./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed
# Docker
mkdir data && cd data
docker run -v $PWD:/data nikisweeting:archivebox init
docker run -v $PWD:/data nikisweeting:archivebox add 'https://example.com'
docker run -v $PWD:/data -p 8000 nikisweeting:archivebox server
open https://127.0.0.1:8000
```
Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
```bash
# Docker Compose
# Download https://github.com/pirate/ArchiveBox/tree/master/docker-compose.yml
docker-compose run archivebox init
docker-compose run archivebox add 'https://example.com'
docker-compose up
```
*(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)*
```bash
# Bare Metal
# Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
pip install archivebox # install archivebox
mkdir data && cd data # (doesn't have to be called data)
archivebox init
archivebox add 'https://example.com' # add URLs via args or stdin
# or import an RSS/JSON/XML/TXT feed/list of links
archivebox add https://getpocket.com/users/USERNAME/feed/all --depth=1
```
Once you've added your first links, open `data/index.html` in a browser to view the static archive.
You can also start it as a server with a full web UI to manage your links:
```bash
archivebox manage createsuperuser
archivebox server
```
You can visit `https://127.0.0.1:8000` in your browser to access it.
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
---
@ -103,13 +135,14 @@ All the archived links are stored by date bookmarked in `output/archive/<timesta
#### Can import links from many formats:
```bash
echo 'http://example.com' | ./archive
./archive ~/Downloads/firefox_bookmarks_export.html
./archive https://example.com/some/rss/feed.xml
echo 'http://example.com' | archivebox add
archivebox add ~/Downloads/firefox_bookmarks_export.html --depth=1
archivebox add https://example.com/some/rss/feed.xml --depth=1
```
- <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more)
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
- <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more)
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
@ -119,18 +152,18 @@ See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
ls output/archive/<timestamp>/
```
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
- **Title:** `title` title of the site
- **Favicon:** `favicon.ico` favicon of the site
- **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present
- **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
- *More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)...*
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
- **Title:** `title` title of the site
- **Favicon:** `favicon.ico` favicon of the site
- **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present
- **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
@ -138,15 +171,15 @@ If you're importing URLs with secret tokens in them (e.g Google Docs, CodiMD not
## Key Features
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
- **Doesn't require a constantly-running server**, proxy, or native app
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.4 is released with some security fixes)
- Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
- **Doesn't require a constantly-running server**, proxy, or native app
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
- Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
## Background & Motivation
@ -164,7 +197,6 @@ archive internet content enables to you save the stuff you care most about befor
The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful.
I don't think everything should be preserved in an automated fashion, making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about.
## Comparison to Other Projects
▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
@ -173,17 +205,15 @@ I don't think everything should be preserved in an automated fashion, making all
#### User Interface & Intended Purpose
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI.
An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now, it only ingests lists of links at a time via browser history, bookmarks, RSS, etc.
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
#### Private Local Archives vs Centralized Public Archives
Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.4 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle.
Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.5 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle.
#### Storage Requirements
Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your mileage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than using a single method, but more content is accurately replayable over extended periods. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `FETCH_MEDIA=False` to skip audio & video files.
Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your milage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than a using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `SAVE_MEDIA=False` to skip audio & video files.
## Learn more
@ -193,18 +223,18 @@ Whether you want to learn which organizations are the big players in the web arc
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
+ [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
*Community-maintained indexes of archiving tools and institutions.*
+ [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
*Open source tools and projects in the internet archiving space.*
+ [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
*Articles, posts, and blogs relevant to ArchiveBox and web archiving in general.*
+ [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
*A collection of the most active internet archiving communities and initiatives.*
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
- [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
_Community-maintained indexes of archiving tools and institutions._
- [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
_Open source tools and projects in the internet archiving space._
- [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
- [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
_A collection of the most active internet archiving communities and initiatives._
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
---
@ -212,47 +242,46 @@ Whether you want to learn which organizations are the big players in the web arc
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) for documentation.
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) for documentation.
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
You can build the docs by running:
```python
cd ArchiveBox
pipenv install --dev
sphinx-apidoc -o docs archivebox
cd docs/
make html
# then open docs/_build/html/index.html
```
## Getting Started
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
## Reference
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
## More Info
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
---
# Screenshots
<div align="center">
<img src="https://i.imgur.com/biVfFYr.png" width="18%" alt="CLI Screenshot" align="top">
<img src="https://i.imgur.com/viklZNG.png" width="40%" alt="Desktop index screenshot" align="top">
<img src="https://i.imgur.com/wnpdAVM.jpg" width="30%" alt="Desktop details page Screenshot" align="top">
<img src="https://i.imgur.com/mW2dITg.png" width="8%" alt="Mobile details page screenshot" align="top">
</div>
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
---
@ -272,12 +301,14 @@ Contributor Spotlight:<br/><br/>
<a href="https://sourcerer.io/fame/pirate/pirate/ArchiveBox/links/5"><img src="https://sourcerer.io/fame/pirate/pirate/ArchiveBox/images/5"></a>
<br/>
<a href="https://github.com/sponsors/pirate">Sponsor us on Github</a>
<br>
<br>
<a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
<br/>
<br/>
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
<br/><br/>

View file

@ -1 +0,0 @@
bin/archivebox

6
archivebox/.flake8 Normal file
View file

@ -0,0 +1,6 @@
[flake8]
ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391
select = F,E9,W
max-line-length = 130
max-complexity = 10
exclude = migrations,tests,node_modules,vendor,static,venv,.venv,.venv2,.docker-venv

1
archivebox/VERSION Normal file
View file

@ -0,0 +1 @@
0.4.8

View file

@ -1 +1 @@
# if you're looking for the source of the main `archivebox` shell command, it's in `archivebox/archivebox.py`
__package__ = 'archivebox'

11
archivebox/__main__.py Executable file
View file

@ -0,0 +1,11 @@
#!/usr/bin/env python3
__package__ = 'archivebox'
import sys
from .cli import main
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -1,137 +0,0 @@
#!/usr/bin/env python3
"""
ArchiveBox command line application.
./archive and ./bin/archivebox both point to this file,
but you can also run it directly using `python3 archive.py`
Usage & Documentation:
https://github.com/pirate/ArchiveBox/Wiki
"""
import os
import sys
from links import links_after_timestamp
from index import write_links_index, load_links_index
from archive_methods import archive_link
from config import (
ARCHIVE_DIR,
ONLY_NEW,
OUTPUT_DIR,
GIT_SHA,
)
from util import (
save_remote_source,
save_stdin_source,
)
from logs import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
)
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
__VERSION__ = GIT_SHA[:9]
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
def print_help():
print('ArchiveBox: The self-hosted internet archive.\n')
print("Documentation:")
print(" https://github.com/pirate/ArchiveBox/wiki\n")
print("UI Usage:")
print(" Open output/index.html to view your archive.\n")
print("CLI Usage:")
print(" echo 'https://example.com' | ./archive\n")
print(" ./archive ~/Downloads/bookmarks_export.html\n")
print(" ./archive https://example.com/feed.rss\n")
print(" ./archive 15109948213.123\n")
def main(*args):
if set(args).intersection(('-h', '--help', 'help')) or len(args) > 2:
print_help()
raise SystemExit(0)
if set(args).intersection(('--version', 'version')):
print('ArchiveBox version {}'.format(__VERSION__))
raise SystemExit(0)
### Handle CLI arguments
# ./archive bookmarks.html
# ./archive 1523422111.234
import_path, resume = None, None
if len(args) == 2:
# if the argument is a string, it's a import_path file to import
# if it's a number, it's a timestamp to resume archiving from
if args[1].replace('.', '').isdigit():
import_path, resume = None, args[1]
else:
import_path, resume = args[1], None
### Set up output folder
if not os.path.exists(OUTPUT_DIR):
os.makedirs(OUTPUT_DIR)
### Handle ingesting urls piped in through stdin
# (.e.g if user does cat example_urls.txt | ./archive)
if not sys.stdin.isatty():
stdin_raw_text = sys.stdin.read()
if stdin_raw_text and import_path:
print(
'[X] You should pass either a path as an argument, '
'or pass a list of links via stdin, but not both.\n'
)
print_help()
raise SystemExit(1)
if stdin_raw_text:
import_path = save_stdin_source(stdin_raw_text)
### Handle ingesting urls from a remote file/feed
# (e.g. if an RSS feed URL is used as the import path)
if import_path and any(import_path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
import_path = save_remote_source(import_path)
### Run the main archive update process
update_archive_data(import_path=import_path, resume=resume)
def update_archive_data(import_path=None, resume=None):
"""The main ArchiveBox entrancepoint. Everything starts here."""
# Step 1: Load list of links from the existing index
# merge in and dedupe new links from import_path
all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)
# Step 2: Write updated index with deduped old and new links back to disk
write_links_index(out_dir=OUTPUT_DIR, links=all_links)
# Step 3: Run the archive methods for each link
links = new_links if ONLY_NEW else all_links
log_archiving_started(len(links), resume)
idx, link = 0, 0
try:
for idx, link in enumerate(links_after_timestamp(links, resume)):
link_dir = os.path.join(ARCHIVE_DIR, link['timestamp'])
archive_link(link_dir, link)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link and link['timestamp'])
raise SystemExit(0)
except:
print()
raise
log_archiving_finished(len(links))
# Step 4: Re-write links index with updated titles, icons, and resources
all_links, _ = load_links_index(out_dir=OUTPUT_DIR)
write_links_index(out_dir=OUTPUT_DIR, links=all_links, finished=True)
if __name__ == '__main__':
main(*sys.argv)

View file

@ -1,623 +0,0 @@
import os
from collections import defaultdict
from datetime import datetime
from index import (
write_link_index,
patch_links_index,
load_json_link_index,
)
from config import (
CURL_BINARY,
GIT_BINARY,
WGET_BINARY,
YOUTUBEDL_BINARY,
FETCH_FAVICON,
FETCH_TITLE,
FETCH_WGET,
FETCH_WGET_REQUISITES,
FETCH_PDF,
FETCH_SCREENSHOT,
FETCH_DOM,
FETCH_WARC,
FETCH_GIT,
FETCH_MEDIA,
SUBMIT_ARCHIVE_DOT_ORG,
TIMEOUT,
MEDIA_TIMEOUT,
ANSI,
OUTPUT_DIR,
GIT_DOMAINS,
GIT_SHA,
RESTRICT_FILE_NAMES,
CURL_USER_AGENT,
WGET_USER_AGENT,
CHECK_SSL_VALIDITY,
COOKIES_FILE,
WGET_AUTO_COMPRESSION
)
from util import (
domain,
extension,
without_query,
without_fragment,
fetch_page_title,
is_static_file,
TimedProgress,
chmod_file,
wget_output_path,
chrome_args,
check_link_structure,
run, PIPE, DEVNULL
)
from logs import (
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_started,
log_archive_method_finished,
)
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints
def archive_link(link_dir, link):
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = (
('title', should_fetch_title, fetch_title),
('favicon', should_fetch_favicon, fetch_favicon),
('wget', should_fetch_wget, fetch_wget),
('pdf', should_fetch_pdf, fetch_pdf),
('screenshot', should_fetch_screenshot, fetch_screenshot),
('dom', should_fetch_dom, fetch_dom),
('git', should_fetch_git, fetch_git),
('media', should_fetch_media, fetch_media),
('archive_org', should_fetch_archive_dot_org, archive_dot_org),
)
try:
is_new = not os.path.exists(link_dir)
if is_new:
os.makedirs(link_dir)
link = load_json_link_index(link_dir, link)
log_link_archiving_started(link_dir, link, is_new)
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
if method_name not in link['history']:
link['history'][method_name] = []
if should_run(link_dir, link):
log_archive_method_started(method_name)
result = method_function(link_dir, link)
link['history'][method_name].append(result)
stats[result['status']] += 1
log_archive_method_finished(result)
else:
stats['skipped'] += 1
# print(' ', stats)
write_link_index(link_dir, link)
patch_links_index(link)
log_link_archiving_finished(link_dir, link, is_new, stats)
except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
### Archive Method Functions
def should_fetch_title(link_dir, link):
# if link already has valid title, skip it
if link['title'] and not link['title'].lower().startswith('http'):
return False
if is_static_file(link['url']):
return False
return FETCH_TITLE
def fetch_title(link_dir, link, timeout=TIMEOUT):
"""try to guess the page's title from its content"""
output = None
cmd = [
CURL_BINARY,
link['url'],
'|',
'grep',
'<title>',
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output = fetch_page_title(link['url'], timeout=timeout, progress=False)
if not output:
raise ArchiveError('Unable to detect page title')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_favicon(link_dir, link):
if os.path.exists(os.path.join(link_dir, 'favicon.ico')):
return False
return FETCH_FAVICON
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
"""download site favicon from google's favicon api"""
output = 'favicon.ico'
cmd = [
CURL_BINARY,
'--max-time', str(timeout),
'--location',
'--output', output,
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url'])),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
chmod_file(output, cwd=link_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_wget(link_dir, link):
output_path = wget_output_path(link)
if output_path and os.path.exists(os.path.join(link_dir, output_path)):
return False
return FETCH_WGET
def fetch_wget(link_dir, link, timeout=TIMEOUT):
"""download full site using wget"""
if FETCH_WARC:
warc_dir = os.path.join(link_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
*(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()),
'--timeout={}'.format(timeout),
*(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
*(() if FETCH_WARC else ('--timestamping',)),
*(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
*(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
*(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
*(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
*((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
link['url'],
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip()
]
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
# Check for common failure cases
if result.returncode > 0 and files_downloaded < 1:
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Got an error from the server', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_pdf(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'output.pdf')):
return False
return FETCH_PDF
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
"""print PDF of site to file using chrome --headless"""
output = 'output.pdf'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link['url'],
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to print PDF', hints)
chmod_file('output.pdf', cwd=link_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_screenshot(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'screenshot.png')):
return False
return FETCH_SCREENSHOT
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
"""take screenshot of site using chrome --headless"""
output = 'screenshot.png'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link['url'],
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to take screenshot', hints)
chmod_file(output, cwd=link_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_dom(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'output.html')):
return False
return FETCH_DOM
def fetch_dom(link_dir, link, timeout=TIMEOUT):
"""print HTML of site to file using chrome --dump-html"""
output = 'output.html'
output_path = os.path.join(link_dir, output)
cmd = [
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link['url']
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
with open(output_path, 'w+') as f:
result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)
if result.returncode:
hints = result.stderr.decode()
raise ArchiveError('Failed to fetch DOM', hints)
chmod_file(output, cwd=link_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_git(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'git')):
return False
is_clonable_url = (
(domain(link['url']) in GIT_DOMAINS)
or (extension(link['url']) == 'git')
)
if not is_clonable_url:
return False
return FETCH_GIT
def fetch_git(link_dir, link, timeout=TIMEOUT):
"""download full site using git"""
output = 'git'
output_path = os.path.join(link_dir, 'git')
os.makedirs(output_path, exist_ok=True)
cmd = [
GIT_BINARY,
'clone',
'--mirror',
'--recursive',
*(() if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')),
without_query(without_fragment(link['url'])),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed git download', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_media(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'media')):
return False
return FETCH_MEDIA
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
output = 'media'
output_path = os.path.join(link_dir, 'media')
os.makedirs(output_path, exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--yes-playlist',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
link['url'],
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=link_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to download media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def should_fetch_archive_dot_org(link_dir, link):
if is_static_file(link['url']):
return False
if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SUBMIT_ARCHIVE_DOT_ORG
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
"""submit site to archive.org for archiving via their service, save returned archive url"""
output = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
cmd = [
CURL_BINARY,
'--location',
'--head',
*(('--user-agent', '{}'.format(CURL_USER_AGENT),) if CURL_USER_AGENT else ()), # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
'--max-time', str(timeout),
*(() if CHECK_SSL_VALIDITY else ('--insecure',)),
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link['url'])))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=link_dir)
output = archive_org_url
return {
'cmd': cmd,
'pwd': link_dir,
'output': output,
'status': status,
**timer.stats,
}
def parse_archive_dot_org_response(response):
# Parse archive.org response headers
headers = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers['content-location']
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors

135
archivebox/cli/__init__.py Normal file
View file

@ -0,0 +1,135 @@
__package__ = 'archivebox.cli'
__command__ = 'archivebox'
import os
import sys
import argparse
from typing import Optional, Dict, List, IO
from ..config import OUTPUT_DIR
from importlib import import_module
CLI_DIR = os.path.dirname(os.path.abspath(__file__))
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')
main_cmds = ('init', 'info', 'config')
archive_cmds = ('add', 'remove', 'update', 'list')
display_first = (*meta_cmds, *main_cmds, *archive_cmds)
# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')
# basic checks to make sure imported files are valid subcommands
is_cli_module = lambda fname: fname.startswith('archivebox_') and fname.endswith('.py')
is_valid_cli_module = lambda module, subcommand: (
all(hasattr(module, attr) for attr in required_attrs)
and module.__command__.split(' ')[-1] == subcommand
)
def list_subcommands() -> Dict[str, str]:
"""find and import all valid archivebox_<subcommand>.py files in CLI_DIR"""
COMMANDS = []
for filename in os.listdir(CLI_DIR):
if is_cli_module(filename):
subcommand = filename.replace('archivebox_', '').replace('.py', '')
module = import_module('.archivebox_{}'.format(subcommand), __package__)
assert is_valid_cli_module(module, subcommand)
COMMANDS.append((subcommand, module.main.__doc__))
globals()[subcommand] = module.main
display_order = lambda cmd: (
display_first.index(cmd[0])
if cmd[0] in display_first else
100 + len(cmd[0])
)
return dict(sorted(COMMANDS, key=display_order))
def run_subcommand(subcommand: str,
subcommand_args: List[str]=None,
stdin: Optional[IO]=None,
pwd: Optional[str]=None) -> None:
"""Run a given ArchiveBox subcommand with the given list of args"""
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
SUBCOMMANDS = list_subcommands()
class NotProvided:
pass
def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None:
args = sys.argv[1:] if args is NotProvided else args
stdin = sys.stdin if stdin is NotProvided else stdin
subcommands = list_subcommands()
parser = argparse.ArgumentParser(
prog=__command__,
description='ArchiveBox: The self-hosted internet archive',
add_help=False,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--help', '-h',
action='store_true',
help=subcommands['help'],
)
group.add_argument(
'--version',
action='store_true',
help=subcommands['version'],
)
group.add_argument(
"subcommand",
type=str,
help= "The name of the subcommand to run",
nargs='?',
choices=subcommands.keys(),
default=None,
)
parser.add_argument(
"subcommand_args",
help="Arguments for the subcommand",
nargs=argparse.REMAINDER,
)
command = parser.parse_args(args or ())
if command.help or command.subcommand is None:
command.subcommand = 'help'
elif command.version:
command.subcommand = 'version'
if command.subcommand not in ('help', 'version', 'status'):
from ..logging_util import log_cli_command
log_cli_command(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin,
pwd=pwd or OUTPUT_DIR
)
run_subcommand(
subcommand=command.subcommand,
subcommand_args=command.subcommand_args,
stdin=stdin,
pwd=pwd or OUTPUT_DIR,
)
__all__ = (
'SUBCOMMANDS',
'list_subcommands',
'run_subcommand',
*SUBCOMMANDS.keys(),
)

View file

@ -0,0 +1,92 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox add'
import sys
import argparse
from typing import List, Optional, IO
from ..main import add
from ..util import docstring
from ..config import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr
@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=add.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--update-all', #'-n',
action='store_true',
default=not ONLY_NEW, # when ONLY_NEW=True we skip updating old links
help="Also retry previously skipped/failed links when adding new links",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
help="Add the links to the main index without archiving them",
)
parser.add_argument(
'urls',
nargs='*',
type=str,
default=None,
help=(
'URLs or paths to archive e.g.:\n'
' https://getpocket.com/users/USERNAME/feed/all\n'
' https://example.com/some/rss/feed.xml\n'
' https://example.com\n'
' ~/Downloads/firefox_bookmarks_export.html\n'
' ~/Desktop/sites_list.csv\n'
)
)
parser.add_argument(
"--depth",
action="store",
default=0,
choices=[0, 1],
type=int,
help="Recursively archive all linked pages up to this many hops away"
)
command = parser.parse_args(args or ())
urls = command.urls
stdin_urls = accept_stdin(stdin)
if (stdin_urls and urls) or (not stdin and not urls):
stderr(
'[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
color='red',
)
raise SystemExit(2)
add(
urls=stdin_urls or urls,
depth=command.depth,
update_all=command.update_all,
index_only=command.index_only,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)
# TODO: Implement these
#
# parser.add_argument(
# '--mirror', #'-m',
# action='store_true',
# help='Archive an entire site (finding all linked pages below it on the same domain)',
# )
# parser.add_argument(
# '--crawler', #'-r',
# choices=('depth_first', 'breadth_first'),
# help='Controls which crawler to use in order to find outlinks in a given page',
# default=None,
# )

View file

@ -0,0 +1,61 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox config'
import sys
import argparse
from typing import Optional, List, IO
from ..main import config
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin
@docstring(config.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=config.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--get', #'-g',
action='store_true',
help="Get the value for the given config KEYs",
)
group.add_argument(
'--set', #'-s',
action='store_true',
help="Set the given KEY=VALUE config values",
)
group.add_argument(
'--reset', #'-s',
action='store_true',
help="Reset the given KEY config values to their defaults",
)
parser.add_argument(
'config_options',
nargs='*',
type=str,
help='KEY or KEY=VALUE formatted config values to get or set',
)
command = parser.parse_args(args or ())
config_options_str = accept_stdin(stdin)
config(
config_options_str=config_options_str,
config_options=command.config_options,
get=command.get,
set=command.set,
reset=command.reset,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,32 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox help'
import sys
import argparse
from typing import Optional, List, IO
from ..main import help
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(help.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=help.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
help(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,40 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox init'
import sys
import argparse
from typing import Optional, List, IO
from ..main import init
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(init.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=init.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--force', # '-f',
action='store_true',
help='Ignore unrecognized files in current directory and initialize anyway',
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
init(
force=command.force,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,120 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox list'
import sys
import argparse
from typing import Optional, List, IO
from ..main import list_all
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
from ..logging_util import SmartFormatter, accept_stdin
@docstring(list_all.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=list_all.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--csv', #'-c',
type=str,
help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
default=None,
)
group.add_argument(
'--json', #'-j',
action='store_true',
help="Print the output in JSON format with all columns included.",
)
parser.add_argument(
'--sort', #'-s',
type=str,
help="List the links sorted using the given key, e.g. timestamp or updated.",
default=None,
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only links bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only links bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--status',
type=str,
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
default='indexed',
help=(
'List only links or data directories that have the given status\n'
f' indexed {get_indexed_folders.__doc__} (the default)\n'
f' archived {get_archived_folders.__doc__}\n'
f' unarchived {get_unarchived_folders.__doc__}\n'
'\n'
f' present {get_present_folders.__doc__}\n'
f' valid {get_valid_folders.__doc__}\n'
f' invalid {get_invalid_folders.__doc__}\n'
'\n'
f' duplicate {get_duplicate_folders.__doc__}\n'
f' orphaned {get_orphaned_folders.__doc__}\n'
f' corrupted {get_corrupted_folders.__doc__}\n'
f' unrecognized {get_unrecognized_folders.__doc__}\n'
)
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'filter_patterns',
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
)
command = parser.parse_args(args or ())
filter_patterns_str = accept_stdin(stdin)
matching_folders = list_all(
filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,
after=command.after,
before=command.before,
sort=command.sort,
csv=command.csv,
json=command.json,
out_dir=pwd or OUTPUT_DIR,
)
raise SystemExit(not matching_folders)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,24 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox manage'
import sys
from typing import Optional, List, IO
from ..main import manage
from ..util import docstring
from ..config import OUTPUT_DIR
@docstring(manage.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
manage(
args=args,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,79 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox remove'
import sys
import argparse
from typing import Optional, List, IO
from ..main import remove
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin
@docstring(remove.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=remove.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--yes', # '-y',
action='store_true',
help='Remove links instantly without prompting to confirm.',
)
parser.add_argument(
'--delete', # '-r',
action='store_true',
help=(
"In addition to removing the link from the index, "
"also delete its archived content and metadata folder."
),
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only URLs bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only URLs bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'filter_patterns',
nargs='*',
type=str,
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args or ())
filter_str = accept_stdin(stdin)
remove(
filter_str=filter_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
before=command.before,
after=command.after,
yes=command.yes,
delete=command.delete,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,89 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox schedule'
import sys
import argparse
from typing import Optional, List, IO
from ..main import schedule
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(schedule.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=schedule.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help=("Don't warn about storage space."),
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
'--add', # '-a',
action='store_true',
help='Add a new scheduled ArchiveBox update job to cron',
)
parser.add_argument(
'--every', # '-e',
type=str,
default='day',
help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
)
group.add_argument(
'--clear', # '-c'
action='store_true',
help=("Stop all ArchiveBox scheduled runs (remove cron jobs)"),
)
group.add_argument(
'--show', # '-s'
action='store_true',
help=("Print a list of currently active ArchiveBox cron jobs"),
)
group.add_argument(
'--foreground', '-f',
action='store_true',
help=("Launch ArchiveBox scheduler as a long-running foreground task "
"instead of using cron."),
)
group.add_argument(
'--run-all', # '-a',
action='store_true',
help=("Run all the scheduled jobs once immediately, independent of "
"their configured schedules, can be used together with --foreground"),
)
parser.add_argument(
'import_path',
nargs='?',
type=str,
default=None,
help=("Check this path and import any new links on every run "
"(can be either local file or remote URL)"),
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
schedule(
add=command.add,
show=command.show,
clear=command.clear,
foreground=command.foreground,
run_all=command.run_all,
quiet=command.quiet,
every=command.every,
import_path=command.import_path,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,60 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox server'
import sys
import argparse
from typing import Optional, List, IO
from ..main import server
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(server.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=server.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'runserver_args',
nargs='*',
type=str,
default=None,
help='Arguments to pass to Django runserver'
)
parser.add_argument(
'--reload',
action='store_true',
help='Enable auto-reloading when code or templates change',
)
parser.add_argument(
'--debug',
action='store_true',
help='Enable DEBUG=True mode with more verbose errors',
)
parser.add_argument(
'--init',
action='store_true',
help='Run archivebox init before starting the server',
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
server(
runserver_args=command.runserver_args,
reload=command.reload,
debug=command.debug,
init=command.init,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,34 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox shell'
import sys
import argparse
from typing import Optional, List, IO
from ..main import shell
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(shell.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=shell.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
shell(
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,32 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox status'
import sys
import argparse
from typing import Optional, List, IO
from ..main import status
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(status.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=status.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.parse_args(args or ())
reject_stdin(__command__, stdin)
status(out_dir=pwd or OUTPUT_DIR)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,124 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox update'
import sys
import argparse
from typing import List, Optional, IO
from ..main import update
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_invalid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
from ..logging_util import SmartFormatter, accept_stdin
@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=update.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--only-new', #'-n',
action='store_true',
help="Don't attempt to retry previously skipped/failed links when updating",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
help="Update the main index without archiving any content",
)
parser.add_argument(
'--resume', #'-r',
type=float,
help='Resume the update process from a given timestamp',
default=None,
)
parser.add_argument(
'--overwrite', #'-x',
action='store_true',
help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
)
parser.add_argument(
'--before', #'-b',
type=float,
help="Update only links bookmarked before the given timestamp.",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="Update only links bookmarked after the given timestamp.",
default=None,
)
parser.add_argument(
'--status',
type=str,
choices=('indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid', 'duplicate', 'orphaned', 'corrupted', 'unrecognized'),
default='indexed',
help=(
'Update only links or data directories that have the given status\n'
f' indexed {get_indexed_folders.__doc__} (the default)\n'
f' archived {get_archived_folders.__doc__}\n'
f' unarchived {get_unarchived_folders.__doc__}\n'
'\n'
f' present {get_present_folders.__doc__}\n'
f' valid {get_valid_folders.__doc__}\n'
f' invalid {get_invalid_folders.__doc__}\n'
'\n'
f' duplicate {get_duplicate_folders.__doc__}\n'
f' orphaned {get_orphaned_folders.__doc__}\n'
f' corrupted {get_corrupted_folders.__doc__}\n'
f' unrecognized {get_unrecognized_folders.__doc__}\n'
)
)
parser.add_argument(
'--filter-type',
type=str,
choices=('exact', 'substring', 'domain', 'regex'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
parser.add_argument(
'filter_patterns',
nargs='*',
type=str,
default=None,
help='Update only URLs matching these filter patterns.'
)
command = parser.parse_args(args or ())
filter_patterns_str = accept_stdin(stdin)
update(
resume=command.resume,
only_new=command.only_new,
index_only=command.index_only,
overwrite=command.overwrite,
filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,
after=command.after,
before=command.before,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

View file

@ -0,0 +1,40 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
__command__ = 'archivebox version'
import sys
import argparse
from typing import Optional, List, IO
from ..main import version
from ..util import docstring
from ..config import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin
@docstring(version.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
parser = argparse.ArgumentParser(
prog=__command__,
description=version.__doc__,
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--quiet', '-q',
action='store_true',
help='Only print ArchiveBox version number and nothing else.',
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
version(
quiet=command.quiet,
out_dir=pwd or OUTPUT_DIR,
)
if __name__ == '__main__':
main(args=sys.argv[1:], stdin=sys.stdin)

226
archivebox/cli/tests.py Executable file
View file

@ -0,0 +1,226 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
import os
import sys
import shutil
import unittest
from contextlib import contextmanager
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'OUTPUT_DIR': 'data.tests',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_TITLE': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
}
OUTPUT_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from ..index import load_main_index
from ..config import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from . import (
archivebox_init,
archivebox_add,
archivebox_remove,
)
HIDE_CLI_OUTPUT = True
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
stdout = sys.stdout
stderr = sys.stderr
@contextmanager
def output_hidden(show_failing=True):
if not HIDE_CLI_OUTPUT:
yield
return
sys.stdout = open('stdout.txt', 'w+')
sys.stderr = open('stderr.txt', 'w+')
try:
yield
sys.stdout.close()
sys.stderr.close()
sys.stdout = stdout
sys.stderr = stderr
except:
sys.stdout.close()
sys.stderr.close()
sys.stdout = stdout
sys.stderr = stderr
if show_failing:
with open('stdout.txt', 'r') as f:
print(f.read())
with open('stderr.txt', 'r') as f:
print(f.read())
raise
finally:
os.remove('stdout.txt')
os.remove('stderr.txt')
class TestInit(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
def tearDown(self):
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_basic_init(self):
with output_hidden():
archivebox_init.main([])
assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
def test_conflicting_init(self):
with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
f.write('test')
try:
with output_hidden(show_failing=False):
archivebox_init.main([])
assert False, 'Init should have exited with an exception'
except SystemExit:
pass
assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
try:
load_main_index(out_dir=OUTPUT_DIR)
assert False, 'load_main_index should raise an exception when no index is present'
except:
pass
def test_no_dirty_state(self):
with output_hidden():
init()
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
with output_hidden():
init()
class TestAdd(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
with output_hidden():
init()
def tearDown(self):
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_add_arg_url(self):
with output_hidden():
archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 30
def test_add_arg_file(self):
test_file = os.path.join(OUTPUT_DIR, 'test.txt')
with open(test_file, 'w+') as f:
f.write(test_urls)
with output_hidden():
archivebox_add.main([test_file])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
os.remove(test_file)
def test_add_stdin_url(self):
with output_hidden():
archivebox_add.main([], stdin=test_urls)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
class TestRemove(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
with output_hidden():
init()
archivebox_add.main([], stdin=test_urls)
# def tearDown(self):
# shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_remove_exact(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 11
def test_remove_regex(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 4
def test_remove_domain(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 10
def test_remove_none(self):
try:
with output_hidden(show_failing=False):
archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
assert False, 'Should raise if no URLs match'
except:
pass
if __name__ == '__main__':
if '--verbose' in sys.argv or '-v' in sys.argv:
HIDE_CLI_OUTPUT = False
unittest.main()

View file

@ -1,278 +0,0 @@
import os
import re
import sys
import shutil
from subprocess import run, PIPE, DEVNULL
# ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
# env USE_COLOR=True CHROME_BINARY=google-chrome ./archive export.html
# ******************************************************************************
IS_TTY = sys.stdout.isatty()
USE_COLOR = os.getenv('USE_COLOR', str(IS_TTY) ).lower() == 'true'
SHOW_PROGRESS = os.getenv('SHOW_PROGRESS', str(IS_TTY) ).lower() == 'true'
ONLY_NEW = os.getenv('ONLY_NEW', 'False' ).lower() == 'true'
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755' )
FOOTER_INFO = os.getenv('FOOTER_INFO', 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',)
FETCH_WGET = os.getenv('FETCH_WGET', 'True' ).lower() == 'true'
FETCH_WGET_REQUISITES = os.getenv('FETCH_WGET_REQUISITES', 'True' ).lower() == 'true'
FETCH_PDF = os.getenv('FETCH_PDF', 'True' ).lower() == 'true'
FETCH_SCREENSHOT = os.getenv('FETCH_SCREENSHOT', 'True' ).lower() == 'true'
FETCH_DOM = os.getenv('FETCH_DOM', 'True' ).lower() == 'true'
FETCH_WARC = os.getenv('FETCH_WARC', 'True' ).lower() == 'true'
FETCH_GIT = os.getenv('FETCH_GIT', 'True' ).lower() == 'true'
FETCH_MEDIA = os.getenv('FETCH_MEDIA', 'True' ).lower() == 'true'
FETCH_FAVICON = os.getenv('FETCH_FAVICON', 'True' ).lower() == 'true'
FETCH_TITLE = os.getenv('FETCH_TITLE', 'True' ).lower() == 'true'
SUBMIT_ARCHIVE_DOT_ORG = os.getenv('SUBMIT_ARCHIVE_DOT_ORG', 'True' ).lower() == 'true'
CHECK_SSL_VALIDITY = os.getenv('CHECK_SSL_VALIDITY', 'True' ).lower() == 'true'
RESOLUTION = os.getenv('RESOLUTION', '1440,2000' )
RESTRICT_FILE_NAMES = os.getenv('RESTRICT_FILE_NAMES', 'windows' )
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
CHROME_HEADLESS = os.getenv('CHROME_HEADLESS', 'True' ).lower() == 'true'
CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')
CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
GIT_BINARY = os.getenv('GIT_BINARY', 'git')
WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
CHROME_BINARY = os.getenv('CHROME_BINARY', None)
URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)
try:
OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
except Exception:
OUTPUT_DIR = None
# ******************************************************************************
# **************************** Derived Settings ********************************
# ******************************************************************************
REPO_DIR = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..'))
if not OUTPUT_DIR:
OUTPUT_DIR = os.path.join(REPO_DIR, 'output')
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)
PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')
CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
WGET_AUTO_COMPRESSION = USE_WGET and WGET_BINARY and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode)
URL_BLACKLIST = URL_BLACKLIST and re.compile(URL_BLACKLIST, re.IGNORECASE)
########################### Environment & Dependencies #########################
try:
### Terminal Configuration
TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
ANSI = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
'lightred': '\033[01;35m',
'red': '\033[01;31m',
'green': '\033[01;32m',
'blue': '\033[01;34m',
'white': '\033[01;37m',
'black': '\033[01;30m',
}
if not USE_COLOR:
# dont show colors if USE_COLOR is False
ANSI = {k: '' for k in ANSI.keys()}
if not CHROME_BINARY:
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
default_executable_paths = (
'chromium-browser',
'chromium',
'chrome',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'google-chrome',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'google-chrome-stable',
'google-chrome-beta',
'google-chrome-canary',
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
'google-chrome-unstable',
'google-chrome-dev',
)
for name in default_executable_paths:
full_path_exists = shutil.which(name)
if full_path_exists:
CHROME_BINARY = name
break
else:
CHROME_BINARY = 'chromium-browser'
# print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
if CHROME_USER_DATA_DIR is None:
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
CHROME_USER_DATA_DIR = full_path
break
# print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
CHROME_OPTIONS = {
'TIMEOUT': TIMEOUT,
'RESOLUTION': RESOLUTION,
'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY,
'CHROME_BINARY': CHROME_BINARY,
'CHROME_HEADLESS': CHROME_HEADLESS,
'CHROME_SANDBOX': CHROME_SANDBOX,
'CHROME_USER_AGENT': CHROME_USER_AGENT,
'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR,
}
### Check Python environment
python_vers = float('{}.{}'.format(sys.version_info.major, sys.version_info.minor))
if python_vers < 3.5:
print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(1)
if sys.stdout.encoding.upper() not in ('UTF-8', 'UTF8'):
print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:')
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
print('')
print(' Alternatively, run this script with:')
print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')
### Get code version by parsing git log
GIT_SHA = 'unknown'
try:
GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
except Exception:
print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')
### Get absolute path for cookies file
try:
COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
except Exception:
print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
raise
### Make sure curl is installed
if FETCH_FAVICON or FETCH_TITLE or SUBMIT_ARCHIVE_DOT_ORG:
if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
CURL_USER_AGENT = CURL_USER_AGENT.format(GIT_SHA=GIT_SHA[:9])
### Make sure wget is installed and calculate version
if FETCH_WGET or FETCH_WARC:
if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
WGET_VERSION = 'unknown'
try:
wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
except Exception:
if USE_WGET:
print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')
WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)
### Make sure chrome is installed and calculate version
if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
# parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
try:
result = run([CHROME_BINARY, '--version'], stdout=PIPE)
version_str = result.stdout.decode('utf-8')
version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
version = [l for l in version_lines if l.isdigit()][-1]
if int(version) < 59:
print(version_lines)
print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except (IndexError, TypeError, OSError):
print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
CHROME_VERSION = 'unknown'
try:
chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
except Exception:
if USE_CHROME:
print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')
### Make sure git is installed
if FETCH_GIT:
if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
### Make sure youtube-dl is installed
if FETCH_MEDIA:
if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
raise SystemExit(1)
except KeyboardInterrupt:
raise SystemExit(1)
except:
print('[X] There was an error during the startup procedure, your archive data is unaffected.')
raise

View file

@ -0,0 +1,874 @@
__package__ = 'archivebox.config'
import os
import io
import re
import sys
import django
import getpass
import shutil
from hashlib import md5
from pathlib import Path
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
from collections import defaultdict
from .stubs import (
SimpleConfigValueDict,
ConfigValue,
ConfigDict,
ConfigDefaultValue,
ConfigDefaultDict,
)
# precedence order for config:
# 1. cli args
# 2. shell environment vars
# 3. config file
# 4. defaults
# env USE_COLO=false archivebox add '...'
# env SHOW_PROGRESS=1 archivebox add '...'
# ******************************************************************************
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
# ******************************************************************************
################################# User Config ##################################
CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'SHELL_CONFIG': {
'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
},
'GENERAL_CONFIG': {
'OUTPUT_DIR': {'type': str, 'default': None},
'CONFIG_FILE': {'type': str, 'default': None},
'ONLY_NEW': {'type': bool, 'default': True},
'TIMEOUT': {'type': int, 'default': 60},
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
'URL_BLACKLIST': {'type': str, 'default': None},
},
'SERVER_CONFIG': {
'SECRET_KEY': {'type': str, 'default': None},
'ALLOWED_HOSTS': {'type': str, 'default': '*'},
'DEBUG': {'type': bool, 'default': False},
'PUBLIC_INDEX': {'type': bool, 'default': True},
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
'ACTIVE_THEME': {'type': str, 'default': 'default'},
},
'ARCHIVE_METHOD_TOGGLES': {
'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
'SAVE_PLAYLISTS': {'type': bool, 'default': True, 'aliases': ('FETCH_PLAYLISTS',)},
'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
},
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': True},
},
'DEPENDENCY_CONFIG': {
'USE_CURL': {'type': bool, 'default': True},
'USE_WGET': {'type': bool, 'default': True},
'USE_GIT': {'type': bool, 'default': True},
'USE_CHROME': {'type': bool, 'default': True},
'USE_YOUTUBEDL': {'type': bool, 'default': True},
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'CHROME_BINARY': {'type': str, 'default': None},
},
}
CONFIG_ALIASES = {
alias: key
for section in CONFIG_DEFAULTS.values()
for key, default in section.items()
for alias in default.get('aliases', ())
}
USER_CONFIG = {key for section in CONFIG_DEFAULTS.values() for key in section.keys()}
def get_real_name(key: str) -> str:
return CONFIG_ALIASES.get(key.upper().strip(), key.upper().strip())
############################## Derived Config ##############################
# Constants
DEFAULT_CLI_COLORS = {
'reset': '\033[00;00m',
'lightblue': '\033[01;30m',
'lightyellow': '\033[01;33m',
'lightred': '\033[01;35m',
'red': '\033[01;31m',
'green': '\033[01;32m',
'blue': '\033[01;34m',
'white': '\033[01;37m',
'black': '\033[01;30m',
}
ANSI = {k: '' for k in DEFAULT_CLI_COLORS.keys()}
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'00': [(0, 0, 0), (0, 0, 0)],
'30': [(0, 0, 0), (0, 0, 0)],
'31': [(255, 0, 0), (128, 0, 0)],
'32': [(0, 200, 0), (0, 128, 0)],
'33': [(255, 255, 0), (128, 128, 0)],
'34': [(0, 0, 255), (0, 0, 128)],
'35': [(255, 0, 255), (128, 0, 128)],
'36': [(0, 255, 255), (0, 128, 128)],
'37': [(255, 255, 255), (255, 255, 255)],
})
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
TEMPLATES_DIR_NAME = 'themes'
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
STATIC_DIR_NAME = 'static'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'
CONFIG_HEADER = (
"""# This is the config file for your ArchiveBox collection.
#
# You can add options here manually in INI format, or automatically by running:
# archivebox config --set KEY=VALUE
#
# If you modify this file manually, make sure to update your archive after by running:
# archivebox init
#
# A list of all possible config with documentation and examples can be found here:
# https://github.com/pirate/ArchiveBox/wiki/Configuration
""")
DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},
'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},
'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
'SOURCES_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
'LOGS_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else os.path.join(c['OUTPUT_DIR'], CONFIG_FILENAME)},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE)},
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
'VERSION': {'default': lambda c: open(os.path.join(c['PYTHON_DIR'], VERSION_FILENAME), 'r').read().strip()},
'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['FETCH_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'SAVE_PLAYLISTS': {'default': lambda c: c['SAVE_PLAYLISTS'] and c['SAVE_MEDIA']},
'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'])},
'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
################################### Helpers ####################################
def load_config_val(key: str,
default: ConfigDefaultValue=None,
type: Optional[Type]=None,
aliases: Optional[Tuple[str, ...]]=None,
config: Optional[ConfigDict]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
"""parse bool, int, and str key=value pairs from env"""
config_keys_to_check = (key, *(aliases or ()))
for key in config_keys_to_check:
if env_vars:
val = env_vars.get(key)
if val:
break
if config_file_vars:
val = config_file_vars.get(key)
if val:
break
if type is None or val is None:
if callable(default):
assert isinstance(config, dict)
return default(config)
return default
elif type is bool:
if val.lower() in ('true', 'yes', '1'):
return True
elif val.lower() in ('false', 'no', '0'):
return False
else:
raise ValueError(f'Invalid configuration option {key}={val} (expected a boolean: True/False)')
elif type is str:
if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'):
raise ValueError(f'Invalid configuration option {key}={val} (expected a string)')
return val.strip()
elif type is int:
if not val.isdigit():
raise ValueError(f'Invalid configuration option {key}={val} (expected an integer)')
return int(val)
raise Exception('Config values can only be str, bool, or int')
def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
config_path = os.path.join(out_dir, CONFIG_FILENAME)
if os.path.exists(config_path):
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(config_path)
# flatten into one namespace
config_file_vars = {
key.upper(): val
for section, options in config_file.items()
for key, val in options.items()
}
# print('[i] Loaded config file', os.path.abspath(config_path))
# print(config_file_vars)
return config_file_vars
return None
def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
from ..system import atomic_write
out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
config_path = os.path.join(out_dir, CONFIG_FILENAME)
if not os.path.exists(config_path):
atomic_write(config_path, CONFIG_HEADER)
config_file = ConfigParser()
config_file.optionxform = str
config_file.read(config_path)
with open(config_path, 'r') as old:
atomic_write(f'{config_path}.bak', old.read())
find_section = lambda key: [name for name, opts in CONFIG_DEFAULTS.items() if key in opts][0]
# Set up sections in empty config file
for key, val in config.items():
section = find_section(key)
if section in config_file:
existing_config = dict(config_file[section])
else:
existing_config = {}
config_file[section] = {**existing_config, key: val}
# always make sure there's a SECRET_KEY defined for Django
existing_secret_key = None
if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']
if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
from django.utils.crypto import get_random_string
chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
random_secret_key = get_random_string(50, chars)
if 'SERVER_CONFIG' in config_file:
config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
else:
config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
with open(config_path, 'w+') as new:
config_file.write(new)
try:
# validate the config by attempting to re-parse it
CONFIG = load_all_config()
return {
key.upper(): CONFIG.get(key.upper())
for key in config.keys()
}
except:
# something went horribly wrong, rever to the previous version
with open(f'{config_path}.bak', 'r') as old:
atomic_write(config_path, old.read())
if os.path.exists(f'{config_path}.bak'):
os.remove(f'{config_path}.bak')
return {}
def load_config(defaults: ConfigDefaultDict,
config: Optional[ConfigDict]=None,
out_dir: Optional[str]=None,
env_vars: Optional[os._Environ]=None,
config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
env_vars = env_vars or os.environ
config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)
extended_config: ConfigDict = config.copy() if config else {}
for key, default in defaults.items():
try:
extended_config[key] = load_config_val(
key,
default=default['default'],
type=default.get('type'),
aliases=default.get('aliases'),
config=extended_config,
env_vars=env_vars,
config_file_vars=config_file_vars,
)
except KeyboardInterrupt:
raise SystemExit(0)
except Exception as e:
stderr()
stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
stderr(' {}: {}'.format(e.__class__.__name__, e))
stderr()
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
stderr()
stderr(' For config documentation and examples see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
stderr()
raise
raise SystemExit(2)
return extended_config
# def write_config(config: ConfigDict):
# with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
def stderr(*args, color: Optional[str]=None, config: Optional[ConfigDict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
else:
strs = [' '.join(str(a) for a in args), '\n']
sys.stderr.write(''.join(strs))
def bin_version(binary: Optional[str]) -> Optional[str]:
"""check the presence and return valid version line of a specified binary"""
abspath = bin_path(binary)
if not abspath:
return None
try:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
except Exception:
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
# stderr(' Make sure it\'s installed, then confirm it\'s working by running:')
# stderr(f' {binary} --version')
# stderr()
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
# stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
# stderr()
return None
def bin_path(binary: Optional[str]) -> Optional[str]:
if binary is None:
return None
return shutil.which(os.path.expanduser(binary)) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None:
return None
abs_path = bin_path(binary)
if abs_path is None or not Path(abs_path).exists():
return None
file_hash = md5()
with io.open(abs_path, mode='rb') as f:
for chunk in iter(lambda: f.read(io.DEFAULT_BUFFER_SIZE), b''):
file_hash.update(chunk)
return f'md5:{file_hash.hexdigest()}'
def find_chrome_binary() -> Optional[str]:
"""find any installed chrome binaries in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_executable_paths = (
'chromium-browser',
'chromium',
'/Applications/Chromium.app/Contents/MacOS/Chromium',
'chrome',
'google-chrome',
'/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
'google-chrome-stable',
'google-chrome-beta',
'google-chrome-canary',
'/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
'google-chrome-unstable',
'google-chrome-dev',
)
for name in default_executable_paths:
full_path_exists = shutil.which(name)
if full_path_exists:
return name
stderr('[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?', color='red')
stderr()
return None
def find_chrome_data_dir() -> Optional[str]:
"""find any installed chrome user data directories in the default locations"""
# Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
# make sure data dir finding precedence order always matches binary finding order
default_profile_paths = (
'~/.config/chromium',
'~/Library/Application Support/Chromium',
'~/AppData/Local/Chromium/User Data',
'~/.config/chrome',
'~/.config/google-chrome',
'~/Library/Application Support/Google/Chrome',
'~/AppData/Local/Google/Chrome/User Data',
'~/.config/google-chrome-stable',
'~/.config/google-chrome-beta',
'~/Library/Application Support/Google/Chrome Canary',
'~/AppData/Local/Google/Chrome SxS/User Data',
'~/.config/google-chrome-unstable',
'~/.config/google-chrome-dev',
)
for path in default_profile_paths:
full_path = os.path.expanduser(path)
if os.path.exists(full_path):
return full_path
return None
def wget_supports_compression(config):
cmd = [
config['WGET_BINARY'],
"--compression=auto",
"--help",
]
return not run(cmd, stdout=DEVNULL, stderr=DEVNULL).returncode
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
return {
'REPO_DIR': {
'path': os.path.abspath(config['REPO_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['REPO_DIR'], 'archivebox')),
},
'PYTHON_DIR': {
'path': os.path.abspath(config['PYTHON_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['PYTHON_DIR'], '__main__.py')),
},
'TEMPLATES_DIR': {
'path': os.path.abspath(config['TEMPLATES_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['TEMPLATES_DIR'], 'static')),
},
}
def get_external_locations(config: ConfigDict) -> ConfigValue:
abspath = lambda path: None if path is None else os.path.abspath(path)
return {
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
},
'COOKIES_FILE': {
'path': abspath(config['COOKIES_FILE']),
'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
'is_valid': False if config['COOKIES_FILE'] is None else os.path.exists(config['COOKIES_FILE']),
},
}
def get_data_locations(config: ConfigDict) -> ConfigValue:
return {
'OUTPUT_DIR': {
'path': os.path.abspath(config['OUTPUT_DIR']),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
},
'SOURCES_DIR': {
'path': os.path.abspath(config['SOURCES_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['SOURCES_DIR']),
},
'LOGS_DIR': {
'path': os.path.abspath(config['LOGS_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['LOGS_DIR']),
},
'ARCHIVE_DIR': {
'path': os.path.abspath(config['ARCHIVE_DIR']),
'enabled': True,
'is_valid': os.path.exists(config['ARCHIVE_DIR']),
},
'CONFIG_FILE': {
'path': os.path.abspath(config['CONFIG_FILE']),
'enabled': True,
'is_valid': os.path.exists(config['CONFIG_FILE']),
},
'SQL_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], SQL_INDEX_FILENAME)),
},
'JSON_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], JSON_INDEX_FILENAME)),
},
'HTML_INDEX': {
'path': os.path.abspath(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
'enabled': True,
'is_valid': os.path.exists(os.path.join(config['OUTPUT_DIR'], HTML_INDEX_FILENAME)),
},
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
return {
'PYTHON_BINARY': {
'path': bin_path(config['PYTHON_BINARY']),
'version': config['PYTHON_VERSION'],
'hash': bin_hash(config['PYTHON_BINARY']),
'enabled': True,
'is_valid': bool(config['DJANGO_VERSION']),
},
'DJANGO_BINARY': {
'path': bin_path(config['DJANGO_BINARY']),
'version': config['DJANGO_VERSION'],
'hash': bin_hash(config['DJANGO_BINARY']),
'enabled': True,
'is_valid': bool(config['DJANGO_VERSION']),
},
'CURL_BINARY': {
'path': bin_path(config['CURL_BINARY']),
'version': config['CURL_VERSION'],
'hash': bin_hash(config['PYTHON_BINARY']),
'enabled': config['USE_CURL'],
'is_valid': bool(config['CURL_VERSION']),
},
'WGET_BINARY': {
'path': bin_path(config['WGET_BINARY']),
'version': config['WGET_VERSION'],
'hash': bin_hash(config['WGET_BINARY']),
'enabled': config['USE_WGET'],
'is_valid': bool(config['WGET_VERSION']),
},
'GIT_BINARY': {
'path': bin_path(config['GIT_BINARY']),
'version': config['GIT_VERSION'],
'hash': bin_hash(config['GIT_BINARY']),
'enabled': config['USE_GIT'],
'is_valid': bool(config['GIT_VERSION']),
},
'YOUTUBEDL_BINARY': {
'path': bin_path(config['YOUTUBEDL_BINARY']),
'version': config['YOUTUBEDL_VERSION'],
'hash': bin_hash(config['YOUTUBEDL_BINARY']),
'enabled': config['USE_YOUTUBEDL'],
'is_valid': bool(config['YOUTUBEDL_VERSION']),
},
'CHROME_BINARY': {
'path': bin_path(config['CHROME_BINARY']),
'version': config['CHROME_VERSION'],
'hash': bin_hash(config['CHROME_BINARY']),
'enabled': config['USE_CHROME'],
'is_valid': bool(config['CHROME_VERSION']),
},
}
def get_chrome_info(config: ConfigDict) -> ConfigValue:
return {
'TIMEOUT': config['TIMEOUT'],
'RESOLUTION': config['RESOLUTION'],
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
'CHROME_BINARY': config['CHROME_BINARY'],
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
'CHROME_USER_DATA_DIR': config['CHROME_USER_DATA_DIR'],
}
################################## Load Config #################################
def load_all_config():
CONFIG: ConfigDict = {}
for section_name, section_config in CONFIG_DEFAULTS.items():
CONFIG = load_config(section_config, CONFIG)
return load_config(DERIVED_CONFIG_DEFAULTS, CONFIG)
CONFIG = load_all_config()
globals().update(CONFIG)
############################## Importable Checkers #############################
def check_system_config(config: ConfigDict=CONFIG) -> None:
### Check system environment
if config['USER'] == 'root':
stderr('[!] ArchiveBox should never be run as root!', color='red')
stderr(' For more information, see the security overview documentation:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
raise SystemExit(2)
### Check Python environment
if sys.version_info[:3] < (3, 6, 0):
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
raise SystemExit(2)
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
stderr('')
stderr(' Confirm that it\'s fixed by opening a new shell and running:')
stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
raise SystemExit(2)
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None:
if not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')):
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
stderr(' For more info see:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
if 'Default' in config['CHROME_USER_DATA_DIR']:
stderr()
stderr(' Try removing /Default from the end e.g.:')
stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
raise SystemExit(2)
def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
invalid = [
'{}: {} ({})'.format(name, info['path'] or 'unable to find binary', info['version'] or 'unable to detect version')
for name, info in config['DEPENDENCIES'].items()
if info['enabled'] and not info['is_valid']
]
if invalid:
stderr('[X] Missing some required dependencies.', color='red')
stderr()
stderr(' {}'.format('\n '.join(invalid)))
if show_help:
stderr()
stderr(' To get more info on dependency status run:')
stderr(' archivebox --version')
raise SystemExit(2)
if config['TIMEOUT'] < 5:
stderr()
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
stderr()
stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
stderr()
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
stderr()
stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
stderr(' (Setting it somewhere over 60 seconds is recommended)')
stderr()
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
output_dir = out_dir or config['OUTPUT_DIR']
assert isinstance(output_dir, str)
json_index_exists = os.path.exists(os.path.join(output_dir, JSON_INDEX_FILENAME))
if not json_index_exists:
stderr('[X] No archivebox index found in the current directory.', color='red')
stderr(f' {output_dir}', color='lightyellow')
stderr()
stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
stderr(' cd path/to/your/archive/folder')
stderr(' archivebox [command]')
stderr()
stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
stderr(' archivebox init')
raise SystemExit(2)
sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
from ..index.sql import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
if (not sql_index_exists) or pending_migrations:
if sql_index_exists:
pending_operation = f'apply the {len(pending_migrations)} pending migrations'
else:
pending_operation = 'generate the new SQL main index'
stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
stderr(f' {output_dir}')
stderr()
stderr(f' To upgrade it to the latest version and {pending_operation} run:')
stderr(' archivebox init')
raise SystemExit(3)
sources_dir = os.path.join(output_dir, SOURCES_DIR_NAME)
if not os.path.exists(sources_dir):
os.makedirs(sources_dir)
def setup_django(out_dir: str=None, check_db=False, config: ConfigDict=CONFIG) -> None:
check_system_config()
output_dir = out_dir or config['OUTPUT_DIR']
assert isinstance(output_dir, str) and isinstance(config['PYTHON_DIR'], str)
try:
import django
sys.path.append(config['PYTHON_DIR'])
os.environ.setdefault('OUTPUT_DIR', output_dir)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()
if check_db:
sql_index_path = os.path.join(output_dir, SQL_INDEX_FILENAME)
assert os.path.exists(sql_index_path), (
f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {config["OUTPUT_DIR"]}')
except KeyboardInterrupt:
raise SystemExit(2)
os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821

131
archivebox/config/stubs.py Normal file
View file

@ -0,0 +1,131 @@
from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any
from mypy_extensions import TypedDict
SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
class BaseConfig(TypedDict):
pass
class ConfigDict(BaseConfig, total=False):
"""
# Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
print('class ConfigDict(BaseConfig, total=False):')
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
for section, configs in CONFIG_DEFAULTS.items():
for key, attrs in configs.items():
Type, default = attrs['type'], attrs['default']
if default is None:
print(f' {key}: Optional[{Type.__name__}]')
else:
print(f' {key}: {Type.__name__}')
print()
"""
IS_TTY: bool
USE_COLOR: bool
SHOW_PROGRESS: bool
OUTPUT_DIR: str
CONFIG_FILE: str
ONLY_NEW: bool
TIMEOUT: int
MEDIA_TIMEOUT: int
OUTPUT_PERMISSIONS: str
URL_BLACKLIST: Optional[str]
SECRET_KEY: str
ALLOWED_HOSTS: str
DEBUG: bool
PUBLIC_INDEX: bool
PUBLIC_SNAPSHOTS: bool
FOOTER_INFO: str
ACTIVE_THEME: str
SAVE_TITLE: bool
SAVE_FAVICON: bool
SAVE_WGET: bool
SAVE_WGET_REQUISITES: bool
SAVE_PDF: bool
SAVE_SCREENSHOT: bool
SAVE_DOM: bool
SAVE_WARC: bool
SAVE_GIT: bool
SAVE_MEDIA: bool
SAVE_PLAYLISTS: bool
SAVE_ARCHIVE_DOT_ORG: bool
RESOLUTION: str
GIT_DOMAINS: str
CHECK_SSL_VALIDITY: bool
CURL_USER_AGENT: str
WGET_USER_AGENT: str
CHROME_USER_AGENT: str
COOKIES_FILE: Optional[str]
CHROME_USER_DATA_DIR: Optional[str]
CHROME_HEADLESS: bool
CHROME_SANDBOX: bool
USE_CURL: bool
USE_WGET: bool
USE_GIT: bool
USE_CHROME: bool
USE_YOUTUBEDL: bool
CURL_BINARY: Optional[str]
GIT_BINARY: Optional[str]
WGET_BINARY: Optional[str]
YOUTUBEDL_BINARY: Optional[str]
CHROME_BINARY: Optional[str]
TERM_WIDTH: Callable[[], int]
USER: str
ANSI: Dict[str, str]
REPO_DIR: str
PYTHON_DIR: str
TEMPLATES_DIR: str
ARCHIVE_DIR: str
SOURCES_DIR: str
LOGS_DIR: str
URL_BLACKLIST_PTN: Optional[Pattern]
WGET_AUTO_COMPRESSION: bool
ARCHIVEBOX_BINARY: str
VERSION: str
GIT_SHA: str
PYTHON_BINARY: str
PYTHON_ENCODING: str
PYTHON_VERSION: str
DJANGO_BINARY: str
DJANGO_VERSION: str
CURL_VERSION: str
WGET_VERSION: str
YOUTUBEDL_VERSION: str
GIT_VERSION: str
CHROME_VERSION: str
DEPENDENCIES: Dict[str, SimpleConfigValueDict]
CODE_LOCATIONS: Dict[str, SimpleConfigValueDict]
CONFIG_LOCATIONS: Dict[str, SimpleConfigValueDict]
DATA_LOCATIONS: Dict[str, SimpleConfigValueDict]
CHROME_OPTIONS: Dict[str, SimpleConfigValue]
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
ConfigDefault = TypedDict('ConfigDefault', {
'default': ConfigDefaultValue,
'type': Optional[Type],
'aliases': Optional[Tuple[str, ...]],
}, total=False)
ConfigDefaultDict = Dict[str, ConfigDefault]

View file

@ -0,0 +1 @@
__package__ = 'archivebox.core'

200
archivebox/core/admin.py Normal file
View file

@ -0,0 +1,200 @@
__package__ = 'archivebox.core'
from io import StringIO
from contextlib import redirect_stdout
from pathlib import Path
from django.contrib import admin
from django.urls import path
from django.utils.html import format_html
from django.utils.safestring import mark_safe
from django.shortcuts import render, redirect
from django.contrib.auth import get_user_model
from core.models import Snapshot
from core.forms import AddLinkForm
from util import htmldecode, urldecode, ansi_to_html
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
def update_snapshots(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Archive"
def update_titles(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title',), out_dir=OUTPUT_DIR)
update_titles.short_description = "Pull title"
def overwrite_snapshots(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Re-archive (overwrite)"
def verify_snapshots(modeladmin, request, queryset):
for snapshot in queryset:
print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
verify_snapshots.short_description = "Check"
def delete_snapshots(modeladmin, request, queryset):
remove(links=[snapshot.as_link() for snapshot in queryset], yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
class SnapshotAdmin(admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size', 'updated')
sort_fields = ('title_str', 'url_str', 'added', 'updated')
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
search_fields = ('url', 'timestamp', 'title', 'tags')
fields = ('title', 'tags', *readonly_fields)
list_filter = ('added', 'updated', 'tags')
ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
def id_str(self, obj):
return format_html(
'<code style="font-size: 10px">{}</code>',
obj.url_hash[:8],
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
format_html('<span>{}</span>', tag.strip())
for tag in obj.tags.split(',')
) if obj.tags else ''
return format_html(
'<a href="/{}">'
'<img src="/{}/{}" class="favicon" onerror="this.remove()">'
'</a>'
'<a href="/{}/{}">'
'<b class="status-{}">{}</b>'
'</a>',
obj.archive_path,
obj.archive_path, canon['favicon_path'],
obj.archive_path, canon['wget_path'] or '',
'fetched' if obj.latest_title or obj.title else 'pending',
urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'
) + mark_safe(f'<span class="tags">{tags}</span>')
def files(self, obj):
link = obj.as_link()
canon = link.canonical_outputs()
out_dir = Path(link.link_dir)
link_tuple = lambda link, method: (link.archive_path, canon[method], canon[method] and (out_dir / canon[method]).exists())
return format_html(
'<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
'<a href="/{}/{}/" class="exists-{}" title="Wget clone">🌐 </a> '
'<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
'<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
'<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
'<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
'<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
'<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
'<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
'</span>',
*link_tuple(link, 'wget_path'),
*link_tuple(link, 'pdf_path'),
*link_tuple(link, 'screenshot_path'),
*link_tuple(link, 'dom_path'),
*link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
*link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
*link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
)
def size(self, obj):
return format_html(
'<a href="/{}" title="View all files">{}</a>',
obj.archive_path,
printable_filesize(obj.archive_size) if obj.archive_size else 'pending',
)
def url_str(self, obj):
return format_html(
'<a href="{}">{}</a>',
obj.url,
obj.url.split('://www.', 1)[-1].split('://', 1)[-1][:64],
)
id_str.short_description = 'ID'
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
id_str.admin_order_field = 'id'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
index_title = 'Links'
site_title = 'Index'
def get_urls(self):
return [
path('core/snapshot/add/', self.add_view, name='Add'),
] + super().get_urls()
def add_view(self, request):
if not request.user.is_authenticated:
return redirect(f'/admin/login/?next={request.path}')
request.current_app = self.name
context = {
**self.each_context(request),
'title': 'Add URLs',
}
if request.method == 'GET':
context['form'] = AddLinkForm()
elif request.method == 'POST':
form = AddLinkForm(request.POST)
if form.is_valid():
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
depth = 0 if form.cleaned_data["depth"] == "0" else 1
input_kwargs = {
"urls": url,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,
}
add_stdout = StringIO()
with redirect_stdout(add_stdout):
add(**input_kwargs)
print(add_stdout.getvalue())
context.update({
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
"form": AddLinkForm()
})
else:
context["form"] = form
return render(template_name='add_links.html', request=request, context=context)
admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.disable_action('delete_selected')

5
archivebox/core/apps.py Normal file
View file

@ -0,0 +1,5 @@
from django.apps import AppConfig
class CoreConfig(AppConfig):
name = 'core'

14
archivebox/core/forms.py Normal file
View file

@ -0,0 +1,14 @@
__package__ = 'archivebox.core'
from django import forms
from ..util import URL_REGEX
CHOICES = (
('0', 'depth = 0 (archive just these URLs)'),
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
)
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')

View file

@ -0,0 +1,18 @@
__package__ = 'archivebox'
from django.core.management.base import BaseCommand
from .cli import run_subcommand
class Command(BaseCommand):
help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'
def add_arguments(self, parser):
parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')
def handle(self, *args, **kwargs):
run_subcommand(kwargs['subcommand'], args=kwargs['command_args'])

View file

@ -0,0 +1,27 @@
# Generated by Django 2.2 on 2019-05-01 03:27
from django.db import migrations, models
import uuid
class Migration(migrations.Migration):
initial = True
dependencies = [
]
operations = [
migrations.CreateModel(
name='Snapshot',
fields=[
('id', models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False)),
('url', models.URLField(unique=True)),
('timestamp', models.CharField(default=None, max_length=32, null=True, unique=True)),
('title', models.CharField(default=None, max_length=128, null=True)),
('tags', models.CharField(default=None, max_length=256, null=True)),
('added', models.DateTimeField(auto_now_add=True)),
('updated', models.DateTimeField(default=None, null=True)),
],
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.0.7 on 2020-06-25 15:21
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(default=None, max_length=32, null=True),
),
]

View file

@ -0,0 +1,38 @@
# Generated by Django 3.0.7 on 2020-06-30 10:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0002_auto_20200625_1521'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='added',
field=models.DateTimeField(auto_now_add=True, db_index=True),
),
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.CharField(db_index=True, default=None, max_length=256, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, default=None, max_length=32, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(db_index=True, default=None, max_length=128, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='updated',
field=models.DateTimeField(db_index=True, default=None, null=True),
),
]

View file

@ -0,0 +1,19 @@
# Generated by Django 3.0.7 on 2020-07-13 15:52
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0003_auto_20200630_1034'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='timestamp',
field=models.CharField(db_index=True, default=None, max_length=32, unique=True),
preserve_default=False,
),
]

View file

@ -0,0 +1,28 @@
# Generated by Django 3.0.7 on 2020-07-28 03:26
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0004_auto_20200713_1552'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='updated',
field=models.DateTimeField(blank=True, db_index=True, null=True),
),
]

94
archivebox/core/models.py Normal file
View file

@ -0,0 +1,94 @@
__package__ = 'archivebox.core'
import uuid
from django.db import models
from django.utils.functional import cached_property
from ..util import parse_date
from ..index.schema import Link
class Snapshot(models.Model):
id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)
url = models.URLField(unique=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)
added = models.DateTimeField(auto_now_add=True, db_index=True)
updated = models.DateTimeField(null=True, blank=True, db_index=True)
# bookmarked = models.DateTimeField()
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
def __repr__(self) -> str:
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
def __str__(self) -> str:
title = self.title or '-'
return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'
@classmethod
def from_json(cls, info: dict):
info = {k: v for k, v in info.items() if k in cls.keys}
return cls(**info)
def as_json(self, *args) -> dict:
args = args or self.keys
return {
key: getattr(self, key)
for key in args
}
def as_link(self) -> Link:
return Link.from_json(self.as_json())
@cached_property
def bookmarked(self):
return parse_date(self.timestamp)
@cached_property
def is_archived(self):
return self.as_link().is_archived
@cached_property
def num_outputs(self):
return self.as_link().num_outputs
@cached_property
def url_hash(self):
return self.as_link().url_hash
@cached_property
def base_url(self):
return self.as_link().base_url
@cached_property
def link_dir(self):
return self.as_link().link_dir
@cached_property
def archive_path(self):
return self.as_link().archive_path
@cached_property
def archive_size(self):
return self.as_link().archive_size
@cached_property
def history(self):
from ..index import load_link_details
return load_link_details(self.as_link()).history
@cached_property
def latest_title(self):
if ('title' in self.history
and self.history['title']
and (self.history['title'][-1].status == 'succeeded')
and self.history['title'][-1].output.strip()):
return self.history['title'][-1].output.strip()
return None

127
archivebox/core/settings.py Normal file
View file

@ -0,0 +1,127 @@
__package__ = 'archivebox.core'
import os
import sys
from django.utils.crypto import get_random_string
from ..config import ( # noqa: F401
DEBUG,
SECRET_KEY,
ALLOWED_HOSTS,
PYTHON_DIR,
ACTIVE_THEME,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
)
ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
IS_SHELL = 'shell' in sys.argv[:3] or 'shell_plus' in sys.argv[:3]
SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
INSTALLED_APPS = [
'django.contrib.auth',
'django.contrib.contenttypes',
'django.contrib.sessions',
'django.contrib.messages',
'django.contrib.staticfiles',
'django.contrib.admin',
'core',
'django_extensions',
]
MIDDLEWARE = [
'django.middleware.security.SecurityMiddleware',
'django.contrib.sessions.middleware.SessionMiddleware',
'django.middleware.common.CommonMiddleware',
'django.middleware.csrf.CsrfViewMiddleware',
'django.contrib.auth.middleware.AuthenticationMiddleware',
'django.contrib.messages.middleware.MessageMiddleware',
]
ROOT_URLCONF = 'core.urls'
APPEND_SLASH = True
TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [
os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME),
os.path.join(PYTHON_DIR, 'themes', 'default'),
os.path.join(PYTHON_DIR, 'themes'),
],
'APP_DIRS': True,
'OPTIONS': {
'context_processors': [
'django.template.context_processors.debug',
'django.template.context_processors.request',
'django.contrib.auth.context_processors.auth',
'django.contrib.messages.context_processors.messages',
],
},
},
]
WSGI_APPLICATION = 'core.wsgi.application'
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME),
}
}
AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.ModelBackend',
]
AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
{'NAME': 'django.contrib.auth.password_validation.CommonPasswordValidator'},
{'NAME': 'django.contrib.auth.password_validation.NumericPasswordValidator'},
]
################################################################################
### Security Settings
################################################################################
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
SESSION_COOKIE_SECURE = False
CSRF_COOKIE_SECURE = False
SESSION_COOKIE_DOMAIN = None
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = True
SESSION_COOKIE_AGE = 1209600 # 2 weeks
LOGIN_URL = '/accounts/login/'
LOGOUT_REDIRECT_URL = '/'
PASSWORD_RESET_URL = '/accounts/password_reset/'
SHELL_PLUS = 'ipython'
SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py')
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = False
USE_L10N = False
USE_TZ = False
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
STATIC_URL = '/static/'
STATICFILES_DIRS = [
os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'),
os.path.join(PYTHON_DIR, 'themes', 'default', 'static'),
]

3
archivebox/core/tests.py Normal file
View file

@ -0,0 +1,3 @@
#from django.test import TestCase
# Create your tests here.

34
archivebox/core/urls.py Normal file
View file

@ -0,0 +1,34 @@
from django.contrib import admin
from django.urls import path, include
from django.views import static
from django.conf import settings
from django.views.generic.base import RedirectView
from core.views import MainIndex, OldIndex, LinkDetails
# print('DEBUG', settings.DEBUG)
urlpatterns = [
path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'),
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
path('add/', RedirectView.as_view(url='/admin/core/snapshot/add/')),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('old.html', OldIndex.as_view(), name='OldHome'),
path('index.html', RedirectView.as_view(url='/')),
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
path('', MainIndex.as_view(), name='Home'),
]

104
archivebox/core/views.py Normal file
View file

@ -0,0 +1,104 @@
__package__ = 'archivebox.core'
from django.shortcuts import render, redirect
from django.http import HttpResponse
from django.views import View, static
from core.models import Snapshot
from ..index import load_main_index, load_main_index_meta
from ..config import (
OUTPUT_DIR,
VERSION,
FOOTER_INFO,
PUBLIC_INDEX,
PUBLIC_SNAPSHOTS,
)
from ..util import base_url
class MainIndex(View):
template = 'main_index.html'
def get(self, request):
if request.user.is_authenticated:
return redirect('/admin/core/snapshot/')
if PUBLIC_INDEX:
return redirect('OldHome')
return redirect(f'/admin/login/?next={request.path}')
class OldIndex(View):
template = 'main_index.html'
def get(self, request):
if PUBLIC_INDEX or request.user.is_authenticated:
all_links = load_main_index(out_dir=OUTPUT_DIR)
meta_info = load_main_index_meta(out_dir=OUTPUT_DIR)
context = {
'updated': meta_info['updated'],
'num_links': meta_info['num_links'],
'links': all_links,
'VERSION': VERSION,
'FOOTER_INFO': FOOTER_INFO,
}
return render(template_name=self.template, request=request, context=context)
return redirect(f'/admin/login/?next={request.path}')
class LinkDetails(View):
def get(self, request, path):
# missing trailing slash -> redirect to index
if '/' not in path:
return redirect(f'{path}/index.html')
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
try:
slug, archivefile = path.split('/', 1)
except (IndexError, ValueError):
slug, archivefile = path.split('/', 1)[0], 'index.html'
all_pages = list(Snapshot.objects.all())
# slug is a timestamp
by_ts = {page.timestamp: page for page in all_pages}
try:
# print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
return response
except KeyError:
pass
# slug is a hash
by_hash = {page.url_hash: page for page in all_pages}
try:
timestamp = by_hash[slug].timestamp
return redirect(f'/archive/{timestamp}/{archivefile}')
except KeyError:
pass
# slug is a URL
by_url = {page.base_url: page for page in all_pages}
try:
# TODO: add multiple snapshot support by showing index of all snapshots
# for given url instead of redirecting to timestamp index
timestamp = by_url[base_url(path)].timestamp
return redirect(f'/archive/{timestamp}/index.html')
except KeyError:
pass
return HttpResponse(
'No archived link matches the given timestamp or hash.',
content_type="text/plain",
status=404,
)

View file

@ -0,0 +1,5 @@
from archivebox.logging_util import log_shell_welcome_msg
if __name__ == '__main__':
log_shell_welcome_msg()

16
archivebox/core/wsgi.py Normal file
View file

@ -0,0 +1,16 @@
"""
WSGI config for archivebox project.
It exposes the WSGI callable as a module-level variable named ``application``.
For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
application = get_wsgi_application()

View file

@ -0,0 +1,143 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional, List, Iterable
from datetime import datetime
from ..index.schema import Link
from ..index import (
load_link_details,
write_link_details,
patch_main_index,
)
from ..util import enforce_types
from ..logging_util import (
log_archiving_started,
log_archiving_paused,
log_archiving_finished,
log_link_archiving_started,
log_link_archiving_finished,
log_archive_method_started,
log_archive_method_finished,
)
from .title import should_save_title, save_title
from .favicon import should_save_favicon, save_favicon
from .wget import should_save_wget, save_wget
from .pdf import should_save_pdf, save_pdf
from .screenshot import should_save_screenshot, save_screenshot
from .dom import should_save_dom, save_dom
from .git import should_save_git, save_git
from .media import should_save_media, save_media
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> Link:
"""download the DOM, PDF, and a screenshot into a folder named after the link's timestamp"""
ARCHIVE_METHODS = [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('wget', should_save_wget, save_wget),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
if methods is not None:
ARCHIVE_METHODS = [
method for method in ARCHIVE_METHODS
if method[1] in methods
]
out_dir = out_dir or link.link_dir
try:
is_new = not os.path.exists(out_dir)
if is_new:
os.makedirs(out_dir)
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=link.link_dir)
log_link_archiving_started(link, out_dir, is_new)
link = link.overwrite(updated=datetime.now())
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
for method_name, should_run, method_function in ARCHIVE_METHODS:
try:
if method_name not in link.history:
link.history[method_name] = []
if should_run(link, out_dir) or overwrite:
log_archive_method_started(method_name)
result = method_function(link=link, out_dir=out_dir)
link.history[method_name].append(result)
stats[result.status] += 1
log_archive_method_finished(result)
else:
stats['skipped'] += 1
except Exception as e:
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
# print(' ', stats)
try:
latest_title = link.history['title'][-1].output.strip()
if latest_title and len(latest_title) >= len(link.title or ''):
link = link.overwrite(title=latest_title)
except Exception:
pass
write_link_details(link, out_dir=link.link_dir)
patch_main_index(link)
# # If any changes were made, update the main links index json and html
# was_changed = stats['succeeded'] or stats['failed']
# if was_changed:
# patch_main_index(link)
log_link_archiving_finished(link, link.link_dir, is_new, stats)
except KeyboardInterrupt:
try:
write_link_details(link, out_dir=link.link_dir)
except:
pass
raise
except Exception as err:
print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
raise
return link
@enforce_types
def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
if not links:
return []
log_archiving_started(len(links))
idx: int = 0
link: Link = links[0]
try:
for idx, link in enumerate(links):
archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
except KeyboardInterrupt:
log_archiving_paused(len(links), idx, link.timestamp)
raise SystemExit(0)
except BaseException:
print()
raise
log_archiving_finished(len(links))
return links

View file

@ -0,0 +1,113 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional, List, Dict, Tuple
from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
)
from ..config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
SAVE_ARCHIVE_DOT_ORG,
CURL_BINARY,
CURL_VERSION,
CURL_USER_AGENT,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'archive.org.txt')):
# if open(path, 'r').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""submit site to archive.org for archiving via their service, save returned archive url"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'archive.org.txt'
archive_org_url = None
submit_url = 'https://web.archive.org/save/{}'.format(link.url)
cmd = [
CURL_BINARY,
'--silent',
'--location',
'--head',
'--compressed',
'--max-time', str(timeout),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
submit_url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout)
content_location, errors = parse_archive_dot_org_response(result.stdout)
if content_location:
archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
archive_org_url = None
# raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
elif errors:
raise ArchiveError(', '.join(errors))
else:
raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
if output and not isinstance(output, Exception):
# instead of writing None when archive.org rejects the url write the
# url to resubmit it to archive.org. This is so when the user visits
# the URL in person, it will attempt to re-archive it, and it'll show the
# nicer error message explaining why the url was rejected if it fails.
archive_org_url = archive_org_url or submit_url
with open(os.path.join(out_dir, str(output)), 'w', encoding='utf-8') as f:
f.write(archive_org_url)
chmod_file('archive.org.txt', cwd=out_dir)
output = archive_org_url
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
# Parse archive.org response headers
headers: Dict[str, List[str]] = defaultdict(list)
# lowercase all the header names and store in dict
for header in response.splitlines():
if b':' not in header or not header.strip():
continue
name, val = header.decode().split(':', 1)
headers[name.lower().strip()].append(val.strip())
# Get successful archive url in "content-location" header or any errors
content_location = headers.get('content-location', headers['location'])
errors = headers['x-archive-wayback-runtime-error']
return content_location, errors

View file

@ -0,0 +1,70 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file, atomic_write
from ..util import (
enforce_types,
is_static_file,
chrome_args,
)
from ..config import (
TIMEOUT,
SAVE_DOM,
CHROME_VERSION,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.html')):
return False
return SAVE_DOM
@enforce_types
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print HTML of site to file using chrome --dump-html"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.html'
output_path = os.path.join(out_dir, str(output))
cmd = [
*chrome_args(TIMEOUT=timeout),
'--dump-dom',
link.url
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout)
atomic_write(output_path, result.stdout)
if result.returncode:
hints = result.stderr.decode()
raise ArchiveError('Failed to save DOM', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,65 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from ..system import chmod_file, run
from ..util import enforce_types, domain
from ..config import (
TIMEOUT,
SAVE_FAVICON,
CURL_BINARY,
CURL_VERSION,
CHECK_SSL_VALIDITY,
CURL_USER_AGENT,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
return False
return SAVE_FAVICON
@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download site favicon from google's favicon api"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'favicon.ico'
cmd = [
CURL_BINARY,
'--silent',
'--max-time', str(timeout),
'--location',
'--compressed',
'--output', str(output),
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url)),
]
status = 'pending'
timer = TimedProgress(timeout, prefix=' ')
try:
run(cmd, cwd=out_dir, timeout=timeout)
chmod_file(output, cwd=out_dir)
status = 'succeeded'
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,89 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
domain,
extension,
without_query,
without_fragment,
)
from ..config import (
TIMEOUT,
SAVE_GIT,
GIT_BINARY,
GIT_VERSION,
GIT_DOMAINS,
CHECK_SSL_VALIDITY
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'git')):
return False
is_clonable_url = (
(domain(link.url) in GIT_DOMAINS)
or (extension(link.url) == 'git')
)
if not is_clonable_url:
return False
return SAVE_GIT
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using git"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'git'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
GIT_BINARY,
'clone',
'--recursive',
*([] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
without_query(without_fragment(link.url)),
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=output_path, timeout=timeout + 1)
if result.returncode == 128:
# ignore failed re-download when the folder already exists
pass
elif result.returncode > 0:
hints = 'Got git response code: {}.'.format(result.returncode)
raise ArchiveError('Failed to save git clone', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=GIT_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,98 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
)
from ..config import (
MEDIA_TIMEOUT,
SAVE_MEDIA,
SAVE_PLAYLISTS,
YOUTUBEDL_BINARY,
YOUTUBEDL_VERSION,
CHECK_SSL_VALIDITY
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'media')):
return False
return SAVE_MEDIA
@enforce_types
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'media'
output_path = os.path.join(out_dir, str(output))
os.makedirs(output_path, exist_ok=True)
cmd = [
YOUTUBEDL_BINARY,
'--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
'--no-call-home',
'--no-check-certificate',
'--user-agent',
'--all-subs',
'--extract-audio',
'--keep-video',
'--ignore-errors',
'--geo-bypass',
'--audio-format', 'mp3',
'--audio-quality', '320K',
'--embed-thumbnail',
'--add-metadata',
*(['--yes-playlist'] if SAVE_PLAYLISTS else []),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=output_path, timeout=timeout + 1)
chmod_file(output, cwd=out_dir)
if result.returncode:
if (b'ERROR: Unsupported URL' in result.stderr
or b'HTTP Error 404' in result.stderr
or b'HTTP Error 403' in result.stderr
or b'URL could be a direct video link' in result.stderr
or b'Unable to extract container ID' in result.stderr):
# These happen too frequently on non-media pages to warrant printing to console
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=YOUTUBEDL_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,69 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
chrome_args,
)
from ..config import (
TIMEOUT,
SAVE_PDF,
CHROME_VERSION,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'output.pdf')):
return False
return SAVE_PDF
@enforce_types
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""print PDF of site to file using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'output.pdf'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--print-to-pdf',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save PDF', hints)
chmod_file('output.pdf', cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,68 @@
__package__ = 'archivebox.extractors'
import os
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
chrome_args,
)
from ..config import (
TIMEOUT,
SAVE_SCREENSHOT,
CHROME_VERSION,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
if is_static_file(link.url):
return False
if os.path.exists(os.path.join(out_dir, 'screenshot.png')):
return False
return SAVE_SCREENSHOT
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""take screenshot of site using chrome --headless"""
out_dir = out_dir or link.link_dir
output: ArchiveOutput = 'screenshot.png'
cmd = [
*chrome_args(TIMEOUT=timeout),
'--screenshot',
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout)
if result.returncode:
hints = (result.stderr or result.stdout).decode()
raise ArchiveError('Failed to save screenshot', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CHROME_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,85 @@
__package__ = 'archivebox.extractors'
import re
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
enforce_types,
is_static_file,
download_url,
htmldecode,
)
from ..config import (
TIMEOUT,
CHECK_SSL_VALIDITY,
SAVE_TITLE,
CURL_BINARY,
CURL_VERSION,
CURL_USER_AGENT,
setup_django,
)
from ..logging_util import TimedProgress
HTML_TITLE_REGEX = re.compile(
r'<title.*?>' # start matching text after <title> tag
r'(.[^<>]+)', # get everything up to these symbols
re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
# if link already has valid title, skip it
if link.title and not link.title.lower().startswith('http'):
return False
if is_static_file(link.url):
return False
return SAVE_TITLE
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""try to guess the page's title from its content"""
setup_django(out_dir=out_dir)
from core.models import Snapshot
output: ArchiveOutput = None
cmd = [
CURL_BINARY,
'--silent',
'--max-time', str(timeout),
'--location',
'--compressed',
*(['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []),
*([] if CHECK_SSL_VALIDITY else ['--insecure']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
match = re.search(HTML_TITLE_REGEX, html)
output = htmldecode(match.group(1).strip()) if match else None
if output:
if not link.title or len(output) >= len(link.title):
Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)
else:
raise ArchiveError('Unable to detect page title')
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=CURL_VERSION,
output=output,
status=status,
**timer.stats,
)

View file

@ -0,0 +1,195 @@
__package__ = 'archivebox.extractors'
import os
import re
from typing import Optional
from datetime import datetime
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..system import run, chmod_file
from ..util import (
enforce_types,
is_static_file,
without_scheme,
without_fragment,
without_query,
path,
domain,
urldecode,
)
from ..config import (
TIMEOUT,
SAVE_WGET,
SAVE_WARC,
WGET_BINARY,
WGET_VERSION,
RESTRICT_FILE_NAMES,
CHECK_SSL_VALIDITY,
SAVE_WGET_REQUISITES,
WGET_AUTO_COMPRESSION,
WGET_USER_AGENT,
COOKIES_FILE,
)
from ..logging_util import TimedProgress
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
output_path = wget_output_path(link)
out_dir = out_dir or link.link_dir
if output_path and os.path.exists(os.path.join(out_dir, output_path)):
return False
return SAVE_WGET
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
"""download full site using wget"""
out_dir = out_dir or link.link_dir
if SAVE_WARC:
warc_dir = os.path.join(out_dir, 'warc')
os.makedirs(warc_dir, exist_ok=True)
warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))
# WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
output: ArchiveOutput = None
cmd = [
WGET_BINARY,
# '--server-response', # print headers for better error parsing
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
'--timeout={}'.format(timeout),
*(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
*(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
*(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
*(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
*(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
*(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
*([] if SAVE_WARC else ['--timestamping']),
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
link.url,
]
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
result = run(cmd, cwd=out_dir, timeout=timeout)
output = wget_output_path(link)
# parse out number of files downloaded from last line of stderr:
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
if line.strip()
]
files_downloaded = (
int(output_tail[-1].strip().split(' ', 2)[1] or 0)
if 'Downloaded:' in output_tail[-1]
else 0
)
hints = (
'Got wget response code: {}.'.format(result.returncode),
*output_tail,
)
# Check for common failure cases
if (result.returncode > 0 and files_downloaded < 1) or output is None:
if b'403: Forbidden' in result.stderr:
raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
if b'404: Not Found' in result.stderr:
raise ArchiveError('404 Not Found', hints)
if b'ERROR 500: Internal Server Error' in result.stderr:
raise ArchiveError('500 Internal Server Error', hints)
raise ArchiveError('Wget failed or got an error from the server', hints)
chmod_file(output, cwd=out_dir)
except Exception as err:
status = 'failed'
output = err
finally:
timer.end()
return ArchiveResult(
cmd=cmd,
pwd=out_dir,
cmd_version=WGET_VERSION,
output=output,
status=status,
**timer.stats,
)
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
"""calculate the path to the wgetted .html file, since wget may
adjust some paths to be different than the base_url path.
See docs on wget --adjust-extension (-E)
"""
if is_static_file(link.url):
return without_scheme(without_fragment(link.url))
# Wget downloads can save in a number of different ways depending on the url:
# https://example.com
# > example.com/index.html
# https://example.com?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://www.example.com/?v=zzVa_tX1OiI
# > example.com/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc
# > example.com/abc.html
# https://example.com/abc/
# > example.com/abc/index.html
# https://example.com/abc?v=zzVa_tX1OiI.html
# > example.com/abc?v=zzVa_tX1OiI.html
# https://example.com/abc/?v=zzVa_tX1OiI.html
# > example.com/abc/index.html?v=zzVa_tX1OiI.html
# https://example.com/abc/test.html
# > example.com/abc/test.html
# https://example.com/abc/test?v=zzVa_tX1OiI
# > example.com/abc/test?v=zzVa_tX1OiI.html
# https://example.com/abc/test/?v=zzVa_tX1OiI
# > example.com/abc/test/index.html?v=zzVa_tX1OiI.html
# There's also lots of complexity around how the urlencoding and renaming
# is done for pages with query and hash fragments or extensions like shtml / htm / php / etc
# Since the wget algorithm for -E (appending .html) is incredibly complex
# and there's no way to get the computed output path from wget
# in order to avoid having to reverse-engineer how they calculate it,
# we just look in the output folder read the filename wget used from the filesystem
full_path = without_fragment(without_query(path(link.url))).strip('/')
search_dir = os.path.join(
link.link_dir,
domain(link.url).replace(":", "+"),
urldecode(full_path),
)
for _ in range(4):
if os.path.exists(search_dir):
if os.path.isdir(search_dir):
html_files = [
f for f in os.listdir(search_dir)
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", f, re.I | re.M)
]
if html_files:
path_from_link_dir = search_dir.split(link.link_dir)[-1].strip('/')
return os.path.join(path_from_link_dir, html_files[0])
# Move up one directory level
search_dir = search_dir.rsplit('/', 1)[0]
if search_dir == link.link_dir:
break
return None

View file

@ -1,271 +0,0 @@
import os
import json
from datetime import datetime
from string import Template
try:
from distutils.dir_util import copy_tree
except ImportError:
print('[X] Missing "distutils" python package. To install it, run:')
print(' pip install distutils')
from config import (
OUTPUT_DIR,
TEMPLATES_DIR,
GIT_SHA,
FOOTER_INFO,
)
from util import (
chmod_file,
urlencode,
derived_link_info,
check_link_structure,
check_links_structure,
wget_output_path,
latest_output,
)
from parse import parse_links
from links import validate_links
from logs import (
log_indexing_process_started,
log_indexing_started,
log_indexing_finished,
log_parsing_started,
log_parsing_finished,
)
TITLE_LOADING_MSG = 'Not yet archived...'
### Homepage index for all the links
def write_links_index(out_dir, links, finished=False):
"""create index.html file for a given list of links"""
log_indexing_process_started()
check_links_structure(links)
log_indexing_started(out_dir, 'index.json')
write_json_links_index(out_dir, links)
log_indexing_finished(out_dir, 'index.json')
log_indexing_started(out_dir, 'index.html')
write_html_links_index(out_dir, links, finished=finished)
log_indexing_finished(out_dir, 'index.html')
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
"""parse and load existing index with any new links from import_path merged in"""
existing_links = []
if out_dir:
existing_links = parse_json_links_index(out_dir)
check_links_structure(existing_links)
new_links = []
if import_path:
# parse and validate the import file
log_parsing_started(import_path)
raw_links, parser_name = parse_links(import_path)
new_links = validate_links(raw_links)
check_links_structure(new_links)
# merge existing links in out_dir and new links
all_links = validate_links(existing_links + new_links)
check_links_structure(all_links)
num_new_links = len(all_links) - len(existing_links)
if import_path and parser_name:
log_parsing_finished(num_new_links, parser_name)
return all_links, new_links
def write_json_links_index(out_dir, links):
"""write the json link index to a given path"""
check_links_structure(links)
path = os.path.join(out_dir, 'index.json')
index_json = {
'info': 'ArchiveBox Index',
'help': 'https://github.com/pirate/ArchiveBox',
'version': GIT_SHA,
'num_links': len(links),
'updated': str(datetime.now().timestamp()),
'links': links,
}
with open(path, 'w', encoding='utf-8') as f:
json.dump(index_json, f, indent=4, default=str)
chmod_file(path)
def parse_json_links_index(out_dir=OUTPUT_DIR):
"""parse a archive index json file and return the list of links"""
index_path = os.path.join(out_dir, 'index.json')
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = json.load(f)['links']
check_links_structure(links)
return links
return []
def write_html_links_index(out_dir, links, finished=False):
"""write the html link index to a given path"""
check_links_structure(links)
path = os.path.join(out_dir, 'index.html')
copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))
with open(os.path.join(out_dir, 'robots.txt'), 'w+') as f:
f.write('User-agent: *\nDisallow: /')
with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as f:
index_html = f.read()
with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as f:
link_row_html = f.read()
full_links_info = (derived_link_info(link) for link in links)
link_rows = '\n'.join(
Template(link_row_html).substitute(**{
**link,
'title': (
link['title']
or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
),
'favicon_url': (
os.path.join('archive', link['timestamp'], 'favicon.ico')
# if link['is_archived'] else ''
),
'archive_url': urlencode(
wget_output_path(link) or 'index.html'
),
})
for link in full_links_info
)
template_vars = {
'num_links': len(links),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'footer_info': FOOTER_INFO,
'git_sha': GIT_SHA,
'short_git_sha': GIT_SHA[:8],
'rows': link_rows,
'status': 'finished' if finished else 'running',
}
with open(path, 'w', encoding='utf-8') as f:
f.write(Template(index_html).substitute(**template_vars))
chmod_file(path)
def patch_links_index(link, out_dir=OUTPUT_DIR):
"""hack to in-place update one row's info in the generated index html"""
title = link['title'] or latest_output(link)['title']
successful = len(tuple(filter(None, latest_output(link).values())))
# Patch JSON index
changed = False
json_file_links = parse_json_links_index(out_dir)
for saved_link in json_file_links:
if saved_link['url'] == link['url']:
saved_link['title'] = title
saved_link['history'] = link['history']
changed = True
break
if changed:
write_json_links_index(out_dir, json_file_links)
# Patch HTML index
html_path = os.path.join(out_dir, 'index.html')
with open(html_path, 'r') as html_file:
html = html_file.read().splitlines()
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link['url']) in line):
html[idx] = '<span>{}</span>'.format(title)
elif successful and ('<span data-number-for="{}"'.format(link['url']) in line):
html[idx] = '<span>{}</span>'.format(successful)
break
with open(html_path, 'w') as f:
f.write('\n'.join(html))
### Individual link index
def write_link_index(out_dir, link):
link['updated'] = str(datetime.now().timestamp())
write_json_link_index(out_dir, link)
write_html_link_index(out_dir, link)
def write_json_link_index(out_dir, link):
"""write a json file with some info about the link"""
check_link_structure(link)
path = os.path.join(out_dir, 'index.json')
with open(path, 'w', encoding='utf-8') as f:
json.dump(link, f, indent=4, default=str)
chmod_file(path)
def parse_json_link_index(out_dir):
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, 'index.json')
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
link_json = json.load(f)
check_link_structure(link_json)
return link_json
return {}
def load_json_link_index(out_dir, link):
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
link = {
**parse_json_link_index(out_dir),
**link,
}
link.update({
'history': link.get('history') or {},
})
check_link_structure(link)
return link
def write_html_link_index(out_dir, link):
check_link_structure(link)
with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as f:
link_html = f.read()
path = os.path.join(out_dir, 'index.html')
link = derived_link_info(link)
with open(path, 'w', encoding='utf-8') as f:
f.write(Template(link_html).substitute({
**link,
'title': (
link['title']
or (link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG)
),
'archive_url': urlencode(
wget_output_path(link)
or (link['domain'] if link['is_archived'] else 'about:blank')
),
'extension': link['extension'] or 'html',
'tags': link['tags'].strip() or 'untagged',
'status': 'Archived' if link['is_archived'] else 'Not yet archived',
'status_color': 'success' if link['is_archived'] else 'danger',
}))
chmod_file(path)

View file

@ -0,0 +1,615 @@
__package__ = 'archivebox.index'
import re
import os
import shutil
import json as pyjson
from itertools import chain
from typing import List, Tuple, Dict, Optional, Iterable
from collections import OrderedDict
from contextlib import contextmanager
from ..system import atomic_write
from ..util import (
scheme,
enforce_types,
ExtendedEncoder,
)
from ..config import (
ARCHIVE_DIR_NAME,
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
OUTPUT_DIR,
TIMEOUT,
URL_BLACKLIST_PTN,
ANSI,
stderr,
OUTPUT_PERMISSIONS
)
from ..logging_util import (
TimedProgress,
log_indexing_process_started,
log_indexing_process_finished,
log_indexing_started,
log_indexing_finished,
log_parsing_finished,
log_deduping_finished,
)
from .schema import Link, ArchiveResult
from .html import (
write_html_main_index,
write_html_link_details,
)
from .json import (
parse_json_main_index,
write_json_main_index,
parse_json_link_details,
write_json_link_details,
)
from .sql import (
write_sql_main_index,
parse_sql_main_index,
write_sql_link_details,
)
### Link filtering and checking
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
"""deterministially merge two links, favoring longer field values over shorter,
and "cleaner" values over worse ones.
"""
assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'
# longest url wins (because a fuzzy url will always be shorter)
url = a.url if len(a.url) > len(b.url) else b.url
# best title based on length and quality
possible_titles = [
title
for title in (a.title, b.title)
if title and title.strip() and '://' not in title
]
title = None
if len(possible_titles) == 2:
title = max(possible_titles, key=lambda t: len(t))
elif len(possible_titles) == 1:
title = possible_titles[0]
# earliest valid timestamp
timestamp = (
a.timestamp
if float(a.timestamp or 0) < float(b.timestamp or 0) else
b.timestamp
)
# all unique, truthy tags
tags_set = (
set(tag.strip() for tag in (a.tags or '').split(','))
| set(tag.strip() for tag in (b.tags or '').split(','))
)
tags = ','.join(tags_set) or None
# all unique source entries
sources = list(set(a.sources + b.sources))
# all unique history entries for the combined archive methods
all_methods = set(list(a.history.keys()) + list(a.history.keys()))
history = {
method: (a.history.get(method) or []) + (b.history.get(method) or [])
for method in all_methods
}
for method in all_methods:
deduped_jsons = {
pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
for result in history[method]
}
history[method] = list(reversed(sorted(
(ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
key=lambda result: result.start_ts,
)))
return Link(
url=url,
timestamp=timestamp,
title=title,
tags=tags,
sources=sources,
history=history,
)
@enforce_types
def validate_links(links: Iterable[Link]) -> List[Link]:
timer = TimedProgress(TIMEOUT * 4)
try:
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = sorted_links(links) # deterministically sort the links based on timstamp, url
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
finally:
timer.end()
return list(links)
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
scheme_is_valid = scheme(link.url) in ('http', 'https', 'ftp')
not_blacklisted = (not URL_BLACKLIST_PTN.match(link.url)) if URL_BLACKLIST_PTN else True
if scheme_is_valid and not_blacklisted:
yield link
@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
"""
ensures that all non-duplicate links have monotonically increasing timestamps
"""
unique_urls: OrderedDict[str, Link] = OrderedDict()
for link in sorted_links:
if link.base_url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[link.base_url], link)
unique_urls[link.base_url] = link
unique_timestamps: OrderedDict[str, Link] = OrderedDict()
for link in unique_urls.values():
new_link = link.overwrite(
timestamp=lowest_uniq_timestamp(unique_timestamps, link.timestamp),
)
unique_timestamps[new_link.timestamp] = new_link
return unique_timestamps.values()
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
sort_func = lambda link: (link.timestamp.split('.', 1)[0], link.url)
return sorted(links, key=sort_func, reverse=True)
@enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
if not resume:
yield from links
return
for link in links:
try:
if float(link.timestamp) <= resume:
yield link
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
timestamp = timestamp.split('.')[0]
nonce = 0
# first try 152323423 before 152323423.0
if timestamp not in used_timestamps:
return timestamp
new_timestamp = '{}.{}'.format(timestamp, nonce)
while new_timestamp in used_timestamps:
nonce += 1
new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp
### Main Links Index
@contextmanager
@enforce_types
def timed_index_update(out_path: str):
log_indexing_started(out_path)
timer = TimedProgress(TIMEOUT * 2, prefix=' ')
try:
yield
finally:
timer.end()
assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
log_indexing_finished(out_path)
@enforce_types
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""create index.html file for a given list of links"""
log_indexing_process_started(len(links))
with timed_index_update(os.path.join(out_dir, SQL_INDEX_FILENAME)):
write_sql_main_index(links, out_dir=out_dir)
os.chmod(os.path.join(out_dir, SQL_INDEX_FILENAME), int(OUTPUT_PERMISSIONS, base=8)) # set here because we don't write it with atomic writes
with timed_index_update(os.path.join(out_dir, JSON_INDEX_FILENAME)):
write_json_main_index(links, out_dir=out_dir)
with timed_index_update(os.path.join(out_dir, HTML_INDEX_FILENAME)):
write_html_main_index(links, out_dir=out_dir, finished=finished)
log_indexing_process_finished()
@enforce_types
def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
"""parse and load existing index with any new links from import_path merged in"""
all_links: List[Link] = []
all_links = list(parse_json_main_index(out_dir))
links_from_sql = list(parse_sql_main_index(out_dir))
if warn and not set(l.url for l in all_links) == set(l.url for l in links_from_sql):
stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
stderr(' To repair the index and re-import any orphaned links run:')
stderr(' archivebox init')
return all_links
@enforce_types
def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
meta_dict = pyjson.load(f)
meta_dict.pop('links')
return meta_dict
return None
@enforce_types
def parse_links_from_source(source_path: str) -> Tuple[List[Link], List[Link]]:
from ..parsers import parse_links
new_links: List[Link] = []
# parse and validate the import file
raw_links, parser_name = parse_links(source_path)
new_links = validate_links(raw_links)
if parser_name:
num_parsed = len(raw_links)
log_parsing_finished(num_parsed, parser_name)
return new_links
@enforce_types
def dedupe_links(existing_links: List[Link],
new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
# merge existing links in out_dir and new links
all_links = validate_links(existing_links + new_links)
all_link_urls = {link.url for link in existing_links}
new_links = [
link for link in new_links
if link.url not in all_link_urls
]
all_links_deduped = {link.url: link for link in all_links}
for i in range(len(new_links)):
if new_links[i].url in all_links_deduped.keys():
new_links[i] = all_links_deduped[new_links[i].url]
log_deduping_finished(len(new_links))
return all_links, new_links
@enforce_types
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
"""hack to in-place update one row's info in the generated index files"""
# TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous
title = link.title or link.latest_outputs(status='succeeded')['title']
successful = link.num_outputs
# Patch JSON main index
json_file_links = parse_json_main_index(out_dir)
patched_links = []
for saved_link in json_file_links:
if saved_link.url == link.url:
patched_links.append(saved_link.overwrite(
title=title,
history=link.history,
updated=link.updated,
))
else:
patched_links.append(saved_link)
write_json_main_index(patched_links, out_dir=out_dir)
# Patch HTML main index
html_path = os.path.join(out_dir, 'index.html')
with open(html_path, 'r') as f:
html = f.read().splitlines()
for idx, line in enumerate(html):
if title and ('<span data-title-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(title)
elif successful and ('<span data-number-for="{}"'.format(link.url) in line):
html[idx] = '<span>{}</span>'.format(successful)
break
atomic_write(html_path, '\n'.join(html))
### Link Details Index
@enforce_types
def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
write_json_link_details(link, out_dir=out_dir)
write_html_link_details(link, out_dir=out_dir)
write_sql_link_details(link)
@enforce_types
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
"""check for an existing link archive in the given directory,
and load+merge it into the given link dict
"""
out_dir = out_dir or link.link_dir
existing_link = parse_json_link_details(out_dir)
if existing_link:
return merge_links(existing_link, link)
return link
LINK_FILTERS = {
'exact': lambda link, pattern: (link.url == pattern) or (link.base_url == pattern),
'substring': lambda link, pattern: pattern in link.url,
'regex': lambda link, pattern: bool(re.match(pattern, link.url)),
'domain': lambda link, pattern: link.domain == pattern,
}
@enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
for pattern in filter_patterns:
try:
if LINK_FILTERS[filter_type](link, pattern):
return True
except Exception:
stderr()
stderr(
f'[X] Got invalid pattern for --filter-type={filter_type}:',
color='red',
)
stderr(f' {pattern}')
raise SystemExit(2)
return False
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
return {
link.link_dir: link
for link in links
}
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
return {
link.link_dir: link
for link in filter(is_archived, links)
}
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
return {
link.link_dir: link
for link in filter(is_unarchived, links)
}
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that actually exist in the archive/ folder"""
all_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
all_folders[entry.path] = link
return all_folders
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs with a valid index matched to the main index and archived content"""
return {
link.link_dir: link
for link in filter(is_valid, links)
}
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized"""
duplicate = get_duplicate_folders(links, out_dir=OUTPUT_DIR)
orphaned = get_orphaned_folders(links, out_dir=OUTPUT_DIR)
corrupted = get_corrupted_folders(links, out_dir=OUTPUT_DIR)
unrecognized = get_unrecognized_folders(links, out_dir=OUTPUT_DIR)
return {**duplicate, **orphaned, **corrupted, **unrecognized}
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that conflict with other directories that have the same link URL or timestamp"""
links = list(links)
by_url = {link.url: 0 for link in links}
by_timestamp = {link.timestamp: 0 for link in links}
duplicate_folders = {}
indexed_folders = {link.link_dir for link in links}
data_folders = (
entry.path
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME))
if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_folders
)
for path in chain(sorted(indexed_folders), sorted(data_folders)):
link = None
try:
link = parse_json_link_details(path)
except Exception:
pass
if link:
# link folder has same timestamp as different link folder
by_timestamp[link.timestamp] = by_timestamp.get(link.timestamp, 0) + 1
if by_timestamp[link.timestamp] > 1:
duplicate_folders[path] = link
# link folder has same url as different link folder
by_url[link.url] = by_url.get(link.url, 0) + 1
if by_url[link.url] > 1:
duplicate_folders[path] = link
return duplicate_folders
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that contain a valid index but aren't listed in the main index"""
links = list(links)
indexed_folders = {link.link_dir: link for link in links}
orphaned_folders = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
link = None
try:
link = parse_json_link_details(entry.path)
except Exception:
pass
if link and entry.path not in indexed_folders:
# folder is a valid link data dir with index details, but it's not in the main index
orphaned_folders[entry.path] = link
return orphaned_folders
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain a valid index and aren't listed in the main index"""
return {
link.link_dir: link
for link in filter(is_corrupt, links)
}
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""dirs that don't contain recognizable archive data and aren't listed in the main index"""
by_timestamp = {link.timestamp: 0 for link in links}
unrecognized_folders: Dict[str, Optional[Link]] = {}
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
index_exists = os.path.exists(os.path.join(entry.path, 'index.json'))
link = None
try:
link = parse_json_link_details(entry.path)
except KeyError:
# Try to fix index
if index_exists:
try:
# Last attempt to repair the detail index
link_guessed = parse_json_link_details(entry.path, guess=True)
write_json_link_details(link_guessed, out_dir=entry.path)
link = parse_json_link_details(entry.path)
except Exception:
pass
if index_exists and link is None:
# index exists but it's corrupted or unparseable
unrecognized_folders[entry.path] = link
elif not index_exists:
# link details index doesn't exist and the folder isn't in the main index
timestamp = entry.path.rsplit('/', 1)[-1]
if timestamp not in by_timestamp:
unrecognized_folders[entry.path] = link
return unrecognized_folders
def is_valid(link: Link) -> bool:
dir_exists = os.path.exists(link.link_dir)
index_exists = os.path.exists(os.path.join(link.link_dir, 'index.json'))
if not dir_exists:
# unarchived links are not included in the valid list
return False
if dir_exists and not index_exists:
return False
if dir_exists and index_exists:
try:
parsed_link = parse_json_link_details(link.link_dir, guess=True)
return link.url == parsed_link.url
except Exception:
pass
return False
def is_corrupt(link: Link) -> bool:
if not os.path.exists(link.link_dir):
# unarchived links are not considered corrupt
return False
if is_valid(link):
return False
return True
def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_unarchived(link: Link) -> bool:
if not os.path.exists(link.link_dir):
return True
return not link.is_archived
def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
fixed = []
cant_fix = []
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
try:
link = parse_json_link_details(entry.path)
except KeyError:
link = None
if not link:
continue
if not entry.path.endswith(f'/{link.timestamp}'):
dest = os.path.join(out_dir, ARCHIVE_DIR_NAME, link.timestamp)
if os.path.exists(dest):
cant_fix.append(entry.path)
else:
shutil.move(entry.path, dest)
fixed.append(dest)
timestamp = entry.path.rsplit('/', 1)[-1]
assert link.link_dir == entry.path
assert link.timestamp == timestamp
write_json_link_details(link, out_dir=entry.path)
return fixed, cant_fix

37
archivebox/index/csv.py Normal file
View file

@ -0,0 +1,37 @@
__package__ = 'archivebox.index'
from typing import List, Optional, Any
from ..util import enforce_types
from .schema import Link
@enforce_types
def links_to_csv(links: List[Link],
cols: Optional[List[str]]=None,
header: bool=True,
separator: str=',',
ljust: int=0) -> str:
cols = cols or ['timestamp', 'is_archived', 'url']
header_str = ''
if header:
header_str = separator.join(col.ljust(ljust) for col in cols)
row_strs = (
link.to_csv(cols=cols, ljust=ljust, separator=separator)
for link in links
)
return '\n'.join((header_str, *row_strs))
@enforce_types
def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
from .json import to_json
return separator.join(
to_json(getattr(obj, col), indent=None).ljust(ljust)
for col in cols
)

156
archivebox/index/html.py Normal file
View file

@ -0,0 +1,156 @@
__package__ = 'archivebox.index'
import os
from string import Template
from datetime import datetime
from typing import List, Optional, Iterator, Mapping
from .schema import Link
from ..system import atomic_write, copy_and_overwrite
from ..util import (
enforce_types,
ts_to_date,
urlencode,
htmlencode,
urldecode,
)
from ..config import (
OUTPUT_DIR,
TEMPLATES_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
ARCHIVE_DIR_NAME,
HTML_INDEX_FILENAME,
STATIC_DIR_NAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
)
join = lambda *paths: os.path.join(*paths)
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
TITLE_LOADING_MSG = 'Not yet archived...'
### Main Links Index
@enforce_types
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls"""
index_path = join(out_dir, HTML_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
for line in f:
if 'class="link-url"' in line:
yield line.split('"')[1]
return ()
@enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path"""
copy_and_overwrite(join(TEMPLATES_DIR, FAVICON_FILENAME), join(out_dir, FAVICON_FILENAME))
copy_and_overwrite(join(TEMPLATES_DIR, ROBOTS_TXT_FILENAME), join(out_dir, ROBOTS_TXT_FILENAME))
copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME))
rendered_html = main_index_template(links, finished=finished)
atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types
def main_index_template(links: List[Link], finished: bool=True) -> str:
"""render the template for the entire main index"""
return render_legacy_template(MAIN_INDEX_TEMPLATE, {
'version': VERSION,
'git_sha': GIT_SHA,
'num_links': str(len(links)),
'status': 'finished' if finished else 'running',
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
'rows': '\n'.join(
main_index_row_template(link)
for link in links
),
'footer_info': FOOTER_INFO,
})
@enforce_types
def main_index_row_template(link: Link) -> str:
"""render the template for an individual link row of the main index"""
from ..extractors.wget import wget_output_path
return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, {
**link._asdict(extended=True),
# before pages are finished archiving, show loading msg instead of title
'title': htmlencode(
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
# before pages are finished archiving, show fallback loading favicon
'favicon_url': (
join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')
# if link['is_archived'] else ''
),
# before pages are finished archiving, show the details page instead
'wget_url': urlencode(wget_output_path(link) or 'index.html'),
# replace commas in tags with spaces, or file extension if it's static
'tags': (link.tags or '') + (' {}'.format(link.extension) if link.is_static else ''),
})
### Link Details Index
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
@enforce_types
def link_details_template(link: Link) -> str:
from ..extractors.wget import wget_output_path
link_info = link._asdict(extended=True)
return render_legacy_template(LINK_DETAILS_TEMPLATE, {
**link_info,
**link_info['canonical'],
'title': htmlencode(
link.title
or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
),
'url_str': htmlencode(urldecode(link.base_url)),
'archive_url': urlencode(
wget_output_path(link)
or (link.domain if link.is_archived else '')
) or 'about:blank',
'extension': link.extension or 'html',
'tags': link.tags or 'untagged',
'status': 'archived' if link.is_archived else 'not yet archived',
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date(link.oldest_archive_date),
})
@enforce_types
def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
"""render a given html template string with the given template content"""
# will be replaced by django templates in the future
with open(template_path, 'r', encoding='utf-8') as template:
template_str = template.read()
return Template(template_str).substitute(**context)

165
archivebox/index/json.py Normal file
View file

@ -0,0 +1,165 @@
__package__ = 'archivebox.index'
import os
import sys
import json as pyjson
from pathlib import Path
from datetime import datetime
from typing import List, Optional, Iterator, Any
from .schema import Link, ArchiveResult
from ..system import atomic_write
from ..util import enforce_types
from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
GIT_SHA,
DEPENDENCIES,
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
ANSI
)
MAIN_INDEX_HEADER = {
'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
'schema': 'archivebox.index.json',
'copyright_info': FOOTER_INFO,
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': GIT_SHA,
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
'source': 'https://github.com/pirate/ArchiveBox',
'issues': 'https://github.com/pirate/ArchiveBox/issues',
'dependencies': DEPENDENCIES,
},
}
### Main Links Index
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
"""parse an archive index json file and return the list of links"""
index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(index_path):
with open(index_path, 'r', encoding='utf-8') as f:
links = pyjson.load(f)['links']
for link_json in links:
try:
yield Link.from_json(link_json)
except KeyError:
try:
detail_index_path = Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / link_json['timestamp']
yield parse_json_link_details(str(detail_index_path))
except KeyError:
# as a last effort, try to guess the missing values out of existing ones
try:
yield Link.from_json(link_json, guess=True)
except KeyError:
print(" {lightyellow}! Failed to load the index.json from {}".format(detail_index_path, **ANSI))
continue
return ()
@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
"""write the json link index to a given path"""
assert isinstance(links, List), 'Links must be a list, not a generator.'
assert not links or isinstance(links[0].history, dict)
assert not links or isinstance(links[0].sources, list)
if links and links[0].history.get('title'):
assert isinstance(links[0].history['title'][0], ArchiveResult)
if links and links[0].sources:
assert isinstance(links[0].sources[0], str)
main_index_json = {
**MAIN_INDEX_HEADER,
'num_links': len(links),
'updated': datetime.now(),
'last_run_cmd': sys.argv,
'links': links,
}
atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
### Link Details Index
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
out_dir = out_dir or link.link_dir
path = os.path.join(out_dir, JSON_INDEX_FILENAME)
atomic_write(path, link._asdict(extended=True))
@enforce_types
def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
"""load the json link index from a given directory"""
existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
if os.path.exists(existing_index):
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = pyjson.load(f)
return Link.from_json(link_json, guess)
except pyjson.JSONDecodeError:
pass
return None
@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
if entry.is_dir(follow_symlinks=True):
if os.path.exists(os.path.join(entry.path, 'index.json')):
try:
link = parse_json_link_details(entry.path)
except KeyError:
link = None
if link:
yield link
### Helpers
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
fields and objects
"""
def default(self, obj):
cls_name = obj.__class__.__name__
if hasattr(obj, '_asdict'):
return obj._asdict()
elif isinstance(obj, bytes):
return obj.decode()
elif isinstance(obj, datetime):
return obj.isoformat()
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)

431
archivebox/index/schema.py Normal file
View file

@ -0,0 +1,431 @@
__package__ = 'archivebox.index'
import os
from pathlib import Path
from datetime import datetime, timedelta
from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields
from ..system import get_dir_size
from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME
class ArchiveError(Exception):
def __init__(self, message, hints=None):
super().__init__(message)
self.hints = hints
LinkDict = Dict[str, Any]
ArchiveOutput = Union[str, Exception, None]
@dataclass(frozen=True)
class ArchiveResult:
cmd: List[str]
pwd: Optional[str]
cmd_version: Optional[str]
output: ArchiveOutput
status: str
start_ts: datetime
end_ts: datetime
schema: str = 'ArchiveResult'
def __post_init__(self):
self.typecheck()
def _asdict(self):
return asdict(self)
def typecheck(self) -> None:
assert self.schema == self.__class__.__name__
assert isinstance(self.status, str) and self.status
assert isinstance(self.start_ts, datetime)
assert isinstance(self.end_ts, datetime)
assert isinstance(self.cmd, list)
assert all(isinstance(arg, str) and arg for arg in self.cmd)
assert self.pwd is None or isinstance(self.pwd, str) and self.pwd
assert self.cmd_version is None or isinstance(self.cmd_version, str) and self.cmd_version
assert self.output is None or isinstance(self.output, (str, Exception))
if isinstance(self.output, str):
assert self.output
@classmethod
def guess_ts(_cls, dict_info):
from ..util import parse_date
parsed_timestamp = parse_date(dict_info["timestamp"])
start_ts = parsed_timestamp
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
return start_ts, end_ts
@classmethod
def from_json(cls, json_info, guess=False):
from ..util import parse_date
info = {
key: val
for key, val in json_info.items()
if key in cls.field_names()
}
if guess:
keys = info.keys()
if "start_ts" not in keys:
info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
else:
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
if "pwd" not in keys:
info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"])
if "cmd_version" not in keys:
info["cmd_version"] = "Undefined"
if "cmd" not in keys:
info["cmd"] = []
else:
info['start_ts'] = parse_date(info['start_ts'])
info['end_ts'] = parse_date(info['end_ts'])
info['cmd_version'] = info.get('cmd_version')
if type(info["cmd"]) is str:
info["cmd"] = [info["cmd"]]
return cls(**info)
def to_dict(self, *keys) -> dict:
if keys:
return {k: v for k, v in asdict(self).items() if k in keys}
return asdict(self)
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
return to_csv(self, csv_col=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
@property
def duration(self) -> int:
return (self.end_ts - self.start_ts).seconds
@dataclass(frozen=True)
class Link:
timestamp: str
url: str
title: Optional[str]
tags: Optional[str]
sources: List[str]
history: Dict[str, List[ArchiveResult]] = field(default_factory=lambda: {})
updated: Optional[datetime] = None
schema: str = 'Link'
def __str__(self) -> str:
return f'[{self.timestamp}] {self.base_url} "{self.title}"'
def __post_init__(self):
self.typecheck()
def overwrite(self, **kwargs):
"""pure functional version of dict.update that returns a new instance"""
return Link(**{**self._asdict(), **kwargs})
def __eq__(self, other):
if not isinstance(other, Link):
return NotImplemented
return self.url == other.url
def __gt__(self, other):
if not isinstance(other, Link):
return NotImplemented
if not self.timestamp or not other.timestamp:
return
return float(self.timestamp) > float(other.timestamp)
def typecheck(self) -> None:
from ..config import stderr, ANSI
try:
assert self.schema == self.__class__.__name__
assert isinstance(self.timestamp, str) and self.timestamp
assert self.timestamp.replace('.', '').isdigit()
assert isinstance(self.url, str) and '://' in self.url
assert self.updated is None or isinstance(self.updated, datetime)
assert self.title is None or (isinstance(self.title, str) and self.title)
assert self.tags is None or isinstance(self.tags, str)
assert isinstance(self.sources, list)
assert all(isinstance(source, str) and source for source in self.sources)
assert isinstance(self.history, dict)
for method, results in self.history.items():
assert isinstance(method, str) and method
assert isinstance(results, list)
assert all(isinstance(result, ArchiveResult) for result in results)
except Exception:
stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
raise
def _asdict(self, extended=False):
info = {
'schema': 'Link',
'url': self.url,
'title': self.title or None,
'timestamp': self.timestamp,
'updated': self.updated or None,
'tags': self.tags or None,
'sources': self.sources or [],
'history': self.history or {},
}
if extended:
info.update({
'link_dir': self.link_dir,
'archive_path': self.archive_path,
'hash': self.url_hash,
'base_url': self.base_url,
'scheme': self.scheme,
'domain': self.domain,
'path': self.path,
'basename': self.basename,
'extension': self.extension,
'is_static': self.is_static,
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
'oldest_archive_date': self.oldest_archive_date,
'newest_archive_date': self.newest_archive_date,
'is_archived': self.is_archived,
'num_outputs': self.num_outputs,
'num_failures': self.num_failures,
'latest': self.latest_outputs(),
'canonical': self.canonical_outputs(),
})
return info
@classmethod
def from_json(cls, json_info, guess=False):
from ..util import parse_date
info = {
key: val
for key, val in json_info.items()
if key in cls.field_names()
}
info['updated'] = parse_date(info.get('updated'))
info['sources'] = info.get('sources') or []
json_history = info.get('history') or {}
cast_history = {}
for method, method_history in json_history.items():
cast_history[method] = []
for json_result in method_history:
assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
cast_result = ArchiveResult.from_json(json_result, guess)
cast_history[method].append(cast_result)
info['history'] = cast_history
return cls(**info)
def to_json(self, indent=4, sort_keys=True) -> str:
from .json import to_json
return to_json(self, indent=indent, sort_keys=sort_keys)
def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
from .csv import to_csv
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]
@property
def link_dir(self) -> str:
from ..config import CONFIG
return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
@property
def archive_path(self) -> str:
from ..config import ARCHIVE_DIR_NAME
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
@property
def archive_size(self) -> float:
try:
return get_dir_size(self.archive_path)[0]
except Exception:
return 0
### URL Helpers
@property
def url_hash(self):
from ..util import hashurl
return hashurl(self.url)
@property
def scheme(self) -> str:
from ..util import scheme
return scheme(self.url)
@property
def extension(self) -> str:
from ..util import extension
return extension(self.url)
@property
def domain(self) -> str:
from ..util import domain
return domain(self.url)
@property
def path(self) -> str:
from ..util import path
return path(self.url)
@property
def basename(self) -> str:
from ..util import basename
return basename(self.url)
@property
def base_url(self) -> str:
from ..util import base_url
return base_url(self.url)
### Pretty Printing Helpers
@property
def bookmarked_date(self) -> Optional[str]:
from ..util import ts_to_date
max_ts = (datetime.now() + timedelta(days=30)).timestamp()
if self.timestamp and self.timestamp.replace('.', '').isdigit():
if 0 < float(self.timestamp) < max_ts:
return ts_to_date(datetime.fromtimestamp(float(self.timestamp)))
else:
return str(self.timestamp)
return None
@property
def updated_date(self) -> Optional[str]:
from ..util import ts_to_date
return ts_to_date(self.updated) if self.updated else None
@property
def archive_dates(self) -> List[datetime]:
return [
result.start_ts
for method in self.history.keys()
for result in self.history[method]
]
@property
def oldest_archive_date(self) -> Optional[datetime]:
return min(self.archive_dates, default=None)
@property
def newest_archive_date(self) -> Optional[datetime]:
return max(self.archive_dates, default=None)
### Archive Status Helpers
@property
def num_outputs(self) -> int:
return len(tuple(filter(None, self.latest_outputs().values())))
@property
def num_failures(self) -> int:
return sum(1
for method in self.history.keys()
for result in self.history[method]
if result.status == 'failed')
@property
def is_static(self) -> bool:
from ..util import is_static_file
return is_static_file(self.url)
@property
def is_archived(self) -> bool:
from ..config import ARCHIVE_DIR
from ..util import domain
output_paths = (
domain(self.url),
'output.pdf',
'screenshot.png',
'output.html',
'media',
)
return any(
os.path.exists(os.path.join(ARCHIVE_DIR, self.timestamp, path))
for path in output_paths
)
def latest_outputs(self, status: str=None) -> Dict[str, ArchiveOutput]:
"""get the latest output that each archive method produced for link"""
ARCHIVE_METHODS = (
'title', 'favicon', 'wget', 'warc', 'pdf',
'screenshot', 'dom', 'git', 'media', 'archive_org',
)
latest: Dict[str, ArchiveOutput] = {}
for archive_method in ARCHIVE_METHODS:
# get most recent succesful result in history for each archive method
history = self.history.get(archive_method) or []
history = list(filter(lambda result: result.output, reversed(history)))
if status is not None:
history = list(filter(lambda result: result.status == status, history))
history = list(history)
if history:
latest[archive_method] = history[0].output
else:
latest[archive_method] = None
return latest
def canonical_outputs(self) -> Dict[str, Optional[str]]:
"""predict the expected output paths that should be present after archiving"""
from ..extractors.wget import wget_output_path
canonical = {
'index_path': 'index.html',
'favicon_path': 'favicon.ico',
'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
'wget_path': wget_output_path(self),
'warc_path': 'warc',
'pdf_path': 'output.pdf',
'screenshot_path': 'screenshot.png',
'dom_path': 'output.html',
'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
'git_path': 'git',
'media_path': 'media',
}
if self.is_static:
# static binary files like PDF and images are handled slightly differently.
# they're just downloaded once and aren't archived separately multiple times,
# so the wget, screenshot, & pdf urls should all point to the same file
static_path = wget_output_path(self)
canonical.update({
'title': self.basename,
'wget_path': static_path,
'pdf_path': static_path,
'screenshot_path': static_path,
'dom_path': static_path,
})
return canonical

90
archivebox/index/sql.py Normal file
View file

@ -0,0 +1,90 @@
__package__ = 'archivebox.index'
from io import StringIO
from typing import List, Tuple, Iterator
from .schema import Link
from ..util import enforce_types
from ..config import setup_django, OUTPUT_DIR
### Main Links Index
@enforce_types
def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
return (
Link.from_json(page.as_json(*Snapshot.keys))
for page in Snapshot.objects.all()
)
@enforce_types
def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
from django.db import transaction
with transaction.atomic():
for link in links:
Snapshot.objects.filter(url=link.url).delete()
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
from django.db import transaction
with transaction.atomic():
for link in links:
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
Snapshot.objects.update_or_create(url=link.url, defaults=info)
@enforce_types
def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
setup_django(out_dir, check_db=True)
from core.models import Snapshot
from django.db import transaction
with transaction.atomic():
snap = Snapshot.objects.get(url=link.url, timestamp=link.timestamp)
snap.title = link.title
snap.tags = link.tags
snap.save()
@enforce_types
def list_migrations(out_dir: str=OUTPUT_DIR) -> List[Tuple[bool, str]]:
setup_django(out_dir, check_db=False)
from django.core.management import call_command
out = StringIO()
call_command("showmigrations", list=True, stdout=out)
out.seek(0)
migrations = []
for line in out.readlines():
if line.strip() and ']' in line:
status_str, name_str = line.strip().split(']', 1)
is_applied = 'X' in status_str
migration_name = name_str.strip()
migrations.append((is_applied, migration_name))
return migrations
@enforce_types
def apply_migrations(out_dir: str=OUTPUT_DIR) -> List[str]:
setup_django(out_dir, check_db=False)
from django.core.management import call_command
null, out = StringIO(), StringIO()
call_command("makemigrations", interactive=False, stdout=null)
call_command("migrate", interactive=False, stdout=out)
out.seek(0)
return [line.strip() for line in out.readlines() if line.strip()]
@enforce_types
def get_admins(out_dir: str=OUTPUT_DIR) -> List[str]:
setup_django(out_dir, check_db=False)
from django.contrib.auth.models import User
return User.objects.filter(is_superuser=True)

View file

@ -1,123 +0,0 @@
"""
In ArchiveBox, a Link represents a single entry that we track in the
json index. All links pass through all archiver functions and the latest,
most up-to-date canonical output for each is stored in "latest".
Link {
timestamp: str, (how we uniquely id links)
url: str,
title: str,
tags: str,
sources: [str],
history: {
pdf: [
{start_ts, end_ts, duration, cmd, pwd, status, output},
...
],
...
},
}
"""
from html import unescape
from collections import OrderedDict
from util import (
scheme,
merge_links,
check_link_structure,
check_links_structure,
)
from config import (
URL_BLACKLIST,
)
def validate_links(links):
check_links_structure(links)
links = archivable_links(links) # remove chrome://, about:, mailto: etc.
links = uniquefied_links(links) # merge/dedupe duplicate timestamps & urls
links = sorted_links(links) # deterministically sort the links based on timstamp, url
if not links:
print('[X] No links found :(')
raise SystemExit(1)
for link in links:
link['title'] = unescape(link['title'].strip()) if link['title'] else None
check_link_structure(link)
return list(links)
def archivable_links(links):
"""remove chrome://, about:// or other schemed links that cant be archived"""
for link in links:
scheme_is_valid = scheme(link['url']) in ('http', 'https', 'ftp')
not_blacklisted = (not URL_BLACKLIST.match(link['url'])) if URL_BLACKLIST else True
if scheme_is_valid and not_blacklisted:
yield link
def uniquefied_links(sorted_links):
"""
ensures that all non-duplicate links have monotonically increasing timestamps
"""
unique_urls = OrderedDict()
lower = lambda url: url.lower().strip()
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
for link in sorted_links:
fuzzy_url = without_www(without_trailing_slash(lower(link['url'])))
if fuzzy_url in unique_urls:
# merge with any other links that share the same url
link = merge_links(unique_urls[fuzzy_url], link)
unique_urls[fuzzy_url] = link
unique_timestamps = OrderedDict()
for link in unique_urls.values():
link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
unique_timestamps[link['timestamp']] = link
return unique_timestamps.values()
def sorted_links(links):
sort_func = lambda link: (link['timestamp'].split('.', 1)[0], link['url'])
return sorted(links, key=sort_func, reverse=True)
def links_after_timestamp(links, timestamp=None):
if not timestamp:
yield from links
return
for link in links:
try:
if float(link['timestamp']) <= float(timestamp):
yield link
except (ValueError, TypeError):
print('Resume value and all timestamp values must be valid numbers.')
def lowest_uniq_timestamp(used_timestamps, timestamp):
"""resolve duplicate timestamps by appending a decimal 1234, 1234 -> 1234.1, 1234.2"""
timestamp = timestamp.split('.')[0]
nonce = 0
# first try 152323423 before 152323423.0
if timestamp not in used_timestamps:
return timestamp
new_timestamp = '{}.{}'.format(timestamp, nonce)
while new_timestamp in used_timestamps:
nonce += 1
new_timestamp = '{}.{}'.format(timestamp, nonce)
return new_timestamp

547
archivebox/logging_util.py Normal file
View file

@ -0,0 +1,547 @@
__package__ = 'archivebox'
import re
import os
import sys
import time
import argparse
from multiprocessing import Process
from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
if TYPE_CHECKING:
from .index.schema import Link, ArchiveResult
from .util import enforce_types
from .config import (
ConfigDict,
PYTHON_ENCODING,
ANSI,
IS_TTY,
TERM_WIDTH,
OUTPUT_DIR,
SOURCES_DIR_NAME,
HTML_INDEX_FILENAME,
stderr,
)
@dataclass
class RuntimeStats:
"""mutable stats counter for logging archiving timing info to CLI output"""
skipped: int = 0
succeeded: int = 0
failed: int = 0
parse_start_ts: Optional[datetime] = None
parse_end_ts: Optional[datetime] = None
index_start_ts: Optional[datetime] = None
index_end_ts: Optional[datetime] = None
archiving_start_ts: Optional[datetime] = None
archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
_LAST_RUN_STATS = RuntimeStats()
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
def _split_lines(self, text, width):
if '\n' in text:
return text.splitlines()
return argparse.HelpFormatter._split_lines(self, text, width)
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if stdin_raw_text:
stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
stderr()
raise SystemExit(1)
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
"""accept any standard input and return it as a string or None"""
if not stdin:
return None
elif stdin and not stdin.isatty():
stdin_str = stdin.read().strip()
return stdin_str or None
return None
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
from .config import SHOW_PROGRESS
self.SHOW_PROGRESS = SHOW_PROGRESS
if self.SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix))
self.p.start()
self.stats = {'start_ts': datetime.now(), 'end_ts': None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now()
self.stats['end_ts'] = end_ts
if self.SHOW_PROGRESS:
# terminate if we havent already terminated
self.p.terminate()
self.p.join()
self.p.close()
# clear whole terminal line
try:
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
except (IOError, BrokenPipeError):
# ignore when the parent proc has stopped listening to our stdout
pass
@enforce_types
def progress_bar(seconds: int, prefix: str='') -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
chunk = '' if PYTHON_ENCODING == 'UTF-8' else '#'
last_width = TERM_WIDTH()
chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
for s in range(seconds * chunks):
max_width = TERM_WIDTH()
if max_width < last_width:
# when the terminal size is shrunk, we have to write a newline
# otherwise the progress bar will keep wrapping incorrectly
sys.stdout.write('\r\n')
sys.stdout.flush()
chunks = max_width - len(prefix) - 20
progress = s / chunks / seconds * 100
bar_width = round(progress/(100/chunks))
last_width = max_width
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(progress, 1),
round(s/chunks),
seconds,
))
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.flush()
except (KeyboardInterrupt, BrokenPipeError):
print()
pass
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
from .config import VERSION, ANSI
cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
stdin_hint = ' < /dev/stdin' if not stdin.isatty() else ''
stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
VERSION=VERSION,
cmd=cmd,
stdin_hint=stdin_hint,
**ANSI,
))
stderr('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
stderr()
### Parsing Stage
def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
_LAST_RUN_STATS.parse_start_ts = datetime.now()
print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
_LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
len(urls) if isinstance(urls, list) else len(urls.split('\n')),
depth,
' (index only)' if index_only else '',
**ANSI,
))
def log_source_saved(source_file: str):
print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
def log_parsing_finished(num_parsed: int, parser_name: str):
_LAST_RUN_STATS.parse_end_ts = datetime.now()
print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
def log_deduping_finished(num_new_links: int):
print(' > Found {} new URLs not already in index'.format(num_new_links))
def log_crawl_started(new_links):
print('{lightred}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(len(new_links), **ANSI))
### Indexing Stage
def log_indexing_process_started(num_links: int):
start_ts = datetime.now()
_LAST_RUN_STATS.index_start_ts = start_ts
print()
print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_indexing_process_finished():
end_ts = datetime.now()
_LAST_RUN_STATS.index_end_ts = end_ts
def log_indexing_started(out_path: str):
if IS_TTY:
sys.stdout.write(f' > {out_path}')
def log_indexing_finished(out_path: str):
print(f'\r{out_path}')
### Archiving Stage
def log_archiving_started(num_links: int, resume: Optional[float]=None):
start_ts = datetime.now()
_LAST_RUN_STATS.archiving_start_ts = start_ts
print()
if resume:
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
end_ts = datetime.now()
_LAST_RUN_STATS.archiving_end_ts = end_ts
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print()
print(' {lightred}Hint:{reset} To view your archive index, open:'.format(**ANSI))
print(' {}/{}'.format(OUTPUT_DIR, HTML_INDEX_FILENAME))
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
def log_archiving_finished(num_links: int):
end_ts = datetime.now()
_LAST_RUN_STATS.archiving_end_ts = end_ts
assert _LAST_RUN_STATS.archiving_start_ts is not None
seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60)
else:
duration = '{0:.2f} sec'.format(seconds)
print()
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
ANSI['reset'],
))
print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded))
print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
print()
print(' {lightred}Hint:{reset} To view your archive index, open:'.format(**ANSI))
print(' {}/{}'.format(OUTPUT_DIR, HTML_INDEX_FILENAME))
print(' Or run the built-in webserver:')
print(' archivebox server')
def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
# > output/archive/1478739709
print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
symbol_color=ANSI['green' if is_new else 'black'],
symbol='+' if is_new else '',
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
title=link.title or link.base_url,
**ANSI,
))
print(' {blue}{url}{reset}'.format(url=link.url, **ANSI))
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(link_dir),
))
def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict):
total = sum(stats.values())
if stats['failed'] > 0 :
_LAST_RUN_STATS.failed += 1
elif stats['skipped'] == total:
_LAST_RUN_STATS.skipped += 1
else:
_LAST_RUN_STATS.succeeded += 1
def log_archive_method_started(method: str):
print(' > {}'.format(method))
def log_archive_method_finished(result: "ArchiveResult"):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in result.cmd
)
if result.status == 'failed':
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
# Collect and prefix output lines with indentation
output_lines = [
'{lightred}Failed:{reset}'.format(**ANSI),
' {reset}{} {red}{}{reset}'.format(
result.output.__class__.__name__.replace('ArchiveError', ''),
result.output,
**ANSI,
),
*hints,
'{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
*([' cd {};'.format(result.pwd)] if result.pwd else []),
' {}'.format(quoted_cmd),
]
print('\n'.join(
' {}'.format(line)
for line in output_lines
if line
))
print()
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print('{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
filter_type,
**ANSI,
))
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_finished(links):
from .index.csv import links_to_csv
print()
print('---------------------------------------------------------------------------------------------------')
print(links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print()
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
if delete:
file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' (Pass --delete if you also want to permanently delete the data folders)'
)
if not yes:
print()
print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
try:
assert input(' y/[n]: ').lower() == 'y'
except (KeyboardInterrupt, EOFError, AssertionError):
raise SystemExit(0)
def log_removal_finished(all_links: int, to_keep: int):
if all_links == 0:
print()
print('{red}[X] No matching links found.{reset}'.format(**ANSI))
else:
num_removed = all_links - to_keep
print()
print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
num_removed,
all_links,
**ANSI,
))
print(' Index now contains {} links.'.format(to_keep))
def log_shell_welcome_msg():
from .cli import list_subcommands
print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
print('{green}from archivebox.core.models import Snapshot, User{reset}'.format(**ANSI))
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
print()
print('[i] Welcome to the ArchiveBox Shell!')
print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
print()
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
print(' print(Snapshot.objects.filter(is_archived=True).count())')
print(' Snapshot.objects.get(url="https://example.com").as_json()')
print(' add("https://example.com/some/new/url")')
### Helpers
@enforce_types
def pretty_path(path: str) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = os.path.abspath('.')
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
return path.replace(pwd + '/', './')
@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
@enforce_types
def printable_folders(folders: Dict[str, Optional["Link"]],
json: bool=False,
csv: Optional[str]=None) -> str:
if json:
from .index.json import to_json
return to_json(folders.values(), indent=4, sort_keys=True)
elif csv:
from .index.csv import links_to_csv
return links_to_csv(folders.values(), cols=csv.split(','), header=True)
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
@enforce_types
def printable_config(config: ConfigDict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
if os.path.exists(folder['path']):
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(os.path.getsize(folder['path']))
)
else:
num_files = 'missing'
if ' ' in folder['path']:
folder['path'] = f'"{folder["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(folder["path"] or '').ljust(76),
num_files.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note, version = 'green', '', 'valid', ''
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
if not version:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
if ' ' in dependency["path"]:
dependency["path"] = f'"{dependency["path"]}"'
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(22),
(dependency["path"] or '').ljust(76),
version.ljust(14),
ANSI[color],
note,
ANSI['reset'],
))

View file

@ -1,201 +0,0 @@
import sys
from datetime import datetime
from config import ANSI, REPO_DIR, OUTPUT_DIR
# globals are bad, mmkay
_LAST_RUN_STATS = {
'skipped': 0,
'succeeded': 0,
'failed': 0,
'parsing_start_ts': 0,
'parsing_end_ts': 0,
'indexing_start_ts': 0,
'indexing_end_ts': 0,
'archiving_start_ts': 0,
'archiving_end_ts': 0,
'links': {},
}
def pretty_path(path):
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
return path.replace(REPO_DIR + '/', '')
### Parsing Stage
def log_parsing_started(source_file):
start_ts = datetime.now()
_LAST_RUN_STATS['parse_start_ts'] = start_ts
print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
source_file.rsplit('/', 1)[-1],
**ANSI,
))
def log_parsing_finished(num_new_links, parser_name):
print(' > Adding {} new links to index (parsed import as {})'.format(
num_new_links,
parser_name,
))
### Indexing Stage
def log_indexing_process_started():
start_ts = datetime.now()
_LAST_RUN_STATS['index_start_ts'] = start_ts
print('{green}[*] [{}] Saving main index files...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
**ANSI,
))
def log_indexing_started(out_dir, out_file):
sys.stdout.write(' > {}/{}'.format(pretty_path(out_dir), out_file))
def log_indexing_finished(out_dir, out_file):
end_ts = datetime.now()
_LAST_RUN_STATS['index_end_ts'] = end_ts
print('\r{}/{}'.format(pretty_path(out_dir), out_file))
### Archiving Stage
def log_archiving_started(num_links, resume):
start_ts = datetime.now()
_LAST_RUN_STATS['start_ts'] = start_ts
if resume:
print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
**ANSI,
))
else:
print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
**ANSI,
))
def log_archiving_paused(num_links, idx, timestamp):
end_ts = datetime.now()
_LAST_RUN_STATS['end_ts'] = end_ts
print()
print('\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
**ANSI,
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
print(' Continue where you left off by running:')
print(' {} {}'.format(
pretty_path(sys.argv[0]),
timestamp,
))
def log_archiving_finished(num_links):
end_ts = datetime.now()
_LAST_RUN_STATS['end_ts'] = end_ts
seconds = end_ts.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60, 2)
else:
duration = '{0:.2f} sec'.format(seconds, 2)
print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
ANSI['green'],
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
ANSI['reset'],
))
print(' - {} links skipped'.format(_LAST_RUN_STATS['skipped']))
print(' - {} links updated'.format(_LAST_RUN_STATS['succeeded']))
print(' - {} links had errors'.format(_LAST_RUN_STATS['failed']))
print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
def log_link_archiving_started(link_dir, link, is_new):
# [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
# > output/archive/1478739709
print('\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
symbol_color=ANSI['green' if is_new else 'black'],
symbol='+' if is_new else '*',
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
title=link['title'] or link['url'],
**ANSI,
))
print(' {blue}{url}{reset}'.format(url=link['url'], **ANSI))
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(link_dir),
))
def log_link_archiving_finished(link_dir, link, is_new, stats):
total = sum(stats.values())
if stats['failed'] > 0 :
_LAST_RUN_STATS['failed'] += 1
elif stats['skipped'] == total:
_LAST_RUN_STATS['skipped'] += 1
else:
_LAST_RUN_STATS['succeeded'] += 1
def log_archive_method_started(method):
print(' > {}'.format(method))
def log_archive_method_finished(result):
"""quote the argument with whitespace in a command so the user can
copy-paste the outputted string directly to run the cmd
"""
required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
assert (
isinstance(result, dict)
and all(key in result for key in required_keys)
and ('output' in result)
), 'Archive method did not return a valid result.'
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if ' ' in arg else arg
for arg in result['cmd']
)
if result['status'] == 'failed':
# Prettify error output hints string and limit to five lines
hints = getattr(result['output'], 'hints', None) or ()
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
)
# Collect and prefix output lines with indentation
output_lines = [
'{}Failed:{} {}{}'.format(
ANSI['red'],
result['output'].__class__.__name__.replace('ArchiveError', ''),
result['output'],
ANSI['reset']
),
*hints,
'{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
' cd {};'.format(result['pwd']),
' {}'.format(quoted_cmd),
]
print('\n'.join(
' {}'.format(line)
for line in output_lines
if line
))

1057
archivebox/main.py Normal file

File diff suppressed because it is too large Load diff

30
archivebox/manage.py Executable file
View file

@ -0,0 +1,30 @@
#!/usr/bin/env python
import os
import sys
if __name__ == '__main__':
# if you're a developer working on archivebox, still prefer the archivebox
# versions of ./manage.py commands whenever possible. When that's not possible
# (e.g. makemigrations), you can comment out this check temporarily
if not ('makemigrations' in sys.argv or 'migrate' in sys.argv):
print("[X] Don't run ./manage.py directly, use the archivebox CLI instead e.g.:")
print(' archivebox manage createsuperuser')
print()
print(' Hint: Use these archivebox commands instead of the ./manage.py equivalents:')
print(' archivebox init (migrates the databse to latest version)')
print(' archivebox server (runs the Django web server)')
print(' archivebox shell (opens an iPython Django shell with all models imported)')
print(' archivebox manage [cmd] (any other management commands)')
raise SystemExit(2)
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
try:
from django.core.management import execute_from_command_line
except ImportError as exc:
raise ImportError(
"Couldn't import Django. Are you sure it's installed and "
"available on your PYTHONPATH environment variable? Did you "
"forget to activate a virtual environment?"
) from exc
execute_from_command_line(sys.argv)

3
archivebox/mypy.ini Normal file
View file

@ -0,0 +1,3 @@
[mypy]
plugins =
mypy_django_plugin.main

View file

@ -1,315 +0,0 @@
"""
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
Link: {
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
'timestamp': '1544212312.4234',
'title': 'Example.com Page Title',
'tags': 'abc,def',
'sources': [
'output/sources/ril_export.html',
'output/sources/getpocket.com-1523422111.txt',
'output/sources/stdin-234234112312.txt'
]
}
"""
import re
import json
from datetime import datetime
import xml.etree.ElementTree as etree
from config import TIMEOUT
from util import (
str_between,
URL_REGEX,
check_url_parsing_invariants,
TimedProgress,
)
def parse_links(source_file):
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
check_url_parsing_invariants()
PARSERS = (
# Specialized parsers
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_rss_export),
('Generic JSON', parse_json_export),
# Fallback parser
('Plain Text', parse_plain_text_export),
)
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try:
links = list(parser_func(file))
if links:
timer.end()
return links, parser_name
except Exception as err:
# Parsers are tried one by one down the list, and the first one
# that succeeds is used. To see why a certain parser was not used
# due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
pass
timer.end()
return [], 'Failed to parse'
### Import Parser Functions
def parse_pocket_html_export(html_file):
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title or None,
'tags': tags or '',
'sources': [html_file.name],
}
def parse_json_export(json_file):
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%SZ')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip() or None
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip() or None
elif link.get('name'):
title = link['name'].strip() or None
yield {
'url': url,
'timestamp': ts_str,
'title': title,
'tags': link.get('tags') or '',
'sources': [json_file.name],
}
def parse_rss_export(rss_file):
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title,
'tags': '',
'sources': [rss_file.name],
}
def parse_shaarli_rss_export(rss_file):
"""Parse Shaarli-specific RSS XML-format files into links"""
rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:]
for entry in entries:
# example entry:
# <entry>
# <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
# <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
# <id>https://demo.shaarli.org/?cEV4vw</id>
# <published>2019-01-30T06:06:01+00:00</published>
# <updated>2019-01-30T06:06:01+00:00</updated>
# <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
# </entry>
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
title = str_between(get_row('title'), '<title>', '</title>').strip()
url = str_between(get_row('link'), '<link href="', '" />')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
def parse_netscape_html_export(html_file):
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for line in html_file:
# example line
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
match = pattern.search(line)
if match:
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': match.group(3).strip() or None,
'tags': '',
'sources': [html_file.name],
}
def parse_pinboard_rss_export(rss_file):
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
url = item.find("{http://purl.org/rss/1.0/}link").text
tags = item.find("{http://purl.org/dc/elements/1.1/}subject").text if item.find("{http://purl.org/dc/elements/1.1/}subject") else None
title = item.find("{http://purl.org/rss/1.0/}title").text.strip() if item.find("{http://purl.org/rss/1.0/}title").text.strip() else None
ts_str = item.find("{http://purl.org/dc/elements/1.1/}date").text if item.find("{http://purl.org/dc/elements/1.1/}date").text else None
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title or None,
'tags': tags or '',
'sources': [rss_file.name],
}
def parse_medium_rss_export(rss_file):
"""Parse Medium RSS feed files into links"""
rss_file.seek(0)
root = etree.parse(rss_file).getroot()
items = root.find("channel").findall("item")
for item in items:
url = item.find("link").text
title = item.find("title").text.strip()
ts_str = item.find("pubDate").text
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z")
yield {
'url': url,
'timestamp': str(time.timestamp()),
'title': title or None,
'tags': '',
'sources': [rss_file.name],
}
def parse_plain_text_export(text_file):
"""Parse raw links from each line in a text file"""
text_file.seek(0)
for line in text_file.readlines():
urls = re.findall(URL_REGEX, line) if line.strip() else ()
for url in urls:
yield {
'url': url,
'timestamp': str(datetime.now().timestamp()),
'title': None,
'tags': '',
'sources': [text_file.name],
}

View file

@ -0,0 +1,159 @@
"""
Everything related to parsing links from input sources.
For a list of supported services, see the README.md.
For examples of supported import formats see tests/.
"""
__package__ = 'archivebox.parsers'
import re
import os
from typing import Tuple, List
from datetime import datetime
from ..system import atomic_write
from ..config import (
ANSI,
OUTPUT_DIR,
SOURCES_DIR_NAME,
TIMEOUT,
)
from ..util import (
basename,
download_url,
enforce_types,
URL_REGEX,
)
from ..index.schema import Link
from ..logging_util import TimedProgress, log_source_saved
from .pocket_html import parse_pocket_html_export
from .pinboard_rss import parse_pinboard_rss_export
from .shaarli_rss import parse_shaarli_rss_export
from .medium_rss import parse_medium_rss_export
from .netscape_html import parse_netscape_html_export
from .generic_rss import parse_generic_rss_export
from .generic_json import parse_generic_json_export
from .generic_txt import parse_generic_txt_export
@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
"""parse a list of URLs with their metadata from an
RSS feed, bookmarks export, or text file
"""
check_url_parsing_invariants()
PARSERS = (
# Specialized parsers
('Pocket HTML', parse_pocket_html_export),
('Pinboard RSS', parse_pinboard_rss_export),
('Shaarli RSS', parse_shaarli_rss_export),
('Medium RSS', parse_medium_rss_export),
# General parsers
('Netscape HTML', parse_netscape_html_export),
('Generic RSS', parse_generic_rss_export),
('Generic JSON', parse_generic_json_export),
# Fallback parser
('Plain Text', parse_generic_txt_export),
)
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
for parser_name, parser_func in PARSERS:
try:
links = list(parser_func(file))
if links:
timer.end()
return links, parser_name
except Exception as err: # noqa
pass
# Parsers are tried one by one down the list, and the first one
# that succeeds is used. To see why a certain parser was not used
# due to error or format incompatibility, uncomment this line:
# print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
# raise
timer.end()
return [], 'Failed to parse'
@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(ts=ts))
atomic_write(source_path, raw_text)
log_source_saved(source_file=source_path)
return source_path
@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-<timestamp>.txt"""
ts = str(datetime.now().timestamp()).split('.', 1)[0]
source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
# Source is a URL that needs to be downloaded
print('{}[*] [{}] Downloading {}{}'.format(
ANSI['green'],
datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
path,
ANSI['reset'],
))
timer = TimedProgress(timeout, prefix=' ')
try:
raw_source_text = download_url(path, timeout=timeout)
timer.end()
except Exception as e:
timer.end()
print('{}[!] Failed to download {}{}\n'.format(
ANSI['red'],
path,
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
else:
# Source is a path to a local file on the filesystem
with open(path, 'r') as f:
raw_source_text = f.read()
atomic_write(source_path, raw_source_text)
log_source_saved(source_file=source_path)
return source_path
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12

View file

@ -0,0 +1,65 @@
__package__ = 'archivebox.parsers'
import json
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
"""Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag)"""
json_file.seek(0)
links = json.load(json_file)
json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')
for link in links:
# example line
# {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
if link:
# Parse URL
url = link.get('href') or link.get('url') or link.get('URL')
if not url:
raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')
# Parse the timestamp
ts_str = str(datetime.now().timestamp())
if link.get('timestamp'):
# chrome/ff histories use a very precise timestamp
ts_str = str(link['timestamp'] / 10000000)
elif link.get('time'):
ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
elif link.get('created_at'):
ts_str = str(json_date(link['created_at']).timestamp())
elif link.get('created'):
ts_str = str(json_date(link['created']).timestamp())
elif link.get('date'):
ts_str = str(json_date(link['date']).timestamp())
elif link.get('bookmarked'):
ts_str = str(json_date(link['bookmarked']).timestamp())
elif link.get('saved'):
ts_str = str(json_date(link['saved']).timestamp())
# Parse the title
title = None
if link.get('title'):
title = link['title'].strip()
elif link.get('description'):
title = link['description'].replace(' — Readability', '').strip()
elif link.get('name'):
title = link['name'].strip()
yield Link(
url=htmldecode(url),
timestamp=ts_str,
title=htmldecode(title) or None,
tags=htmldecode(link.get('tags')) or '',
sources=[json_file.name],
)

View file

@ -0,0 +1,49 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
)
@enforce_types
def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse RSS XML-format files into links"""
rss_file.seek(0)
items = rss_file.read().split('<item>')
items = items[1:] if items else []
for item in items:
# example item:
# <item>
# <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
# <category>Unread</category>
# <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
# <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
# <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
# </item>
trailing_removed = item.split('</item>', 1)[0]
leading_removed = trailing_removed.split('<item>', 1)[-1].strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r for r in rows if r.strip().startswith('<{}>'.format(key))][0]
url = str_between(get_row('link'), '<link>', '</link>')
ts_str = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %z")
title = str_between(get_row('title'), '<![CDATA[', ']]').strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

@ -0,0 +1,57 @@
__package__ = 'archivebox.parsers'
__description__ = 'Plain Text'
import re
from typing import IO, Iterable
from datetime import datetime
from pathlib import Path
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
URL_REGEX
)
@enforce_types
def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
"""Parse raw links from each line in a text file"""
text_file.seek(0)
for line in text_file.readlines():
if not line.strip():
continue
# if the line is a local file path that resolves, then we can archive it
if Path(line).exists():
yield Link(
url=line,
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)
# otherwise look for anything that looks like a URL in the line
for url in re.findall(URL_REGEX, line):
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)
# look inside the URL for any sub-urls, e.g. for archive.org links
# https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
# -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
for url in re.findall(URL_REGEX, line[1:]):
yield Link(
url=htmldecode(url),
timestamp=str(datetime.now().timestamp()),
title=None,
tags=None,
sources=[text_file.name],
)

View file

@ -0,0 +1,35 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Medium RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.find("channel").findall("item") # type: ignore
for item in items:
url = item.find("link").text # type: ignore
title = item.find("title").text.strip() # type: ignore
ts_str = item.find("pubDate").text # type: ignore
time = datetime.strptime(ts_str, "%a, %d %b %Y %H:%M:%S %Z") # type: ignore
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

@ -0,0 +1,39 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse netscape-format bookmarks export files (produced by all browsers)"""
html_file.seek(0)
pattern = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)
for line in html_file:
# example line
# <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
match = pattern.search(line)
if match:
url = match.group(1)
time = datetime.fromtimestamp(float(match.group(2)))
title = match.group(3).strip()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[html_file.name],
)

View file

@ -0,0 +1,47 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Pinboard RSS feed files into links"""
rss_file.seek(0)
root = ElementTree.parse(rss_file).getroot()
items = root.findall("{http://purl.org/rss/1.0/}item")
for item in items:
find = lambda p: item.find(p).text.strip() if item.find(p) else None # type: ignore
url = find("{http://purl.org/rss/1.0/}link")
tags = find("{http://purl.org/dc/elements/1.1/}subject")
title = find("{http://purl.org/rss/1.0/}title")
ts_str = find("{http://purl.org/dc/elements/1.1/}date")
# Pinboard includes a colon in its date stamp timezone offsets, which
# Python can't parse. Remove it:
if ts_str and ts_str[-3:-2] == ":":
ts_str = ts_str[:-3]+ts_str[-2:]
if ts_str:
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
else:
time = datetime.now()
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=htmldecode(tags) or None,
sources=[rss_file.name],
)

View file

@ -0,0 +1,38 @@
__package__ = 'archivebox.parsers'
import re
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
)
@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
"""Parse Pocket-format bookmarks export files (produced by getpocket.com/export/)"""
html_file.seek(0)
pattern = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)
for line in html_file:
# example line
# <li><a href="http://example.com/ time_added="1478739709" tags="tag1,tag2">example title</a></li>
match = pattern.search(line)
if match:
url = match.group(1).replace('http://www.readability.com/read?url=', '') # remove old readability prefixes to get original url
time = datetime.fromtimestamp(float(match.group(2)))
tags = match.group(3)
title = match.group(4).replace(' — Readability', '').replace('http://www.readability.com/read?url=', '')
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags or '',
sources=[html_file.name],
)

View file

@ -0,0 +1,50 @@
__package__ = 'archivebox.parsers'
from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
htmldecode,
enforce_types,
str_between,
)
@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
"""Parse Shaarli-specific RSS XML-format files into links"""
rss_file.seek(0)
entries = rss_file.read().split('<entry>')[1:]
for entry in entries:
# example entry:
# <entry>
# <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
# <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
# <id>https://demo.shaarli.org/?cEV4vw</id>
# <published>2019-01-30T06:06:01+00:00</published>
# <updated>2019-01-30T06:06:01+00:00</updated>
# <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
# </entry>
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
title = str_between(get_row('title'), '<title>', '</title>').strip()
url = str_between(get_row('link'), '<link href="', '" />')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
yield Link(
url=htmldecode(url),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=None,
sources=[rss_file.name],
)

View file

@ -1,86 +0,0 @@
#!/usr/bin/env python3
import re
from argparse import ArgumentParser
from os.path import exists, join
from shutil import rmtree
from typing import List
from config import ARCHIVE_DIR, OUTPUT_DIR
from index import (parse_json_links_index, write_html_links_index,
write_json_links_index)
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
if not exists(join(OUTPUT_DIR, 'index.json')):
exit('index.json is missing; nothing to do')
compiled = [re.compile(r) for r in regexes]
links = parse_json_links_index(OUTPUT_DIR)
filtered = []
remaining = []
for l in links:
url = l['url']
for r in compiled:
if r.search(url):
filtered.append((l, r))
break
else:
remaining.append(l)
if not filtered:
exit('Search did not match any entries.')
print('Filtered out {}/{} urls:'.format(len(filtered), len(links)))
for link, regex in filtered:
url = link['url']
print(' {url} via {regex}'.format(url=url, regex=regex.pattern))
if not proceed:
answer = input('Remove {} entries from index? [y/n] '.format(
len(filtered)))
proceed = answer.strip().lower() in ('y', 'yes')
if not proceed:
exit('Aborted')
write_json_links_index(OUTPUT_DIR, remaining)
write_html_links_index(OUTPUT_DIR, remaining)
if delete:
for link, _ in filtered:
data_dir = join(ARCHIVE_DIR, link['timestamp'])
if exists(data_dir):
rmtree(data_dir)
if __name__ == '__main__':
p = ArgumentParser('Index purging tool')
p.add_argument(
'--regex',
'-r',
action='append',
help='Regular expression matching URLs to purge',
)
p.add_argument(
'--delete',
'-d',
action='store_true',
default=False,
help='Delete webpage files from archive',
)
p.add_argument(
'--yes',
'-y',
action='store_true',
default=False,
help='Do not prompt for confirmation',
)
args = p.parse_args()
if args.regex:
cleanup_index(args.regex, proceed=args.yes, delete=args.delete)
else:
p.print_help()

116
archivebox/system.py Normal file
View file

@ -0,0 +1,116 @@
__package__ = 'archivebox'
import os
import shutil
from json import dump
from pathlib import Path
from typing import Optional, Union, Set, Tuple
from subprocess import run as subprocess_run
from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write
from .util import enforce_types, ExtendedEncoder
from .config import OUTPUT_PERMISSIONS
def run(*args, input=None, capture_output=True, text=False, **kwargs):
"""Patched of subprocess.run to fix blocking io making timeout=innefective"""
if input is not None:
if 'stdin' in kwargs:
raise ValueError('stdin and input arguments may not both be used.')
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used '
'with capture_output.')
return subprocess_run(*args, input=input, capture_output=capture_output, text=text, **kwargs)
@enforce_types
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
if isinstance(contents, dict):
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
elif isinstance(contents, (bytes, str)):
f.write(contents)
os.chmod(path, int(OUTPUT_PERMISSIONS, base=8))
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) -> None:
"""chmod -R <permissions> <cwd>/<path>"""
root = Path(cwd) / path
if not root.exists():
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
if not root.is_dir():
os.chmod(root, int(OUTPUT_PERMISSIONS, base=8))
else:
for subpath in Path(path).glob('**/*'):
os.chmod(subpath, int(OUTPUT_PERMISSIONS, base=8))
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
"""copy a given file or directory to a given path, overwriting the destination"""
if os.path.isdir(from_path):
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
contents = src.read()
atomic_write(to_path, contents)
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
for entry in os.scandir(path):
if (pattern is not None) and (pattern not in entry.path):
continue
if entry.is_dir(follow_symlinks=False):
if not recursive:
continue
num_dirs += 1
bytes_inside, dirs_inside, files_inside = get_dir_size(entry.path)
num_bytes += bytes_inside
num_dirs += dirs_inside
num_files += files_inside
else:
num_bytes += entry.stat(follow_symlinks=False).st_size
num_files += 1
return num_bytes, num_dirs, num_files
CRON_COMMENT = 'archivebox_schedule'
@enforce_types
def dedupe_cron_jobs(cron: CronTab) -> CronTab:
deduped: Set[Tuple[str, str]] = set()
for job in list(cron):
unique_tuple = (str(job.slices), job.command)
if unique_tuple not in deduped:
deduped.add(unique_tuple)
cron.remove(job)
for schedule, command in deduped:
job = cron.new(command=command, comment=CRON_COMMENT)
job.setall(schedule)
job.enable()
return cron

View file

@ -1,348 +0,0 @@
<html>
<head>
<meta charset="utf-8">
<title>$title</title>
<style>
html, body {
width: 100%;
height: 100%;
}
body {
background-color: #ddd;
}
header {
width: 100%;
height: 90px;
background-color: #aa1e55;
margin: 0px;
text-align: center;
color: white;
}
header h1 {
padding-top: 5px;
padding-bottom: 5px;
margin: 0px;
font-weight: 200;
font-family: "Gill Sans", Helvetica, sans-serif;
font-size: calc(16px + 1vw);
}
.collapse-icon {
float: right;
color: black;
width: 126px;
font-size: 0.8em;
margin-top: 20px;
margin-right: 0px;
margin-left: -35px;
}
.nav-icon img {
float: left;
display: block;
margin-right: 13px;
color: black;
height: 53px;
margin-top: 12px;
margin-left: 10px;
}
.nav-icon img:hover {
opacity: 0.5;
}
.title-url {
color: black;
display: block;
width: 75%;
white-space: nowrap;
overflow: hidden;
margin: auto;
}
.archive-page-header {
margin-top: 5px;
margin-bottom: 5px;
}
.archive-page-header .alert {
margin-bottom: 0px;
}
h1 small {
opacity: 0.4;
font-size: 0.6em;
}
h1 small:hover {
opacity: 0.8;
}
.card {
overflow: hidden;
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
}
.card h4 {
font-size: 1.4vw;
}
.card-body {
font-size: 1vw;
padding-top: 1.2vw;
padding-left: 1vw;
padding-right: 1vw;
padding-bottom: 1vw;
line-height: 1.1;
word-wrap: break-word;
max-height: 102px;
overflow: hidden;
}
.card-img-top {
border: 0px;
padding: 0px;
margin: 0px;
overflow: hidden;
opacity: 0.8;
border-top: 1px solid gray;
border-radius: 3px;
border-bottom: 1px solid #ddd;
height: 430px;
width: 400%;
margin-bottom: -330px;
transform: scale(0.25);
transform-origin: 0 0;
}
.full-page-iframe {
border-top: 1px solid #ddd;
width: 100%;
height: 69vh;
margin: 0px;
border: 0px;
border-top: 3px solid #aa1e55;
}
.card.selected-card {
border: 2px solid orange;
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
}
.iframe-large {
height: 93%;
margin-top: -10px;
}
.pdf-frame {
transform: none;
width: 100%;
height: 160px;
margin-top: -60px;
margin-bottom: 0px;
}
img.external {
height: 30px;
margin-right: -10px;
padding: 3px;
border-radius: 4px;
vertical-align: middle;
border: 4px solid rgba(0,0,0,0);
}
img.external:hover {
border: 4px solid green;
}
.screenshot {
transform: none;
width: 100%;
height: auto;
max-height: 100px;
margin-bottom: 0px;
object-fit: cover;
object-position: top;
}
@media(max-width: 1092px) {
iframe {
display: none;
}
}
@media(max-width: 728px) {
.card h4 {
font-size: 5vw;
}
.card-body {
font-size: 4vw;
}
.card {
margin-bottom: 5px;
}
header > h1 > a.collapse-icon, header > h1 > a.nav-icon {
display: none;
}
}
</style>
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" integrity="sha384-rwoIResjU2yc3z8GV/NPeZWAv56rSmLldC3R/AZzGRnGxQQKnKkoFVhFQhNUwEyJ" crossorigin="anonymous">
</head>
<body>
<header>
<h1 class="page-title">
<a href="../../index.html" class="nav-icon" title="Go to Main Index...">
<img src="../../static/archive.png" alt="Archive Icon">
</a>
<a href="#" class="collapse-icon" style="text-decoration: none" title="Toggle info panel...">
</a>
<img src="$favicon_url" height="20px"> $title<br/>
<a href="$url" class="title-url">
<small>$base_url</small>
</a>
</h1>
</header>
<div class="site-header container-fluid">
<div class="row archive-page-header">
<div class="col-lg-4 alert well">
Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
&nbsp; | &nbsp;
Last updated: <small title="Timestamp: $updated">$updated_date</small>
&nbsp; | &nbsp;
Total files: <small title="Archive methods">🗃 $num_outputs</small>
</div>
<div class="col-lg-4 alert well">
Type:
<span class="badge badge-default">$extension</span>
&nbsp; | &nbsp;
Tags:
<span class="badge badge-warning">$tags</span>
&nbsp; | &nbsp;
Status:
<span class="badge badge-$status_color">$status</span>
</div>
<div class="col-lg-4 alert well">
Archive Methods:
<a href="index.json" title="JSON summary of archived link.">JSON</a> |
<a href="warc/" title="Any WARC archives for the page">WARC</a> |
<a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
<a href="git/" title="Any git repos at the url">Git Repos</a> |
<a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
<a href="." title="Webserver-provided index of files directory.">See all files...</a>
</div>
<hr/>
<div class="col-lg-2">
<div class="card selected-card">
<iframe class="card-img-top" src="$archive_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$archive_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$archive_url" target="preview"><h4 class="card-title">Local Archive</h4></a>
<p class="card-text">archive/$domain</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$dom_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$dom_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$dom_url" target="preview"><h4 class="card-title">HTML</h4></a>
<p class="card-text">archive/output.html</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top pdf-frame" src="$pdf_url" scrolling="no"></iframe>
<div class="card-body">
<a href="$pdf_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$pdf_url" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
<p class="card-text">archive/output.pdf</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<img class="card-img-top screenshot" src="$screenshot_url"></iframe>
<div class="card-body">
<a href="$screenshot_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$screenshot_url" target="preview"><h4 class="card-title">Screenshot</h4></a>
<p class="card-text">archive/screenshot.png</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
<p class="card-text">$domain</p>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="$archive_org_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="$archive_org_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
<img src="../../static/external.png" class="external"/>
</a>
<a href="$archive_org_url" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<p class="card-text">web.archive.org/web/...</p>
</div>
</div>
</div>
</div>
</div>
<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="$archive_url" name="preview"></iframe>
<script
src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g="
crossorigin="anonymous"></script>
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
<script>
// show selected file in iframe when preview card is clicked
jQuery('.card').on('click', function(e) {
jQuery('.selected-card').removeClass('selected-card')
jQuery(e.target).closest('.card').addClass('selected-card')
})
jQuery('.card a[target=preview]').on('click', function(e) {
if (e.currentTarget.href.endsWith('.pdf')) {
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
} else {
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
}
return true
})
// un-sandbox iframes showing pdfs (required to display pdf viewer)
jQuery('iframe').map(function() {
if (this.src.endsWith('.pdf')) {
this.removeAttribute('sandbox')
this.src = this.src
}
})
// hide header when collapse icon is clicked
jQuery('.collapse-icon').on('click', function() {
if (jQuery('.collapse-icon').text().includes('▾')) {
jQuery('.collapse-icon').text('▸')
jQuery('.site-header').hide()
jQuery('.full-page-iframe').addClass('iframe-large')
} else {
jQuery('.collapse-icon').text('▾')
jQuery('.site-header').show()
jQuery('.full-page-iframe').removeClass('iframe-large')
}
return true
})
// hide all preview iframes on small screens
if (window.innerWidth < 1091) {
jQuery('.card a[target=preview]').attr('target', '_self')
}
var pdf_frame = document.querySelector('.pdf-frame');
pdf_frame.onload = function () {
pdf_frame.contentWindow.scrollTo(0, 400);
}
</script>
</body>
</html>

View file

@ -1,34 +0,0 @@
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<!-- This is an automatically generated file.
It will be read and overwritten.
DO NOT EDIT! -->
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Bookmarks</TITLE>
<H1>Bookmarks Menu</H1>
<DL><p>
<DT><A HREF="place:folder=BOOKMARKS_MENU&folder=UNFILED_BOOKMARKS&folder=TOOLBAR&queryType=1&sort=12&maxResults=10&excludeQueries=1" ADD_DATE="1409779227" LAST_MODIFIED="1470506008">Recently Bookmarked</A>
<DT><A HREF="place:type=6&sort=14&maxResults=10" ADD_DATE="1470506008" LAST_MODIFIED="1470506008">Recent Tags</A>
<HR> <DT><H3 ADD_DATE="1409779227" LAST_MODIFIED="1409779227">Mozilla Firefox</H3>
<DL><p>
<DT><A HREF="https://www.mozilla.org/en-US/firefox/help/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/0-1409779227970" ICON="">Help and Tutorials</A>
<DT><A HREF="https://www.mozilla.org/en-US/firefox/customize/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/1-1409779227971" ICON="">Customize Firefox</A>
<DT><A HREF="https://www.mozilla.org/en-US/contribute/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/2-1409779227973" ICON="">Get Involved</A>
<DT><A HREF="https://www.mozilla.org/en-US/about/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/3-1409779227974" ICON="">About Us</A>
</DL><p>
<DT><H3 ADD_DATE="1497562973" LAST_MODIFIED="1497562974">[Folder Name]</H3>
<DL><p>
<DT><A HREF="https://duckduckgo.com/?q=firefox+export+bookmarks&t=ffhp&ia=web" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://duckduckgo.com/favicon.ico" ICON="">firefox export bookmarks at DuckDuckGo</A>
<DT><A HREF="https://duckduckgo.com/?q=archive+firefox+bookmarks&t=ffab&ia=web" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://duckduckgo.com/favicon.ico" ICON="">archive firefox bookmarks at DuckDuckGo</A>
<DT><A HREF="https://github.com/nodiscc" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">nodiscc (nodiscc) · GitHub</A>
<DT><A HREF="https://github.com/pirate/ArchiveBox#troubleshooting" ADD_DATE="1497562975" LAST_MODIFIED="1497562975" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">pirate/ArchiveBox · Github</A>
<DT><A HREF="http://www.cs.unc.edu/~fabian/papers/foniks-oak11.pdf" ADD_DATE="1497562976" LAST_MODIFIED="1497562976" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">Phonotactic Reconstruction of Encrypted VoIP Conversations</A>
<DT><A HREF="https://www.ghacks.net/2009/07/23/firefox-bookmarks-archiver/" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://www.ghacks.net/wp-content/uploads/2005/10/favicon.ico" ICON="">Firefox Bookmarks Archiver - gHacks Tech News</A>
</DL><p>
<DT><H3 ADD_DATE="1409779227" LAST_MODIFIED="1470506008" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks Toolbar</H3>
<DD>Add bookmarks to this folder to see them displayed on the Bookmarks Toolbar
<DL><p>
<DT><A HREF="place:sort=8&maxResults=10" ADD_DATE="1470506008" LAST_MODIFIED="1470506008">Most Visited</A>
<DT><A HREF="https://www.mozilla.org/en-US/firefox/central/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227">Getting Started</A>
</DL><p>
</DL>

View file

@ -1,12 +0,0 @@
<!DOCTYPE NETSCAPE-Bookmark-file-1>
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
<TITLE>Pinboard Bookmarks</TITLE>
<H1>Bookmarks</H1>
<DL>
<p>
<DT><A HREF="https://github.com/trailofbits/algo" ADD_DATE="1542616733" PRIVATE="1" TOREAD="1" TAGS="vpn,scripts,toread">Algo VPN scripts</A>
<DT><A HREF="http://www.ulisp.com/" ADD_DATE="1542374412" PRIVATE="1" TOREAD="1" TAGS="arduino,avr,embedded,lisp,toread">uLisp</A>
</DL>
</p>

View file

@ -1,8 +0,0 @@
[{"href":"https:\/\/en.wikipedia.org\/wiki\/International_Typographic_Style","description":"International Typographic Style - Wikipedia, the free encyclopedia","extended":"","meta":"32f4cc916e6f5919cc19aceb10559cc1","hash":"3dd64e155e16731d20350bec6bef7cb5","time":"2016-06-07T11:27:08Z","shared":"no","toread":"yes","tags":""},
{"href":"https:\/\/news.ycombinator.com\/item?id=11686984","description":"Announcing Certbot: EFF's Client for Let's Encrypt | Hacker News","extended":"","meta":"4a49602ba5d20ec3505c75d38ebc1d63","hash":"1c1acb53a5bd520e8529ce4f9600abee","time":"2016-05-13T05:46:16Z","shared":"no","toread":"yes","tags":""},
{"href":"https:\/\/github.com\/google\/styleguide","description":"GitHub - google\/styleguide: Style guides for Google-originated open-source projects","extended":"","meta":"15a8d50f7295f18ccb6dd19cb689c68a","hash":"1028bf9872d8e4ea1b1858f4044abb58","time":"2016-02-24T08:49:25Z","shared":"no","toread":"no","tags":"code.style.guide programming reference web.dev"},
{"href":"http:\/\/en.wikipedia.org\/wiki\/List_of_XML_and_HTML_character_entity_references","description":"List of XML and HTML character entity references - Wikipedia, the free encyclopedia","extended":"","meta":"6683a70f0f59c92c0bfd0bce653eab69","hash":"344d975c6251a8d460971fa2c43d9bbb","time":"2014-06-16T04:17:15Z","shared":"no","toread":"no","tags":"html reference web.dev typography"},
{"href":"https:\/\/pushover.net\/","description":"Pushover: Simple Notifications for Android, iOS, and Desktop","extended":"","meta":"1e68511234d9390d10b7772c8ccc4b9e","hash":"bb93374ead8a937b18c7c46e13168a7d","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"app android"},
{"href":"http:\/\/www.reddit.com\/r\/Android","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 1"},
{"href":"http:\/\/www.reddit.com\/r\/Android2","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e2","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 2"},
{"href":"http:\/\/www.reddit.com\/r\/Android3","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e4","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 3"}]

View file

@ -1,46 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
<channel rdf:about="http://pinboard.in">
<title>Pinboard (private aaronmueller)</title>
<link>https://pinboard.in/u:aaronmueller/private/</link>
<description></description>
<items>
<rdf:Seq>
<rdf:li rdf:resource="https://mehkee.com/"/>
<rdf:li rdf:resource="https://qmk.fm/"/>
</rdf:Seq>
</items>
</channel>
<item rdf:about="https://mehkee.com/">
<title>Mehkee - Mechanical Keyboard Parts &amp; Accessories</title>
<dc:date>2018-11-08T21:29:32+00:00</dc:date>
<link>https://mehkee.com/</link>
<dc:creator>aaronmueller</dc:creator>
<dc:subject>keyboard gadget diy</dc:subject>
<dc:source>http://pinboard.in/</dc:source>
<dc:identifier>http://pinboard.in/u:aaronmueller/b:xxx/</dc:identifier>
<taxo:topics>
<rdf:Bag>
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:keyboard"/>
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:gadget"/>
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:diy"/>
</rdf:Bag>
</taxo:topics>
</item>
<item rdf:about="https://qmk.fm/">
<title>QMK Firmware - An open source firmware for AVR and ARM based keyboards</title>
<dc:date>2018-11-06T22:36:21+00:00</dc:date>
<link>https://qmk.fm/</link>
<dc:creator>aaronmueller</dc:creator>
<dc:subject>firmware keyboard</dc:subject>
<dc:source>http://pinboard.in/</dc:source>
<dc:identifier>http://pinboard.in/u:aaronmueller/b:xxx/</dc:identifier>
<taxo:topics>
<rdf:Bag>
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:firmware"/>
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:keyboard"/>
</rdf:Bag>
</taxo:topics>
</item>
</rdf:RDF>

View file

@ -1,5 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?>
<posts user="aaronmueller">
<post href="https://github.com/trailofbits/algo" time="2018-11-19T08:38:53Z" description="Algo VPN scripts" extended="" tag="vpn scripts" hash="18d708f67bb26d843b1cac4530bb52aa" shared="no" toread="yes" />
<post href="http://www.ulisp.com/" time="2018-11-16T13:20:12Z" description="uLisp" extended="" tag="arduino avr embedded lisp" hash="2a17ae95925a03a5b9bb38cf7f6c6f9b" shared="no" toread="yes" />
</posts>

View file

@ -1,2 +0,0 @@
[{"href":"https:\/\/github.com\/trailofbits\/algo","description":"Algo VPN scripts","extended":"","meta":"62325ba3b577683aee854d7f191034dc","hash":"18d708f67bb26d843b1cac4530bb52aa","time":"2018-11-19T08:38:53Z","shared":"no","toread":"yes","tags":"vpn scripts"},
{"href":"http:\/\/www.ulisp.com\/","description":"uLisp","extended":"","meta":"7bd0c0ef31f69d1459e3d37366e742b3","hash":"2a17ae95925a03a5b9bb38cf7f6c6f9b","time":"2018-11-16T13:20:12Z","shared":"no","toread":"yes","tags":"arduino avr embedded lisp"}]

View file

@ -1,38 +0,0 @@
<!DOCTYPE html>
<html>
<!--So long and thanks for all the fish-->
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<title>Pocket Export</title>
</head>
<body>
<h1>Unread</h1>
<ul>
<li><a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3110382/" time_added="1493913054" tags="">The Radical Plasticity Thesis: How the Brain Learns to be Conscious</a></li>
<li><a href="https://martinfowler.com/eaaDev/uiArchs.html" time_added="1493909628" tags="">GUI Architectures</a></li>
<li><a href="https://issuu.com/crowdcraft/docs/shanghai-talk-july-2012" time_added="1493900327" tags="make512">Shanghai Talk July 2012 by Mike Hall - issuu</a></li>
<li><a href="http://make512.weebly.com/about-us.html" time_added="1493900002" tags="">About Us - make512</a></li>
<li><a href="https://openzfsonosx.org/wiki/ZFS_on_Boot" time_added="1493887140" tags="">ZFS on Boot - OpenZFS on OS X</a></li>
<li><a href="http://www.softpanorama.org/DNS/history.shtml" time_added="1493869958" tags="">History of DNS</a></li>
<li><a href="https://chromium.googlesource.com/chromium/src/+/master/docs/linux_sandboxing.md" time_added="1493869649" tags="">Linux Sandboxing</a></li>
<li><a href="https://hackernoon.com/rems-and-ems-and-why-you-probably-dont-need-them-664b9ce1e09f" time_added="1493694979" tags="">rems and ems, and why you probably dont need them Hacker Noon</a></li>
<li><a href="https://wiki.archlinux.org/index.php/full_system_backup_with_rsync" time_added="1493581911" tags="">Full system backup with rsync - ArchWiki</a></li>
<li><a href="https://www.youtube.com/watch?v=iNnAQpAHfmA" time_added="1493581911" tags="">SingUnltd. - Nature Boy (Flying Lotus Massage Situation Sample?! )</a></li>
</ul>
<h1>Read Archive</h1>
<ul>
<li><a href="https://github.com/Droogans/unmaintainable-code" time_added="1478739800" tags="">Droogans/unmaintainable-code: An easier to share version of the infamous ht</a></li>
<li><a href="http://www.benstopford.com/2015/02/14/log-structured-merge-trees/" time_added="1478739709" tags="">Log Structured Merge Trees - ben stopford</a></li>
<li><a href="http://jgthms.com/web-design-in-4-minutes/#share" time_added="1478739628" tags="">Web Design in 4 minutes</a></li>
<li><a href="https://eev.ee/blog/2016/07/26/the-hardest-problem-in-computer-science/" time_added="1478739622" tags="">The hardest problem in computer science / fuzzy notepad</a></li>
<li><a href="https://medium.com/@iamjordanlittle/9-underutilized-features-in-css-90ced6ddbfe7#.690ah7whf" time_added="1476686912" tags="">9 Underutilized Features in CSS Medium</a></li>
<li><a href="http://themacro.com/articles/2016/09/employee-1-coinbase/" time_added="1476686907" tags="">Employee #1: Coinbase · The Macro</a></li>
<li><a href="https://juokaz.com/blog/becoming-a-cto" time_added="1476686904" tags="">Becoming a CTO // Juozas Kaziukėnas</a></li>
<li><a href="https://backchannel.com/the-internet-really-has-changed-everything-here-s-the-proof-928eaead18a8#.ekfmwcjh2" time_added="1476686896" tags="">The Internet Really Has Changed Everything. Heres the Proof.</a></li>
<li><a href="http://www.hindawi.com/journals/ijbm/2011/172389/" time_added="1424321329" tags="">Experimental and Modeling Study of Collagen Scaffolds with the Effects of C</a></li>
<li><a href="http://search.cpan.org/dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod?#A_Localization_Horror_Story:_It_Could_Happen_To_You" time_added="1424306906" tags="">Locale::Maketext::TPJ13 - search.cpan.org</a></li>
</ul>
</body>
</html>

View file

@ -1,228 +0,0 @@
<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
xmlns:content="http://purl.org/rss/1.0/modules/content/"
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:atom="http://www.w3.org/2005/Atom"
>
<channel>
<title>My Reading List: Read and Unread</title>
<description>Items I've saved to read</description>
<link>http://readitlaterlist.com/users/nikisweeting/feed/all</link>
<atom:link href="http://readitlaterlist.com/users/nikisweeting/feed/all" rel="self" type="application/rss+xml" />
<item>
<title><![CDATA[Cell signaling]]></title>
<category>Unread</category>
<link>https://en.wikipedia.org/wiki/Cell_signaling</link>
<guid>https://en.wikipedia.org/wiki/Cell_signaling</guid>
<pubDate>Mon, 30 Oct 2017 01:12:10 -0500</pubDate>
</item>
<item>
<title><![CDATA[Hayflick limit]]></title>
<category>Unread</category>
<link>https://en.wikipedia.org/wiki/Hayflick_limit</link>
<guid>https://en.wikipedia.org/wiki/Hayflick_limit</guid>
<pubDate>Mon, 30 Oct 2017 01:11:38 -0500</pubDate>
</item>
<item>
<title><![CDATA[Even moderate drinking by parents can upset children  study]]></title>
<category>Unread</category>
<link>https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal</link>
<guid>https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal</guid>
<pubDate>Mon, 30 Oct 2017 01:11:30 -0500</pubDate>
</item>
<item>
<title><![CDATA[How Merkle trees enable the decentralized Web]]></title>
<category>Unread</category>
<link>https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web</link>
<guid>https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web</guid>
<pubDate>Mon, 30 Oct 2017 01:11:30 -0500</pubDate>
</item>
<item>
<title><![CDATA[Inertial navigation system]]></title>
<category>Unread</category>
<link>https://en.wikipedia.org/wiki/Inertial_navigation_system</link>
<guid>https://en.wikipedia.org/wiki/Inertial_navigation_system</guid>
<pubDate>Mon, 30 Oct 2017 01:10:10 -0500</pubDate>
</item>
<item>
<title><![CDATA[Dead reckoning]]></title>
<category>Unread</category>
<link>https://en.wikipedia.org/wiki/Dead_reckoning</link>
<guid>https://en.wikipedia.org/wiki/Dead_reckoning</guid>
<pubDate>Mon, 30 Oct 2017 01:10:08 -0500</pubDate>
</item>
<item>
<title><![CDATA[Calling Rust From Python]]></title>
<category>Unread</category>
<link>https://bheisler.github.io/post/calling-rust-in-python</link>
<guid>https://bheisler.github.io/post/calling-rust-in-python</guid>
<pubDate>Mon, 30 Oct 2017 01:04:33 -0500</pubDate>
</item>
<item>
<title><![CDATA[Why would anyone choose Docker over fat binaries?]]></title>
<category>Unread</category>
<link>http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries</link>
<guid>http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries</guid>
<pubDate>Sun, 29 Oct 2017 14:57:25 -0500</pubDate>
</item>
<item>
<title><![CDATA[]]></title>
<category>Unread</category>
<link>https://heml.io</link>
<guid>https://heml.io</guid>
<pubDate>Sun, 29 Oct 2017 14:55:26 -0500</pubDate>
</item>
<item>
<title><![CDATA[A surprising amount of people want to be in North Korea]]></title>
<category>Unread</category>
<link>https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad</link>
<guid>https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad</guid>
<pubDate>Sat, 28 Oct 2017 05:41:41 -0500</pubDate>
</item>
<item>
<title><![CDATA[Learning a Hierarchy]]></title>
<category>Unread</category>
<link>https://blog.openai.com/learning-a-hierarchy</link>
<guid>https://blog.openai.com/learning-a-hierarchy</guid>
<pubDate>Thu, 26 Oct 2017 16:43:48 -0500</pubDate>
</item>
<item>
<title><![CDATA[High Performance Browser Networking]]></title>
<category>Unread</category>
<link>https://hpbn.co</link>
<guid>https://hpbn.co</guid>
<pubDate>Wed, 25 Oct 2017 19:05:24 -0500</pubDate>
</item>
<item>
<title><![CDATA[What tender and juicy drama is going on at your school/workplace?]]></title>
<category>Unread</category>
<link>https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v</link>
<guid>https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v</guid>
<pubDate>Wed, 25 Oct 2017 18:05:58 -0500</pubDate>
</item>
<item>
<title><![CDATA[Using an SSH Bastion Host]]></title>
<category>Unread</category>
<link>https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host</link>
<guid>https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host</guid>
<pubDate>Wed, 25 Oct 2017 11:38:47 -0500</pubDate>
</item>
<item>
<title><![CDATA[Let's Define &quot;undefined&quot; | NathanShane.me]]></title>
<category>Unread</category>
<link>https://nathanshane.me/blog/let's-define-undefined</link>
<guid>https://nathanshane.me/blog/let's-define-undefined</guid>
<pubDate>Wed, 25 Oct 2017 11:32:59 -0500</pubDate>
</item>
<item>
<title><![CDATA[Control theory]]></title>
<category>Unread</category>
<link>https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function</link>
<guid>https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function</guid>
<pubDate>Tue, 24 Oct 2017 22:57:43 -0500</pubDate>
</item>
<item>
<title><![CDATA[J012-86-intractable.pdf]]></title>
<category>Unread</category>
<link>http://mit.edu/~jnt/Papers/J012-86-intractable.pdf</link>
<guid>http://mit.edu/~jnt/Papers/J012-86-intractable.pdf</guid>
<pubDate>Tue, 24 Oct 2017 22:56:32 -0500</pubDate>
</item>
<item>
<title><![CDATA[Dynamic Programming: First Principles]]></title>
<category>Unread</category>
<link>http://flawlessrhetoric.com/Dynamic-Programming-First-Principles</link>
<guid>http://flawlessrhetoric.com/Dynamic-Programming-First-Principles</guid>
<pubDate>Tue, 24 Oct 2017 22:56:30 -0500</pubDate>
</item>
<item>
<title><![CDATA[What Would Happen If There Were No Number 6?]]></title>
<category>Unread</category>
<link>https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6</link>
<guid>https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6</guid>
<pubDate>Tue, 24 Oct 2017 22:21:59 -0500</pubDate>
</item>
<item>
<title><![CDATA[Ten Basic Rules for Adventure]]></title>
<category>Unread</category>
<link>https://outsideonline.com/2252916/10-basic-rules-adventure</link>
<guid>https://outsideonline.com/2252916/10-basic-rules-adventure</guid>
<pubDate>Tue, 24 Oct 2017 20:56:25 -0500</pubDate>
</item>
<item>
<title><![CDATA[Insects Are In Serious Trouble]]></title>
<category>Unread</category>
<link>https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true</link>
<guid>https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true</guid>
<pubDate>Mon, 23 Oct 2017 23:10:10 -0500</pubDate>
</item>
<item>
<title><![CDATA[Netflix/bless]]></title>
<category>Unread</category>
<link>https://github.com/Netflix/bless</link>
<guid>https://github.com/Netflix/bless</guid>
<pubDate>Mon, 23 Oct 2017 23:04:46 -0500</pubDate>
</item>
<item>
<title><![CDATA[Getting Your First 10 Customers]]></title>
<category>Unread</category>
<link>https://stripe.com/atlas/guides/starting-sales</link>
<guid>https://stripe.com/atlas/guides/starting-sales</guid>
<pubDate>Mon, 23 Oct 2017 22:27:36 -0500</pubDate>
</item>
<item>
<title><![CDATA[GPS Hardware]]></title>
<category>Unread</category>
<link>https://novasummits.com/gps-hardware</link>
<guid>https://novasummits.com/gps-hardware</guid>
<pubDate>Mon, 23 Oct 2017 04:44:40 -0500</pubDate>
</item>
<item>
<title><![CDATA[Bicycle Tires and Tubes]]></title>
<category>Unread</category>
<link>http://sheldonbrown.com/tires.html#pressure</link>
<guid>http://sheldonbrown.com/tires.html#pressure</guid>
<pubDate>Mon, 23 Oct 2017 01:28:32 -0500</pubDate>
</item>
<item>
<title><![CDATA[Tire light is on]]></title>
<category>Unread</category>
<link>https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe</link>
<guid>https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe</guid>
<pubDate>Mon, 23 Oct 2017 01:21:42 -0500</pubDate>
</item>
<item>
<title><![CDATA[Bad_Salish_Boo ?? on Twitter]]></title>
<category>Unread</category>
<link>https://t.co/PDLlNjACv9</link>
<guid>https://t.co/PDLlNjACv9</guid>
<pubDate>Sat, 21 Oct 2017 06:48:07 -0500</pubDate>
</item>
<item>
<title><![CDATA[Is an Open Marriage a Happier Marriage?]]></title>
<category>Unread</category>
<link>https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html</link>
<guid>https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html</guid>
<pubDate>Fri, 20 Oct 2017 13:08:52 -0500</pubDate>
</item>
<item>
<title><![CDATA[The Invention of Monogamy]]></title>
<category>Unread</category>
<link>https://thenib.com/the-invention-of-monogamy</link>
<guid>https://thenib.com/the-invention-of-monogamy</guid>
<pubDate>Fri, 20 Oct 2017 12:19:00 -0500</pubDate>
</item>
<item>
<title><![CDATA[Google Chrome May Add a Permission to Stop In-Browser Cryptocurrency Miners]]></title>
<category>Unread</category>
<link>https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners</link>
<guid>https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners</guid>
<pubDate>Fri, 20 Oct 2017 03:57:41 -0500</pubDate>
</item>
</channel>
</rss>

View file

@ -1,92 +0,0 @@
#!/usr/bin/env python3
import json
import os
from os.path import dirname, pardir, join
from subprocess import check_output, check_call
from tempfile import TemporaryDirectory
from typing import List
import pytest
ARCHIVER_BIN = join(dirname(__file__), pardir, 'archive.py')
class Helper:
def __init__(self, output_dir: str):
self.output_dir = output_dir
def run(self, links, env=None, env_defaults=None):
if env_defaults is None:
env_defaults = {
# we don't wanna spam archive.org witin our tests..
'SUBMIT_ARCHIVE_DOT_ORG': 'False',
}
if env is None:
env = {}
env = dict(**env_defaults, **env)
jj = []
for url in links:
jj.append({
'href': url,
'description': url,
})
input_json = join(self.output_dir, 'input.json')
with open(input_json, 'w') as fo:
json.dump(jj, fo)
if env is None:
env = {}
env['OUTPUT_DIR'] = self.output_dir
check_call(
[ARCHIVER_BIN, input_json],
env={**os.environ.copy(), **env},
)
class TestArchiver:
def setup(self):
# self.tdir = TemporaryDirectory(dir='hello')
class AAA:
name = 'hello'
self.tdir = AAA()
def teardown(self):
pass
# self.tdir.cleanup()
@property
def output_dir(self):
return self.tdir.name
def test_fetch_favicon_false(self):
h = Helper(self.output_dir)
h.run(links=[
'https://google.com',
], env={
'FETCH_FAVICON': 'False',
})
# for now no asserts, good enough if it isn't failing
def test_3000_links(self):
"""
The pages are deliberatly unreachable. The tool should gracefully process all of them even though individual links are failing.
"""
h = Helper(self.output_dir)
h.run(links=[
f'https://localhost:123/whatever_{i}.html' for i in range(3000)
], env={
'FETCH_FAVICON': 'False',
'FETCH_SCREENSHOT': 'False',
'FETCH_PDF': 'False',
'FETCH_DOM': 'False',
'CHECK_SSL_VALIDITY': 'False',
})
if __name__ == '__main__':
pytest.main([__file__])

View file

@ -0,0 +1 @@
actions_as_select

View file

@ -0,0 +1,18 @@
{% extends "admin/index.html" %}
{% load i18n %}
{% block bodyclass %}{{ block.super }} app-{{ app_label }}{% endblock %}
{% if not is_popup %}
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
&rsaquo;
{% for app in app_list %}
{{ app.name }}
{% endfor %}
</div>
{% endblock %}
{% endif %}
{% block sidebar %}{% endblock %}

View file

@ -0,0 +1,188 @@
{% load i18n static %}<!DOCTYPE html>
{% get_current_language as LANGUAGE_CODE %}{% get_current_language_bidi as LANGUAGE_BIDI %}
<html lang="{{ LANGUAGE_CODE|default:"en-us" }}" {% if LANGUAGE_BIDI %}dir="rtl"{% endif %}>
<head>
<title>{% block title %}{% endblock %} | ArchiveBox</title>
<link rel="stylesheet" type="text/css" href="{% block stylesheet %}{% static "admin/css/base.css" %}{% endblock %}">
{% block extrastyle %}{% endblock %}
{% if LANGUAGE_BIDI %}<link rel="stylesheet" type="text/css" href="{% block stylesheet_rtl %}{% static "admin/css/rtl.css" %}{% endblock %}">{% endif %}
{% block extrahead %}{% endblock %}
{% block responsive %}
<meta name="viewport" content="user-scalable=no, width=device-width, initial-scale=1.0, maximum-scale=1.0">
<link rel="stylesheet" type="text/css" href="{% static "admin/css/responsive.css" %}">
{% if LANGUAGE_BIDI %}<link rel="stylesheet" type="text/css" href="{% static "admin/css/responsive_rtl.css" %}">{% endif %}
{% endblock %}
{% block blockbots %}<meta name="robots" content="NONE,NOARCHIVE">{% endblock %}
<link rel="stylesheet" type="text/css" href="{% static "admin.css" %}">
</head>
{% load i18n %}
<body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
data-admin-utc-offset="{% now "Z" %}">
<style nonce="{{nonce}}">
/* Loading Progress Bar */
#progress {
position: absolute;
z-index: 1000;
top: 0px;
left: -6px;
width: 2%;
opacity: 1;
height: 2px;
background: #1a1a1a;
border-radius: 1px;
transition: width 4s ease-out, opacity 400ms linear;
}
@-moz-keyframes bugfix { from { padding-right: 1px ; } to { padding-right: 0; } }
</style>
<script>
// Page Loading Bar
window.loadStart = function(distance) {
var distance = distance || 0;
// only add progrstess bar if not already present
if (django.jQuery("#loading-bar").length == 0) {
django.jQuery("body").add("<div id=\"loading-bar\"></div>");
}
if (django.jQuery("#progress").length === 0) {
django.jQuery("body").append(django.jQuery("<div></div>").attr("id", "progress"));
let last_distance = (distance || (30 + (Math.random() * 30)))
django.jQuery("#progress").width(last_distance + "%");
setInterval(function() {
last_distance += Math.random()
django.jQuery("#progress").width(last_distance + "%");
}, 1000)
}
};
window.loadFinish = function() {
django.jQuery("#progress").width("101%").delay(200).fadeOut(400, function() {
django.jQuery(this).remove();
});
};
window.loadStart();
window.addEventListener('beforeunload', function() {window.loadStart(27)});
document.addEventListener('DOMContentLoaded', function() {window.loadFinish()});
</script>
<!-- Container -->
<div id="container">
{% if not is_popup %}
<!-- Header -->
<div id="header">
<div id="branding">
<h1 id="site-name">
<a href="{% url 'Home' %}">
<img src="{% static 'archive.png' %}" id="logo">
ArchiveBox
</a>
</h1>
</div>
{% block usertools %}
{% if has_permission %}
<div id="user-tools">
<a href="{% url 'admin:Add' %}">Add </a> /
<a href="{% url 'Home' %}">Snapshots</a> /
<a href="/admin/auth/user/">Users</a> /
<a href="{% url 'OldHome' %}">Old UI</a> /
<a href="{% url 'Docs' %}">Docs</a>
&nbsp; &nbsp;
{% block welcome-msg %}
{% trans 'User' %}
<strong>{% firstof user.get_short_name user.get_username %}</strong> &nbsp; &nbsp;
{% endblock %}
{% block userlinks %}
{% if user.is_active and user.is_staff %}
{% url 'django-admindocs-docroot' as docsroot %}
{% if docsroot %}
<a href="{{ docsroot }}">{% trans 'Documentation' %}</a> /
{% endif %}
{% endif %}
{% if user.has_usable_password %}
<a href="{% url 'admin:password_change' %}">{% trans 'Change password' %}</a> /
{% endif %}
<a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
{% endblock %}
</div>
{% endif %}
{% endblock %}
{% block nav-global %}{% endblock %}
</div>
<!-- END Header -->
{% block breadcrumbs %}
<div class="breadcrumbs">
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
{% if title %} &rsaquo; {{ title }}{% endif %}
</div>
{% endblock %}
{% endif %}
{% block messages %}
{% if messages %}
<ul class="messagelist">{% for message in messages %}
<li{% if message.tags %} class="{{ message.tags }}"{% endif %}>{{ message|capfirst }}</li>
{% endfor %}</ul>
{% endif %}
{% endblock messages %}
<!-- Content -->
<div id="content" class="{% block coltype %}colM{% endblock %}">
{% block pretitle %}{% endblock %}
{% block content_title %}{# {% if title %}<h1>{{ title }}</h1>{% endif %} #}{% endblock %}
{% block content %}
{% block object-tools %}{% endblock %}
{{ content }}
{% endblock %}
{% block sidebar %}{% endblock %}
<br class="clear">
</div>
<!-- END Content -->
{% block footer %}<div id="footer"></div>{% endblock %}
</div>
<!-- END Container -->
<script>
(function ($) {
$.fn.reverse = [].reverse;
function fix_actions() {
var container = $('div.actions');
if (container.find('option').length < 10) {
container.find('label, button').hide();
var buttons = $('<div></div>')
.prependTo(container)
.css('display', 'inline')
.addClass('class', 'action-buttons');
container.find('option:gt(0)').reverse().each(function () {
const name = this.value
$('<button>')
.appendTo(buttons)
.attr('name', this.value)
.addClass('button')
.text(this.text)
.click(function () {
container.find('select')
.find(':selected').attr('selected', '').end()
.find('[value=' + this.name + ']').attr('selected', 'selected');
$('#changelist-form button[name="index"]').click();
document.querySelector('#logo').outerHTML = '<div class="loader"></div>'
});
});
}
};
$(function () {
fix_actions();
});
})(django.jQuery);
</script>
</body>
</html>

View file

@ -0,0 +1,100 @@
{% extends "admin/base_site.html" %}
{% load i18n static %}
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/login.css" %}">
{{ form.media }}
{% endblock %}
{% block bodyclass %}{{ block.super }} login{% endblock %}
{% block branding %}<h1>ArchiveBox Admin</h1>{% endblock %}
{% block usertools %}
<br/>
<a href="{% url 'Home' %}">Back to Main Index</a>
{% endblock %}
{% block nav-global %}{% endblock %}
{% block content_title %}
<center>
Log in to add, edit, and remove links from your archive.
</center><br/><br/>
<img src="{% static 'archive.png' %}" style="width: 80px; display: block; margin: auto"/><br/>
{% endblock %}
{% block breadcrumbs %}{% endblock %}
{% block content %}
{% if form.errors and not form.non_field_errors %}
<p class="errornote">
{% if form.errors.items|length == 1 %}{% trans "Please correct the error below." %}{% else %}{% trans "Please correct the errors below." %}{% endif %}
</p>
{% endif %}
{% if form.non_field_errors %}
{% for error in form.non_field_errors %}
<p class="errornote">
{{ error }}
</p>
{% endfor %}
{% endif %}
<div id="content-main">
{% if user.is_authenticated %}
<p class="errornote">
{% blocktrans trimmed %}
You are authenticated as {{ username }}, but are not authorized to
access this page. Would you like to login to a different account?
{% endblocktrans %}
</p>
{% endif %}
<br/>
<form action="{{ app_path }}" method="post" id="login-form">{% csrf_token %}
<div class="form-row">
{{ form.username.errors }}
{{ form.username.label_tag }} {{ form.username }}
</div>
<div class="form-row">
{{ form.password.errors }}
{{ form.password.label_tag }} {{ form.password }}
<input type="hidden" name="next" value="{{ next }}">
</div>
{% url 'admin_password_reset' as password_reset_url %}
{% if password_reset_url %}
<div class="password-reset-link">
<a href="{{ password_reset_url }}">{% trans 'Forgotten your password or username?' %}</a>
</div>
{% endif %}
<div class="submit-row">
<label>&nbsp;</label><input type="submit" value="{% trans 'Log in' %}">
</div>
</form>
<center>
<br/><br/>
<hr/>
<br/>
If you forgot your password, <a href="/accounts/password_reset/">reset it here</a> or run:<br/>
<pre>
archivebox manage changepassword USERNAME
</pre>
<br/><br/>
<hr/>
<br/>
To create a new admin user, run the following:
<pre>
archivebox manage createsuperuser
</pre>
<br/>
<hr/>
<small><i>(cd into your archive folder before running commands)</i></small>
</center>
</div>
{% endblock %}

Some files were not shown because too many files have changed in this diff Show more