mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-23 12:43:10 +00:00
Merge pull request #207 from pirate/django
This commit is contained in:
commit
56c697948d
159 changed files with 12366 additions and 3724 deletions
|
@ -1,20 +1,16 @@
|
|||
.DS_Store
|
||||
._*
|
||||
|
||||
*.pyc
|
||||
__pycache__/
|
||||
.mypy_cache/
|
||||
archivebox.egg-info/
|
||||
|
||||
venv/
|
||||
.venv/
|
||||
.docker-venv/
|
||||
|
||||
*.egg-info/
|
||||
build/
|
||||
dist/
|
||||
|
||||
# Dependency code
|
||||
.venv # main pipenv venv path
|
||||
venv # old venv path, (no longer used)
|
||||
archivebox/.venv # old venv path, (no longer used)
|
||||
archivebox/venv # old venv path, (no longer used)
|
||||
|
||||
|
||||
# Stateful data folders
|
||||
data # main archivebox data folder
|
||||
archivebox/output # old output folder path (no longer used)
|
||||
output # old output folder path (no longer used)
|
||||
data/
|
||||
output/
|
||||
|
|
6
.flake8
Normal file
6
.flake8
Normal file
|
@ -0,0 +1,6 @@
|
|||
[flake8]
|
||||
ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391
|
||||
select = F,E9,W
|
||||
max-line-length = 130
|
||||
max-complexity = 10
|
||||
exclude = migrations,tests,node_modules,vendor,venv,.venv,.venv2,.docker-venv
|
145
.github/workflows/test.yml
vendored
Normal file
145
.github/workflows/test.yml
vendored
Normal file
|
@ -0,0 +1,145 @@
|
|||
name: Test workflow
|
||||
on: [push]
|
||||
|
||||
env:
|
||||
MAX_LINE_LENGTH: 110
|
||||
|
||||
jobs:
|
||||
lint:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: 3.8
|
||||
architecture: x64
|
||||
|
||||
- name: Install flake8
|
||||
run: |
|
||||
pip install flake8
|
||||
|
||||
- name: Lint with flake8
|
||||
run: |
|
||||
# one pass for show-stopper syntax errors or undefined names
|
||||
flake8 archivebox --count --show-source --statistics
|
||||
# one pass for small stylistic things
|
||||
flake8 archivebox --count --max-line-length="$MAX_LINE_LENGTH" --statistics
|
||||
|
||||
# - name: Lint with mypy
|
||||
# run: |
|
||||
# pip install mypy
|
||||
# mypy archivebox || true
|
||||
|
||||
test:
|
||||
runs-on: ${{ matrix.os }}
|
||||
|
||||
strategy:
|
||||
matrix:
|
||||
os: [ubuntu-latest, macos-latest]
|
||||
python: [3.7, 3.8]
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- name: Set up Python ${{ matrix.python }}
|
||||
uses: actions/setup-python@v1
|
||||
with:
|
||||
python-version: ${{ matrix.python }}
|
||||
architecture: x64
|
||||
|
||||
- name: Cache virtualenv
|
||||
uses: actions/cache@v2
|
||||
id: cache-venv
|
||||
with:
|
||||
path: .venv
|
||||
key: ${{ runner.os }}-${{ matrix.python }}-venv-${{ hashFiles('setup.py') }}
|
||||
restore-keys: |
|
||||
${{ runner.os }}-${{ matrix.python }}-venv-
|
||||
|
||||
- name: Create virtualenv
|
||||
if: steps.cache-venv.outputs.cache-hit != 'true'
|
||||
run: |
|
||||
python3 -m venv .venv
|
||||
source .venv/bin/activate
|
||||
python3 -m pip install --upgrade pip setuptools
|
||||
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pip install .
|
||||
python -m pip install pytest bottle
|
||||
|
||||
- name: Test built package with pytest
|
||||
run: |
|
||||
source .venv/bin/activate
|
||||
python -m pytest -s
|
||||
|
||||
docker-test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
with:
|
||||
fetch-depth: 1
|
||||
|
||||
- uses: satackey/action-docker-layer-caching@v0.0.4
|
||||
|
||||
- name: Build image
|
||||
run: |
|
||||
docker build . -t archivebox
|
||||
|
||||
- name: Init data dir
|
||||
run: |
|
||||
mkdir data
|
||||
docker run -v "$PWD"/data:/data archivebox init
|
||||
|
||||
- name: Run test server
|
||||
run: |
|
||||
sudo bash -c 'echo "127.0.0.1 www.test-nginx-1.local www.test-nginx-2.local" >> /etc/hosts'
|
||||
docker run --name www-nginx -p 80:80 -d nginx
|
||||
|
||||
- name: Add link
|
||||
run: |
|
||||
docker run -v "$PWD"/data:/data --network host archivebox add http://www.test-nginx-1.local
|
||||
|
||||
- name: Add stdin link
|
||||
run: |
|
||||
echo "http://www.test-nginx-2.local" | docker run -i -v "$PWD"/data:/data archivebox add
|
||||
|
||||
- name: List links
|
||||
run: |
|
||||
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-1.local" || { echo "The site 1 isn't in the list"; exit 1; }
|
||||
docker run -v "$PWD"/data:/data archivebox list | grep -q "www.test-nginx-2.local" || { echo "The site 2 isn't in the list"; exit 1; }
|
||||
|
||||
|
||||
- name: Start docker-compose stack
|
||||
run: |
|
||||
docker-compose run archivebox init
|
||||
docker-compose up -d
|
||||
sleep 4
|
||||
curl --silent --location 'http://127.0.0.1:8000/static/admin/js/jquery.init.js' | grep 'django.jQuery'
|
||||
|
||||
- name: Check added urls show up in index
|
||||
run: |
|
||||
docker-compose run archivebox add 'http://example.com/#test_docker' --index-only
|
||||
curl --silent --location 'http://127.0.0.1:8000' | grep 'http://example.com/#test_docker'
|
||||
|
||||
- name: Curl index with PUBLIC_INDEX=False
|
||||
run: |
|
||||
docker-compose run archivebox config --set PUBLIC_INDEX=False
|
||||
docker-compose up -d || true
|
||||
sleep 8
|
||||
curl --silent --location 'http://127.0.0.1:8000' | grep 'Log in'
|
||||
docker-compose down
|
||||
|
||||
- name: Curl index with PUBLIC_INDEX=True
|
||||
run: |
|
||||
docker-compose run archivebox config --set PUBLIC_INDEX=True
|
||||
docker-compose up -d || true
|
||||
sleep 8
|
||||
curl --silent --location 'http://127.0.0.1:8000' | grep 'Add Links'
|
||||
docker-compose down
|
23
.gitignore
vendored
23
.gitignore
vendored
|
@ -1,21 +1,16 @@
|
|||
.DS_Store
|
||||
._*
|
||||
|
||||
*.pyc
|
||||
__pycache__/
|
||||
.mypy_cache/
|
||||
archivebox.egg-info/
|
||||
|
||||
venv/
|
||||
.venv/
|
||||
.docker-venv/
|
||||
|
||||
*.egg-info/
|
||||
build/
|
||||
dist/
|
||||
|
||||
|
||||
# Dependency code
|
||||
.venv # main pipenv venv path
|
||||
venv # old venv path, (no longer used)
|
||||
archivebox/.venv # old venv path, (no longer used)
|
||||
archivebox/venv # old venv path, (no longer used)
|
||||
|
||||
|
||||
# Stateful data folders
|
||||
data/ # main archivebox data folder
|
||||
archivebox/output/ # old output folder path (no longer used)
|
||||
output/ # old output folder path (no longer used)
|
||||
data/
|
||||
output/
|
||||
|
|
118
Dockerfile
118
Dockerfile
|
@ -1,74 +1,68 @@
|
|||
# This Dockerfile for ArchiveBox installs the following in a container:
|
||||
# - curl, wget, python3, youtube-dl, google-chrome-unstable
|
||||
# - ArchiveBox
|
||||
# This is the Dockerfile for ArchiveBox, it includes the following major pieces:
|
||||
# git, curl, wget, python3, youtube-dl, google-chrome-stable, ArchiveBox
|
||||
# Usage:
|
||||
# docker build github.com/pirate/ArchiveBox -t archivebox
|
||||
# echo 'https://example.com' | docker run -i -v ./data:/data archivebox /bin/archive
|
||||
# docker run -v ./data:/data archivebox /bin/archive 'https://example.com/some/rss/feed.xml'
|
||||
# docker build . -t archivebox
|
||||
# docker run -v "$PWD/data":/data archivebox init
|
||||
# docker run -v "$PWD/data":/data archivebox add 'https://example.com'
|
||||
# Documentation:
|
||||
# https://github.com/pirate/ArchiveBox/wiki/Docker#docker
|
||||
|
||||
# TODO: bump to latest chrome and node version, confirm chrome doesn't hang on simple pages
|
||||
FROM python:3.8-slim-buster
|
||||
|
||||
FROM node:11-slim
|
||||
LABEL maintainer="Nick Sweeting <archivebox-git@sweeting.me>"
|
||||
LABEL name="archivebox" \
|
||||
maintainer="Nick Sweeting <archivebox-git@sweeting.me>" \
|
||||
description="All-in-one personal internet archiving container"
|
||||
|
||||
RUN apt-get update \
|
||||
&& apt-get install -yq --no-install-recommends \
|
||||
git wget curl youtube-dl gnupg2 libgconf-2-4 python3 python3-pip \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install latest chrome package and fonts to support major charsets (Chinese, Japanese, Arabic, Hebrew, Thai and a few others)
|
||||
RUN apt-get update && apt-get install -y wget --no-install-recommends \
|
||||
&& wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
|
||||
&& sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
|
||||
&& apt-get update \
|
||||
&& apt-get install -y google-chrome-unstable fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst ttf-freefont \
|
||||
--no-install-recommends \
|
||||
&& rm -rf /var/lib/apt/lists/* \
|
||||
&& rm -rf /src/*.deb
|
||||
|
||||
# It's a good idea to use dumb-init to help prevent zombie chrome processes.
|
||||
ADD https://github.com/Yelp/dumb-init/releases/download/v1.2.0/dumb-init_1.2.0_amd64 /usr/local/bin/dumb-init
|
||||
RUN chmod +x /usr/local/bin/dumb-init
|
||||
|
||||
# Uncomment to skip the chromium download when installing puppeteer. If you do,
|
||||
# you'll need to launch puppeteer with:
|
||||
# browser.launch({executablePath: 'google-chrome-unstable'})
|
||||
ENV PUPPETEER_SKIP_CHROMIUM_DOWNLOAD true
|
||||
|
||||
# Install puppeteer so it's available in the container.
|
||||
RUN npm i puppeteer
|
||||
|
||||
# Add user so we don't need --no-sandbox.
|
||||
RUN groupadd -r pptruser && useradd -r -g pptruser -G audio,video pptruser \
|
||||
&& mkdir -p /home/pptruser/Downloads \
|
||||
&& chown -R pptruser:pptruser /home/pptruser \
|
||||
&& chown -R pptruser:pptruser /node_modules
|
||||
|
||||
# Install the ArchiveBox repository and pip requirements
|
||||
COPY . /home/pptruser/app
|
||||
RUN mkdir -p /data \
|
||||
&& chown -R pptruser:pptruser /data \
|
||||
&& ln -s /data /home/pptruser/app/archivebox/output \
|
||||
&& ln -s /home/pptruser/app/bin/* /bin/ \
|
||||
&& ln -s /home/pptruser/app/bin/archivebox /bin/archive \
|
||||
&& chown -R pptruser:pptruser /home/pptruser/app/archivebox
|
||||
# && pip3 install -r /home/pptruser/app/archivebox/requirements.txt
|
||||
|
||||
VOLUME /data
|
||||
|
||||
ENV LANG=C.UTF-8 \
|
||||
ENV TZ=UTC \
|
||||
LANGUAGE=en_US:en \
|
||||
LC_ALL=C.UTF-8 \
|
||||
LANG=C.UTF-8 \
|
||||
PYTHONIOENCODING=UTF-8 \
|
||||
CHROME_SANDBOX=False \
|
||||
CHROME_BINARY=google-chrome-unstable \
|
||||
OUTPUT_DIR=/data
|
||||
PYTHONUNBUFFERED=1 \
|
||||
APT_KEY_DONT_WARN_ON_DANGEROUS_USAGE=1 \
|
||||
CODE_PATH=/app \
|
||||
VENV_PATH=/venv \
|
||||
DATA_PATH=/data
|
||||
|
||||
# First install CLI utils and base deps, then Chrome + Fonts
|
||||
RUN echo 'debconf debconf/frontend select Noninteractive' | debconf-set-selections \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y --no-install-recommends \
|
||||
apt-transport-https ca-certificates apt-utils gnupg gosu gnupg2 libgconf-2-4 zlib1g-dev \
|
||||
dumb-init jq git wget curl youtube-dl ffmpeg \
|
||||
&& curl -sSL "https://dl.google.com/linux/linux_signing_key.pub" | apt-key add - \
|
||||
&& echo "deb https://dl.google.com/linux/chrome/deb/ stable main" > /etc/apt/sources.list.d/google-chrome.list \
|
||||
&& apt-get update -qq \
|
||||
&& apt-get install -qq -y --no-install-recommends \
|
||||
google-chrome-stable \
|
||||
fontconfig \
|
||||
fonts-ipafont-gothic \
|
||||
fonts-wqy-zenhei \
|
||||
fonts-thai-tlwg \
|
||||
fonts-kacst \
|
||||
fonts-symbola \
|
||||
fonts-noto \
|
||||
fonts-freefont-ttf \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Run everything from here on out as non-privileged user
|
||||
USER pptruser
|
||||
WORKDIR /home/pptruser/app
|
||||
RUN groupadd --system archivebox \
|
||||
&& useradd --system --create-home --gid archivebox --groups audio,video archivebox
|
||||
|
||||
ENTRYPOINT ["dumb-init", "--"]
|
||||
CMD ["/bin/archive"]
|
||||
ADD . "$CODE_PATH"
|
||||
WORKDIR "$CODE_PATH"
|
||||
ENV PATH="${PATH}:$VENV_PATH/bin"
|
||||
RUN python -m venv --clear --symlinks "$VENV_PATH" \
|
||||
&& pip install --upgrade pip setuptools \
|
||||
&& pip install -e .
|
||||
|
||||
VOLUME "$DATA_PATH"
|
||||
WORKDIR "$DATA_PATH"
|
||||
EXPOSE 8000
|
||||
ENV CHROME_BINARY=google-chrome \
|
||||
CHROME_SANDBOX=False
|
||||
|
||||
RUN env ALLOW_ROOT=True archivebox version
|
||||
|
||||
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh", "archivebox"]
|
||||
CMD ["server", "0.0.0.0:8000"]
|
||||
|
|
4
MANIFEST.in
Normal file
4
MANIFEST.in
Normal file
|
@ -0,0 +1,4 @@
|
|||
include LICENSE
|
||||
include README.md
|
||||
include archivebox/VERSION
|
||||
recursive-include archivebox/themes *
|
12
Pipfile
Normal file
12
Pipfile
Normal file
|
@ -0,0 +1,12 @@
|
|||
[[source]]
|
||||
name = "pypi"
|
||||
url = "https://pypi.org/simple"
|
||||
verify_ssl = true
|
||||
|
||||
[packages]
|
||||
# see setup.py for package dependency list
|
||||
"e1839a8" = {path = ".", editable = true}
|
||||
|
||||
[dev-packages]
|
||||
# see setup.py for dev package dependency list
|
||||
"e1839a8" = {path = ".", extras = ["dev"], editable = true}
|
259
README.md
259
README.md
|
@ -15,6 +15,7 @@
|
|||
</pre>
|
||||
|
||||
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
||||
|
||||
<a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
||||
<a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
|
||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
||||
|
@ -24,34 +25,38 @@
|
|||
|
||||
<hr/>
|
||||
<br/>
|
||||
<i>💥 Attention: Big API changes are coming soon (including a proper config file format and <code>pip install archivebox</code>)! Check out <a href="https://github.com/pirate/ArchiveBox/pull/207">v0.4</a> and help us test it! 💥</i>
|
||||
<i>💥 Attention: Big API changes are coming with the current release (including <code>pip install archivebox</code>)!
|
||||
<br/><br/>
|
||||
<b>Note: There are some important security design issues that need to be fixed before v0.4 can be pushed, all help is appreciated!<br/>
|
||||
(This project is not abandoned, it's my primary side-project for the foreseeable future, my day job is very busy right now.)<br/>
|
||||
See the <a href="https://github.com/pirate/ArchiveBox/pull/207#issuecomment-494107553">v0.4 release PR</a> for more information.</b>
|
||||
<br/>
|
||||
<hr/>
|
||||
|
||||
</div>
|
||||
|
||||
**ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).**
|
||||
**ArchiveBox takes a list of website URLs you want to archive, and creates a local, static, browsable HTML clone of the content from those websites (it saves HTML, JS, media files, PDFs, images and more).**
|
||||
|
||||
You can use it to preserve access to websites you care about by storing them locally offline. ArchiveBox imports lists of URLs, renders the pages in a headless, authenticated, user-scriptable browser, and then archives the content in multiple redundant common formats (HTML, PDF, PNG, WARC) that will last long after the originals disappear off the internet. It automatically extracts assets and media from pages and saves them in easily-accessible folders, with out-of-the-box support for extracting git repositories, audio, video, subtitles, images, PDFs, and more.
|
||||
You can use it to preserve access to websites you care about by storing them locally offline. ArchiveBox imports lists of URLs, renders the pages in a headless, authenticated, user-scriptable browser, and then archives the content in multiple redundant common formats (HTML, PDF, PNG, WARC) that will last long after the originals disappear off the internet. It automatically extracts assets and media from pages and saves them in easily-accessible folders, with out-of-the-box support for extracting git repositories, audio, video, subtitles, images, PDFs, and more.
|
||||
|
||||
#### How does it work?
|
||||
|
||||
```bash
|
||||
echo 'http://example.com' | ./archive
|
||||
mkdir data && cd data
|
||||
archivebox init
|
||||
archivebox add 'https://example.com'
|
||||
archivebox add 'https://getpocket.com/users/USERNAME/feed/all' --depth=1
|
||||
archivebox server
|
||||
```
|
||||
After installing the dependencies, just pipe some new links into the `./archive` command to start your archive.
|
||||
|
||||
ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl, pywb, and other common UNIX tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer).
|
||||
After installing archivebox, just pass some new links to the `archivebox add` command to start your collection.
|
||||
|
||||
ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl, pywb, and other common UNIX tools to save each page you add in multiple redundant formats. It doesn't require a constantly running server or backend, just open the generated `output/index.html` in a browser to view the archive. It can import and export links as JSON (among other formats), so it's easy to script or hook up to other APIs. If you run it on a schedule and import from browser history or bookmarks regularly, you can sleep soundly knowing that the slice of the internet you care about will be automatically preserved in multiple, durable long-term formats that will be accessible for decades (or longer).
|
||||
|
||||
<div align="center">
|
||||
|
||||
<img src="https://i.imgur.com/3tBL7PU.png" width="30%" alt="CLI Screenshot" align="top">
|
||||
<img src="https://i.imgur.com/viklZNG.png" width="30%" alt="Desktop index screenshot" align="top">
|
||||
<img src="https://i.imgur.com/RefWsXB.jpg" width="30%" alt="Desktop details page Screenshot"/><br/>
|
||||
<img src="https://i.imgur.com/3tBL7PU.png" width="22%" alt="CLI Screenshot" align="top">
|
||||
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
|
||||
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
|
||||
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
|
||||
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
|
||||
<br/>
|
||||
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
|
||||
|
@ -60,25 +65,52 @@ ArchiveBox is written in Python 3.7 and uses wget, Chrome headless, youtube-dl,
|
|||
## Quickstart
|
||||
|
||||
ArchiveBox is written in `python3.7` and has [3 main binary dependencies](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies): `wget`, `chromium`, and `youtube-dl`.
|
||||
To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
|
||||
To get started, you can [install them manually](https://github.com/pirate/ArchiveBox/wiki/Install) using your system's package manager, use the [automated helper script](https://github.com/pirate/ArchiveBox/wiki/Quickstart), or use the official [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker) container. All three dependencies are optional if [disabled](https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles) in settings.
|
||||
|
||||
```bash
|
||||
# 1. Install dependencies (use apt on ubuntu, brew on mac, or pkg on BSD)
|
||||
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
|
||||
|
||||
# 2. Download ArchiveBox
|
||||
git clone https://github.com/pirate/ArchiveBox.git && cd ArchiveBox
|
||||
|
||||
# 3. Add your first links to your archive
|
||||
echo 'https://example.com' | ./archive # pass URLs to archive via stdin
|
||||
|
||||
./archive https://getpocket.com/users/example/feed/all # or import an RSS/JSON/XML/TXT feed
|
||||
# Docker
|
||||
mkdir data && cd data
|
||||
docker run -v $PWD:/data nikisweeting/archivebox init
|
||||
docker run -v $PWD:/data nikisweeting/archivebox add 'https://example.com'
|
||||
docker run -v $PWD:/data -p 8000:8000 nikisweeting/archivebox server
|
||||
open http://127.0.0.1:8000
|
||||
```
|
||||
|
||||
Once you've added your first links, open `output/index.html` in a browser to view the archive. [DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
||||
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
|
||||
```bash
|
||||
# Docker Compose
|
||||
# Download https://github.com/pirate/ArchiveBox/tree/master/docker-compose.yml
|
||||
docker-compose run archivebox init
|
||||
docker-compose run archivebox add 'https://example.com'
|
||||
docker-compose up
|
||||
```
|
||||
|
||||
*(`pip install archivebox` will be available in the near future, follow our [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for progress)*
|
||||
```bash
|
||||
# Bare Metal
|
||||
# Use apt on Ubuntu/Debian, brew on mac, or pkg on BSD
|
||||
apt install python3 python3-pip git curl wget youtube-dl chromium-browser
|
||||
|
||||
pip install archivebox # install archivebox
|
||||
|
||||
mkdir data && cd data # (doesn't have to be called data)
|
||||
archivebox init
|
||||
archivebox add 'https://example.com' # add URLs via args or stdin
|
||||
|
||||
# or import an RSS/JSON/XML/TXT feed/list of links
|
||||
archivebox add https://getpocket.com/users/USERNAME/feed/all --depth=1
|
||||
```
|
||||
|
||||
Once you've added your first links, open `data/index.html` in a browser to view the static archive.
|
||||
|
||||
You can also start it as a server with a full web UI to manage your links:
|
||||
```bash
|
||||
archivebox manage createsuperuser
|
||||
archivebox server
|
||||
```
|
||||
|
||||
You can visit `http://127.0.0.1:8000` in your browser to access it.
|
||||
|
||||
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
||||
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
|
||||
|
||||
---
|
||||
|
||||
|
@ -88,28 +120,29 @@ For more information, see the [full Quickstart guide](https://github.com/pirate/
|
|||
|
||||
# Overview
|
||||
|
||||
Because modern websites are complicated and often rely on dynamic content,
|
||||
ArchiveBox archives the sites in **several different formats** beyond what public
|
||||
archiving services like Archive.org and Archive.is are capable of saving. Using multiple
|
||||
methods and the market-dominant browser to execute JS ensures we can save even the most
|
||||
Because modern websites are complicated and often rely on dynamic content,
|
||||
ArchiveBox archives the sites in **several different formats** beyond what public
|
||||
archiving services like Archive.org and Archive.is are capable of saving. Using multiple
|
||||
methods and the market-dominant browser to execute JS ensures we can save even the most
|
||||
complex, finicky websites in at least a few high-quality, long-term data formats.
|
||||
|
||||
ArchiveBox imports a list of URLs from stdin, remote URL, or file, then adds the pages to a local archive folder using wget to create a browsable HTML clone, youtube-dl to extract media, and a full instance of Chrome headless for PDF, Screenshot, and DOM dumps, and more...
|
||||
|
||||
Running `./archive` adds only new, unique links into `output/` on each run. Because it will ignore duplicates and only archive each link the first time you add it, you can schedule it to [run on a timer](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) and re-import all your feeds multiple times a day. It will run quickly even if the feeds are large, because it's only archiving the newest links since the last run. For each link, it runs through all the archive methods. Methods that fail will save `None` and be automatically retried on the next run, methods that succeed save their output into the data folder and are never retried/overwritten by subsequent runs. Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs).
|
||||
Running `./archive` adds only new, unique links into `output/` on each run. Because it will ignore duplicates and only archive each link the first time you add it, you can schedule it to [run on a timer](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) and re-import all your feeds multiple times a day. It will run quickly even if the feeds are large, because it's only archiving the newest links since the last run. For each link, it runs through all the archive methods. Methods that fail will save `None` and be automatically retried on the next run, methods that succeed save their output into the data folder and are never retried/overwritten by subsequent runs. Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs).
|
||||
|
||||
All the archived links are stored by date bookmarked in `output/archive/<timestamp>`, and everything is indexed nicely with JSON & HTML files. The intent is for all the content to be viewable with common software in 50 - 100 years without needing to run ArchiveBox in a VM.
|
||||
|
||||
#### Can import links from many formats:
|
||||
|
||||
```bash
|
||||
echo 'http://example.com' | ./archive
|
||||
./archive ~/Downloads/firefox_bookmarks_export.html
|
||||
./archive https://example.com/some/rss/feed.xml
|
||||
echo 'http://example.com' | archivebox add
|
||||
archivebox add ~/Downloads/firefox_bookmarks_export.html --depth=1
|
||||
archivebox add https://example.com/some/rss/feed.xml --depth=1
|
||||
```
|
||||
- <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more)
|
||||
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
||||
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
||||
|
||||
- <img src="https://nicksweeting.com/images/bookmarks.png" height="22px"/> Browser history or bookmarks exports (Chrome, Firefox, Safari, IE, Opera, and more)
|
||||
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
||||
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
||||
|
||||
See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
|
||||
|
||||
|
@ -119,41 +152,41 @@ See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
|
|||
ls output/archive/<timestamp>/
|
||||
```
|
||||
|
||||
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
|
||||
- **Title:** `title` title of the site
|
||||
- **Favicon:** `favicon.ico` favicon of the site
|
||||
- **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present
|
||||
- **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving
|
||||
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
|
||||
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
|
||||
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
|
||||
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
||||
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
||||
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
||||
- *More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)...*
|
||||
- **Index:** `index.html` & `index.json` HTML and JSON index files containing metadata and details
|
||||
- **Title:** `title` title of the site
|
||||
- **Favicon:** `favicon.ico` favicon of the site
|
||||
- **WGET Clone:** `example.com/page-name.html` wget clone of the site, with .html appended if not present
|
||||
- **WARC:** `warc/<timestamp>.gz` gzipped WARC of all the resources fetched while archiving
|
||||
- **PDF:** `output.pdf` Printed PDF of site using headless chrome
|
||||
- **Screenshot:** `screenshot.png` 1440x900 screenshot of site using headless chrome
|
||||
- **DOM Dump:** `output.html` DOM Dump of the HTML after rendering using headless chrome
|
||||
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
||||
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
||||
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
||||
- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
|
||||
|
||||
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
|
||||
|
||||
If you're importing URLs with secret tokens in them (e.g. Google Docs, CodiMD notepads, etc), you may want to disable some of these methods to avoid leaking private URLs to 3rd party APIs during the archiving process. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
||||
If you're importing URLs with secret tokens in them (e.g. Google Docs, CodiMD notepads, etc), you may want to disable some of these methods to avoid leaking private URLs to 3rd party APIs during the archiving process. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
||||
|
||||
## Key Features
|
||||
|
||||
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
||||
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
|
||||
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
- **Doesn't require a constantly-running server**, proxy, or native app
|
||||
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
||||
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
||||
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.4 is released with some security fixes)
|
||||
- Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
|
||||
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
||||
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
||||
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
|
||||
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
- **Doesn't require a constantly-running server**, proxy, or native app
|
||||
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
||||
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
||||
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
|
||||
- Can [**run scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51) to [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
|
||||
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
||||
|
||||
## Background & Motivation
|
||||
|
||||
Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity.
|
||||
Vast treasure troves of knowledge are lost every day on the internet to link rot. As a society, we have an imperative to preserve some important parts of that treasure, just like we preserve our books, paintings, and music in physical libraries long after the originals go out of print or fade into obscurity.
|
||||
|
||||
Whether it's to resist censorship by saving articles before they get taken down or edited, or
|
||||
just to save a collection of early 2010's flash games you love to play, having the tools to
|
||||
just to save a collection of early 2010's flash games you love to play, having the tools to
|
||||
archive internet content enables you to save the stuff you care most about before it disappears.
|
||||
|
||||
<div align="center">
|
||||
|
@ -161,10 +194,9 @@ archive internet content enables to you save the stuff you care most about befor
|
|||
<sup><i>Image from <a href="https://digiday.com/media/wtf-link-rot/">WTF is Link Rot?</a>...</i><br/></sup>
|
||||
</div>
|
||||
|
||||
The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful.
|
||||
The balance between the permanence and ephemeral nature of content on the internet is part of what makes it beautiful.
|
||||
I don't think everything should be preserved in an automated fashion, making all content permanent and never removable, but I do think people should be able to decide for themselves and effectively archive specific content that they care about.
|
||||
|
||||
|
||||
## Comparison to Other Projects
|
||||
|
||||
▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
|
||||
|
@ -173,86 +205,83 @@ I don't think everything should be preserved in an automated fashion, making all
|
|||
|
||||
#### User Interface & Intended Purpose
|
||||
|
||||
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI.
|
||||
|
||||
An alternative tool [pywb](https://github.com/webrecorder/pywb) allows you to run a browser through an always-running archiving proxy which records the traffic to WARC files. ArchiveBox intends to support this style of live proxy-archiving using `pywb` in the future, but for now, it only ingests lists of links at a time via browser history, bookmarks, RSS, etc.
|
||||
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add URLs via a web interface through our Django frontend.
|
||||
|
||||
#### Private Local Archives vs Centralized Public Archives
|
||||
|
||||
Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.4 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle.
|
||||
Unlike crawler software that starts from a seed URL and works outwards, or public tools like Archive.org designed for users to manually submit links from the public internet, ArchiveBox tries to be a set-and-forget archiver suitable for archiving your entire browsing history, RSS feeds, or bookmarks, ~~including private/authenticated content that you wouldn't otherwise share with a centralized service~~ (do not do this until v0.5 is released with some security fixes). Also by having each user store their own content locally, we can save much larger portions of everyone's browsing history than a shared centralized service would be able to handle.
|
||||
|
||||
#### Storage Requirements
|
||||
|
||||
Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your mileage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than using a single method, but more content is accurately replayable over extended periods. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `FETCH_MEDIA=False` to skip audio & video files.
|
||||
Because ArchiveBox is designed to ingest a firehose of browser history and bookmark feeds to a local disk, it can be much more disk-space intensive than a centralized service like the Internet Archive or Archive.today. However, as storage space gets cheaper and compression improves, you should be able to use it continuously over the years without having to delete anything. In my experience, ArchiveBox uses about 5gb per 1000 articles, but your mileage may vary depending on which options you have enabled and what types of sites you're archiving. By default, it archives everything in as many formats as possible, meaning it takes more space than using a single method, but more content is accurately replayable over extended periods of time. Storage requirements can be reduced by using a compressed/deduplicated filesystem like ZFS/BTRFS, or by setting `SAVE_MEDIA=False` to skip audio & video files.
|
||||
|
||||
## Learn more
|
||||
|
||||
<!--▶ **Join our [community chat](http://webchat.freenode.net?channels=ArchiveBox&uio=d4) hosted on IRC freenode.net:`#ArchiveBox`!**-->
|
||||
|
||||
Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web!
|
||||
Whether you want to learn which organizations are the big players in the web archiving space, want to find a specific open-source tool for your web archiving need, or just want to see where archivists hang out online, our Community Wiki page serves as an index of the broader web archiving community. Check it out to learn about some of the coolest web archiving projects and communities on the web!
|
||||
|
||||
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
|
||||
|
||||
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
+ [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
||||
*Community-maintained indexes of archiving tools and institutions.*
|
||||
+ [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
||||
*Open source tools and projects in the internet archiving space.*
|
||||
+ [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
||||
*Articles, posts, and blogs relevant to ArchiveBox and web archiving in general.*
|
||||
+ [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
||||
*A collection of the most active internet archiving communities and initiatives.*
|
||||
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
||||
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
||||
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
||||
|
||||
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
- [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
||||
_Community-maintained indexes of archiving tools and institutions._
|
||||
- [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
||||
_Open source tools and projects in the internet archiving space._
|
||||
- [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
||||
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
|
||||
- [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
||||
_A collection of the most active internet archiving communities and initiatives._
|
||||
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
||||
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
||||
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
||||
|
||||
---
|
||||
|
||||
|
||||
# Documentation
|
||||
|
||||
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
|
||||
|
||||
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) for documentation.
|
||||
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) for documentation.
|
||||
|
||||
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
|
||||
|
||||
You can build the docs by running:
|
||||
|
||||
```bash
|
||||
cd ArchiveBox
|
||||
pipenv install --dev
|
||||
sphinx-apidoc -o docs archivebox
|
||||
cd docs/
|
||||
make html
|
||||
# then open docs/_build/html/index.html
|
||||
```
|
||||
|
||||
## Getting Started
|
||||
|
||||
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
|
||||
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
|
||||
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
|
||||
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
|
||||
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
|
||||
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
|
||||
|
||||
## Reference
|
||||
|
||||
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
|
||||
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
|
||||
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
||||
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
||||
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
|
||||
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
|
||||
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
|
||||
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
|
||||
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
|
||||
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
|
||||
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
|
||||
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
||||
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
||||
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
|
||||
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
|
||||
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
|
||||
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
|
||||
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
|
||||
|
||||
## More Info
|
||||
|
||||
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
|
||||
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
||||
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
|
||||
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
|
||||
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
|
||||
---
|
||||
|
||||
|
||||
# Screenshots
|
||||
|
||||
<div align="center">
|
||||
<img src="https://i.imgur.com/biVfFYr.png" width="18%" alt="CLI Screenshot" align="top">
|
||||
<img src="https://i.imgur.com/viklZNG.png" width="40%" alt="Desktop index screenshot" align="top">
|
||||
<img src="https://i.imgur.com/wnpdAVM.jpg" width="30%" alt="Desktop details page Screenshot" align="top">
|
||||
<img src="https://i.imgur.com/mW2dITg.png" width="8%" alt="Mobile details page screenshot" align="top">
|
||||
</div>
|
||||
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
|
||||
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
||||
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
|
||||
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
|
||||
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
||||
|
||||
---
|
||||
|
||||
|
@ -272,12 +301,14 @@ Contributor Spotlight:<br/><br/>
|
|||
<a href="https://sourcerer.io/fame/pirate/pirate/ArchiveBox/links/5"><img src="https://sourcerer.io/fame/pirate/pirate/ArchiveBox/images/5"></a>
|
||||
|
||||
<br/>
|
||||
<a href="https://github.com/sponsors/pirate">Sponsor us on Github</a>
|
||||
<br>
|
||||
<br>
|
||||
<a href="https://www.patreon.com/theSquashSH"><img src="https://img.shields.io/badge/Donate_to_support_development-via_Patreon-%23DD5D76.svg?style=flat"/></a>
|
||||
<br/>
|
||||
<br/>
|
||||
|
||||
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
|
||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
|
||||
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
||||
|
||||
<br/><br/>
|
||||
|
||||
|
|
1
archive
1
archive
|
@ -1 +0,0 @@
|
|||
bin/archivebox
|
6
archivebox/.flake8
Normal file
6
archivebox/.flake8
Normal file
|
@ -0,0 +1,6 @@
|
|||
[flake8]
|
||||
ignore = D100,D101,D102,D103,D104,D105,D202,D203,D205,D400,E131,E241,E252,E266,E272,E701,E731,W293,W503,W291,W391
|
||||
select = F,E9,W
|
||||
max-line-length = 130
|
||||
max-complexity = 10
|
||||
exclude = migrations,tests,node_modules,vendor,static,venv,.venv,.venv2,.docker-venv
|
1
archivebox/VERSION
Normal file
1
archivebox/VERSION
Normal file
|
@ -0,0 +1 @@
|
|||
0.4.8
|
|
@ -1 +1 @@
|
|||
# if you're looking for the source of the main `archivebox` shell command, it's in `archivebox/archivebox.py`
|
||||
__package__ = 'archivebox'
|
||||
|
|
11
archivebox/__main__.py
Executable file
11
archivebox/__main__.py
Executable file
|
@ -0,0 +1,11 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import sys
|
||||
|
||||
from .cli import main
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
|
@ -1,137 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
ArchiveBox command line application.
|
||||
|
||||
./archive and ./bin/archivebox both point to this file,
|
||||
but you can also run it directly using `python3 archive.py`
|
||||
|
||||
Usage & Documentation:
|
||||
https://github.com/pirate/ArchiveBox/Wiki
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
|
||||
from links import links_after_timestamp
|
||||
from index import write_links_index, load_links_index
|
||||
from archive_methods import archive_link
|
||||
from config import (
|
||||
ARCHIVE_DIR,
|
||||
ONLY_NEW,
|
||||
OUTPUT_DIR,
|
||||
GIT_SHA,
|
||||
)
|
||||
from util import (
|
||||
save_remote_source,
|
||||
save_stdin_source,
|
||||
)
|
||||
from logs import (
|
||||
log_archiving_started,
|
||||
log_archiving_paused,
|
||||
log_archiving_finished,
|
||||
)
|
||||
|
||||
__AUTHOR__ = 'Nick Sweeting <git@nicksweeting.com>'
|
||||
__VERSION__ = GIT_SHA[:9]
|
||||
__DESCRIPTION__ = 'ArchiveBox: The self-hosted internet archive.'
|
||||
__DOCUMENTATION__ = 'https://github.com/pirate/ArchiveBox/wiki'
|
||||
|
||||
|
||||
def print_help():
    """Print the ArchiveBox banner, documentation link, and usage examples to stdout."""
    # Each entry is emitted with its own print() call, so entries that end in
    # '\n' are followed by a blank line in the output.
    help_lines = (
        'ArchiveBox: The self-hosted internet archive.\n',
        "Documentation:",
        " https://github.com/pirate/ArchiveBox/wiki\n",
        "UI Usage:",
        " Open output/index.html to view your archive.\n",
        "CLI Usage:",
        " echo 'https://example.com' | ./archive\n",
        " ./archive ~/Downloads/bookmarks_export.html\n",
        " ./archive https://example.com/feed.rss\n",
        " ./archive 15109948213.123\n",
    )
    for help_line in help_lines:
        print(help_line)
|
||||
|
||||
|
||||
def main(*args):
    """Parse the command line, collect the import source, and start an archive run.

    `args` is the full argv (the `__main__` guard calls `main(*sys.argv)`), so
    `args[1]`, when present, is the single user-supplied argument: either a
    path/URL to import from, or a numeric timestamp to resume archiving from.

    Raises:
        SystemExit(0): after printing help or the version.
        SystemExit(1): when both an import path and piped stdin text are given.
    """
    flags = set(args)

    # Help wins over everything else; too many arguments also shows help.
    if flags & {'-h', '--help', 'help'} or len(args) > 2:
        print_help()
        raise SystemExit(0)

    if flags & {'--version', 'version'}:
        print('ArchiveBox version {}'.format(__VERSION__))
        raise SystemExit(0)

    # Classify the optional argument:
    #   ./archive bookmarks.html   -> import path
    #   ./archive 1523422111.234   -> timestamp to resume from
    import_path = resume = None
    if len(args) == 2:
        target = args[1]
        if target.replace('.', '').isdigit():
            resume = target
        else:
            import_path = target

    # Make sure the output folder exists before anything writes into it.
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    # Ingest URLs piped in through stdin
    # (e.g. if the user runs `cat example_urls.txt | ./archive`).
    if not sys.stdin.isatty():
        piped_text = sys.stdin.read()
        if piped_text:
            if import_path:
                print(
                    '[X] You should pass either a path as an argument, '
                    'or pass a list of links via stdin, but not both.\n'
                )
                print_help()
                raise SystemExit(1)
            import_path = save_stdin_source(piped_text)

    # A remote file/feed (e.g. an RSS feed URL) is handed to save_remote_source,
    # and whatever it returns becomes the import path.
    if import_path and import_path.startswith(('http://', 'https://', 'ftp://')):
        import_path = save_remote_source(import_path)

    # Run the main archive update process.
    update_archive_data(import_path=import_path, resume=resume)
|
||||
|
||||
|
||||
def update_archive_data(import_path=None, resume=None):
    """The main ArchiveBox entry point: load the index, archive links, rewrite the index.

    Args:
        import_path: optional source passed to `load_links_index` to merge in new links.
        resume: optional timestamp passed to `links_after_timestamp` to pick
            where archiving starts.

    Raises:
        SystemExit(0): when the run is interrupted with Ctrl-C.
    """
    # Step 1: load the existing index, merging in and deduping links from import_path.
    all_links, new_links = load_links_index(out_dir=OUTPUT_DIR, import_path=import_path)

    # Step 2: write the merged index back to disk before archiving starts.
    write_links_index(out_dir=OUTPUT_DIR, links=all_links)

    # Step 3: run the archive methods for each selected link.
    if ONLY_NEW:
        pending = new_links
    else:
        pending = all_links
    log_archiving_started(len(pending), resume)

    # Placeholders so the pause log still has values if the interrupt arrives
    # before the first link is yielded.
    position, current = 0, 0
    try:
        for position, current in enumerate(links_after_timestamp(pending, resume)):
            archive_link(os.path.join(ARCHIVE_DIR, current['timestamp']), current)

    except KeyboardInterrupt:
        paused_at = current['timestamp'] if current else current
        log_archiving_paused(len(pending), position, paused_at)
        raise SystemExit(0)

    except BaseException:
        # Emit a newline before the traceback, then let the error propagate.
        print()
        raise

    log_archiving_finished(len(pending))

    # Step 4: re-write the links index with updated titles, icons, and resources.
    refreshed_links, _ = load_links_index(out_dir=OUTPUT_DIR)
    write_links_index(out_dir=OUTPUT_DIR, links=refreshed_links, finished=True)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(*sys.argv)
|
|
@ -1,623 +0,0 @@
|
|||
import os
|
||||
|
||||
from collections import defaultdict
|
||||
from datetime import datetime
|
||||
|
||||
from index import (
|
||||
write_link_index,
|
||||
patch_links_index,
|
||||
load_json_link_index,
|
||||
)
|
||||
from config import (
|
||||
CURL_BINARY,
|
||||
GIT_BINARY,
|
||||
WGET_BINARY,
|
||||
YOUTUBEDL_BINARY,
|
||||
FETCH_FAVICON,
|
||||
FETCH_TITLE,
|
||||
FETCH_WGET,
|
||||
FETCH_WGET_REQUISITES,
|
||||
FETCH_PDF,
|
||||
FETCH_SCREENSHOT,
|
||||
FETCH_DOM,
|
||||
FETCH_WARC,
|
||||
FETCH_GIT,
|
||||
FETCH_MEDIA,
|
||||
SUBMIT_ARCHIVE_DOT_ORG,
|
||||
TIMEOUT,
|
||||
MEDIA_TIMEOUT,
|
||||
ANSI,
|
||||
OUTPUT_DIR,
|
||||
GIT_DOMAINS,
|
||||
GIT_SHA,
|
||||
RESTRICT_FILE_NAMES,
|
||||
CURL_USER_AGENT,
|
||||
WGET_USER_AGENT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
COOKIES_FILE,
|
||||
WGET_AUTO_COMPRESSION
|
||||
)
|
||||
from util import (
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
without_fragment,
|
||||
fetch_page_title,
|
||||
is_static_file,
|
||||
TimedProgress,
|
||||
chmod_file,
|
||||
wget_output_path,
|
||||
chrome_args,
|
||||
check_link_structure,
|
||||
run, PIPE, DEVNULL
|
||||
)
|
||||
from logs import (
|
||||
log_link_archiving_started,
|
||||
log_link_archiving_finished,
|
||||
log_archive_method_started,
|
||||
log_archive_method_finished,
|
||||
)
|
||||
|
||||
|
||||
|
||||
class ArchiveError(Exception):
    """Exception carrying a message plus an optional `hints` value.

    `hints` is stored on the instance unchanged and defaults to None.
    """

    def __init__(self, message, hints=None):
        Exception.__init__(self, message)
        self.hints = hints
|
||||
|
||||
|
||||
def archive_link(link_dir, link):
    """Run every applicable archive method for one link inside `link_dir`.

    Creates `link_dir` if it is missing, reloads the link via
    `load_json_link_index`, runs each method whose `should_fetch_*` check
    passes, appends each result to `link['history'][method_name]`, then writes
    the per-link index and patches the main index.

    Returns:
        The link as returned by `load_json_link_index`, with updated history.

    Raises:
        Any exception raised while archiving is logged to stdout and re-raised.
    """
    # (name, predicate, runner) triples, executed in this order.
    ARCHIVE_METHODS = (
        ('title', should_fetch_title, fetch_title),
        ('favicon', should_fetch_favicon, fetch_favicon),
        ('wget', should_fetch_wget, fetch_wget),
        ('pdf', should_fetch_pdf, fetch_pdf),
        ('screenshot', should_fetch_screenshot, fetch_screenshot),
        ('dom', should_fetch_dom, fetch_dom),
        ('git', should_fetch_git, fetch_git),
        ('media', should_fetch_media, fetch_media),
        ('archive_org', should_fetch_archive_dot_org, archive_dot_org),
    )

    try:
        is_new = not os.path.exists(link_dir)
        if is_new:
            os.makedirs(link_dir)

        link = load_json_link_index(link_dir, link)
        log_link_archiving_started(link_dir, link, is_new)
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            # Every method gets a history list, even when it is skipped.
            link['history'].setdefault(method_name, [])

            if not should_run(link_dir, link):
                stats['skipped'] += 1
                continue

            log_archive_method_started(method_name)

            result = method_function(link_dir, link)
            link['history'][method_name].append(result)

            # result['status'] is used directly as the stats key.
            stats[result['status']] += 1
            log_archive_method_finished(result)

        write_link_index(link_dir, link)
        patch_links_index(link)
        log_link_archiving_finished(link_dir, link, is_new, stats)

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(type(err).__name__, err))
        raise

    return link
|
||||
|
||||
|
||||
### Archive Method Functions
|
||||
|
||||
def should_fetch_title(link_dir, link):
    """Decide whether the title method should run for `link`.

    Returns False when the link already has a title that does not start with
    'http' (case-insensitive), or when `is_static_file` is true for its URL.
    Otherwise returns the `FETCH_TITLE` setting.
    """
    title = link['title']
    # A title that starts with 'http' does not count as an existing title.
    has_real_title = bool(title) and not title.lower().startswith('http')

    if has_real_title or is_static_file(link['url']):
        return False

    return FETCH_TITLE
|
||||
|
||||
def fetch_title(link_dir, link, timeout=TIMEOUT):
    """Try to determine the page's title via `fetch_page_title`.

    Returns:
        A result dict with 'cmd', 'pwd', 'output', and 'status' keys, merged
        with `timer.stats`. On failure 'status' is 'failed' and 'output' holds
        the exception object.
    """
    # This command list is only recorded in the result; it is never executed
    # here, since the title comes from fetch_page_title().
    cmd = [CURL_BINARY, link['url'], '|', 'grep', '<title>']
    status = 'succeeded'
    output = None
    timer = TimedProgress(timeout, prefix=' ')
    try:
        output = fetch_page_title(link['url'], timeout=timeout, progress=False)
        if not output:
            raise ArchiveError('Unable to detect page title')
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    result = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    result.update(timer.stats)
    return result
|
||||
|
||||
|
||||
def should_fetch_favicon(link_dir, link):
    """Return False if `link_dir` already holds a favicon.ico, else the `FETCH_FAVICON` setting."""
    favicon_path = os.path.join(link_dir, 'favicon.ico')
    return False if os.path.exists(favicon_path) else FETCH_FAVICON
|
||||
|
||||
def fetch_favicon(link_dir, link, timeout=TIMEOUT):
    """Download the site's favicon from Google's favicon API with curl.

    curl runs with `link_dir` as its working directory and is told to write
    'favicon.ico'.

    Returns:
        A result dict with 'cmd', 'pwd', 'output', and 'status' keys, merged
        with `timer.stats`. On failure 'status' is 'failed' and 'output' holds
        the exception object.
    """
    output = 'favicon.ico'
    favicon_url = 'https://www.google.com/s2/favicons?domain={}'.format(domain(link['url']))

    cmd = [
        CURL_BINARY,
        '--max-time', str(timeout),
        '--location',
        '--output', output,
    ]
    if not CHECK_SSL_VALIDITY:
        cmd.append('--insecure')
    cmd.append(favicon_url)

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        # NOTE(review): presumably fixes permissions on the downloaded file; confirm in util.chmod_file
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    result = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    result.update(timer.stats)
    return result
|
||||
|
||||
def should_fetch_wget(link_dir, link):
    """Return False if the path from `wget_output_path(link)` already exists under `link_dir`, else the `FETCH_WGET` setting."""
    output_path = wget_output_path(link)
    # A falsy output path is treated as "nothing saved yet".
    already_saved = bool(output_path) and os.path.exists(os.path.join(link_dir, output_path))
    return False if already_saved else FETCH_WGET
|
||||
|
||||
|
||||
def fetch_wget(link_dir, link, timeout=TIMEOUT):
    """Download the full site using wget (optionally writing a WARC file).

    wget runs inside link_dir. The returned dict holds the command, the
    working directory, the output (wget_output_path(link) on success, the
    caught exception on failure), the status string, and the timer's stats.

    A run only counts as failed when wget exits non-zero AND no downloaded
    files could be detected in its summary line.
    """

    if FETCH_WARC:
        warc_dir = os.path.join(link_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        # warc_path is relative to link_dir because wget runs with cwd=link_dir
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        *(('--restrict-file-names={}'.format(RESTRICT_FILE_NAMES),) if RESTRICT_FILE_NAMES else ()),
        '--timeout={}'.format(timeout),
        *(('--compression=auto',) if WGET_AUTO_COMPRESSION else ()),
        *(() if FETCH_WARC else ('--timestamping',)),
        *(('--warc-file={}'.format(warc_path),) if FETCH_WARC else ()),
        *(('--page-requisites',) if FETCH_WGET_REQUISITES else ()),
        *(('--user-agent={}'.format(WGET_USER_AGENT),) if WGET_USER_AGENT else ()),
        *(('--load-cookies', COOKIES_FILE) if COOKIES_FILE else ()),
        *((() if CHECK_SSL_VALIDITY else ('--no-check-certificate', '--no-hsts'))),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        # errors='replace' keeps undecodable bytes (e.g. odd filenames) from
        # failing an otherwise successful download
        combined = (result.stdout + result.stderr).decode(errors='replace')
        output_tail = [
            line.strip()
            for line in combined.rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        # wget may print nothing at all, so never index into an empty tail
        last_line = output_tail[-1] if output_tail else ''
        files_downloaded = 0
        if 'Downloaded:' in last_line:
            try:
                files_downloaded = int(last_line.split(' ', 2)[1] or 0)
            except (IndexError, ValueError):
                # unexpected summary format: treat as "nothing detected" and
                # let the return code decide below
                files_downloaded = 0

        # Check for common failure cases
        if result.returncode > 0 and files_downloaded < 1:
            hints = (
                'Got wget response code: {}.'.format(result.returncode),
                *output_tail,
            )
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Got an error from the server', hints)
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
|
||||
|
||||
def should_fetch_pdf(link_dir, link):
    """Decide whether a PDF still has to be printed for this link.

    Returns False for static-file URLs and when output.pdf already exists in
    link_dir; otherwise returns the FETCH_PDF config flag.
    """
    skip = (
        is_static_file(link['url'])
        or os.path.exists(os.path.join(link_dir, 'output.pdf'))
    )
    return False if skip else FETCH_PDF
|
||||
|
||||
|
||||
def fetch_pdf(link_dir, link, timeout=TIMEOUT):
    """Print the page to output.pdf with headless chrome (--print-to-pdf).

    Chrome runs inside link_dir. The returned dict holds the command, the
    working directory, the output (the filename on success, the caught
    exception on failure), the status string, and the timer's stats.
    """
    output = 'output.pdf'
    cmd = [*chrome_args(TIMEOUT=timeout), '--print-to-pdf', link['url']]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if proc.returncode:
            # prefer stderr for the hint text, fall back to stdout
            chrome_output = proc.stderr or proc.stdout
            raise ArchiveError('Failed to print PDF', chrome_output.decode())
        chmod_file('output.pdf', cwd=link_dir)
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status, output = 'failed', err
    finally:
        timer.end()

    summary = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    summary.update(timer.stats)
    return summary
|
||||
|
||||
def should_fetch_screenshot(link_dir, link):
    """Decide whether a screenshot still has to be taken for this link.

    Returns False for static-file URLs and when screenshot.png already exists
    in link_dir; otherwise returns the FETCH_SCREENSHOT config flag.
    """
    skip = (
        is_static_file(link['url'])
        or os.path.exists(os.path.join(link_dir, 'screenshot.png'))
    )
    return False if skip else FETCH_SCREENSHOT
|
||||
|
||||
def fetch_screenshot(link_dir, link, timeout=TIMEOUT):
    """Take a screenshot of the page with headless chrome (--screenshot).

    Chrome runs inside link_dir. The returned dict holds the command, the
    working directory, the output (screenshot.png on success, the caught
    exception on failure), the status string, and the timer's stats.
    """
    output = 'screenshot.png'
    cmd = [*chrome_args(TIMEOUT=timeout), '--screenshot', link['url']]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=link_dir, timeout=timeout)
        if proc.returncode:
            # prefer stderr for the hint text, fall back to stdout
            chrome_output = proc.stderr or proc.stdout
            raise ArchiveError('Failed to take screenshot', chrome_output.decode())
        chmod_file(output, cwd=link_dir)
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status, output = 'failed', err
    finally:
        timer.end()

    summary = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    summary.update(timer.stats)
    return summary
|
||||
|
||||
def should_fetch_dom(link_dir, link):
    """Decide whether the DOM dump still has to be produced for this link.

    Returns False for static-file URLs and when output.html already exists in
    link_dir; otherwise returns the FETCH_DOM config flag.
    """
    skip = (
        is_static_file(link['url'])
        or os.path.exists(os.path.join(link_dir, 'output.html'))
    )
    return False if skip else FETCH_DOM
|
||||
|
||||
def fetch_dom(link_dir, link, timeout=TIMEOUT):
    """Dump the page's HTML to output.html using headless chrome --dump-dom.

    Chrome's stdout is streamed straight into link_dir/output.html. The
    returned dict holds the command, the working directory, the output
    (the filename on success, the caught exception on failure), the status
    string, and the timer's stats.

    On failure the partially written output.html is deleted, because
    should_fetch_dom() skips any link whose output.html already exists and a
    leftover empty file would prevent the link from ever being retried.
    """

    output = 'output.html'
    output_path = os.path.join(link_dir, output)
    cmd = [
        *chrome_args(TIMEOUT=timeout),
        '--dump-dom',
        link['url']
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        # 'w+' creates/truncates the file before chrome even starts
        with open(output_path, 'w+') as f:
            result = run(cmd, stdout=f, stderr=PIPE, cwd=link_dir, timeout=timeout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to fetch DOM', hints)

        chmod_file(output, cwd=link_dir)
    except Exception as err:
        status = 'failed'
        output = err
        # best-effort cleanup of the empty/partial dump so a later run retries
        try:
            os.remove(output_path)
        except OSError:
            # file was never created or is already gone: nothing to clean up
            pass
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
|
||||
|
||||
def should_fetch_git(link_dir, link):
    """Decide whether the link should be cloned with git.

    Returns False for static-file URLs, when a git entry already exists in
    link_dir, and when the URL neither belongs to one of GIT_DOMAINS nor has
    the 'git' extension; otherwise returns the FETCH_GIT config flag.
    """
    url = link['url']
    if is_static_file(url) or os.path.exists(os.path.join(link_dir, 'git')):
        return False

    looks_clonable = (domain(url) in GIT_DOMAINS) or (extension(url) == 'git')
    return FETCH_GIT if looks_clonable else False
|
||||
|
||||
|
||||
def fetch_git(link_dir, link, timeout=TIMEOUT):
    """Mirror-clone the link's repository with git into link_dir/git.

    The URL is cloned with its query string and fragment stripped. The
    returned dict holds the command, link_dir as pwd, the output ('git' on
    success, the caught exception on failure), the status string, and the
    timer's stats.
    """
    output = 'git'
    clone_dir = os.path.join(link_dir, 'git')
    os.makedirs(clone_dir, exist_ok=True)

    ssl_flags = () if CHECK_SSL_VALIDITY else ('-c', 'http.sslVerify=false')
    repo_url = without_query(without_fragment(link['url']))
    cmd = [GIT_BINARY, 'clone', '--mirror', '--recursive', *ssl_flags, repo_url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=PIPE, cwd=clone_dir, timeout=timeout + 1)
        code = proc.returncode
        # exit code 128 is tolerated: ignore failed re-download when the folder already exists
        if code > 0 and code != 128:
            hints = 'Got git response code: {}.'.format(code)
            raise ArchiveError('Failed git download', hints)
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status, output = 'failed', err
    finally:
        timer.end()

    summary = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    summary.update(timer.stats)
    return summary
|
||||
|
||||
|
||||
def should_fetch_media(link_dir, link):
    """Decide whether media still has to be downloaded for this link.

    Returns False for static-file URLs and when a media entry already exists
    in link_dir; otherwise returns the FETCH_MEDIA config flag.
    """
    skip = (
        is_static_file(link['url'])
        or os.path.exists(os.path.join(link_dir, 'media'))
    )
    return False if skip else FETCH_MEDIA
|
||||
|
||||
def fetch_media(link_dir, link, timeout=MEDIA_TIMEOUT):
    """Download playlists or individual video, audio, and subtitles using youtube-dl

    youtube-dl runs inside link_dir/media. The returned dict holds the
    command, link_dir as pwd, the output ('media' on success, the caught
    exception on failure), the status string, and the timer's stats.

    Certificate verification is only disabled when CHECK_SSL_VALIDITY is
    falsy; the flag is no longer passed unconditionally.
    """

    output = 'media'
    output_path = os.path.join(link_dir, 'media')
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--yes-playlist',
        '--write-thumbnail',
        '--no-call-home',
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        # the only place certificate checks get turned off, so the config flag is honored
        *(() if CHECK_SSL_VALIDITY else ('--no-check-certificate',)),
        link['url'],
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, stdout=PIPE, stderr=PIPE, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=link_dir)
        if result.returncode:
            if (b'ERROR: Unsupported URL' in result.stderr
                or b'HTTP Error 404' in result.stderr
                or b'HTTP Error 403' in result.stderr
                or b'URL could be a direct video link' in result.stderr
                or b'Unable to extract container ID' in result.stderr):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to download media', hints)
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status = 'failed'
        output = err
    finally:
        timer.end()

    return {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
        **timer.stats,
    }
|
||||
|
||||
|
||||
def should_fetch_archive_dot_org(link_dir, link):
    """Decide whether the link still has to be submitted to archive.org.

    Returns False for static-file URLs and when archive.org.txt already exists
    in link_dir (its contents are not inspected); otherwise returns the
    SUBMIT_ARCHIVE_DOT_ORG config flag.
    """
    marker_path = os.path.join(link_dir, 'archive.org.txt')
    skip = is_static_file(link['url']) or os.path.exists(marker_path)
    return False if skip else SUBMIT_ARCHIVE_DOT_ORG
|
||||
|
||||
def archive_dot_org(link_dir, link, timeout=TIMEOUT):
    """Submit the URL to archive.org's save endpoint and record the archive URL.

    A curl HEAD request is sent to https://web.archive.org/save/<url> and the
    response headers are parsed. Unless an exception was caught, the resulting
    URL is written to link_dir/archive.org.txt and returned as the output.
    The returned dict holds the command, the working directory, the output
    (URL or caught exception), the status string, and the timer's stats.
    """
    output = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link['url'])
    # be nice to the Archive.org people and show them where all this ArchiveBox traffic is coming from
    user_agent_args = ('--user-agent', '{}'.format(CURL_USER_AGENT)) if CURL_USER_AGENT else ()
    insecure_flag = () if CHECK_SSL_VALIDITY else ('--insecure',)
    cmd = [
        CURL_BINARY,
        '--location',
        '--head',
        *user_agent_args,
        '--max-time', str(timeout),
        *insecure_flag,
        submit_url,
    ]
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, stdout=PIPE, stderr=DEVNULL, cwd=link_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(proc.stdout)

        # a single RobotAccessControlException is not treated as a failure;
        # archive_org_url simply stays None in that case
        denied_by_robots = (
            not content_location
            and len(errors) == 1
            and 'RobotAccessControlException' in errors[0]
        )
        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        elif not denied_by_robots:
            if errors:
                raise ArchiveError(', '.join(errors))
            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        # any failure is reported through the result dict rather than raised
        status, output = 'failed', err
    finally:
        timer.end()

    if not isinstance(output, Exception):
        # Rather than storing None when archive.org declined the URL, store
        # the submit URL: visiting it later re-attempts the archive and shows
        # archive.org's own explanation if it is rejected again.
        archive_org_url = archive_org_url or submit_url
        with open(os.path.join(link_dir, output), 'w', encoding='utf-8') as marker:
            marker.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=link_dir)
        output = archive_org_url

    summary = {
        'cmd': cmd,
        'pwd': link_dir,
        'output': output,
        'status': status,
    }
    summary.update(timer.stats)
    return summary
|
||||
|
||||
def parse_archive_dot_org_response(response):
    """Extract the archive location and error headers from a raw HTTP response.

    Args:
        response: bytes containing the response header lines.

    Returns:
        A (content_location, errors) tuple of lists: every value of the
        "content-location" header and every value of the
        "x-archive-wayback-runtime-error" header (header names are matched
        case-insensitively). Either list is empty when the header is absent.
    """
    headers = defaultdict(list)

    # lowercase all the header names and store in dict
    for raw_line in response.splitlines():
        # status lines and blank separators carry no "name: value" pair
        if b':' not in raw_line or not raw_line.strip():
            continue
        # errors='replace' so one header with non-UTF-8 bytes cannot abort
        # the parsing of all the other headers
        name, val = raw_line.decode(errors='replace').split(':', 1)
        headers[name.lower().strip()].append(val.strip())

    # Get successful archive url in "content-location" header or any errors
    content_location = headers['content-location']
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
|
135
archivebox/cli/__init__.py
Normal file
135
archivebox/cli/__init__.py
Normal file
|
@ -0,0 +1,135 @@
|
|||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, Dict, List, IO
|
||||
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
from importlib import import_module
|
||||
|
||||
# directory containing this package's archivebox_<subcommand>.py modules
CLI_DIR = os.path.dirname(os.path.abspath(__file__))

# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')
main_cmds = ('init', 'info', 'config')
archive_cmds = ('add', 'remove', 'update', 'list')

display_first = meta_cmds + main_cmds + archive_cmds

# every imported command module must have these properties in order to be valid
required_attrs = ('__package__', '__command__', 'main')


# basic checks to make sure imported files are valid subcommands
def is_cli_module(fname):
    """Return True when fname looks like an archivebox_<subcommand>.py file."""
    return fname.startswith('archivebox_') and fname.endswith('.py')


def is_valid_cli_module(module, subcommand):
    """Return True when module has every required attr and its __command__ ends with subcommand."""
    has_required_attrs = all(hasattr(module, attr) for attr in required_attrs)
    return has_required_attrs and module.__command__.split(' ')[-1] == subcommand
|
||||
|
||||
|
||||
def list_subcommands() -> Dict[str, str]:
    """find and import all valid archivebox_<subcommand>.py files in CLI_DIR

    Each module's main() is also published in this module's globals under the
    subcommand's name. Returns a dict mapping subcommand name to the docstring
    of its main(), ordered with the display_first commands at the front.
    """

    def sort_key(entry):
        # well-known commands keep their display_first position; everything
        # else sorts after them, by name length
        name = entry[0]
        if name in display_first:
            return display_first.index(name)
        return 100 + len(name)

    found = []
    for filename in os.listdir(CLI_DIR):
        if not is_cli_module(filename):
            continue
        subcommand = filename.replace('archivebox_', '').replace('.py', '')
        module = import_module('.archivebox_{}'.format(subcommand), __package__)
        assert is_valid_cli_module(module, subcommand)
        found.append((subcommand, module.main.__doc__))
        globals()[subcommand] = module.main

    found.sort(key=sort_key)
    return dict(found)
|
||||
|
||||
|
||||
def run_subcommand(subcommand: str,
                   subcommand_args: List[str]=None,
                   stdin: Optional[IO]=None,
                   pwd: Optional[str]=None) -> None:
    """Run a given ArchiveBox subcommand with the given list of args"""

    # each subcommand lives in a sibling module named archivebox_<subcommand>
    module_name = f'.archivebox_{subcommand}'
    subcommand_module = import_module(module_name, __package__)
    subcommand_module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
|
||||
|
||||
|
||||
# computed once at import time: subcommand name -> docstring of its main()
SUBCOMMANDS = list_subcommands()


class NotProvided:
    """Sentinel default for main(): marks an argument that was not passed at all, as opposed to None."""
|
||||
|
||||
|
||||
def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided, pwd: Optional[str]=None) -> None:
    """Parse the top-level archivebox command line and dispatch to a subcommand.

    args defaults to sys.argv[1:] and stdin to sys.stdin when not provided.
    --help (or no subcommand at all) runs the help subcommand, --version runs
    the version subcommand; everything after the subcommand name is handed to
    that subcommand untouched.
    """
    if args is NotProvided:
        args = sys.argv[1:]
    if stdin is NotProvided:
        stdin = sys.stdin

    subcommands = list_subcommands()
    parser = argparse.ArgumentParser(
        prog=__command__,
        description='ArchiveBox: The self-hosted internet archive',
        add_help=False,
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--help', '-h', action='store_true', help=subcommands['help'])
    group.add_argument('--version', action='store_true', help=subcommands['version'])
    group.add_argument(
        "subcommand",
        type=str,
        help= "The name of the subcommand to run",
        nargs='?',
        choices=subcommands.keys(),
        default=None,
    )
    parser.add_argument(
        "subcommand_args",
        help="Arguments for the subcommand",
        nargs=argparse.REMAINDER,
    )
    parsed = parser.parse_args(args or ())

    # resolve which subcommand actually runs
    if parsed.help or parsed.subcommand is None:
        chosen = 'help'
    elif parsed.version:
        chosen = 'version'
    else:
        chosen = parsed.subcommand

    invocation = dict(
        subcommand=chosen,
        subcommand_args=parsed.subcommand_args,
        stdin=stdin,
        pwd=pwd or OUTPUT_DIR,
    )

    # these subcommands are not logged
    if chosen not in ('help', 'version', 'status'):
        from ..logging_util import log_cli_command

        log_cli_command(**invocation)

    run_subcommand(**invocation)
|
||||
|
||||
|
||||
# public API: the fixed helpers plus one name per discovered subcommand
__all__ = ('SUBCOMMANDS', 'list_subcommands', 'run_subcommand') + tuple(SUBCOMMANDS.keys())
|
92
archivebox/cli/archivebox_add.py
Normal file
92
archivebox/cli/archivebox_add.py
Normal file
|
@ -0,0 +1,92 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox add'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import List, Optional, IO
|
||||
|
||||
from ..main import add
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR, ONLY_NEW
|
||||
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
||||
|
||||
|
||||
@docstring(add.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox add` command line, collects URLs from either the
    # positional arguments or stdin, and forwards everything to add().
    urls_help = (
        'URLs or paths to archive e.g.:\n'
        ' https://getpocket.com/users/USERNAME/feed/all\n'
        ' https://example.com/some/rss/feed.xml\n'
        ' https://example.com\n'
        ' ~/Downloads/firefox_bookmarks_export.html\n'
        ' ~/Desktop/sites_list.csv\n'
    )

    cli = argparse.ArgumentParser(
        prog=__command__,
        description=add.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument(
        '--update-all',
        action='store_true',
        default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
        help="Also retry previously skipped/failed links when adding new links",
    )
    cli.add_argument(
        '--index-only',
        action='store_true',
        help="Add the links to the main index without archiving them",
    )
    cli.add_argument('urls', nargs='*', type=str, default=None, help=urls_help)
    cli.add_argument(
        "--depth",
        action="store",
        default=0,
        choices=[0, 1],
        type=int,
        help="Recursively archive all linked pages up to this many hops away",
    )
    opts = cli.parse_args(args or ())

    urls = opts.urls
    stdin_urls = accept_stdin(stdin)

    # URLs must come from exactly one source.
    # NOTE(review): the second test looks at the stdin object itself rather
    # than at stdin_urls, so a truthy-but-empty stdin with no CLI URLs is not
    # rejected -- confirm whether that is intended.
    both_sources = stdin_urls and urls
    no_source = not (stdin or urls)
    if both_sources or no_source:
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
            color='red',
        )
        raise SystemExit(2)

    add(
        urls=stdin_urls or urls,
        depth=opts.depth,
        update_all=opts.update_all,
        index_only=opts.index_only,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
||||
|
||||
|
||||
# TODO: Implement these
|
||||
#
|
||||
# parser.add_argument(
|
||||
# '--mirror', #'-m',
|
||||
# action='store_true',
|
||||
# help='Archive an entire site (finding all linked pages below it on the same domain)',
|
||||
# )
|
||||
# parser.add_argument(
|
||||
# '--crawler', #'-r',
|
||||
# choices=('depth_first', 'breadth_first'),
|
||||
# help='Controls which crawler to use in order to find outlinks in a given page',
|
||||
# default=None,
|
||||
# )
|
61
archivebox/cli/archivebox_config.py
Normal file
61
archivebox/cli/archivebox_config.py
Normal file
|
@ -0,0 +1,61 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox config'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import config
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
||||
@docstring(config.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox config` command line (one of --get/--set/--reset
    # plus KEY or KEY=VALUE options) and forwards everything to config().
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=config.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )

    # the three modes are mutually exclusive boolean flags
    mode_flags = (
        ('--get', "Get the value for the given config KEYs"),
        ('--set', "Set the given KEY=VALUE config values"),
        ('--reset', "Reset the given KEY config values to their defaults"),
    )
    modes = cli.add_mutually_exclusive_group()
    for flag, description in mode_flags:
        modes.add_argument(flag, action='store_true', help=description)

    cli.add_argument(
        'config_options',
        nargs='*',
        type=str,
        help='KEY or KEY=VALUE formatted config values to get or set',
    )
    opts = cli.parse_args(args or ())
    piped_options = accept_stdin(stdin)

    config(
        config_options_str=piped_options,
        config_options=opts.config_options,
        get=opts.get,
        set=opts.set,
        reset=opts.reset,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
32
archivebox/cli/archivebox_help.py
Executable file
32
archivebox/cli/archivebox_help.py
Executable file
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox help'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import help
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(help.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # `archivebox help` takes no options of its own: the parser only exists to
    # handle --help and to reject unknown arguments before help() is called.
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=help.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    target_dir = pwd or OUTPUT_DIR
    help(out_dir=target_dir)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
40
archivebox/cli/archivebox_init.py
Executable file
40
archivebox/cli/archivebox_init.py
Executable file
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox init'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import init
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(init.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox init` command line and forwards the --force flag
    # and the target directory to init().
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=init.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument(
        '--force',
        action='store_true',
        help='Ignore unrecognized files in current directory and initialize anyway',
    )
    opts = cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    target_dir = pwd or OUTPUT_DIR
    init(force=opts.force, out_dir=target_dir)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
120
archivebox/cli/archivebox_list.py
Normal file
120
archivebox/cli/archivebox_list.py
Normal file
|
@ -0,0 +1,120 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox list'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import list_all
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..index import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_invalid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
||||
@docstring(list_all.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox list` command line, forwards the filters and
    # output options to list_all(), then exits with status 0 when anything
    # matched and 1 when nothing did.
    status_choices = (
        'indexed', 'archived', 'unarchived',
        'present', 'valid', 'invalid',
        'duplicate', 'orphaned', 'corrupted', 'unrecognized',
    )
    status_help = (
        'List only links or data directories that have the given status\n'
        f' indexed {get_indexed_folders.__doc__} (the default)\n'
        f' archived {get_archived_folders.__doc__}\n'
        f' unarchived {get_unarchived_folders.__doc__}\n'
        '\n'
        f' present {get_present_folders.__doc__}\n'
        f' valid {get_valid_folders.__doc__}\n'
        f' invalid {get_invalid_folders.__doc__}\n'
        '\n'
        f' duplicate {get_duplicate_folders.__doc__}\n'
        f' orphaned {get_orphaned_folders.__doc__}\n'
        f' corrupted {get_corrupted_folders.__doc__}\n'
        f' unrecognized {get_unrecognized_folders.__doc__}\n'
    )

    cli = argparse.ArgumentParser(
        prog=__command__,
        description=list_all.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )

    # output format: --csv and --json cannot be combined
    fmt = cli.add_mutually_exclusive_group()
    fmt.add_argument(
        '--csv',
        type=str,
        help="Print the output in CSV format with the given columns, e.g.: timestamp,url,extension",
        default=None,
    )
    fmt.add_argument(
        '--json',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
    )

    cli.add_argument(
        '--sort',
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated.",
        default=None,
    )
    cli.add_argument(
        '--before',
        type=float,
        help="List only links bookmarked before the given timestamp.",
        default=None,
    )
    cli.add_argument(
        '--after',
        type=float,
        help="List only links bookmarked after the given timestamp.",
        default=None,
    )
    cli.add_argument(
        '--status',
        type=str,
        choices=status_choices,
        default='indexed',
        help=status_help,
    )
    cli.add_argument(
        '--filter-type',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    cli.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='List only URLs matching these filter patterns.',
    )
    opts = cli.parse_args(args or ())
    piped_patterns = accept_stdin(stdin)

    matching_folders = list_all(
        filter_patterns_str=piped_patterns,
        filter_patterns=opts.filter_patterns,
        filter_type=opts.filter_type,
        status=opts.status,
        after=opts.after,
        before=opts.before,
        sort=opts.sort,
        csv=opts.csv,
        json=opts.json,
        out_dir=pwd or OUTPUT_DIR,
    )
    # a falsy result becomes SystemExit(True), i.e. a non-zero exit status
    raise SystemExit(not matching_folders)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
24
archivebox/cli/archivebox_manage.py
Normal file
24
archivebox/cli/archivebox_manage.py
Normal file
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox manage'
|
||||
|
||||
import sys
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import manage
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
|
||||
|
||||
@docstring(manage.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # No argument parsing happens here: args are handed to manage() as-is, and
    # stdin is accepted for signature parity but not used.
    target_dir = pwd or OUTPUT_DIR
    manage(args=args, out_dir=target_dir)


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
79
archivebox/cli/archivebox_remove.py
Normal file
79
archivebox/cli/archivebox_remove.py
Normal file
|
@ -0,0 +1,79 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox remove'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import remove
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
||||
@docstring(remove.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # Parses the `archivebox remove` command line and forwards the filters
    # (from CLI arguments and/or stdin) and flags to remove().
    parser = argparse.ArgumentParser(
        prog=__command__,
        description=remove.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--yes', # '-y',
        action='store_true',
        help='Remove links instantly without prompting to confirm.',
    )
    parser.add_argument(
        '--delete', # '-r',
        action='store_true',
        help=(
            "In addition to removing the link from the index, "
            "also delete its archived content and metadata folder."
        ),
    )
    # help text for --before/--after describes removal: the previous wording
    # ("List only URLs ...") was carried over from the list command
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="Remove only URLs bookmarked before the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="Remove only URLs bookmarked after the given timestamp.",
        default=None,
    )
    parser.add_argument(
        '--filter-type',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    parser.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        help='URLs matching this filter pattern will be removed from the index.'
    )
    command = parser.parse_args(args or ())
    filter_str = accept_stdin(stdin)

    remove(
        filter_str=filter_str,
        filter_patterns=command.filter_patterns,
        filter_type=command.filter_type,
        before=command.before,
        after=command.after,
        yes=command.yes,
        delete=command.delete,
        out_dir=pwd or OUTPUT_DIR,
    )


if __name__ == '__main__':
    main(args=sys.argv[1:], stdin=sys.stdin)
|
89
archivebox/cli/archivebox_schedule.py
Normal file
89
archivebox/cli/archivebox_schedule.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox schedule'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import schedule
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(schedule.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox schedule`: builds the option parser,
    # refuses piped stdin, and forwards the parsed options to schedule().
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=schedule.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument('--quiet', '-q', action='store_true', help="Don't warn about storage space.")

    # The action flags below are mutually exclusive; --every and import_path
    # are plain parser arguments and can accompany any of them.
    actions = cli.add_mutually_exclusive_group()
    actions.add_argument(
        '--add',
        action='store_true',
        help='Add a new scheduled ArchiveBox update job to cron',
    )
    cli.add_argument(
        '--every',
        type=str,
        default='day',
        help='Run ArchiveBox once every [timeperiod] (hour/day/week/month/year or cron format e.g. "0 0 * * *")',
    )
    actions.add_argument(
        '--clear',
        action='store_true',
        help="Stop all ArchiveBox scheduled runs (remove cron jobs)",
    )
    actions.add_argument(
        '--show',
        action='store_true',
        help="Print a list of currently active ArchiveBox cron jobs",
    )
    actions.add_argument(
        '--foreground', '-f',
        action='store_true',
        help=(
            "Launch ArchiveBox scheduler as a long-running foreground task "
            "instead of using cron."
        ),
    )
    actions.add_argument(
        '--run-all',
        action='store_true',
        help=(
            "Run all the scheduled jobs once immediately, independent of "
            "their configured schedules, can be used together with --foreground"
        ),
    )
    cli.add_argument(
        'import_path',
        nargs='?',
        type=str,
        default=None,
        help=(
            "Check this path and import any new links on every run "
            "(can be either local file or remote URL)"
        ),
    )

    opts = cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    schedule(
        add=opts.add,
        show=opts.show,
        clear=opts.clear,
        foreground=opts.foreground,
        run_all=opts.run_all,
        quiet=opts.quiet,
        every=opts.every,
        import_path=opts.import_path,
        out_dir=pwd or OUTPUT_DIR,
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
60
archivebox/cli/archivebox_server.py
Normal file
60
archivebox/cli/archivebox_server.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox server'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import server
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(server.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox server`: parses the options, refuses
    # piped stdin, then starts the server through server().
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=server.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument(
        'runserver_args',
        nargs='*',
        type=str,
        default=None,
        help='Arguments to pass to Django runserver'
    )

    # Simple on/off switches, registered in the order they appear in --help.
    switches = (
        ('--reload', 'Enable auto-reloading when code or templates change'),
        ('--debug', 'Enable DEBUG=True mode with more verbose errors'),
        ('--init', 'Run archivebox init before starting the server'),
    )
    for flag, description in switches:
        cli.add_argument(flag, action='store_true', help=description)

    opts = cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    server(
        runserver_args=opts.runserver_args,
        reload=opts.reload,
        debug=opts.debug,
        init=opts.init,
        out_dir=pwd or OUTPUT_DIR,
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
34
archivebox/cli/archivebox_shell.py
Normal file
34
archivebox/cli/archivebox_shell.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox shell'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import shell
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(shell.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox shell`. The command defines no options
    # of its own; parsing still runs so --help and unknown flags are handled.
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=shell.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    target_dir = pwd or OUTPUT_DIR
    shell(out_dir=target_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
32
archivebox/cli/archivebox_status.py
Normal file
32
archivebox/cli/archivebox_status.py
Normal file
|
@ -0,0 +1,32 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox status'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import status
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(status.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox status`. No options beyond --help are
    # defined; the parser still validates args before status() is called.
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=status.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    target_dir = pwd or OUTPUT_DIR
    status(out_dir=target_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
124
archivebox/cli/archivebox_update.py
Normal file
124
archivebox/cli/archivebox_update.py
Normal file
|
@ -0,0 +1,124 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox update'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import List, Optional, IO
|
||||
|
||||
from ..main import update
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..index import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_invalid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
||||
@docstring(update.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox update`: parses the options, reads any
    # filter text from stdin via accept_stdin(), and forwards it all to update().

    # Help text for --status, assembled from the docstrings of the folder
    # getters so each status value is described by the function behind it.
    status_help = (
        'Update only links or data directories that have the given status\n'
        f' indexed {get_indexed_folders.__doc__} (the default)\n'
        f' archived {get_archived_folders.__doc__}\n'
        f' unarchived {get_unarchived_folders.__doc__}\n'
        '\n'
        f' present {get_present_folders.__doc__}\n'
        f' valid {get_valid_folders.__doc__}\n'
        f' invalid {get_invalid_folders.__doc__}\n'
        '\n'
        f' duplicate {get_duplicate_folders.__doc__}\n'
        f' orphaned {get_orphaned_folders.__doc__}\n'
        f' corrupted {get_corrupted_folders.__doc__}\n'
        f' unrecognized {get_unrecognized_folders.__doc__}\n'
    )
    status_choices = (
        'indexed', 'archived', 'unarchived',
        'present', 'valid', 'invalid',
        'duplicate', 'orphaned', 'corrupted', 'unrecognized',
    )

    cli = argparse.ArgumentParser(
        prog=__command__,
        description=update.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument(
        '--only-new',
        action='store_true',
        help="Don't attempt to retry previously skipped/failed links when updating",
    )
    cli.add_argument(
        '--index-only',
        action='store_true',
        help="Update the main index without archiving any content",
    )
    cli.add_argument(
        '--resume',
        type=float,
        default=None,
        help='Resume the update process from a given timestamp',
    )
    cli.add_argument(
        '--overwrite',
        action='store_true',
        help='Ignore existing archived content and overwrite with new versions (DANGEROUS)',
    )
    cli.add_argument(
        '--before',
        type=float,
        default=None,
        help="Update only links bookmarked before the given timestamp.",
    )
    cli.add_argument(
        '--after',
        type=float,
        default=None,
        help="Update only links bookmarked after the given timestamp.",
    )
    cli.add_argument(
        '--status',
        type=str,
        choices=status_choices,
        default='indexed',
        help=status_help,
    )
    cli.add_argument(
        '--filter-type',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )
    cli.add_argument(
        'filter_patterns',
        nargs='*',
        type=str,
        default=None,
        help='Update only URLs matching these filter patterns.'
    )

    opts = cli.parse_args(args or ())
    stdin_patterns = accept_stdin(stdin)

    update(
        resume=opts.resume,
        only_new=opts.only_new,
        index_only=opts.index_only,
        overwrite=opts.overwrite,
        filter_patterns_str=stdin_patterns,
        filter_patterns=opts.filter_patterns,
        filter_type=opts.filter_type,
        status=opts.status,
        after=opts.after,
        before=opts.before,
        out_dir=pwd or OUTPUT_DIR,
    )
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
40
archivebox/cli/archivebox_version.py
Executable file
40
archivebox/cli/archivebox_version.py
Executable file
|
@ -0,0 +1,40 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
__command__ = 'archivebox version'
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
|
||||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import version
|
||||
from ..util import docstring
|
||||
from ..config import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
||||
@docstring(version.__doc__)
def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional[str]=None) -> None:
    # CLI entry point for `archivebox version`: parses --quiet, refuses piped
    # stdin, and delegates the actual output to version().
    cli = argparse.ArgumentParser(
        prog=__command__,
        description=version.__doc__,
        add_help=True,
        formatter_class=SmartFormatter,
    )
    cli.add_argument(
        '--quiet', '-q',
        action='store_true',
        help='Only print ArchiveBox version number and nothing else.',
    )

    opts = cli.parse_args(args or ())
    reject_stdin(__command__, stdin)

    target_dir = pwd or OUTPUT_DIR
    version(quiet=opts.quiet, out_dir=target_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(args=sys.argv[1:], stdin=sys.stdin)
|
226
archivebox/cli/tests.py
Executable file
226
archivebox/cli/tests.py
Executable file
|
@ -0,0 +1,226 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
__package__ = 'archivebox.cli'
|
||||
|
||||
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import unittest
|
||||
|
||||
from contextlib import contextmanager
|
||||
|
||||
TEST_CONFIG = {
|
||||
'USE_COLOR': 'False',
|
||||
'SHOW_PROGRESS': 'False',
|
||||
|
||||
'OUTPUT_DIR': 'data.tests',
|
||||
|
||||
'SAVE_ARCHIVE_DOT_ORG': 'False',
|
||||
'SAVE_TITLE': 'False',
|
||||
|
||||
'USE_CURL': 'False',
|
||||
'USE_WGET': 'False',
|
||||
'USE_GIT': 'False',
|
||||
'USE_CHROME': 'False',
|
||||
'USE_YOUTUBEDL': 'False',
|
||||
}
|
||||
|
||||
OUTPUT_DIR = 'data.tests'
|
||||
os.environ.update(TEST_CONFIG)
|
||||
|
||||
from ..main import init
|
||||
from ..index import load_main_index
|
||||
from ..config import (
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
)
|
||||
|
||||
from . import (
|
||||
archivebox_init,
|
||||
archivebox_add,
|
||||
archivebox_remove,
|
||||
)
|
||||
|
||||
HIDE_CLI_OUTPUT = True
|
||||
|
||||
test_urls = '''
|
||||
https://example1.com/what/is/happening.html?what=1#how-about-this=1
|
||||
https://example2.com/what/is/happening/?what=1#how-about-this=1
|
||||
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
|
||||
https://example4.com/what/is/happening.html
|
||||
https://example5.com/
|
||||
https://example6.com
|
||||
|
||||
<test>http://example7.com</test>
|
||||
[https://example8.com/what/is/this.php?what=1]
|
||||
[and http://example9.com?what=1&other=3#and-thing=2]
|
||||
<what>https://example10.com#and-thing=2 "</about>
|
||||
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
|
||||
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
|
||||
example13.bada
|
||||
and example14.badb
|
||||
<or>htt://example15.badc</that>
|
||||
'''
|
||||
|
||||
stdout = sys.stdout
|
||||
stderr = sys.stderr
|
||||
|
||||
|
||||
@contextmanager
def output_hidden(show_failing=True):
    """Redirect stdout/stderr into temporary files while the body runs.

    When HIDE_CLI_OUTPUT is false the body runs with output untouched.
    Otherwise output is captured to stdout.txt / stderr.txt in the current
    directory; if the body raises and `show_failing` is true, the captured
    text is printed to the restored streams before the exception propagates.
    The capture files are always removed on exit.
    """
    if not HIDE_CLI_OUTPUT:
        yield
        return

    # Remember the streams that are active *now*, not the ones seen at import
    # time: a test runner (or an enclosing output_hidden) may have swapped
    # them, and restoring anything else would clobber that redirection.
    previous_stdout, previous_stderr = sys.stdout, sys.stderr

    sys.stdout = open('stdout.txt', 'w+')
    sys.stderr = open('stderr.txt', 'w+')
    body_failed = False
    try:
        yield
    except BaseException:
        # BaseException so SystemExit raised by CLI commands is covered too.
        body_failed = True
        raise
    finally:
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = previous_stdout
        sys.stderr = previous_stderr
        try:
            if body_failed and show_failing:
                with open('stdout.txt', 'r') as f:
                    print(f.read())
                with open('stderr.txt', 'r') as f:
                    print(f.read())
        finally:
            os.remove('stdout.txt')
            os.remove('stderr.txt')
|
||||
|
||||
|
||||
class TestInit(unittest.TestCase):
    """Tests for `archivebox init` run against the throwaway OUTPUT_DIR."""

    def setUp(self):
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

    def test_basic_init(self):
        """init in an empty directory creates all three index files and no links."""
        with output_hidden():
            archivebox_init.main([])

        assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
        assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
        assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0

    def test_conflicting_init(self):
        """init must refuse to run in a directory that already holds other files."""
        with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
            f.write('test')

        with self.assertRaises(SystemExit, msg='Init should have exited with an exception'):
            with output_hidden(show_failing=False):
                archivebox_init.main([])

        assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
        assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
        assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))

        # The previous try / assert False / bare-except form swallowed its own
        # AssertionError, so this check could never fail. assertRaises reports
        # a missing exception properly; SystemExit is listed explicitly
        # because it does not derive from Exception.
        with self.assertRaises(
            (Exception, SystemExit),
            msg='load_main_index should raise an exception when no index is present',
        ):
            load_main_index(out_dir=OUTPUT_DIR)

    def test_no_dirty_state(self):
        """init can run again after the data directory has been wiped."""
        with output_hidden():
            init()
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
        with output_hidden():
            init()
|
||||
|
||||
|
||||
class TestAdd(unittest.TestCase):
    """Tests for `archivebox add` fed by URL argument, file argument and stdin."""

    def setUp(self):
        # Every test starts from a freshly initialised data directory.
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with output_hidden():
            init()

    def tearDown(self):
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

    def test_add_arg_url(self):
        feed_url = 'https://getpocket.com/users/nikisweeting/feed/all'
        with output_hidden():
            archivebox_add.main([feed_url])

        indexed = load_main_index(out_dir=OUTPUT_DIR)
        assert len(indexed) == 30

    def test_add_arg_file(self):
        urls_path = os.path.join(OUTPUT_DIR, 'test.txt')
        with open(urls_path, 'w+') as handle:
            handle.write(test_urls)

        with output_hidden():
            archivebox_add.main([urls_path])

        indexed = load_main_index(out_dir=OUTPUT_DIR)
        assert len(indexed) == 12
        os.remove(urls_path)

    def test_add_stdin_url(self):
        with output_hidden():
            archivebox_add.main([], stdin=test_urls)

        indexed = load_main_index(out_dir=OUTPUT_DIR)
        assert len(indexed) == 12
|
||||
|
||||
|
||||
class TestRemove(unittest.TestCase):
    """Tests for `archivebox remove` against an index seeded with test_urls."""

    def setUp(self):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with output_hidden():
            init()
            archivebox_add.main([], stdin=test_urls)

    def tearDown(self):
        # Re-enabled to match TestInit/TestAdd: without it the data.tests
        # directory was left behind after the run and state carried over
        # from one test into the next.
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

    def test_remove_exact(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 11

    def test_remove_regex(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 4

    def test_remove_domain(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 10

    def test_remove_none(self):
        # The previous try / assert False / bare-except form swallowed its own
        # AssertionError and therefore always passed. SystemExit is listed
        # explicitly because it does not derive from Exception.
        with self.assertRaises((Exception, SystemExit), msg='Should raise if no URLs match'):
            with output_hidden(show_failing=False):
                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if '--verbose' in sys.argv or '-v' in sys.argv:
|
||||
HIDE_CLI_OUTPUT = False
|
||||
|
||||
unittest.main()
|
|
@ -1,278 +0,0 @@
|
|||
import os
|
||||
import re
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
from subprocess import run, PIPE, DEVNULL
|
||||
|
||||
# ******************************************************************************
|
||||
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
|
||||
# env USE_COLOR=True CHROME_BINARY=google-chrome ./archive export.html
|
||||
# ******************************************************************************
|
||||
|
||||
def _env_flag(name, default):
    """Read env var `name` as a bool: true only if its value is 'true' in any letter case."""
    return os.getenv(name, str(default)).lower() == 'true'


# --- Terminal behaviour and general limits ---
IS_TTY = sys.stdout.isatty()
USE_COLOR = _env_flag('USE_COLOR', IS_TTY)
SHOW_PROGRESS = _env_flag('SHOW_PROGRESS', IS_TTY)
ONLY_NEW = _env_flag('ONLY_NEW', False)
MEDIA_TIMEOUT = int(os.getenv('MEDIA_TIMEOUT', '3600'))
TIMEOUT = int(os.getenv('TIMEOUT', '60'))
OUTPUT_PERMISSIONS = os.getenv('OUTPUT_PERMISSIONS', '755')
FOOTER_INFO = os.getenv(
    'FOOTER_INFO',
    'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.',
)

# --- Which archive methods are enabled ---
FETCH_WGET = _env_flag('FETCH_WGET', True)
FETCH_WGET_REQUISITES = _env_flag('FETCH_WGET_REQUISITES', True)
FETCH_PDF = _env_flag('FETCH_PDF', True)
FETCH_SCREENSHOT = _env_flag('FETCH_SCREENSHOT', True)
FETCH_DOM = _env_flag('FETCH_DOM', True)
FETCH_WARC = _env_flag('FETCH_WARC', True)
FETCH_GIT = _env_flag('FETCH_GIT', True)
FETCH_MEDIA = _env_flag('FETCH_MEDIA', True)
FETCH_FAVICON = _env_flag('FETCH_FAVICON', True)
FETCH_TITLE = _env_flag('FETCH_TITLE', True)
SUBMIT_ARCHIVE_DOT_ORG = _env_flag('SUBMIT_ARCHIVE_DOT_ORG', True)

# --- Options for the individual archive methods ---
CHECK_SSL_VALIDITY = _env_flag('CHECK_SSL_VALIDITY', True)
RESOLUTION = os.getenv('RESOLUTION', '1440,2000')
RESTRICT_FILE_NAMES = os.getenv('RESTRICT_FILE_NAMES', 'windows')
GIT_DOMAINS = os.getenv('GIT_DOMAINS', 'github.com,bitbucket.org,gitlab.com').split(',')
# The {GIT_SHA} / {WGET_VERSION} placeholders are left unformatted here.
CURL_USER_AGENT = os.getenv('CURL_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/)')
WGET_USER_AGENT = os.getenv('WGET_USER_AGENT', 'ArchiveBox/{GIT_SHA} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}')
COOKIES_FILE = os.getenv('COOKIES_FILE', None)
CHROME_USER_DATA_DIR = os.getenv('CHROME_USER_DATA_DIR', None)
CHROME_HEADLESS = _env_flag('CHROME_HEADLESS', True)
CHROME_USER_AGENT = os.getenv('CHROME_USER_AGENT', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36')

# --- External binaries ---
CURL_BINARY = os.getenv('CURL_BINARY', 'curl')
GIT_BINARY = os.getenv('GIT_BINARY', 'git')
WGET_BINARY = os.getenv('WGET_BINARY', 'wget')
YOUTUBEDL_BINARY = os.getenv('YOUTUBEDL_BINARY', 'youtube-dl')
CHROME_BINARY = os.getenv('CHROME_BINARY', None)

URL_BLACKLIST = os.getenv('URL_BLACKLIST', None)

# OUTPUT_DIR stays None when the variable is unset (abspath(None) raises)
# or when the path cannot be resolved for any other reason.
OUTPUT_DIR = None
try:
    OUTPUT_DIR = os.path.abspath(os.getenv('OUTPUT_DIR'))
except Exception:
    pass
|
||||
|
||||
|
||||
# ******************************************************************************
|
||||
# **************************** Derived Settings ********************************
|
||||
# ******************************************************************************
|
||||
|
||||
# Repository root: the parent of the directory that contains this file.
REPO_DIR = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), '..')
)
# Default the data folder to <repo>/output when none was configured.
OUTPUT_DIR = OUTPUT_DIR or os.path.join(REPO_DIR, 'output')

ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
ARCHIVE_DIR = os.path.join(OUTPUT_DIR, ARCHIVE_DIR_NAME)
SOURCES_DIR = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME)

PYTHON_PATH = os.path.join(REPO_DIR, 'archivebox')
TEMPLATES_DIR = os.path.join(PYTHON_PATH, 'templates')

CHROME_SANDBOX = os.getenv('CHROME_SANDBOX', 'True').lower() == 'true'
# A tool counts as "in use" as soon as any method depending on it is enabled.
USE_CHROME = FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM
USE_WGET = FETCH_WGET or FETCH_WGET_REQUISITES or FETCH_WARC
# Probe wget for --compression=auto support; the probe only runs when wget
# is in use and a binary is configured (short-circuit evaluation).
WGET_AUTO_COMPRESSION = (
    USE_WGET
    and WGET_BINARY
    and (not run([WGET_BINARY, "--compression=auto", "--help"], stdout=DEVNULL, stderr=DEVNULL).returncode)
)

# Compile the blacklist pattern if one was given; otherwise keep the falsy value.
URL_BLACKLIST = re.compile(URL_BLACKLIST, re.IGNORECASE) if URL_BLACKLIST else URL_BLACKLIST
|
||||
|
||||
########################### Environment & Dependencies #########################
|
||||
|
||||
# Startup procedure: resolve terminal/Chrome defaults, then verify that the
# Python environment and every enabled external dependency are usable.
# Fatal problems print a hint and exit with SystemExit(1).
try:
    ### Terminal Configuration
    TERM_WIDTH = shutil.get_terminal_size((100, 10)).columns
    ANSI = {
        'reset': '\033[00;00m',
        'lightblue': '\033[01;30m',
        'lightyellow': '\033[01;33m',
        'lightred': '\033[01;35m',
        'red': '\033[01;31m',
        'green': '\033[01;32m',
        'blue': '\033[01;34m',
        'white': '\033[01;37m',
        'black': '\033[01;30m',
    }
    if not USE_COLOR:
        # don't show colors if USE_COLOR is False
        ANSI = {k: '' for k in ANSI.keys()}


    if not CHROME_BINARY:
        # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
        default_executable_paths = (
            'chromium-browser',
            'chromium',
            'chrome',
            '/Applications/Chromium.app/Contents/MacOS/Chromium',
            'google-chrome',
            '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
            'google-chrome-stable',
            'google-chrome-beta',
            'google-chrome-canary',
            '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
            'google-chrome-unstable',
            'google-chrome-dev',
        )
        for name in default_executable_paths:
            full_path_exists = shutil.which(name)
            if full_path_exists:
                CHROME_BINARY = name
                break
        else:
            # nothing found: fall back to the first candidate name
            CHROME_BINARY = 'chromium-browser'
        # print('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))

    if CHROME_USER_DATA_DIR is None:
        # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
        default_profile_paths = (
            '~/.config/chromium',
            '~/Library/Application Support/Chromium',
            '~/AppData/Local/Chromium/User Data',
            '~/.config/chrome',
            '~/.config/google-chrome',
            '~/Library/Application Support/Google/Chrome',
            '~/AppData/Local/Google/Chrome/User Data',
            '~/.config/google-chrome-stable',
            '~/.config/google-chrome-beta',
            '~/Library/Application Support/Google/Chrome Canary',
            '~/AppData/Local/Google/Chrome SxS/User Data',
            '~/.config/google-chrome-unstable',
            '~/.config/google-chrome-dev',
        )
        for path in default_profile_paths:
            full_path = os.path.expanduser(path)
            if os.path.exists(full_path):
                CHROME_USER_DATA_DIR = full_path
                break
        # print('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))

    CHROME_OPTIONS = {
        'TIMEOUT': TIMEOUT,
        'RESOLUTION': RESOLUTION,
        'CHECK_SSL_VALIDITY': CHECK_SSL_VALIDITY,
        'CHROME_BINARY': CHROME_BINARY,
        'CHROME_HEADLESS': CHROME_HEADLESS,
        'CHROME_SANDBOX': CHROME_SANDBOX,
        'CHROME_USER_AGENT': CHROME_USER_AGENT,
        'CHROME_USER_DATA_DIR': CHROME_USER_DATA_DIR,
    }


    ### Check Python environment
    # Compare version tuples rather than a float: float('3.10') == 3.1, which
    # would wrongly fail the 3.5 requirement on Python 3.10 and later.
    python_vers = '{}.{}'.format(sys.version_info.major, sys.version_info.minor)
    if sys.version_info < (3, 5):
        print('{}[X] Python version is not new enough: {} (>3.5 is required){}'.format(ANSI['red'], python_vers, ANSI['reset']))
        print(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
        raise SystemExit(1)

    # encoding can be None on a replaced stdout; treat that as "not UTF-8"
    # instead of crashing on None.upper().
    if (sys.stdout.encoding or '').upper() not in ('UTF-8', 'UTF8'):
        print('[X] Your system is running python3 scripts with a bad locale setting: {} (it should be UTF-8).'.format(sys.stdout.encoding))
        print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
        print('')
        print(' Confirm that it\'s fixed by opening a new shell and running:')
        print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
        print('')
        print(' Alternatively, run this script with:')
        print(' env PYTHONIOENCODING=UTF-8 ./archive.py export.html')

    ### Get code version by parsing git log
    GIT_SHA = 'unknown'
    try:
        GIT_SHA = run([GIT_BINARY, 'rev-list', '-1', 'HEAD', './'], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
    except Exception:
        print('[!] Warning: unable to determine git version, is git installed and in your $PATH?')

    ### Get absolute path for cookies file
    try:
        COOKIES_FILE = os.path.abspath(COOKIES_FILE) if COOKIES_FILE else None
    except Exception:
        print('[!] Warning: unable to get full path to COOKIES_FILE, are you sure you specified it correctly?')
        raise

    ### Make sure curl is installed
    if FETCH_FAVICON or FETCH_TITLE or SUBMIT_ARCHIVE_DOT_ORG:
        if run(['which', CURL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([CURL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
            print('{red}[X] Missing dependency: curl{reset}'.format(**ANSI))
            print(' Install it, then confirm it works with: {} --version'.format(CURL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

    CURL_USER_AGENT = CURL_USER_AGENT.format(GIT_SHA=GIT_SHA[:9])

    ### Make sure wget is installed and calculate version
    if FETCH_WGET or FETCH_WARC:
        if run(['which', WGET_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([WGET_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
            print('{red}[X] Missing dependency: wget{reset}'.format(**ANSI))
            print(' Install it, then confirm it works with: {} --version'.format(WGET_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

    WGET_VERSION = 'unknown'
    try:
        wget_vers_str = run([WGET_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
        # third space-separated word of the first output line
        WGET_VERSION = wget_vers_str.split('\n')[0].split(' ')[2]
    except Exception:
        if USE_WGET:
            print('[!] Warning: unable to determine wget version, is wget installed and in your $PATH?')

    WGET_USER_AGENT = WGET_USER_AGENT.format(GIT_SHA=GIT_SHA[:9], WGET_VERSION=WGET_VERSION)

    ### Make sure chrome is installed and calculate version
    if FETCH_PDF or FETCH_SCREENSHOT or FETCH_DOM:
        if run(['which', CHROME_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode:
            print('{}[X] Missing dependency: {}{}'.format(ANSI['red'], CHROME_BINARY, ANSI['reset']))
            print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

        # parse chrome --version e.g. Google Chrome 61.0.3114.0 canary / Chromium 59.0.3029.110 built on Ubuntu, running on Ubuntu 16.04
        try:
            result = run([CHROME_BINARY, '--version'], stdout=PIPE)
            version_str = result.stdout.decode('utf-8')
            version_lines = re.sub("(Google Chrome|Chromium) (\\d+?)\\.(\\d+?)\\.(\\d+?).*?$", "\\2", version_str).split('\n')
            version = [l for l in version_lines if l.isdigit()][-1]
            if int(version) < 59:
                print(version_lines)
                print('{red}[X] Chrome version must be 59 or greater for headless PDF, screenshot, and DOM saving{reset}'.format(**ANSI))
                print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
                raise SystemExit(1)
        except (IndexError, TypeError, OSError):
            print('{red}[X] Failed to parse Chrome version, is it installed properly?{reset}'.format(**ANSI))
            print(' Install it, then confirm it works with: {} --version'.format(CHROME_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

    CHROME_VERSION = 'unknown'
    try:
        chrome_vers_str = run([CHROME_BINARY, "--version"], stdout=PIPE, cwd=REPO_DIR).stdout.strip().decode()
        # first space-separated token made up only of digits and dots
        CHROME_VERSION = [v for v in chrome_vers_str.strip().split(' ') if v.replace('.', '').isdigit()][0]
    except Exception:
        if USE_CHROME:
            print('[!] Warning: unable to determine chrome version, is chrome installed and in your $PATH?')

    ### Make sure git is installed
    if FETCH_GIT:
        if run(['which', GIT_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([GIT_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
            print('{red}[X] Missing dependency: git{reset}'.format(**ANSI))
            print(' Install it, then confirm it works with: {} --version'.format(GIT_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

    ### Make sure youtube-dl is installed
    if FETCH_MEDIA:
        if run(['which', YOUTUBEDL_BINARY], stdout=DEVNULL, stderr=DEVNULL).returncode or run([YOUTUBEDL_BINARY, '--version'], stdout=DEVNULL, stderr=DEVNULL).returncode:
            print('{red}[X] Missing dependency: youtube-dl{reset}'.format(**ANSI))
            print(' Install it, then confirm it was installed with: {} --version'.format(YOUTUBEDL_BINARY))
            print(' See https://github.com/pirate/ArchiveBox/wiki/Install for help.')
            raise SystemExit(1)

except KeyboardInterrupt:
    raise SystemExit(1)

except:
    # Bare except is deliberate: print the hint for any failure (including
    # the SystemExit raised above), then re-raise it unchanged.
    print('[X] There was an error during the startup procedure, your archive data is unaffected.')
    raise
|
874
archivebox/config/__init__.py
Normal file
874
archivebox/config/__init__.py
Normal file
|
@ -0,0 +1,874 @@
|
|||
__package__ = 'archivebox.config'
|
||||
|
||||
import os
|
||||
import io
|
||||
import re
|
||||
import sys
|
||||
import django
|
||||
import getpass
|
||||
import shutil
|
||||
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
from typing import Optional, Type, Tuple, Dict
|
||||
from subprocess import run, PIPE, DEVNULL
|
||||
from configparser import ConfigParser
|
||||
from collections import defaultdict
|
||||
|
||||
from .stubs import (
|
||||
SimpleConfigValueDict,
|
||||
ConfigValue,
|
||||
ConfigDict,
|
||||
ConfigDefaultValue,
|
||||
ConfigDefaultDict,
|
||||
)
|
||||
|
||||
# precedence order for config:
|
||||
# 1. cli args
|
||||
# 2. shell environment vars
|
||||
# 3. config file
|
||||
# 4. defaults
|
||||
|
||||
# env USE_COLOR=false archivebox add '...'
|
||||
# env SHOW_PROGRESS=1 archivebox add '...'
|
||||
|
||||
# ******************************************************************************
|
||||
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
||||
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
|
||||
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
|
||||
# ******************************************************************************
|
||||
|
||||
################################# User Config ##################################
|
||||
|
||||
# Schema of every user-settable option, grouped by section.
# Each entry maps an option name to its expected 'type', its 'default'
# (a plain value, or a callable that receives the config loaded so far),
# and optionally a tuple of legacy 'aliases' that are accepted as well.
CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
    'SHELL_CONFIG': {
        'IS_TTY': {'type': bool, 'default': lambda _: sys.stdout.isatty()},
        'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
        'SHOW_PROGRESS': {'type': bool, 'default': lambda c: c['IS_TTY']},
        # TODO: 'SHOW_HINTS': {'type': bool, 'default': True},
    },

    'GENERAL_CONFIG': {
        'OUTPUT_DIR': {'type': str, 'default': None},
        'CONFIG_FILE': {'type': str, 'default': None},
        'ONLY_NEW': {'type': bool, 'default': True},
        'TIMEOUT': {'type': int, 'default': 60},
        'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
        'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
        'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
        'URL_BLACKLIST': {'type': str, 'default': None},
    },

    'SERVER_CONFIG': {
        'SECRET_KEY': {'type': str, 'default': None},
        'ALLOWED_HOSTS': {'type': str, 'default': '*'},
        'DEBUG': {'type': bool, 'default': False},
        'PUBLIC_INDEX': {'type': bool, 'default': True},
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
        'ACTIVE_THEME': {'type': str, 'default': 'default'},
    },

    # On/off switches for each archive method (old FETCH_*/SUBMIT_* names kept as aliases)
    'ARCHIVE_METHOD_TOGGLES': {
        'SAVE_TITLE': {'type': bool, 'default': True, 'aliases': ('FETCH_TITLE',)},
        'SAVE_FAVICON': {'type': bool, 'default': True, 'aliases': ('FETCH_FAVICON',)},
        'SAVE_WGET': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET',)},
        'SAVE_WGET_REQUISITES': {'type': bool, 'default': True, 'aliases': ('FETCH_WGET_REQUISITES',)},
        'SAVE_PDF': {'type': bool, 'default': True, 'aliases': ('FETCH_PDF',)},
        'SAVE_SCREENSHOT': {'type': bool, 'default': True, 'aliases': ('FETCH_SCREENSHOT',)},
        'SAVE_DOM': {'type': bool, 'default': True, 'aliases': ('FETCH_DOM',)},
        'SAVE_WARC': {'type': bool, 'default': True, 'aliases': ('FETCH_WARC',)},
        'SAVE_GIT': {'type': bool, 'default': True, 'aliases': ('FETCH_GIT',)},
        'SAVE_MEDIA': {'type': bool, 'default': True, 'aliases': ('FETCH_MEDIA',)},
        'SAVE_PLAYLISTS': {'type': bool, 'default': True, 'aliases': ('FETCH_PLAYLISTS',)},
        'SAVE_ARCHIVE_DOT_ORG': {'type': bool, 'default': True, 'aliases': ('SUBMIT_ARCHIVE_DOT_ORG',)},
    },

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},

        # The {VERSION}/{CURL_VERSION}/{WGET_VERSION} placeholders are filled in
        # by the derived config once those values are known.
        'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
        'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},

        'COOKIES_FILE': {'type': str, 'default': None},
        'CHROME_USER_DATA_DIR': {'type': str, 'default': None},

        'CHROME_HEADLESS': {'type': bool, 'default': True},
        'CHROME_SANDBOX': {'type': bool, 'default': True},
    },

    'DEPENDENCY_CONFIG': {
        'USE_CURL': {'type': bool, 'default': True},
        'USE_WGET': {'type': bool, 'default': True},
        'USE_GIT': {'type': bool, 'default': True},
        'USE_CHROME': {'type': bool, 'default': True},
        'USE_YOUTUBEDL': {'type': bool, 'default': True},

        'CURL_BINARY': {'type': str, 'default': 'curl'},
        'GIT_BINARY': {'type': str, 'default': 'git'},
        'WGET_BINARY': {'type': str, 'default': 'wget'},
        'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
        'CHROME_BINARY': {'type': str, 'default': None},
    },
}
|
||||
|
||||
# Lookup table from every legacy alias to the canonical option name it stands for.
CONFIG_ALIASES = dict(
    (alias, key)
    for section in CONFIG_DEFAULTS.values()
    for key, default in section.items()
    for alias in default.get('aliases', ())
)

# Set of all canonical option names declared in any CONFIG_DEFAULTS section.
USER_CONFIG = set().union(*(section.keys() for section in CONFIG_DEFAULTS.values()))
|
||||
def get_real_name(key: str) -> str:
    """Return the canonical option name for `key`.

    The key is upper-cased and stripped first; if the result is a known alias
    its canonical name is returned, otherwise the normalized key itself.
    """
    normalized = key.upper().strip()
    return CONFIG_ALIASES.get(normalized, normalized)
|
||||
|
||||
############################## Derived Config ##############################
|
||||
|
||||
# Constants
|
||||
|
||||
# ANSI escape sequences used when colored terminal output is enabled.
DEFAULT_CLI_COLORS = {
    'reset': '\033[00;00m',
    'lightblue': '\033[01;30m',
    'lightyellow': '\033[01;33m',
    'lightred': '\033[01;35m',
    'red': '\033[01;31m',
    'green': '\033[01;32m',
    'blue': '\033[01;34m',
    'white': '\033[01;37m',
    'black': '\033[01;30m',
}

# Same keys as DEFAULT_CLI_COLORS but every code is blank (colorless fallback).
ANSI = dict.fromkeys(DEFAULT_CLI_COLORS, '')

# Pairs of RGB triples keyed by the two-digit ANSI color number;
# unknown numbers fall back to a pair of black triples.
COLOR_DICT = defaultdict(
    lambda: [(0, 0, 0), (0, 0, 0)],
    {
        '00': [(0, 0, 0), (0, 0, 0)],
        '30': [(0, 0, 0), (0, 0, 0)],
        '31': [(255, 0, 0), (128, 0, 0)],
        '32': [(0, 200, 0), (0, 128, 0)],
        '33': [(255, 255, 0), (128, 128, 0)],
        '34': [(0, 0, 255), (0, 0, 128)],
        '35': [(255, 0, 255), (128, 0, 128)],
        '36': [(0, 255, 255), (0, 128, 128)],
        '37': [(255, 255, 255), (255, 255, 255)],
    },
)
|
||||
|
||||
# File extensions that are treated as static files: URLs ending in one of these
# are downloaded as-is instead of being rendered as an html page.
STATICFILE_EXTENSIONS = {
    # images & vector graphics
    'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
    'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
    # audio & video
    'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
    'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
    # documents
    'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
    # feeds, styles, scripts & data
    'atom', 'rss', 'css', 'js', 'json',
    # disk images
    'dmg', 'iso', 'img',
    # archives
    'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',

    # Less common extensions to consider adding later:
    #   jar, swf, bin, com, exe, dll, deb
    #   ear, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
    #   pl pm, prc pdb, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
    #   ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml

    # Never add these, they are always treated as pages rather than static files:
    #   html, htm, shtml, xhtml, xml, aspx, php, cgi
}
|
||||
|
||||
# Names of files and folders inside the code directory
VERSION_FILENAME = 'VERSION'
PYTHON_DIR_NAME = 'archivebox'
TEMPLATES_DIR_NAME = 'themes'

# Names of files and folders inside the data (output) directory
ARCHIVE_DIR_NAME = 'archive'
SOURCES_DIR_NAME = 'sources'
LOGS_DIR_NAME = 'logs'
STATIC_DIR_NAME = 'static'
SQL_INDEX_FILENAME = 'index.sqlite3'
JSON_INDEX_FILENAME = 'index.json'
HTML_INDEX_FILENAME = 'index.html'
ROBOTS_TXT_FILENAME = 'robots.txt'
FAVICON_FILENAME = 'favicon.ico'
CONFIG_FILENAME = 'ArchiveBox.conf'

# Comment banner written at the top of a newly created config file.
# The two trailing empty items make the text end with a blank line.
CONFIG_HEADER = '\n'.join((
    '# This is the config file for your ArchiveBox collection.',
    '#',
    '# You can add options here manually in INI format, or automatically by running:',
    '# archivebox config --set KEY=VALUE',
    '#',
    '# If you modify this file manually, make sure to update your archive after by running:',
    '# archivebox init',
    '#',
    '# A list of all possible config with documentation and examples can be found here:',
    '# https://github.com/pirate/ArchiveBox/wiki/Configuration',
    '',
    '',
))
|
||||
|
||||
|
||||
# Options computed from the user config rather than set directly.
# Every 'default' is a callable that receives the config assembled so far (`c`)
# and returns the derived value. Entries are evaluated in declaration order,
# so an entry may only read keys that were loaded before it.
DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
    # TERM_WIDTH resolves to a callable, so the width is measured each time it is called
    'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
    'USER': {'default': lambda c: getpass.getuser() or os.getlogin()},
    'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else {k: '' for k in DEFAULT_CLI_COLORS.keys()}},

    'REPO_DIR': {'default': lambda c: os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', '..'))},
    'PYTHON_DIR': {'default': lambda c: os.path.join(c['REPO_DIR'], PYTHON_DIR_NAME)},
    'TEMPLATES_DIR': {'default': lambda c: os.path.join(c['PYTHON_DIR'], TEMPLATES_DIR_NAME, 'legacy')},

    'OUTPUT_DIR': {'default': lambda c: os.path.abspath(os.path.expanduser(c['OUTPUT_DIR'])) if c['OUTPUT_DIR'] else os.path.abspath(os.curdir)},
    'ARCHIVE_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], ARCHIVE_DIR_NAME)},
    'SOURCES_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], SOURCES_DIR_NAME)},
    'LOGS_DIR': {'default': lambda c: os.path.join(c['OUTPUT_DIR'], LOGS_DIR_NAME)},
    'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else os.path.join(c['OUTPUT_DIR'], CONFIG_FILENAME)},
    'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
    'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
    'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE)},

    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
    # read_text() closes the VERSION file itself (a bare open().read() left the handle open)
    'VERSION': {'default': lambda c: Path(os.path.join(c['PYTHON_DIR'], VERSION_FILENAME)).read_text().strip()},
    'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},

    'PYTHON_BINARY': {'default': lambda c: sys.executable},
    'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
    'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

    'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
    'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},

    # Must read the canonical SAVE_TITLE key: aliases such as FETCH_TITLE are only
    # consulted while loading and are never stored in the config dict, so looking
    # up c['FETCH_TITLE'] raised KeyError whenever SAVE_FAVICON was disabled.
    'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
    'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
    'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
    'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
    'SAVE_ARCHIVE_DOT_ORG': {'default': lambda c: c['USE_CURL'] and c['SAVE_ARCHIVE_DOT_ORG']},

    'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
    'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
    'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
    'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
    'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
    'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},

    'USE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
    'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
    'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},

    'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
    'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
    'SAVE_PLAYLISTS': {'default': lambda c: c['SAVE_PLAYLISTS'] and c['SAVE_MEDIA']},

    'USE_CHROME': {'default': lambda c: c['USE_CHROME'] and (c['SAVE_PDF'] or c['SAVE_SCREENSHOT'] or c['SAVE_DOM'])},
    'CHROME_BINARY': {'default': lambda c: c['CHROME_BINARY'] if c['CHROME_BINARY'] else find_chrome_binary()},
    'CHROME_VERSION': {'default': lambda c: bin_version(c['CHROME_BINARY']) if c['USE_CHROME'] else None},
    'SAVE_PDF': {'default': lambda c: c['USE_CHROME'] and c['SAVE_PDF']},
    'SAVE_SCREENSHOT': {'default': lambda c: c['USE_CHROME'] and c['SAVE_SCREENSHOT']},
    'SAVE_DOM': {'default': lambda c: c['USE_CHROME'] and c['SAVE_DOM']},

    # Summary dicts built from the values above
    'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
    'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
    'EXTERNAL_LOCATIONS': {'default': lambda c: get_external_locations(c)},
    'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
    'CHROME_OPTIONS': {'default': lambda c: get_chrome_info(c)},
}
|
||||
|
||||
|
||||
|
||||
################################### Helpers ####################################
|
||||
|
||||
def load_config_val(key: str,
                    default: ConfigDefaultValue=None,
                    type: Optional[Type]=None,
                    aliases: Optional[Tuple[str, ...]]=None,
                    config: Optional[ConfigDict]=None,
                    env_vars: Optional[os._Environ]=None,
                    config_file_vars: Optional[Dict[str, str]]=None) -> ConfigValue:
    """Resolve a single config option from the environment or the config file.

    The canonical `key` is tried first, then each alias in order; for each name
    the environment is consulted before the config file. The first non-empty
    value wins and is parsed according to `type` (bool, int or str).

    When no non-empty value is found, or `type` is None, `default` is returned;
    a callable default is invoked with `config` (which must then be a dict).

    Raises:
        ValueError: the value found cannot be parsed as the requested type.
        Exception: `type` is something other than bool, str or int.
    """

    # Start from "not set" so the lookup also works when neither source is
    # provided, and so an empty value from one lookup can never leak through.
    val = None
    matched_key = key   # the name the value was found under, used in error messages
    for name in (key, *(aliases or ())):
        for source in (env_vars, config_file_vars):
            candidate = source.get(name) if source else None
            if candidate:
                val, matched_key = candidate, name
                break
        if val:
            break

    if type is None or val is None:
        if callable(default):
            assert isinstance(config, dict)
            return default(config)

        return default

    elif type is bool:
        if val.lower() in ('true', 'yes', '1'):
            return True
        elif val.lower() in ('false', 'no', '0'):
            return False
        else:
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected a boolean: True/False)')

    elif type is str:
        # boolean-looking values are rejected for string options
        if val.lower() in ('true', 'false', 'yes', 'no', '1', '0'):
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected a string)')
        return val.strip()

    elif type is int:
        if not val.isdigit():
            raise ValueError(f'Invalid configuration option {matched_key}={val} (expected an integer)')
        return int(val)

    raise Exception('Config values can only be str, bool, or int')
|
||||
|
||||
|
||||
def load_config_file(out_dir: str=None) -> Optional[Dict[str, str]]:
    """Read the ini-formatted config file found in `out_dir`.

    `out_dir` falls back to the OUTPUT_DIR environment variable, then to the
    current directory. Returns a flat dict of upper-cased option names to their
    raw string values (all sections merged), or None if the file does not exist.
    """

    base_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
    ini_path = os.path.join(base_dir, CONFIG_FILENAME)
    if not os.path.exists(ini_path):
        return None

    parser = ConfigParser()
    parser.optionxform = str   # keep option names case-sensitive while parsing
    parser.read(ini_path)

    # merge every section into a single namespace
    flattened = {}
    for _section_name, options in parser.items():
        for option, value in options.items():
            flattened[option.upper()] = value
    return flattened
|
||||
|
||||
|
||||
def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
    """Write the given options into the ini-formatted config file in `out_dir`.

    Each key in `config` is stored under the CONFIG_DEFAULTS section that
    declares it (an unknown key raises IndexError). A random SECRET_KEY is
    generated if the file does not already contain a valid one. The previous
    file content is kept in a `.bak` copy while the new file is validated by
    re-loading the whole config; if validation fails the previous content is
    restored.

    Returns the re-loaded values of the keys that were written, or an empty
    dict if validation failed and the file was reverted.
    """

    from ..system import atomic_write

    out_dir = out_dir or os.path.abspath(os.getenv('OUTPUT_DIR', '.'))
    config_path = os.path.join(out_dir, CONFIG_FILENAME)
    backup_path = f'{config_path}.bak'

    if not os.path.exists(config_path):
        atomic_write(config_path, CONFIG_HEADER)

    config_file = ConfigParser()
    config_file.optionxform = str   # keep option names case-sensitive
    config_file.read(config_path)

    # keep a copy of the current file so it can be restored if validation fails
    with open(config_path, 'r') as old:
        atomic_write(backup_path, old.read())

    find_section = lambda key: [name for name, opts in CONFIG_DEFAULTS.items() if key in opts][0]

    # Set up sections in empty config file
    for key, val in config.items():
        section = find_section(key)
        if section in config_file:
            existing_config = dict(config_file[section])
        else:
            existing_config = {}
        config_file[section] = {**existing_config, key: val}

    # always make sure there's a SECRET_KEY defined for Django
    existing_secret_key = None
    if 'SERVER_CONFIG' in config_file and 'SECRET_KEY' in config_file['SERVER_CONFIG']:
        existing_secret_key = config_file['SERVER_CONFIG']['SECRET_KEY']

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+') as new:
        config_file.write(new)

    try:
        # validate the config by attempting to re-parse it
        CONFIG = load_all_config()
        written = {
            key.upper(): CONFIG.get(key.upper())
            for key in config.keys()
        }
    except BaseException:
        # something went horribly wrong, revert to the previous version
        # (BaseException on purpose: a failed reload may surface as SystemExit)
        with open(backup_path, 'r') as old:
            atomic_write(config_path, old.read())
        written = {}

    # The backup is only needed while validating; remove it on success as well
    # as after a revert (previously the early return skipped this cleanup).
    if os.path.exists(backup_path):
        os.remove(backup_path)

    return written
|
||||
|
||||
|
||||
|
||||
def load_config(defaults: ConfigDefaultDict,
                config: Optional[ConfigDict]=None,
                out_dir: Optional[str]=None,
                env_vars: Optional[os._Environ]=None,
                config_file_vars: Optional[Dict[str, str]]=None) -> ConfigDict:
    """Load every option described by `defaults` on top of a copy of `config`.

    Values are resolved one by one, in declaration order, via load_config_val,
    so later defaults can depend on options loaded before them. `env_vars`
    falls back to os.environ and `config_file_vars` to the config file found
    in `out_dir`.

    Returns the extended config dict; the `config` argument is not modified.

    Raises:
        SystemExit: with code 0 if interrupted by the user.
        Exception: any error raised while loading a value is re-raised after
            a help message has been printed to stderr.
    """

    env_vars = env_vars or os.environ
    config_file_vars = config_file_vars or load_config_file(out_dir=out_dir)

    extended_config: ConfigDict = config.copy() if config else {}
    for key, default in defaults.items():
        try:
            extended_config[key] = load_config_val(
                key,
                default=default['default'],
                type=default.get('type'),
                aliases=default.get('aliases'),
                config=extended_config,
                env_vars=env_vars,
                config_file_vars=config_file_vars,
            )
        except KeyboardInterrupt:
            raise SystemExit(0)
        except Exception as e:
            stderr()
            stderr(f'[X] Error while loading configuration value: {key}', color='red', config=extended_config)
            stderr(' {}: {}'.format(e.__class__.__name__, e))
            stderr()
            stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
            stderr()
            stderr(' For config documentation and examples see:')
            stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
            stderr()
            # propagate the original error (the `raise SystemExit(2)` that used
            # to follow this statement could never be reached)
            raise

    return extended_config
|
||||
|
||||
# def write_config(config: ConfigDict):
|
||||
|
||||
# with open(os.path.join(config['OUTPUT_DIR'], CONFIG_FILENAME), 'w+') as f:
|
||||
|
||||
|
||||
|
||||
def stderr(*args, color: Optional[str]=None, config: Optional[ConfigDict]=None) -> None:
    """Write the space-joined `args` plus a newline to sys.stderr.

    When `color` is given the text is wrapped in that color code and a reset
    code. Real escape codes are only used if `config` has a truthy USE_COLOR,
    otherwise the codes come from the module-level ANSI table.
    """
    palette = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI

    line = ' '.join(str(a) for a in args)
    if color:
        line = palette[color] + line + palette['reset']

    sys.stderr.write(line + '\n')
|
||||
|
||||
def bin_version(binary: Optional[str]) -> Optional[str]:
    """Return a short version string for `binary`, or None if unavailable.

    Runs `<binary> --version` and returns the first three whitespace-separated
    columns of the first output line. Any failure while running or decoding
    yields None instead of an error.
    """

    resolved = bin_path(binary)
    if resolved:
        try:
            completed = run([resolved, "--version"], stdout=PIPE)
            first_line = completed.stdout.strip().decode().split('\n')[0]
            # keep only the first 3 columns of the version line
            return ' '.join(first_line.strip().split()[:3])
        except Exception:
            # deliberately silent: a missing or broken dependency is reported as None
            pass

    return None
|
||||
|
||||
def bin_path(binary: Optional[str]) -> Optional[str]:
    """Return the path shutil.which() finds for `binary` (after `~` expansion).

    If nothing is found the name is returned unchanged; None stays None.
    """
    if binary is not None:
        located = shutil.which(os.path.expanduser(binary))
        return located or binary

    return None
|
||||
|
||||
def bin_hash(binary: Optional[str]) -> Optional[str]:
    """Return 'md5:<hexdigest>' of the file `binary` resolves to.

    Returns None when `binary` is None or the resolved path does not exist.
    """
    if binary is None:
        return None

    resolved = bin_path(binary)
    if resolved is None or not Path(resolved).exists():
        return None

    digest = md5()
    with io.open(resolved, mode='rb') as stream:
        # hash the file in buffer-sized chunks until read() returns b''
        while True:
            block = stream.read(io.DEFAULT_BUFFER_SIZE)
            if block == b'':
                break
            digest.update(block)

    return f'md5:{digest.hexdigest()}'
|
||||
|
||||
def find_chrome_binary() -> Optional[str]:
    """Return the first Chrome/Chromium executable name found via shutil.which().

    The candidate (not the resolved path) is returned. If none is found an
    error is printed to stderr and None is returned.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # keep this order in sync with the data dir search order
    candidates = (
        'chromium-browser',
        'chromium',
        '/Applications/Chromium.app/Contents/MacOS/Chromium',
        'chrome',
        'google-chrome',
        '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome',
        'google-chrome-stable',
        'google-chrome-beta',
        'google-chrome-canary',
        '/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary',
        'google-chrome-unstable',
        'google-chrome-dev',
    )

    found = next((name for name in candidates if shutil.which(name)), None)
    if found is not None:
        return found

    stderr('[X] Unable to find a working version of Chrome/Chromium, is it installed and in your $PATH?', color='red')
    stderr()
    return None
|
||||
|
||||
def find_chrome_data_dir() -> Optional[str]:
    """Return the first existing Chrome/Chromium user data dir, `~` expanded.

    Returns None if none of the default locations exist.
    """
    # Precedence: Chromium, Chrome, Beta, Canary, Unstable, Dev
    # keep this order in sync with the binary search order
    candidates = (
        '~/.config/chromium',
        '~/Library/Application Support/Chromium',
        '~/AppData/Local/Chromium/User Data',
        '~/.config/chrome',
        '~/.config/google-chrome',
        '~/Library/Application Support/Google/Chrome',
        '~/AppData/Local/Google/Chrome/User Data',
        '~/.config/google-chrome-stable',
        '~/.config/google-chrome-beta',
        '~/Library/Application Support/Google/Chrome Canary',
        '~/AppData/Local/Google/Chrome SxS/User Data',
        '~/.config/google-chrome-unstable',
        '~/.config/google-chrome-dev',
    )

    expanded = (os.path.expanduser(candidate) for candidate in candidates)
    return next((location for location in expanded if os.path.exists(location)), None)
|
||||
|
||||
def wget_supports_compression(config):
    """Return True if `<WGET_BINARY> --compression=auto --help` exits with status 0."""
    probe = run(
        [config['WGET_BINARY'], "--compression=auto", "--help"],
        stdout=DEVNULL,
        stderr=DEVNULL,
    )
    return probe.returncode == 0
|
||||
|
||||
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
    """Describe the code directories (REPO_DIR, PYTHON_DIR, TEMPLATES_DIR).

    Each entry holds the absolute 'path', 'enabled' (always True) and
    'is_valid', which is True when the expected marker exists inside the dir.
    """

    def describe(dir_key, marker):
        location = config[dir_key]
        return {
            'path': os.path.abspath(location),
            'enabled': True,
            'is_valid': os.path.exists(os.path.join(location, marker)),
        }

    return {
        'REPO_DIR': describe('REPO_DIR', 'archivebox'),
        'PYTHON_DIR': describe('PYTHON_DIR', '__main__.py'),
        'TEMPLATES_DIR': describe('TEMPLATES_DIR', 'static'),
    }
|
||||
|
||||
def get_external_locations(config: ConfigDict) -> ConfigValue:
    """Describe the optional external paths (Chrome user data dir, cookies file).

    Each entry holds the absolute 'path' (None if unset), 'enabled' (the
    feature flag and-ed with the configured path) and 'is_valid'.
    """

    def absolute_or_none(path):
        if path is None:
            return None
        return os.path.abspath(path)

    chrome_dir = config['CHROME_USER_DATA_DIR']
    cookies = config['COOKIES_FILE']

    if chrome_dir is None:
        chrome_dir_ok = False
    else:
        # a usable user data dir contains a "Default" profile folder
        chrome_dir_ok = os.path.exists(os.path.join(chrome_dir, 'Default'))

    if cookies is None:
        cookies_ok = False
    else:
        cookies_ok = os.path.exists(cookies)

    return {
        'CHROME_USER_DATA_DIR': {
            'path': absolute_or_none(chrome_dir),
            'enabled': config['USE_CHROME'] and chrome_dir,
            'is_valid': chrome_dir_ok,
        },
        'COOKIES_FILE': {
            'path': absolute_or_none(cookies),
            'enabled': config['USE_WGET'] and cookies,
            'is_valid': cookies_ok,
        },
    }
|
||||
|
||||
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """Describe the data folders and index files of the collection.

    Each entry holds the absolute 'path', 'enabled' (always True) and
    'is_valid' (whether the path exists). OUTPUT_DIR is the exception: it is
    valid only when it contains the JSON index file.
    """
    output_dir = config['OUTPUT_DIR']

    def entry(path, probe=None):
        # `probe` is the path whose existence decides validity (defaults to `path`)
        return {
            'path': os.path.abspath(path),
            'enabled': True,
            'is_valid': os.path.exists(path if probe is None else probe),
        }

    def index_entry(filename):
        return entry(os.path.join(output_dir, filename))

    return {
        'OUTPUT_DIR': entry(output_dir, probe=os.path.join(output_dir, JSON_INDEX_FILENAME)),
        'SOURCES_DIR': entry(config['SOURCES_DIR']),
        'LOGS_DIR': entry(config['LOGS_DIR']),
        'ARCHIVE_DIR': entry(config['ARCHIVE_DIR']),
        'CONFIG_FILE': entry(config['CONFIG_FILE']),
        'SQL_INDEX': index_entry(SQL_INDEX_FILENAME),
        'JSON_INDEX': index_entry(JSON_INDEX_FILENAME),
        'HTML_INDEX': index_entry(HTML_INDEX_FILENAME),
    }
|
||||
|
||||
def get_dependency_info(config: ConfigDict) -> ConfigValue:
    """Describe every binary dependency.

    Each entry holds the resolved 'path', the detected 'version', an md5
    'hash' of the binary, whether the dependency is 'enabled', and 'is_valid'
    (True when a version could be determined).
    """

    def describe(binary_key, version_key, enabled):
        # Every field is derived from the same binary/version key pair, which
        # rules out the copy-paste mix-ups the hand-written entries contained
        # (curl was hashed from PYTHON_BINARY, python was validated against
        # DJANGO_VERSION).
        return {
            'path': bin_path(config[binary_key]),
            'version': config[version_key],
            'hash': bin_hash(config[binary_key]),
            'enabled': enabled,
            'is_valid': bool(config[version_key]),
        }

    return {
        'PYTHON_BINARY': describe('PYTHON_BINARY', 'PYTHON_VERSION', True),
        'DJANGO_BINARY': describe('DJANGO_BINARY', 'DJANGO_VERSION', True),
        'CURL_BINARY': describe('CURL_BINARY', 'CURL_VERSION', config['USE_CURL']),
        'WGET_BINARY': describe('WGET_BINARY', 'WGET_VERSION', config['USE_WGET']),
        'GIT_BINARY': describe('GIT_BINARY', 'GIT_VERSION', config['USE_GIT']),
        'YOUTUBEDL_BINARY': describe('YOUTUBEDL_BINARY', 'YOUTUBEDL_VERSION', config['USE_YOUTUBEDL']),
        'CHROME_BINARY': describe('CHROME_BINARY', 'CHROME_VERSION', config['USE_CHROME']),
    }
|
||||
|
||||
def get_chrome_info(config: ConfigDict) -> ConfigValue:
    """Return the subset of `config` that holds the Chrome-related options."""
    chrome_keys = (
        'TIMEOUT',
        'RESOLUTION',
        'CHECK_SSL_VALIDITY',
        'CHROME_BINARY',
        'CHROME_HEADLESS',
        'CHROME_SANDBOX',
        'CHROME_USER_AGENT',
        'CHROME_USER_DATA_DIR',
    )
    return {name: config[name] for name in chrome_keys}
|
||||
|
||||
|
||||
################################## Load Config #################################
|
||||
|
||||
|
||||
def load_all_config():
    """Load every CONFIG_DEFAULTS section in order, then the derived config.

    Each section is loaded on top of the result of the previous ones, so later
    sections and the derived values can refer to options loaded earlier.
    """
    loaded: ConfigDict = {}
    for section_defaults in CONFIG_DEFAULTS.values():
        loaded = load_config(section_defaults, loaded)

    return load_config(DERIVED_CONFIG_DEFAULTS, loaded)
|
||||
|
||||
# Resolve the complete configuration once, when this module is imported.
CONFIG = load_all_config()
|
||||
# Copy every loaded config key into this module's global namespace.
globals().update(CONFIG)
|
||||
|
||||
|
||||
############################## Importable Checkers #############################
|
||||
|
||||
def check_system_config(config: ConfigDict=CONFIG) -> None:
    """Exit with status 2 when the host environment fails a basic check.

    Checks performed, in order:
      1. ``config['USER']`` is not ``root``.
      2. The running interpreter is at least Python 3.6.0.
      3. ``config['PYTHON_ENCODING']`` is ``UTF-8`` or ``UTF8``.
      4. If ``config['CHROME_USER_DATA_DIR']`` is set, a ``Default`` entry
         exists inside it.

    The first failing check prints its explanation through ``stderr()`` and
    raises ``SystemExit(2)``; when every check passes the function returns
    ``None``.
    """
    ### Check system environment
    running_as_root = config['USER'] == 'root'
    if running_as_root:
        stderr('[!] ArchiveBox should never be run as root!', color='red')
        stderr(' For more information, see the security overview documentation:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
        raise SystemExit(2)

    ### Check Python environment
    if (3, 6, 0) > sys.version_info[:3]:
        stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
        stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
        raise SystemExit(2)

    encoding_is_utf8 = config['PYTHON_ENCODING'] in ('UTF-8', 'UTF8')
    if not encoding_is_utf8:
        stderr(f'[X] Your system is running python3 scripts with a bad locale setting: {config["PYTHON_ENCODING"]} (it should be UTF-8).', color='red')
        stderr(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)')
        stderr(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"')
        stderr('')
        stderr(' Confirm that it\'s fixed by opening a new shell and running:')
        stderr(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8')
        raise SystemExit(2)

    # stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
    # stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
    ### Check the optional Chrome profile directory
    profile_dir = config['CHROME_USER_DATA_DIR']
    if profile_dir is None:
        return
    if os.path.exists(os.path.join(profile_dir, 'Default')):
        return

    stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
    stderr(f' {config["CHROME_USER_DATA_DIR"]}')
    stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
    stderr(' For more info see:')
    stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
    if 'Default' in profile_dir:
        # The configured path itself mentions "Default": suggest the parent.
        stderr()
        stderr(' Try removing /Default from the end e.g.:')
        stderr(' CHROME_USER_DATA_DIR="{}"'.format(profile_dir.split('/Default')[0]))
    raise SystemExit(2)
|
||||
|
||||
def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
    """Fail on broken enabled dependencies and warn about low timeouts.

    Every entry of ``config['DEPENDENCIES']`` that is ``enabled`` but not
    ``is_valid`` is reported through ``stderr()``; if there is at least one,
    ``SystemExit(2)`` is raised.  ``show_help`` controls whether the hint
    about ``archivebox --version`` is printed before exiting.

    Otherwise warnings (never fatal) are printed when ``TIMEOUT`` is below 5,
    when Chrome is enabled and ``TIMEOUT`` is below 15, and when youtube-dl
    is enabled and ``MEDIA_TIMEOUT`` is below 20.
    """
    broken = []
    for name, info in config['DEPENDENCIES'].items():
        if not info['enabled'] or info['is_valid']:
            continue
        broken.append(
            '{}: {} ({})'.format(name, info['path'] or 'unable to find binary', info['version'] or 'unable to detect version')
        )

    if broken:
        stderr('[X] Missing some required dependencies.', color='red')
        stderr()
        stderr(' {}'.format('\n '.join(broken)))
        if show_help:
            stderr()
            stderr(' To get more info on dependency status run:')
            stderr(' archivebox --version')
        raise SystemExit(2)

    timeout = config['TIMEOUT']
    if timeout < 5:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.')
        stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')

    elif config['USE_CHROME'] and timeout < 15:
        stderr()
        stderr(f'[!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={config["TIMEOUT"]} seconds)', color='red')
        stderr(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
        stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
        stderr()
        stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')

    media_timeout_too_low = config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20
    if media_timeout_too_low:
        stderr()
        stderr(f'[!] Warning: MEDIA_TIMEOUT is set too low! (currently set to MEDIA_TIMEOUT={config["MEDIA_TIMEOUT"]} seconds)', color='red')
        stderr(' Youtube-dl will fail to archive all media if set to less than ~20 seconds.')
        stderr(' (Setting it somewhere over 60 seconds is recommended)')
        stderr()
        stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
        stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
|
||||
|
||||
|
||||
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
    """Verify that a directory holds an up-to-date ArchiveBox collection.

    ``out_dir`` falls back to ``config['OUTPUT_DIR']`` when empty.

    Raises:
        SystemExit(2): no ``JSON_INDEX_FILENAME`` file exists in the folder.
        SystemExit(3): the ``SQL_INDEX_FILENAME`` file is missing, or
            ``list_migrations()`` reports migrations that are not applied.

    When both checks pass, the ``SOURCES_DIR_NAME`` sub-folder is created if
    it does not exist yet.
    """
    output_dir = out_dir or config['OUTPUT_DIR']
    assert isinstance(output_dir, str)

    json_index_path = os.path.join(output_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(json_index_path):
        stderr('[X] No archivebox index found in the current directory.', color='red')
        stderr(f' {output_dir}', color='lightyellow')
        stderr()
        stderr(' {lightred}Hint{reset}: Are you running archivebox in the right folder?'.format(**config['ANSI']))
        stderr(' cd path/to/your/archive/folder')
        stderr(' archivebox [command]')
        stderr()
        stderr(' {lightred}Hint{reset}: To create a new archive collection or import existing data in this folder, run:'.format(**config['ANSI']))
        stderr(' archivebox init')
        raise SystemExit(2)

    sql_index_exists = os.path.exists(os.path.join(output_dir, SQL_INDEX_FILENAME))
    from ..index.sql import list_migrations

    # list_migrations() yields (status, name) pairs; keep the unapplied names.
    pending_migrations = []
    for status, name in list_migrations():
        if not status:
            pending_migrations.append(name)

    needs_upgrade = (not sql_index_exists) or pending_migrations
    if needs_upgrade:
        pending_operation = (
            f'apply the {len(pending_migrations)} pending migrations'
            if sql_index_exists else
            'generate the new SQL main index'
        )

        stderr('[X] This collection was created with an older version of ArchiveBox and must be upgraded first.', color='lightyellow')
        stderr(f' {output_dir}')
        stderr()
        stderr(f' To upgrade it to the latest version and {pending_operation} run:')
        stderr(' archivebox init')
        raise SystemExit(3)

    sources_dir = os.path.join(output_dir, SOURCES_DIR_NAME)
    if not os.path.exists(sources_dir):
        os.makedirs(sources_dir)
|
||||
|
||||
|
||||
|
||||
def setup_django(out_dir: Optional[str]=None, check_db=False, config: ConfigDict=CONFIG) -> None:
    """Initialise Django for the collection stored in ``out_dir``.

    Args:
        out_dir: collection folder; falls back to ``config['OUTPUT_DIR']``.
        check_db: when true, assert that the ``SQL_INDEX_FILENAME`` file
            exists inside the collection folder after Django is set up.
        config: config dict used for the environment checks and for
            ``PYTHON_DIR`` / ``OUTPUT_DIR``.

    Raises:
        SystemExit(2): an environment check in ``check_system_config`` fails,
            or the setup is interrupted with Ctrl-C.
        AssertionError: ``check_db`` is set and the database file is missing.
    """
    # Check the config the caller passed in, not the module-level default.
    check_system_config(config)

    output_dir = out_dir or config['OUTPUT_DIR']

    assert isinstance(output_dir, str) and isinstance(config['PYTHON_DIR'], str)

    try:
        import django
        # Make `core.settings` importable; skip the append on repeated calls
        # so sys.path does not grow with duplicates.
        if config['PYTHON_DIR'] not in sys.path:
            sys.path.append(config['PYTHON_DIR'])
        # setdefault: values already present in the environment win.
        os.environ.setdefault('OUTPUT_DIR', output_dir)
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
        django.setup()

        if check_db:
            sql_index_path = os.path.join(output_dir, SQL_INDEX_FILENAME)
            # Report the directory that was actually checked (it differs from
            # config["OUTPUT_DIR"] whenever out_dir is passed explicitly).
            assert os.path.exists(sql_index_path), (
                f'No database file {SQL_INDEX_FILENAME} found in OUTPUT_DIR: {output_dir}')
    except KeyboardInterrupt:
        raise SystemExit(2)
|
||||
|
||||
# Set the process umask from OUTPUT_PERMISSIONS, which is parsed as an octal
# string (e.g. '755' gives a umask of 0o022).  The name is not defined
# literally in this file -- it arrives via globals().update(CONFIG) -- hence
# the F821 suppression.
os.umask(0o777 - int(OUTPUT_PERMISSIONS, base=8)) # noqa: F821
|
131
archivebox/config/stubs.py
Normal file
131
archivebox/config/stubs.py
Normal file
|
@ -0,0 +1,131 @@
|
|||
from typing import Optional, Dict, Union, Tuple, Callable, Pattern, Type, Any
|
||||
from mypy_extensions import TypedDict
|
||||
|
||||
|
||||
# A plain config value: a scalar, a compiled regex pattern, or a dict.
SimpleConfigValue = Union[str, bool, int, None, Pattern, Dict[str, Any]]
|
||||
# A dict of plain config values keyed by name.
SimpleConfigValueDict = Dict[str, SimpleConfigValue]
|
||||
# A zero-argument callable that returns a plain config value.
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
|
||||
# Anything a config entry may hold: a plain value, a dict of them, or a getter.
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
|
||||
|
||||
|
||||
class BaseConfig(TypedDict):
    """Empty TypedDict used as the base class of the config dict type."""
|
||||
|
||||
# Typed description of the flat config dictionary; total=False means no key
# is required to be present.  The docstring is read at runtime by the
# snippet it contains (it prints ConfigDict.__doc__), so treat its text as
# data rather than free-form documentation.
class ConfigDict(BaseConfig, total=False):
|
||||
"""
|
||||
# Regenerate by pasting this quine into `archivebox shell` 🥚
|
||||
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
|
||||
print('class ConfigDict(BaseConfig, total=False):')
|
||||
print(' ' + '"'*3 + ConfigDict.__doc__ + '"'*3)
|
||||
for section, configs in CONFIG_DEFAULTS.items():
|
||||
for key, attrs in configs.items():
|
||||
Type, default = attrs['type'], attrs['default']
|
||||
if default is None:
|
||||
print(f' {key}: Optional[{Type.__name__}]')
|
||||
else:
|
||||
print(f' {key}: {Type.__name__}')
|
||||
print()
|
||||
"""
|
||||
# NOTE(review): per the snippet above, generated fields come from
# CONFIG_DEFAULTS, one blank-line-separated group per section; the section
# names are not visible here.
IS_TTY: bool
|
||||
USE_COLOR: bool
|
||||
SHOW_PROGRESS: bool
|
||||
|
||||
OUTPUT_DIR: str
|
||||
CONFIG_FILE: str
|
||||
ONLY_NEW: bool
|
||||
TIMEOUT: int
|
||||
MEDIA_TIMEOUT: int
|
||||
OUTPUT_PERMISSIONS: str
|
||||
URL_BLACKLIST: Optional[str]
|
||||
|
||||
SECRET_KEY: str
|
||||
ALLOWED_HOSTS: str
|
||||
DEBUG: bool
|
||||
PUBLIC_INDEX: bool
|
||||
PUBLIC_SNAPSHOTS: bool
|
||||
FOOTER_INFO: str
|
||||
ACTIVE_THEME: str
|
||||
|
||||
SAVE_TITLE: bool
|
||||
SAVE_FAVICON: bool
|
||||
SAVE_WGET: bool
|
||||
SAVE_WGET_REQUISITES: bool
|
||||
SAVE_PDF: bool
|
||||
SAVE_SCREENSHOT: bool
|
||||
SAVE_DOM: bool
|
||||
SAVE_WARC: bool
|
||||
SAVE_GIT: bool
|
||||
SAVE_MEDIA: bool
|
||||
SAVE_PLAYLISTS: bool
|
||||
SAVE_ARCHIVE_DOT_ORG: bool
|
||||
|
||||
RESOLUTION: str
|
||||
GIT_DOMAINS: str
|
||||
CHECK_SSL_VALIDITY: bool
|
||||
CURL_USER_AGENT: str
|
||||
WGET_USER_AGENT: str
|
||||
CHROME_USER_AGENT: str
|
||||
COOKIES_FILE: Optional[str]
|
||||
CHROME_USER_DATA_DIR: Optional[str]
|
||||
CHROME_HEADLESS: bool
|
||||
CHROME_SANDBOX: bool
|
||||
|
||||
USE_CURL: bool
|
||||
USE_WGET: bool
|
||||
USE_GIT: bool
|
||||
USE_CHROME: bool
|
||||
USE_YOUTUBEDL: bool
|
||||
|
||||
CURL_BINARY: Optional[str]
|
||||
GIT_BINARY: Optional[str]
|
||||
WGET_BINARY: Optional[str]
|
||||
YOUTUBEDL_BINARY: Optional[str]
|
||||
CHROME_BINARY: Optional[str]
|
||||
|
||||
# NOTE(review): the fields from here on are presumably the derived values
# (DERIVED_CONFIG_DEFAULTS) maintained by hand rather than printed by the
# snippet -- confirm before regenerating.
TERM_WIDTH: Callable[[], int]
|
||||
USER: str
|
||||
ANSI: Dict[str, str]
|
||||
REPO_DIR: str
|
||||
PYTHON_DIR: str
|
||||
TEMPLATES_DIR: str
|
||||
ARCHIVE_DIR: str
|
||||
SOURCES_DIR: str
|
||||
LOGS_DIR: str
|
||||
|
||||
URL_BLACKLIST_PTN: Optional[Pattern]
|
||||
WGET_AUTO_COMPRESSION: bool
|
||||
|
||||
ARCHIVEBOX_BINARY: str
|
||||
VERSION: str
|
||||
GIT_SHA: str
|
||||
|
||||
PYTHON_BINARY: str
|
||||
PYTHON_ENCODING: str
|
||||
PYTHON_VERSION: str
|
||||
|
||||
DJANGO_BINARY: str
|
||||
DJANGO_VERSION: str
|
||||
|
||||
CURL_VERSION: str
|
||||
WGET_VERSION: str
|
||||
YOUTUBEDL_VERSION: str
|
||||
GIT_VERSION: str
|
||||
CHROME_VERSION: str
|
||||
|
||||
DEPENDENCIES: Dict[str, SimpleConfigValueDict]
|
||||
CODE_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
CONFIG_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
DATA_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
CHROME_OPTIONS: Dict[str, SimpleConfigValue]
|
||||
|
||||
|
||||
# A callable that computes a config value from the config dict it is given.
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
|
||||
# A default is either a literal config value or such a callable.
ConfigDefaultValue = Union[ConfigValue, ConfigDefaultValueGetter]
|
||||
|
||||
# Schema of one defaults entry: its default, its type and its alias names.
# total=False, so any of the three keys may be omitted.
ConfigDefault = TypedDict('ConfigDefault', {
|
||||
'default': ConfigDefaultValue,
|
||||
'type': Optional[Type],
|
||||
'aliases': Optional[Tuple[str, ...]],
|
||||
}, total=False)
|
||||
|
||||
# Maps a config key name to its defaults entry.
ConfigDefaultDict = Dict[str, ConfigDefault]
|
1
archivebox/core/__init__.py
Normal file
1
archivebox/core/__init__.py
Normal file
|
@ -0,0 +1 @@
|
|||
__package__ = 'archivebox.core'
|
200
archivebox/core/admin.py
Normal file
200
archivebox/core/admin.py
Normal file
|
@ -0,0 +1,200 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
from io import StringIO
|
||||
from contextlib import redirect_stdout
|
||||
from pathlib import Path
|
||||
|
||||
from django.contrib import admin
|
||||
from django.urls import path
|
||||
from django.utils.html import format_html
|
||||
from django.utils.safestring import mark_safe
|
||||
from django.shortcuts import render, redirect
|
||||
from django.contrib.auth import get_user_model
|
||||
|
||||
from core.models import Snapshot
|
||||
from core.forms import AddLinkForm
|
||||
|
||||
from util import htmldecode, urldecode, ansi_to_html
|
||||
from logging_util import printable_filesize
|
||||
from main import add, remove
|
||||
from config import OUTPUT_DIR
|
||||
from extractors import archive_links
|
||||
|
||||
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
|
||||
|
||||
def update_snapshots(modeladmin, request, queryset):
    """Admin action: run ``archive_links`` over the selected snapshots."""
    selected_links = [snapshot.as_link() for snapshot in queryset]
    archive_links(selected_links, out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Archive"
|
||||
|
||||
def update_titles(modeladmin, request, queryset):
    """Admin action: re-run only the ``title`` method, overwriting results."""
    selected_links = [snapshot.as_link() for snapshot in queryset]
    archive_links(selected_links, overwrite=True, methods=('title',), out_dir=OUTPUT_DIR)
update_titles.short_description = "Pull title"
|
||||
|
||||
def overwrite_snapshots(modeladmin, request, queryset):
    """Admin action: run ``archive_links`` with ``overwrite=True`` on the selection."""
    selected_links = [snapshot.as_link() for snapshot in queryset]
    archive_links(selected_links, overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Re-archive (overwrite)"
|
||||
|
||||
def verify_snapshots(modeladmin, request, queryset):
    """Admin action: print one summary line per selected snapshot to stdout."""
    for snapshot in queryset:
        summary = (
            snapshot.timestamp,
            snapshot.url,
            snapshot.is_archived,
            snapshot.archive_size,
            len(snapshot.history),
        )
        print(*summary)

verify_snapshots.short_description = "Check"
|
||||
|
||||
def delete_snapshots(modeladmin, request, queryset):
    """Admin action: pass the selected snapshots to ``remove`` with ``yes=True, delete=True``."""
    doomed_links = [snapshot.as_link() for snapshot in queryset]
    remove(links=doomed_links, yes=True, delete=True, out_dir=OUTPUT_DIR)

delete_snapshots.short_description = "Delete"
|
||||
|
||||
|
||||
class SnapshotAdmin(admin.ModelAdmin):
    """Admin list/detail configuration for ``Snapshot`` rows.

    The ``*_str``, ``files`` and ``size`` methods render HTML cells for the
    change list; every interpolated value goes through ``format_html``.
    """
    list_display = ('added', 'title_str', 'url_str', 'files', 'size', 'updated')
    sort_fields = ('title_str', 'url_str', 'added', 'updated')
    readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
    search_fields = ('url', 'timestamp', 'title', 'tags')
    fields = ('title', 'tags', *readonly_fields)
    list_filter = ('added', 'updated', 'tags')
    ordering = ['-added']
    actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
    actions_template = 'admin/actions_as_select.html'

    def id_str(self, obj):
        """Render the first 8 characters of the snapshot's URL hash."""
        short_hash = obj.url_hash[:8]
        return format_html('<code style="font-size: 10px">{}</code>', short_hash)

    def title_str(self, obj):
        """Render favicon, linked title and tag list for one snapshot."""
        canon = obj.as_link().canonical_outputs()

        # One escaped <span> per comma-separated tag (empty when no tags).
        if obj.tags:
            tags = ''.join(
                format_html('<span>{}</span>', tag.strip())
                for tag in obj.tags.split(',')
            )
        else:
            tags = ''

        archive_path = obj.archive_path
        favicon_path = canon['favicon_path']
        wget_path = canon['wget_path'] or ''
        status = 'fetched' if obj.latest_title or obj.title else 'pending'
        shown_title = urldecode(htmldecode(obj.latest_title or obj.title or ''))[:128] or 'Pending...'

        title_html = format_html(
            '<a href="/{}">'
            '<img src="/{}/{}" class="favicon" onerror="this.remove()">'
            '</a>'
            '<a href="/{}/{}">'
            '<b class="status-{}">{}</b>'
            '</a>',
            archive_path,
            archive_path, favicon_path,
            archive_path, wget_path,
            status,
            shown_title,
        )
        return title_html + mark_safe(f'<span class="tags">{tags}</span>')

    def files(self, obj):
        """Render one icon link per archive output, flagged by existence."""
        link = obj.as_link()
        canon = link.canonical_outputs()
        out_dir = Path(link.link_dir)

        def link_tuple(link, method):
            # (archive path, output path, whether that output exists on disk)
            archive_path = link.archive_path
            output = canon[method]
            return (archive_path, output, output and (out_dir / output).exists())

        return format_html(
            '<span class="files-icons" style="font-size: 1.2em; opacity: 0.8">'
            '<a href="/{}/{}/" class="exists-{}" title="Wget clone">🌐 </a> '
            '<a href="/{}/{}" class="exists-{}" title="PDF">📄</a> '
            '<a href="/{}/{}" class="exists-{}" title="Screenshot">🖥 </a> '
            '<a href="/{}/{}" class="exists-{}" title="HTML dump">🅷 </a> '
            '<a href="/{}/{}/" class="exists-{}" title="WARC">🆆 </a> '
            '<a href="/{}/{}/" class="exists-{}" title="Media files">📼 </a> '
            '<a href="/{}/{}/" class="exists-{}" title="Git repos">📦 </a> '
            '<a href="{}" class="exists-{}" title="Archive.org snapshot">🏛 </a> '
            '</span>',
            *link_tuple(link, 'wget_path'),
            *link_tuple(link, 'pdf_path'),
            *link_tuple(link, 'screenshot_path'),
            *link_tuple(link, 'dom_path'),
            # Directory outputs count as existing only when they contain files.
            *link_tuple(link, 'warc_path')[:2], any((out_dir / canon['warc_path']).glob('*.warc.gz')),
            *link_tuple(link, 'media_path')[:2], any((out_dir / canon['media_path']).glob('*')),
            *link_tuple(link, 'git_path')[:2], any((out_dir / canon['git_path']).glob('*')),
            canon['archive_org_path'], (out_dir / 'archive.org.txt').exists(),
        )

    def size(self, obj):
        """Render the archive size as a link to the snapshot folder."""
        archive_path = obj.archive_path
        if obj.archive_size:
            size_label = printable_filesize(obj.archive_size)
        else:
            size_label = 'pending'
        return format_html('<a href="/{}" title="View all files">{}</a>', archive_path, size_label)

    def url_str(self, obj):
        """Render the original URL as a link, without scheme/www, cut to 64 chars."""
        target = obj.url
        label = obj.url.split('://www.', 1)[-1].split('://', 1)[-1][:64]
        return format_html('<a href="{}">{}</a>', target, label)

    # Column headers shown in the change list.
    id_str.short_description = 'ID'
    title_str.short_description = 'Title'
    url_str.short_description = 'Original URL'

    # Model fields used when sorting by the computed columns.
    id_str.admin_order_field = 'id'
    title_str.admin_order_field = 'title'
    url_str.admin_order_field = 'url'
|
||||
|
||||
|
||||
|
||||
# Custom admin site: sets the ArchiveBox titles and adds an "Add URLs" page
# at core/snapshot/add/ ahead of the stock admin URLs.
class ArchiveBoxAdmin(admin.AdminSite):
|
||||
site_header = 'ArchiveBox'
|
||||
index_title = 'Links'
|
||||
site_title = 'Index'
|
||||
|
||||
# Prepend the custom route so it is matched before the default admin routes.
def get_urls(self):
|
||||
return [
|
||||
path('core/snapshot/add/', self.add_view, name='Add'),
|
||||
] + super().get_urls()
|
||||
|
||||
# GET renders an empty AddLinkForm; POST validates the form, runs add() with
# stdout captured, and re-renders the page with that output as HTML.
# NOTE(review): the route above passes self.add_view directly, so the only
# access check is the is_authenticated test below -- confirm that no
# staff/permission check is intended.
def add_view(self, request):
|
||||
if not request.user.is_authenticated:
|
||||
return redirect(f'/admin/login/?next={request.path}')
|
||||
|
||||
request.current_app = self.name
|
||||
context = {
|
||||
**self.each_context(request),
|
||||
'title': 'Add URLs',
|
||||
}
|
||||
|
||||
if request.method == 'GET':
|
||||
context['form'] = AddLinkForm()
|
||||
|
||||
elif request.method == 'POST':
|
||||
form = AddLinkForm(request.POST)
|
||||
if form.is_valid():
|
||||
url = form.cleaned_data["url"]
|
||||
print(f'[+] Adding URL: {url}')
|
||||
# Any submitted depth other than the string "0" is treated as depth 1.
depth = 0 if form.cleaned_data["depth"] == "0" else 1
|
||||
input_kwargs = {
|
||||
"urls": url,
|
||||
"depth": depth,
|
||||
"update_all": False,
|
||||
"out_dir": OUTPUT_DIR,
|
||||
}
|
||||
add_stdout = StringIO()
|
||||
with redirect_stdout(add_stdout):
|
||||
add(**input_kwargs)
|
||||
# NOTE(review): indentation is lost in this view of the file, so it is unclear
# whether this print runs inside the redirect_stdout block (where it would
# append the captured text to the buffer again) or after it -- confirm.
print(add_stdout.getvalue())
|
||||
|
||||
context.update({
|
||||
"stdout": ansi_to_html(add_stdout.getvalue().strip()),
|
||||
"form": AddLinkForm()
|
||||
})
|
||||
else:
|
||||
# Invalid submission: hand the bound form back to the template.
context["form"] = form
|
||||
|
||||
return render(template_name='add_links.html', request=request, context=context)
|
||||
|
||||
|
||||
# Replace Django's default admin site object with an ArchiveBoxAdmin instance,
# register the user model and Snapshot on it, and disable the built-in
# "delete_selected" bulk action.
admin.site = ArchiveBoxAdmin()
|
||||
admin.site.register(get_user_model())
|
||||
admin.site.register(Snapshot, SnapshotAdmin)
|
||||
admin.site.disable_action('delete_selected')
|
5
archivebox/core/apps.py
Normal file
5
archivebox/core/apps.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
from django.apps import AppConfig
|
||||
|
||||
|
||||
class CoreConfig(AppConfig):
    """Django application config for the ``core`` app."""

    name = 'core'
|
14
archivebox/core/forms.py
Normal file
14
archivebox/core/forms.py
Normal file
|
@ -0,0 +1,14 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
from django import forms
|
||||
|
||||
from ..util import URL_REGEX
|
||||
|
||||
# (value, label) pairs offered by the "Archive depth" radio buttons.
CHOICES = (
|
||||
('0', 'depth = 0 (archive just these URLs)'),
|
||||
('1', 'depth = 1 (archive these URLs and all URLs one hop away)'),
|
||||
)
|
||||
|
||||
class AddLinkForm(forms.Form):
    """Form behind the "Add URLs" page.

    Fields:
        url: free text entered in a textarea (labelled "URLs (one per
            line)"), validated against ``URL_REGEX``.
        depth: one of the ``CHOICES`` values, ``'0'`` by default.
    """
    url = forms.RegexField(
        label="URLs (one per line)",
        regex=URL_REGEX,
        # Django documents min_length as an integer; the previous value was
        # the string '6'.
        min_length=6,
        strip=True,
        widget=forms.Textarea,
        required=True,
    )
    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
|
18
archivebox/core/management/commands/archivebox.py
Normal file
18
archivebox/core/management/commands/archivebox.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
__package__ = 'archivebox'
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
|
||||
from .cli import run_subcommand
|
||||
|
||||
|
||||
class Command(BaseCommand):
    """Management command that forwards to an ArchiveBox CLI subcommand."""

    help = 'Run an ArchiveBox CLI subcommand (e.g. add, remove, list, etc)'

    def add_arguments(self, parser):
        """Declare the subcommand name followed by its optional arguments."""
        parser.add_argument('subcommand', type=str, help='The subcommand you want to run')
        parser.add_argument('command_args', nargs='*', help='Arguments to pass to the subcommand')

    def handle(self, *args, **kwargs):
        """Pass the parsed subcommand and its arguments to ``run_subcommand``."""
        subcommand = kwargs['subcommand']
        subcommand_args = kwargs['command_args']
        run_subcommand(subcommand, args=subcommand_args)
|
27
archivebox/core/migrations/0001_initial.py
Normal file
27
archivebox/core/migrations/0001_initial.py
Normal file
|
@ -0,0 +1,27 @@
|
|||
# Generated by Django 2.2 on 2019-05-01 03:27
|
||||
|
||||
from django.db import migrations, models
|
||||
import uuid
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Initial migration: creates the ``Snapshot`` model."""

    initial = True

    dependencies = []

    operations = [
        migrations.CreateModel(
            name='Snapshot',
            fields=[
                (
                    'id',
                    models.UUIDField(default=uuid.uuid4, editable=False, primary_key=True, serialize=False),
                ),
                (
                    'url',
                    models.URLField(unique=True),
                ),
                (
                    'timestamp',
                    models.CharField(default=None, max_length=32, null=True, unique=True),
                ),
                (
                    'title',
                    models.CharField(default=None, max_length=128, null=True),
                ),
                (
                    'tags',
                    models.CharField(default=None, max_length=256, null=True),
                ),
                (
                    'added',
                    models.DateTimeField(auto_now_add=True),
                ),
                (
                    'updated',
                    models.DateTimeField(default=None, null=True),
                ),
            ],
        ),
    ]
|
18
archivebox/core/migrations/0002_auto_20200625_1521.py
Normal file
18
archivebox/core/migrations/0002_auto_20200625_1521.py
Normal file
|
@ -0,0 +1,18 @@
|
|||
# Generated by Django 3.0.7 on 2020-06-25 15:21
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Redefines ``Snapshot.timestamp`` without the ``unique=True`` option."""

    dependencies = [('core', '0001_initial')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(
                default=None,
                max_length=32,
                null=True,
            ),
        ),
    ]
|
38
archivebox/core/migrations/0003_auto_20200630_1034.py
Normal file
38
archivebox/core/migrations/0003_auto_20200630_1034.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
# Generated by Django 3.0.7 on 2020-06-30 10:34
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Adds ``db_index=True`` to added, tags, timestamp, title and updated."""

    dependencies = [('core', '0002_auto_20200625_1521')]

    operations = [
        migrations.AlterField(
            model_name='snapshot', name='added',
            field=models.DateTimeField(auto_now_add=True, db_index=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='tags',
            field=models.CharField(db_index=True, default=None, max_length=256, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='timestamp',
            field=models.CharField(db_index=True, default=None, max_length=32, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='title',
            field=models.CharField(db_index=True, default=None, max_length=128, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='updated',
            field=models.DateTimeField(db_index=True, default=None, null=True),
        ),
    ]
|
19
archivebox/core/migrations/0004_auto_20200713_1552.py
Normal file
19
archivebox/core/migrations/0004_auto_20200713_1552.py
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Generated by Django 3.0.7 on 2020-07-13 15:52
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Redefines ``Snapshot.timestamp`` as unique and without ``null=True``."""

    dependencies = [('core', '0003_auto_20200630_1034')]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='timestamp',
            field=models.CharField(
                db_index=True,
                default=None,
                max_length=32,
                unique=True,
            ),
            preserve_default=False,
        ),
    ]
|
28
archivebox/core/migrations/0005_auto_20200728_0326.py
Normal file
28
archivebox/core/migrations/0005_auto_20200728_0326.py
Normal file
|
@ -0,0 +1,28 @@
|
|||
# Generated by Django 3.0.7 on 2020-07-28 03:26
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Sets ``blank=True`` on tags, title and updated (no explicit default)."""

    dependencies = [('core', '0004_auto_20200713_1552')]

    operations = [
        migrations.AlterField(
            model_name='snapshot', name='tags',
            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='title',
            field=models.CharField(blank=True, db_index=True, max_length=128, null=True),
        ),
        migrations.AlterField(
            model_name='snapshot', name='updated',
            field=models.DateTimeField(blank=True, db_index=True, null=True),
        ),
    ]
|
94
archivebox/core/models.py
Normal file
94
archivebox/core/models.py
Normal file
|
@ -0,0 +1,94 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
import uuid
|
||||
|
||||
from django.db import models
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
from ..util import parse_date
|
||||
from ..index.schema import Link
|
||||
|
||||
|
||||
class Snapshot(models.Model):
    """Database row for one archived URL.

    The cached properties below are all computed from the ``Link`` object
    returned by ``as_link()``.
    """
    id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False)

    url = models.URLField(unique=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True)

    title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
    tags = models.CharField(max_length=256, null=True, blank=True, db_index=True)

    added = models.DateTimeField(auto_now_add=True, db_index=True)
    updated = models.DateTimeField(null=True, blank=True, db_index=True)
    # bookmarked = models.DateTimeField()

    # Attribute names copied by from_json() / as_json().
    keys = ('url', 'timestamp', 'title', 'tags', 'updated')

    def __repr__(self) -> str:
        """Return ``[timestamp] url (title)`` with url and title cut to 64 chars."""
        title = self.title or '-'
        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

    def __str__(self) -> str:
        """Return the same text as ``__repr__``."""
        title = self.title or '-'
        return f'[{self.timestamp}] {self.url[:64]} ({title[:64]})'

    @classmethod
    def from_json(cls, info: dict):
        """Build an unsaved instance from the entries of ``info`` named in ``keys``."""
        known = {}
        for field, value in info.items():
            if field in cls.keys:
                known[field] = value
        return cls(**known)

    def as_json(self, *args) -> dict:
        """Return ``{name: attribute value}`` for ``args``, or for ``keys`` if none given."""
        wanted = args or self.keys
        return dict((key, getattr(self, key)) for key in wanted)

    def as_link(self) -> Link:
        """Convert this row into a ``Link`` via ``Link.from_json``."""
        payload = self.as_json()
        return Link.from_json(payload)

    @cached_property
    def bookmarked(self):
        """``timestamp`` run through ``parse_date``."""
        return parse_date(self.timestamp)

    @cached_property
    def is_archived(self):
        link = self.as_link()
        return link.is_archived

    @cached_property
    def num_outputs(self):
        link = self.as_link()
        return link.num_outputs

    @cached_property
    def url_hash(self):
        link = self.as_link()
        return link.url_hash

    @cached_property
    def base_url(self):
        link = self.as_link()
        return link.base_url

    @cached_property
    def link_dir(self):
        link = self.as_link()
        return link.link_dir

    @cached_property
    def archive_path(self):
        link = self.as_link()
        return link.archive_path

    @cached_property
    def archive_size(self):
        link = self.as_link()
        return link.archive_size

    @cached_property
    def history(self):
        """``history`` of the detailed link returned by ``load_link_details``."""
        from ..index import load_link_details
        detailed_link = load_link_details(self.as_link())
        return detailed_link.history

    @cached_property
    def latest_title(self):
        """Stripped output of the last ``title`` history entry, or ``None``.

        ``None`` is returned when there is no ``title`` history, when the
        last entry did not succeed, or when its output is blank.
        """
        if 'title' not in self.history or not self.history['title']:
            return None

        last_attempt = self.history['title'][-1]
        if last_attempt.status == 'succeeded' and last_attempt.output.strip():
            return last_attempt.output.strip()
        return None
|
127
archivebox/core/settings.py
Normal file
127
archivebox/core/settings.py
Normal file
|
@ -0,0 +1,127 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
import os
|
||||
import sys
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
|
||||
from ..config import ( # noqa: F401
|
||||
DEBUG,
|
||||
SECRET_KEY,
|
||||
ALLOWED_HOSTS,
|
||||
PYTHON_DIR,
|
||||
ACTIVE_THEME,
|
||||
SQL_INDEX_FILENAME,
|
||||
OUTPUT_DIR,
|
||||
)
|
||||
|
||||
# Django wants a list of hosts; the config value is one comma-separated string.
ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')

# True when `shell` or `shell_plus` is among the first three argv entries.
_leading_argv = sys.argv[:3]
IS_SHELL = 'shell' in _leading_argv or 'shell_plus' in _leading_argv

# Fall back to a random 50-character key, generated at import time, when no
# SECRET_KEY is configured.
if not SECRET_KEY:
    SECRET_KEY = get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')

INSTALLED_APPS = [
    'django.contrib.auth',
    'django.contrib.contenttypes',
    'django.contrib.sessions',
    'django.contrib.messages',
    'django.contrib.staticfiles',
    'django.contrib.admin',

    'core',

    'django_extensions',
]

MIDDLEWARE = [
    'django.middleware.security.SecurityMiddleware',
    'django.contrib.sessions.middleware.SessionMiddleware',
    'django.middleware.common.CommonMiddleware',
    'django.middleware.csrf.CsrfViewMiddleware',
    'django.contrib.auth.middleware.AuthenticationMiddleware',
    'django.contrib.messages.middleware.MessageMiddleware',
]

ROOT_URLCONF = 'core.urls'
APPEND_SLASH = True

# Template lookup order: active theme, then the default theme, then the
# themes folder itself.
_themes_dir = os.path.join(PYTHON_DIR, 'themes')
TEMPLATES = [
    {
        'BACKEND': 'django.template.backends.django.DjangoTemplates',
        'DIRS': [
            os.path.join(_themes_dir, ACTIVE_THEME),
            os.path.join(_themes_dir, 'default'),
            _themes_dir,
        ],
        'APP_DIRS': True,
        'OPTIONS': {
            'context_processors': [
                'django.template.context_processors.debug',
                'django.template.context_processors.request',
                'django.contrib.auth.context_processors.auth',
                'django.contrib.messages.context_processors.messages',
            ],
        },
    },
]

WSGI_APPLICATION = 'core.wsgi.application'

# The SQLite index file lives inside the collection's output folder.
DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME),
    }
}

AUTHENTICATION_BACKENDS = [
    'django.contrib.auth.backends.ModelBackend',
]
AUTH_PASSWORD_VALIDATORS = [
    {'NAME': 'django.contrib.auth.password_validation.' + validator}
    for validator in (
        'UserAttributeSimilarityValidator',
        'MinimumLengthValidator',
        'CommonPasswordValidator',
        'NumericPasswordValidator',
    )
]

################################################################################
### Security Settings
################################################################################
SECURE_BROWSER_XSS_FILTER = True
SECURE_CONTENT_TYPE_NOSNIFF = True
SESSION_COOKIE_SECURE = False
CSRF_COOKIE_SECURE = False
SESSION_COOKIE_DOMAIN = None
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = True
SESSION_COOKIE_AGE = 1209600 # 2 weeks
LOGIN_URL = '/accounts/login/'
LOGOUT_REDIRECT_URL = '/'
PASSWORD_RESET_URL = '/accounts/password_reset/'

################################################################################
### Shell Settings
################################################################################
SHELL_PLUS = 'ipython'
SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
    # Point PYTHONSTARTUP at the welcome-message script for shell sessions.
    os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py')

################################################################################
### Locale and Formatting Settings
################################################################################
LANGUAGE_CODE = 'en-us'
TIME_ZONE = 'UTC'
USE_I18N = False
USE_L10N = False
USE_TZ = False

DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'

# E-mails are written to the console instead of being sent.
EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'

# Static files: the active theme's folder is listed before the default theme's.
STATIC_URL = '/static/'
STATICFILES_DIRS = [
    os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'),
    os.path.join(PYTHON_DIR, 'themes', 'default', 'static'),
]
|
3
archivebox/core/tests.py
Normal file
3
archivebox/core/tests.py
Normal file
|
@ -0,0 +1,3 @@
|
|||
#from django.test import TestCase
|
||||
|
||||
# Create your tests here.
|
34
archivebox/core/urls.py
Normal file
34
archivebox/core/urls.py
Normal file
|
@ -0,0 +1,34 @@
|
|||
from django.contrib import admin
|
||||
|
||||
from django.urls import path, include
|
||||
from django.views import static
|
||||
from django.conf import settings
|
||||
from django.views.generic.base import RedirectView
|
||||
|
||||
from core.views import MainIndex, OldIndex, LinkDetails
|
||||
|
||||
|
||||
# print('DEBUG', settings.DEBUG)
|
||||
|
||||
# URL routing table for the ArchiveBox web UI.
urlpatterns = [
    # individual files served straight out of the archive output folder
    path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
    path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),

    # external documentation
    path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'),

    # archived snapshots and the "add" shortcut
    path('archive/', RedirectView.as_view(url='/')),
    path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
    path('add/', RedirectView.as_view(url='/admin/core/snapshot/add/')),

    # login/logout go through the admin site; these two are listed ahead of the
    # auth include below, so they are matched first for those paths
    path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
    path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),

    path('accounts/', include('django.contrib.auth.urls')),
    path('admin/', admin.site.urls),

    # index pages
    path('old.html', OldIndex.as_view(), name='OldHome'),
    path('index.html', RedirectView.as_view(url='/')),
    path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
    path('', MainIndex.as_view(), name='Home'),
]
|
104
archivebox/core/views.py
Normal file
104
archivebox/core/views.py
Normal file
|
@ -0,0 +1,104 @@
|
|||
__package__ = 'archivebox.core'
|
||||
|
||||
from django.shortcuts import render, redirect
|
||||
|
||||
from django.http import HttpResponse
|
||||
from django.views import View, static
|
||||
|
||||
from core.models import Snapshot
|
||||
|
||||
from ..index import load_main_index, load_main_index_meta
|
||||
from ..config import (
|
||||
OUTPUT_DIR,
|
||||
VERSION,
|
||||
FOOTER_INFO,
|
||||
PUBLIC_INDEX,
|
||||
PUBLIC_SNAPSHOTS,
|
||||
)
|
||||
from ..util import base_url
|
||||
|
||||
|
||||
class MainIndex(View):
    """Root URL handler: always answers with a redirect chosen by access level."""

    template = 'main_index.html'

    def get(self, request):
        """Redirect to the admin snapshot list, the public index, or the login page."""
        if request.user.is_authenticated:
            destination = '/admin/core/snapshot/'
        elif PUBLIC_INDEX:
            # anonymous visitors get the static-style index when it is public
            destination = 'OldHome'
        else:
            destination = f'/admin/login/?next={request.path}'

        return redirect(destination)
|
||||
|
||||
|
||||
|
||||
class OldIndex(View):
    """Renders the main index template from the on-disk main index."""

    template = 'main_index.html'

    def get(self, request):
        """Render the index for permitted visitors; redirect everyone else to the login page."""
        may_view = PUBLIC_INDEX or request.user.is_authenticated
        if not may_view:
            return redirect(f'/admin/login/?next={request.path}')

        links = load_main_index(out_dir=OUTPUT_DIR)
        meta = load_main_index_meta(out_dir=OUTPUT_DIR)

        context = {
            'updated': meta['updated'],
            'num_links': meta['num_links'],
            'links': links,
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
        }
        return render(request, self.template, context)
|
||||
|
||||
|
||||
class LinkDetails(View):
    """Serves files from a snapshot folder, resolving the slug as a timestamp, a url hash, or a URL."""

    def get(self, request, path):
        """Resolve `path` ("<slug>/<file>") to a snapshot and serve or redirect.

        Resolution order: snapshot timestamp (file is served directly), url hash
        (redirect to the timestamp URL), base URL (redirect to that snapshot's
        index.html). Responds with a plain-text 404 when nothing matches.
        """
        # a bare slug with no file part -> redirect to that snapshot's index page
        if '/' not in path:
            return redirect(f'{path}/index.html')

        if not (request.user.is_authenticated or PUBLIC_SNAPSHOTS):
            return redirect(f'/admin/login/?next={request.path}')

        try:
            slug, archivefile = path.split('/', 1)
        except (IndexError, ValueError):
            slug, archivefile = path.split('/', 1)[0], 'index.html'

        # every lookup below works on this in-memory list of all snapshots
        snapshots = list(Snapshot.objects.all())

        # 1. slug is a timestamp
        by_ts = {snap.timestamp: snap for snap in snapshots}
        try:
            # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
            snapshot = by_ts[slug]
            response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
            response["Link"] = f'<{snapshot.url}>; rel="canonical"'
            return response
        except KeyError:
            pass

        # 2. slug is a hash
        by_hash = {snap.url_hash: snap for snap in snapshots}
        try:
            timestamp = by_hash[slug].timestamp
            return redirect(f'/archive/{timestamp}/{archivefile}')
        except KeyError:
            pass

        # 3. slug is a URL
        # TODO: add multiple snapshot support by showing index of all snapshots
        # for given url instead of redirecting to timestamp index
        by_url = {snap.base_url: snap for snap in snapshots}
        try:
            timestamp = by_url[base_url(path)].timestamp
            return redirect(f'/archive/{timestamp}/index.html')
        except KeyError:
            pass

        return HttpResponse(
            'No archived link matches the given timestamp or hash.',
            content_type="text/plain",
            status=404,
        )
|
5
archivebox/core/welcome_message.py
Normal file
5
archivebox/core/welcome_message.py
Normal file
|
@ -0,0 +1,5 @@
|
|||
from archivebox.logging_util import log_shell_welcome_msg
|
||||
|
||||
|
||||
# Print the shell welcome message only when this file is run as the main
# script, not when it is imported.
if __name__ == '__main__':
    log_shell_welcome_msg()
|
16
archivebox/core/wsgi.py
Normal file
16
archivebox/core/wsgi.py
Normal file
|
@ -0,0 +1,16 @@
|
|||
"""
|
||||
WSGI config for archivebox project.
|
||||
|
||||
It exposes the WSGI callable as a module-level variable named ``application``.
|
||||
|
||||
For more information on this file, see
|
||||
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
from django.core.wsgi import get_wsgi_application
|
||||
|
||||
# Only sets the settings module when the environment has not already chosen one.
# NOTE(review): other modules in this package import the Django app as `core`
# (e.g. `from core.models import Snapshot`) — confirm that 'archivebox.settings'
# is the correct dotted path for the settings module.
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')

# module-level WSGI callable
application = get_wsgi_application()
|
143
archivebox/extractors/__init__.py
Normal file
143
archivebox/extractors/__init__.py
Normal file
|
@ -0,0 +1,143 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional, List, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..index import (
|
||||
load_link_details,
|
||||
write_link_details,
|
||||
patch_main_index,
|
||||
)
|
||||
from ..util import enforce_types
|
||||
from ..logging_util import (
|
||||
log_archiving_started,
|
||||
log_archiving_paused,
|
||||
log_archiving_finished,
|
||||
log_link_archiving_started,
|
||||
log_link_archiving_finished,
|
||||
log_archive_method_started,
|
||||
log_archive_method_finished,
|
||||
)
|
||||
|
||||
from .title import should_save_title, save_title
|
||||
from .favicon import should_save_favicon, save_favicon
|
||||
from .wget import should_save_wget, save_wget
|
||||
from .pdf import should_save_pdf, save_pdf
|
||||
from .screenshot import should_save_screenshot, save_screenshot
|
||||
from .dom import should_save_dom, save_dom
|
||||
from .git import should_save_git, save_git
|
||||
from .media import should_save_media, save_media
|
||||
from .archive_org import should_save_archive_dot_org, save_archive_dot_org
|
||||
|
||||
|
||||
@enforce_types
def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> Link:
    """run the archive methods for a single link and save the results to its index

    Args:
        link: the Link to archive.
        overwrite: when True, run every selected method even if its should_save_*
            check says it can be skipped.
        methods: optional iterable of method names ('title', 'favicon', 'wget',
            'pdf', 'screenshot', 'dom', 'git', 'media', 'archive_org') to restrict
            the run to; None runs all of them.
        out_dir: folder the methods write into; defaults to link.link_dir.

    Returns:
        The updated Link, with each new result appended to link.history.

    Raises:
        Exception: a failure inside any method is re-raised wrapped in an
            Exception naming the method and the link's URL.
        KeyboardInterrupt: re-raised after a best-effort save of the link details.
    """

    ARCHIVE_METHODS = [
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]
    if methods is not None:
        # `methods` holds method *names*, so compare against the name in slot 0
        # (comparing the should_save_* function in slot 1 never matches a str).
        # Materialize it once so a one-shot iterable can be checked repeatedly.
        requested_methods = set(methods)
        ARCHIVE_METHODS = [
            method for method in ARCHIVE_METHODS
            if method[0] in requested_methods
        ]

    out_dir = out_dir or link.link_dir
    try:
        is_new = not os.path.exists(out_dir)
        if is_new:
            os.makedirs(out_dir)

        link = load_link_details(link, out_dir=out_dir)
        write_link_details(link, out_dir=link.link_dir)
        log_link_archiving_started(link, out_dir, is_new)
        link = link.overwrite(updated=datetime.now())
        stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}

        for method_name, should_run, method_function in ARCHIVE_METHODS:
            try:
                if method_name not in link.history:
                    link.history[method_name] = []

                if should_run(link, out_dir) or overwrite:
                    log_archive_method_started(method_name)

                    result = method_function(link=link, out_dir=out_dir)

                    link.history[method_name].append(result)

                    stats[result.status] += 1
                    log_archive_method_finished(result)
                else:
                    stats['skipped'] += 1
            except Exception as e:
                raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                    method_name,
                    link.url,
                )) from e

        # print(' ', stats)

        try:
            # prefer the freshly fetched title when it is at least as long as the stored one
            latest_title = link.history['title'][-1].output.strip()
            if latest_title and len(latest_title) >= len(link.title or ''):
                link = link.overwrite(title=latest_title)
        except Exception:
            # no usable title result (none recorded, or its output is not a string)
            pass

        write_link_details(link, out_dir=link.link_dir)
        patch_main_index(link)

        # # If any changes were made, update the main links index json and html
        # was_changed = stats['succeeded'] or stats['failed']
        # if was_changed:
        #     patch_main_index(link)

        log_link_archiving_finished(link, link.link_dir, is_new, stats)

    except KeyboardInterrupt:
        try:
            write_link_details(link, out_dir=link.link_dir)
        except BaseException:
            # best-effort save only; the original interrupt is re-raised below
            pass
        raise

    except Exception as err:
        print(' ! Failed to archive link: {}: {}'.format(err.__class__.__name__, err))
        raise

    return link
|
||||
|
||||
|
||||
@enforce_types
def archive_links(links: List[Link], overwrite: bool=False, methods: Optional[Iterable[str]]=None, out_dir: Optional[str]=None) -> List[Link]:
    """archive each link in order, logging overall progress

    Every link is archived into its own link.link_dir; the out_dir argument is
    accepted but not used here. An empty input returns [] without logging.
    A KeyboardInterrupt logs where the run stopped and exits with status 0.
    """
    if not links:
        return []

    total = len(links)
    log_archiving_started(total)

    # pre-seeded so the interrupt handler below always has a position to report
    idx: int = 0
    link: Link = links[0]
    try:
        for idx, link in enumerate(links):
            archive_link(link, overwrite=overwrite, methods=methods, out_dir=link.link_dir)
    except KeyboardInterrupt:
        log_archiving_paused(total, idx, link.timestamp)
        raise SystemExit(0)
    except BaseException:
        print()
        raise

    log_archiving_finished(total)
    return links
|
113
archivebox/extractors/archive_org.py
Normal file
113
archivebox/extractors/archive_org.py
Normal file
|
@ -0,0 +1,113 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional, List, Dict, Tuple
|
||||
from collections import defaultdict
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_ARCHIVE_DOT_ORG,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
||||
@enforce_types
def should_save_archive_dot_org(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_ARCHIVE_DOT_ORG unless the link is a static file or archive.org.txt already exists"""
    target_dir = out_dir or link.link_dir

    # if open(path, 'r').read().strip() != 'None':
    already_saved_path = os.path.join(target_dir, 'archive.org.txt')
    if is_static_file(link.url) or os.path.exists(already_saved_path):
        return False

    return SAVE_ARCHIVE_DOT_ORG
|
||||
|
||||
@enforce_types
def save_archive_dot_org(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """submit the link to web.archive.org/save with curl and record the resulting archive url

    On success the url is written to archive.org.txt inside out_dir and returned
    as the result's output; on failure the raised exception becomes the output
    and the status is 'failed'.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'archive.org.txt'
    archive_org_url = None
    submit_url = 'https://web.archive.org/save/{}'.format(link.url)

    user_agent_args = ['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []
    ssl_args = [] if CHECK_SSL_VALIDITY else ['--insecure']
    cmd = [
        CURL_BINARY,
        '--silent',
        '--location',
        '--head',
        '--compressed',
        '--max-time', str(timeout),
        *user_agent_args,
        *ssl_args,
        submit_url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)
        content_location, errors = parse_archive_dot_org_response(result.stdout)

        if content_location:
            archive_org_url = 'https://web.archive.org{}'.format(content_location[0])
        else:
            denied_by_robots = len(errors) == 1 and 'RobotAccessControlException' in errors[0]
            if denied_by_robots:
                # not treated as a failure; the submit url gets saved below instead
                # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
                archive_org_url = None
            elif errors:
                raise ArchiveError(', '.join(errors))
            else:
                raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    if output and not isinstance(output, Exception):
        # instead of writing None when archive.org rejects the url write the
        # url to resubmit it to archive.org. This is so when the user visits
        # the URL in person, it will attempt to re-archive it, and it'll show the
        # nicer error message explaining why the url was rejected if it fails.
        archive_org_url = archive_org_url or submit_url
        txt_path = os.path.join(out_dir, str(output))
        with open(txt_path, 'w', encoding='utf-8') as f:
            f.write(archive_org_url)
        chmod_file('archive.org.txt', cwd=out_dir)
        output = archive_org_url

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
||||
|
||||
@enforce_types
|
||||
def parse_archive_dot_org_response(response: bytes) -> Tuple[List[str], List[str]]:
    """extract the archived-url header values and any runtime errors from raw response headers

    Args:
        response: raw header bytes, one "Name: value" header per line.

    Returns:
        (content_location, errors): the values of every "content-location"
        header (falling back to the "location" header values when there is
        none) and the values of every "x-archive-wayback-runtime-error" header.
        Header names are matched case-insensitively.
    """
    headers: Dict[str, List[str]] = defaultdict(list)

    # collect header values under their lowercased, stripped names
    for raw_line in response.splitlines():
        if b':' in raw_line and raw_line.strip():
            key, _, value = raw_line.decode().partition(':')
            headers[key.lower().strip()].append(value.strip())

    # the successful archive url is in "content-location", otherwise "location"
    location = headers['location']
    content_location = headers['content-location'] if 'content-location' in headers else location
    errors = headers['x-archive-wayback-runtime-error']
    return content_location, errors
|
||||
|
70
archivebox/extractors/dom.py
Normal file
70
archivebox/extractors/dom.py
Normal file
|
@ -0,0 +1,70 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file, atomic_write
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_DOM,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
||||
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_DOM unless the link is a static file or output.html already exists"""
    target_dir = out_dir or link.link_dir

    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'output.html')):
        return False

    return SAVE_DOM
|
||||
|
||||
@enforce_types
def save_dom(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """write the page's DOM to output.html using chrome --dump-dom

    The command's stdout is written to output.html before the exit code is
    checked, so the file is written even when the run is reported as failed.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.html'
    output_path = os.path.join(out_dir, str(output))
    cmd = [*chrome_args(TIMEOUT=timeout), '--dump-dom', link.url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)
        atomic_write(output_path, result.stdout)

        if result.returncode:
            hints = result.stderr.decode()
            raise ArchiveError('Failed to save DOM', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
65
archivebox/extractors/favicon.py
Normal file
65
archivebox/extractors/favicon.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from ..system import chmod_file, run
|
||||
from ..util import enforce_types, domain
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_FAVICON,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CHECK_SSL_VALIDITY,
|
||||
CURL_USER_AGENT,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_FAVICON unless favicon.ico already exists in the output folder"""
    target_dir = out_dir or link.link_dir
    favicon_exists = os.path.exists(os.path.join(target_dir, 'favicon.ico'))

    return False if favicon_exists else SAVE_FAVICON
|
||||
|
||||
@enforce_types
def save_favicon(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """fetch the favicon for the link's domain from google's favicon api with curl

    curl writes the response to favicon.ico inside out_dir. The result's status
    is 'succeeded' once curl and the chmod both return without raising, and
    'failed' (with the exception as output) otherwise.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'favicon.ico'

    user_agent_args = ['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []
    ssl_args = [] if CHECK_SSL_VALIDITY else ['--insecure']
    favicon_url = 'https://www.google.com/s2/favicons?domain={}'.format(domain(link.url))
    cmd = [
        CURL_BINARY,
        '--silent',
        '--max-time', str(timeout),
        '--location',
        '--compressed',
        '--output', str(output),
        *user_agent_args,
        *ssl_args,
        favicon_url,
    ]

    status = 'pending'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        run(cmd, cwd=out_dir, timeout=timeout)
        chmod_file(output, cwd=out_dir)
        status = 'succeeded'
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
89
archivebox/extractors/git.py
Normal file
89
archivebox/extractors/git.py
Normal file
|
@ -0,0 +1,89 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
domain,
|
||||
extension,
|
||||
without_query,
|
||||
without_fragment,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_GIT,
|
||||
GIT_BINARY,
|
||||
GIT_VERSION,
|
||||
GIT_DOMAINS,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
||||
@enforce_types
def should_save_git(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_GIT for clonable urls that have no existing git folder

    A url counts as clonable when its domain is in GIT_DOMAINS or its extension
    is 'git'. Static files and links with an existing git folder are skipped.
    """
    target_dir = out_dir or link.link_dir

    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'git')):
        return False

    is_clonable_url = domain(link.url) in GIT_DOMAINS or extension(link.url) == 'git'
    return SAVE_GIT if is_clonable_url else False
|
||||
|
||||
|
||||
@enforce_types
def save_git(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """clone the link's repository with git clone --recursive, run from the git/ subfolder

    The url is cloned with its query string and fragment removed. Exit code 128
    is ignored; any other positive exit code marks the result as failed.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'git'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)

    ssl_args = [] if CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']
    clone_url = without_query(without_fragment(link.url))
    cmd = [GIT_BINARY, 'clone', '--recursive', *ssl_args, clone_url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=output_path, timeout=timeout + 1)

        # 128 means: ignore failed re-download when the folder already exists
        if result.returncode != 128 and result.returncode > 0:
            hints = 'Got git response code: {}.'.format(result.returncode)
            raise ArchiveError('Failed to save git clone', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=GIT_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
98
archivebox/extractors/media.py
Normal file
98
archivebox/extractors/media.py
Normal file
|
@ -0,0 +1,98 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
from ..config import (
|
||||
MEDIA_TIMEOUT,
|
||||
SAVE_MEDIA,
|
||||
SAVE_PLAYLISTS,
|
||||
YOUTUBEDL_BINARY,
|
||||
YOUTUBEDL_VERSION,
|
||||
CHECK_SSL_VALIDITY
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
def should_save_media(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_MEDIA unless the link is a static file or a media folder already exists"""
    target_dir = out_dir or link.link_dir

    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'media')):
        return False

    return SAVE_MEDIA
|
||||
|
||||
@enforce_types
def save_media(link: Link, out_dir: Optional[str]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
    """Download playlists or individual video, audio, and subtitles using youtube-dl

    Args:
        link: the Link whose url is handed to youtube-dl.
        out_dir: snapshot folder; downloads go into its media/ subfolder.
            Defaults to link.link_dir.
        timeout: seconds allowed for the download (the subprocess gets timeout + 1).

    Returns:
        An ArchiveResult whose output is 'media' on success, or the raised
        exception (with status 'failed') otherwise. A non-zero youtube-dl exit
        code is still reported as success when stderr shows one of the known
        "nothing to download here" messages.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'media'
    output_path = os.path.join(out_dir, str(output))
    os.makedirs(output_path, exist_ok=True)
    cmd = [
        YOUTUBEDL_BINARY,
        '--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',
        '--no-call-home',
        # No bare '--user-agent' here: that option takes a value, so without one
        # it consumed the next flag ('--all-subs') as the user-agent string.
        '--all-subs',
        '--extract-audio',
        '--keep-video',
        '--ignore-errors',
        '--geo-bypass',
        '--audio-format', 'mp3',
        '--audio-quality', '320K',
        '--embed-thumbnail',
        '--add-metadata',
        *(['--yes-playlist'] if SAVE_PLAYLISTS else []),
        # certificate checks are disabled only when CHECK_SSL_VALIDITY is off
        # (previously the flag was also passed unconditionally)
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
        link.url,
    ]
    # stderr markers that occur too often on non-media pages to count as failures
    ignorable_errors = (
        b'ERROR: Unsupported URL',
        b'HTTP Error 404',
        b'HTTP Error 403',
        b'URL could be a direct video link',
        b'Unable to extract container ID',
    )
    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=output_path, timeout=timeout + 1)
        chmod_file(output, cwd=out_dir)
        if result.returncode:
            if any(marker in result.stderr for marker in ignorable_errors):
                # These happen too frequently on non-media pages to warrant printing to console
                pass
            else:
                hints = (
                    'Got youtube-dl response code: {}.'.format(result.returncode),
                    *result.stderr.decode().split('\n'),
                )
                raise ArchiveError('Failed to save media', hints)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=YOUTUBEDL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
69
archivebox/extractors/pdf.py
Normal file
69
archivebox/extractors/pdf.py
Normal file
|
@ -0,0 +1,69 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_PDF,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_PDF unless the link is a static file or output.pdf already exists"""
    target_dir = out_dir or link.link_dir

    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'output.pdf')):
        return False

    return SAVE_PDF
|
||||
|
||||
|
||||
@enforce_types
def save_pdf(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """print the page to output.pdf using chrome --print-to-pdf

    A non-zero exit code marks the result as failed, with stderr (or stdout
    when stderr is empty) attached as hints.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'output.pdf'
    cmd = [*chrome_args(TIMEOUT=timeout), '--print-to-pdf', link.url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to save PDF', hints)

        chmod_file('output.pdf', cwd=out_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
68
archivebox/extractors/screenshot.py
Normal file
68
archivebox/extractors/screenshot.py
Normal file
|
@ -0,0 +1,68 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
chrome_args,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SCREENSHOT,
|
||||
CHROME_VERSION,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
||||
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_SCREENSHOT unless the link is a static file or screenshot.png already exists"""
    target_dir = out_dir or link.link_dir

    if is_static_file(link.url) or os.path.exists(os.path.join(target_dir, 'screenshot.png')):
        return False

    return SAVE_SCREENSHOT
|
||||
|
||||
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """capture the page to screenshot.png using chrome --screenshot

    A non-zero exit code marks the result as failed, with stderr (or stdout
    when stderr is empty) attached as hints.
    """

    out_dir = out_dir or link.link_dir
    output: ArchiveOutput = 'screenshot.png'
    cmd = [*chrome_args(TIMEOUT=timeout), '--screenshot', link.url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)

        if result.returncode:
            hints = (result.stderr or result.stdout).decode()
            raise ArchiveError('Failed to save screenshot', hints)

        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CHROME_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
85
archivebox/extractors/title.py
Normal file
85
archivebox/extractors/title.py
Normal file
|
@ -0,0 +1,85 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
download_url,
|
||||
htmldecode,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_TITLE,
|
||||
CURL_BINARY,
|
||||
CURL_VERSION,
|
||||
CURL_USER_AGENT,
|
||||
setup_django,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
# Matches the text content of the first non-empty <title> element; group(1) is
# the raw title text (surrounding whitespace is left for the caller to strip).
#  - the opening tag is matched with [^>]* so it cannot run past its own '>'
#    (a lazy '.*?' under DOTALL could backtrack across later tags)
#  - the capture group excludes '<' and '>' from its first character on, so an
#    empty <title></title> no longer yields '</title', and a one-character
#    title is matched
HTML_TITLE_REGEX = re.compile(
    r'<title(?:\s[^>]*)?>'                # opening <title> tag, with optional attributes
    r'([^<>]+)',                          # get everything up to these symbols
    re.IGNORECASE | re.MULTILINE | re.DOTALL | re.UNICODE,
)
|
||||
|
||||
|
||||
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None) -> bool:
    """return SAVE_TITLE unless the link already has a real title or is a static file

    A stored title that starts with 'http' (case-insensitively) does not count
    as a real title. out_dir is not used by this check.
    """
    # if link already has valid title, skip it
    has_valid_title = bool(link.title) and not link.title.lower().startswith('http')

    if has_valid_title or is_static_file(link.url):
        return False

    return SAVE_TITLE
|
||||
|
||||
@enforce_types
def save_title(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download the page and pull its title out of the html with HTML_TITLE_REGEX

    When a title is found and it is at least as long as the link's current
    title, the matching Snapshot row is updated with it. The page is fetched
    with download_url(); the curl command list is only recorded on the result.
    """

    setup_django(out_dir=out_dir)
    from core.models import Snapshot

    output: ArchiveOutput = None
    user_agent_args = ['--user-agent', '{}'.format(CURL_USER_AGENT)] if CURL_USER_AGENT else []
    ssl_args = [] if CHECK_SSL_VALIDITY else ['--insecure']
    cmd = [
        CURL_BINARY,
        '--silent',
        '--max-time', str(timeout),
        '--location',
        '--compressed',
        *user_agent_args,
        *ssl_args,
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        html = download_url(link.url, timeout=timeout)
        match = HTML_TITLE_REGEX.search(html)
        if match:
            output = htmldecode(match.group(1).strip())
        else:
            output = None

        if not output:
            raise ArchiveError('Unable to detect page title')

        # only replace the stored title when the new one is not shorter
        if not link.title or len(output) >= len(link.title):
            Snapshot.objects.filter(url=link.url, timestamp=link.timestamp).update(title=output)
    except Exception as err:
        status, output = 'failed', err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=CURL_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
195
archivebox/extractors/wget.py
Normal file
195
archivebox/extractors/wget.py
Normal file
|
@ -0,0 +1,195 @@
|
|||
__package__ = 'archivebox.extractors'
|
||||
|
||||
import os
|
||||
import re
|
||||
|
||||
from typing import Optional
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..system import run, chmod_file
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
without_scheme,
|
||||
without_fragment,
|
||||
without_query,
|
||||
path,
|
||||
domain,
|
||||
urldecode,
|
||||
)
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_WGET,
|
||||
SAVE_WARC,
|
||||
WGET_BINARY,
|
||||
WGET_VERSION,
|
||||
RESTRICT_FILE_NAMES,
|
||||
CHECK_SSL_VALIDITY,
|
||||
SAVE_WGET_REQUISITES,
|
||||
WGET_AUTO_COMPRESSION,
|
||||
WGET_USER_AGENT,
|
||||
COOKIES_FILE,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
@enforce_types
def should_save_wget(link: Link, out_dir: Optional[str]=None) -> bool:
    """Decide whether the wget extractor should run for `link`.

    Returns False when wget_output_path() resolves to a file that already
    exists under `out_dir` (defaults to `link.link_dir`); otherwise returns
    the SAVE_WGET config value.
    """
    existing_output = wget_output_path(link)
    target_dir = out_dir or link.link_dir

    already_saved = bool(existing_output) and os.path.exists(os.path.join(target_dir, existing_output))
    return False if already_saved else SAVE_WGET
|
||||
|
||||
|
||||
@enforce_types
def save_wget(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOUT) -> ArchiveResult:
    """download full site using wget

    Runs wget with `out_dir` (defaults to `link.link_dir`) as the working
    directory, optionally writing a WARC under `warc/`, then inspects the
    tail of wget's output to decide whether the download worked.

    Returns an ArchiveResult whose `output` is the relative path returned by
    wget_output_path() on success, or the caught exception on failure.
    """

    out_dir = out_dir or link.link_dir
    if SAVE_WARC:
        warc_dir = os.path.join(out_dir, 'warc')
        os.makedirs(warc_dir, exist_ok=True)
        # relative to out_dir, because wget runs with cwd=out_dir
        warc_path = os.path.join('warc', str(int(datetime.now().timestamp())))

    # WGET CLI Docs: https://www.gnu.org/software/wget/manual/wget.html
    output: ArchiveOutput = None
    cmd = [
        WGET_BINARY,
        # '--server-response',  # print headers for better error parsing
        '--no-verbose',
        '--adjust-extension',
        '--convert-links',
        '--force-directories',
        '--backup-converted',
        '--span-hosts',
        '--no-parent',
        '-e', 'robots=off',
        '--timeout={}'.format(timeout),
        *(['--restrict-file-names={}'.format(RESTRICT_FILE_NAMES)] if RESTRICT_FILE_NAMES else []),
        *(['--warc-file={}'.format(warc_path)] if SAVE_WARC else []),
        *(['--page-requisites'] if SAVE_WGET_REQUISITES else []),
        *(['--user-agent={}'.format(WGET_USER_AGENT)] if WGET_USER_AGENT else []),
        *(['--load-cookies', COOKIES_FILE] if COOKIES_FILE else []),
        *(['--compression=auto'] if WGET_AUTO_COMPRESSION else []),
        *([] if SAVE_WARC else ['--timestamping']),
        *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate', '--no-hsts']),
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=out_dir, timeout=timeout)
        output = wget_output_path(link)

        # parse out number of files downloaded from last line of stderr:
        #  "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
        output_tail = [
            line.strip()
            for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
            if line.strip()
        ]
        # wget may print nothing at all; don't let that surface as an IndexError
        last_line = output_tail[-1] if output_tail else ''
        files_downloaded = (
            int(last_line.split(' ', 2)[1] or 0)
            if 'Downloaded:' in last_line
            else 0
        )
        hints = (
            'Got wget response code: {}.'.format(result.returncode),
            *output_tail,
        )

        # Check for common failure cases
        if (result.returncode > 0 and files_downloaded < 1) or output is None:
            if b'403: Forbidden' in result.stderr:
                raise ArchiveError('403 Forbidden (try changing WGET_USER_AGENT)', hints)
            if b'404: Not Found' in result.stderr:
                raise ArchiveError('404 Not Found', hints)
            if b'ERROR 500: Internal Server Error' in result.stderr:
                raise ArchiveError('500 Internal Server Error', hints)
            raise ArchiveError('Wget failed or got an error from the server', hints)
        chmod_file(output, cwd=out_dir)
    except Exception as err:
        status = 'failed'
        output = err
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=out_dir,
        cmd_version=WGET_VERSION,
        output=output,
        status=status,
        **timer.stats,
    )
|
||||
|
||||
|
||||
@enforce_types
def wget_output_path(link: Link) -> Optional[str]:
    """Locate the .html file wget wrote for `link`, relative to `link.link_dir`.

    wget may adjust some paths to be different than the base_url path
    (see docs on wget --adjust-extension (-E)), so the result is discovered
    by looking at the filesystem rather than computed from the URL.

    Returns None when no matching file is found.
    """
    if is_static_file(link.url):
        return without_scheme(without_fragment(link.url))

    # Wget downloads can save in a number of different ways depending on the url:
    #    https://example.com
    #       > example.com/index.html
    #    https://example.com?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html
    #    https://www.example.com/?v=zzVa_tX1OiI
    #       > example.com/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc
    #       > example.com/abc.html
    #    https://example.com/abc/
    #       > example.com/abc/index.html
    #    https://example.com/abc?v=zzVa_tX1OiI.html
    #       > example.com/abc?v=zzVa_tX1OiI.html
    #    https://example.com/abc/?v=zzVa_tX1OiI.html
    #       > example.com/abc/index.html?v=zzVa_tX1OiI.html

    #    https://example.com/abc/test.html
    #       > example.com/abc/test.html
    #    https://example.com/abc/test?v=zzVa_tX1OiI
    #       > example.com/abc/test?v=zzVa_tX1OiI.html
    #    https://example.com/abc/test/?v=zzVa_tX1OiI
    #       > example.com/abc/test/index.html?v=zzVa_tX1OiI.html

    # There's also lots of complexity around how the urlencoding and renaming
    # is done for pages with query and hash fragments or extensions like shtml / htm / php / etc

    # Since the wget algorithm for -E (appending .html) is incredibly complex
    # and there's no way to get the computed output path from wget,
    # in order to avoid having to reverse-engineer how they calculate it,
    # we just look in the output folder and read the filename wget used from the filesystem
    url_path = without_fragment(without_query(path(link.url))).strip('/')
    host_dir = domain(link.url).replace(":", "+")
    search_dir = os.path.join(link.link_dir, host_dir, urldecode(url_path))

    # try the deepest candidate directory first, then walk up at most 4 levels
    for _ in range(4):
        if os.path.exists(search_dir) and os.path.isdir(search_dir):
            candidates = []
            for filename in os.listdir(search_dir):
                if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", filename, re.I | re.M):
                    candidates.append(filename)

            if candidates:
                relative_dir = search_dir.split(link.link_dir)[-1].strip('/')
                return os.path.join(relative_dir, candidates[0])

        # Move up one directory level
        search_dir = search_dir.rsplit('/', 1)[0]

        # stop once we have climbed back to the link's own folder
        if search_dir == link.link_dir:
            break

    return None
|
|
@ -1,271 +0,0 @@
|
|||
import os
|
||||
import json
|
||||
|
||||
from datetime import datetime
|
||||
from string import Template
|
||||
try:
|
||||
from distutils.dir_util import copy_tree
|
||||
except ImportError:
|
||||
print('[X] Missing "distutils" python package. To install it, run:')
|
||||
print(' pip install distutils')
|
||||
|
||||
from config import (
|
||||
OUTPUT_DIR,
|
||||
TEMPLATES_DIR,
|
||||
GIT_SHA,
|
||||
FOOTER_INFO,
|
||||
)
|
||||
from util import (
|
||||
chmod_file,
|
||||
urlencode,
|
||||
derived_link_info,
|
||||
check_link_structure,
|
||||
check_links_structure,
|
||||
wget_output_path,
|
||||
latest_output,
|
||||
)
|
||||
from parse import parse_links
|
||||
from links import validate_links
|
||||
from logs import (
|
||||
log_indexing_process_started,
|
||||
log_indexing_started,
|
||||
log_indexing_finished,
|
||||
log_parsing_started,
|
||||
log_parsing_finished,
|
||||
)
|
||||
|
||||
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
|
||||
### Homepage index for all the links
|
||||
|
||||
def write_links_index(out_dir, links, finished=False):
    """Write both main index files (index.json, then index.html) for `links`.

    Each write is bracketed by log_indexing_started / log_indexing_finished.
    """

    log_indexing_process_started()
    check_links_structure(links)

    writers = (
        ('index.json', lambda: write_json_links_index(out_dir, links)),
        ('index.html', lambda: write_html_links_index(out_dir, links, finished=finished)),
    )
    for filename, write_index in writers:
        log_indexing_started(out_dir, filename)
        write_index()
        log_indexing_finished(out_dir, filename)
|
||||
|
||||
def load_links_index(out_dir=OUTPUT_DIR, import_path=None):
    """parse and load existing index with any new links from import_path merged in

    Returns a tuple `(all_links, new_links)`: the validated union of the
    existing and imported links, and the validated imported links alone.
    """

    existing_links = []
    if out_dir:
        existing_links = parse_json_links_index(out_dir)
        check_links_structure(existing_links)

    new_links = []
    parser_name = None
    if import_path:
        # parse and validate the import file
        log_parsing_started(import_path)
        raw_links, parser_name = parse_links(import_path)
        new_links = validate_links(raw_links)
        check_links_structure(new_links)

    # merge existing links in out_dir and new links
    all_links = validate_links(existing_links + new_links)
    check_links_structure(all_links)
    num_new_links = len(all_links) - len(existing_links)

    # parser_name can only be truthy when an import file was parsed
    if parser_name:
        log_parsing_finished(num_new_links, parser_name)

    return all_links, new_links
|
||||
|
||||
def write_json_links_index(out_dir, links):
    """Serialize `links` plus index metadata to `<out_dir>/index.json`."""

    check_links_structure(links)

    index_path = os.path.join(out_dir, 'index.json')

    payload = dict([
        ('info', 'ArchiveBox Index'),
        ('help', 'https://github.com/pirate/ArchiveBox'),
        ('version', GIT_SHA),
        ('num_links', len(links)),
        ('updated', str(datetime.now().timestamp())),
        ('links', links),
    ])

    with open(index_path, 'w', encoding='utf-8') as index_file:
        # default=str stringifies any value json cannot encode natively
        json.dump(payload, index_file, indent=4, default=str)

    chmod_file(index_path)
|
||||
|
||||
def parse_json_links_index(out_dir=OUTPUT_DIR):
    """Load the list of links from `<out_dir>/index.json`.

    Returns an empty list when the index file does not exist.
    """
    index_path = os.path.join(out_dir, 'index.json')
    if not os.path.exists(index_path):
        return []

    with open(index_path, 'r', encoding='utf-8') as index_file:
        links = json.load(index_file)['links']
        check_links_structure(links)
        return links
|
||||
|
||||
def write_html_links_index(out_dir, links, finished=False):
    """Render `<out_dir>/index.html` from the index and index_row templates.

    Also copies the static template assets into `out_dir` and writes a
    robots.txt that disallows everything.
    """

    check_links_structure(links)

    path = os.path.join(out_dir, 'index.html')

    copy_tree(os.path.join(TEMPLATES_DIR, 'static'), os.path.join(out_dir, 'static'))

    with open(os.path.join(out_dir, 'robots.txt'), 'w+') as robots_file:
        robots_file.write('User-agent: *\nDisallow: /')

    with open(os.path.join(TEMPLATES_DIR, 'index.html'), 'r', encoding='utf-8') as template_file:
        index_html = template_file.read()

    with open(os.path.join(TEMPLATES_DIR, 'index_row.html'), 'r', encoding='utf-8') as template_file:
        link_row_html = template_file.read()

    def render_row(link):
        """Fill the row template for one link dict with derived info."""
        fallback_title = link['base_url'] if link['is_archived'] else TITLE_LOADING_MSG
        row_vars = dict(link)
        row_vars['title'] = link['title'] or fallback_title
        # favicon path is emitted even for links that are not archived yet
        row_vars['favicon_url'] = os.path.join('archive', link['timestamp'], 'favicon.ico')
        row_vars['archive_url'] = urlencode(wget_output_path(link) or 'index.html')
        return Template(link_row_html).substitute(**row_vars)

    rendered_rows = []
    for link in links:
        rendered_rows.append(render_row(derived_link_info(link)))
    link_rows = '\n'.join(rendered_rows)

    template_vars = {
        'num_links': len(links),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
        'footer_info': FOOTER_INFO,
        'git_sha': GIT_SHA,
        'short_git_sha': GIT_SHA[:8],
        'rows': link_rows,
        'status': 'finished' if finished else 'running',
    }

    with open(path, 'w', encoding='utf-8') as index_file:
        index_file.write(Template(index_html).substitute(**template_vars))

    chmod_file(path)
|
||||
|
||||
|
||||
def patch_links_index(link, out_dir=OUTPUT_DIR):
    """hack to in-place update one row's info in the generated index html

    Updates the matching entry's title and history in index.json, then
    rewrites the title / output-count placeholder spans for this link's URL
    in index.html.
    """

    title = link['title'] or latest_output(link)['title']
    successful = len(tuple(filter(None, latest_output(link).values())))

    # Patch JSON index
    changed = False
    json_file_links = parse_json_links_index(out_dir)
    for saved_link in json_file_links:
        if saved_link['url'] == link['url']:
            saved_link['title'] = title
            saved_link['history'] = link['history']
            changed = True
            break
    if changed:
        write_json_links_index(out_dir, json_file_links)

    # Patch HTML index
    # index.html is written as UTF-8 by write_html_links_index, so read and
    # write it with the same encoding instead of the locale default
    html_path = os.path.join(out_dir, 'index.html')
    with open(html_path, 'r', encoding='utf-8') as html_file:
        html = html_file.read().splitlines()

    title_marker = '<span data-title-for="{}"'.format(link['url'])
    number_marker = '<span data-number-for="{}"'.format(link['url'])
    for idx, line in enumerate(html):
        if title and (title_marker in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and (number_marker in line):
            html[idx] = '<span>{}</span>'.format(successful)
            # scanning stops at the first patched count span
            break

    with open(html_path, 'w', encoding='utf-8') as f:
        f.write('\n'.join(html))
|
||||
|
||||
|
||||
### Individual link index
|
||||
|
||||
def write_link_index(out_dir, link):
    """Stamp `link['updated']` with the current time and write its JSON and HTML indexes."""
    link['updated'] = str(datetime.now().timestamp())
    for write_index in (write_json_link_index, write_html_link_index):
        write_index(out_dir, link)
|
||||
|
||||
def write_json_link_index(out_dir, link):
    """Dump a single link's info to `<out_dir>/index.json`."""

    check_link_structure(link)
    json_path = os.path.join(out_dir, 'index.json')

    with open(json_path, 'w', encoding='utf-8') as json_file:
        # default=str stringifies any value json cannot encode natively
        json.dump(link, json_file, indent=4, default=str)

    chmod_file(json_path)
|
||||
|
||||
def parse_json_link_index(out_dir):
    """Load `<out_dir>/index.json` for a single link.

    Returns an empty dict when the file does not exist.
    """
    json_path = os.path.join(out_dir, 'index.json')
    if not os.path.exists(json_path):
        return {}

    with open(json_path, 'r', encoding='utf-8') as json_file:
        link_json = json.load(json_file)
        check_link_structure(link_json)
        return link_json
|
||||
|
||||
def load_json_link_index(out_dir, link):
    """Merge any saved link index found in `out_dir` underneath `link`.

    Values in `link` win over values loaded from disk; `history` is always
    present in the result (an empty dict if neither side had one).
    """
    merged = dict(parse_json_link_index(out_dir))
    merged.update(link)
    merged['history'] = merged.get('history') or {}

    check_link_structure(merged)
    return merged
|
||||
|
||||
def write_html_link_index(out_dir, link):
    """Render `<out_dir>/index.html` for a single link from the link_index template."""
    check_link_structure(link)
    with open(os.path.join(TEMPLATES_DIR, 'link_index.html'), 'r', encoding='utf-8') as template_file:
        link_html = template_file.read()

    path = os.path.join(out_dir, 'index.html')

    link = derived_link_info(link)

    with open(path, 'w', encoding='utf-8') as html_file:
        archived = link['is_archived']
        page_vars = dict(link)
        page_vars['title'] = link['title'] or (link['base_url'] if archived else TITLE_LOADING_MSG)
        page_vars['archive_url'] = urlencode(
            wget_output_path(link)
            or (link['domain'] if archived else 'about:blank')
        )
        page_vars['extension'] = link['extension'] or 'html'
        page_vars['tags'] = link['tags'].strip() or 'untagged'
        page_vars['status'] = 'Archived' if archived else 'Not yet archived'
        page_vars['status_color'] = 'success' if archived else 'danger'
        html_file.write(Template(link_html).substitute(page_vars))

    chmod_file(path)
|
615
archivebox/index/__init__.py
Normal file
615
archivebox/index/__init__.py
Normal file
|
@ -0,0 +1,615 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import re
|
||||
import os
|
||||
import shutil
|
||||
import json as pyjson
|
||||
|
||||
from itertools import chain
|
||||
from typing import List, Tuple, Dict, Optional, Iterable
|
||||
from collections import OrderedDict
|
||||
from contextlib import contextmanager
|
||||
|
||||
from ..system import atomic_write
|
||||
from ..util import (
|
||||
scheme,
|
||||
enforce_types,
|
||||
ExtendedEncoder,
|
||||
)
|
||||
from ..config import (
|
||||
ARCHIVE_DIR_NAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
OUTPUT_DIR,
|
||||
TIMEOUT,
|
||||
URL_BLACKLIST_PTN,
|
||||
ANSI,
|
||||
stderr,
|
||||
OUTPUT_PERMISSIONS
|
||||
)
|
||||
from ..logging_util import (
|
||||
TimedProgress,
|
||||
log_indexing_process_started,
|
||||
log_indexing_process_finished,
|
||||
log_indexing_started,
|
||||
log_indexing_finished,
|
||||
log_parsing_finished,
|
||||
log_deduping_finished,
|
||||
)
|
||||
|
||||
from .schema import Link, ArchiveResult
|
||||
from .html import (
|
||||
write_html_main_index,
|
||||
write_html_link_details,
|
||||
)
|
||||
from .json import (
|
||||
parse_json_main_index,
|
||||
write_json_main_index,
|
||||
parse_json_link_details,
|
||||
write_json_link_details,
|
||||
)
|
||||
from .sql import (
|
||||
write_sql_main_index,
|
||||
parse_sql_main_index,
|
||||
write_sql_link_details,
|
||||
)
|
||||
|
||||
### Link filtering and checking
|
||||
|
||||
@enforce_types
def merge_links(a: Link, b: Link) -> Link:
    """deterministically merge two links, favoring longer field values over shorter,
    and "cleaner" values over worse ones.

    Both links must share the same `base_url` (asserted). The result takes
    the longer url, the longest usable title, the earlier timestamp, the
    union of tags and sources, and the de-duplicated union of both histories
    with each method's results sorted newest-first by `start_ts`.
    """
    assert a.base_url == b.base_url, 'Cannot merge two links with different URLs'

    # longest url wins (because a fuzzy url will always be shorter)
    url = a.url if len(a.url) > len(b.url) else b.url

    # best title based on length and quality (titles that look like URLs are ignored)
    possible_titles = [
        title
        for title in (a.title, b.title)
        if title and title.strip() and '://' not in title
    ]
    title = None
    if len(possible_titles) == 2:
        title = max(possible_titles, key=lambda t: len(t))
    elif len(possible_titles) == 1:
        title = possible_titles[0]

    # earliest valid timestamp
    timestamp = (
        a.timestamp
        if float(a.timestamp or 0) < float(b.timestamp or 0) else
        b.timestamp
    )

    # all unique, truthy tags (empty entries dropped, sorted so the output is stable)
    tags_set = {
        tag.strip()
        for raw_tags in (a.tags, b.tags)
        for tag in (raw_tags or '').split(',')
        if tag.strip()
    }
    tags = ','.join(sorted(tags_set)) or None

    # all unique source entries
    sources = list(set(a.sources + b.sources))

    # all unique history entries for the combined archive methods
    # (must include methods that only appear in b's history)
    all_methods = set(a.history.keys()) | set(b.history.keys())
    history = {
        method: (a.history.get(method) or []) + (b.history.get(method) or [])
        for method in all_methods
    }
    for method in all_methods:
        # round-trip through sorted JSON so identical results collapse to one
        deduped_jsons = {
            pyjson.dumps(result, sort_keys=True, cls=ExtendedEncoder)
            for result in history[method]
        }
        history[method] = list(reversed(sorted(
            (ArchiveResult.from_json(pyjson.loads(result)) for result in deduped_jsons),
            key=lambda result: result.start_ts,
        )))

    return Link(
        url=url,
        timestamp=timestamp,
        title=title,
        tags=tags,
        sources=sources,
        history=history,
    )
|
||||
|
||||
|
||||
@enforce_types
def validate_links(links: Iterable[Link]) -> List[Link]:
    """Filter, sort and de-duplicate `links`, returning the result as a list."""
    pipeline = (
        archivable_links,   # remove chrome://, about:, mailto: etc.
        sorted_links,       # deterministically sort the links based on timestamp, url
        uniquefied_links,   # merge/dedupe duplicate timestamps & urls
    )
    timer = TimedProgress(TIMEOUT * 4)
    try:
        for stage in pipeline:
            links = stage(links)
    finally:
        timer.end()

    return list(links)
|
||||
|
||||
|
||||
@enforce_types
def archivable_links(links: Iterable[Link]) -> Iterable[Link]:
    """Yield only links with an http/https/ftp scheme that URL_BLACKLIST_PTN does not match."""
    allowed_schemes = ('http', 'https', 'ftp')
    for link in links:
        if scheme(link.url) not in allowed_schemes:
            continue
        # an unset blacklist pattern means nothing is blacklisted
        if URL_BLACKLIST_PTN and URL_BLACKLIST_PTN.match(link.url):
            continue
        yield link
|
||||
|
||||
|
||||
@enforce_types
def uniquefied_links(sorted_links: Iterable[Link]) -> Iterable[Link]:
    """
    Merge links that share a base_url, then give every remaining link a
    timestamp that no other link in the result uses.
    """

    by_url: OrderedDict[str, Link] = OrderedDict()
    for link in sorted_links:
        if link.base_url in by_url:
            # merge with any other links that share the same url
            link = merge_links(by_url[link.base_url], link)
        by_url[link.base_url] = link

    by_timestamp: OrderedDict[str, Link] = OrderedDict()
    for link in by_url.values():
        free_timestamp = lowest_uniq_timestamp(by_timestamp, link.timestamp)
        retimed = link.overwrite(timestamp=free_timestamp)
        by_timestamp[retimed.timestamp] = retimed

    return by_timestamp.values()
|
||||
|
||||
|
||||
@enforce_types
def sorted_links(links: Iterable[Link]) -> Iterable[Link]:
    """Return `links` as a list sorted descending by (timestamp before the first '.', url)."""
    def sort_key(link):
        whole_seconds = link.timestamp.split('.', 1)[0]
        return (whole_seconds, link.url)

    ordered = list(links)
    ordered.sort(key=sort_key, reverse=True)
    return ordered
|
||||
|
||||
|
||||
@enforce_types
def links_after_timestamp(links: Iterable[Link], resume: Optional[float]=None) -> Iterable[Link]:
    """Yield links whose timestamp is <= `resume`; yield everything when `resume` is falsy.

    Links whose timestamp cannot be converted to a float are skipped after
    printing a message.
    """
    if resume:
        for link in links:
            try:
                if float(link.timestamp) <= resume:
                    yield link
            except (ValueError, TypeError):
                print('Resume value and all timestamp values must be valid numbers.')
    else:
        yield from links
|
||||
|
||||
|
||||
@enforce_types
def lowest_uniq_timestamp(used_timestamps: OrderedDict, timestamp: str) -> str:
    """Return the first unused timestamp among 1234, 1234.0, 1234.1, 1234.2, ...

    Any existing decimal suffix on `timestamp` is discarded before searching.
    """

    base = timestamp.split('.')[0]

    # first try 152323423 before 152323423.0
    if base not in used_timestamps:
        return base

    nonce = 0
    while '{}.{}'.format(base, nonce) in used_timestamps:
        nonce += 1

    return '{}.{}'.format(base, nonce)
|
||||
|
||||
|
||||
|
||||
### Main Links Index
|
||||
|
||||
@contextmanager
@enforce_types
def timed_index_update(out_path: str):
    """Context manager wrapping the write of one index file.

    Logs the start, shows a TimedProgress while the body runs, then asserts
    that `out_path` exists before logging completion.
    """
    log_indexing_started(out_path)
    progress = TimedProgress(TIMEOUT * 2, prefix=' ')
    try:
        yield
    finally:
        # always stop the progress display, even if the body raised
        progress.end()

    assert os.path.exists(out_path), f'Failed to write index file: {out_path}'
    log_indexing_finished(out_path)
|
||||
|
||||
|
||||
@enforce_types
def write_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """Write the SQL, JSON and HTML main index files for `links`, in that order."""

    log_indexing_process_started(len(links))

    sql_path = os.path.join(out_dir, SQL_INDEX_FILENAME)
    json_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    html_path = os.path.join(out_dir, HTML_INDEX_FILENAME)

    with timed_index_update(sql_path):
        write_sql_main_index(links, out_dir=out_dir)
        # set here because we don't write it with atomic writes
        os.chmod(sql_path, int(OUTPUT_PERMISSIONS, base=8))

    with timed_index_update(json_path):
        write_json_main_index(links, out_dir=out_dir)

    with timed_index_update(html_path):
        write_html_main_index(links, out_dir=out_dir, finished=finished)

    log_indexing_process_finished()
|
||||
|
||||
|
||||
@enforce_types
def load_main_index(out_dir: str=OUTPUT_DIR, warn: bool=True) -> List[Link]:
    """Load the links from the JSON main index in `out_dir`.

    When `warn` is true, also loads the SQL index and prints a warning if the
    two indexes do not contain the same set of URLs.
    """

    json_links: List[Link] = list(parse_json_main_index(out_dir))
    sql_links = list(parse_sql_main_index(out_dir))

    if warn:
        json_urls = {link.url for link in json_links}
        sql_urls = {link.url for link in sql_links}
        if json_urls != sql_urls:
            stderr('{red}[!] Warning: SQL index does not match JSON index!{reset}'.format(**ANSI))
            stderr(' To repair the index and re-import any orphaned links run:')
            stderr(' archivebox init')

    return json_links
|
||||
|
||||
@enforce_types
def load_main_index_meta(out_dir: str=OUTPUT_DIR) -> Optional[dict]:
    """Return the JSON main index contents without its 'links' entry, or None if the file is missing."""
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return None

    with open(index_path, 'r', encoding='utf-8') as index_file:
        meta = pyjson.load(index_file)
        # only the metadata is wanted; the link list itself is dropped
        meta.pop('links')
        return meta
|
||||
|
||||
|
||||
@enforce_types
def parse_links_from_source(source_path: str) -> List[Link]:
    """Parse the import file at `source_path` and return its validated links.

    Logs the number of raw links parsed when a parser name is reported.
    The return annotation is a single list because that is what the function
    returns; it previously claimed a tuple of two lists.
    """

    # imported here rather than at module top level
    # NOTE(review): presumably to avoid a circular import — confirm
    from ..parsers import parse_links

    new_links: List[Link] = []

    # parse and validate the import file
    raw_links, parser_name = parse_links(source_path)
    new_links = validate_links(raw_links)

    if parser_name:
        num_parsed = len(raw_links)
        log_parsing_finished(num_parsed, parser_name)

    return new_links
|
||||
|
||||
|
||||
@enforce_types
def dedupe_links(existing_links: List[Link],
                 new_links: List[Link]) -> Tuple[List[Link], List[Link]]:
    """Merge new links into the existing ones.

    Returns `(all_links, new_links)`, where `new_links` keeps only links whose
    url is not already in `existing_links`, each swapped for its validated
    counterpart from `all_links` when one exists.
    """

    # merge existing links in out_dir and new links
    all_links = validate_links(existing_links + new_links)
    existing_urls = {link.url for link in existing_links}
    merged_by_url = {link.url: link for link in all_links}

    new_links = [
        merged_by_url.get(link.url, link)
        for link in new_links
        if link.url not in existing_urls
    ]
    log_deduping_finished(len(new_links))

    return all_links, new_links
|
||||
|
||||
|
||||
@enforce_types
def patch_main_index(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """hack to in-place update one row's info in the generated index files

    Rewrites the JSON main index with this link's title, history and updated
    fields, then replaces the title / output-count placeholder spans for this
    link's URL in the HTML main index.
    """

    # TODO: remove this ASAP, it's ugly, error-prone, and potentially dangerous

    title = link.title or link.latest_outputs(status='succeeded')['title']
    successful = link.num_outputs

    # Patch JSON main index
    json_file_links = parse_json_main_index(out_dir)
    patched_links = []
    for saved_link in json_file_links:
        if saved_link.url == link.url:
            patched_links.append(saved_link.overwrite(
                title=title,
                history=link.history,
                updated=link.updated,
            ))
        else:
            patched_links.append(saved_link)

    write_json_main_index(patched_links, out_dir=out_dir)

    # Patch HTML main index
    # use the same configured filename write_main_index() writes to, and read
    # it as UTF-8 like the other index readers in this module
    # NOTE(review): assumes atomic_write() also writes UTF-8 — confirm
    html_path = os.path.join(out_dir, HTML_INDEX_FILENAME)
    with open(html_path, 'r', encoding='utf-8') as f:
        html = f.read().splitlines()

    title_marker = '<span data-title-for="{}"'.format(link.url)
    number_marker = '<span data-number-for="{}"'.format(link.url)
    for idx, line in enumerate(html):
        if title and (title_marker in line):
            html[idx] = '<span>{}</span>'.format(title)
        elif successful and (number_marker in line):
            html[idx] = '<span>{}</span>'.format(successful)
            # scanning stops at the first patched count span
            break

    atomic_write(html_path, '\n'.join(html))
|
||||
|
||||
|
||||
### Link Details Index
|
||||
|
||||
@enforce_types
def write_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the JSON, HTML and SQL detail indexes for one link.

    `out_dir` defaults to `link.link_dir`; the SQL writer takes only the link.
    """
    target_dir = out_dir or link.link_dir

    for write_details in (write_json_link_details, write_html_link_details):
        write_details(link, out_dir=target_dir)
    write_sql_link_details(link)
|
||||
|
||||
|
||||
@enforce_types
def load_link_details(link: Link, out_dir: Optional[str]=None) -> Link:
    """Merge `link` with the link details saved in `out_dir`, if any.

    `out_dir` defaults to `link.link_dir`. Returns `link` unchanged when no
    saved details are found.
    """
    target_dir = out_dir or link.link_dir

    saved = parse_json_link_details(target_dir)
    return merge_links(saved, link) if saved else link
|
||||
|
||||
|
||||
|
||||
def _filter_exact(link, pattern):
    """True when `pattern` equals the link's url or its base_url."""
    return (link.url == pattern) or (link.base_url == pattern)


def _filter_substring(link, pattern):
    """True when `pattern` occurs anywhere in the link's url."""
    return pattern in link.url


def _filter_regex(link, pattern):
    """True when regex `pattern` matches at the start of the link's url."""
    return bool(re.match(pattern, link.url))


def _filter_domain(link, pattern):
    """True when `pattern` equals the link's domain."""
    return link.domain == pattern


# Maps each filter type name to a predicate taking (link, pattern)
LINK_FILTERS = {
    'exact': _filter_exact,
    'substring': _filter_substring,
    'regex': _filter_regex,
    'domain': _filter_domain,
}
|
||||
|
||||
@enforce_types
def link_matches_filter(link: Link, filter_patterns: List[str], filter_type: str='exact') -> bool:
    """Return True if `link` matches any of `filter_patterns` under `filter_type`.

    Any exception raised while looking up or applying the filter is reported
    on stderr as an invalid pattern and the process exits with status 2.
    """
    for pattern in filter_patterns:
        try:
            matched = LINK_FILTERS[filter_type](link, pattern)
        except Exception:
            stderr()
            stderr(
                f'[X] Got invalid pattern for --filter-type={filter_type}:',
                color='red',
            )
            stderr(f' {pattern}')
            raise SystemExit(2)

        if matched:
            return True

    return False
|
||||
|
||||
|
||||
def get_indexed_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links without checking archive status or data directory validity"""
    folders = {}
    for link in links:
        folders[link.link_dir] = link
    return folders
|
||||
|
||||
def get_archived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are archived with a valid data directory"""
    folders = {}
    for link in links:
        if is_archived(link):
            folders[link.link_dir] = link
    return folders
|
||||
|
||||
def get_unarchived_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """indexed links that are unarchived with no data directory or an empty data directory"""
    folders = {}
    for link in links:
        if is_unarchived(link):
            folders[link.link_dir] = link
    return folders
|
||||
|
||||
def get_present_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that actually exist in the archive/ folder

    Maps each directory path to its parsed link details, or None when the
    details could not be parsed. `links` is not consulted.
    """
    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    present = {}

    for entry in os.scandir(archive_root):
        if not entry.is_dir(follow_symlinks=True):
            continue

        # best-effort: an unreadable or corrupt index is recorded as None
        try:
            parsed = parse_json_link_details(entry.path)
        except Exception:
            parsed = None

        present[entry.path] = parsed

    return present
|
||||
|
||||
def get_valid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs with a valid index matched to the main index and archived content"""
    folders = {}
    for link in links:
        if is_valid(link):
            folders[link.link_dir] = link
    return folders
|
||||
|
||||
def get_invalid_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized

    Returns the union of the four category lookups; when a path appears in
    more than one category, the entry from the later category wins.
    """
    # forward the caller's out_dir instead of always scanning the default OUTPUT_DIR
    duplicate = get_duplicate_folders(links, out_dir=out_dir)
    orphaned = get_orphaned_folders(links, out_dir=out_dir)
    corrupted = get_corrupted_folders(links, out_dir=out_dir)
    unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
    return {**duplicate, **orphaned, **corrupted, **unrecognized}
|
||||
|
||||
|
||||
def get_duplicate_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Find data folders whose link repeats a URL or timestamp already seen.

    Indexed folders are visited first (sorted), then the remaining folders on
    disk (sorted); the first folder seen for a URL/timestamp is kept and every
    later one is reported as a duplicate.
    """
    links = list(links)
    indexed_dirs = {link.link_dir for link in links}

    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    unindexed_dirs = [
        entry.path
        for entry in os.scandir(archive_root)
        if entry.is_dir(follow_symlinks=True) and entry.path not in indexed_dirs
    ]

    timestamp_counts: Dict[str, int] = {}
    url_counts: Dict[str, int] = {}
    duplicates: Dict[str, Optional[Link]] = {}

    for path in chain(sorted(indexed_dirs), sorted(unindexed_dirs)):
        try:
            link = parse_json_link_details(path)
        except Exception:
            link = None

        if not link:
            continue

        # count how many folders have claimed this timestamp / url so far
        ts_seen = timestamp_counts.get(link.timestamp, 0) + 1
        timestamp_counts[link.timestamp] = ts_seen

        url_seen = url_counts.get(link.url, 0) + 1
        url_counts[link.url] = url_seen

        if ts_seen > 1 or url_seen > 1:
            duplicates[path] = link

    return duplicates
|
||||
|
||||
def get_orphaned_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Find data folders that hold a parseable detail index but whose path is
    not the link_dir of any link in `links`.
    """
    indexed_dirs = {link.link_dir for link in list(links)}
    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    orphans: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(archive_root):
        if not entry.is_dir(follow_symlinks=True):
            continue

        try:
            link = parse_json_link_details(entry.path)
        except Exception:
            link = None

        # a valid detail index exists, but the main index doesn't know this folder
        if link and entry.path not in indexed_dirs:
            orphans[entry.path] = link

    return orphans
|
||||
|
||||
def get_corrupted_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Map link_dir -> Link for every indexed link accepted by `is_corrupt`
    (its data directory exists but does not pass `is_valid`).

    `out_dir` is part of the shared folder-getter signature but is not read here.
    """
    corrupted: Dict[str, Optional[Link]] = {}
    for link in links:
        if is_corrupt(link):
            corrupted[link.link_dir] = link
    return corrupted
|
||||
|
||||
def get_unrecognized_folders(links, out_dir: str=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
    """Find folders in the archive dir that cannot be tied to a usable link.

    A folder is reported when either:
      - it has an index.json that yields no Link even after a repair attempt, or
      - it has no index.json and its name is not the timestamp of any link in `links`.
    """
    known_timestamps = {link.timestamp for link in links}
    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)
    unrecognized: Dict[str, Optional[Link]] = {}

    for entry in os.scandir(archive_root):
        if not entry.is_dir(follow_symlinks=True):
            continue

        has_index = os.path.exists(os.path.join(entry.path, 'index.json'))
        link = None
        try:
            link = parse_json_link_details(entry.path)
        except KeyError:
            # the index is missing fields: try once to rebuild it from guessed values
            if has_index:
                try:
                    repaired = parse_json_link_details(entry.path, guess=True)
                    write_json_link_details(repaired, out_dir=entry.path)
                    link = parse_json_link_details(entry.path)
                except Exception:
                    pass

        if has_index:
            if link is None:
                # index exists but it's corrupted or unparseable
                unrecognized[entry.path] = link
        else:
            # no detail index: only recognized if the folder name is an indexed timestamp
            folder_name = entry.path.rsplit('/', 1)[-1]
            if folder_name not in known_timestamps:
                unrecognized[entry.path] = link

    return unrecognized
|
||||
|
||||
|
||||
def is_valid(link: Link) -> bool:
    """Return True when the link's data folder exists, contains an index.json,
    and that index parses (guessing missing fields) to the same URL as `link`.
    """
    data_dir = link.link_dir

    # unarchived links (no folder) and folders without an index are not valid
    if not os.path.exists(data_dir):
        return False
    if not os.path.exists(os.path.join(data_dir, 'index.json')):
        return False

    try:
        parsed_link = parse_json_link_details(data_dir, guess=True)
        return link.url == parsed_link.url
    except Exception:
        # any failure to load or compare the detail index means "not valid"
        return False
|
||||
|
||||
def is_corrupt(link: Link) -> bool:
    """Return True when the link's data folder exists but fails `is_valid`.

    Links with no data folder at all are unarchived, not corrupt.
    """
    return os.path.exists(link.link_dir) and not is_valid(link)
|
||||
|
||||
def is_archived(link: Link) -> bool:
    """Return the link's `is_archived` flag, but only for links that pass `is_valid`."""
    if not is_valid(link):
        return False
    return link.is_archived
|
||||
|
||||
def is_unarchived(link: Link) -> bool:
    """Return True when the link has no data folder, or has one but
    `link.is_archived` is false.
    """
    return (not os.path.exists(link.link_dir)) or (not link.is_archived)
|
||||
|
||||
|
||||
def fix_invalid_folder_locations(out_dir: str=OUTPUT_DIR) -> Tuple[List[str], List[str]]:
    """Move data folders whose name does not match the timestamp in their index.

    Every directory in the archive folder that has a parseable index.json is
    checked; if its folder name differs from the link's timestamp it is moved
    to `<out_dir>/<ARCHIVE_DIR_NAME>/<timestamp>` and its detail index is
    rewritten at the new location.

    Args:
        out_dir: archive output directory to scan.

    Returns:
        (fixed, cant_fix): `fixed` lists the destination paths of folders that
        were moved; `cant_fix` lists the source paths that could not be moved
        because the destination already exists.
    """
    fixed: List[str] = []
    cant_fix: List[str] = []
    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)

    # take a snapshot of the listing so that moving folders inside the
    # directory cannot disturb the iteration
    for entry in list(os.scandir(archive_root)):
        if not entry.is_dir(follow_symlinks=True):
            continue
        if not os.path.exists(os.path.join(entry.path, 'index.json')):
            continue

        try:
            link = parse_json_link_details(entry.path)
        except KeyError:
            link = None
        if not link:
            continue

        if entry.name == link.timestamp:
            # folder is already where its index says it belongs
            continue

        dest = os.path.join(archive_root, link.timestamp)
        if os.path.exists(dest):
            # never overwrite another link's data folder
            cant_fix.append(entry.path)
            continue

        shutil.move(entry.path, dest)
        fixed.append(dest)

        # entry.path no longer exists after the move: the refreshed detail
        # index has to be written into the new location instead
        write_json_link_details(link, out_dir=dest)

    return fixed, cant_fix
|
37
archivebox/index/csv.py
Normal file
37
archivebox/index/csv.py
Normal file
|
@ -0,0 +1,37 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
from typing import List, Optional, Any
|
||||
|
||||
from ..util import enforce_types
|
||||
from .schema import Link
|
||||
|
||||
|
||||
@enforce_types
def links_to_csv(links: List[Link],
                 cols: Optional[List[str]]=None,
                 header: bool=True,
                 separator: str=',',
                 ljust: int=0) -> str:
    """Render links as delimiter-separated text, one link per line.

    Args:
        links: the links to render.
        cols: Link attribute names to output; defaults to
            ['timestamp', 'is_archived', 'url'].
        header: when True the first line is the column names.
        separator: string placed between columns.
        ljust: minimum width each cell is left-justified to.

    Returns:
        The joined lines. When `header` is False the output starts directly
        with the first row (no empty leading line).
    """
    cols = cols or ['timestamp', 'is_archived', 'url']

    lines: List[str] = []
    if header:
        lines.append(separator.join(col.ljust(ljust) for col in cols))

    lines.extend(
        link.to_csv(cols=cols, ljust=ljust, separator=separator)
        for link in links
    )

    return '\n'.join(lines)
|
||||
|
||||
|
||||
@enforce_types
def to_csv(obj: Any, cols: List[str], separator: str=',', ljust: int=0) -> str:
    """Render one object as a single delimited row.

    Each attribute named in `cols` is serialized with `to_json(indent=None)`,
    left-justified to `ljust` characters, and the cells are joined by `separator`.
    """
    from .json import to_json

    cells = []
    for col in cols:
        cell = to_json(getattr(obj, col), indent=None)
        cells.append(cell.ljust(ljust))

    return separator.join(cells)
|
156
archivebox/index/html.py
Normal file
156
archivebox/index/html.py
Normal file
|
@ -0,0 +1,156 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
|
||||
from string import Template
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Iterator, Mapping
|
||||
|
||||
from .schema import Link
|
||||
from ..system import atomic_write, copy_and_overwrite
|
||||
from ..util import (
|
||||
enforce_types,
|
||||
ts_to_date,
|
||||
urlencode,
|
||||
htmlencode,
|
||||
urldecode,
|
||||
)
|
||||
from ..config import (
|
||||
OUTPUT_DIR,
|
||||
TEMPLATES_DIR,
|
||||
VERSION,
|
||||
GIT_SHA,
|
||||
FOOTER_INFO,
|
||||
ARCHIVE_DIR_NAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
STATIC_DIR_NAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
)
|
||||
|
||||
# short alias for os.path.join, used throughout this module to build paths
join = lambda *paths: os.path.join(*paths)
# paths of the legacy html template files inside TEMPLATES_DIR
MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
# title placeholder used for links that have no title and are not archived yet
TITLE_LOADING_MSG = 'Not yet archived...'
|
||||
|
||||
|
||||
### Main Links Index
|
||||
|
||||
@enforce_types
def parse_html_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[str]:
    """Yield the urls found in the html main index under `out_dir`.

    For every line that contains `class="link-url"`, the text between the
    first pair of double quotes on that line is yielded. Nothing is yielded
    if the index file does not exist.
    """
    index_path = join(out_dir, HTML_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return ()

    with open(index_path, 'r', encoding='utf-8') as index_file:
        for line in index_file:
            if 'class="link-url"' not in line:
                continue
            yield line.split('"')[1]

    return ()
|
||||
|
||||
@enforce_types
def write_html_main_index(links: List[Link], out_dir: str=OUTPUT_DIR, finished: bool=False) -> None:
    """Copy the static template assets into `out_dir` and write the rendered
    html main index next to them.
    """
    # favicon, robots.txt and the static folder are refreshed on every write
    for asset_name in (FAVICON_FILENAME, ROBOTS_TXT_FILENAME, STATIC_DIR_NAME):
        copy_and_overwrite(join(TEMPLATES_DIR, asset_name), join(out_dir, asset_name))

    index_path = join(out_dir, HTML_INDEX_FILENAME)
    atomic_write(index_path, main_index_template(links, finished=finished))
|
||||
|
||||
|
||||
@enforce_types
def main_index_template(links: List[Link], finished: bool=True) -> str:
    """render the template for the entire main index

    Args:
        links: the links to render, one row each, in the given order.
        finished: selects the 'finished' or 'running' status shown in the page.

    Returns:
        The rendered html of MAIN_INDEX_TEMPLATE.
    """
    # read the clock once so date_updated and time_updated always agree,
    # even when rendering straddles a minute or day boundary
    now = datetime.now()

    rows = '\n'.join(main_index_row_template(link) for link in links)

    return render_legacy_template(MAIN_INDEX_TEMPLATE, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'num_links': str(len(links)),
        'status': 'finished' if finished else 'running',
        'date_updated': now.strftime('%Y-%m-%d'),
        'time_updated': now.strftime('%Y-%m-%d %H:%M'),
        'rows': rows,
        'footer_info': FOOTER_INFO,
    })
|
||||
|
||||
|
||||
@enforce_types
def main_index_row_template(link: Link) -> str:
    """Render MAIN_INDEX_ROW_TEMPLATE for a single link.

    The template context is the link's extended dict, with `title`,
    `favicon_url`, `wget_url` and `tags` overridden for display.
    """
    from ..extractors.wget import wget_output_path

    context = dict(link._asdict(extended=True))

    # fall back to the base url (archived) or a loading message (not archived)
    display_title = link.title or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
    context['title'] = htmlencode(display_title)

    # favicon location inside the link's own data folder
    context['favicon_url'] = join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')

    # link to the wget output if there is one, otherwise to the details page
    context['wget_url'] = urlencode(wget_output_path(link) or 'index.html')

    # the link's tags, followed by the file extension for static files
    tag_text = link.tags or ''
    if link.is_static:
        tag_text = tag_text + ' {}'.format(link.extension)
    context['tags'] = tag_text

    return render_legacy_template(MAIN_INDEX_ROW_TEMPLATE, context)
|
||||
|
||||
|
||||
### Link Details Index
|
||||
|
||||
@enforce_types
def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Render the details page for `link` and write it as the html index of
    `out_dir` (defaults to the link's own data folder).
    """
    target_dir = out_dir or link.link_dir
    atomic_write(join(target_dir, HTML_INDEX_FILENAME), link_details_template(link))
|
||||
|
||||
|
||||
@enforce_types
def link_details_template(link: Link) -> str:
    """Render LINK_DETAILS_TEMPLATE for a single link.

    The context is the link's extended dict merged with its canonical output
    paths, plus display-oriented overrides (title, status, tags, ...).
    """
    from ..extractors.wget import wget_output_path

    link_info = link._asdict(extended=True)

    context = dict(link_info)
    context.update(link_info['canonical'])

    display_title = link.title or (link.base_url if link.is_archived else TITLE_LOADING_MSG)
    context['title'] = htmlencode(display_title)
    context['url_str'] = htmlencode(urldecode(link.base_url))

    # prefer the wget output, then the bare domain for archived links
    archive_target = wget_output_path(link) or (link.domain if link.is_archived else '')
    context['archive_url'] = urlencode(archive_target) or 'about:blank'

    context['extension'] = link.extension or 'html'
    context['tags'] = link.tags or 'untagged'
    context['status'] = 'archived' if link.is_archived else 'not yet archived'
    context['status_color'] = 'success' if link.is_archived else 'danger'
    context['oldest_archive_date'] = ts_to_date(link.oldest_archive_date)

    return render_legacy_template(LINK_DETAILS_TEMPLATE, context)
|
||||
|
||||
|
||||
@enforce_types
def render_legacy_template(template_path: str, context: Mapping[str, str]) -> str:
    """Read the template file at `template_path` and fill its `$name`
    placeholders from `context` using `string.Template.substitute`.

    `substitute` raises KeyError if the template references a name that is
    missing from `context`.
    """
    # will be replaced by django templates in the future
    with open(template_path, 'r', encoding='utf-8') as template_file:
        source = template_file.read()

    renderer = Template(source)
    return renderer.substitute(**context)
|
165
archivebox/index/json.py
Normal file
165
archivebox/index/json.py
Normal file
|
@ -0,0 +1,165 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
import sys
|
||||
import json as pyjson
|
||||
from pathlib import Path
|
||||
|
||||
from datetime import datetime
|
||||
from typing import List, Optional, Iterator, Any
|
||||
|
||||
from .schema import Link, ArchiveResult
|
||||
from ..system import atomic_write
|
||||
from ..util import enforce_types
|
||||
from ..config import (
|
||||
VERSION,
|
||||
OUTPUT_DIR,
|
||||
FOOTER_INFO,
|
||||
GIT_SHA,
|
||||
DEPENDENCIES,
|
||||
JSON_INDEX_FILENAME,
|
||||
ARCHIVE_DIR_NAME,
|
||||
ANSI
|
||||
)
|
||||
|
||||
|
||||
# Static metadata placed at the top of the main JSON index; the per-run fields
# (num_links, updated, last_run_cmd, links) are added by write_json_main_index.
MAIN_INDEX_HEADER = {
    'info': 'This is an index of site data archived by ArchiveBox: The self-hosted web archive.',
    'schema': 'archivebox.index.json',
    'copyright_info': FOOTER_INFO,
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/pirate/ArchiveBox/wiki',
        'source': 'https://github.com/pirate/ArchiveBox',
        'issues': 'https://github.com/pirate/ArchiveBox/issues',
        'dependencies': DEPENDENCIES,
    },
}
|
||||
|
||||
### Main Links Index
|
||||
|
||||
@enforce_types
def parse_json_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """parse an archive index json file and yield the links it contains

    For each entry the loader tries, in order:
      1. the entry exactly as stored in the main index,
      2. the link's own detail index under `<out_dir>/<ARCHIVE_DIR_NAME>/<timestamp>`,
      3. the main index entry again, guessing any missing fields.
    Entries for which every attempt raises KeyError are reported on stdout
    and skipped. Nothing is yielded if the main index file does not exist.

    Args:
        out_dir: directory that contains the main index and the archive folder.

    Yields:
        Link objects only (never None).
    """
    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(index_path):
        return ()

    # load everything up front so the file is closed before the first yield
    with open(index_path, 'r', encoding='utf-8') as f:
        links = pyjson.load(f)['links']

    for link_json in links:
        link = None
        # stays None if the entry has no 'timestamp', so the message below is always printable
        detail_index_path = None
        try:
            link = Link.from_json(link_json)
        except KeyError:
            try:
                # look next to the index that is being read, not in the global default dir
                detail_index_path = Path(out_dir) / ARCHIVE_DIR_NAME / link_json['timestamp']
                link = parse_json_link_details(str(detail_index_path))
            except KeyError:
                link = None

            if link is None:
                # as a last effort, try to guess the missing values out of existing ones
                try:
                    link = Link.from_json(link_json, guess=True)
                except KeyError:
                    print(" {lightyellow}! Failed to load the index.json from {}{reset}".format(detail_index_path, **ANSI))
                    continue

        yield link

    return ()
|
||||
|
||||
@enforce_types
def write_json_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Write the main JSON index (header, run metadata and all links) into `out_dir`.

    The first link is spot-checked with assertions before anything is written.
    """
    assert isinstance(links, List), 'Links must be a list, not a generator.'

    if links:
        first = links[0]
        assert isinstance(first.history, dict)
        assert isinstance(first.sources, list)

        if first.history.get('title'):
            assert isinstance(first.history['title'][0], ArchiveResult)

        if first.sources:
            assert isinstance(first.sources[0], str)

    index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)

    main_index_json = dict(MAIN_INDEX_HEADER)
    main_index_json['num_links'] = len(links)
    main_index_json['updated'] = datetime.now()
    main_index_json['last_run_cmd'] = sys.argv
    main_index_json['links'] = links

    atomic_write(index_path, main_index_json)
|
||||
|
||||
|
||||
### Link Details Index
|
||||
|
||||
@enforce_types
def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
    """Write the link's extended dict as the JSON index of `out_dir`
    (defaults to the link's own data folder).
    """
    target_dir = out_dir or link.link_dir
    details = link._asdict(extended=True)
    atomic_write(os.path.join(target_dir, JSON_INDEX_FILENAME), details)
|
||||
|
||||
|
||||
@enforce_types
def parse_json_link_details(out_dir: str, guess: Optional[bool]=False) -> Optional[Link]:
    """Load the Link stored in the JSON index of the given data folder.

    Returns None when the index file does not exist or is not valid JSON.
    Errors raised by `Link.from_json` (e.g. KeyError for missing fields)
    are not caught here.
    """
    existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
    if not os.path.exists(existing_index):
        return None

    with open(existing_index, 'r', encoding='utf-8') as index_file:
        try:
            return Link.from_json(pyjson.load(index_file), guess)
        except pyjson.JSONDecodeError:
            return None
|
||||
|
||||
|
||||
@enforce_types
def parse_json_links_details(out_dir: str) -> Iterator[Link]:
    """Scan every folder in the archive dir and yield the Link parsed from
    each one that has a loadable index.json.
    """
    archive_root = os.path.join(out_dir, ARCHIVE_DIR_NAME)

    for entry in os.scandir(archive_root):
        if not entry.is_dir(follow_symlinks=True):
            continue
        if not os.path.exists(os.path.join(entry.path, 'index.json')):
            continue

        try:
            link = parse_json_link_details(entry.path)
        except KeyError:
            # index is missing required fields: skip this folder
            link = None

        if link:
            yield link
|
||||
|
||||
|
||||
|
||||
### Helpers
|
||||
|
||||
class ExtendedEncoder(pyjson.JSONEncoder):
    """
    JSON encoder with fallbacks for values the stock encoder rejects:
    objects exposing `_asdict()`, bytes, datetimes, exceptions and dict views.
    """

    def default(self, obj):
        # objects that know how to describe themselves as a dict
        if hasattr(obj, '_asdict'):
            return obj._asdict()

        if isinstance(obj, bytes):
            return obj.decode()

        if isinstance(obj, datetime):
            return obj.isoformat()

        if isinstance(obj, Exception):
            return '{}: {}'.format(obj.__class__.__name__, obj)

        # dict views are serialized as plain sequences
        if obj.__class__.__name__ in ('dict_items', 'dict_keys', 'dict_values'):
            return tuple(obj)

        # anything else: defer to the base class, which raises TypeError
        return super().default(obj)
|
||||
|
||||
|
||||
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True, cls=ExtendedEncoder) -> str:
    """Serialize `obj` to a JSON string.

    Args:
        obj: the value to serialize.
        indent: indentation width passed to `json.dumps` (None for compact output).
        sort_keys: whether object keys are emitted in sorted order.
        cls: the JSONEncoder subclass to use; defaults to ExtendedEncoder.

    Returns:
        The JSON text.
    """
    # honor the caller-supplied encoder instead of always using ExtendedEncoder
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=cls)
|
||||
|
431
archivebox/index/schema.py
Normal file
431
archivebox/index/schema.py
Normal file
|
@ -0,0 +1,431 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
|
||||
from datetime import datetime, timedelta
|
||||
|
||||
from typing import List, Dict, Any, Optional, Union
|
||||
|
||||
from dataclasses import dataclass, asdict, field, fields
|
||||
|
||||
|
||||
from ..system import get_dir_size
|
||||
|
||||
from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME
|
||||
|
||||
class ArchiveError(Exception):
    """Exception carrying a message plus an optional `hints` value.

    `hints` is stored unchanged on the instance and defaults to None.
    """

    def __init__(self, message, hints=None):
        super(ArchiveError, self).__init__(message)
        self.hints = hints
|
||||
|
||||
# a link represented as a plain dict
LinkDict = Dict[str, Any]

# the value an ArchiveResult may hold in `output`: a str, an Exception, or None
ArchiveOutput = Union[str, Exception, None]
|
||||
|
||||
@dataclass(frozen=True)
class ArchiveResult:
    """Immutable record of one archive-method run stored in a link's history.

    Every instance is validated by `typecheck()` as soon as it is constructed.
    """
    cmd: List[str]                # list of non-empty strings (may be empty)
    pwd: Optional[str]            # None or a non-empty string
    cmd_version: Optional[str]    # None or a non-empty string
    output: ArchiveOutput         # None, a non-empty string, or an Exception
    status: str                   # non-empty string
    start_ts: datetime
    end_ts: datetime
    schema: str = 'ArchiveResult'

    def __post_init__(self):
        self.typecheck()

    def _asdict(self):
        """Return all fields as a plain dict (used by the JSON encoder)."""
        return asdict(self)

    def typecheck(self) -> None:
        """Assert that every field has the expected type and is non-empty where required."""
        assert self.schema == self.__class__.__name__
        assert isinstance(self.status, str) and self.status
        assert isinstance(self.start_ts, datetime)
        assert isinstance(self.end_ts, datetime)
        assert isinstance(self.cmd, list)
        assert all(isinstance(arg, str) and arg for arg in self.cmd)
        assert self.pwd is None or isinstance(self.pwd, str) and self.pwd
        assert self.cmd_version is None or isinstance(self.cmd_version, str) and self.cmd_version
        assert self.output is None or isinstance(self.output, (str, Exception))
        if isinstance(self.output, str):
            assert self.output

    @classmethod
    def guess_ts(_cls, dict_info):
        """Derive (start_ts, end_ts) from a dict holding 'timestamp' and 'duration' (seconds)."""
        from ..util import parse_date
        parsed_timestamp = parse_date(dict_info["timestamp"])
        start_ts = parsed_timestamp
        end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
        return start_ts, end_ts

    @classmethod
    def from_json(cls, json_info, guess=False):
        """Build an ArchiveResult from a decoded JSON dict.

        Keys that are not dataclass fields are ignored. With `guess=True`,
        missing `start_ts`/`end_ts`, `pwd`, `cmd_version` and `cmd` are filled
        with derived or placeholder values; otherwise a missing required key
        raises KeyError. A `cmd` given as a single string is wrapped in a list.
        """
        from ..util import parse_date

        field_names = cls.field_names()
        info = {
            key: val
            for key, val in json_info.items()
            if key in field_names
        }
        if guess:
            keys = info.keys()
            if "start_ts" not in keys:
                info["start_ts"], info["end_ts"] = cls.guess_ts(json_info)
            else:
                info['start_ts'] = parse_date(info['start_ts'])
                info['end_ts'] = parse_date(info['end_ts'])
            if "pwd" not in keys:
                info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"])
            if "cmd_version" not in keys:
                info["cmd_version"] = "Undefined"
            if "cmd" not in keys:
                info["cmd"] = []
        else:
            info['start_ts'] = parse_date(info['start_ts'])
            info['end_ts'] = parse_date(info['end_ts'])
            info['cmd_version'] = info.get('cmd_version')
        if type(info["cmd"]) is str:
            info["cmd"] = [info["cmd"]]
        return cls(**info)

    def to_dict(self, *keys) -> dict:
        """Return the fields as a dict, restricted to `keys` when any are given."""
        if keys:
            return {k: v for k, v in asdict(self).items() if k in keys}
        return asdict(self)

    def to_json(self, indent=4, sort_keys=True) -> str:
        """Serialize this result to a JSON string."""
        from .json import to_json

        return to_json(self, indent=indent, sort_keys=sort_keys)

    def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
        """Render this result as one delimited row (all fields when `cols` is None)."""
        from .csv import to_csv

        # the helper's parameter is named `cols`; the previous `csv_col=` keyword
        # made every call fail with a TypeError
        return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

    @classmethod
    def field_names(cls):
        """Return the dataclass field names in declaration order."""
        return [f.name for f in fields(cls)]

    @property
    def duration(self) -> int:
        """Whole seconds elapsed between start_ts and end_ts."""
        # total_seconds() includes whole days, which timedelta.seconds drops
        return int((self.end_ts - self.start_ts).total_seconds())
|
||||
|
||||
@dataclass(frozen=True)
class Link:
    """Immutable record of one URL in the index, with its archive history.

    Every instance is validated by `typecheck()` as soon as it is constructed.
    Two links compare equal when their urls are equal; ordering compares the
    timestamps as floats.
    """
    timestamp: str
    url: str
    title: Optional[str]
    tags: Optional[str]
    sources: List[str]
    history: Dict[str, List[ArchiveResult]] = field(default_factory=dict)
    updated: Optional[datetime] = None
    schema: str = 'Link'

    def __str__(self) -> str:
        return '[{}] {} "{}"'.format(self.timestamp, self.base_url, self.title)

    def __post_init__(self):
        self.typecheck()

    def overwrite(self, **kwargs):
        """Return a new Link with the given fields replaced (this one is unchanged)."""
        merged = self._asdict()
        merged.update(kwargs)
        return Link(**merged)

    def __eq__(self, other):
        if isinstance(other, Link):
            return self.url == other.url
        return NotImplemented

    def __gt__(self, other):
        if not isinstance(other, Link):
            return NotImplemented
        if self.timestamp and other.timestamp:
            return float(self.timestamp) > float(other.timestamp)
        return None

    def typecheck(self) -> None:
        """Assert field types and values; log the offending link to stderr and re-raise on failure."""
        from ..config import stderr, ANSI
        try:
            assert self.schema == self.__class__.__name__
            assert isinstance(self.timestamp, str) and self.timestamp
            assert self.timestamp.replace('.', '').isdigit()
            assert isinstance(self.url, str) and '://' in self.url
            assert self.updated is None or isinstance(self.updated, datetime)
            assert self.title is None or (isinstance(self.title, str) and self.title)
            assert self.tags is None or isinstance(self.tags, str)
            assert isinstance(self.sources, list)
            assert all(isinstance(source, str) and source for source in self.sources)
            assert isinstance(self.history, dict)
            for method, results in self.history.items():
                assert isinstance(method, str) and method
                assert isinstance(results, list)
                assert all(isinstance(result, ArchiveResult) for result in results)
        except Exception:
            stderr('{red}[X] Error while loading link! [{}] {} "{}"{reset}'.format(self.timestamp, self.url, self.title, **ANSI))
            raise

    def _asdict(self, extended=False):
        """Return the stored fields as a dict; with `extended=True` also include
        every derived property and the latest/canonical output maps.
        """
        info = {
            'schema': 'Link',
            'url': self.url,
            'title': self.title or None,
            'timestamp': self.timestamp,
            'updated': self.updated or None,
            'tags': self.tags or None,
            'sources': self.sources or [],
            'history': self.history or {},
        }
        if not extended:
            return info

        # locations
        info['link_dir'] = self.link_dir
        info['archive_path'] = self.archive_path

        # url parts
        info['hash'] = self.url_hash
        info['base_url'] = self.base_url
        info['scheme'] = self.scheme
        info['domain'] = self.domain
        info['path'] = self.path
        info['basename'] = self.basename
        info['extension'] = self.extension
        info['is_static'] = self.is_static

        # dates
        info['bookmarked_date'] = self.bookmarked_date
        info['updated_date'] = self.updated_date
        info['oldest_archive_date'] = self.oldest_archive_date
        info['newest_archive_date'] = self.newest_archive_date

        # archive status
        info['is_archived'] = self.is_archived
        info['num_outputs'] = self.num_outputs
        info['num_failures'] = self.num_failures

        info['latest'] = self.latest_outputs()
        info['canonical'] = self.canonical_outputs()
        return info

    @classmethod
    def from_json(cls, json_info, guess=False):
        """Build a Link from a decoded JSON dict.

        Keys that are not dataclass fields are ignored, `updated` is passed
        through `parse_date`, and every history entry is converted with
        `ArchiveResult.from_json(entry, guess)`.
        """
        from ..util import parse_date

        allowed = cls.field_names()
        info = {key: val for key, val in json_info.items() if key in allowed}
        info['updated'] = parse_date(info.get('updated'))
        info['sources'] = info.get('sources') or []

        cast_history = {}
        for method, method_history in (info.get('history') or {}).items():
            results = cast_history[method] = []
            for json_result in method_history:
                assert isinstance(json_result, dict), 'Items in Link["history"][method] must be dicts'
                results.append(ArchiveResult.from_json(json_result, guess))

        info['history'] = cast_history
        return cls(**info)

    def to_json(self, indent=4, sort_keys=True) -> str:
        """Serialize this link to a JSON string."""
        from .json import to_json

        return to_json(self, indent=indent, sort_keys=sort_keys)

    def to_csv(self, cols: Optional[List[str]]=None, separator: str=',', ljust: int=0) -> str:
        """Render this link as one delimited row (all fields when `cols` is None)."""
        from .csv import to_csv

        columns = cols or self.field_names()
        return to_csv(self, cols=columns, separator=separator, ljust=ljust)

    @classmethod
    def field_names(cls):
        """Return the dataclass field names in declaration order."""
        return [f.name for f in fields(cls)]

    @property
    def link_dir(self) -> str:
        """Data folder of this link: the configured ARCHIVE_DIR joined with the timestamp."""
        from ..config import CONFIG
        return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)

    @property
    def archive_path(self) -> str:
        """Relative path `<ARCHIVE_DIR_NAME>/<timestamp>`."""
        from ..config import ARCHIVE_DIR_NAME
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    @property
    def archive_size(self) -> float:
        """First element of `get_dir_size(archive_path)`, or 0 if that call fails."""
        try:
            return get_dir_size(self.archive_path)[0]
        except Exception:
            return 0

    ### URL Helpers
    @property
    def url_hash(self):
        from ..util import hashurl as _hashurl

        return _hashurl(self.url)

    @property
    def scheme(self) -> str:
        from ..util import scheme as _scheme
        return _scheme(self.url)

    @property
    def extension(self) -> str:
        from ..util import extension as _extension
        return _extension(self.url)

    @property
    def domain(self) -> str:
        from ..util import domain as _domain
        return _domain(self.url)

    @property
    def path(self) -> str:
        from ..util import path as _path
        return _path(self.url)

    @property
    def basename(self) -> str:
        from ..util import basename as _basename
        return _basename(self.url)

    @property
    def base_url(self) -> str:
        from ..util import base_url as _base_url
        return _base_url(self.url)

    ### Pretty Printing Helpers
    @property
    def bookmarked_date(self) -> Optional[str]:
        """Formatted date for the timestamp when it is a plausible unix time
        (positive and at most 30 days in the future), the raw timestamp when it
        is numeric but out of that range, otherwise None.
        """
        from ..util import ts_to_date

        latest_plausible = (datetime.now() + timedelta(days=30)).timestamp()

        if not (self.timestamp and self.timestamp.replace('.', '').isdigit()):
            return None

        if 0 < float(self.timestamp) < latest_plausible:
            return ts_to_date(datetime.fromtimestamp(float(self.timestamp)))
        return str(self.timestamp)

    @property
    def updated_date(self) -> Optional[str]:
        from ..util import ts_to_date
        if self.updated:
            return ts_to_date(self.updated)
        return None

    @property
    def archive_dates(self) -> List[datetime]:
        """start_ts of every result in the history, grouped by method."""
        dates = []
        for method in self.history.keys():
            for result in self.history[method]:
                dates.append(result.start_ts)
        return dates

    @property
    def oldest_archive_date(self) -> Optional[datetime]:
        return min(self.archive_dates, default=None)

    @property
    def newest_archive_date(self) -> Optional[datetime]:
        return max(self.archive_dates, default=None)

    ### Archive Status Helpers
    @property
    def num_outputs(self) -> int:
        """Number of archive methods whose latest output is truthy."""
        return sum(1 for output in self.latest_outputs().values() if output)

    @property
    def num_failures(self) -> int:
        """Number of history results whose status is 'failed'."""
        failures = 0
        for method in self.history.keys():
            for result in self.history[method]:
                if result.status == 'failed':
                    failures += 1
        return failures

    @property
    def is_static(self) -> bool:
        from ..util import is_static_file
        return is_static_file(self.url)

    @property
    def is_archived(self) -> bool:
        """True when at least one known output path exists in the link's folder under ARCHIVE_DIR."""
        from ..config import ARCHIVE_DIR
        from ..util import domain

        data_dir = os.path.join(ARCHIVE_DIR, self.timestamp)
        candidates = (
            domain(self.url),
            'output.pdf',
            'screenshot.png',
            'output.html',
            'media',
        )
        for candidate in candidates:
            if os.path.exists(os.path.join(data_dir, candidate)):
                return True
        return False

    def latest_outputs(self, status: str=None) -> Dict[str, ArchiveOutput]:
        """Return, per archive method, the output of the most recent history
        entry that has a truthy output (and the given `status`, if one is
        passed); None for methods with no such entry.
        """
        ARCHIVE_METHODS = (
            'title', 'favicon', 'wget', 'warc', 'pdf',
            'screenshot', 'dom', 'git', 'media', 'archive_org',
        )
        latest: Dict[str, ArchiveOutput] = {}
        for archive_method in ARCHIVE_METHODS:
            # newest first, keeping only results that produced an output
            matching = [
                result
                for result in reversed(self.history.get(archive_method) or [])
                if result.output and (status is None or result.status == status)
            ]
            latest[archive_method] = matching[0].output if matching else None

        return latest

    def canonical_outputs(self) -> Dict[str, Optional[str]]:
        """Return the expected output location for each archive method."""
        from ..extractors.wget import wget_output_path

        canonical = {
            'index_path': 'index.html',
            'favicon_path': 'favicon.ico',
            'google_favicon_path': 'https://www.google.com/s2/favicons?domain={}'.format(self.domain),
            'wget_path': wget_output_path(self),
            'warc_path': 'warc',
            'pdf_path': 'output.pdf',
            'screenshot_path': 'screenshot.png',
            'dom_path': 'output.html',
            'archive_org_path': 'https://web.archive.org/web/{}'.format(self.base_url),
            'git_path': 'git',
            'media_path': 'media',
        }
        if not self.is_static:
            return canonical

        # static files: the wget, pdf, screenshot and dom entries all point at
        # the single file reported by wget_output_path
        static_path = wget_output_path(self)
        canonical['title'] = self.basename
        for key in ('wget_path', 'pdf_path', 'screenshot_path', 'dom_path'):
            canonical[key] = static_path
        return canonical
|
||||
|
||||
|
90
archivebox/index/sql.py
Normal file
90
archivebox/index/sql.py
Normal file
|
@ -0,0 +1,90 @@
|
|||
__package__ = 'archivebox.index'
|
||||
|
||||
from io import StringIO
|
||||
from typing import List, Tuple, Iterator
|
||||
|
||||
from .schema import Link
|
||||
from ..util import enforce_types
|
||||
from ..config import setup_django, OUTPUT_DIR
|
||||
|
||||
|
||||
### Main Links Index
|
||||
|
||||
@enforce_types
|
||||
def parse_sql_main_index(out_dir: str=OUTPUT_DIR) -> Iterator[Link]:
    """Return an iterator that builds a Link from each Snapshot row in the SQL index.

    Django is set up (with the DB check enabled) before the models are imported.
    """
    setup_django(out_dir, check_db=True)
    from core.models import Snapshot

    snapshots = Snapshot.objects.all()
    return (
        Link.from_json(snapshot.as_json(*Snapshot.keys))
        for snapshot in snapshots
    )
|
||||
|
||||
@enforce_types
|
||||
def remove_from_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Delete the Snapshot rows matching each link's URL, all inside one transaction."""
    setup_django(out_dir, check_db=True)
    from core.models import Snapshot
    from django.db import transaction

    with transaction.atomic():
        for doomed in links:
            matching_rows = Snapshot.objects.filter(url=doomed.url)
            matching_rows.delete()
|
||||
|
||||
@enforce_types
|
||||
def write_sql_main_index(links: List[Link], out_dir: str=OUTPUT_DIR) -> None:
    """Create or update one Snapshot row per link (keyed on URL) inside one transaction.

    Only the link fields whose names appear in Snapshot.keys are written.
    """
    setup_django(out_dir, check_db=True)
    from core.models import Snapshot
    from django.db import transaction

    with transaction.atomic():
        for link in links:
            fields = {}
            for field_name, field_value in link._asdict().items():
                if field_name in Snapshot.keys:
                    fields[field_name] = field_value
            Snapshot.objects.update_or_create(url=link.url, defaults=fields)
|
||||
|
||||
@enforce_types
|
||||
def write_sql_link_details(link: Link, out_dir: str=OUTPUT_DIR) -> None:
    """Copy the link's title and tags onto its Snapshot row and save it.

    The row is looked up by url + timestamp with ``.get()``.
    """
    setup_django(out_dir, check_db=True)
    from core.models import Snapshot
    from django.db import transaction

    with transaction.atomic():
        row = Snapshot.objects.get(url=link.url, timestamp=link.timestamp)
        row.title, row.tags = link.title, link.tags
        row.save()
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def list_migrations(out_dir: str=OUTPUT_DIR) -> List[Tuple[bool, str]]:
    """Run `showmigrations --list` and parse its output.

    Returns:
        One ``(is_applied, migration_name)`` tuple per output line containing a
        ``]``; a line counts as applied when an ``X`` appears before that ``]``.
    """
    setup_django(out_dir, check_db=False)
    from django.core.management import call_command

    captured = StringIO()
    call_command("showmigrations", list=True, stdout=captured)
    captured.seek(0)

    parsed = []
    for raw_line in captured:
        entry = raw_line.strip()
        if not entry or ']' not in entry:
            continue
        checkbox, _, label = entry.partition(']')
        parsed.append(('X' in checkbox, label.strip()))

    return parsed
|
||||
|
||||
@enforce_types
|
||||
def apply_migrations(out_dir: str=OUTPUT_DIR) -> List[str]:
    """Run `makemigrations` then `migrate`, returning migrate's non-blank output lines.

    The output of `makemigrations` is captured and discarded.
    """
    setup_django(out_dir, check_db=False)
    from django.core.management import call_command

    discarded, captured = StringIO(), StringIO()
    call_command("makemigrations", interactive=False, stdout=discarded)
    call_command("migrate", interactive=False, stdout=captured)
    captured.seek(0)

    output_lines = []
    for raw_line in captured.readlines():
        cleaned = raw_line.strip()
        if cleaned:
            output_lines.append(cleaned)
    return output_lines
|
||||
|
||||
@enforce_types
|
||||
def get_admins(out_dir: str=OUTPUT_DIR) -> List[str]:
    """Look up the Django users flagged as superusers.

    NOTE(review): the annotation says List[str], but the value returned is
    whatever User.objects.filter() produces -- confirm which one callers rely on
    before tightening either side.
    """
    setup_django(out_dir, check_db=False)
    from django.contrib.auth.models import User

    superusers = User.objects.filter(is_superuser=True)
    return superusers
|
|
@ -1,123 +0,0 @@
|
|||
"""
|
||||
In ArchiveBox, a Link represents a single entry that we track in the
|
||||
json index. All links pass through all archiver functions and the latest,
|
||||
most up-to-date canonical output for each is stored in "latest".
|
||||
|
||||
Link {
|
||||
timestamp: str, (how we uniquely id links)
|
||||
url: str,
|
||||
title: str,
|
||||
tags: str,
|
||||
sources: [str],
|
||||
history: {
|
||||
pdf: [
|
||||
{start_ts, end_ts, duration, cmd, pwd, status, output},
|
||||
...
|
||||
],
|
||||
...
|
||||
},
|
||||
}
|
||||
"""
|
||||
|
||||
from html import unescape
|
||||
from collections import OrderedDict
|
||||
|
||||
from util import (
|
||||
scheme,
|
||||
merge_links,
|
||||
check_link_structure,
|
||||
check_links_structure,
|
||||
)
|
||||
|
||||
from config import (
|
||||
URL_BLACKLIST,
|
||||
)
|
||||
|
||||
def validate_links(links):
    """Filter, dedupe, sort and clean a collection of link dicts.

    Exits the process with status 1 when no links survive the filtering.
    Titles are stripped and HTML-unescaped (or set to None when falsy).
    """
    check_links_structure(links)

    pipeline = (
        archivable_links,   # remove chrome://, about:, mailto: etc.
        uniquefied_links,   # merge/dedupe duplicate timestamps & urls
        sorted_links,       # deterministically sort the links based on timestamp, url
    )
    for stage in pipeline:
        links = stage(links)

    if not links:
        print('[X] No links found :(')
        raise SystemExit(1)

    for link in links:
        raw_title = link['title']
        link['title'] = unescape(raw_title.strip()) if raw_title else None
        check_link_structure(link)

    return list(links)
|
||||
|
||||
|
||||
def archivable_links(links):
    """Yield only links with an http/https/ftp URL that URL_BLACKLIST (if set) does not match."""
    allowed_schemes = ('http', 'https', 'ftp')
    for link in links:
        url = link['url']
        has_valid_scheme = scheme(url) in allowed_schemes
        is_blacklisted = bool(URL_BLACKLIST and URL_BLACKLIST.match(url))
        if has_valid_scheme and not is_blacklisted:
            yield link
|
||||
|
||||
|
||||
def uniquefied_links(sorted_links):
    """Merge links that share a URL and give every remaining link a unique timestamp.

    URLs are compared by a "fuzzy" key: lowercased, stripped, without a trailing
    slash (or without the slash before a query string) and without a leading
    ``www.``. Links with the same key are combined with merge_links(); the
    survivors then get collision-free timestamps from lowest_uniq_timestamp().

    Returns:
        A view of the deduplicated link dicts, in first-seen order.
    """

    def fuzzy_key(url):
        key = url.lower().strip()
        # endswith() instead of key[-1]: an empty/whitespace-only url must not raise
        if key.endswith('/'):
            key = key[:-1]
        else:
            key = key.replace('/?', '?')
        return key.replace('://www.', '://', 1)

    unique_urls = OrderedDict()
    for link in sorted_links:
        key = fuzzy_key(link['url'])
        if key in unique_urls:
            # merge with any other links that share the same url
            link = merge_links(unique_urls[key], link)
        unique_urls[key] = link

    unique_timestamps = OrderedDict()
    for link in unique_urls.values():
        link['timestamp'] = lowest_uniq_timestamp(unique_timestamps, link['timestamp'])
        unique_timestamps[link['timestamp']] = link

    return unique_timestamps.values()
|
||||
|
||||
|
||||
def sorted_links(links):
    """Return a new list of links ordered descending by (whole-number timestamp string, url)."""

    def ordering(link):
        whole_timestamp = link['timestamp'].split('.', 1)[0]
        return (whole_timestamp, link['url'])

    ordered = list(links)
    ordered.sort(key=ordering, reverse=True)
    return ordered
|
||||
|
||||
|
||||
def links_after_timestamp(links, timestamp=None):
    """Yield the links whose timestamp is less than or equal to the resume timestamp.

    Args:
        links: iterable of link dicts with a 'timestamp' entry.
        timestamp: resume value; when falsy every link is yielded unchanged.

    A resume value that is not a valid number is reported once and nothing is
    yielded. A link whose own timestamp is not a valid number is reported and
    skipped.
    """
    if not timestamp:
        yield from links
        return

    # parse the resume value once instead of on every iteration
    try:
        cutoff = float(timestamp)
    except (ValueError, TypeError):
        print('Resume value and all timestamp values must be valid numbers.')
        return

    for link in links:
        try:
            link_ts = float(link['timestamp'])
        except (ValueError, TypeError):
            print('Resume value and all timestamp values must be valid numbers.')
            continue
        if link_ts <= cutoff:
            yield link
|
||||
|
||||
|
||||
def lowest_uniq_timestamp(used_timestamps, timestamp):
    """Resolve duplicate timestamps by appending a decimal suffix.

    The part before the first '.' is tried on its own, then with .0, .1, .2, ...
    until a value not present in used_timestamps is found.
    """
    base = timestamp.split('.')[0]

    # first try 152323423 before 152323423.0
    if base not in used_timestamps:
        return base

    suffix = 0
    while True:
        candidate = '{}.{}'.format(base, suffix)
        if candidate not in used_timestamps:
            return candidate
        suffix += 1
|
||||
|
||||
|
547
archivebox/logging_util.py
Normal file
547
archivebox/logging_util.py
Normal file
|
@ -0,0 +1,547 @@
|
|||
__package__ = 'archivebox'
|
||||
|
||||
import re
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from multiprocessing import Process
|
||||
|
||||
from datetime import datetime
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .index.schema import Link, ArchiveResult
|
||||
|
||||
from .util import enforce_types
|
||||
from .config import (
|
||||
ConfigDict,
|
||||
PYTHON_ENCODING,
|
||||
ANSI,
|
||||
IS_TTY,
|
||||
TERM_WIDTH,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR_NAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
stderr,
|
||||
)
|
||||
|
||||
@dataclass
class RuntimeStats:
    """mutable stats counter for logging archiving timing info to CLI output"""

    # per-link outcome counters
    skipped: int = 0
    succeeded: int = 0
    failed: int = 0

    # parsing stage start/end times
    parse_start_ts: Optional[datetime] = None
    parse_end_ts: Optional[datetime] = None

    # indexing stage start/end times
    index_start_ts: Optional[datetime] = None
    index_end_ts: Optional[datetime] = None

    # archiving stage start/end times
    archiving_start_ts: Optional[datetime] = None
    archiving_end_ts: Optional[datetime] = None
|
||||
|
||||
# globals are bad, mmkay
|
||||
_LAST_RUN_STATS = RuntimeStats()
|
||||
|
||||
|
||||
|
||||
class SmartFormatter(argparse.HelpFormatter):
    """Help formatter that keeps explicit newlines found in argparse help strings"""

    def _split_lines(self, text, width):
        # text without newlines gets argparse's normal wrapping
        if '\n' not in text:
            return argparse.HelpFormatter._split_lines(self, text, width)
        return text.splitlines()
|
||||
|
||||
|
||||
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Tell the user they passed stdin to a command that doesn't accept it

    Exits with status 1 when a non-tty stdin contains non-whitespace text;
    otherwise returns without doing anything.
    """
    if not stdin or stdin.isatty():
        return

    piped_text = stdin.read().strip()
    if not piped_text:
        return

    stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
    stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
    stderr()
    raise SystemExit(1)
|
||||
|
||||
|
||||
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    """accept any standard input and return it as a string or None

    None is returned when stdin is missing, is a tty, or holds only whitespace.
    """
    if stdin and not stdin.isatty():
        piped_text = stdin.read().strip()
        return piped_text if piped_text else None
    return None
|
||||
|
||||
|
||||
class TimedProgress:
    """Show a progress bar and measure elapsed time until .end() is called

    When SHOW_PROGRESS is enabled the bar is drawn by a separate Process running
    progress_bar(); start/end times are recorded in ``self.stats`` either way.
    """

    def __init__(self, seconds, prefix=''):
        from .config import SHOW_PROGRESS
        self.SHOW_PROGRESS = SHOW_PROGRESS
        # stays None when no progress bar process is running
        self.p = None
        if self.SHOW_PROGRESS:
            self.p = Process(target=progress_bar, args=(seconds, prefix))
            self.p.start()

        self.stats = {'start_ts': datetime.now(), 'end_ts': None}

    def end(self):
        """immediately end progress, clear the progressbar line, and save end_ts

        Safe to call more than once: the progress process is only terminated
        and closed the first time.
        """

        end_ts = datetime.now()
        self.stats['end_ts'] = end_ts

        if self.SHOW_PROGRESS:
            # detach the process first so a repeated end() never touches a
            # closed Process object (which would raise ValueError)
            proc, self.p = self.p, None
            if proc is not None:
                proc.terminate()
                proc.join()
                proc.close()

            # clear whole terminal line
            try:
                sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
            except (IOError, BrokenPipeError):
                # ignore when the parent proc has stopped listening to our stdout
                pass
|
||||
|
||||
|
||||
@enforce_types
|
||||
def progress_bar(seconds: int, prefix: str='') -> None:
    """show timer in the form of progress bar, with percentage and seconds remaining

    Args:
        seconds: total duration the bar represents.
        prefix: text printed before the bar on every redraw.

    The bar width is recomputed from TERM_WIDTH() on every tick and is never
    allowed to drop below one chunk, so narrow or shrinking terminals cannot
    cause a division by zero.
    """
    chunk = '█' if PYTHON_ENCODING == 'UTF-8' else '#'
    last_width = TERM_WIDTH()
    # number of progress chunks to show (aka max bar width), at least 1
    chunks = max(1, last_width - len(prefix) - 20)
    try:
        for s in range(seconds * chunks):
            max_width = TERM_WIDTH()
            if max_width < last_width:
                # when the terminal size is shrunk, we have to write a newline
                # otherwise the progress bar will keep wrapping incorrectly
                sys.stdout.write('\r\n')
                sys.stdout.flush()
            chunks = max(1, max_width - len(prefix) - 20)
            progress = s / chunks / seconds * 100
            bar_width = round(progress/(100/chunks))
            last_width = max_width

            # ████████████████████ 0.9% (1/60sec)
            sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
                prefix,
                ANSI['green'],
                (chunk * bar_width).ljust(chunks),
                ANSI['reset'],
                round(progress, 1),
                round(s/chunks),
                seconds,
            ))
            sys.stdout.flush()
            time.sleep(1 / chunks)

        # ██████████████████████████████████ 100.0% (60/60sec)
        sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)\n'.format(
            prefix,
            ANSI['red'],
            chunk * chunks,
            ANSI['reset'],
            100.0,
            seconds,
            seconds,
        ))
        sys.stdout.flush()
    except (KeyboardInterrupt, BrokenPipeError):
        # finish the current line so later output starts cleanly
        print()
|
||||
|
||||
|
||||
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Union[str, IO, None], pwd: str):
    """Print a header to stderr describing the CLI command being run.

    Args:
        subcommand: name of the archivebox subcommand.
        subcommand_args: arguments passed to the subcommand.
        stdin: the stdin stream, already-read stdin text, or None.
        pwd: working directory shown on the second line.

    The ' < /dev/stdin' hint is shown when stdin is a non-tty stream or
    non-empty text; None never crashes and shows no hint.
    """
    from .config import VERSION, ANSI
    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))

    if stdin is None:
        is_piped = False
    elif hasattr(stdin, 'isatty'):
        is_piped = not stdin.isatty()
    else:
        # plain text (e.g. stdin contents that were already read)
        is_piped = bool(stdin)
    stdin_hint = ' < /dev/stdin' if is_piped else ''

    stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{stdin_hint}{reset}'.format(
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        VERSION=VERSION,
        cmd=cmd,
        stdin_hint=stdin_hint,
        **ANSI,
    ))
    stderr('{black} > {pwd}{reset}'.format(pwd=pwd, **ANSI))
    stderr()
|
||||
|
||||
### Parsing Stage
|
||||
|
||||
|
||||
def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
    """Record the parse start time and print how many links are being added.

    A string ``urls`` is counted by its number of newline-separated lines.
    """
    started_at = datetime.now()
    _LAST_RUN_STATS.parse_start_ts = started_at

    if isinstance(urls, list):
        num_urls = len(urls)
    else:
        num_urls = len(urls.split('\n'))
    index_only_note = ' (index only)' if index_only else ''

    print('{green}[+] [{}] Adding {} links to index (crawl depth={}){}...{reset}'.format(
        started_at.strftime('%Y-%m-%d %H:%M:%S'),
        num_urls,
        depth,
        index_only_note,
        **ANSI,
    ))
|
||||
|
||||
def log_source_saved(source_file: str):
    """Print where the verbatim input was saved, using only the file's last path component."""
    filename = source_file.rpartition('/')[2]
    print(' > Saved verbatim input to {}/{}'.format(SOURCES_DIR_NAME, filename))
|
||||
|
||||
def log_parsing_finished(num_parsed: int, parser_name: str):
    """Record the parse end time and print how many URLs the named parser found."""
    finished_at = datetime.now()
    _LAST_RUN_STATS.parse_end_ts = finished_at
    summary = ' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name)
    print(summary)
|
||||
|
||||
def log_deduping_finished(num_new_links: int):
    """Print how many of the parsed URLs were not already in the index."""
    summary = ' > Found {} new URLs not already in index'.format(num_new_links)
    print(summary)
|
||||
|
||||
|
||||
def log_crawl_started(new_links):
    """Print a notice that a one-hop crawl of len(new_links) sites is starting."""
    num_sites = len(new_links)
    message = '{lightred}[*] Starting crawl of {} sites 1 hop out from starting point{reset}'.format(num_sites, **ANSI)
    print(message)
|
||||
|
||||
### Indexing Stage
|
||||
|
||||
def log_indexing_process_started(num_links: int):
    """Record the indexing start time and print how many links are being written."""
    _LAST_RUN_STATS.index_start_ts = started_at = datetime.now()
    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')
    print()
    print('{black}[*] [{}] Writing {} links to main index...{reset}'.format(stamp, num_links, **ANSI))
|
||||
|
||||
|
||||
def log_indexing_process_finished():
    """Record the time at which indexing finished (prints nothing)."""
    _LAST_RUN_STATS.index_end_ts = datetime.now()
|
||||
|
||||
|
||||
def log_indexing_started(out_path: str):
    """Write the index path being written to stdout (no newline), only when IS_TTY is set."""
    if not IS_TTY:
        return
    sys.stdout.write(f' > {out_path}')
|
||||
|
||||
|
||||
def log_indexing_finished(out_path: str):
    """Print the finished index path, preceded by a carriage return so it overwrites the current line."""
    done_line = f'\r √ {out_path}'
    print(done_line)
|
||||
|
||||
|
||||
### Archiving Stage
|
||||
|
||||
def log_archiving_started(num_links: int, resume: Optional[float]=None):
    """Record the archiving start time and print a start (or resume) banner."""
    started_at = datetime.now()
    _LAST_RUN_STATS.archiving_start_ts = started_at
    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')

    print()
    if not resume:
        print('{green}[▶] [{}] Collecting content for {} Snapshots in archive...{reset}'.format(
            stamp,
            num_links,
            **ANSI,
        ))
        return

    print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
        stamp,
        num_links,
        resume,
        **ANSI,
    ))
|
||||
|
||||
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
    """Record the archiving end time and print where archiving paused plus how to resume.

    ``idx`` is shown one-based (idx + 1) next to the total link count.
    """
    paused_at = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = paused_at

    banner = '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
        **ANSI,
        now=paused_at.strftime('%Y-%m-%d %H:%M:%S'),
        idx=idx+1,
        timestamp=timestamp,
        total=num_links,
    )
    print()
    print(banner)
    print()
    print(' {lightred}Hint:{reset} To view your archive index, open:'.format(**ANSI))
    print(' {}/{}'.format(OUTPUT_DIR, HTML_INDEX_FILENAME))
    print(' Continue archiving where you left off by running:')
    print(' archivebox update --resume={}'.format(timestamp))
|
||||
|
||||
def log_archiving_finished(num_links: int):
    """Record the archiving end time and print the run summary.

    The duration is shown in minutes when it exceeds 60 seconds, otherwise in
    seconds; the skipped/updated/errored counters come from _LAST_RUN_STATS.
    """
    finished_at = datetime.now()
    _LAST_RUN_STATS.archiving_end_ts = finished_at
    assert _LAST_RUN_STATS.archiving_start_ts is not None
    seconds = finished_at.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()

    amount, unit = (seconds / 60, 'min') if seconds > 60 else (seconds, 'sec')
    duration = '{0:.2f} {1}'.format(amount, unit)

    print()
    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        finished_at.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        duration,
        ANSI['reset'],
    ))
    print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
    print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded))
    print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
    print()
    print(' {lightred}Hint:{reset} To view your archive index, open:'.format(**ANSI))
    print(' {}/{}'.format(OUTPUT_DIR, HTML_INDEX_FILENAME))
    print(' Or run the built-in webserver:')
    print(' archivebox server')
|
||||
|
||||
|
||||
def log_link_archiving_started(link: "Link", link_dir: str, is_new: bool):
    """Print the three-line header shown before a link is archived.

    Example layout:
        [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
        http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
        > output/archive/1478739709
    """
    if is_new:
        symbol, symbol_color, dir_marker = '+', ANSI['green'], '>'
    else:
        symbol, symbol_color, dir_marker = '√', ANSI['black'], '√'

    header = '\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'.format(
        symbol_color=symbol_color,
        symbol=symbol,
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        title=link.title or link.base_url,
        **ANSI,
    )
    print(header)
    print(' {blue}{url}{reset}'.format(url=link.url, **ANSI))
    print(' {} {}'.format(dir_marker, pretty_path(link_dir)))
|
||||
|
||||
def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats: dict):
    """Bump exactly one run-wide counter based on this link's per-method stats.

    Any failure counts the link as failed; a link whose methods were all
    skipped counts as skipped; everything else counts as succeeded.
    """
    total = sum(stats.values())

    if stats['failed'] > 0:
        outcome = 'failed'
    elif stats['skipped'] == total:
        outcome = 'skipped'
    else:
        outcome = 'succeeded'

    setattr(_LAST_RUN_STATS, outcome, getattr(_LAST_RUN_STATS, outcome) + 1)
|
||||
|
||||
|
||||
def log_archive_method_started(method: str):
    """Print the name of the archive method that is about to run."""
    line = ' > {}'.format(method)
    print(line)
|
||||
|
||||
|
||||
def log_archive_method_finished(result: "ArchiveResult"):
    """Print a failure report for an archive method result; print nothing unless it failed.

    The report quotes any command argument containing a space so the user can
    copy-paste the outputted string directly to run the cmd, and shows at most
    the first five hint lines attached to the error output.
    """
    # Prettify CMD string and make it safe to copy-paste by quoting arguments
    cmd_parts = []
    for arg in result.cmd:
        cmd_parts.append('"{}"'.format(arg) if ' ' in arg else arg)
    quoted_cmd = ' '.join(cmd_parts)

    if result.status != 'failed':
        return

    # Prettify error output hints string and limit to five lines
    hints = getattr(result.output, 'hints', None) or ()
    if hints:
        if not isinstance(hints, (list, tuple)):
            hints = hints.split('\n')
        hints = (
            ' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
            for line in hints[:5] if line.strip()
        )

    # Collect and prefix output lines with indentation
    output_lines = ['{lightred}Failed:{reset}'.format(**ANSI)]
    output_lines.append(' {reset}{} {red}{}{reset}'.format(
        result.output.__class__.__name__.replace('ArchiveError', ''),
        result.output,
        **ANSI,
    ))
    output_lines.extend(hints)
    output_lines.append('{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']))
    if result.pwd:
        output_lines.append(' cd {};'.format(result.pwd))
    output_lines.append(' {}'.format(quoted_cmd))

    print('\n'.join(' {}'.format(line) for line in output_lines if line))
    print()
|
||||
|
||||
|
||||
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
    """Print the filter type and the space-joined patterns being searched for."""
    heading = '{green}[*] Finding links in the archive index matching these {} patterns:{reset}'.format(
        filter_type,
        **ANSI,
    )
    patterns = ' '.join(filter_patterns or ())
    print(heading)
    print(' {}'.format(patterns))
|
||||
|
||||
def log_list_finished(links):
    """Print the matched links as a CSV-style table between two horizontal rules."""
    from .index.csv import links_to_csv

    rule = '---------------------------------------------------------------------------------------------------'
    table = links_to_csv(links, cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | ')

    print()
    print(rule)
    print(table)
    print(rule)
    print()
|
||||
|
||||
|
||||
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
    """Describe the pending removal and, unless ``yes`` is set, ask for confirmation.

    Args:
        links: links that matched the removal filters.
        yes: skip the interactive confirmation prompt when True.
        delete: whether the archived data folders will also be deleted.

    Raises:
        SystemExit(0): when the user declines, or the prompt is interrupted
            (Ctrl-C / EOF).
    """
    print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
    if delete:
        file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
        print(
            f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
            f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
        )
    else:
        print(
            ' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
            ' (Pass --delete if you also want to permanently delete the data folders)'
        )

    if not yes:
        print()
        print('{lightyellow}[?] Do you want to proceed with removing these {} links?{reset}'.format(len(links), **ANSI))
        try:
            answer = input(' y/[n]: ')
        except (KeyboardInterrupt, EOFError):
            raise SystemExit(0)
        # explicit comparison rather than `assert`: asserts are stripped under
        # `python -O`, which would silently skip this confirmation
        if answer.lower() != 'y':
            raise SystemExit(0)
|
||||
|
||||
def log_removal_finished(all_links: int, to_keep: int):
    """Print how many links were removed and how many remain, or that nothing matched."""
    print()
    if all_links == 0:
        print('{red}[X] No matching links found.{reset}'.format(**ANSI))
        return

    num_removed = all_links - to_keep
    print('{red}[√] Removed {} out of {} links from the archive index.{reset}'.format(
        num_removed,
        all_links,
        **ANSI,
    ))
    print(' Index now contains {} links.'.format(to_keep))
|
||||
|
||||
|
||||
def log_shell_welcome_msg():
    """Print the banner for the ArchiveBox shell: available imports, docs link and usage examples."""
    from .cli import list_subcommands

    subcommand_names = "\n ".join(list_subcommands().keys())

    print('{green}# ArchiveBox Imports{reset}'.format(**ANSI))
    print('{green}from archivebox.core.models import Snapshot, User{reset}'.format(**ANSI))
    print('{green}from archivebox import *\n {}{reset}'.format(subcommand_names, **ANSI))
    print()
    print('[i] Welcome to the ArchiveBox Shell!')
    print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
    print()
    print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
    for example in (
        ' print(Snapshot.objects.filter(is_archived=True).count())',
        ' Snapshot.objects.get(url="https://example.com").as_json()',
        ' add("https://example.com/some/new/url")',
    ):
        print(example)
|
||||
|
||||
|
||||
|
||||
### Helpers
|
||||
|
||||
@enforce_types
|
||||
def pretty_path(path: str) -> str:
    """Shorten a path by replacing the current working directory prefix with './'

    e.g. converts paths like <cwd>/output/abc into ./output/abc
    """
    cwd_prefix = os.path.abspath('.') + '/'
    return path.replace(cwd_prefix, './')
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_filesize(num_bytes: Union[int, float]) -> str:
    """Format a byte count with one decimal place in the largest unit (Bytes..TB) that keeps it under 1024."""
    units = ('Bytes', 'KB', 'MB', 'GB')
    for unit in units:
        if -1024.0 < num_bytes < 1024.0:
            return '%3.1f %s' % (num_bytes, unit)
        num_bytes /= 1024.0
    # anything still >= 1024 GB is reported in TB
    return '%3.1f %s' % (num_bytes, 'TB')
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folders(folders: Dict[str, Optional["Link"]],
                      json: bool=False,
                      csv: Optional[str]=None) -> str:
    """Render a folder -> link mapping as JSON, CSV, or one 'folder link' pair per line.

    ``json`` takes precedence over ``csv``; ``csv`` is a comma-separated list of
    column names.
    """
    if json:
        from .index.json import to_json
        return to_json(folders.values(), indent=4, sort_keys=True)

    if csv:
        from .index.csv import links_to_csv
        return links_to_csv(folders.values(), cols=csv.split(','), header=True)

    rows = []
    for folder, link in folders.items():
        rows.append(f'{folder} {link}')
    return '\n'.join(rows)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_config(config: ConfigDict, prefix: str='') -> str:
    """Render config as KEY=value lines, skipping dict and callable values.

    ``prefix`` is inserted after each newline, so the first line is not prefixed.
    """
    separator = f'\n{prefix}'
    entries = []
    for key, val in config.items():
        if isinstance(val, dict) or callable(val):
            continue
        entries.append(f'{key}={val}')
    return separator.join(entries)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folder_status(name: str, folder: Dict) -> str:
    """Render one status line for a data folder or file.

    Args:
        name: label shown for the folder.
        folder: dict with 'enabled', 'is_valid' and 'path' entries.

    Returns:
        A space-joined line: colored symbol, padded name, padded (quoted if it
        contains spaces) path, file count or size, and a colored note.

    The ``folder`` dict is not modified, so calling this repeatedly never
    double-quotes the path.
    """
    if folder['enabled']:
        if folder['is_valid']:
            # num_files needs a fallback here too: without one a valid folder
            # with an empty path hit an unbound local below
            color, symbol, note, num_files = 'green', '√', 'valid', '?'
        else:
            color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
    else:
        color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'

    path = folder['path'] or ''
    if path:
        if os.path.exists(path):
            num_files = (
                f'{len(os.listdir(path))} files'
                if os.path.isdir(path) else
                printable_filesize(os.path.getsize(path))
            )
        else:
            num_files = 'missing'

        if ' ' in path:
            path = f'"{path}"'

    return ' '.join((
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(22),
        path.ljust(76),
        num_files.ljust(14),
        ANSI[color],
        note,
        ANSI['reset'],
    ))
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_dependency_version(name: str, dependency: Dict) -> str:
    """Render one status line for an external dependency.

    Args:
        name: label shown for the dependency.
        dependency: dict with 'enabled', 'is_valid', 'version' and 'path' entries.

    Returns:
        A space-joined line: colored symbol, padded name, padded (quoted if it
        contains spaces) path, parsed version, and a colored note. An enabled
        dependency that is invalid, or whose version string has no digits, is
        shown as invalid with version '?'.

    The ``dependency`` dict is not modified.
    """
    if dependency['enabled']:
        # start empty so the "invalid" fallback below also covers is_valid == False
        version = ''
        if dependency['is_valid']:
            parsed_version_num = re.search(r'[\d\.]+', dependency['version'] or '')
            if parsed_version_num:
                version = f'v{parsed_version_num[0]}'

        if version:
            color, symbol, note = 'green', '√', 'valid'
        else:
            color, symbol, note, version = 'red', 'X', 'invalid', '?'
    else:
        color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'

    path = dependency["path"] or ''
    if ' ' in path:
        path = f'"{path}"'

    return ' '.join((
        ANSI[color],
        symbol,
        ANSI['reset'],
        name.ljust(22),
        path.ljust(76),
        version.ljust(14),
        ANSI[color],
        note,
        ANSI['reset'],
    ))
|
|
@ -1,201 +0,0 @@
|
|||
import sys
|
||||
from datetime import datetime
|
||||
from config import ANSI, REPO_DIR, OUTPUT_DIR
|
||||
|
||||
|
||||
# globals are bad, mmkay
|
||||
# Run-wide counters and stage timestamps.
# NOTE(review): the logging functions in this file write keys such as
# 'parse_start_ts', 'index_start_ts', 'start_ts' and 'end_ts', which do not
# match the '*ing_*_ts' names initialised here -- confirm which set is intended.
_LAST_RUN_STATS = dict(
    skipped=0,
    succeeded=0,
    failed=0,

    parsing_start_ts=0,
    parsing_end_ts=0,

    indexing_start_ts=0,
    indexing_end_ts=0,

    archiving_start_ts=0,
    archiving_end_ts=0,

    links={},
)
|
||||
|
||||
def pretty_path(path):
    """Shorten a path by removing the 'REPO_DIR/' prefix, e.g. <repo>/output/abc -> output/abc"""
    repo_prefix = REPO_DIR + '/'
    return path.replace(repo_prefix, '')
|
||||
|
||||
|
||||
### Parsing Stage
|
||||
|
||||
def log_parsing_started(source_file):
    """Record the parse start time and print which source file is being parsed."""
    started_at = datetime.now()
    _LAST_RUN_STATS['parse_start_ts'] = started_at
    filename = source_file.rpartition('/')[2]
    print('{green}[*] [{}] Parsing new links from output/sources/{}...{reset}'.format(
        started_at.strftime('%Y-%m-%d %H:%M:%S'),
        filename,
        **ANSI,
    ))
|
||||
|
||||
def log_parsing_finished(num_new_links, parser_name):
    """Print how many new links are being added and which parser handled the import."""
    summary = ' > Adding {} new links to index (parsed import as {})'.format(num_new_links, parser_name)
    print(summary)
|
||||
|
||||
|
||||
### Indexing Stage
|
||||
|
||||
def log_indexing_process_started():
    """Record the indexing start time and print a 'saving index files' banner."""
    _LAST_RUN_STATS['index_start_ts'] = started_at = datetime.now()
    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')
    print('{green}[*] [{}] Saving main index files...{reset}'.format(stamp, **ANSI))
|
||||
|
||||
def log_indexing_started(out_dir, out_file):
    """Write the index file path being written to stdout, without a trailing newline."""
    shown_dir = pretty_path(out_dir)
    sys.stdout.write(' > {}/{}'.format(shown_dir, out_file))
|
||||
|
||||
def log_indexing_finished(out_dir, out_file):
    """Record the indexing end time and print the finished path over the current line."""
    _LAST_RUN_STATS['index_end_ts'] = datetime.now()
    shown_dir = pretty_path(out_dir)
    print('\r √ {}/{}'.format(shown_dir, out_file))
|
||||
|
||||
|
||||
### Archiving Stage
|
||||
|
||||
def log_archiving_started(num_links, resume):
    """Record the archiving start time and print a start (or resume) banner."""
    started_at = datetime.now()
    _LAST_RUN_STATS['start_ts'] = started_at
    stamp = started_at.strftime('%Y-%m-%d %H:%M:%S')

    if not resume:
        print('{green}[▶] [{}] Updating content for {} pages in archive...{reset}'.format(
            stamp,
            num_links,
            **ANSI,
        ))
        return

    print('{green}[▶] [{}] Resuming archive updating for {} pages starting from {}...{reset}'.format(
        stamp,
        num_links,
        resume,
        **ANSI,
    ))
|
||||
|
||||
def log_archiving_paused(num_links, idx, timestamp):
    """Record the end time and print where archiving paused plus the command to resume.

    ``idx`` is shown one-based (idx + 1) next to the total link count.
    """
    paused_at = datetime.now()
    _LAST_RUN_STATS['end_ts'] = paused_at

    banner = '\n{lightyellow}[X] [{now}] Downloading paused on link {timestamp} ({idx}/{total}){reset}'.format(
        **ANSI,
        now=paused_at.strftime('%Y-%m-%d %H:%M:%S'),
        idx=idx+1,
        timestamp=timestamp,
        total=num_links,
    )
    shown_output_dir = OUTPUT_DIR.replace(REPO_DIR + '/', '')
    resume_cmd = ' {} {}'.format(pretty_path(sys.argv[0]), timestamp)

    print()
    print(banner)
    print(' To view your archive, open: {}/index.html'.format(shown_output_dir))
    print(' Continue where you left off by running:')
    print(resume_cmd)
|
||||
|
||||
def log_archiving_finished(num_links):
    """Record the end time and print the run summary.

    The duration is shown in minutes when it exceeds 60 seconds, otherwise in
    seconds; the counters come from _LAST_RUN_STATS.
    """
    finished_at = datetime.now()
    _LAST_RUN_STATS['end_ts'] = finished_at
    seconds = finished_at.timestamp() - _LAST_RUN_STATS['start_ts'].timestamp()

    amount, unit = (seconds / 60, 'min') if seconds > 60 else (seconds, 'sec')
    duration = '{0:.2f} {1}'.format(amount, unit)

    print('{}[√] [{}] Update of {} pages complete ({}){}'.format(
        ANSI['green'],
        finished_at.strftime('%Y-%m-%d %H:%M:%S'),
        num_links,
        duration,
        ANSI['reset'],
    ))
    print(' - {} links skipped'.format(_LAST_RUN_STATS['skipped']))
    print(' - {} links updated'.format(_LAST_RUN_STATS['succeeded']))
    print(' - {} links had errors'.format(_LAST_RUN_STATS['failed']))
    print(' To view your archive, open: {}/index.html'.format(OUTPUT_DIR.replace(REPO_DIR + '/', '')))
|
||||
|
||||
|
||||
def log_link_archiving_started(link_dir, link, is_new):
    """Print the three-line header shown before a link is archived.

    Example output:
        [*] [2019-03-22 13:46:45] "Log Structured Merge Trees - ben stopford"
            http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
            > output/archive/1478739709
    """
    if is_new:
        color_name, symbol, dir_marker = 'green', '+', '>'
    else:
        color_name, symbol, dir_marker = 'black', '*', '√'

    header = '\n[{symbol_color}{symbol}{reset}] [{symbol_color}{now}{reset}] "{title}"'
    print(header.format(
        symbol_color=ANSI[color_name],
        symbol=symbol,
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        title=link['title'] or link['url'],  # fall back to the URL when untitled
        **ANSI,
    ))
    print(' {blue}{url}{reset}'.format(url=link['url'], **ANSI))
    print(' {} {}'.format(dir_marker, pretty_path(link_dir)))
|
||||
|
||||
def log_link_archiving_finished(link_dir, link, is_new, stats):
    """Fold one link's per-method stats into the run-wide counters.

    A link counts as failed if any method failed, as skipped if every
    method was skipped, and as succeeded otherwise.
    """
    attempted = sum(stats.values())

    if stats['failed'] > 0:
        outcome = 'failed'
    elif stats['skipped'] == attempted:
        outcome = 'skipped'
    else:
        outcome = 'succeeded'

    _LAST_RUN_STATS[outcome] += 1
|
||||
|
||||
|
||||
def log_archive_method_started(method):
    """Print the name of the archive method that is about to run."""
    line = ' > {}'.format(method)
    print(line)
|
||||
|
||||
def log_archive_method_finished(result):
    """Validate an archive method's result and print details if it failed.

    result must be a dict containing 'cmd', 'pwd', 'output', 'status',
    'start_ts' and 'end_ts'. For a failed result, the error, up to five
    hint lines, and a copy-pasteable command (arguments containing spaces
    are double-quoted) are printed. Other statuses print nothing.
    """
    required_keys = ('cmd', 'pwd', 'output', 'status', 'start_ts', 'end_ts')
    is_valid = isinstance(result, dict) and all(key in result for key in required_keys)
    assert is_valid, 'Archive method did not return a valid result.'

    # Quote arguments containing spaces so the command can be copy-pasted
    cmd_parts = []
    for arg in result['cmd']:
        cmd_parts.append('"{}"'.format(arg) if ' ' in arg else arg)
    quoted_cmd = ' '.join(cmd_parts)

    if result['status'] != 'failed':
        return

    error = result['output']

    # Hints may be a list/tuple or a newline-separated string; keep the first five
    hints = getattr(error, 'hints', None) or ()
    if hints:
        if not isinstance(hints, (list, tuple)):
            hints = hints.split('\n')
        hints = [
            ' {}{}{}'.format(ANSI['lightyellow'], hint.strip(), ANSI['reset'])
            for hint in hints[:5]
            if hint.strip()
        ]

    output_lines = [
        '{}Failed:{} {}{}'.format(
            ANSI['red'],
            error.__class__.__name__.replace('ArchiveError', ''),
            error,
            ANSI['reset'],
        ),
        *hints,
        '{}Run to see full output:{}'.format(ANSI['lightred'], ANSI['reset']),
        ' cd {};'.format(result['pwd']),
        ' {}'.format(quoted_cmd),
    ]

    # Indent every non-empty line before printing
    indented = [' {}'.format(line) for line in output_lines if line]
    print('\n'.join(indented))
|
1057
archivebox/main.py
Normal file
1057
archivebox/main.py
Normal file
File diff suppressed because it is too large
Load diff
30
archivebox/manage.py
Executable file
30
archivebox/manage.py
Executable file
|
@ -0,0 +1,30 @@
|
|||
#!/usr/bin/env python
|
||||
import os
|
||||
import sys
|
||||
|
||||
if __name__ == '__main__':
    # Developers working on archivebox should still prefer the archivebox
    # versions of ./manage.py commands. When that is not possible
    # (e.g. makemigrations), this check can be commented out temporarily.
    if 'makemigrations' not in sys.argv and 'migrate' not in sys.argv:
        usage_lines = (
            "[X] Don't run ./manage.py directly, use the archivebox CLI instead e.g.:",
            ' archivebox manage createsuperuser',
            '',
            ' Hint: Use these archivebox commands instead of the ./manage.py equivalents:',
            ' archivebox init (migrates the databse to latest version)',
            ' archivebox server (runs the Django web server)',
            ' archivebox shell (opens an iPython Django shell with all models imported)',
            ' archivebox manage [cmd] (any other management commands)',
        )
        for usage_line in usage_lines:
            print(usage_line)
        raise SystemExit(2)

    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
    try:
        from django.core.management import execute_from_command_line
    except ImportError as exc:
        message = (
            "Couldn't import Django. Are you sure it's installed and "
            "available on your PYTHONPATH environment variable? Did you "
            "forget to activate a virtual environment?"
        )
        raise ImportError(message) from exc
    execute_from_command_line(sys.argv)
|
3
archivebox/mypy.ini
Normal file
3
archivebox/mypy.ini
Normal file
|
@ -0,0 +1,3 @@
|
|||
[mypy]
|
||||
plugins =
|
||||
mypy_django_plugin.main
|
|
@ -1,315 +0,0 @@
|
|||
"""
|
||||
Everything related to parsing links from input sources.
|
||||
|
||||
For a list of supported services, see the README.md.
|
||||
For examples of supported import formats see tests/.
|
||||
|
||||
Link: {
|
||||
'url': 'https://example.com/example/?abc=123&xyc=345#lmnop',
|
||||
'timestamp': '1544212312.4234',
|
||||
'title': 'Example.com Page Title',
|
||||
'tags': 'abc,def',
|
||||
'sources': [
|
||||
'output/sources/ril_export.html',
|
||||
'output/sources/getpocket.com-1523422111.txt',
|
||||
'output/sources/stdin-234234112312.txt'
|
||||
]
|
||||
}
|
||||
"""
|
||||
|
||||
import re
|
||||
import json
|
||||
|
||||
from datetime import datetime
|
||||
import xml.etree.ElementTree as etree
|
||||
|
||||
from config import TIMEOUT
|
||||
from util import (
|
||||
str_between,
|
||||
URL_REGEX,
|
||||
check_url_parsing_invariants,
|
||||
TimedProgress,
|
||||
)
|
||||
|
||||
|
||||
def parse_links(source_file):
    """Parse a list of URLs with their metadata from an RSS feed,
    bookmarks export, or text file.

    Returns (links, parser_name) for the first parser that yields any
    links, or ([], 'Failed to parse') when none do.
    """
    check_url_parsing_invariants()

    # Tried in order: specialized formats, then general ones, then plain text
    parsers = (
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_rss_export),
        ('Generic JSON', parse_json_export),
        ('Plain Text', parse_plain_text_export),
    )

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in parsers:
            try:
                parsed = list(parser_func(file))
                if not parsed:
                    continue
                timer.end()
                return parsed, parser_name
            except Exception as err:
                # A parser raising just means the format did not match. To see
                # why a certain parser was not used, uncomment this line:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                continue

    timer.end()
    return [], 'Failed to parse'
|
||||
|
||||
|
||||
### Import Parser Functions
|
||||
|
||||
def parse_pocket_html_export(html_file):
    """Parse Pocket-format bookmarks export files (produced by getpocket.com/export/).

    Yields one link dict per matching line, e.g. for:
    <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
    """
    readability_prefix = 'http://www.readability.com/read?url='
    line_re = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)

    html_file.seek(0)
    for line in html_file:
        found = line_re.search(line)
        if not found:
            continue

        raw_url, added, tags, raw_title = found.groups()
        # remove old readability prefixes to get the original url
        url = raw_url.replace(readability_prefix, '')
        time = datetime.fromtimestamp(float(added))
        title = raw_title.replace(' — Readability', '').replace(readability_prefix, '')

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': tags or '',
            'sources': [html_file.name],
        }
|
||||
|
||||
|
||||
def parse_json_export(json_file):
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag).

    Yields one link dict per non-empty entry. Raises Exception when an
    entry has no 'href', 'url' or 'URL' key.
    """

    json_file.seek(0)
    links = json.load(json_file)

    # Parse with %z so the trailing "Z" (UTC) produces a timezone-aware
    # datetime. The previous literal-"Z" format produced a naive datetime
    # that .timestamp() interpreted as local time, shifting every
    # timestamp by the machine's UTC offset.
    json_date = lambda s: datetime.strptime(s, '%Y-%m-%dT%H:%M:%S%z')

    # date-like keys checked (in this order) after 'timestamp' and 'time'
    date_keys = ('created_at', 'created', 'date', 'bookmarked', 'saved')

    for link in links:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not link:
            continue

        # Parse URL
        url = link.get('href') or link.get('url') or link.get('URL')
        if not url:
            raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

        # Parse the timestamp, defaulting to now when no date key is present
        ts_str = str(datetime.now().timestamp())
        if link.get('timestamp'):
            # chrome/ff histories use a very precise timestamp
            ts_str = str(link['timestamp'] / 10000000)
        elif link.get('time'):
            ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
        else:
            for key in date_keys:
                if link.get(key):
                    ts_str = str(json_date(link[key]).timestamp())
                    break

        # Parse the title
        title = None
        if link.get('title'):
            title = link['title'].strip() or None
        elif link.get('description'):
            title = link['description'].replace(' — Readability', '').strip() or None
        elif link.get('name'):
            title = link['name'].strip() or None

        yield {
            'url': url,
            'timestamp': ts_str,
            'title': title,
            'tags': link.get('tags') or '',
            'sources': [json_file.name],
        }
|
||||
|
||||
|
||||
def parse_rss_export(rss_file):
    """Parse RSS XML-format files into links.

    Works on the raw text rather than an XML parser: each <item> is split
    into rows and the <link>, <pubDate> and <title> rows are read. A
    missing row raises IndexError.
    """
    rss_file.seek(0)
    chunks = rss_file.read().split('<item>')

    # example item:
    # <item>
    # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
    # <category>Unread</category>
    # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
    # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
    # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
    # </item>
    for chunk in chunks[1:]:  # everything before the first <item> is header
        body = chunk.split('</item>', 1)[0]
        body = body.split('<item>', 1)[-1].strip()
        rows = body.split('\n')

        def get_row(key):
            opening = '<{}>'.format(key)
            matching = [row for row in rows if row.strip().startswith(opening)]
            return matching[0]

        url = str_between(get_row('link'), '<link>', '</link>')
        published = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip() or None

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title,
            'tags': '',
            'sources': [rss_file.name],
        }
|
||||
|
||||
|
||||
def parse_shaarli_rss_export(rss_file):
    """Parse Shaarli-specific RSS XML-format files into links.

    Each <entry> is split into rows and the <title>, <link href="..." />
    and <published> rows are read. A missing row raises IndexError.
    """
    rss_file.seek(0)
    chunks = rss_file.read().split('<entry>')

    # example entry:
    # <entry>
    # <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
    # <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
    # <id>https://demo.shaarli.org/?cEV4vw</id>
    # <published>2019-01-30T06:06:01+00:00</published>
    # <updated>2019-01-30T06:06:01+00:00</updated>
    # <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
    # </entry>
    for chunk in chunks[1:]:
        body = chunk.split('</entry>', 1)[0].strip()
        rows = body.split('\n')

        def get_row(key):
            opening = '<{}'.format(key)
            matching = [row.strip() for row in rows if row.strip().startswith(opening)]
            return matching[0]

        title = str_between(get_row('title'), '<title>', '</title>').strip()
        url = str_between(get_row('link'), '<link href="', '" />')
        published = str_between(get_row('published'), '<published>', '</published>')
        time = datetime.strptime(published, "%Y-%m-%dT%H:%M:%S%z")

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': '',
            'sources': [rss_file.name],
        }
|
||||
|
||||
|
||||
def parse_netscape_html_export(html_file):
    """Parse netscape-format bookmarks export files (produced by all browsers).

    Example line:
    <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
    """
    bookmark_re = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)

    html_file.seek(0)
    for line in html_file:
        found = bookmark_re.search(line)
        if found is None:
            continue

        url, added, raw_title = found.group(1), found.group(2), found.group(3)
        time = datetime.fromtimestamp(float(added))

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': raw_title.strip() or None,
            'tags': '',
            'sources': [html_file.name],
        }
|
||||
|
||||
|
||||
def parse_pinboard_rss_export(rss_file):
    """Parse Pinboard RSS feed files into links.

    Yields one link dict per RSS 1.0 <item>. A missing <link> element
    raises AttributeError; missing title, subject or date elements are
    tolerated (title None, tags '', timestamp of now).
    """

    rss_file.seek(0)
    root = etree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        def text_of(tag):
            # Compare against None explicitly: an Element without child
            # elements is falsy, so `if item.find(tag)` wrongly treats
            # ordinary leaf elements such as <dc:subject> as missing.
            node = item.find(tag)
            return node.text if node is not None else None

        url = item.find("{http://purl.org/rss/1.0/}link").text
        tags = text_of("{http://purl.org/dc/elements/1.1/}subject")
        title = (text_of("{http://purl.org/rss/1.0/}title") or '').strip() or None
        ts_str = text_of("{http://purl.org/dc/elements/1.1/}date")

        # Pinboard includes a colon in its timezone offsets, which older
        # Pythons can't parse with %z. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3]+ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': tags or '',
            'sources': [rss_file.name],
        }
|
||||
|
||||
|
||||
def parse_medium_rss_export(rss_file):
    """Parse Medium RSS feed files into links.

    Reads <link>, <title> and <pubDate> from every <item> under <channel>.
    A missing element raises AttributeError.
    """
    rss_file.seek(0)
    channel = etree.parse(rss_file).getroot().find("channel")

    for entry in channel.findall("item"):
        url = entry.find("link").text
        title = entry.find("title").text.strip()
        published = entry.find("pubDate").text
        time = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S %Z")

        yield {
            'url': url,
            'timestamp': str(time.timestamp()),
            'title': title or None,
            'tags': '',
            'sources': [rss_file.name],
        }
|
||||
|
||||
|
||||
def parse_plain_text_export(text_file):
    """Parse raw links from each line in a text file.

    Every URL_REGEX match on a non-blank line becomes a link stamped with
    the current time.
    """
    text_file.seek(0)
    for line in text_file.readlines():
        if not line.strip():
            continue

        for url in re.findall(URL_REGEX, line):
            yield {
                'url': url,
                'timestamp': str(datetime.now().timestamp()),
                'title': None,
                'tags': '',
                'sources': [text_file.name],
            }
|
159
archivebox/parsers/__init__.py
Normal file
159
archivebox/parsers/__init__.py
Normal file
|
@ -0,0 +1,159 @@
|
|||
"""
|
||||
Everything related to parsing links from input sources.
|
||||
|
||||
For a list of supported services, see the README.md.
|
||||
For examples of supported import formats see tests/.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.parsers'
|
||||
|
||||
import re
|
||||
import os
|
||||
|
||||
from typing import Tuple, List
|
||||
from datetime import datetime
|
||||
|
||||
from ..system import atomic_write
|
||||
from ..config import (
|
||||
ANSI,
|
||||
OUTPUT_DIR,
|
||||
SOURCES_DIR_NAME,
|
||||
TIMEOUT,
|
||||
)
|
||||
from ..util import (
|
||||
basename,
|
||||
download_url,
|
||||
enforce_types,
|
||||
URL_REGEX,
|
||||
)
|
||||
from ..index.schema import Link
|
||||
from ..logging_util import TimedProgress, log_source_saved
|
||||
from .pocket_html import parse_pocket_html_export
|
||||
from .pinboard_rss import parse_pinboard_rss_export
|
||||
from .shaarli_rss import parse_shaarli_rss_export
|
||||
from .medium_rss import parse_medium_rss_export
|
||||
from .netscape_html import parse_netscape_html_export
|
||||
from .generic_rss import parse_generic_rss_export
|
||||
from .generic_json import parse_generic_json_export
|
||||
from .generic_txt import parse_generic_txt_export
|
||||
|
||||
|
||||
@enforce_types
def parse_links(source_file: str) -> Tuple[List[Link], str]:
    """Parse a list of URLs with their metadata from an RSS feed,
    bookmarks export, or text file.

    Returns (links, parser_name) for the first parser that yields any
    links, or ([], 'Failed to parse') when none do.
    """
    check_url_parsing_invariants()

    # Tried in order: specialized formats, then general ones, then plain text
    parsers = (
        ('Pocket HTML', parse_pocket_html_export),
        ('Pinboard RSS', parse_pinboard_rss_export),
        ('Shaarli RSS', parse_shaarli_rss_export),
        ('Medium RSS', parse_medium_rss_export),
        ('Netscape HTML', parse_netscape_html_export),
        ('Generic RSS', parse_generic_rss_export),
        ('Generic JSON', parse_generic_json_export),
        ('Plain Text', parse_generic_txt_export),
    )

    timer = TimedProgress(TIMEOUT * 4)
    with open(source_file, 'r', encoding='utf-8') as file:
        for parser_name, parser_func in parsers:
            try:
                parsed = list(parser_func(file))
                if not parsed:
                    continue
                timer.end()
                return parsed, parser_name
            except Exception as err:   # noqa
                # A parser raising just means the format did not match. To see
                # why a certain parser was not used, uncomment these lines:
                # print('[!] Parser {} failed: {} {}'.format(parser_name, err.__class__.__name__, err))
                # raise
                continue

    timer.end()
    return [], 'Failed to parse'
|
||||
|
||||
|
||||
@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: str=OUTPUT_DIR) -> str:
    """Write raw_text to a new file in the sources folder and return its path.

    filename may contain a '{ts}' placeholder, which is replaced with the
    current unix timestamp (whole seconds). The file is written under
    <out_dir>/<SOURCES_DIR_NAME>/.
    """
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    # honor the caller-supplied out_dir (previously ignored in favor of OUTPUT_DIR)
    source_path = os.path.join(out_dir, SOURCES_DIR_NAME, filename.format(ts=ts))
    atomic_write(source_path, raw_text)
    log_source_saved(source_file=source_path)
    return source_path
|
||||
|
||||
|
||||
@enforce_types
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: str=OUTPUT_DIR) -> str:
    """Copy a URL's or local file's content into the sources folder.

    path:     an http://, https:// or ftp:// URL to download, or a local file path.
    timeout:  download timeout, passed to download_url and the progress timer.
    filename: output name template; '{ts}' and '{basename}' are substituted.
    out_dir:  archive root; the file is written under <out_dir>/<SOURCES_DIR_NAME>/.

    Returns the path of the saved source file. Exits with SystemExit(1)
    when a download fails.
    """
    ts = str(datetime.now().timestamp()).split('.', 1)[0]
    # honor the caller-supplied out_dir (previously ignored in favor of OUTPUT_DIR)
    source_path = os.path.join(out_dir, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))

    if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
        # Source is a URL that needs to be downloaded
        print('{}[*] [{}] Downloading {}{}'.format(
            ANSI['green'],
            datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
            path,
            ANSI['reset'],
        ))
        timer = TimedProgress(timeout, prefix=' ')
        try:
            raw_source_text = download_url(path, timeout=timeout)
            timer.end()
        except Exception as e:
            timer.end()
            print('{}[!] Failed to download {}{}\n'.format(
                ANSI['red'],
                path,
                ANSI['reset'],
            ))
            print(' ', e)
            raise SystemExit(1)

    else:
        # Source is a path to a local file on the filesystem
        with open(path, 'r') as f:
            raw_source_text = f.read()

    atomic_write(source_path, raw_source_text)

    log_source_saved(source_file=source_path)

    return source_path
|
||||
|
||||
|
||||
def check_url_parsing_invariants() -> None:
|
||||
"""Check that plain text regex URL parsing works as expected"""
|
||||
|
||||
# this is last-line-of-defense to make sure the URL_REGEX isn't
|
||||
# misbehaving, as the consequences could be disastrous and lead to many
|
||||
# incorrect/badly parsed links being added to the archive
|
||||
|
||||
test_urls = '''
|
||||
https://example1.com/what/is/happening.html?what=1#how-about-this=1
|
||||
https://example2.com/what/is/happening/?what=1#how-about-this=1
|
||||
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
|
||||
https://example4.com/what/is/happening.html
|
||||
https://example5.com/
|
||||
https://example6.com
|
||||
|
||||
<test>http://example7.com</test>
|
||||
[https://example8.com/what/is/this.php?what=1]
|
||||
[and http://example9.com?what=1&other=3#and-thing=2]
|
||||
<what>https://example10.com#and-thing=2 "</about>
|
||||
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
|
||||
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
|
||||
example13.bada
|
||||
and example14.badb
|
||||
<or>htt://example15.badc</that>
|
||||
'''
|
||||
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
|
||||
assert len(re.findall(URL_REGEX, test_urls)) == 12
|
||||
|
65
archivebox/parsers/generic_json.py
Normal file
65
archivebox/parsers/generic_json.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
import json
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_generic_json_export(json_file: IO[str]) -> Iterable[Link]:
    """Parse JSON-format bookmarks export files (produced by pinboard.in/export/, or wallabag).

    Yields one Link per non-empty entry. Raises Exception when an entry
    has no 'href', 'url' or 'URL' key.
    """
    json_file.seek(0)
    entries = json.load(json_file)

    def json_date(text):
        return datetime.strptime(text, '%Y-%m-%dT%H:%M:%S%z')

    # date-like keys checked (in this order) after 'timestamp' and 'time'
    date_keys = ('created_at', 'created', 'date', 'bookmarked', 'saved')

    for link in entries:
        # example line
        # {"href":"http:\/\/www.reddit.com\/r\/example","description":"title here","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android"}]
        if not link:
            continue

        # Parse URL
        url = link.get('href') or link.get('url') or link.get('URL')
        if not url:
            raise Exception('JSON must contain URL in each entry [{"url": "http://...", ...}, ...]')

        # Parse the timestamp, defaulting to now when no date key is present
        ts_str = str(datetime.now().timestamp())
        if link.get('timestamp'):
            # chrome/ff histories use a very precise timestamp
            ts_str = str(link['timestamp'] / 10000000)
        elif link.get('time'):
            ts_str = str(json_date(link['time'].split(',', 1)[0]).timestamp())
        else:
            for key in date_keys:
                if link.get(key):
                    ts_str = str(json_date(link[key]).timestamp())
                    break

        # Parse the title from the first populated candidate key
        title = None
        if link.get('title'):
            title = link['title'].strip()
        elif link.get('description'):
            title = link['description'].replace(' — Readability', '').strip()
        elif link.get('name'):
            title = link['name'].strip()

        yield Link(
            url=htmldecode(url),
            timestamp=ts_str,
            title=htmldecode(title) or None,
            tags=htmldecode(link.get('tags')) or '',
            sources=[json_file.name],
        )
|
49
archivebox/parsers/generic_rss.py
Normal file
49
archivebox/parsers/generic_rss.py
Normal file
|
@ -0,0 +1,49 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
str_between,
|
||||
)
|
||||
|
||||
@enforce_types
def parse_generic_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse RSS XML-format files into links.

    Works on the raw text rather than an XML parser: each <item> is split
    into rows and the <link>, <pubDate> and <title> rows are read. A
    missing row raises IndexError.
    """
    rss_file.seek(0)
    chunks = rss_file.read().split('<item>')

    # example item:
    # <item>
    # <title><![CDATA[How JavaScript works: inside the V8 engine]]></title>
    # <category>Unread</category>
    # <link>https://blog.sessionstack.com/how-javascript-works-inside</link>
    # <guid>https://blog.sessionstack.com/how-javascript-works-inside</guid>
    # <pubDate>Mon, 21 Aug 2017 14:21:58 -0500</pubDate>
    # </item>
    for chunk in chunks[1:]:  # everything before the first <item> is header
        body = chunk.split('</item>', 1)[0]
        body = body.split('<item>', 1)[-1].strip()
        rows = body.split('\n')

        def get_row(key):
            opening = '<{}>'.format(key)
            matching = [row for row in rows if row.strip().startswith(opening)]
            return matching[0]

        url = str_between(get_row('link'), '<link>', '</link>')
        published = str_between(get_row('pubDate'), '<pubDate>', '</pubDate>')
        time = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S %z")
        title = str_between(get_row('title'), '<![CDATA[', ']]').strip()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
|
57
archivebox/parsers/generic_txt.py
Normal file
57
archivebox/parsers/generic_txt.py
Normal file
|
@ -0,0 +1,57 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
__description__ = 'Plain Text'
|
||||
|
||||
import re
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
URL_REGEX
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_generic_txt_export(text_file: IO[str]) -> Iterable[Link]:
    """Parse raw links from each line in a text file.

    For every non-blank line this yields:
      - a Link for the line itself when it is a path that exists locally,
      - a Link for every URL_REGEX match on the line, and
      - a Link for every URL nested inside one of those matches
        (e.g. the original URL inside a web.archive.org link).
    """

    text_file.seek(0)
    for line in text_file.readlines():
        stripped = line.strip()
        if not stripped:
            continue

        # if the line is a local file path that resolves, then we can archive it.
        # Use the stripped line: the raw line still carries its trailing
        # newline, which never matches a real path and would end up in the url.
        try:
            is_local_path = Path(stripped).exists()
        except (OSError, ValueError):
            # e.g. over-long lines or embedded NUL bytes are not valid paths
            is_local_path = False

        if is_local_path:
            yield Link(
                url=stripped,
                timestamp=str(datetime.now().timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )

        # otherwise look for anything that looks like a URL in the line
        for url in re.findall(URL_REGEX, line):
            yield Link(
                url=htmldecode(url),
                timestamp=str(datetime.now().timestamp()),
                title=None,
                tags=None,
                sources=[text_file.name],
            )

            # look inside the URL for any sub-urls, e.g. for archive.org links
            # https://web.archive.org/web/20200531203453/https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
            # -> https://www.reddit.com/r/socialism/comments/gu24ke/nypd_officers_claim_they_are_protecting_the_rule/fsfq0sw/
            # Search url[1:] (not line[1:]) so the scheme of the outer URL no
            # longer matches and only nested URLs are found; scanning the
            # whole line re-emitted every URL on lines not starting with one.
            for sub_url in re.findall(URL_REGEX, url[1:]):
                yield Link(
                    url=htmldecode(sub_url),
                    timestamp=str(datetime.now().timestamp()),
                    title=None,
                    tags=None,
                    sources=[text_file.name],
                )
|
35
archivebox/parsers/medium_rss.py
Normal file
35
archivebox/parsers/medium_rss.py
Normal file
|
@ -0,0 +1,35 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_medium_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Medium RSS feed files into links.

    Reads <link>, <title> and <pubDate> from every <item> under <channel>.
    A missing element raises AttributeError.
    """
    rss_file.seek(0)
    channel = ElementTree.parse(rss_file).getroot().find("channel")

    for entry in channel.findall("item"):  # type: ignore
        url = entry.find("link").text  # type: ignore
        title = entry.find("title").text.strip()  # type: ignore
        published = entry.find("pubDate").text  # type: ignore
        time = datetime.strptime(published, "%a, %d %b %Y %H:%M:%S %Z")  # type: ignore

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
|
39
archivebox/parsers/netscape_html.py
Normal file
39
archivebox/parsers/netscape_html.py
Normal file
|
@ -0,0 +1,39 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
import re
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_netscape_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Parse netscape-format bookmarks export files (produced by all browsers).

    Example line:
    <DT><A HREF="https://example.com/?q=1+2" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://example.com/favicon.ico" ICON="data:image/png;base64,...">example bookmark title</A>
    """
    bookmark_re = re.compile("<a href=\"(.+?)\" add_date=\"(\\d+)\"[^>]*>(.+)</a>", re.UNICODE | re.IGNORECASE)

    html_file.seek(0)
    for line in html_file:
        found = bookmark_re.search(line)
        if found is None:
            continue

        url, added, raw_title = found.group(1), found.group(2), found.group(3)
        time = datetime.fromtimestamp(float(added))
        title = raw_title.strip()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[html_file.name],
        )
|
||||
|
47
archivebox/parsers/pinboard_rss.py
Normal file
47
archivebox/parsers/pinboard_rss.py
Normal file
|
@ -0,0 +1,47 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from xml.etree import ElementTree
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_pinboard_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Parse Pinboard RSS feed files into links.

    Yields one Link per RSS 1.0 <item>, reading its <link>, <title>,
    <dc:subject> (tags) and <dc:date>. Items without a date are stamped
    with the current time.
    """

    rss_file.seek(0)
    root = ElementTree.parse(rss_file).getroot()
    items = root.findall("{http://purl.org/rss/1.0/}item")
    for item in items:
        def find(tag):
            # Compare against None explicitly: an Element without child
            # elements is falsy, so `if item.find(tag)` treated every
            # ordinary leaf element (link, title, date, ...) as missing.
            node = item.find(tag)
            if node is None or node.text is None:
                return None
            return node.text.strip()

        url = find("{http://purl.org/rss/1.0/}link")
        tags = find("{http://purl.org/dc/elements/1.1/}subject")
        title = find("{http://purl.org/rss/1.0/}title")
        ts_str = find("{http://purl.org/dc/elements/1.1/}date")

        # Pinboard includes a colon in its date stamp timezone offsets, which
        # older Pythons can't parse with %z. Remove it:
        if ts_str and ts_str[-3:-2] == ":":
            ts_str = ts_str[:-3]+ts_str[-2:]

        if ts_str:
            time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
        else:
            time = datetime.now()

        yield Link(
            url=htmldecode(url),
            timestamp=str(time.timestamp()),
            title=htmldecode(title) or None,
            tags=htmldecode(tags) or None,
            sources=[rss_file.name],
        )
|
38
archivebox/parsers/pocket_html.py
Normal file
38
archivebox/parsers/pocket_html.py
Normal file
|
@ -0,0 +1,38 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
import re
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_pocket_html_export(html_file: IO[str]) -> Iterable[Link]:
    """Yield one Link per list item of a Pocket bookmarks export (getpocket.com/export/).

    The file is rewound and scanned line by line. Legacy Readability proxy
    prefixes are stripped from both the url and the title before the Link is
    built.

    Example of a matching line:
        <li><a href="http://example.com/" time_added="1478739709" tags="tag1,tag2">example title</a></li>
    """

    html_file.seek(0)
    item_re = re.compile("^\\s*<li><a href=\"(.+)\" time_added=\"(\\d+)\" tags=\"(.*)\">(.+)</a></li>", re.UNICODE)

    for raw_line in html_file:
        found = item_re.search(raw_line)
        if found is None:
            continue

        # groups: (href, time_added as epoch seconds, tags attribute, anchor text)
        href, added_epoch, tag_attr, label = found.groups()

        # remove old readability prefixes to get original url
        original_url = href.replace('http://www.readability.com/read?url=', '')
        saved_at = datetime.fromtimestamp(float(added_epoch))
        clean_title = label.replace(' — Readability', '')
        clean_title = clean_title.replace('http://www.readability.com/read?url=', '')

        yield Link(
            url=htmldecode(original_url),
            timestamp=str(saved_at.timestamp()),
            title=htmldecode(clean_title) or None,
            tags=tag_attr or '',
            sources=[html_file.name],
        )
|
50
archivebox/parsers/shaarli_rss.py
Normal file
50
archivebox/parsers/shaarli_rss.py
Normal file
|
@ -0,0 +1,50 @@
|
|||
__package__ = 'archivebox.parsers'
|
||||
|
||||
|
||||
from typing import IO, Iterable
|
||||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
str_between,
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
def parse_shaarli_rss_export(rss_file: IO[str]) -> Iterable[Link]:
    """Yield one Link per <entry> of a Shaarli-specific RSS/Atom XML export.

    The feed is not parsed as XML: the text is split on ``<entry>`` and each
    entry is read row by row, so every tag of interest is expected to sit on
    its own line. An entry lacking a title, link or published row raises
    IndexError.
    """

    rss_file.seek(0)
    # whatever precedes the first <entry> is the feed header, so drop it
    chunks = rss_file.read().split('<entry>')[1:]

    for chunk in chunks:
        # example entry:
        # <entry>
        #   <title>Aktuelle Trojaner-Welle: Emotet lauert in gefälschten Rechnungsmails | heise online</title>
        #   <link href="https://www.heise.de/security/meldung/Aktuelle-Trojaner-Welle-Emotet-lauert-in-gefaelschten-Rechnungsmails-4291268.html" />
        #   <id>https://demo.shaarli.org/?cEV4vw</id>
        #   <published>2019-01-30T06:06:01+00:00</published>
        #   <updated>2019-01-30T06:06:01+00:00</updated>
        #   <content type="html" xml:lang="en"><![CDATA[<div class="markdown"><p>&#8212; <a href="https://demo.shaarli.org/?cEV4vw">Permalink</a></p></div>]]></content>
        # </entry>

        entry_body = chunk.split('</entry>', 1)[0].strip()
        stripped_rows = [row.strip() for row in entry_body.split('\n')]

        def first_row(tag_name):
            """Return the first row that opens with the given tag name."""
            opener = '<{}'.format(tag_name)
            return [row for row in stripped_rows if row.startswith(opener)][0]

        title = str_between(first_row('title'), '<title>', '</title>').strip()
        url = str_between(first_row('link'), '<link href="', '" />')
        published = str_between(first_row('published'), '<published>', '</published>')
        published_at = datetime.strptime(published, "%Y-%m-%dT%H:%M:%S%z")

        yield Link(
            url=htmldecode(url),
            timestamp=str(published_at.timestamp()),
            title=htmldecode(title) or None,
            tags=None,
            sources=[rss_file.name],
        )
|
|
@ -1,86 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
|
||||
import re
|
||||
from argparse import ArgumentParser
|
||||
from os.path import exists, join
|
||||
from shutil import rmtree
|
||||
from typing import List
|
||||
|
||||
from config import ARCHIVE_DIR, OUTPUT_DIR
|
||||
from index import (parse_json_links_index, write_html_links_index,
|
||||
write_json_links_index)
|
||||
|
||||
|
||||
def cleanup_index(regexes: List[str], proceed: bool, delete: bool) -> None:
    """Remove every link whose url matches one of the regexes from the index.

    Args:
        regexes: regular expression sources searched against each link's url;
            a link is purged on the first expression that matches.
        proceed: when False, the user is asked to confirm interactively.
        delete: when True, the archive folder of each purged link is removed
            from ARCHIVE_DIR as well.

    Exits the process (via ``exit``) when the index is missing, when nothing
    matches, or when the user declines the confirmation prompt.
    """
    index_path = join(OUTPUT_DIR, 'index.json')
    if not exists(index_path):
        exit('index.json is missing; nothing to do')

    patterns = [re.compile(expr) for expr in regexes]
    links = parse_json_links_index(OUTPUT_DIR)
    purged = []
    kept = []

    for entry in links:
        target = entry['url']
        # first pattern that matches wins; None means the link is kept
        hit = next((pat for pat in patterns if pat.search(target)), None)
        if hit is None:
            kept.append(entry)
        else:
            purged.append((entry, hit))

    if not purged:
        exit('Search did not match any entries.')

    print('Filtered out {}/{} urls:'.format(len(purged), len(links)))

    for entry, pat in purged:
        print(' {url} via {regex}'.format(url=entry['url'], regex=pat.pattern))

    confirmed = proceed
    if not confirmed:
        reply = input('Remove {} entries from index? [y/n] '.format(len(purged)))
        confirmed = reply.strip().lower() in ('y', 'yes')

    if not confirmed:
        exit('Aborted')

    write_json_links_index(OUTPUT_DIR, kept)
    write_html_links_index(OUTPUT_DIR, kept)

    if not delete:
        return

    for entry, _ in purged:
        snapshot_dir = join(ARCHIVE_DIR, entry['timestamp'])
        if exists(snapshot_dir):
            rmtree(snapshot_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Command-line entry point: collect the purge regexes and flags, then
    # hand off to cleanup_index (or show usage when no regex was given).
    parser = ArgumentParser('Index purging tool')
    parser.add_argument('--regex', '-r', action='append',
                        help='Regular expression matching URLs to purge')
    parser.add_argument('--delete', '-d', action='store_true', default=False,
                        help='Delete webpage files from archive')
    parser.add_argument('--yes', '-y', action='store_true', default=False,
                        help='Do not prompt for confirmation')

    options = parser.parse_args()
    if not options.regex:
        parser.print_help()
    else:
        cleanup_index(options.regex, proceed=options.yes, delete=options.delete)
|
116
archivebox/system.py
Normal file
116
archivebox/system.py
Normal file
|
@ -0,0 +1,116 @@
|
|||
__package__ = 'archivebox'
|
||||
|
||||
|
||||
import os
|
||||
import shutil
|
||||
|
||||
from json import dump
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Set, Tuple
|
||||
from subprocess import run as subprocess_run
|
||||
|
||||
from crontab import CronTab
|
||||
from atomicwrites import atomic_write as lib_atomic_write
|
||||
|
||||
from .util import enforce_types, ExtendedEncoder
|
||||
from .config import OUTPUT_PERMISSIONS
|
||||
|
||||
|
||||
def run(*args, input=None, capture_output=True, text=False, **kwargs):
    """Wrapper around subprocess.run that captures output by default.

    Validates the argument combination up front, then delegates to
    subprocess.run with the same arguments.

    NOTE(review): the previous docstring described this as a patch that fixes
    blocking io making ``timeout=`` ineffective; the body visible here only
    checks arguments and forwards the call -- confirm whether more was intended.

    Raises:
        ValueError: if ``input`` is given together with ``stdin``, or if
            ``capture_output`` is set together with ``stdout``/``stderr``.
    """

    if input is not None and 'stdin' in kwargs:
        raise ValueError('stdin and input arguments may not both be used.')

    redirects_given = ('stdout' in kwargs) or ('stderr' in kwargs)
    if capture_output and redirects_given:
        raise ValueError('stdout and stderr arguments may not be used '
                         'with capture_output.')

    return subprocess_run(*args, input=input, capture_output=capture_output, text=text, **kwargs)
|
||||
|
||||
|
||||
@enforce_types
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
    """Write contents to path through the atomicwrites library, then chmod the result.

    Args:
        path: destination file.
        contents: a dict is serialized as indented, key-sorted JSON using
            ExtendedEncoder; str and bytes are written as-is.
        overwrite: forwarded to the underlying atomic writer.
    """

    # bytes need a binary file handle, everything else is written as text
    is_binary = isinstance(contents, bytes)
    file_mode = 'wb+' if is_binary else 'w'

    with lib_atomic_write(path, mode=file_mode, overwrite=overwrite) as handle:
        if isinstance(contents, dict):
            dump(contents, handle, indent=4, sort_keys=True, cls=ExtendedEncoder)
        elif isinstance(contents, (bytes, str)):
            handle.write(contents)

    # OUTPUT_PERMISSIONS is an octal string, so parse it in base 8
    os.chmod(path, int(OUTPUT_PERMISSIONS, base=8))
|
||||
|
||||
@enforce_types
def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) -> None:
    """chmod -R <permissions> <cwd>/<path>

    Args:
        path: file or directory to chmod, relative to cwd.
        cwd: base directory that path is resolved against.
        permissions: mode to apply, as an octal string.

    Raises:
        Exception: if <cwd>/<path> does not exist.

    For a directory, every entry below it is chmod-ed; the directory itself
    is left untouched because the glob only yields its descendants.
    """

    root = Path(cwd) / path
    if not root.exists():
        raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))

    # honour the caller-supplied permissions rather than the global default
    mode = int(permissions, base=8)

    if not root.is_dir():
        os.chmod(root, mode)
    else:
        # glob from root (not the bare path) so that cwd is respected
        for subpath in root.glob('**/*'):
            os.chmod(subpath, mode)
|
||||
|
||||
|
||||
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
    """Copy a file or directory to to_path, replacing whatever is already there.

    Directories are replaced wholesale (the destination tree is removed
    first); single files are read fully into memory and written through
    atomic_write.
    """
    if not os.path.isdir(from_path):
        with open(from_path, 'rb') as source_file:
            payload = source_file.read()
        atomic_write(to_path, payload)
        return

    # ignore_errors: the destination may simply not exist yet
    shutil.rmtree(to_path, ignore_errors=True)
    shutil.copytree(from_path, to_path)
|
||||
|
||||
|
||||
@enforce_types
def get_dir_size(path: str, recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
    """Measure a directory and return (num_bytes, num_dirs, num_files).

    Args:
        path: directory to scan.
        recursive: when False, subdirectories are skipped and not counted.
        pattern: when given, only direct entries whose path contains this
            substring are considered; the filter is not passed down into
            subdirectories.

    Symlinks are not followed, for both the directory check and the size.
    """
    total_bytes = 0
    total_dirs = 0
    total_files = 0

    for entry in os.scandir(path):
        if pattern is not None and pattern not in entry.path:
            continue

        if not entry.is_dir(follow_symlinks=False):
            total_bytes += entry.stat(follow_symlinks=False).st_size
            total_files += 1
            continue

        if recursive:
            # count the subdirectory itself plus everything beneath it
            sub_bytes, sub_dirs, sub_files = get_dir_size(entry.path)
            total_bytes += sub_bytes
            total_dirs += 1 + sub_dirs
            total_files += sub_files

    return total_bytes, total_dirs, total_files
|
||||
|
||||
|
||||
CRON_COMMENT = 'archivebox_schedule'
|
||||
|
||||
|
||||
@enforce_types
def dedupe_cron_jobs(cron: CronTab) -> CronTab:
    """Collapse jobs that share the same schedule and command into one job each.

    Every existing job is removed from the crontab, and one new enabled job
    is created per unique (schedule, command) pair, tagged with CRON_COMMENT.
    The same CronTab object is returned.
    """
    unique_jobs: Set[Tuple[str, str]] = set()

    # snapshot the jobs first, since the crontab is mutated while looping
    for existing in list(cron):
        unique_jobs.add((str(existing.slices), existing.command))
        cron.remove(existing)

    for slices, cmd in unique_jobs:
        fresh = cron.new(command=cmd, comment=CRON_COMMENT)
        fresh.setall(slices)
        fresh.enable()

    return cron
|
|
@ -1,348 +0,0 @@
|
|||
<html>
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<title>$title</title>
|
||||
<style>
|
||||
html, body {
|
||||
width: 100%;
|
||||
height: 100%;
|
||||
}
|
||||
body {
|
||||
background-color: #ddd;
|
||||
}
|
||||
header {
|
||||
width: 100%;
|
||||
height: 90px;
|
||||
background-color: #aa1e55;
|
||||
margin: 0px;
|
||||
text-align: center;
|
||||
color: white;
|
||||
}
|
||||
header h1 {
|
||||
padding-top: 5px;
|
||||
padding-bottom: 5px;
|
||||
margin: 0px;
|
||||
font-weight: 200;
|
||||
font-family: "Gill Sans", Helvetica, sans-serif;
|
||||
font-size: calc(16px + 1vw);
|
||||
}
|
||||
.collapse-icon {
|
||||
float: right;
|
||||
color: black;
|
||||
width: 126px;
|
||||
font-size: 0.8em;
|
||||
margin-top: 20px;
|
||||
margin-right: 0px;
|
||||
margin-left: -35px;
|
||||
}
|
||||
.nav-icon img {
|
||||
float: left;
|
||||
display: block;
|
||||
margin-right: 13px;
|
||||
color: black;
|
||||
height: 53px;
|
||||
margin-top: 12px;
|
||||
margin-left: 10px;
|
||||
}
|
||||
.nav-icon img:hover {
|
||||
opacity: 0.5;
|
||||
}
|
||||
.title-url {
|
||||
color: black;
|
||||
display: block;
|
||||
width: 75%;
|
||||
white-space: nowrap;
|
||||
overflow: hidden;
|
||||
margin: auto;
|
||||
}
|
||||
.archive-page-header {
|
||||
margin-top: 5px;
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
.archive-page-header .alert {
|
||||
margin-bottom: 0px;
|
||||
}
|
||||
h1 small {
|
||||
opacity: 0.4;
|
||||
font-size: 0.6em;
|
||||
}
|
||||
h1 small:hover {
|
||||
opacity: 0.8;
|
||||
}
|
||||
.card {
|
||||
overflow: hidden;
|
||||
box-shadow: 2px 3px 14px 0px rgba(0,0,0,0.02);
|
||||
}
|
||||
.card h4 {
|
||||
font-size: 1.4vw;
|
||||
}
|
||||
.card-body {
|
||||
font-size: 1vw;
|
||||
padding-top: 1.2vw;
|
||||
padding-left: 1vw;
|
||||
padding-right: 1vw;
|
||||
padding-bottom: 1vw;
|
||||
line-height: 1.1;
|
||||
word-wrap: break-word;
|
||||
max-height: 102px;
|
||||
overflow: hidden;
|
||||
}
|
||||
.card-img-top {
|
||||
border: 0px;
|
||||
padding: 0px;
|
||||
margin: 0px;
|
||||
overflow: hidden;
|
||||
opacity: 0.8;
|
||||
border-top: 1px solid gray;
|
||||
border-radius: 3px;
|
||||
border-bottom: 1px solid #ddd;
|
||||
height: 430px;
|
||||
width: 400%;
|
||||
margin-bottom: -330px;
|
||||
|
||||
transform: scale(0.25);
|
||||
transform-origin: 0 0;
|
||||
}
|
||||
.full-page-iframe {
|
||||
border-top: 1px solid #ddd;
|
||||
width: 100%;
|
||||
height: 69vh;
|
||||
margin: 0px;
|
||||
border: 0px;
|
||||
border-top: 3px solid #aa1e55;
|
||||
}
|
||||
.card.selected-card {
|
||||
border: 2px solid orange;
|
||||
box-shadow: 0px -6px 13px 1px rgba(0,0,0,0.05);
|
||||
}
|
||||
.iframe-large {
|
||||
height: 93%;
|
||||
margin-top: -10px;
|
||||
}
|
||||
.pdf-frame {
|
||||
transform: none;
|
||||
width: 100%;
|
||||
height: 160px;
|
||||
margin-top: -60px;
|
||||
margin-bottom: 0px;
|
||||
}
|
||||
img.external {
|
||||
height: 30px;
|
||||
margin-right: -10px;
|
||||
padding: 3px;
|
||||
border-radius: 4px;
|
||||
vertical-align: middle;
|
||||
border: 4px solid rgba(0,0,0,0);
|
||||
}
|
||||
img.external:hover {
|
||||
border: 4px solid green;
|
||||
}
|
||||
.screenshot {
|
||||
transform: none;
|
||||
width: 100%;
|
||||
height: auto;
|
||||
max-height: 100px;
|
||||
margin-bottom: 0px;
|
||||
object-fit: cover;
|
||||
object-position: top;
|
||||
}
|
||||
|
||||
@media(max-width: 1092px) {
|
||||
iframe {
|
||||
display: none;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@media(max-width: 728px) {
|
||||
.card h4 {
|
||||
font-size: 5vw;
|
||||
}
|
||||
.card-body {
|
||||
font-size: 4vw;
|
||||
}
|
||||
.card {
|
||||
margin-bottom: 5px;
|
||||
}
|
||||
header > h1 > a.collapse-icon, header > h1 > a.nav-icon {
|
||||
display: none;
|
||||
}
|
||||
}
|
||||
</style>
|
||||
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" integrity="sha384-rwoIResjU2yc3z8GV/NPeZWAv56rSmLldC3R/AZzGRnGxQQKnKkoFVhFQhNUwEyJ" crossorigin="anonymous">
|
||||
</head>
|
||||
<body>
|
||||
<header>
|
||||
<h1 class="page-title">
|
||||
<a href="../../index.html" class="nav-icon" title="Go to Main Index...">
|
||||
<img src="../../static/archive.png" alt="Archive Icon">
|
||||
</a>
|
||||
<a href="#" class="collapse-icon" style="text-decoration: none" title="Toggle info panel...">
|
||||
▾
|
||||
</a>
|
||||
<img src="$favicon_url" height="20px"> $title<br/>
|
||||
<a href="$url" class="title-url">
|
||||
<small>$base_url</small>
|
||||
</a>
|
||||
</h1>
|
||||
</header>
|
||||
<div class="site-header container-fluid">
|
||||
<div class="row archive-page-header">
|
||||
<div class="col-lg-4 alert well">
|
||||
Bookmarked: <small title="Timestamp: $timestamp">$bookmarked_date</small>
|
||||
|
|
||||
Last updated: <small title="Timestamp: $updated">$updated_date</small>
|
||||
|
|
||||
Total files: <small title="Archive methods">🗃 $num_outputs</small>
|
||||
</div>
|
||||
<div class="col-lg-4 alert well">
|
||||
Type:
|
||||
<span class="badge badge-default">$extension</span>
|
||||
|
|
||||
Tags:
|
||||
<span class="badge badge-warning">$tags</span>
|
||||
|
|
||||
Status:
|
||||
<span class="badge badge-$status_color">$status</span>
|
||||
</div>
|
||||
<div class="col-lg-4 alert well">
|
||||
Archive Methods:
|
||||
<a href="index.json" title="JSON summary of archived link.">JSON</a> |
|
||||
<a href="warc/" title="Any WARC archives for the page">WARC</a> |
|
||||
<a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
|
||||
<a href="git/" title="Any git repos at the url">Git Repos</a> |
|
||||
<a href="favicon.ico" title="Favicon of the archived page">Favicon</a> |
|
||||
<a href="." title="Webserver-provided index of files directory.">See all files...</a>
|
||||
</div>
|
||||
<hr/>
|
||||
<div class="col-lg-2">
|
||||
<div class="card selected-card">
|
||||
<iframe class="card-img-top" src="$archive_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
|
||||
<div class="card-body">
|
||||
<a href="$archive_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$archive_url" target="preview"><h4 class="card-title">Local Archive</h4></a>
|
||||
<p class="card-text">archive/$domain</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<iframe class="card-img-top" src="$dom_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
|
||||
<div class="card-body">
|
||||
<a href="$dom_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$dom_url" target="preview"><h4 class="card-title">HTML</h4></a>
|
||||
<p class="card-text">archive/output.html</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<iframe class="card-img-top pdf-frame" src="$pdf_url" scrolling="no"></iframe>
|
||||
<div class="card-body">
|
||||
<a href="$pdf_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$pdf_url" target="preview" id="pdf-btn"><h4 class="card-title">PDF</h4></a>
|
||||
<p class="card-text">archive/output.pdf</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<img class="card-img-top screenshot" src="$screenshot_url">
|
||||
<div class="card-body">
|
||||
<a href="$screenshot_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$screenshot_url" target="preview"><h4 class="card-title">Screenshot</h4></a>
|
||||
<p class="card-text">archive/screenshot.png</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<iframe class="card-img-top" src="$url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
|
||||
<div class="card-body">
|
||||
<a href="$url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$url" target="preview"><h4 class="card-title">Original</h4></a>
|
||||
<p class="card-text">$domain</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="col-lg-2">
|
||||
<div class="card">
|
||||
<iframe class="card-img-top" src="$archive_org_url" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
|
||||
<div class="card-body">
|
||||
<a href="$archive_org_url" style="float:right" title="Open in new tab..." target="_blank" rel="noopener">
|
||||
<img src="../../static/external.png" class="external"/>
|
||||
</a>
|
||||
<a href="$archive_org_url" target="preview"><h4 class="card-title">Archive.Org</h4></a>
|
||||
<p class="card-text">web.archive.org/web/...</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="$archive_url" name="preview"></iframe>
|
||||
|
||||
<script
|
||||
src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
|
||||
integrity="sha256-k2WSCIexGzOj3Euiig+TlR8gA0EmPjuc79OEeY5L45g="
|
||||
crossorigin="anonymous"></script>
|
||||
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
|
||||
|
||||
<script>
|
||||
// show selected file in iframe when preview card is clicked
|
||||
jQuery('.card').on('click', function(e) {
|
||||
jQuery('.selected-card').removeClass('selected-card')
|
||||
jQuery(e.target).closest('.card').addClass('selected-card')
|
||||
})
|
||||
jQuery('.card a[target=preview]').on('click', function(e) {
|
||||
if (e.currentTarget.href.endsWith('.pdf')) {
|
||||
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
|
||||
} else {
|
||||
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
// un-sandbox iframes showing pdfs (required to display pdf viewer)
|
||||
jQuery('iframe').map(function() {
|
||||
if (this.src.endsWith('.pdf')) {
|
||||
this.removeAttribute('sandbox')
|
||||
this.src = this.src
|
||||
}
|
||||
})
|
||||
|
||||
// hide header when collapse icon is clicked
|
||||
jQuery('.collapse-icon').on('click', function() {
|
||||
if (jQuery('.collapse-icon').text().includes('▾')) {
|
||||
jQuery('.collapse-icon').text('▸')
|
||||
jQuery('.site-header').hide()
|
||||
jQuery('.full-page-iframe').addClass('iframe-large')
|
||||
} else {
|
||||
jQuery('.collapse-icon').text('▾')
|
||||
jQuery('.site-header').show()
|
||||
jQuery('.full-page-iframe').removeClass('iframe-large')
|
||||
}
|
||||
return true
|
||||
})
|
||||
|
||||
// hide all preview iframes on small screens
|
||||
if (window.innerWidth < 1091) {
|
||||
jQuery('.card a[target=preview]').attr('target', '_self')
|
||||
}
|
||||
|
||||
var pdf_frame = document.querySelector('.pdf-frame');
|
||||
pdf_frame.onload = function () {
|
||||
pdf_frame.contentWindow.scrollTo(0, 400);
|
||||
}
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
|
@ -1,34 +0,0 @@
|
|||
<!DOCTYPE NETSCAPE-Bookmark-file-1>
|
||||
<!-- This is an automatically generated file.
|
||||
It will be read and overwritten.
|
||||
DO NOT EDIT! -->
|
||||
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
|
||||
<TITLE>Bookmarks</TITLE>
|
||||
<H1>Bookmarks Menu</H1>
|
||||
|
||||
<DL><p>
|
||||
<DT><A HREF="place:folder=BOOKMARKS_MENU&folder=UNFILED_BOOKMARKS&folder=TOOLBAR&queryType=1&sort=12&maxResults=10&excludeQueries=1" ADD_DATE="1409779227" LAST_MODIFIED="1470506008">Recently Bookmarked</A>
|
||||
<DT><A HREF="place:type=6&sort=14&maxResults=10" ADD_DATE="1470506008" LAST_MODIFIED="1470506008">Recent Tags</A>
|
||||
<HR> <DT><H3 ADD_DATE="1409779227" LAST_MODIFIED="1409779227">Mozilla Firefox</H3>
|
||||
<DL><p>
|
||||
<DT><A HREF="https://www.mozilla.org/en-US/firefox/help/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/0-1409779227970" ICON="">Help and Tutorials</A>
|
||||
<DT><A HREF="https://www.mozilla.org/en-US/firefox/customize/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/1-1409779227971" ICON="">Customize Firefox</A>
|
||||
<DT><A HREF="https://www.mozilla.org/en-US/contribute/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/2-1409779227973" ICON="">Get Involved</A>
|
||||
<DT><A HREF="https://www.mozilla.org/en-US/about/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227" ICON_URI="http://www.mozilla.org/2005/made-up-favicon/3-1409779227974" ICON="">About Us</A>
|
||||
</DL><p>
|
||||
<DT><H3 ADD_DATE="1497562973" LAST_MODIFIED="1497562974">[Folder Name]</H3>
|
||||
<DL><p>
|
||||
<DT><A HREF="https://duckduckgo.com/?q=firefox+export+bookmarks&t=ffhp&ia=web" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://duckduckgo.com/favicon.ico" ICON="">firefox export bookmarks at DuckDuckGo</A>
|
||||
<DT><A HREF="https://duckduckgo.com/?q=archive+firefox+bookmarks&t=ffab&ia=web" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://duckduckgo.com/favicon.ico" ICON="">archive firefox bookmarks at DuckDuckGo</A>
|
||||
<DT><A HREF="https://github.com/nodiscc" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">nodiscc (nodiscc) · GitHub</A>
|
||||
<DT><A HREF="https://github.com/pirate/ArchiveBox#troubleshooting" ADD_DATE="1497562975" LAST_MODIFIED="1497562975" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">pirate/ArchiveBox · Github</A>
|
||||
<DT><A HREF="http://www.cs.unc.edu/~fabian/papers/foniks-oak11.pdf" ADD_DATE="1497562976" LAST_MODIFIED="1497562976" ICON_URI="https://assets-cdn.github.com/favicon.ico" ICON="">Phonotactic Reconstruction of Encrypted VoIP Conversations</A>
|
||||
<DT><A HREF="https://www.ghacks.net/2009/07/23/firefox-bookmarks-archiver/" ADD_DATE="1497562974" LAST_MODIFIED="1497562974" ICON_URI="https://www.ghacks.net/wp-content/uploads/2005/10/favicon.ico" ICON="">Firefox Bookmarks Archiver - gHacks Tech News</A>
|
||||
</DL><p>
|
||||
<DT><H3 ADD_DATE="1409779227" LAST_MODIFIED="1470506008" PERSONAL_TOOLBAR_FOLDER="true">Bookmarks Toolbar</H3>
|
||||
<DD>Add bookmarks to this folder to see them displayed on the Bookmarks Toolbar
|
||||
<DL><p>
|
||||
<DT><A HREF="place:sort=8&maxResults=10" ADD_DATE="1470506008" LAST_MODIFIED="1470506008">Most Visited</A>
|
||||
<DT><A HREF="https://www.mozilla.org/en-US/firefox/central/" ADD_DATE="1409779227" LAST_MODIFIED="1409779227">Getting Started</A>
|
||||
</DL><p>
|
||||
</DL>
|
|
@ -1,12 +0,0 @@
|
|||
<!DOCTYPE NETSCAPE-Bookmark-file-1>
|
||||
<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=UTF-8">
|
||||
<TITLE>Pinboard Bookmarks</TITLE>
|
||||
<H1>Bookmarks</H1>
|
||||
<DL>
|
||||
<p>
|
||||
|
||||
<DT><A HREF="https://github.com/trailofbits/algo" ADD_DATE="1542616733" PRIVATE="1" TOREAD="1" TAGS="vpn,scripts,toread">Algo VPN scripts</A>
|
||||
<DT><A HREF="http://www.ulisp.com/" ADD_DATE="1542374412" PRIVATE="1" TOREAD="1" TAGS="arduino,avr,embedded,lisp,toread">uLisp</A>
|
||||
|
||||
</DL>
|
||||
</p>
|
|
@ -1,8 +0,0 @@
|
|||
[{"href":"https:\/\/en.wikipedia.org\/wiki\/International_Typographic_Style","description":"International Typographic Style - Wikipedia, the free encyclopedia","extended":"","meta":"32f4cc916e6f5919cc19aceb10559cc1","hash":"3dd64e155e16731d20350bec6bef7cb5","time":"2016-06-07T11:27:08Z","shared":"no","toread":"yes","tags":""},
|
||||
{"href":"https:\/\/news.ycombinator.com\/item?id=11686984","description":"Announcing Certbot: EFF's Client for Let's Encrypt | Hacker News","extended":"","meta":"4a49602ba5d20ec3505c75d38ebc1d63","hash":"1c1acb53a5bd520e8529ce4f9600abee","time":"2016-05-13T05:46:16Z","shared":"no","toread":"yes","tags":""},
|
||||
{"href":"https:\/\/github.com\/google\/styleguide","description":"GitHub - google\/styleguide: Style guides for Google-originated open-source projects","extended":"","meta":"15a8d50f7295f18ccb6dd19cb689c68a","hash":"1028bf9872d8e4ea1b1858f4044abb58","time":"2016-02-24T08:49:25Z","shared":"no","toread":"no","tags":"code.style.guide programming reference web.dev"},
|
||||
{"href":"http:\/\/en.wikipedia.org\/wiki\/List_of_XML_and_HTML_character_entity_references","description":"List of XML and HTML character entity references - Wikipedia, the free encyclopedia","extended":"","meta":"6683a70f0f59c92c0bfd0bce653eab69","hash":"344d975c6251a8d460971fa2c43d9bbb","time":"2014-06-16T04:17:15Z","shared":"no","toread":"no","tags":"html reference web.dev typography"},
|
||||
{"href":"https:\/\/pushover.net\/","description":"Pushover: Simple Notifications for Android, iOS, and Desktop","extended":"","meta":"1e68511234d9390d10b7772c8ccc4b9e","hash":"bb93374ead8a937b18c7c46e13168a7d","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"app android"},
|
||||
{"href":"http:\/\/www.reddit.com\/r\/Android","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e3","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 1"},
|
||||
{"href":"http:\/\/www.reddit.com\/r\/Android2","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e2","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 2"},
|
||||
{"href":"http:\/\/www.reddit.com\/r\/Android3","description":"r\/android","extended":"","meta":"18a973f09c9cc0608c116967b64e0419","hash":"910293f019c2f4bb1a749fb937ba58e4","time":"2014-06-14T15:51:42Z","shared":"no","toread":"no","tags":"reddit android 3"}]
|
|
@ -1,46 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<rdf:RDF xmlns="http://purl.org/rss/1.0/" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:content="http://purl.org/rss/1.0/modules/content/" xmlns:taxo="http://purl.org/rss/1.0/modules/taxonomy/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:cc="http://web.resource.org/cc/" xmlns:syn="http://purl.org/rss/1.0/modules/syndication/" xmlns:admin="http://webns.net/mvcb/">
|
||||
<channel rdf:about="http://pinboard.in">
|
||||
<title>Pinboard (private aaronmueller)</title>
|
||||
<link>https://pinboard.in/u:aaronmueller/private/</link>
|
||||
<description></description>
|
||||
<items>
|
||||
<rdf:Seq>
|
||||
<rdf:li rdf:resource="https://mehkee.com/"/>
|
||||
<rdf:li rdf:resource="https://qmk.fm/"/>
|
||||
</rdf:Seq>
|
||||
</items>
|
||||
</channel>
|
||||
|
||||
<item rdf:about="https://mehkee.com/">
|
||||
<title>Mehkee - Mechanical Keyboard Parts & Accessories</title>
|
||||
<dc:date>2018-11-08T21:29:32+00:00</dc:date>
|
||||
<link>https://mehkee.com/</link>
|
||||
<dc:creator>aaronmueller</dc:creator>
|
||||
<dc:subject>keyboard gadget diy</dc:subject>
|
||||
<dc:source>http://pinboard.in/</dc:source>
|
||||
<dc:identifier>http://pinboard.in/u:aaronmueller/b:xxx/</dc:identifier>
|
||||
<taxo:topics>
|
||||
<rdf:Bag>
|
||||
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:keyboard"/>
|
||||
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:gadget"/>
|
||||
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:diy"/>
|
||||
</rdf:Bag>
|
||||
</taxo:topics>
|
||||
</item>
|
||||
<item rdf:about="https://qmk.fm/">
|
||||
<title>QMK Firmware - An open source firmware for AVR and ARM based keyboards</title>
|
||||
<dc:date>2018-11-06T22:36:21+00:00</dc:date>
|
||||
<link>https://qmk.fm/</link>
|
||||
<dc:creator>aaronmueller</dc:creator>
|
||||
<dc:subject>firmware keyboard</dc:subject>
|
||||
<dc:source>http://pinboard.in/</dc:source>
|
||||
<dc:identifier>http://pinboard.in/u:aaronmueller/b:xxx/</dc:identifier>
|
||||
<taxo:topics>
|
||||
<rdf:Bag>
|
||||
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:firmware"/>
|
||||
<rdf:li rdf:resource="http://pinboard.in/u:aaronmueller/t:keyboard"/>
|
||||
</rdf:Bag>
|
||||
</taxo:topics>
|
||||
</item>
|
||||
</rdf:RDF>
|
|
@ -1,5 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<posts user="aaronmueller">
|
||||
<post href="https://github.com/trailofbits/algo" time="2018-11-19T08:38:53Z" description="Algo VPN scripts" extended="" tag="vpn scripts" hash="18d708f67bb26d843b1cac4530bb52aa" shared="no" toread="yes" />
|
||||
<post href="http://www.ulisp.com/" time="2018-11-16T13:20:12Z" description="uLisp" extended="" tag="arduino avr embedded lisp" hash="2a17ae95925a03a5b9bb38cf7f6c6f9b" shared="no" toread="yes" />
|
||||
</posts>
|
|
@ -1,2 +0,0 @@
|
|||
[{"href":"https:\/\/github.com\/trailofbits\/algo","description":"Algo VPN scripts","extended":"","meta":"62325ba3b577683aee854d7f191034dc","hash":"18d708f67bb26d843b1cac4530bb52aa","time":"2018-11-19T08:38:53Z","shared":"no","toread":"yes","tags":"vpn scripts"},
|
||||
{"href":"http:\/\/www.ulisp.com\/","description":"uLisp","extended":"","meta":"7bd0c0ef31f69d1459e3d37366e742b3","hash":"2a17ae95925a03a5b9bb38cf7f6c6f9b","time":"2018-11-16T13:20:12Z","shared":"no","toread":"yes","tags":"arduino avr embedded lisp"}]
|
|
@ -1,38 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html>
|
||||
<!--So long and thanks for all the fish-->
|
||||
<head>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
|
||||
<title>Pocket Export</title>
|
||||
</head>
|
||||
<body>
|
||||
<h1>Unread</h1>
|
||||
<ul>
|
||||
<li><a href="https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3110382/" time_added="1493913054" tags="">The Radical Plasticity Thesis: How the Brain Learns to be Conscious</a></li>
|
||||
<li><a href="https://martinfowler.com/eaaDev/uiArchs.html" time_added="1493909628" tags="">GUI Architectures</a></li>
|
||||
<li><a href="https://issuu.com/crowdcraft/docs/shanghai-talk-july-2012" time_added="1493900327" tags="make512">Shanghai Talk July 2012 by Mike Hall - issuu</a></li>
|
||||
<li><a href="http://make512.weebly.com/about-us.html" time_added="1493900002" tags="">About Us - make512</a></li>
|
||||
<li><a href="https://openzfsonosx.org/wiki/ZFS_on_Boot" time_added="1493887140" tags="">ZFS on Boot - OpenZFS on OS X</a></li>
|
||||
<li><a href="http://www.softpanorama.org/DNS/history.shtml" time_added="1493869958" tags="">History of DNS</a></li>
|
||||
<li><a href="https://chromium.googlesource.com/chromium/src/+/master/docs/linux_sandboxing.md" time_added="1493869649" tags="">Linux Sandboxing</a></li>
|
||||
<li><a href="https://hackernoon.com/rems-and-ems-and-why-you-probably-dont-need-them-664b9ce1e09f" time_added="1493694979" tags="">rems and ems, and why you probably don’t need them – Hacker Noon</a></li>
|
||||
<li><a href="https://wiki.archlinux.org/index.php/full_system_backup_with_rsync" time_added="1493581911" tags="">Full system backup with rsync - ArchWiki</a></li>
|
||||
<li><a href="https://www.youtube.com/watch?v=iNnAQpAHfmA" time_added="1493581911" tags="">SingUnltd. - Nature Boy (Flying Lotus Massage Situation Sample?! )</a></li>
|
||||
</ul>
|
||||
|
||||
<h1>Read Archive</h1>
|
||||
<ul>
|
||||
<li><a href="https://github.com/Droogans/unmaintainable-code" time_added="1478739800" tags="">Droogans/unmaintainable-code: An easier to share version of the infamous ht</a></li>
|
||||
<li><a href="http://www.benstopford.com/2015/02/14/log-structured-merge-trees/" time_added="1478739709" tags="">Log Structured Merge Trees - ben stopford</a></li>
|
||||
<li><a href="http://jgthms.com/web-design-in-4-minutes/#share" time_added="1478739628" tags="">Web Design in 4 minutes</a></li>
|
||||
<li><a href="https://eev.ee/blog/2016/07/26/the-hardest-problem-in-computer-science/" time_added="1478739622" tags="">The hardest problem in computer science / fuzzy notepad</a></li>
|
||||
<li><a href="https://medium.com/@iamjordanlittle/9-underutilized-features-in-css-90ced6ddbfe7#.690ah7whf" time_added="1476686912" tags="">9 Underutilized Features in CSS – Medium</a></li>
|
||||
<li><a href="http://themacro.com/articles/2016/09/employee-1-coinbase/" time_added="1476686907" tags="">Employee #1: Coinbase · The Macro</a></li>
|
||||
<li><a href="https://juokaz.com/blog/becoming-a-cto" time_added="1476686904" tags="">Becoming a CTO // Juozas Kaziukėnas</a></li>
|
||||
<li><a href="https://backchannel.com/the-internet-really-has-changed-everything-here-s-the-proof-928eaead18a8#.ekfmwcjh2" time_added="1476686896" tags="">The Internet Really Has Changed Everything. Here’s the Proof.</a></li>
|
||||
<li><a href="http://www.hindawi.com/journals/ijbm/2011/172389/" time_added="1424321329" tags="">Experimental and Modeling Study of Collagen Scaffolds with the Effects of C</a></li>
|
||||
<li><a href="http://search.cpan.org/dist/Locale-Maketext/lib/Locale/Maketext/TPJ13.pod?#A_Localization_Horror_Story:_It_Could_Happen_To_You" time_added="1424306906" tags="">Locale::Maketext::TPJ13 - search.cpan.org</a></li>
|
||||
|
||||
</ul>
|
||||
</body>
|
||||
</html>
|
|
@ -1,228 +0,0 @@
|
|||
<?xml version="1.0" encoding="UTF-8"?><rss version="2.0"
|
||||
xmlns:content="http://purl.org/rss/1.0/modules/content/"
|
||||
xmlns:wfw="http://wellformedweb.org/CommentAPI/"
|
||||
xmlns:dc="http://purl.org/dc/elements/1.1/"
|
||||
xmlns:atom="http://www.w3.org/2005/Atom"
|
||||
>
|
||||
|
||||
<channel>
|
||||
|
||||
<title>My Reading List: Read and Unread</title>
|
||||
<description>Items I've saved to read</description>
|
||||
<link>http://readitlaterlist.com/users/nikisweeting/feed/all</link>
|
||||
<atom:link href="http://readitlaterlist.com/users/nikisweeting/feed/all" rel="self" type="application/rss+xml" />
|
||||
|
||||
|
||||
<item>
|
||||
<title><![CDATA[Cell signaling]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://en.wikipedia.org/wiki/Cell_signaling</link>
|
||||
<guid>https://en.wikipedia.org/wiki/Cell_signaling</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:12:10 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Hayflick limit]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://en.wikipedia.org/wiki/Hayflick_limit</link>
|
||||
<guid>https://en.wikipedia.org/wiki/Hayflick_limit</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:11:38 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Even moderate drinking by parents can upset children – study]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal</link>
|
||||
<guid>https://theguardian.com/society/2017/oct/18/even-moderate-drinking-by-parents-can-upset-children-study?CMP=Share_AndroidApp_Signal</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:11:30 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[How Merkle trees enable the decentralized Web]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web</link>
|
||||
<guid>https://taravancil.com/blog/how-merkle-trees-enable-decentralized-web</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:11:30 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Inertial navigation system]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://en.wikipedia.org/wiki/Inertial_navigation_system</link>
|
||||
<guid>https://en.wikipedia.org/wiki/Inertial_navigation_system</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:10:10 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Dead reckoning]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://en.wikipedia.org/wiki/Dead_reckoning</link>
|
||||
<guid>https://en.wikipedia.org/wiki/Dead_reckoning</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:10:08 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Calling Rust From Python]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://bheisler.github.io/post/calling-rust-in-python</link>
|
||||
<guid>https://bheisler.github.io/post/calling-rust-in-python</guid>
|
||||
<pubDate>Mon, 30 Oct 2017 01:04:33 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Why would anyone choose Docker over fat binaries?]]></title>
|
||||
<category>Unread</category>
|
||||
<link>http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries</link>
|
||||
<guid>http://smashcompany.com/technology/why-would-anyone-choose-docker-over-fat-binaries</guid>
|
||||
<pubDate>Sun, 29 Oct 2017 14:57:25 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://heml.io</link>
|
||||
<guid>https://heml.io</guid>
|
||||
<pubDate>Sun, 29 Oct 2017 14:55:26 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[A surprising amount of people want to be in North Korea]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad</link>
|
||||
<guid>https://blog.benjojo.co.uk/post/north-korea-dprk-bgp-geoip-fruad</guid>
|
||||
<pubDate>Sat, 28 Oct 2017 05:41:41 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Learning a Hierarchy]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://blog.openai.com/learning-a-hierarchy</link>
|
||||
<guid>https://blog.openai.com/learning-a-hierarchy</guid>
|
||||
<pubDate>Thu, 26 Oct 2017 16:43:48 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[High Performance Browser Networking]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://hpbn.co</link>
|
||||
<guid>https://hpbn.co</guid>
|
||||
<pubDate>Wed, 25 Oct 2017 19:05:24 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[What tender and juicy drama is going on at your school/workplace?]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v</link>
|
||||
<guid>https://reddit.com/r/AskReddit/comments/78nc2a/what_tender_and_juicy_drama_is_going_on_at_your/dovab2v</guid>
|
||||
<pubDate>Wed, 25 Oct 2017 18:05:58 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Using an SSH Bastion Host]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host</link>
|
||||
<guid>https://blog.scottlowe.org/2015/11/21/using-ssh-bastion-host</guid>
|
||||
<pubDate>Wed, 25 Oct 2017 11:38:47 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Let's Define "undefined" | NathanShane.me]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://nathanshane.me/blog/let's-define-undefined</link>
|
||||
<guid>https://nathanshane.me/blog/let's-define-undefined</guid>
|
||||
<pubDate>Wed, 25 Oct 2017 11:32:59 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Control theory]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function</link>
|
||||
<guid>https://en.wikipedia.org/wiki/Control_theory#Closed-loop_transfer_function</guid>
|
||||
<pubDate>Tue, 24 Oct 2017 22:57:43 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[J012-86-intractable.pdf]]></title>
|
||||
<category>Unread</category>
|
||||
<link>http://mit.edu/~jnt/Papers/J012-86-intractable.pdf</link>
|
||||
<guid>http://mit.edu/~jnt/Papers/J012-86-intractable.pdf</guid>
|
||||
<pubDate>Tue, 24 Oct 2017 22:56:32 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Dynamic Programming: First Principles]]></title>
|
||||
<category>Unread</category>
|
||||
<link>http://flawlessrhetoric.com/Dynamic-Programming-First-Principles</link>
|
||||
<guid>http://flawlessrhetoric.com/Dynamic-Programming-First-Principles</guid>
|
||||
<pubDate>Tue, 24 Oct 2017 22:56:30 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[What Would Happen If There Were No Number 6?]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6</link>
|
||||
<guid>https://fivethirtyeight.com/features/what-would-happen-if-there-were-no-number-6</guid>
|
||||
<pubDate>Tue, 24 Oct 2017 22:21:59 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Ten Basic Rules for Adventure]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://outsideonline.com/2252916/10-basic-rules-adventure</link>
|
||||
<guid>https://outsideonline.com/2252916/10-basic-rules-adventure</guid>
|
||||
<pubDate>Tue, 24 Oct 2017 20:56:25 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Insects Are In Serious Trouble]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true</link>
|
||||
<guid>https://theatlantic.com/science/archive/2017/10/oh-no/543390?single_page=true</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 23:10:10 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Netflix/bless]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://github.com/Netflix/bless</link>
|
||||
<guid>https://github.com/Netflix/bless</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 23:04:46 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Getting Your First 10 Customers]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://stripe.com/atlas/guides/starting-sales</link>
|
||||
<guid>https://stripe.com/atlas/guides/starting-sales</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 22:27:36 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[GPS Hardware]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://novasummits.com/gps-hardware</link>
|
||||
<guid>https://novasummits.com/gps-hardware</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 04:44:40 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Bicycle Tires and Tubes]]></title>
|
||||
<category>Unread</category>
|
||||
<link>http://sheldonbrown.com/tires.html#pressure</link>
|
||||
<guid>http://sheldonbrown.com/tires.html#pressure</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 01:28:32 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Tire light is on]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe</link>
|
||||
<guid>https://reddit.com/r/Justrolledintotheshop/comments/77zm9e/tire_light_is_on/doqbshe</guid>
|
||||
<pubDate>Mon, 23 Oct 2017 01:21:42 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Bad_Salish_Boo ?? on Twitter]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://t.co/PDLlNjACv9</link>
|
||||
<guid>https://t.co/PDLlNjACv9</guid>
|
||||
<pubDate>Sat, 21 Oct 2017 06:48:07 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Is an Open Marriage a Happier Marriage?]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html</link>
|
||||
<guid>https://nytimes.com/2017/05/11/magazine/is-an-open-marriage-a-happier-marriage.html</guid>
|
||||
<pubDate>Fri, 20 Oct 2017 13:08:52 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[The Invention of Monogamy]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://thenib.com/the-invention-of-monogamy</link>
|
||||
<guid>https://thenib.com/the-invention-of-monogamy</guid>
|
||||
<pubDate>Fri, 20 Oct 2017 12:19:00 -0500</pubDate>
|
||||
</item>
|
||||
<item>
|
||||
<title><![CDATA[Google Chrome May Add a Permission to Stop In-Browser Cryptocurrency Miners]]></title>
|
||||
<category>Unread</category>
|
||||
<link>https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners</link>
|
||||
<guid>https://bleepingcomputer.com/news/google/google-chrome-may-add-a-permission-to-stop-in-browser-cryptocurrency-miners</guid>
|
||||
<pubDate>Fri, 20 Oct 2017 03:57:41 -0500</pubDate>
|
||||
</item>
|
||||
</channel>
|
||||
|
||||
</rss>
|
|
@ -1,92 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
import json
|
||||
import os
|
||||
from os.path import dirname, pardir, join
|
||||
from subprocess import check_output, check_call
|
||||
from tempfile import TemporaryDirectory
|
||||
from typing import List
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
ARCHIVER_BIN = join(dirname(__file__), pardir, 'archive.py')
|
||||
|
||||
|
||||
class Helper:
    """Runs the archiver script against a list of URLs, using ``output_dir`` as its OUTPUT_DIR."""

    def __init__(self, output_dir: str):
        self.output_dir = output_dir

    @staticmethod
    def _merge_env(env=None, env_defaults=None):
        """Return a new dict of ``env_defaults`` overlaid with ``env`` (``env`` wins on conflicts)."""
        if env_defaults is None:
            env_defaults = {
                # we don't wanna spam archive.org within our tests..
                'SUBMIT_ARCHIVE_DOT_ORG': 'False',
            }
        if env is None:
            env = {}
        # {**a, **b} lets explicit values override defaults; dict(**a, **b)
        # raises TypeError as soon as both mappings share a key.
        return {**env_defaults, **env}

    def run(self, links, env=None, env_defaults=None):
        """Write ``links`` to ``<output_dir>/input.json`` and run the archiver on that file.

        links:        iterable of URLs; each becomes an entry with 'href' and 'description' keys.
        env:          optional extra environment variables for the subprocess.
        env_defaults: optional base environment variables; ``env`` overrides them.

        Raises subprocess.CalledProcessError (from check_call) if the archiver exits non-zero.
        """
        env = self._merge_env(env, env_defaults)

        entries = [{'href': url, 'description': url} for url in links]

        # make sure the target directory exists before writing the input file into it
        os.makedirs(self.output_dir, exist_ok=True)
        input_json = join(self.output_dir, 'input.json')
        with open(input_json, 'w') as fo:
            json.dump(entries, fo)

        env['OUTPUT_DIR'] = self.output_dir
        check_call(
            [ARCHIVER_BIN, input_json],
            env={**os.environ.copy(), **env},
        )
|
||||
|
||||
|
||||
class TestArchiver:
|
||||
def setup(self):
|
||||
# self.tdir = TemporaryDirectory(dir='hello')
|
||||
class AAA:
|
||||
name = 'hello'
|
||||
self.tdir = AAA()
|
||||
|
||||
def teardown(self):
|
||||
pass
|
||||
# self.tdir.cleanup()
|
||||
|
||||
@property
|
||||
def output_dir(self):
|
||||
return self.tdir.name
|
||||
|
||||
def test_fetch_favicon_false(self):
|
||||
h = Helper(self.output_dir)
|
||||
|
||||
h.run(links=[
|
||||
'https://google.com',
|
||||
], env={
|
||||
'FETCH_FAVICON': 'False',
|
||||
})
|
||||
# for now no asserts, good enough if it isn't failing
|
||||
|
||||
def test_3000_links(self):
|
||||
"""
|
||||
The pages are deliberatly unreachable. The tool should gracefully process all of them even though individual links are failing.
|
||||
"""
|
||||
h = Helper(self.output_dir)
|
||||
|
||||
h.run(links=[
|
||||
f'https://localhost:123/whatever_{i}.html' for i in range(3000)
|
||||
], env={
|
||||
'FETCH_FAVICON': 'False',
|
||||
'FETCH_SCREENSHOT': 'False',
|
||||
'FETCH_PDF': 'False',
|
||||
'FETCH_DOM': 'False',
|
||||
'CHECK_SSL_VALIDITY': 'False',
|
||||
})
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
pytest.main([__file__])
|
1
archivebox/themes/admin/actions_as_select.html
Normal file
1
archivebox/themes/admin/actions_as_select.html
Normal file
|
@ -0,0 +1 @@
|
|||
actions_as_select
|
18
archivebox/themes/admin/app_index.html
Normal file
18
archivebox/themes/admin/app_index.html
Normal file
|
@ -0,0 +1,18 @@
|
|||
{% extends "admin/index.html" %}
|
||||
{% load i18n %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} app-{{ app_label }}{% endblock %}
|
||||
|
||||
{% if not is_popup %}
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
›
|
||||
{% for app in app_list %}
|
||||
{{ app.name }}
|
||||
{% endfor %}
|
||||
</div>
|
||||
{% endblock %}
|
||||
{% endif %}
|
||||
|
||||
{% block sidebar %}{% endblock %}
|
188
archivebox/themes/admin/base.html
Normal file
188
archivebox/themes/admin/base.html
Normal file
|
@ -0,0 +1,188 @@
|
|||
{% load i18n static %}<!DOCTYPE html>
|
||||
{% get_current_language as LANGUAGE_CODE %}{% get_current_language_bidi as LANGUAGE_BIDI %}
|
||||
<html lang="{{ LANGUAGE_CODE|default:"en-us" }}" {% if LANGUAGE_BIDI %}dir="rtl"{% endif %}>
|
||||
<head>
|
||||
<title>{% block title %}{% endblock %} | ArchiveBox</title>
|
||||
<link rel="stylesheet" type="text/css" href="{% block stylesheet %}{% static "admin/css/base.css" %}{% endblock %}">
|
||||
{% block extrastyle %}{% endblock %}
|
||||
{% if LANGUAGE_BIDI %}<link rel="stylesheet" type="text/css" href="{% block stylesheet_rtl %}{% static "admin/css/rtl.css" %}{% endblock %}">{% endif %}
|
||||
{% block extrahead %}{% endblock %}
|
||||
{% block responsive %}
|
||||
<meta name="viewport" content="user-scalable=no, width=device-width, initial-scale=1.0, maximum-scale=1.0">
|
||||
<link rel="stylesheet" type="text/css" href="{% static "admin/css/responsive.css" %}">
|
||||
{% if LANGUAGE_BIDI %}<link rel="stylesheet" type="text/css" href="{% static "admin/css/responsive_rtl.css" %}">{% endif %}
|
||||
{% endblock %}
|
||||
{% block blockbots %}<meta name="robots" content="NONE,NOARCHIVE">{% endblock %}
|
||||
<link rel="stylesheet" type="text/css" href="{% static "admin.css" %}">
|
||||
</head>
|
||||
{% load i18n %}
|
||||
|
||||
<body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
|
||||
data-admin-utc-offset="{% now "Z" %}">
|
||||
|
||||
<style nonce="{{nonce}}">
|
||||
/* Loading Progress Bar */
|
||||
#progress {
|
||||
position: absolute;
|
||||
z-index: 1000;
|
||||
top: 0px;
|
||||
left: -6px;
|
||||
width: 2%;
|
||||
opacity: 1;
|
||||
height: 2px;
|
||||
background: #1a1a1a;
|
||||
border-radius: 1px;
|
||||
transition: width 4s ease-out, opacity 400ms linear;
|
||||
}
|
||||
|
||||
@-moz-keyframes bugfix { from { padding-right: 1px ; } to { padding-right: 0; } }
|
||||
</style>
|
||||
|
||||
<script>
|
||||
// Page Loading Bar
|
||||
// Show the page-loading progress bar (#progress) if it is not already on the page.
//   distance: optional starting width in percent; when omitted or 0 a random
//             value between 30 and 60 is used.
window.loadStart = function(distance) {
    distance = distance || 0;
    // only add the loading-bar element if not already present
    if (django.jQuery("#loading-bar").length === 0) {
        // .append() inserts into the DOM; .add() only builds a new jQuery set
        // that was immediately discarded, so nothing was ever inserted.
        django.jQuery("body").append("<div id=\"loading-bar\"></div>");
    }
    if (django.jQuery("#progress").length === 0) {
        django.jQuery("body").append(django.jQuery("<div></div>").attr("id", "progress"));
        let last_distance = (distance || (30 + (Math.random() * 30)));
        django.jQuery("#progress").width(last_distance + "%");
        // creep forward a little every second so the bar keeps moving
        const creep = setInterval(function() {
            const bar = django.jQuery("#progress");
            if (bar.length === 0) {
                // the bar has been removed from the page: stop the timer instead of leaking it
                clearInterval(creep);
                return;
            }
            last_distance += Math.random();
            bar.width(last_distance + "%");
        }, 1000);
    }
};
|
||||
|
||||
// Stretch the progress bar to full width, then fade it out and remove it from the page.
window.loadFinish = function() {
    var bar = django.jQuery("#progress");
    bar.width("101%");
    bar.delay(200).fadeOut(400, function() {
        django.jQuery(this).remove();
    });
};
|
||||
window.loadStart();
|
||||
window.addEventListener('beforeunload', function() {window.loadStart(27)});
|
||||
document.addEventListener('DOMContentLoaded', function() {window.loadFinish()});
|
||||
</script>
|
||||
|
||||
|
||||
<!-- Container -->
|
||||
<div id="container">
|
||||
|
||||
{% if not is_popup %}
|
||||
<!-- Header -->
|
||||
<div id="header">
|
||||
<div id="branding">
|
||||
<h1 id="site-name">
|
||||
<a href="{% url 'Home' %}">
|
||||
<img src="{% static 'archive.png' %}" id="logo">
|
||||
ArchiveBox
|
||||
</a>
|
||||
</h1>
|
||||
|
||||
</div>
|
||||
{% block usertools %}
|
||||
{% if has_permission %}
|
||||
<div id="user-tools">
|
||||
<a href="{% url 'admin:Add' %}">Add ➕</a> /
|
||||
<a href="{% url 'Home' %}">Snapshots</a> /
|
||||
<a href="/admin/auth/user/">Users</a> /
|
||||
<a href="{% url 'OldHome' %}">Old UI</a> /
|
||||
<a href="{% url 'Docs' %}">Docs</a>
|
||||
|
||||
{% block welcome-msg %}
|
||||
{% trans 'User' %}
|
||||
<strong>{% firstof user.get_short_name user.get_username %}</strong>
|
||||
{% endblock %}
|
||||
{% block userlinks %}
|
||||
{% if user.is_active and user.is_staff %}
|
||||
{% url 'django-admindocs-docroot' as docsroot %}
|
||||
{% if docsroot %}
|
||||
<a href="{{ docsroot }}">{% trans 'Documentation' %}</a> /
|
||||
{% endif %}
|
||||
{% endif %}
|
||||
{% if user.has_usable_password %}
|
||||
<a href="{% url 'admin:password_change' %}">{% trans 'Change password' %}</a> /
|
||||
{% endif %}
|
||||
<a href="{% url 'admin:logout' %}">{% trans 'Log out' %}</a>
|
||||
{% endblock %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endblock %}
|
||||
{% block nav-global %}{% endblock %}
|
||||
</div>
|
||||
<!-- END Header -->
|
||||
{% block breadcrumbs %}
|
||||
<div class="breadcrumbs">
|
||||
<a href="{% url 'admin:index' %}">{% trans 'Home' %}</a>
|
||||
{% if title %} › {{ title }}{% endif %}
|
||||
</div>
|
||||
{% endblock %}
|
||||
{% endif %}
|
||||
|
||||
{% block messages %}
|
||||
{% if messages %}
|
||||
<ul class="messagelist">{% for message in messages %}
|
||||
<li{% if message.tags %} class="{{ message.tags }}"{% endif %}>{{ message|capfirst }}</li>
|
||||
{% endfor %}</ul>
|
||||
{% endif %}
|
||||
{% endblock messages %}
|
||||
|
||||
<!-- Content -->
|
||||
<div id="content" class="{% block coltype %}colM{% endblock %}">
|
||||
{% block pretitle %}{% endblock %}
|
||||
{% block content_title %}{# {% if title %}<h1>{{ title }}</h1>{% endif %} #}{% endblock %}
|
||||
{% block content %}
|
||||
{% block object-tools %}{% endblock %}
|
||||
{{ content }}
|
||||
{% endblock %}
|
||||
{% block sidebar %}{% endblock %}
|
||||
<br class="clear">
|
||||
</div>
|
||||
<!-- END Content -->
|
||||
|
||||
{% block footer %}<div id="footer"></div>{% endblock %}
|
||||
</div>
|
||||
<!-- END Container -->
|
||||
|
||||
<script>
|
||||
(function ($) {
|
||||
$.fn.reverse = [].reverse;
|
||||
|
||||
// When the action <select> inside div.actions has fewer than 10 options,
// hide its label/button and render one button per action instead. Clicking a
// button selects the matching option and clicks the form's "index" button.
function fix_actions() {
    var container = $('div.actions');

    if (container.find('option').length < 10) {
        container.find('label, button').hide();

        var buttons = $('<div></div>')
            .prependTo(container)
            .css('display', 'inline')
            // addClass() takes only the class name(s); the previous two-argument
            // call did not apply 'action-buttons'.
            .addClass('action-buttons');

        // option 0 is skipped; the remaining options become buttons
        container.find('option:gt(0)').reverse().each(function () {
            $('<button>')
                .appendTo(buttons)
                .attr('name', this.value)
                .addClass('button')
                .text(this.text)
                .click(function () {
                    // .val() selects the option whose value matches and deselects
                    // the rest, without building a selector from the raw value.
                    container.find('select').val(this.name);
                    $('#changelist-form button[name="index"]').click();
                    // swap the logo for a loading spinner, if the logo is on this page
                    var logo = document.querySelector('#logo');
                    if (logo) {
                        logo.outerHTML = '<div class="loader"></div>';
                    }
                });
        });
    }
}
|
||||
$(function () {
|
||||
fix_actions();
|
||||
});
|
||||
})(django.jQuery);
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
100
archivebox/themes/admin/login.html
Normal file
100
archivebox/themes/admin/login.html
Normal file
|
@ -0,0 +1,100 @@
|
|||
{% extends "admin/base_site.html" %}
|
||||
{% load i18n static %}
|
||||
|
||||
{% block extrastyle %}{{ block.super }}<link rel="stylesheet" type="text/css" href="{% static "admin/css/login.css" %}">
|
||||
{{ form.media }}
|
||||
{% endblock %}
|
||||
|
||||
{% block bodyclass %}{{ block.super }} login{% endblock %}
|
||||
|
||||
{% block branding %}<h1>ArchiveBox Admin</h1>{% endblock %}
|
||||
|
||||
{% block usertools %}
|
||||
<br/>
|
||||
<a href="{% url 'Home' %}">Back to Main Index</a>
|
||||
{% endblock %}
|
||||
|
||||
{% block nav-global %}{% endblock %}
|
||||
|
||||
{% block content_title %}
|
||||
<center>
|
||||
Log in to add, edit, and remove links from your archive.
|
||||
</center><br/><br/>
|
||||
<img src="{% static 'archive.png' %}" style="width: 80px; display: block; margin: auto"/><br/>
|
||||
{% endblock %}
|
||||
|
||||
{% block breadcrumbs %}{% endblock %}
|
||||
|
||||
{% block content %}
|
||||
{% if form.errors and not form.non_field_errors %}
|
||||
<p class="errornote">
|
||||
{% if form.errors.items|length == 1 %}{% trans "Please correct the error below." %}{% else %}{% trans "Please correct the errors below." %}{% endif %}
|
||||
</p>
|
||||
{% endif %}
|
||||
|
||||
{% if form.non_field_errors %}
|
||||
{% for error in form.non_field_errors %}
|
||||
<p class="errornote">
|
||||
{{ error }}
|
||||
</p>
|
||||
{% endfor %}
|
||||
{% endif %}
|
||||
|
||||
<div id="content-main">
|
||||
|
||||
{% if user.is_authenticated %}
|
||||
<p class="errornote">
|
||||
{% blocktrans trimmed %}
|
||||
You are authenticated as {{ username }}, but are not authorized to
|
||||
access this page. Would you like to login to a different account?
|
||||
{% endblocktrans %}
|
||||
</p>
|
||||
{% endif %}
|
||||
|
||||
<br/>
|
||||
<form action="{{ app_path }}" method="post" id="login-form">{% csrf_token %}
|
||||
<div class="form-row">
|
||||
{{ form.username.errors }}
|
||||
{{ form.username.label_tag }} {{ form.username }}
|
||||
</div>
|
||||
<div class="form-row">
|
||||
{{ form.password.errors }}
|
||||
{{ form.password.label_tag }} {{ form.password }}
|
||||
<input type="hidden" name="next" value="{{ next }}">
|
||||
</div>
|
||||
{% url 'admin_password_reset' as password_reset_url %}
|
||||
{% if password_reset_url %}
|
||||
<div class="password-reset-link">
|
||||
<a href="{{ password_reset_url }}">{% trans 'Forgotten your password or username?' %}</a>
|
||||
</div>
|
||||
{% endif %}
|
||||
<div class="submit-row">
|
||||
<label> </label><input type="submit" value="{% trans 'Log in' %}">
|
||||
</div>
|
||||
</form>
|
||||
|
||||
<center>
|
||||
<br/><br/>
|
||||
<hr/>
|
||||
<br/>
|
||||
If you forgot your password, <a href="/accounts/password_reset/">reset it here</a> or run:<br/>
|
||||
<pre>
|
||||
archivebox manage changepassword USERNAME
|
||||
</pre>
|
||||
|
||||
<br/><br/>
|
||||
<hr/>
|
||||
<br/>
|
||||
To create a new admin user, run the following:
|
||||
<pre>
|
||||
archivebox manage createsuperuser
|
||||
</pre>
|
||||
<br/>
|
||||
<hr/>
|
||||
|
||||
<small><i>(cd into your archive folder before running commands)</i></small>
|
||||
</center>
|
||||
|
||||
|
||||
</div>
|
||||
{% endblock %}
|
Some files were not shown because too many files have changed in this diff Show more
Loading…
Reference in a new issue