mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-24 21:23:22 +00:00
update urls to new repo path
This commit is contained in:
parent
d97fc6b16c
commit
0e2ccbc10d
16 changed files with 163 additions and 163 deletions
|
@ -12,8 +12,8 @@ FROM python:3.8-slim-buster
|
||||||
LABEL name="archivebox" \
|
LABEL name="archivebox" \
|
||||||
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
|
maintainer="Nick Sweeting <archivebox-docker@sweeting.me>" \
|
||||||
description="All-in-one personal internet archiving container" \
|
description="All-in-one personal internet archiving container" \
|
||||||
homepage="https://github.com/pirate/ArchiveBox" \
|
homepage="https://github.com/ArchiveBox/ArchiveBox" \
|
||||||
documentation="https://github.com/pirate/ArchiveBox/wiki/Docker#docker"
|
documentation="https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker"
|
||||||
|
|
||||||
# System-level base config
|
# System-level base config
|
||||||
ENV TZ=UTC \
|
ENV TZ=UTC \
|
||||||
|
|
116
README.md
116
README.md
|
@ -2,13 +2,13 @@
|
||||||
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
||||||
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
||||||
|
|
||||||
▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
||||||
<a href="https://archivebox.zervice.io/">Demo</a> |
|
<a href="https://archivebox.zervice.io/">Demo</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox">Github</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox">Github</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
|
||||||
<a href="#background--motivation">Info & Motivation</a> |
|
<a href="#background--motivation">Info & Motivation</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
"Your own personal internet archive" (网站存档 / 爬虫)
|
"Your own personal internet archive" (网站存档 / 爬虫)
|
||||||
|
@ -16,11 +16,11 @@
|
||||||
|
|
||||||
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
||||||
|
|
||||||
<a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
||||||
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
||||||
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
||||||
|
|
||||||
<hr/>
|
<hr/>
|
||||||
|
@ -48,7 +48,7 @@ open http://127.0.0.1:8000/admin/login/ # then click "Add" in the navbar
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
||||||
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
|
For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs.
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -76,7 +76,7 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the
|
||||||
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
|
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
|
||||||
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
|
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
|
||||||
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
|
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
|
||||||
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
|
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
|
||||||
<br/>
|
<br/>
|
||||||
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
|
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
|
||||||
</div><br/>
|
</div><br/>
|
||||||
|
@ -84,16 +84,16 @@ At the end of the day, the goal is to sleep soundly knowing that the part of the
|
||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
||||||
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
|
- [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage)
|
||||||
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
- Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
||||||
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
||||||
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
|
- ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
|
||||||
- **Doesn't require a constantly-running daemon**, proxy, or native app
|
- **Doesn't require a constantly-running daemon**, proxy, or native app
|
||||||
- Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
|
- Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
|
||||||
- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
|
- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc.
|
||||||
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
||||||
|
|
||||||
## Input formats
|
## Input formats
|
||||||
|
|
||||||
|
@ -112,7 +112,7 @@ archivebox add --depth=1 'https://news.ycombinator.com#2020-12-12'
|
||||||
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
||||||
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
||||||
|
|
||||||
See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
|
See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
|
||||||
|
|
||||||
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
|
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
|
||||||
|
|
||||||
|
@ -137,15 +137,15 @@ The on-disk layout is optimized to be easy to browse by hand and durable long-te
|
||||||
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
||||||
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
||||||
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
||||||
- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
|
- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._
|
||||||
|
|
||||||
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
|
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
You don't need to install all the dependencies; ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled.
|
You don't need to install all the dependencies; ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled.
|
||||||
|
|
||||||
If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS system using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install).
|
If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS system using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install).
|
||||||
|
|
||||||
ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), `nodejs` (for readability and singlefile), and more.
|
ArchiveBox is written in Python 3 so it requires `python3` and `pip3` available on your system. It also uses a set of optional, but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), `nodejs` (for readability and singlefile), and more.
|
||||||
|
|
||||||
|
@ -163,7 +163,7 @@ archivebox config --set SAVE_FAVICON=False # optional: only the domain is leake
|
||||||
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google
|
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google
|
||||||
```
|
```
|
||||||
|
|
||||||
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
||||||
```bash
|
```bash
|
||||||
# visiting an archived page with malicious JS:
|
# visiting an archived page with malicious JS:
|
||||||
https://127.0.0.1:8000/archive/1602401954/example.com/index.html
|
https://127.0.0.1:8000/archive/1602401954/example.com/index.html
|
||||||
|
@ -174,7 +174,7 @@ https://127.0.0.1:8000/archive/*
|
||||||
# then example.com/index.js can send it off to some evil server
|
# then example.com/index.js can send it off to some evil server
|
||||||
```
|
```
|
||||||
|
|
||||||
Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
|
Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
|
||||||
```bash
|
```bash
|
||||||
archivebox add 'https://example.com#2020-10-24'
|
archivebox add 'https://example.com#2020-10-24'
|
||||||
...
|
...
|
||||||
|
@ -196,7 +196,7 @@ a headless browser runtime, a full webserver, and CLI interface.
|
||||||
# docker-compose run archivebox <command> [args]
|
# docker-compose run archivebox <command> [args]
|
||||||
|
|
||||||
mkdir archivebox && cd archivebox
|
mkdir archivebox && cd archivebox
|
||||||
wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml'
|
wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
|
||||||
docker-compose run archivebox init
|
docker-compose run archivebox init
|
||||||
docker-compose run archivebox add 'https://example.com'
|
docker-compose run archivebox add 'https://example.com'
|
||||||
docker-compose run archivebox manage createsuperuser
|
docker-compose run archivebox manage createsuperuser
|
||||||
|
@ -250,7 +250,7 @@ python3 -m venv .venv && source .venv/bin/activate
|
||||||
pip install --upgrade archivebox
|
pip install --upgrade archivebox
|
||||||
|
|
||||||
# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
|
# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
|
||||||
npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git'
|
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
|
||||||
```
|
```
|
||||||
|
|
||||||
Initialize your archive and add some links:
|
Initialize your archive and add some links:
|
||||||
|
@ -314,13 +314,13 @@ All the archived links are stored by date bookmarked in `./archive/<timestamp>`,
|
||||||
|
|
||||||
## Comparison to Other Projects
|
## Comparison to Other Projects
|
||||||
|
|
||||||
▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
|
▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
|
||||||
|
|
||||||
<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
|
<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
|
||||||
|
|
||||||
#### User Interface & Intended Purpose
|
#### User Interface & Intended Purpose
|
||||||
|
|
||||||
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
|
ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
|
||||||
|
|
||||||
#### Private Local Archives vs Centralized Public Archives
|
#### Private Local Archives vs Centralized Public Archives
|
||||||
|
|
||||||
|
@ -336,16 +336,16 @@ Whether you want to learn which organizations are the big players in the web arc
|
||||||
|
|
||||||
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
|
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
|
||||||
|
|
||||||
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
- [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
||||||
_Community-maintained indexes of archiving tools and institutions._
|
_Community-maintained indexes of archiving tools and institutions._
|
||||||
- [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
- [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
||||||
_Open source tools and projects in the internet archiving space._
|
_Open source tools and projects in the internet archiving space._
|
||||||
- [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
- [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
||||||
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
|
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
|
||||||
- [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
- [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
||||||
_A collection of the most active internet archiving communities and initiatives._
|
_A collection of the most active internet archiving communities and initiatives._
|
||||||
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
||||||
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
||||||
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
||||||
|
|
||||||
|
@ -355,51 +355,51 @@ Whether you want to learn which organizations are the big players in the web arc
|
||||||
|
|
||||||
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
|
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
|
||||||
|
|
||||||
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
|
We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
|
||||||
|
|
||||||
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
|
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder.
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
|
- [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart)
|
||||||
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
|
- [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install)
|
||||||
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
|
- [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)
|
||||||
|
|
||||||
## Reference
|
## Reference
|
||||||
|
|
||||||
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
|
- [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage)
|
||||||
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
|
- [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)
|
||||||
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
- [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
||||||
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
||||||
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
|
- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)
|
||||||
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
|
- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive)
|
||||||
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
|
- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium)
|
||||||
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
|
- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
|
||||||
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
|
- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
|
||||||
- [Python API](https://docs.archivebox.io/en/latest/modules.html)
|
- [Python API](https://docs.archivebox.io/en/latest/modules.html)
|
||||||
- REST API (coming soon...)
|
- REST API (coming soon...)
|
||||||
|
|
||||||
## More Info
|
## More Info
|
||||||
|
|
||||||
- [Tickets](https://github.com/pirate/ArchiveBox/issues)
|
- [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues)
|
||||||
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
|
- [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)
|
||||||
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
- [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
||||||
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
|
- [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations)
|
||||||
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
|
- [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation)
|
||||||
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# ArchiveBox Development
|
# ArchiveBox Development
|
||||||
|
|
||||||
All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
|
All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
|
||||||
|
|
||||||
### Setup the dev environment
|
### Setup the dev environment
|
||||||
|
|
||||||
First, install the system dependencies from the "Bare Metal" section above.
|
First, install the system dependencies from the "Bare Metal" section above.
|
||||||
Then you can clone the ArchiveBox repo and install
|
Then you can clone the ArchiveBox repo and install
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/pirate/ArchiveBox
|
git clone https://github.com/ArchiveBox/ArchiveBox
|
||||||
cd ArchiveBox
|
cd ArchiveBox
|
||||||
git checkout master # or the branch you want to test
|
git checkout master # or the branch you want to test
|
||||||
git pull
|
git pull
|
||||||
|
@ -480,7 +480,7 @@ You can also run all these in Docker. For more examples see the Github Actions C
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
|
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
|
||||||
|
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
|
|
||||||
|
|
|
@ -2,28 +2,28 @@ Metadata-Version: 2.1
|
||||||
Name: archivebox
|
Name: archivebox
|
||||||
Version: 0.4.21
|
Version: 0.4.21
|
||||||
Summary: The self-hosted internet archive.
|
Summary: The self-hosted internet archive.
|
||||||
Home-page: https://github.com/pirate/ArchiveBox
|
Home-page: https://github.com/ArchiveBox/ArchiveBox
|
||||||
Author: Nick Sweeting
|
Author: Nick Sweeting
|
||||||
Author-email: git@nicksweeting.com
|
Author-email: git@nicksweeting.com
|
||||||
License: MIT
|
License: MIT
|
||||||
Project-URL: Source, https://github.com/pirate/ArchiveBox
|
Project-URL: Source, https://github.com/ArchiveBox/ArchiveBox
|
||||||
Project-URL: Documentation, https://github.com/pirate/ArchiveBox/wiki
|
Project-URL: Documentation, https://github.com/ArchiveBox/ArchiveBox/wiki
|
||||||
Project-URL: Bug Tracker, https://github.com/pirate/ArchiveBox/issues
|
Project-URL: Bug Tracker, https://github.com/ArchiveBox/ArchiveBox/issues
|
||||||
Project-URL: Changelog, https://github.com/pirate/ArchiveBox/wiki/Changelog
|
Project-URL: Changelog, https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog
|
||||||
Project-URL: Roadmap, https://github.com/pirate/ArchiveBox/wiki/Roadmap
|
Project-URL: Roadmap, https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap
|
||||||
Project-URL: Community, https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community
|
Project-URL: Community, https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community
|
||||||
Project-URL: Donate, https://github.com/pirate/ArchiveBox/wiki/Donations
|
Project-URL: Donate, https://github.com/ArchiveBox/ArchiveBox/wiki/Donations
|
||||||
Description: <div align="center">
|
Description: <div align="center">
|
||||||
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
<img src="https://i.imgur.com/4nkFjdv.png" height="80px">
|
||||||
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
<h1>ArchiveBox<br/><sub>The open-source self-hosted web archive.</sub></h1>
|
||||||
|
|
||||||
▶️ <a href="https://github.com/pirate/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
▶️ <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart">Quickstart</a> |
|
||||||
<a href="https://archivebox.zervice.io/">Demo</a> |
|
<a href="https://archivebox.zervice.io/">Demo</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox">Github</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox">Github</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
|
||||||
<a href="#background--motivation">Info & Motivation</a> |
|
<a href="#background--motivation">Info & Motivation</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community">Community</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap">Roadmap</a>
|
||||||
|
|
||||||
<pre>
|
<pre>
|
||||||
"Your own personal internet archive" (网站存档 / 爬虫)
|
"Your own personal internet archive" (网站存档 / 爬虫)
|
||||||
|
@ -31,11 +31,11 @@ Description: <div align="center">
|
||||||
|
|
||||||
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
<!--<a href="http://webchat.freenode.net?channels=ArchiveBox&uio=d4"><img src="https://img.shields.io/badge/Community_chat-IRC-%2328A745.svg"/></a>-->
|
||||||
|
|
||||||
<a href="https://github.com/pirate/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE"><img src="https://img.shields.io/badge/Open_source-MIT-green.svg?logo=git&logoColor=green"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/pirate/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/commits/dev"><img src="https://img.shields.io/github/last-commit/ArchiveBox/ArchiveBox.svg?logo=Sublime+Text&logoColor=green&label=Active"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?logo=github&label=Stars&logoColor=blue"/></a>
|
||||||
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
<a href="https://test.pypi.org/project/archivebox/"><img src="https://img.shields.io/badge/Python-%3E%3D3.7-yellow.svg?logo=python&logoColor=yellow"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies"><img src="https://img.shields.io/badge/Chromium-%3E%3D59-orange.svg?logo=Google+Chrome&logoColor=orange"/></a>
|
||||||
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
<a href="https://hub.docker.com/r/nikisweeting/archivebox"><img src="https://img.shields.io/badge/Docker-all%20platforms-lightblue.svg?logo=docker&logoColor=lightblue"/></a>
|
||||||
|
|
||||||
<hr/>
|
<hr/>
|
||||||
|
@ -63,7 +63,7 @@ Description: <div align="center">
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
[DEMO: archivebox.zervice.io/](https://archivebox.zervice.io)
|
||||||
For more information, see the [full Quickstart guide](https://github.com/pirate/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration) docs.
|
For more information, see the [full Quickstart guide](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart), [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage), and [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) docs.
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
---
|
---
|
||||||
|
@ -91,7 +91,7 @@ Description: <div align="center">
|
||||||
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
|
<img src="https://i.imgur.com/viklZNG.png" width="22%" alt="Desktop index screenshot" align="top">
|
||||||
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
|
<img src="https://i.imgur.com/RefWsXB.jpg" width="22%" alt="Desktop details page Screenshot"/>
|
||||||
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
|
<img src="https://i.imgur.com/M6HhzVx.png" width="22%" alt="Desktop details page Screenshot"/><br/>
|
||||||
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/pirate/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
|
<sup><a href="https://archive.sweeting.me/">Demo</a> | <a href="https://github.com/ArchiveBox/ArchiveBox/wiki/Usage">Usage</a> | <a href="#screenshots">Screenshots</a></sup>
|
||||||
<br/>
|
<br/>
|
||||||
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
|
<sub>. . . . . . . . . . . . . . . . . . . . . . . . . . . .</sub>
|
||||||
</div><br/>
|
</div><br/>
|
||||||
|
@ -99,16 +99,16 @@ Description: <div align="center">
|
||||||
|
|
||||||
## Key Features
|
## Key Features
|
||||||
|
|
||||||
- [**Free & open source**](https://github.com/pirate/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
- [**Free & open source**](https://github.com/ArchiveBox/ArchiveBox/blob/master/LICENSE), doesn't require signing up for anything, stores all data locally
|
||||||
- [**Few dependencies**](https://github.com/pirate/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage)
|
- [**Few dependencies**](https://github.com/ArchiveBox/ArchiveBox/wiki/Install#dependencies) and [simple command line interface](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage)
|
||||||
- [**Comprehensive documentation**](https://github.com/pirate/ArchiveBox/wiki), [active development](https://github.com/pirate/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [**Comprehensive documentation**](https://github.com/ArchiveBox/ArchiveBox/wiki), [active development](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap), and [rich community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- Easy to set up **[scheduled importing](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
- Easy to set up **[scheduled importing](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving) from multiple sources**
|
||||||
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
- Uses common, **durable, [long-term formats](#saves-lots-of-useful-stuff-for-each-imported-link)** like HTML, JSON, PDF, PNG, and WARC
|
||||||
- ~~**Suitable for paywalled / [authenticated content](https://github.com/pirate/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
|
- ~~**Suitable for paywalled / [authenticated content](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#chrome_user_data_dir)** (can use your cookies)~~ (do not do this until v0.5 is released with some security fixes)
|
||||||
- **Doesn't require a constantly-running daemon**, proxy, or native app
|
- **Doesn't require a constantly-running daemon**, proxy, or native app
|
||||||
- Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
|
- Provides a CLI, Python API, self-hosted web UI, and REST API (WIP)
|
||||||
- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/pirate/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/pirate/ArchiveBox/issues/80), [close modals](https://github.com/pirate/ArchiveBox/issues/175), expand comment threads, etc.
|
- Architected to be able to run [**many varieties of scripts during archiving**](https://github.com/ArchiveBox/ArchiveBox/issues/51), e.g. to extract media, summarize articles, [scroll pages](https://github.com/ArchiveBox/ArchiveBox/issues/80), [close modals](https://github.com/ArchiveBox/ArchiveBox/issues/175), expand comment threads, etc.
|
||||||
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/pirate/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
- Can also [**mirror content to 3rd-party archiving services**](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#submit_archive_dot_org) automatically for redundancy
|
||||||
|
|
||||||
## Input formats
|
## Input formats
|
||||||
|
|
||||||
|
@ -127,7 +127,7 @@ Description: <div align="center">
|
||||||
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
- <img src="https://nicksweeting.com/images/rss.svg" height="22px"/> RSS, XML, JSON, CSV, SQL, HTML, Markdown, TXT, or any other text-based format
|
||||||
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
- <img src="https://getpocket.com/favicon.ico" height="22px"/> Pocket, Pinboard, Instapaper, Shaarli, Delicious, Reddit Saved Posts, Wallabag, Unmark.it, OneTab, and more
|
||||||
|
|
||||||
See the [Usage: CLI](https://github.com/pirate/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
|
See the [Usage: CLI](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#CLI-Usage) page for documentation and examples.
|
||||||
|
|
||||||
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
|
It also includes a built-in scheduled import feature and browser bookmarklet, so you can ingest URLs from RSS feeds, websites, or the filesystem regularly.
|
||||||
|
|
||||||
|
@ -152,15 +152,15 @@ Description: <div align="center">
|
||||||
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
- **URL to Archive.org:** `archive.org.txt` A link to the saved site on archive.org
|
||||||
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
- **Audio & Video:** `media/` all audio/video files + playlists, including subtitles & metadata with youtube-dl
|
||||||
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
- **Source Code:** `git/` clone of any repository found on github, bitbucket, or gitlab links
|
||||||
- _More coming soon! See the [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)..._
|
- _More coming soon! See the [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)..._
|
||||||
|
|
||||||
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/pirate/ArchiveBox/wiki/Configuration) via environment variables or config file.
|
It does everything out-of-the-box by default, but you can disable or tweak [individual archive methods](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration) via environment variables or config file.
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/pirate/ArchiveBox/wiki/Docker) with everything preinstalled.
|
You don't need to install all the dependencies, ArchiveBox will automatically enable the relevant modules based on whatever you have available, but it's recommended to use the official [Docker image](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker) with everything preinstalled.
|
||||||
|
|
||||||
If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/pirate/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/pirate/ArchiveBox/wiki/Install).
|
If you so choose, you can also install ArchiveBox and its dependencies directly on any Linux or macOS systems using the [automated setup script](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart) or the [system package manager](https://github.com/ArchiveBox/ArchiveBox/wiki/Install).
|
||||||
|
|
||||||
ArchiveBox is written in Python 3, so it requires `python3` and `pip3` to be available on your system. It also uses a set of optional but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), `nodejs` (for readability and singlefile), and more.
|
ArchiveBox is written in Python 3, so it requires `python3` and `pip3` to be available on your system. It also uses a set of optional but highly recommended external dependencies for archiving sites: `wget` (for plain HTML, static files, and WARC saving), `chromium` (for screenshots, PDFs, JS execution, and more), `youtube-dl` (for audio and video), `git` (for cloning git repos), `nodejs` (for readability and singlefile), and more.
|
||||||
|
|
||||||
|
@ -178,7 +178,7 @@ Description: <div align="center">
|
||||||
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google
|
archivebox config --get CHROME_VERSION # optional: set this to chromium instead of chrome if you don't like Google
|
||||||
```
|
```
|
||||||
|
|
||||||
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
Be aware that malicious archived JS can also read the contents of other pages in your archive due to snapshot CSRF and XSS protections being imperfect. See the [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#stealth-mode) page for more details.
|
||||||
```bash
|
```bash
|
||||||
# visiting an archived page with malicious JS:
|
# visiting an archived page with malicious JS:
|
||||||
https://127.0.0.1:8000/archive/1602401954/example.com/index.html
|
https://127.0.0.1:8000/archive/1602401954/example.com/index.html
|
||||||
|
@ -189,7 +189,7 @@ Description: <div align="center">
|
||||||
# then example.com/index.js can send it off to some evil server
|
# then example.com/index.js can send it off to some evil server
|
||||||
```
|
```
|
||||||
|
|
||||||
Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/pirate/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
|
Support for saving multiple snapshots of each site over time will be [added soon](https://github.com/ArchiveBox/ArchiveBox/issues/179) (along with the ability to view diffs of the changes between runs). For now ArchiveBox is designed to only archive each URL with each extractor type once. A workaround to take multiple snapshots of the same URL is to make them slightly different by adding a hash:
|
||||||
```bash
|
```bash
|
||||||
archivebox add 'https://example.com#2020-10-24'
|
archivebox add 'https://example.com#2020-10-24'
|
||||||
...
|
...
|
||||||
|
@ -211,7 +211,7 @@ Description: <div align="center">
|
||||||
# docker-compose run archivebox <command> [args]
|
# docker-compose run archivebox <command> [args]
|
||||||
|
|
||||||
mkdir archivebox && cd archivebox
|
mkdir archivebox && cd archivebox
|
||||||
wget 'https://raw.githubusercontent.com/pirate/ArchiveBox/master/docker-compose.yml'
|
wget 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/docker-compose.yml'
|
||||||
docker-compose run archivebox init
|
docker-compose run archivebox init
|
||||||
docker-compose run archivebox add 'https://example.com'
|
docker-compose run archivebox add 'https://example.com'
|
||||||
docker-compose run archivebox manage createsuperuser
|
docker-compose run archivebox manage createsuperuser
|
||||||
|
@ -265,7 +265,7 @@ Description: <div align="center">
|
||||||
pip install --upgrade archivebox
|
pip install --upgrade archivebox
|
||||||
|
|
||||||
# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
|
# Install node packages in ./node_modules (used for SingleFile, Readability, and Puppeteer)
|
||||||
npm install --prefix . 'git+https://github.com/pirate/ArchiveBox.git'
|
npm install --prefix . 'git+https://github.com/ArchiveBox/ArchiveBox.git'
|
||||||
```
|
```
|
||||||
|
|
||||||
Initialize your archive and add some links:
|
Initialize your archive and add some links:
|
||||||
|
@ -329,13 +329,13 @@ Description: <div align="center">
|
||||||
|
|
||||||
## Comparison to Other Projects
|
## Comparison to Other Projects
|
||||||
|
|
||||||
▶ **Check out our [community page](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
|
▶ **Check out our [community page](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community) for an index of web archiving initiatives and projects.**
|
||||||
|
|
||||||
<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
|
<img src="https://i.imgur.com/4nkFjdv.png" width="10%" align="left"/> The aim of ArchiveBox is to go beyond what the Wayback Machine and other public archiving services can do, by adding a headless browser to replay sessions accurately, and by automatically extracting all the content in multiple redundant formats that will survive being passed down to historians and archivists through many generations.
|
||||||
|
|
||||||
#### User Interface & Intended Purpose
|
#### User Interface & Intended Purpose
|
||||||
|
|
||||||
ArchiveBox differentiates itself from [similar projects](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
|
ArchiveBox differentiates itself from [similar projects](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects) by being a simple, one-shot CLI interface for users to ingest bulk feeds of URLs over extended periods, as opposed to being a backend service that ingests individual, manually-submitted URLs from a web UI. However, we also have the option to add urls via a web interface through our Django frontend.
|
||||||
|
|
||||||
#### Private Local Archives vs Centralized Public Archives
|
#### Private Local Archives vs Centralized Public Archives
|
||||||
|
|
||||||
|
@ -351,16 +351,16 @@ Description: <div align="center">
|
||||||
|
|
||||||
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
|
<img src="https://i.imgur.com/0ZOmOvN.png" width="14%" align="right"/>
|
||||||
|
|
||||||
- [Community Wiki](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [Community Wiki](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
- [The Master Lists](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
- [The Master Lists](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#The-Master-Lists)
|
||||||
_Community-maintained indexes of archiving tools and institutions._
|
_Community-maintained indexes of archiving tools and institutions._
|
||||||
- [Web Archiving Software](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
- [Web Archiving Software](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Web-Archiving-Projects)
|
||||||
_Open source tools and projects in the internet archiving space._
|
_Open source tools and projects in the internet archiving space._
|
||||||
- [Reading List](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
- [Reading List](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Reading-List)
|
||||||
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
|
_Articles, posts, and blogs relevant to ArchiveBox and web archiving in general._
|
||||||
- [Communities](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
- [Communities](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community#Communities)
|
||||||
_A collection of the most active internet archiving communities and initiatives._
|
_A collection of the most active internet archiving communities and initiatives._
|
||||||
- Check out the ArchiveBox [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
- Check out the ArchiveBox [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) and [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
||||||
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
- Learn why archiving the internet is important by reading the "[On the Importance of Web Archiving](https://parameters.ssrc.org/2018/09/on-the-importance-of-web-archiving/)" blog post.
|
||||||
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
- Or reach out to me for questions and comments via [@theSquashSH](https://twitter.com/thesquashSH) on Twitter.
|
||||||
|
|
||||||
|
@ -370,51 +370,51 @@ Description: <div align="center">
|
||||||
|
|
||||||
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
|
<img src="https://read-the-docs-guidelines.readthedocs-hosted.com/_images/logo-dark.png" width="13%" align="right"/>
|
||||||
|
|
||||||
We use the [Github wiki system](https://github.com/pirate/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
|
We use the [Github wiki system](https://github.com/ArchiveBox/ArchiveBox/wiki) and [Read the Docs](https://archivebox.readthedocs.io/en/latest/) (WIP) for documentation.
|
||||||
|
|
||||||
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/pirate/ArchiveBox/wiki/Home) folder.
|
You can also access the docs locally by looking in the [`ArchiveBox/docs/`](https://github.com/ArchiveBox/ArchiveBox/wiki/Home) folder.
|
||||||
|
|
||||||
## Getting Started
|
## Getting Started
|
||||||
|
|
||||||
- [Quickstart](https://github.com/pirate/ArchiveBox/wiki/Quickstart)
|
- [Quickstart](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart)
|
||||||
- [Install](https://github.com/pirate/ArchiveBox/wiki/Install)
|
- [Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install)
|
||||||
- [Docker](https://github.com/pirate/ArchiveBox/wiki/Docker)
|
- [Docker](https://github.com/ArchiveBox/ArchiveBox/wiki/Docker)
|
||||||
|
|
||||||
## Reference
|
## Reference
|
||||||
|
|
||||||
- [Usage](https://github.com/pirate/ArchiveBox/wiki/Usage)
|
- [Usage](https://github.com/ArchiveBox/ArchiveBox/wiki/Usage)
|
||||||
- [Configuration](https://github.com/pirate/ArchiveBox/wiki/Configuration)
|
- [Configuration](https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration)
|
||||||
- [Supported Sources](https://github.com/pirate/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
- [Supported Sources](https://github.com/ArchiveBox/ArchiveBox/wiki/Quickstart#2-get-your-list-of-urls-to-archive)
|
||||||
- [Supported Outputs](https://github.com/pirate/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
- [Supported Outputs](https://github.com/ArchiveBox/ArchiveBox/wiki#can-save-these-things-for-each-site)
|
||||||
- [Scheduled Archiving](https://github.com/pirate/ArchiveBox/wiki/Scheduled-Archiving)
|
- [Scheduled Archiving](https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving)
|
||||||
- [Publishing Your Archive](https://github.com/pirate/ArchiveBox/wiki/Publishing-Your-Archive)
|
- [Publishing Your Archive](https://github.com/ArchiveBox/ArchiveBox/wiki/Publishing-Your-Archive)
|
||||||
- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
|
- [Chromium Install](https://github.com/ArchiveBox/ArchiveBox/wiki/Install-Chromium)
|
||||||
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
|
- [Security Overview](https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview)
|
||||||
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
|
- [Troubleshooting](https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting)
|
||||||
- [Python API](https://docs.archivebox.io/en/latest/modules.html)
|
- [Python API](https://docs.archivebox.io/en/latest/modules.html)
|
||||||
- REST API (coming soon...)
|
- REST API (coming soon...)
|
||||||
|
|
||||||
## More Info
|
## More Info
|
||||||
|
|
||||||
- [Tickets](https://github.com/pirate/ArchiveBox/issues)
|
- [Tickets](https://github.com/ArchiveBox/ArchiveBox/issues)
|
||||||
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
|
- [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap)
|
||||||
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
|
- [Changelog](https://github.com/ArchiveBox/ArchiveBox/wiki/Changelog)
|
||||||
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
|
- [Donations](https://github.com/ArchiveBox/ArchiveBox/wiki/Donations)
|
||||||
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
|
- [Background & Motivation](https://github.com/ArchiveBox/ArchiveBox#background--motivation)
|
||||||
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
|
- [Web Archiving Community](https://github.com/ArchiveBox/ArchiveBox/wiki/Web-Archiving-Community)
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
# ArchiveBox Development
|
# ArchiveBox Development
|
||||||
|
|
||||||
All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
|
All contributions to ArchiveBox are welcomed! Check our [issues](https://github.com/ArchiveBox/ArchiveBox/issues) and [Roadmap](https://github.com/ArchiveBox/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before working on things! Otherwise we may have to close your PR if it doesn't align with our roadmap.
|
||||||
|
|
||||||
### Setup the dev environment
|
### Setup the dev environment
|
||||||
|
|
||||||
First, install the system dependencies from the "Bare Metal" section above.
|
First, install the system dependencies from the "Bare Metal" section above.
|
||||||
Then you can clone the ArchiveBox repo and install it:
|
Then you can clone the ArchiveBox repo and install it:
|
||||||
```bash
|
```bash
|
||||||
git clone https://github.com/pirate/ArchiveBox
|
git clone https://github.com/ArchiveBox/ArchiveBox
|
||||||
cd ArchiveBox
|
cd ArchiveBox
|
||||||
git checkout master # or the branch you want to test
|
git checkout master # or the branch you want to test
|
||||||
git pull
|
git pull
|
||||||
|
@ -495,7 +495,7 @@ Description: <div align="center">
|
||||||
<br/>
|
<br/>
|
||||||
|
|
||||||
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
|
<a href="https://twitter.com/thesquashSH"><img src="https://img.shields.io/badge/Tweet-%40theSquashSH-blue.svg?style=flat"/></a>
|
||||||
<a href="https://github.com/pirate/ArchiveBox"><img src="https://img.shields.io/github/stars/pirate/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox"><img src="https://img.shields.io/github/stars/ArchiveBox/ArchiveBox.svg?style=flat&label=Star+on+Github"/></a>
|
||||||
|
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
|
|
||||||
|
|
|
@ -36,7 +36,7 @@ from .config_stubs import (
|
||||||
#
|
#
|
||||||
|
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
# Documentation: https://github.com/pirate/ArchiveBox/wiki/Configuration
|
# Documentation: https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||||
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
|
# Use the 'env' command to pass config options to ArchiveBox. e.g.:
|
||||||
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
|
# env USE_COLOR=True CHROME_BINARY=chromium archivebox add < example.html
|
||||||
# ******************************************************************************
|
# ******************************************************************************
|
||||||
|
@ -98,8 +98,8 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||||
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
|
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
|
||||||
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
|
||||||
|
|
||||||
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) curl/{CURL_VERSION}'},
|
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
|
||||||
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/pirate/ArchiveBox/) wget/{WGET_VERSION}'},
|
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
|
||||||
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
|
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.75 Safari/537.36'},
|
||||||
|
|
||||||
'COOKIES_FILE': {'type': str, 'default': None},
|
'COOKIES_FILE': {'type': str, 'default': None},
|
||||||
|
@ -248,7 +248,7 @@ CONFIG_HEADER = (
|
||||||
# archivebox init
|
# archivebox init
|
||||||
#
|
#
|
||||||
# A list of all possible config with documentation and examples can be found here:
|
# A list of all possible config with documentation and examples can be found here:
|
||||||
# https://github.com/pirate/ArchiveBox/wiki/Configuration
|
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||||
|
|
||||||
""")
|
""")
|
||||||
|
|
||||||
|
@ -505,7 +505,7 @@ def load_config(defaults: ConfigDefaultDict,
|
||||||
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
|
stderr(' Check your config for mistakes and try again (your archive data is unaffected).')
|
||||||
stderr()
|
stderr()
|
||||||
stderr(' For config documentation and examples see:')
|
stderr(' For config documentation and examples see:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
|
||||||
stderr()
|
stderr()
|
||||||
raise
|
raise
|
||||||
raise SystemExit(2)
|
raise SystemExit(2)
|
||||||
|
@ -565,7 +565,7 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
|
||||||
# stderr(f' {binary} --version')
|
# stderr(f' {binary} --version')
|
||||||
# stderr()
|
# stderr()
|
||||||
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
|
# stderr(' If you don\'t want to install it, you can disable it via config. See here for more info:')
|
||||||
# stderr(' https://github.com/pirate/ArchiveBox/wiki/Install')
|
# stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Install')
|
||||||
return None
|
return None
|
||||||
|
|
||||||
def bin_path(binary: Optional[str]) -> Optional[str]:
|
def bin_path(binary: Optional[str]) -> Optional[str]:
|
||||||
|
@ -831,13 +831,13 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
|
||||||
if config['USER'] == 'root':
|
if config['USER'] == 'root':
|
||||||
stderr('[!] ArchiveBox should never be run as root!', color='red')
|
stderr('[!] ArchiveBox should never be run as root!', color='red')
|
||||||
stderr(' For more information, see the security overview documentation:')
|
stderr(' For more information, see the security overview documentation:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root')
|
||||||
raise SystemExit(2)
|
raise SystemExit(2)
|
||||||
|
|
||||||
### Check Python environment
|
### Check Python environment
|
||||||
if sys.version_info[:3] < (3, 6, 0):
|
if sys.version_info[:3] < (3, 6, 0):
|
||||||
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
|
stderr(f'[X] Python version is not new enough: {config["PYTHON_VERSION"]} (>3.6 is required)', color='red')
|
||||||
stderr(' See https://github.com/pirate/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
stderr(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.')
|
||||||
raise SystemExit(2)
|
raise SystemExit(2)
|
||||||
|
|
||||||
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
|
if config['PYTHON_ENCODING'] not in ('UTF-8', 'UTF8'):
|
||||||
|
@ -857,7 +857,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
|
||||||
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
|
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
|
||||||
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
|
||||||
stderr(' For more info see:')
|
stderr(' For more info see:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
|
||||||
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
|
if '/Default' in str(config['CHROME_USER_DATA_DIR']):
|
||||||
stderr()
|
stderr()
|
||||||
stderr(' Try removing /Default from the end e.g.:')
|
stderr(' Try removing /Default from the end e.g.:')
|
||||||
|
@ -881,7 +881,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
|
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY', 'MERCURY_BINARY'):
|
||||||
hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"',
|
hint(('npm install --prefix . "git+https://github.com/ArchiveBox/ArchiveBox.git"',
|
||||||
f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
|
f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
|
||||||
''), prefix=' ')
|
''), prefix=' ')
|
||||||
stderr('')
|
stderr('')
|
||||||
|
@ -892,7 +892,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||||
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
|
stderr(' (Setting it to somewhere between 30 and 3000 seconds is recommended)')
|
||||||
stderr()
|
stderr()
|
||||||
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||||
stderr()
|
stderr()
|
||||||
|
|
||||||
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
|
elif config['USE_CHROME'] and config['TIMEOUT'] < 15:
|
||||||
|
@ -901,7 +901,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||||
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
stderr(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
|
||||||
stderr()
|
stderr()
|
||||||
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
stderr(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
|
||||||
stderr()
|
stderr()
|
||||||
|
|
||||||
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
|
if config['USE_YOUTUBEDL'] and config['MEDIA_TIMEOUT'] < 20:
|
||||||
|
@ -910,7 +910,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||||
stderr(' (Setting it somewhere over 60 seconds is recommended)')
|
stderr(' (Setting it somewhere over 60 seconds is recommended)')
|
||||||
stderr()
|
stderr()
|
||||||
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
|
stderr(' If you want to disable media archiving entirely, set SAVE_MEDIA=False instead:')
|
||||||
stderr(' https://github.com/pirate/ArchiveBox/wiki/Configuration#save_media')
|
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
|
||||||
stderr()
|
stderr()
|
||||||
|
|
||||||
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
|
def check_data_folder(out_dir: Optional[str]=None, config: ConfigDict=CONFIG) -> None:
|
||||||
|
|
|
@ -14,7 +14,7 @@ urlpatterns = [
|
||||||
path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
|
path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
|
||||||
path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
|
path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
|
||||||
|
|
||||||
path('docs/', RedirectView.as_view(url='https://github.com/pirate/ArchiveBox/wiki'), name='Docs'),
|
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
|
||||||
|
|
||||||
path('archive/', RedirectView.as_view(url='/')),
|
path('archive/', RedirectView.as_view(url='/')),
|
||||||
path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
|
path('archive/<path:path>', LinkDetails.as_view(), name='LinkAssets'),
|
||||||
|
|
|
@ -32,9 +32,9 @@ MAIN_INDEX_HEADER = {
|
||||||
'version': VERSION,
|
'version': VERSION,
|
||||||
'git_sha': GIT_SHA,
|
'git_sha': GIT_SHA,
|
||||||
'website': 'https://ArchiveBox.io',
|
'website': 'https://ArchiveBox.io',
|
||||||
'docs': 'https://github.com/pirate/ArchiveBox/wiki',
|
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
|
||||||
'source': 'https://github.com/pirate/ArchiveBox',
|
'source': 'https://github.com/ArchiveBox/ArchiveBox',
|
||||||
'issues': 'https://github.com/pirate/ArchiveBox/issues',
|
'issues': 'https://github.com/ArchiveBox/ArchiveBox/issues',
|
||||||
'dependencies': DEPENDENCIES,
|
'dependencies': DEPENDENCIES,
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
|
@ -447,7 +447,7 @@ def log_shell_welcome_msg():
|
||||||
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
|
print('{green}from archivebox import *\n {}{reset}'.format("\n ".join(list_subcommands().keys()), **ANSI))
|
||||||
print()
|
print()
|
||||||
print('[i] Welcome to the ArchiveBox Shell!')
|
print('[i] Welcome to the ArchiveBox Shell!')
|
||||||
print(' https://github.com/pirate/ArchiveBox/wiki/Usage#Shell-Usage')
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage')
|
||||||
print()
|
print()
|
||||||
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
|
print(' {lightred}Hint:{reset} Example use:'.format(**ANSI))
|
||||||
print(' print(Snapshot.objects.filter(is_archived=True).count())')
|
print(' print(Snapshot.objects.filter(is_archived=True).count())')
|
||||||
|
|
|
@ -178,7 +178,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
archivebox update --resume=15109948213.123
|
archivebox update --resume=15109948213.123
|
||||||
|
|
||||||
{lightred}Documentation:{reset}
|
{lightred}Documentation:{reset}
|
||||||
https://github.com/pirate/ArchiveBox/wiki
|
https://github.com/ArchiveBox/ArchiveBox/wiki
|
||||||
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
|
'''.format(VERSION, out_dir, COMMANDS_HELP_TEXT, **ANSI))
|
||||||
|
|
||||||
else:
|
else:
|
||||||
|
@ -197,7 +197,7 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
|
||||||
print(' 2. archivebox init')
|
print(' 2. archivebox init')
|
||||||
print()
|
print()
|
||||||
print('For more information, see the documentation here:')
|
print('For more information, see the documentation here:')
|
||||||
print(' https://github.com/pirate/ArchiveBox/wiki')
|
print(' https://github.com/ArchiveBox/ArchiveBox/wiki')
|
||||||
|
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
|
|
|
@ -268,7 +268,7 @@
|
||||||
<div class="col-sm-10" style="text-align: right">
|
<div class="col-sm-10" style="text-align: right">
|
||||||
<a href="/add/">Add Links</a> |
|
<a href="/add/">Add Links</a> |
|
||||||
<a href="/admin/core/snapshot/">Admin</a> |
|
<a href="/admin/core/snapshot/">Admin</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@ -280,7 +280,7 @@
|
||||||
<br />
|
<br />
|
||||||
<center>
|
<center>
|
||||||
<small>
|
<small>
|
||||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a> |
|
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> |
|
||||||
|
|
||||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||||
<br /><br />
|
<br /><br />
|
||||||
|
@ -291,4 +291,4 @@
|
||||||
</footer>
|
</footer>
|
||||||
</body>
|
</body>
|
||||||
|
|
||||||
</html>
|
</html>
|
||||||
|
|
|
@ -223,7 +223,7 @@
|
||||||
<div class="col-sm-10" style="text-align: right">
|
<div class="col-sm-10" style="text-align: right">
|
||||||
<a href="/add/">Add Links</a> |
|
<a href="/add/">Add Links</a> |
|
||||||
<a href="/admin/core/snapshot/">Admin</a> |
|
<a href="/admin/core/snapshot/">Admin</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Docs</a>
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Docs</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@ -266,8 +266,8 @@
|
||||||
<br/>
|
<br/>
|
||||||
<center>
|
<center>
|
||||||
<small>
|
<small>
|
||||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
|
||||||
version <a href="https://github.com/pirate/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> |
|
version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{VERSION}}" title="Git commit">v{{VERSION}}</a> |
|
||||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
{{FOOTER_INFO}}
|
{{FOOTER_INFO}}
|
||||||
|
|
|
@ -187,8 +187,8 @@
|
||||||
</a>
|
</a>
|
||||||
</div>
|
</div>
|
||||||
<div class="col-sm-10" style="text-align: right">
|
<div class="col-sm-10" style="text-align: right">
|
||||||
<a href="https://github.com/pirate/ArchiveBox/wiki">Documentation</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox/wiki">Documentation</a> |
|
||||||
<a href="https://github.com/pirate/ArchiveBox">Source</a> |
|
<a href="https://github.com/ArchiveBox/ArchiveBox">Source</a> |
|
||||||
<a href="https://archivebox.io">Website</a>
|
<a href="https://archivebox.io">Website</a>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
@ -209,8 +209,8 @@
|
||||||
<br/>
|
<br/>
|
||||||
<center>
|
<center>
|
||||||
<small>
|
<small>
|
||||||
Archive created using <a href="https://github.com/pirate/ArchiveBox" title="Github">ArchiveBox</a>
|
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
|
||||||
version <a href="https://github.com/pirate/ArchiveBox/tree/v$version" title="Git commit">v$version</a> |
|
version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v$version" title="Git commit">v$version</a> |
|
||||||
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
|
||||||
<br/><br/>
|
<br/><br/>
|
||||||
$footer_info
|
$footer_info
|
||||||
|
|
|
@ -1,7 +1,7 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
# ArchiveBox Setup Script
|
# ArchiveBox Setup Script
|
||||||
# Nick Sweeting 2017 | MIT License
|
# Nick Sweeting 2017 | MIT License
|
||||||
# https://github.com/pirate/ArchiveBox
|
# https://github.com/ArchiveBox/ArchiveBox
|
||||||
|
|
||||||
echo "[i] ArchiveBox Setup Script 📦"
|
echo "[i] ArchiveBox Setup Script 📦"
|
||||||
echo ""
|
echo ""
|
||||||
|
@ -16,7 +16,7 @@ echo " - youtube-dl"
|
||||||
echo " - chromium-browser (skip this if Chrome/Chromium is already installed)"
|
echo " - chromium-browser (skip this if Chrome/Chromium is already installed)"
|
||||||
echo ""
|
echo ""
|
||||||
echo " If you'd rather install these manually, you can find documentation here:"
|
echo " If you'd rather install these manually, you can find documentation here:"
|
||||||
echo " https://github.com/pirate/ArchiveBox/wiki/Install"
|
echo " https://github.com/ArchiveBox/ArchiveBox/wiki/Install"
|
||||||
echo ""
|
echo ""
|
||||||
echo "Press enter to continue with the automatic install, or Ctrl+C to cancel..."
|
echo "Press enter to continue with the automatic install, or Ctrl+C to cancel..."
|
||||||
read
|
read
|
||||||
|
@ -112,5 +112,5 @@ echo "---------------------------------------------------"
|
||||||
echo "[X] Failed to install some dependencies! ‼️"
|
echo "[X] Failed to install some dependencies! ‼️"
|
||||||
echo " - Try the Manual Setup instructions in the README.md"
|
echo " - Try the Manual Setup instructions in the README.md"
|
||||||
echo " - Try the Troubleshooting: Dependencies instructions in the README.md"
|
echo " - Try the Troubleshooting: Dependencies instructions in the README.md"
|
||||||
echo " - Open an issue on github to get help: https://github.com/pirate/ArchiveBox/issues"
|
echo " - Open an issue on github to get help: https://github.com/ArchiveBox/ArchiveBox/issues"
|
||||||
exit 1
|
exit 1
|
||||||
|
|
|
@ -5,7 +5,7 @@
|
||||||
# docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss
|
# docker-compose run archivebox add --depth=1 https://example.com/some/feed.rss
|
||||||
# docker-compose run archivebox config --set PUBLIC_INDEX=True
|
# docker-compose run archivebox config --set PUBLIC_INDEX=True
|
||||||
# Documentation:
|
# Documentation:
|
||||||
# https://github.com/pirate/ArchiveBox/wiki/Docker#docker-compose
|
# https://github.com/ArchiveBox/ArchiveBox/wiki/Docker#docker-compose
|
||||||
|
|
||||||
version: '3.7'
|
version: '3.7'
|
||||||
|
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
# DO NOT EDIT THIS FILE DIRECTLY!
|
# DO NOT EDIT THIS FILE DIRECTLY!
|
||||||
#
|
#
|
||||||
# See the list of all the possible options, documentation, and examples here:
|
# See the list of all the possible options, documentation, and examples here:
|
||||||
# https://github.com/pirate/ArchiveBox/wiki/Configuration
|
# https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration
|
||||||
|
|
||||||
[GENERAL_CONFIG]
|
[GENERAL_CONFIG]
|
||||||
# OUTPUT_PERMISSIONS = 755
|
# OUTPUT_PERMISSIONS = 755
|
||||||
|
|
2
setup.py
2
setup.py
|
@ -9,7 +9,7 @@ DESCRIPTION = "The self-hosted internet archive."
|
||||||
LICENSE = "MIT"
|
LICENSE = "MIT"
|
||||||
AUTHOR = "Nick Sweeting"
|
AUTHOR = "Nick Sweeting"
|
||||||
AUTHOR_EMAIL="git@nicksweeting.com"
|
AUTHOR_EMAIL="git@nicksweeting.com"
|
||||||
REPO_URL = "https://github.com/pirate/ArchiveBox"
|
REPO_URL = "https://github.com/ArchiveBox/ArchiveBox"
|
||||||
PROJECT_URLS = {
|
PROJECT_URLS = {
|
||||||
"Source": f"{REPO_URL}",
|
"Source": f"{REPO_URL}",
|
||||||
"Documentation": f"{REPO_URL}/wiki",
|
"Documentation": f"{REPO_URL}/wiki",
|
||||||
|
|
|
@ -5,7 +5,7 @@ from .fixtures import *
|
||||||
|
|
||||||
def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
|
def test_title_is_htmlencoded_in_index_html(tmp_path, process, disable_extractors_dict):
|
||||||
"""
|
"""
|
||||||
https://github.com/pirate/ArchiveBox/issues/330
|
https://github.com/ArchiveBox/ArchiveBox/issues/330
|
||||||
Unencoded content should not be rendered as it facilitates xss injections
|
Unencoded content should not be rendered as it facilitates xss injections
|
||||||
and breaks the layout.
|
and breaks the layout.
|
||||||
"""
|
"""
|
||||||
|
|
Loading…
Reference in a new issue