Merge pull request #655 from ArchiveBox/debug-toolbar

Commit 6fb7bbf2fb by Nick Sweeting on 2021-03-27 05:14:51 -04:00 (committed by GitHub)
65 changed files with 1811 additions and 599 deletions

View file

@ -50,13 +50,6 @@ RUN apt-get update -qq \
fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
&& rm -rf /var/lib/apt/lists/*
# Install apt development dependencies
# RUN apt-get install -qq \
# && apt-get install -qq -y --no-install-recommends \
# python3 python3-dev python3-pip python3-venv python3-all \
# dh-python debhelper devscripts dput software-properties-common \
# python3-distutils python3-setuptools python3-wheel python3-stdeb
# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
&& echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \
@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
&& pip install --upgrade --quiet pip setuptools
ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt"
ADD "./setup.py" "$CODE_DIR/"
ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
&& apt-get install -qq -y --no-install-recommends \
build-essential python-dev python3-dev \
# && pip install --upgrade pip \
&& grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
&& pip install --quiet "sonic-client==0.0.5" \
&& python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
&& pip install --quiet -r /tmp/requirements.txt \
&& apt-get purge -y build-essential python-dev python3-dev \
&& apt-get autoremove -y \
&& rm -rf /var/lib/apt/lists/*
# Install apt development dependencies
# RUN apt-get install -qq \
# && apt-get install -qq -y --no-install-recommends \
# python3 python3-dev python3-pip python3-venv python3-all \
# dh-python debhelper devscripts dput software-properties-common \
# python3-distutils python3-setuptools python3-wheel python3-stdeb
# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
# && pip install --quiet -r /tmp/dev_requirements.txt
# Install ArchiveBox Python package and its dependencies
WORKDIR "$CODE_DIR"
ADD . "$CODE_DIR"
@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
VOLUME "$DATA_DIR"
EXPOSE 8000
HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "0.0.0.0:8000"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]

View file

@ -63,7 +63,11 @@ def run_subcommand(subcommand: str,
if subcommand not in meta_cmds:
from ..config import setup_django
setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)
cmd_requires_db = subcommand in archive_cmds
init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args
setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)
module = import_module('.archivebox_{}'.format(subcommand), __package__)
module.main(args=subcommand_args, stdin=stdin, pwd=pwd) # type: ignore
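
Note: commands invoked with --init or --quick-init now defer the database check to the init step itself rather than failing up front. A hedged illustration of the call path (argument values are only an example; parameter names are assumed from the surrounding code):

# hypothetical: the DB check is skipped here because --quick-init will create/migrate the DB itself
run_subcommand('server', subcommand_args=['--quick-init', '0.0.0.0:8000'], stdin=None, pwd='.')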

View file

@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
add_help=True,
formatter_class=SmartFormatter,
)
parser.add_argument(
'--tag', '-t',
type=str,
default='',
help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
)
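
Note: with the new flag, tags can be attached at add time. A hedged usage sketch calling the CLI entrypoint the same way the tests added in this PR do (the URL is just an example):

# hypothetical equivalent of: archivebox add --tag=news,python --depth=0 https://example.com
from archivebox.cli import archivebox_add
archivebox_add.main(['--tag=news,python', '--depth=0', 'https://example.com'])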
parser.add_argument(
'--update-all', #'-n',
action='store_true',
@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
command = parser.parse_args(args or ())
urls = command.urls
stdin_urls = accept_stdin(stdin)
stdin_urls = ''
if not urls:
stdin_urls = accept_stdin(stdin)
if (stdin_urls and urls) or (not stdin and not urls):
stderr(
'[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',
@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
add(
urls=stdin_urls or urls,
depth=command.depth,
tag=command.tag,
update_all=command.update_all,
index_only=command.index_only,
overwrite=command.overwrite,

View file

@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help='KEY or KEY=VALUE formatted config values to get or set',
)
command = parser.parse_args(args or ())
config_options_str = accept_stdin(stdin)
config_options_str = ''
if not command.config_options:
config_options_str = accept_stdin(stdin)
config(
config_options_str=config_options_str,

View file

@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Ignore unrecognized files in current directory and initialize anyway',
)
parser.add_argument(
'--quick', '-q',
action='store_true',
help='Run any updates or migrations without rechecking all snapshot dirs',
)
command = parser.parse_args(args or ())
reject_stdin(__command__, stdin)
init(
force=command.force,
quick=command.quick,
out_dir=pwd or OUTPUT_DIR,
)

View file

@ -12,6 +12,7 @@ from ..main import list_all
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
LINK_FILTERS,
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@ -23,7 +24,7 @@ from ..index import (
get_corrupted_folders,
get_unrecognized_folders,
)
from ..logging_util import SmartFormatter, accept_stdin, stderr
from ..logging_util import SmartFormatter, reject_stdin, stderr
@docstring(list_all.__doc__)
@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
group.add_argument(
'--json', #'-j',
action='store_true',
help="Print the output in JSON format with all columns included.",
help="Print the output in JSON format with all columns included",
)
group.add_argument(
'--html',
@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
parser.add_argument(
'--sort', #'-s',
type=str,
help="List the links sorted using the given key, e.g. timestamp or updated.",
help="List the links sorted using the given key, e.g. timestamp or updated",
default=None,
)
parser.add_argument(
'--before', #'-b',
type=float,
help="List only links bookmarked before the given timestamp.",
help="List only links bookmarked before (less than) the given timestamp",
default=None,
)
parser.add_argument(
'--after', #'-a',
type=float,
help="List only links bookmarked after the given timestamp.",
help="List only links bookmarked after (greater than or equal to) the given timestamp",
default=None,
)
parser.add_argument(
@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
)
parser.add_argument(
'--filter-type',
'--filter-type', '-t',
type=str,
choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
choices=(*LINK_FILTERS.keys(), 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
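
Note: the hard-coded choices tuple is replaced by whatever filters the index module defines, so list/remove/update stay in sync. A hedged sketch of what the choices expand to (key names are an assumption based on the old tuple):

# assumed: LINK_FILTERS maps filter names to filter functions in archivebox/index/__init__.py
from archivebox.index import LINK_FILTERS
print((*LINK_FILTERS.keys(), 'search'))   # e.g. ('exact', 'substring', 'domain', 'regex', 'tag', 'search')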
@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
nargs='*',
type=str,
default=None,
help='List only URLs matching these filter patterns.'
help='List only URLs matching these filter patterns'
)
command = parser.parse_args(args or ())
filter_patterns_str = accept_stdin(stdin)
reject_stdin(stdin)
if command.with_headers and not (command.json or command.html or command.csv):
stderr(
'[X] --with-headers can only be used with --json, --html or --csv options.\n',
'[X] --with-headers can only be used with --json, --html or --csv options\n',
color='red',
)
raise SystemExit(2)
matching_folders = list_all(
filter_patterns_str=filter_patterns_str,
filter_patterns=command.filter_patterns,
filter_type=command.filter_type,
status=command.status,

View file

@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help= "Path to save the single archive folder to, e.g. ./example.com_archive"
)
command = parser.parse_args(args or ())
stdin_url = None
url = command.url
stdin_url = accept_stdin(stdin)
if not url:
stdin_url = accept_stdin(stdin)
if (stdin_url and url) or (not stdin and not url):
stderr(
'[X] You must pass a URL/path to add via stdin or CLI arguments.\n',

View file

@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help='URLs matching this filter pattern will be removed from the index.'
)
command = parser.parse_args(args or ())
filter_str = accept_stdin(stdin)
filter_str = None
if not command.filter_patterns:
filter_str = accept_stdin(stdin)
remove(
filter_str=filter_str,

View file

@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Enable DEBUG=True mode with more verbose errors',
)
parser.add_argument(
'--nothreading',
action='store_true',
help='Force runserver to run in single-threaded mode',
)
parser.add_argument(
'--init',
action='store_true',
help='Run archivebox init before starting the server',
help='Run a full archivebox init/upgrade before starting the server',
)
parser.add_argument(
'--quick-init', '-i',
action='store_true',
help='Run quick archivebox init/upgrade before starting the server',
)
parser.add_argument(
'--createsuperuser',
@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
reject_stdin(__command__, stdin)
server(
runserver_args=command.runserver_args,
runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []),
reload=command.reload,
debug=command.debug,
init=command.init,
quick_init=command.quick_init,
createsuperuser=command.createsuperuser,
out_dir=pwd or OUTPUT_DIR,
)
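
Note: putting the new flags together, a hedged invocation sketch (flags taken from the arguments defined above; the bind address is just an example):

# hypothetical equivalent of: archivebox server --quick-init --debug --nothreading 127.0.0.1:8000
from archivebox.cli import archivebox_server
archivebox_server.main(['--quick-init', '--debug', '--nothreading', '127.0.0.1:8000'])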

View file

@ -12,6 +12,7 @@ from ..main import update
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
LINK_FILTERS,
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
)
)
parser.add_argument(
'--filter-type',
'--filter-type', '-t',
type=str,
choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
choices=(*LINK_FILTERS.keys(), 'search'),
default='exact',
help='Type of pattern matching to use when filtering URLs',
)
@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
default=""
)
command = parser.parse_args(args or ())
filter_patterns_str = accept_stdin(stdin)
filter_patterns_str = None
if not command.filter_patterns:
filter_patterns_str = accept_stdin(stdin)
update(
resume=command.resume,

227
archivebox/cli/tests.py Normal file
View file

@ -0,0 +1,227 @@
#!/usr/bin/env python3
__package__ = 'archivebox.cli'
import os
import sys
import shutil
import unittest
from pathlib import Path
from contextlib import contextmanager
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'OUTPUT_DIR': 'data.tests',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_TITLE': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
}
OUTPUT_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from ..index import load_main_index
from ..config import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
)
from . import (
archivebox_init,
archivebox_add,
archivebox_remove,
)
HIDE_CLI_OUTPUT = True
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
stdout = sys.stdout
stderr = sys.stderr
@contextmanager
def output_hidden(show_failing=True):
if not HIDE_CLI_OUTPUT:
yield
return
sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
try:
yield
sys.stdout.close()
sys.stderr.close()
sys.stdout = stdout
sys.stderr = stderr
except Exception:
sys.stdout.close()
sys.stderr.close()
sys.stdout = stdout
sys.stderr = stderr
if show_failing:
with open('stdout.txt', 'r', encoding='utf-8') as f:
print(f.read())
with open('stderr.txt', 'r', encoding='utf-8') as f:
print(f.read())
raise
finally:
os.remove('stdout.txt')
os.remove('stderr.txt')
class TestInit(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
def tearDown(self):
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_basic_init(self):
with output_hidden():
archivebox_init.main([])
assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
def test_conflicting_init(self):
with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
f.write('test')
try:
with output_hidden(show_failing=False):
archivebox_init.main([])
assert False, 'Init should have exited with an exception'
except SystemExit:
pass
assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
try:
load_main_index(out_dir=OUTPUT_DIR)
assert False, 'load_main_index should raise an exception when no index is present'
except Exception:
pass
def test_no_dirty_state(self):
with output_hidden():
init()
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
with output_hidden():
init()
class TestAdd(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
with output_hidden():
init()
def tearDown(self):
shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_add_arg_url(self):
with output_hidden():
archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 30
def test_add_arg_file(self):
test_file = Path(OUTPUT_DIR) / 'test.txt'
with open(test_file, 'w+', encoding='utf') as f:
f.write(test_urls)
with output_hidden():
archivebox_add.main([test_file])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
os.remove(test_file)
def test_add_stdin_url(self):
with output_hidden():
archivebox_add.main([], stdin=test_urls)
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 12
class TestRemove(unittest.TestCase):
def setUp(self):
os.makedirs(OUTPUT_DIR, exist_ok=True)
with output_hidden():
init()
archivebox_add.main([], stdin=test_urls)
# def tearDown(self):
# shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
def test_remove_exact(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 11
def test_remove_regex(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 4
def test_remove_domain(self):
with output_hidden():
archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])
all_links = load_main_index(out_dir=OUTPUT_DIR)
assert len(all_links) == 10
def test_remove_none(self):
try:
with output_hidden(show_failing=False):
archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
assert False, 'Should raise if no URLs match'
except Exception:
pass
if __name__ == '__main__':
if '--verbose' in sys.argv or '-v' in sys.argv:
HIDE_CLI_OUTPUT = False
unittest.main()
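
Note: these are plain unittest cases and can be driven without a separate runner. A hedged snippet for running them from a Python shell (module path assumed from __package__ above):

# assumed module path; roughly equivalent to `python -m archivebox.cli.tests --verbose`
import unittest
unittest.main(module='archivebox.cli.tests', argv=['archivebox-tests'], exit=False, verbosity=2)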

View file

@ -29,10 +29,12 @@ import json
import getpass
import platform
import shutil
import sqlite3
import django
from hashlib import md5
from pathlib import Path
from datetime import datetime
from typing import Optional, Type, Tuple, Dict, Union, List
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser
@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
},
'ARCHIVE_METHOD_TOGGLES': {
@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'ARCHIVE_METHOD_OPTIONS': {
'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHROME_HEADLESS': {'type': bool, 'default': True},
'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description',
'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
'--max-filesize=750m',
'--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
]},
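
Note: the default is now a lambda so it can reference other config values like MEDIA_MAX_SIZE. A small sketch of how such a default resolves (the dict here is a stand-in; only the keys shown above are real):

# assumed resolution logic: lambda defaults are called with the partially-resolved config dict
config = {'MEDIA_MAX_SIZE': '750m'}
youtubedl_args_default = lambda c: ['--write-description', '--max-filesize={}'.format(c['MEDIA_MAX_SIZE'])]
print(youtubedl_args_default(config))   # ['--write-description', '--max-filesize=750m']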
@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
config_file.optionxform = str
config_file.read(config_path)
with open(config_path, 'r') as old:
with open(config_path, 'r', encoding='utf-8') as old:
atomic_write(f'{config_path}.bak', old.read())
find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]
@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
from django.utils.crypto import get_random_string
chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
random_secret_key = get_random_string(50, chars)
if 'SERVER_CONFIG' in config_file:
config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
else:
config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}
with open(config_path, 'w+') as new:
with open(config_path, 'w+', encoding='utf-8') as new:
config_file.write(new)
try:
@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
}
except:
# something went horribly wrong, revert to the previous version
with open(f'{config_path}.bak', 'r') as old:
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
atomic_write(config_path, old.read())
if Path(f'{config_path}.bak').exists():
@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
try:
import django
from django.core.management import call_command
sys.path.append(str(config['PACKAGE_DIR']))
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Check to make sure JSON extension is available in our Sqlite3 instance
try:
cursor = sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
stderr('[X] Your SQLite3 version is missing the required JSON1 extension', color='red')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
if in_memory_db:
# Put the db in memory and run migrations in case any command requires it
from django.core.management import call_command
# some commands (e.g. oneshot) don't store a long-lived sqlite3 db file on disk.
# in those cases we create a temporary in-memory db and run the migrations
# immediately to get a usable in-memory-database at startup
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
django.setup()
call_command("migrate", interactive=False, verbosity=0)
else:
# Otherwise use default sqlite3 file-based database and initialize django
# without running migrations automatically (user runs them manually by calling init)
django.setup()
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
if check_db:
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
cursor.execute("PRAGMA journal_mode")
current_mode = cursor.fetchone()[0]
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Create cache table in DB if needed
try:
from django.core.cache import cache
cache.get('test', None)
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident
from django.db import connections
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
except KeyboardInterrupt:
raise SystemExit(2)
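
Note: commands that never touch a collection on disk get a throwaway migrated database via the new in_memory_db path. A hedged sketch of how a stateless command might bootstrap Django with it (keyword arguments assumed from the signature above):

# hypothetical caller, e.g. something like `archivebox oneshot`:
from archivebox.config import setup_django
setup_django(in_memory_db=True, check_db=False)   # sets ARCHIVEBOX_DATABASE_NAME=':memory:' and migrates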

View file

@ -1,6 +1,7 @@
__package__ = 'archivebox.core'
from io import StringIO
from pathlib import Path
from contextlib import redirect_stdout
from django.contrib import admin
@ -13,15 +14,15 @@ from django import forms
from ..util import htmldecode, urldecode, ansi_to_html
from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.models import Snapshot, ArchiveResult, Tag
from core.forms import AddLinkForm
from core.mixins import SearchResultsAdminMixin
from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links
# Admin URLs
@ -36,77 +37,34 @@ from extractors import archive_links
# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel
def update_snapshots(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Archive"
def update_titles(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "Pull title"
class ArchiveResultInline(admin.TabularInline):
model = ArchiveResult
def overwrite_snapshots(modeladmin, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Re-archive (overwrite)"
class TagInline(admin.TabularInline):
model = Snapshot.tags.through
def verify_snapshots(modeladmin, request, queryset):
for snapshot in queryset:
print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
verify_snapshots.short_description = "Check"
def delete_snapshots(modeladmin, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
from django.contrib.admin.helpers import ActionForm
class SnapshotAdminForm(forms.ModelForm):
tags = TagField(required=False)
class Meta:
model = Snapshot
fields = "__all__"
def save(self, commit=True):
# Based on: https://stackoverflow.com/a/49933068/3509554
# Get the unsaved instance
instance = forms.ModelForm.save(self, False)
tags = self.cleaned_data.pop("tags")
#update save_m2m
def new_save_m2m():
instance.save_tags(tags)
# Do we need to save all changes now?
self.save_m2m = new_save_m2m
if commit:
instance.save()
return instance
class SnapshotActionForm(ActionForm):
tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)
class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
list_display = ('added', 'title_str', 'url_str', 'files', 'size')
sort_fields = ('title_str', 'url_str', 'added')
readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
fields = (*readonly_fields, 'title', 'tags')
fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
list_filter = ('added', 'updated', 'tags')
ordering = ['-added']
actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
actions_template = 'admin/actions_as_select.html'
form = SnapshotAdminForm
list_per_page = 40
actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag']
autocomplete_fields = ['tags']
inlines = [ArchiveResultInline]
list_per_page = SNAPSHOTS_PER_PAGE
action_form = SnapshotActionForm
def get_urls(self):
urls = super().get_urls()
@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
return custom_urls + urls
def get_queryset(self, request):
self.request = request
return super().get_queryset(request).prefetch_related('tags')
def tag_list(self, obj):
return ', '.join(obj.tags.values_list('name', flat=True))
def id_str(self, obj):
# TODO: figure out a different way to do this, you can't nest forms so this doesn't work
# def action(self, obj):
# # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
# # action: update_snapshots
# # select_across: 0
# # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
# return format_html(
# '''
# <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
# <input type="hidden" name="csrfmiddlewaretoken" value="{}">
# <input type="hidden" name="_selected_action" value="{}">
# <button name="update_snapshots">Check</button>
# <button name="update_titles">Pull title + favicon</button>
# <button name="update_snapshots">Update</button>
# <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
# <button name="delete_snapshots">Permanently delete</button>
# </form>
# ''',
# csrf.get_token(self.request),
# obj.id,
# )
def uuid(self, obj):
return format_html(
'<code style="font-size: 10px">{}</code>',
obj.url_hash[:8],
'<code style="font-size: 10px">{}</code><br/><a href="/archive/{}">View index ➡️</a> &nbsp; &nbsp; <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>',
obj.id,
obj.timestamp,
obj.id,
)
def title_str(self, obj):
canon = obj.as_link().canonical_outputs()
tags = ''.join(
format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
format_html('<a href="/admin/core/snapshot/?id__startswith={}"><span class="tag">{}</span></a> ', tag.id, tag)
for tag in obj.tags.all()
if str(tag).strip()
)
@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
return snapshot_icons(obj)
def size(self, obj):
archive_size = obj.archive_size
archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
if archive_size:
size_txt = printable_filesize(archive_size)
if archive_size > 52428800:
@ -190,28 +173,136 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
rendered_response = self.changelist_view(request)
# Restore values
self.change_list_template = saved_change_list_template
self.change_list_template = saved_change_list_template
self.list_per_page = saved_list_per_page
self.list_max_show_all = saved_list_max_show_all
return rendered_response
def update_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Archive"
def update_titles(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "Pull title"
def overwrite_snapshots(self, request, queryset):
archive_links([
snapshot.as_link()
for snapshot in queryset
], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Re-archive (overwrite)"
def verify_snapshots(self, request, queryset):
for snapshot in queryset:
print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))
verify_snapshots.short_description = "Check"
def delete_snapshots(self, request, queryset):
remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)
delete_snapshots.short_description = "Delete"
def add_tag(self, request, queryset):
tag = request.POST['tag']
if tag:
for obj in queryset:
obj.tags.add(tag)
add_tag.short_description = "Add tag"
def remove_tag(self, request, queryset):
tag = request.POST['tag']
for obj in queryset:
obj.tags.remove(tag)
remove_tag.short_description = "Remove tag"
id_str.short_description = 'ID'
title_str.short_description = 'Title'
url_str.short_description = 'Original URL'
id_str.admin_order_field = 'id'
title_str.admin_order_field = 'title'
url_str.admin_order_field = 'url'
class TagAdmin(admin.ModelAdmin):
list_display = ('slug', 'name', 'id')
list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
sort_fields = ('id', 'name', 'slug')
readonly_fields = ('id',)
readonly_fields = ('id', 'num_snapshots', 'snapshots')
search_fields = ('id', 'name', 'slug')
fields = (*readonly_fields, 'name', 'slug')
actions = ['delete_selected']
ordering = ['-id']
def num_snapshots(self, obj):
return format_html(
'<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
obj.id,
obj.snapshot_set.count(),
)
def snapshots(self, obj):
total_count = obj.snapshot_set.count()
return mark_safe('<br/>'.join(
format_html(
'{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
snap.id,
snap.timestamp,
snap.url,
)
for snap in obj.snapshot_set.order_by('-updated')[:10]
) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...</a>' if obj.snapshot_set.count() > 10 else ''))
class ArchiveResultAdmin(admin.ModelAdmin):
list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str')
sort_fields = ('start_ts', 'extractor', 'status')
readonly_fields = ('id', 'uuid', 'snapshot_str')
search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output')
autocomplete_fields = ['snapshot']
list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
ordering = ['-start_ts']
list_per_page = SNAPSHOTS_PER_PAGE
def snapshot_str(self, obj):
return format_html(
'<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
'<small>{}</small>',
obj.snapshot.timestamp,
obj.snapshot.timestamp,
obj.snapshot.url[:128],
)
def cmd_str(self, obj):
return format_html(
'<pre>{}</pre>',
' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
)
def output_str(self, obj):
return format_html(
'<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
obj.snapshot.timestamp,
obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
obj.output,
)
snapshot_str.short_description = 'snapshot'
class ArchiveBoxAdmin(admin.AdminSite):
site_header = 'ArchiveBox'
@ -266,4 +357,5 @@ admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')
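
Note: the new add_tag/remove_tag actions read the selected Tag from the action form's POST data. Roughly the same effect from a Django shell would be (model and field names taken from the models changed in this PR; the tag name and filter are examples):

# hedged sketch: bulk-tag a queryset the way the admin actions above do
from core.models import Snapshot, Tag
tag, _ = Tag.objects.get_or_create(name='important')
for snapshot in Snapshot.objects.filter(url__icontains='example.com'):
    snapshot.tags.add(tag)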

View file

@ -20,7 +20,8 @@ ARCHIVE_METHODS = [
class AddLinkForm(forms.Form):
url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
archive_methods = forms.MultipleChoiceField(
label="Archive methods (select at least 1, otherwise all will be used by default)",
required=False,

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-16 10:38
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0008_auto_20210105_1421'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='updated',
field=models.DateTimeField(auto_now=True, db_index=True, null=True),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-16 10:55
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0009_auto_20210216_1038'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='start_ts',
field=models.DateTimeField(db_index=True),
),
]

View file

@ -0,0 +1,24 @@
# Generated by Django 3.1.3 on 2021-02-16 13:31
from django.db import migrations, models
import uuid
class Migration(migrations.Migration):
dependencies = [
('core', '0010_auto_20210216_1055'),
]
operations = [
migrations.AddField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(default=uuid.uuid4, editable=False),
),
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
),
]

View file

@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2021-02-16 14:25
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0011_auto_20210216_1331'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='cmd_version',
field=models.CharField(blank=True, default=None, max_length=128, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output',
field=models.CharField(max_length=1024),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:29
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0012_auto_20210216_1425'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:29
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0013_auto_20210218_0729'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:30
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0014_auto_20210218_0729'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='title',
field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 12:04
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0015_auto_20210218_0730'),
]
operations = [
migrations.AlterField(
model_name='snapshot',
name='tags',
field=models.ManyToManyField(blank=True, to='core.Tag'),
),
]

View file

@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-19 02:11
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0016_auto_20210218_1204'),
]
operations = [
migrations.AlterField(
model_name='tag',
name='slug',
field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
),
]

View file

@ -2,12 +2,15 @@ __package__ = 'archivebox.core'
import uuid
from django.db import models, transaction
from django.db import models
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField
from ..util import parse_date
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE
@ -29,8 +32,11 @@ class Tag(models.Model):
"""
Based on django-taggit model
"""
name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
name = models.CharField(unique=True, blank=False, max_length=100)
# slug is autoset on save from name, never set it manually
slug = models.SlugField(unique=True, blank=True, max_length=100)
class Meta:
verbose_name = "Tag"
@ -49,20 +55,21 @@ class Tag(models.Model):
if self._state.adding and not self.slug:
self.slug = self.slugify(self.name)
with transaction.atomic():
slugs = set(
type(self)
._default_manager.filter(slug__startswith=self.slug)
.values_list("slug", flat=True)
)
# if name is different but slug conflicts with another tag's slug, append a counter
# with transaction.atomic():
slugs = set(
type(self)
._default_manager.filter(slug__startswith=self.slug)
.values_list("slug", flat=True)
)
i = None
while True:
slug = self.slugify(self.name, i)
if slug not in slugs:
self.slug = slug
return super().save(*args, **kwargs)
i = 1 if i is None else i+1
i = None
while True:
slug = self.slugify(self.name, i)
if slug not in slugs:
self.slug = slug
return super().save(*args, **kwargs)
i = 1 if i is None else i+1
else:
return super().save(*args, **kwargs)
@ -73,11 +80,11 @@ class Snapshot(models.Model):
url = models.URLField(unique=True)
timestamp = models.CharField(max_length=32, unique=True, db_index=True)
title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
added = models.DateTimeField(auto_now_add=True, db_index=True)
updated = models.DateTimeField(null=True, blank=True, db_index=True)
tags = models.ManyToManyField(Tag)
updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
tags = models.ManyToManyField(Tag, blank=True)
keys = ('url', 'timestamp', 'title', 'tags', 'updated')
@ -109,13 +116,24 @@ class Snapshot(models.Model):
from ..index import load_link_details
return load_link_details(self.as_link())
def tags_str(self) -> str:
return ','.join(self.tags.order_by('name').values_list('name', flat=True))
def tags_str(self, nocache=True) -> str:
cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
if nocache:
tags_str = calc_tags_str()
cache.set(cache_key, tags_str)
return tags_str
return cache.get_or_set(cache_key, calc_tags_str)
@cached_property
def bookmarked(self):
return parse_date(self.timestamp)
@cached_property
def bookmarked_date(self):
# TODO: remove this
return self.bookmarked
@cached_property
def is_archived(self):
return self.as_link().is_archived
@ -126,23 +144,31 @@ class Snapshot(models.Model):
@cached_property
def url_hash(self):
return self.as_link().url_hash
return hashurl(self.url)
@cached_property
def base_url(self):
return self.as_link().base_url
return base_url(self.url)
@cached_property
def link_dir(self):
return self.as_link().link_dir
return str(ARCHIVE_DIR / self.timestamp)
@cached_property
def archive_path(self):
return self.as_link().archive_path
return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)
@cached_property
def archive_size(self):
return self.as_link().archive_size
cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'
def calc_dir_size():
try:
return get_dir_size(self.link_dir)[0]
except Exception:
return 0
return cache.get_or_set(cache_key, calc_dir_size)
@cached_property
def history(self):
@ -151,17 +177,40 @@ class Snapshot(models.Model):
@cached_property
def latest_title(self):
if ('title' in self.history
and self.history['title']
and (self.history['title'][-1].status == 'succeeded')
and self.history['title'][-1].output.strip()):
return self.history['title'][-1].output.strip()
if self.title:
return self.title # whoopdedoo that was easy
try:
# take longest successful title from ArchiveResult db history
return sorted(
self.archiveresult_set\
.filter(extractor='title', status='succeeded', output__isnull=False)\
.values_list('output', flat=True),
key=lambda r: len(r),
)[-1]
except IndexError:
pass
try:
# take longest successful title from Link json index file history
return sorted(
(
result.output.strip()
for result in self.history['title']
if result.status == 'succeeded' and result.output.strip()
),
key=lambda r: len(r),
)[-1]
except (KeyError, IndexError):
pass
return None
def save_tags(self, tags=()):
tags_id = []
for tag in tags:
tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
if tag.strip():
tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
self.tags.clear()
self.tags.add(*tags_id)
@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager):
class ArchiveResult(models.Model):
id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
uuid = models.UUIDField(default=uuid.uuid4, editable=False)
snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
cmd = JSONField()
pwd = models.CharField(max_length=256)
cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
output = models.CharField(max_length=512)
start_ts = models.DateTimeField()
cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
output = models.CharField(max_length=1024)
start_ts = models.DateTimeField(db_index=True)
end_ts = models.DateTimeField()
status = models.CharField(max_length=16, choices=STATUS_CHOICES)
extractor = models.CharField(choices=EXTRACTORS, max_length=32)
objects = ArchiveResultManager()
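
Note: both tags_str() and archive_size now memoize through the Django cache with a key derived from the snapshot id and its last-updated timestamp. A hedged sketch of that pattern (the helper name is hypothetical):

from django.core.cache import cache

def cached_snapshot_value(snapshot, suffix, compute):
    # hypothetical helper mirroring the cache-key scheme used above
    cache_key = f'{snapshot.id}-{(snapshot.updated or snapshot.added).timestamp()}-{suffix}'
    return cache.get_or_set(cache_key, compute)

# e.g.: cached_snapshot_value(snapshot, 'size', lambda: get_dir_size(snapshot.link_dir)[0])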

View file

@ -2,6 +2,9 @@ __package__ = 'archivebox.core'
import os
import sys
import re
import logging
import tempfile
from pathlib import Path
from django.utils.crypto import get_random_string
@ -14,6 +17,7 @@ from ..config import (
TEMPLATES_DIR_NAME,
SQL_INDEX_FILENAME,
OUTPUT_DIR,
LOGS_DIR,
)
@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [
'django.contrib.auth.backends.ModelBackend',
]
# only enable debug toolbar when in DEBUG mode with --nothreading (it doesnt work in multithreaded mode)
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
try:
import debug_toolbar # noqa
DEBUG_TOOLBAR = True
except ImportError:
DEBUG_TOOLBAR = False
if DEBUG_TOOLBAR:
INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
DEBUG_TOOLBAR_CONFIG = {
"SHOW_TOOLBAR_CALLBACK": lambda request: True,
"RENDER_PANELS": True,
}
DEBUG_TOOLBAR_PANELS = [
'debug_toolbar.panels.history.HistoryPanel',
'debug_toolbar.panels.versions.VersionsPanel',
'debug_toolbar.panels.timer.TimerPanel',
'debug_toolbar.panels.settings.SettingsPanel',
'debug_toolbar.panels.headers.HeadersPanel',
'debug_toolbar.panels.request.RequestPanel',
'debug_toolbar.panels.sql.SQLPanel',
'debug_toolbar.panels.staticfiles.StaticFilesPanel',
# 'debug_toolbar.panels.templates.TemplatesPanel',
'debug_toolbar.panels.cache.CachePanel',
'debug_toolbar.panels.signals.SignalsPanel',
'debug_toolbar.panels.logging.LoggingPanel',
'debug_toolbar.panels.redirects.RedirectsPanel',
'debug_toolbar.panels.profiling.ProfilingPanel',
'djdt_flamegraph.FlamegraphPanel',
]
MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']
################################################################################
### Staticfile and Template Settings
@ -107,6 +145,22 @@ DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
'NAME': DATABASE_NAME,
'OPTIONS': {
'timeout': 60,
'check_same_thread': False,
},
# DB setup is sometimes modified at runtime by setup_django() in config.py
}
}
CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'
CACHES = {
'default': {
'BACKEND': CACHE_BACKEND,
'LOCATION': 'django_cache_default',
}
}
@ -117,7 +171,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
### Security Settings
################################################################################
SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')
ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')
@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600 # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = True
SESSION_ENGINE = "django.contrib.sessions.backends.db"
AUTH_PASSWORD_VALIDATORS = [
{'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
{'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},
@ -163,3 +219,73 @@ USE_TZ = False
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
################################################################################
### Logging Settings
################################################################################
IGNORABLE_404_URLS = [
re.compile(r'apple-touch-icon.*\.png$'),
re.compile(r'favicon\.ico$'),
re.compile(r'robots\.txt$'),
re.compile(r'.*\.(css|js)\.map$'),
]
class NoisyRequestsFilter(logging.Filter):
def filter(self, record):
logline = record.getMessage()
# ignore harmless 404s for the patterns in IGNORABLE_404_URLS
for ignorable_url_pattern in IGNORABLE_404_URLS:
ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
if ignorable_log_pattern.match(logline):
return 0
# ignore staticfile requests that 200 or 30*
ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
if ignoreable_200_log_pattern.match(logline):
return 0
return 1
if LOGS_DIR.exists():
ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
# meh too many edge cases here around creating log dir w/ correct permissions
# can't be bothered, just trash the log and let them figure it out via stdout/stderr
ERROR_LOG = tempfile.NamedTemporaryFile().name
LOGGING = {
'version': 1,
'disable_existing_loggers': False,
'handlers': {
'console': {
'class': 'logging.StreamHandler',
},
'logfile': {
'level': 'ERROR',
'class': 'logging.handlers.RotatingFileHandler',
'filename': ERROR_LOG,
'maxBytes': 1024 * 1024 * 25, # 25 MB
'backupCount': 10,
},
},
'filters': {
'noisyrequestsfilter': {
'()': NoisyRequestsFilter,
}
},
'loggers': {
'django': {
'handlers': ['console', 'logfile'],
'level': 'INFO',
'filters': ['noisyrequestsfilter'],
},
'django.server': {
'handlers': ['console', 'logfile'],
'level': 'INFO',
'filters': ['noisyrequestsfilter'],
}
},
}
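
Note: a hedged illustration of what NoisyRequestsFilter drops, using the stdlib logging API (sample loglines mimic the dev server's request-line format; assumes the filter class defined above is in scope):

import logging
noisy = NoisyRequestsFilter()
static_hit = logging.LogRecord('django.server', logging.INFO, '', 0,
                               '"GET /static/admin/css/base.css HTTP/1.1" 200 1234', None, None)
real_error = logging.LogRecord('django.server', logging.INFO, '', 0,
                               '"GET /archive/1234/index.html HTTP/1.1" 500 0', None, None)
print(noisy.filter(static_hit))   # 0 -> suppressed (static file, status 200)
print(noisy.filter(real_error))   # 1 -> logged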

View file

@ -2,6 +2,7 @@ from django.contrib import admin
from django.urls import path, include
from django.views import static
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
urlpatterns = [
path('public/', PublicIndexView.as_view(), name='public-index'),
path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),
path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),
@ -35,35 +36,43 @@ urlpatterns = [
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
path('', HomepageView.as_view(), name='Home'),
]
urlpatterns += staticfiles_urlpatterns()
# # Proposed UI URLs spec
# path('', HomepageView)
# path('/add', AddView)
# path('/public', PublicIndexView)
# path('/snapshot/:slug', SnapshotView)
# path('/admin', admin.site.urls)
# path('/accounts', django.contrib.auth.urls)
if settings.DEBUG_TOOLBAR:
import debug_toolbar
urlpatterns += [
path('__debug__/', include(debug_toolbar.urls)),
]
# # Proposed REST API spec
# # :slugs can be uuid, short_uuid, or any of the unique index_fields
# path('api/v1/'),
# path('api/v1/core/' [GET])
# path('api/v1/core/snapshot/', [GET, POST, PUT]),
# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/archiveresult', [GET, POST, PUT]),
# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/tag/', [GET, POST, PUT]),
# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),
# path('api/v1/cli/', [GET])
# path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
# # Proposed FUTURE URLs spec
# path('', HomepageView)
# path('/add', AddView)
# path('/public', PublicIndexView)
# path('/snapshot/:slug', SnapshotView)
# path('api/v1/extractors/', [GET])
# path('api/v1/extractors/:extractor/', [GET]),
# path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function
# path('/admin', admin.site.urls)
# path('/accounts', django.contrib.auth.urls)
# future, just an idea:
# path('api/v1/scheduler/', [GET])
# path('api/v1/scheduler/task/', [GET, POST, PUT]),
# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),
# # Proposed REST API spec
# # :slugs can be uuid, short_uuid, or any of the unique index_fields
# path('api/v1/'),
# path('api/v1/core/' [GET])
# path('api/v1/core/snapshot/', [GET, POST, PUT]),
# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/archiveresult', [GET, POST, PUT]),
# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/tag/', [GET, POST, PUT]),
# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),
# path('api/v1/cli/', [GET])
# path('api/v1/cli/{add,list,config,...}', [POST]), # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
# path('api/v1/extractors/', [GET])
# path('api/v1/extractors/:extractor/', [GET]),
# path('api/v1/extractors/:extractor/:func', [GET, POST]), # pass query as args directly to chosen function
# future, just an idea:
# path('api/v1/scheduler/', [GET])
# path('api/v1/scheduler/task/', [GET, POST, PUT]),
# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),

View file

@ -4,8 +4,8 @@ from io import StringIO
from contextlib import redirect_stdout
from django.shortcuts import render, redirect
from django.http import HttpResponse
from django.http import HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
from django.views.generic import FormView
@ -22,6 +22,7 @@ from ..config import (
PUBLIC_ADD_VIEW,
VERSION,
FOOTER_INFO,
SNAPSHOTS_PER_PAGE,
)
from main import add
from ..util import base_url, ansi_to_html
@ -43,10 +44,6 @@ class SnapshotView(View):
# render static html index from filesystem archive/<timestamp>/index.html
def get(self, request, path):
# missing trailing slash -> redirect to index
if '/' not in path:
return redirect(f'{path}/index.html')
if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
@ -55,46 +52,163 @@ class SnapshotView(View):
except (IndexError, ValueError):
slug, archivefile = path.split('/', 1)[0], 'index.html'
all_pages = list(Snapshot.objects.all())
# slug is a timestamp
by_ts = {page.timestamp: page for page in all_pages}
try:
# print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
return response
except KeyError:
pass
if slug.replace('.','').isdigit():
# slug is a hash
by_hash = {page.url_hash: page for page in all_pages}
try:
timestamp = by_hash[slug].timestamp
return redirect(f'/archive/{timestamp}/{archivefile}')
except KeyError:
pass
# missing trailing slash -> redirect to index
if '/' not in path:
return redirect(f'{path}/index.html')
try:
try:
snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
response["Link"] = f'<{snapshot.url}>; rel="canonical"'
return response
except Snapshot.DoesNotExist:
if Snapshot.objects.filter(timestamp__startswith=slug).exists():
raise Snapshot.MultipleObjectsReturned
else:
raise
except Snapshot.DoesNotExist:
# Snapshot does not exist
return HttpResponse(
format_html(
(
'<center><br/><br/><br/>'
'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
'</center>'
),
slug,
path,
),
content_type="text/html",
status=404,
)
except Snapshot.MultipleObjectsReturned:
snapshot_hrefs = mark_safe('<br/>').join(
format_html(
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
snap.added.strftime('%Y-%m-%d %H:%M:%S'),
snap.timestamp,
snap.timestamp,
snap.url,
snap.title or '',
)
for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
)
return HttpResponse(
format_html(
(
'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
),
slug,
) + snapshot_hrefs + format_html(
(
'</pre><br/>'
'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
)
),
content_type="text/html",
status=404,
)
except Http404:
# Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
return HttpResponse(
format_html(
(
'<center><br/><br/><br/>'
f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
'{}'
f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
'<div style="text-align: left; width: 100%; max-width: 400px">'
'<i><b>Next steps:</b></i><br/>'
f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
'- or return to <a href="/" target="_top">the main index...</a></div>'
'</center>'
),
archivefile,
),
content_type="text/html",
status=404,
)
# slug is a URL
by_url = {page.base_url: page for page in all_pages}
try:
# TODO: add multiple snapshot support by showing index of all snapshots
# for given url instead of redirecting to timestamp index
timestamp = by_url[base_url(path)].timestamp
return redirect(f'/archive/{timestamp}/index.html')
except KeyError:
pass
return HttpResponse(
'No archived link matches the given timestamp or hash.',
content_type="text/plain",
status=404,
)
try:
# try exact match on full url first
snapshot = Snapshot.objects.get(
Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
)
except Snapshot.DoesNotExist:
# fall back to match on exact base_url
try:
snapshot = Snapshot.objects.get(
Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
)
except Snapshot.DoesNotExist:
# fall back to matching base_url as prefix
snapshot = Snapshot.objects.get(
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
)
return redirect(f'/archive/{snapshot.timestamp}/index.html')
except Snapshot.DoesNotExist:
return HttpResponse(
format_html(
(
'<center><br/><br/><br/>'
'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
'+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
'</center>'
),
base_url(path),
path if '://' in path else f'https://{path}',
path,
),
content_type="text/html",
status=404,
)
except Snapshot.MultipleObjectsReturned:
snapshot_hrefs = mark_safe('<br/>').join(
format_html(
'{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
snap.added.strftime('%Y-%m-%d %H:%M:%S'),
snap.timestamp,
snap.timestamp,
snap.url,
snap.title or '',
)
for snap in Snapshot.objects.filter(
Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
).only('url', 'timestamp', 'title', 'added').order_by('-added')
)
return HttpResponse(
format_html(
(
'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
),
base_url(path),
) + snapshot_hrefs + format_html(
(
'</pre><br/>'
'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
)
),
content_type="text/html",
status=404,
)
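Note: with the removed and added lines interleaved above, the new lookup order is easy to misread. A condensed sketch of the URL fallback chain (same Q filters as above, using .first() for brevity where the real code uses .get() plus exception handling):
snapshot = (
    # exact full URL (http or https) or UUID prefix
    Snapshot.objects.filter(Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)).first()
    # then exact base_url
    or Snapshot.objects.filter(Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))).first()
    # then base_url as a prefix
    or Snapshot.objects.filter(Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))).first()
)
Numeric slugs are resolved earlier by the timestamp/UUID lookup, and ambiguous matches fall through to the 404 pages above.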
class PublicIndexView(ListView):
template_name = 'public_index.html'
model = Snapshot
paginate_by = 100
paginate_by = SNAPSHOTS_PER_PAGE
ordering = ['title']
def get_context_data(self, **kwargs):
@ -105,12 +219,14 @@ class PublicIndexView(ListView):
}
def get_queryset(self, **kwargs):
qs = super().get_queryset(**kwargs)
qs = super().get_queryset(**kwargs)
query = self.request.GET.get('q')
if query:
qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))
for snapshot in qs:
snapshot.icons = snapshot_icons(snapshot)
# lazy load snapshot icons, otherwise it will load icons for the entire index at once
snapshot.icons = lambda: snapshot_icons(snapshot)
return qs
def get(self, *args, **kwargs):
@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
if self.request.method == 'GET':
url = self.request.GET.get('url', None)
if url:
return {'url': url}
else:
return super().get_initial()
return {'url': url if '://' in url else f'https://{url}'}
return super().get_initial()
def test_func(self):
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
'absolute_add_path': self.request.build_absolute_uri(self.request.path),
'VERSION': VERSION,
'FOOTER_INFO': FOOTER_INFO,
'stdout': '',
}
def form_valid(self, form):
url = form.cleaned_data["url"]
print(f'[+] Adding URL: {url}')
tag = form.cleaned_data["tag"]
depth = 0 if form.cleaned_data["depth"] == "0" else 1
extractors = ','.join(form.cleaned_data["archive_methods"])
input_kwargs = {
"urls": url,
"tag": tag,
"depth": depth,
"update_all": False,
"out_dir": OUTPUT_DIR,

View file

@ -7,10 +7,10 @@ For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""
import os
from archivebox.config import setup_django
setup_django(in_memory_db=False, check_db=True)
from django.core.wsgi import get_wsgi_application
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
application = get_wsgi_application()

View file

@ -44,16 +44,16 @@ def get_default_archive_methods():
return [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('wget', should_save_wget, save_wget),
('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
('pdf', should_save_pdf, save_pdf),
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('readability', should_save_readability, save_readability), #keep readability below wget and singlefile, as it depends on them
('wget', should_save_wget, save_wget),
('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
('mercury', should_save_mercury, save_mercury),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
('headers', should_save_headers, save_headers),
('archive_org', should_save_archive_dot_org, save_archive_dot_org),
]
@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)
# bump the updated time on the main Snapshot here, this is critical
# to be able to cache summaries of the ArchiveResults for a given
# snapshot without having to load all the results from the DB each time.
# (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
# ArchiveResults are unchanged as long as the updated timestamp is unchanged)
snapshot.save()
else:
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1

View file

@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
out_dir = out_dir or Path(link.link_dir)
if not overwrite and (out_dir / 'archive.org.txt').exists():
# if open(path, 'r').read().strip() != 'None':
# if open(path, 'r', encoding='utf-8').read().strip() != 'None':
return False
return SAVE_ARCHIVE_DOT_ORG

View file

@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "mercury"
output = str(output_folder)
output = "mercury"
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
output_folder.mkdir(exist_ok=True)
# Get plain text version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
if article_text.get('failed'):
raise ArchiveError('Mercury was not able to get article text from the URL')
atomic_write(str(output_folder / "content.txt"), article_text["content"])
# Get HTML version of article
cmd = [
DEPENDENCIES['MERCURY_BINARY']['path'],
@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
except json.JSONDecodeError:
raise ShellError(cmd, result)
output_folder.mkdir(exist_ok=True)
if article_text.get('failed'):
raise ArchiveError('Mercury was not able to get article HTML from the URL')
atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
atomic_write(str(output_folder / "content.txt"), article_text["content"])
atomic_write(str(output_folder / "article.json"), article_json)
# Check for common failure cases

View file

@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
document = None
for source in sources:
try:
with open(abs_path / source, "r") as f:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output_folder = out_dir.absolute() / "readability"
output = str(output_folder)
output = "readability"
# Readability Docs: https://github.com/mozilla/readability
@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
temp_doc.write(document.encode("utf-8"))
temp_doc.close()
if not document or len(document) < 10:
raise ArchiveError('Readability could not find HTML to parse for article text')
cmd = [
DEPENDENCIES['READABILITY_BINARY']['path'],
temp_doc.name
temp_doc.name,
]
result = run(cmd, cwd=out_dir, timeout=timeout)
result_json = json.loads(result.stdout)
try:
result_json = json.loads(result.stdout)
except json.JSONDecodeError:
raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)
output_folder.mkdir(exist_ok=True)
readability_content = result_json.pop("textContent")
atomic_write(str(output_folder / "content.html"), result_json.pop("content"))
@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
except (Exception, OSError) as err:
status = 'failed'
output = err
cmd = [cmd[0], './{singlefile,dom}.html']
finally:
timer.end()
@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
cmd_version=READABILITY_VERSION,
output=output,
status=status,
index_texts= [readability_content] if readability_content else [],
index_texts=[readability_content] if readability_content else [],
**timer.stats,
)

View file

@ -356,6 +356,7 @@ LINK_FILTERS = {
'regex': lambda pattern: Q(url__iregex=pattern),
'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
'tag': lambda pattern: Q(tags__name=pattern),
'timestamp': lambda pattern: Q(timestamp=pattern),
}
@enforce_types
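The new 'timestamp' entry is a plain exact-match lookup; a minimal sketch of what it expands to (Q and Snapshot imported as elsewhere in this diff):
from django.db.models import Q
from core.models import Snapshot

# equivalent of applying filter_type='timestamp' with the pattern '1611234567.0'
matches = Snapshot.objects.filter(Q(timestamp='1611234567.0'))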

View file

@ -1,11 +1,12 @@
__package__ = 'archivebox.index'
from datetime import datetime
from typing import List, Optional, Iterator, Mapping
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import List, Optional, Iterator, Mapping
from django.utils.html import format_html, mark_safe
from collections import defaultdict
from django.core.cache import cache
from .schema import Link
from ..system import atomic_write
@ -20,7 +21,6 @@ from ..util import (
from ..config import (
OUTPUT_DIR,
VERSION,
GIT_SHA,
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->
return render_django_template(template, {
'version': VERSION,
'git_sha': GIT_SHA,
'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'num_links': str(len(links)),
'date_updated': datetime.now().strftime('%Y-%m-%d'),
'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),
@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:
def snapshot_icons(snapshot) -> str:
from core.models import EXTRACTORS
cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
def calc_snapshot_icons():
from core.models import EXTRACTORS
# start = datetime.now()
# start = datetime.now()
archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
link = snapshot.as_link()
path = link.archive_path
canon = link.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "",
"wget": "🆆",
"dom": "🅷",
"pdf": "📄",
"screenshot": "💻",
"media": "📼",
"git": "🅶",
"archive_org": "🏛",
"readability": "🆁",
"mercury": "🅼",
"warc": "📦"
}
exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC
archive_results = snapshot.archiveresult_set.filter(status="succeeded")
link = snapshot.as_link()
path = link.archive_path
canon = link.canonical_outputs()
output = ""
output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> &nbsp;'
icons = {
"singlefile": "",
"wget": "🆆",
"dom": "🅷",
"pdf": "📄",
"screenshot": "💻",
"media": "📼",
"git": "🅶",
"archive_org": "🏛",
"readability": "🆁",
"mercury": "🅼",
"warc": "📦"
}
exclude = ["favicon", "title", "headers", "archive_org"]
# Missing specific entry for WARC
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTORS:
for result in archive_results:
if result.extractor == extractor and result:
extractor_outputs[extractor] = result
extractor_outputs = defaultdict(lambda: None)
for extractor, _ in EXTRACTORS:
for result in archive_results:
if result.extractor == extractor and result:
extractor_outputs[extractor] = result
for extractor, _ in EXTRACTORS:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
# if existing:
# existing = (Path(path) / existing)
# if existing.is_file():
# existing = True
# elif existing.is_dir():
# existing = any(existing.glob('*.*'))
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?"))
if extractor == "wget":
# warc isn't technically its own extractor, so we have to add it after wget
# get from db (faster but less truthful)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower but more accurate)
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
for extractor, _ in EXTRACTORS:
if extractor not in exclude:
existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
# if existing:
# existing = (Path(path) / existing)
# if existing.is_file():
# existing = True
# elif existing.is_dir():
# existing = any(existing.glob('*.*'))
output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
extractor, icons.get(extractor, "?"))
if extractor == "wget":
# warc isn't technically its own extractor, so we have to add it after wget
# get from db (faster but less truthful)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower but more accurate)
# exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
if extractor == "archive_org":
# The check for archive_org is different, so it has to be handled separately
if extractor == "archive_org":
# The check for archive_org is different, so it has to be handled separately
# get from db (faster)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower)
# target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?"))
# get from db (faster)
exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
# get from filesystem (slower)
# target_path = Path(path) / "archive.org.txt"
# exists = target_path.exists()
output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
"archive_org", icons.get("archive_org", "?"))
result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
# end = datetime.now()
# print(((end - start).total_seconds()*1000) // 1, 'ms')
return result
result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}<span>', mark_safe(output))
# end = datetime.now()
# print(((end - start).total_seconds()*1000) // 1, 'ms')
return result
return cache.get_or_set(cache_key, calc_snapshot_icons)
# return calc_snapshot_icons()
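This caching pairs with the snapshot.save() call added in archive_link() above: the cache key embeds snapshot.updated, so bumping that timestamp after each ArchiveResult write invalidates the cached icon HTML automatically. A minimal sketch of the pattern (assumes a configured Django cache backend; calc_icons is a hypothetical stand-in for the expensive render above):
from django.core.cache import cache

def cached_snapshot_icons(snapshot):
    # key changes whenever the Snapshot row is saved, so stale icon HTML is never returned
    key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    return cache.get_or_set(key, lambda: calc_icons(snapshot))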

View file

@ -15,7 +15,6 @@ from ..config import (
VERSION,
OUTPUT_DIR,
FOOTER_INFO,
GIT_SHA,
DEPENDENCIES,
JSON_INDEX_FILENAME,
ARCHIVE_DIR_NAME,
@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
'meta': {
'project': 'ArchiveBox',
'version': VERSION,
'git_sha': GIT_SHA,
'git_sha': VERSION, # not used anymore, but kept for backwards compatibility
'website': 'https://ArchiveBox.io',
'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
'source': 'https://github.com/ArchiveBox/ArchiveBox',

View file

@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union
from dataclasses import dataclass, asdict, field, fields
from django.utils.functional import cached_property
from ..system import get_dir_size
@ -133,7 +134,6 @@ class Link:
updated: Optional[datetime] = None
schema: str = 'Link'
def __str__(self) -> str:
return f'[{self.timestamp}] {self.url} "{self.title}"'
@ -190,6 +190,7 @@ class Link:
}
if extended:
info.update({
'snapshot_id': self.snapshot_id,
'link_dir': self.link_dir,
'archive_path': self.archive_path,
@ -201,6 +202,9 @@ class Link:
'basename': self.basename,
'extension': self.extension,
'is_static': self.is_static,
'tags_str': self.tags, # only used to render static index in index/html.py, remove if no longer needed there
'icons': None, # only used to render static index in index/html.py, remove if no longer needed there
'bookmarked_date': self.bookmarked_date,
'updated_date': self.updated_date,
@ -255,6 +259,11 @@ class Link:
return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)
@cached_property
def snapshot_id(self):
from core.models import Snapshot
return str(Snapshot.objects.only('id').get(url=self.url).id)
@classmethod
def field_names(cls):
return [f.name for f in fields(cls)]

View file

@ -7,7 +7,7 @@ from django.db.models import QuerySet
from django.db import transaction
from .schema import Link
from ..util import enforce_types
from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR
@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
)
@enforce_types
def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
with transaction.atomic():
snapshots.delete()
def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
if atomic:
with transaction.atomic():
return snapshots.delete()
return snapshots.delete()
@enforce_types
def write_link_to_sql_index(link: Link):
from core.models import Snapshot
from core.models import Snapshot, ArchiveResult
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
for extractor, entries in link.history.items():
for entry in entries:
if isinstance(entry, dict):
result, _ = ArchiveResult.objects.get_or_create(
snapshot_id=snapshot.id,
extractor=extractor,
start_ts=parse_date(entry['start_ts']),
defaults={
'end_ts': parse_date(entry['end_ts']),
'cmd': entry['cmd'],
'output': entry['output'],
'cmd_version': entry.get('cmd_version') or 'unknown',
'pwd': entry['pwd'],
'status': entry['status'],
}
)
else:
result, _ = ArchiveResult.objects.update_or_create(
snapshot_id=snapshot.id,
extractor=extractor,
start_ts=parse_date(entry.start_ts),
defaults={
'end_ts': parse_date(entry.end_ts),
'cmd': entry.cmd,
'output': entry.output,
'cmd_version': entry.cmd_version or 'unknown',
'pwd': entry.pwd,
'status': entry.status,
}
)
return snapshot
@enforce_types
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
with transaction.atomic():
for link in links:
write_link_to_sql_index(link)
for link in links:
# with transaction.atomic():
# write_link_to_sql_index(link)
write_link_to_sql_index(link)
@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
from core.models import Snapshot
with transaction.atomic():
try:
snap = Snapshot.objects.get(url=link.url)
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
# with transaction.atomic():
# try:
# snap = Snapshot.objects.get(url=link.url)
# except Snapshot.DoesNotExist:
# snap = write_link_to_sql_index(link)
# snap.title = link.title
try:
snap = Snapshot.objects.get(url=link.url)
except Snapshot.DoesNotExist:
snap = write_link_to_sql_index(link)
snap.title = link.title
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
snap.save()
snap.save_tags(tag_list)
snap.save()
snap.save_tags(tag_list)

View file

@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import os
import sys
import stat
import time
import argparse
from math import log
@ -11,18 +12,21 @@ from pathlib import Path
from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING
if TYPE_CHECKING:
from .index.schema import Link, ArchiveResult
from .system import get_dir_size
from .util import enforce_types
from .config import (
ConfigDict,
OUTPUT_DIR,
PYTHON_ENCODING,
VERSION,
ANSI,
IS_TTY,
IN_DOCKER,
TERM_WIDTH,
SHOW_PROGRESS,
SOURCES_DIR_NAME,
@ -50,6 +54,37 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()
def debug_dict_summary(obj: Dict[Any, Any]) -> None:
stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))
def get_fd_info(fd) -> Dict[str, Any]:
NAME = fd.name[1:-1]
FILENO = fd.fileno()
MODE = os.fstat(FILENO).st_mode
IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
IS_PIPE = stat.S_ISFIFO(MODE)
IS_FILE = stat.S_ISREG(MODE)
IS_TERMINAL = not (IS_PIPE or IS_FILE)
IS_LINE_BUFFERED = fd.line_buffering
IS_READABLE = fd.readable()
return {
'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
'IS_READABLE': IS_READABLE,
}
# # Log debug information about stdin, stdout, and stderr
# sys.stdout.write('[>&1] this is python stdout\n')
# sys.stderr.write('[>&2] this is python stderr\n')
# debug_dict_summary(get_fd_info(sys.stdin))
# debug_dict_summary(get_fd_info(sys.stdout))
# debug_dict_summary(get_fd_info(sys.stderr))
class SmartFormatter(argparse.HelpFormatter):
"""Patched formatter that prints newlines in argparse help strings"""
@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
"""Tell the user they passed stdin to a command that doesn't accept it"""
if stdin and not stdin.isatty():
stdin_raw_text = stdin.read().strip()
if not stdin:
return None
if IN_DOCKER:
# when TTY is disabled in Docker we can't tell whether stdin is being piped in or not
# if we try to read stdin when it's not piped, we will hang indefinitely waiting for it
return None
if not stdin.isatty():
# stderr('READING STDIN TO REJECT...')
stdin_raw_text = stdin.read()
if stdin_raw_text:
# stderr('GOT STDIN!', len(stdin_str))
stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
stderr(f' Run archivebox "{caller} --help" to see usage and examples.')
stderr()
raise SystemExit(1)
return None
def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
"""accept any standard input and return it as a string or None"""
if not stdin:
return None
elif stdin and not stdin.isatty():
stdin_str = stdin.read().strip()
return stdin_str or None
if not stdin.isatty():
# stderr('READING STDIN TO ACCEPT...')
stdin_str = stdin.read()
if stdin_str:
# stderr('GOT STDIN...', len(stdin_str))
return stdin_str
return None
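These helpers are called at the top of each CLI subcommand; a rough usage sketch (import path assumed from the surrounding hunks to be archivebox.logging_util):
import sys
from archivebox.logging_util import accept_stdin, reject_stdin  # assumed import path

def run_add(args, stdin=sys.stdin):
    piped_text = accept_stdin(stdin)    # piped URLs as a string, or None when interactive / in Docker without a TTY
    ...

def run_version(args, stdin=sys.stdin):
    reject_stdin('version', stdin)      # exit with an error if data is piped to a command that would ignore it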
@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:
def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
from .config import VERSION, ANSI
cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
@ -233,11 +285,11 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str):
if IS_TTY:
sys.stdout.write(f' > {out_path}')
sys.stdout.write(f' > ./{Path(out_path).relative_to(OUTPUT_DIR)}')
def log_indexing_finished(out_path: str):
print(f'\r{out_path}')
print(f'\r./{Path(out_path).relative_to(OUTPUT_DIR)}')
### Archiving Stage
@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
total=num_links,
))
print()
print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
print(' archivebox server # then visit http://127.0.0.1:8000')
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
else:
_LAST_RUN_STATS.succeeded += 1
size = get_dir_size(link_dir)
print(' {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))
def log_archive_method_started(method: str):
print(' > {}'.format(method))

View file

@ -67,6 +67,7 @@ from .config import (
ConfigDict,
ANSI,
IS_TTY,
DEBUG,
IN_DOCKER,
USER,
ARCHIVEBOX_BINARY,
@ -76,6 +77,7 @@ from .config import (
ARCHIVE_DIR,
LOGS_DIR,
CONFIG_FILE,
CONFIG_FILENAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
@ -84,6 +86,7 @@ from .config import (
SQL_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
SEARCH_BACKEND_ENGINE,
check_dependencies,
check_data_folder,
write_config_file,
@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
'node_modules',
'package-lock.json',
'static',
'sonic',
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
SQL_INDEX_FILENAME,
f'{SQL_INDEX_FILENAME}-wal',
f'{SQL_INDEX_FILENAME}-shm',
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f'{CONFIG_FILENAME}.bak',
}
@enforce_types
@ -214,9 +222,23 @@ def version(quiet: bool=False,
if quiet:
print(VERSION)
else:
# ArchiveBox v0.5.6
# Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
print('ArchiveBox v{}'.format(VERSION))
p = platform.uname()
print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
print(
sys.implementation.name.title(),
p.system,
platform.platform(),
p.machine,
)
print(
f'IN_DOCKER={IN_DOCKER}',
f'DEBUG={DEBUG}',
f'IS_TTY={IS_TTY}',
f'TZ={os.environ.get("TZ", "UTC")}',
f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
)
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
@ -261,7 +283,7 @@ def run(subcommand: str,
@enforce_types
def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
"""Initialize a new ArchiveBox collection in the current directory"""
from core.models import Snapshot
@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
if is_empty and not existing_index:
print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
print(f' {out_dir}')
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
elif existing_index:
print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
print(f' {out_dir}')
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
# TODO: properly detect and print the existing version in current index as well
print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
else:
if force:
stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
else:
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
Path(SOURCES_DIR).mkdir(exist_ok=True)
print(f'{SOURCES_DIR}')
Path(ARCHIVE_DIR).mkdir(exist_ok=True)
print(f'{ARCHIVE_DIR}')
Path(LOGS_DIR).mkdir(exist_ok=True)
print(f'{LOGS_DIR}')
print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
write_config_file({}, out_dir=out_dir)
print(f'{CONFIG_FILE}')
if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
else:
print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
print(f'{DATABASE_FILE}')
print()
for migration_line in apply_migrations(out_dir):
print(f' {migration_line}')
assert DATABASE_FILE.exists()
print()
print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
# from django.contrib.auth.models import User
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
# call_command("createsuperuser", interactive=True)
print()
print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
all_links = Snapshot.objects.none()
pending_links: Dict[str, Link] = {}
@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
all_links = load_main_index(out_dir=out_dir, warn=False)
print(' √ Loaded {} links from existing main index.'.format(all_links.count()))
# Links in data folders that don't match their timestamp
fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
if fixed:
print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
if cant_fix:
print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
if quick:
print(' > Skipping full snapshot directory check (quick mode)')
else:
try:
# Links in data folders that don't match their timestamp
fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
if fixed:
print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
if cant_fix:
print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
# Links in JSON index but not in main index
orphaned_json_links = {
link.url: link
for link in parse_json_main_index(out_dir)
if not all_links.filter(url=link.url).exists()
}
if orphaned_json_links:
pending_links.update(orphaned_json_links)
print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
# Links in JSON index but not in main index
orphaned_json_links = {
link.url: link
for link in parse_json_main_index(out_dir)
if not all_links.filter(url=link.url).exists()
}
if orphaned_json_links:
pending_links.update(orphaned_json_links)
print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link.url: link
for link in parse_json_links_details(out_dir)
if not all_links.filter(url=link.url).exists()
}
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link.url: link
for link in parse_json_links_details(out_dir)
if not all_links.filter(url=link.url).exists()
}
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
# Links in invalid/duplicate data dirs
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
}
if invalid_folders:
print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
print()
print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
print(' archivebox status')
print(' archivebox list --status=invalid')
# Links in invalid/duplicate data dirs
invalid_folders = {
folder: link
for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
}
if invalid_folders:
print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
print()
print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
print(' archivebox status')
print(' archivebox list --status=invalid')
except (KeyboardInterrupt, SystemExit):
stderr()
stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
stderr()
stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
stderr(' archivebox init --quick')
raise SystemExit(1)
write_main_index(list(pending_links.values()), out_dir=out_dir)
write_main_index(list(pending_links.values()), out_dir=out_dir)
print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
if existing_index:
print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
else:
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
print()
print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
print(' archivebox server # then visit http://127.0.0.1:8000')
print()
print(' To add new links, you can run:')
print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
if Snapshot.objects.count() < 25: # hide the hints for experienced users
print()
print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
print(' archivebox server # then visit http://127.0.0.1:8000')
print()
print(' To add new links, you can run:')
print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
json_index = Path(out_dir) / JSON_INDEX_FILENAME
html_index = Path(out_dir) / HTML_INDEX_FILENAME
@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
@enforce_types
def add(urls: Union[str, List[str]],
tag: str='',
depth: int=0,
update_all: bool=not ONLY_NEW,
index_only: bool=False,
@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Add a new URL or list of URLs to your archive"""
from core.models import Tag
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
extractors = extractors.split(",") if extractors else []
@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
new_links = dedupe_links(all_links, imported_links)
write_main_index(links=new_links, out_dir=out_dir)
all_links = load_main_index(out_dir=out_dir)
if index_only:
return all_links
# mock archive all the links using the fake index_only extractor method in order to update their state
if overwrite:
archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
else:
archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
else:
# fully run the archive extractor methods for each link
archive_kwargs = {
"out_dir": out_dir,
}
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
elif overwrite:
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
archive_links(new_links, overwrite=False, **archive_kwargs)
# add any tags to imported links
tags = [
Tag.objects.get_or_create(name=name.strip())[0]
for name in tag.split(',')
if name.strip()
]
if tags:
for link in imported_links:
snapshot = link.as_snapshot()
snapshot.tags.add(*tags)
snapshot.tags_str(nocache=True)
snapshot.save()
# print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
# Run the archive methods for each link
archive_kwargs = {
"out_dir": out_dir,
}
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
elif overwrite:
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
archive_links(new_links, overwrite=False, **archive_kwargs)
return all_links
@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
all_snapshots = load_main_index(out_dir=out_dir)
if after is not None:
all_snapshots = all_snapshots.filter(timestamp__lt=after)
all_snapshots = all_snapshots.filter(timestamp__gte=after)
if before is not None:
all_snapshots = all_snapshots.filter(timestamp__gt=before)
all_snapshots = all_snapshots.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
if not all_snapshots:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
return all_snapshots
@enforce_types
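Note the comparison fix above: after is now an inclusive lower bound and before an exclusive upper bound on Snapshot.timestamp (previously the lookups were inverted). A minimal sketch of the resulting queryset, using the lookups shown above:
qs = load_main_index(out_dir=out_dir)
qs = qs.filter(timestamp__gte=after)   # after: keep snapshots at or newer than this timestamp
qs = qs.filter(timestamp__lt=before)   # before: keep snapshots strictly older than this timestamp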
@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
reload: bool=False,
debug: bool=False,
init: bool=False,
quick_init: bool=False,
createsuperuser: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Run the ArchiveBox HTTP server"""
@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,
if init:
run_subcommand('init', stdin=None, pwd=out_dir)
print()
elif quick_init:
run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
print()
if createsuperuser:
run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print()
# setup config for django runserver
from . import config
@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
from django.core.management import call_command
from django.contrib.auth.models import User
admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
if admin_user:
hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
else:
print(' > Logging errors to ./logs/errors.log')
if not User.objects.filter(is_superuser=True).exists():
print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
print()
print(' To create an admin user, run:')
@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
config.SHOW_PROGRESS = False
config.DEBUG = config.DEBUG or debug
call_command("runserver", *runserver_args)

View file

@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
"""
parse a list of URLS without touching the filesystem
"""
check_url_parsing_invariants()
timer = TimedProgress(TIMEOUT * 4)
#urls = list(map(lambda x: x + "\n", urls))
@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Li
RSS feed, bookmarks export, or text file
"""
check_url_parsing_invariants()
timer = TimedProgress(TIMEOUT * 4)
with open(source_file, 'r', encoding='utf-8') as file:
links, parser = run_parser_functions(file, timer, root_url=root_url)
@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
return source_path
def check_url_parsing_invariants() -> None:
"""Check that plain text regex URL parsing works as expected"""
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving, as the consequences could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive
test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com
<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''
# print('\n'.join(re.findall(URL_REGEX, test_urls)))
assert len(re.findall(URL_REGEX, test_urls)) == 12
# Check that plain text regex URL parsing works as expected
# this is last-line-of-defense to make sure the URL_REGEX isn't
# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
f'{url_str} does not contain {num_urls} urls')

View file

@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
if extra_path:
fpath = f'{fpath}/{extra_path}'
with open(fpath, 'r') as file:
with open(fpath, 'r', encoding='utf-8') as file:
data = file.read()
if data:
return [data]

View file

@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
try:
with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
if isinstance(contents, dict):
dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
elif isinstance(contents, (bytes, str)):

View file

@ -1 +0,0 @@
actions_as_select

View file

@ -20,7 +20,7 @@
<body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
data-admin-utc-offset="{% now "Z" %}">
<style nonce="{{nonce}}">
<style>
/* Loading Progress Bar */
#progress {
position: absolute;
@ -89,7 +89,7 @@
<a href="{% url 'admin:Add' %}">Add </a> /
<a href="{% url 'Home' %}">Snapshots</a> /
<a href="/admin/core/tag/">Tags</a> /
<a href="/admin/auth/user/">Users</a> /
<a href="/admin/">Admin</a> /
<a href="{% url 'Docs' %}">Docs</a>
&nbsp; &nbsp;
{% block welcome-msg %}
@ -157,15 +157,15 @@
function fix_actions() {
var container = $('div.actions');
if (container.find('option').length < 10) {
container.find('label, button').hide();
if (container.find('select[name=action] option').length < 10) {
container.find('label:nth-child(1), button[value=0]').hide();
var buttons = $('<div></div>')
.prependTo(container)
.appendTo(container)
.css('display', 'inline')
.addClass('class', 'action-buttons');
container.find('option:gt(0)').reverse().each(function () {
container.find('select[name=action] option:gt(0)').reverse().each(function () {
const name = this.value
$('<button>')
.appendTo(buttons)

View file

@ -15,7 +15,7 @@
{% endblock %}
{% block body %}
<div style="max-width: 550px; margin: auto; float: none">
<div style="max-width: 1440px; margin: auto; float: none">
<br/><br/>
{% if stdout %}
<h1>Add new URLs to your archive: results</h1>

View file

@ -38,7 +38,7 @@
<div id="header">
<div id="branding">
<h1 id="site-name">
<a href="{% url 'public-index' %}" class="header-archivebox" title="Last updated: {{updated}}">
<a href="{% url 'public-index' %}" class="header-archivebox">
<img src="{% static 'archive.png' %}" alt="Logo" style="height: 30px"/>
ArchiveBox
</a>
@ -70,7 +70,7 @@
<center>
<small>
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> version
<a href="https://github.com/ArchiveBox/ArchiveBox/releases" title="Releases">v{{VERSION}}</a>.
<a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{VERSION}}" title="Releases">v{{VERSION}}</a>.
<br/><br/>
{{FOOTER_INFO}}
</small>

View file

@ -10,7 +10,7 @@
{% endif %}
<a href="archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
<span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'}}</span>
<span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'|truncatechars:128}}</span>
{% if link.tags_str %}
<span class="tags" style="float: right; border-radius: 5px; background-color: #bfdfff; padding: 2px 5px; margin-left: 4px; margin-top: 1px;">
{% if link.tags_str != None %}
@ -33,5 +33,5 @@
{% endif %}
</span>
</td>
<td style="text-align:left"><a href="{{link.url}}">{{link.url}}</a></td>
<td style="text-align:left; word-wrap: anywhere;"><a href="{{link.url}}">{{link.url|truncatechars:128}}</a></td>
</tr>

View file

@ -4,7 +4,7 @@
<title>Archived Sites</title>
<meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
</head>
<body data-status="{{status}}">
<body>
<table id="table-bookmarks">
<thead>
<tr class="thead-tr">

View file

@ -2,6 +2,11 @@
{% load static %}
{% block body %}
<style>
#table-bookmarks_info {
display: none;
}
</style>
<div id="toolbar">
<form id="changelist-search" action="{% url 'public-index' %}" method="get">
<div>
@ -21,7 +26,7 @@
<thead>
<tr>
<th style="width: 100px;">Bookmarked</th>
<th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
<th style="width: 26vw;">Snapshot ({{page_obj.paginator.count}})</th>
<th style="width: 140px">Files</th>
<th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
</tr>
@ -33,26 +38,26 @@
</tbody>
</table>
<center>
<br/>
Showing {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} total
<br/>
<span class="step-links">
{% if page_obj.has_previous %}
<a href="{% url 'public-index' %}?page=1">&laquo; first</a>
<a href="{% url 'public-index' %}?page=1">&laquo; first</a> &nbsp;
<a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
&nbsp;
{% endif %}
<span class="current">
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
&nbsp;
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a> &nbsp;
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
{% if page_obj.has_next %}
<a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
<a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last &raquo;</a>
{% endif %}
</span>
<br>
</center>

View file

@ -279,7 +279,7 @@
<div class="col-lg-8">
<img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">
&nbsp;&nbsp;
{{title}}
{{title|safe}}
&nbsp;&nbsp;
<a href="#" class="header-toggle"></a>
<br/>
@ -335,20 +335,21 @@
</div>
<div class="col-lg-4">
<div class="info-chunk">
<h5>🗃 Files</h5>
<h5>🗃 Snapshot ID: <a href="/admin/core/snapshot/{{snapshot_id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|truncatechars:24}}</small></code></a></h5>
<a href="index.json" title="JSON summary of archived link.">JSON</a> |
<a href="warc/" title="Any WARC archives for the page">WARC</a> |
<a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
<a href="git/" title="Any git repos at the url">Git</a> |
<a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
<a href="." title="Webserver-provided index of files directory.">See all...</a>
<a href="/admin/core/snapshot/?id__startswith={{snapshot_id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> |
<a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> |
<a href="." title="Webserver-provided index of files directory.">See all files...</a><br/>
</div>
</div>
</div>
<div class="row header-bottom-frames">
<div class="col-lg-2">
<div class="card selected-card">
<iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./singlefile.html</code></p>
@ -381,7 +382,7 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./{{domain}}</code></p>
@ -393,30 +394,30 @@
{% if SAVE_ARCHIVE_DOT_ORG %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
</a>
<a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
<a href="{{archive_org_path}}" target="preview" id="archive_dot_org-btn"><h4 class="card-title">Archive.Org</h4></a>
</div>
</div>
</div>
{% endif %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>🌐 {{domain}}</code></p>
</a>
<a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
<a href="{{url}}" target="preview" id="original-btn"><h4 class="card-title">Original</h4></a>
</div>
</div>
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./headers.json</code></p>
@ -427,7 +428,7 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./output.html</code></p>
@ -438,7 +439,7 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./readability/content.html</code></p>
@ -450,7 +451,7 @@
<br/>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./mercury/content.html</code></p>
@ -461,7 +462,7 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./media/*.mp4</code></p>
@ -472,7 +473,7 @@
</div>
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
<iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
<div class="card-body">
<a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
<p class="card-text"><code>./git/*.git</code></p>
@ -484,7 +485,7 @@
</div>
</div>
</header>
<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
<iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
<script
src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
@ -493,6 +494,16 @@
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>
<script>
function getPreviewTypeFromPath(link) {
if (link.id == 'original-btn') {
return 'original'
}
if (link.id == 'archive_dot_org-btn') {
return 'archive_dot_org'
}
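// otherwise fall back to the last non-empty path segment, e.g. ".../warc/" -> "warc"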
return link.pathname.split('/').filter(a => a.length).slice(-1)[0].toLowerCase()
}
// show selected file in iframe when preview card is clicked
jQuery('.card').on('click', function(e) {
jQuery('.selected-card').removeClass('selected-card')
@ -502,11 +513,26 @@
if (e.currentTarget.href.endsWith('.pdf')) {
jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
} else {
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
}
window.location.hash = getPreviewTypeFromPath(e.currentTarget)
return true
})
// check URL for hash e.g. #git and load relevant preview
jQuery(document).ready(function() {
if (window.location.hash) {
for (const link of jQuery('a[target=preview]')) {
console.log(link.pathname)
if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
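// simulate a user click so the card gets highlighted and the matching file loads in the preview iframe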
jQuery(link).closest('.card').click()
jQuery(link).click()
link.click()
}
}
}
})
// un-sandbox iframes showing pdfs (required to display pdf viewer)
jQuery('iframe').map(function() {
if (this.src.endsWith('.pdf')) {

View file

@ -209,7 +209,7 @@
<div class="header-top container-fluid">
<div class="row nav">
<div class="col-sm-2">
<a href="/" class="header-archivebox" title="Last updated: {{updated}}">
<a href="/" class="header-archivebox">
<img src="{% static 'archive.png' %}" alt="Logo"/>
ArchiveBox: Index
</a>
@ -243,7 +243,7 @@
<center>
<small>
Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{version}}" title="Git commit">v{{version}}</a> &nbsp; | &nbsp;
version <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{version}}" title="View source code and release info">v{{version}}</a> &nbsp; | &nbsp;
Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
<br/><br/>
{{FOOTER_INFO}}

View file

@ -42,7 +42,7 @@ header {
background-color: #f5dd5d;
}
#stdout {
background-color: #ded;
background-color: #fbfbfb;
padding: 10px 10px;
border-radius: 4px;
white-space: normal;

View file

@ -237,3 +237,40 @@ body.model-snapshot.change-list #content .object-tools {
opacity: 0.1;
filter: grayscale(100%);
}
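/* keep long cmd/output strings compact in the Django admin changelist table */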
#result_list tbody td.field-cmd_str pre,
#result_list tbody td.field-output_str pre {
max-width: 22vw;
word-wrap: anywhere;
white-space: break-spaces;
max-height: 40px;
overflow: hidden;
margin: 2px;
background-color: rgba(0,0,0,0.05);
padding: 1px 4px 16px 8px;
border-radius: 4px;
}
#result_list tbody td.field-extractor {
font-weight: 800;
font-variant: small-caps;
}
#result_list tbody td.field-status {
font-variant: small-caps;
}
.inline-group .tabular td.original p {
margin-top: -33px;
}
tbody .output-link {
float: right;
margin-bottom: -25px;
margin-right: -3px;
margin-top: -4px;
opacity: 0.4;
box-shadow: 4px 4px 4px rgba(0,0,0,0.1);
}
tbody .output-link:hover {opacity: 1;}

Binary file not shown (new image file added, 15 KiB).

View file

@ -0,0 +1,2 @@
User-agent: *
Disallow: /

View file

@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[$-_@.&+]|[!*\(\),]' # or allowed symbols
r'|(?:%[0-9a-fA-F][0-9a-fA-F]))' # or allowed unicode bytes
r'[^\]\[\(\)<>"\'\s]+', # stop parsing at these symbols
r'[^\]\[\(\)<>"\'\s]+' # stop parsing at these symbols
r'))',
re.IGNORECASE,
)
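For context on the change above: wrapping the whole pattern in a zero-width lookahead with a capture group means re.findall() reports every URL it can see, including URLs nested inside other URLs (e.g. web.archive.org links), because each match consumes no characters. A minimal usage sketch, with the pattern re-assembled only for illustration and a made-up sample string:
import re
URL_REGEX = re.compile(
    r'(?=('
    r'http[s]?://'                     # start matching from allowed schemes
    r'(?:[a-zA-Z]|[0-9]'               # followed by allowed alphanum characters
    r'|[$-_@.&+]|[!*\(\),]'            # or allowed symbols
    r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'   # or allowed unicode bytes
    r'[^\]\[\(\)<>"\'\s]+'             # stop parsing at these symbols
    r'))',
    re.IGNORECASE,
)
sample = 'see https://web.archive.org/web/https://example.com/page and http://example.com'
print(re.findall(URL_REGEX, sample))
# prints the outer archive.org URL, the nested https://example.com/page, and http://example.com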

View file

@ -3,6 +3,7 @@
DATA_DIR="${DATA_DIR:-/data}"
ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"
# Set the archivebox user UID & GID
if [[ -n "$PUID" && "$PUID" != 0 ]]; then
usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
@ -11,6 +12,7 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
fi
# Set the permissions of the data dir to match the archivebox user
if [[ -d "$DATA_DIR/archive" ]]; then
# check data directory permissions
@ -33,11 +35,11 @@ if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
# e.g. "archivebox init"
# "/bin/bash"
# "echo"
gosu "$ARCHIVEBOX_USER" bash -c "$*"
exec gosu "$ARCHIVEBOX_USER" bash -c "$*"
else
# no command given, assume args were meant to be passed to archivebox cmd
# e.g. "add https://example.com"
# "manage createsupseruser"
# "server 0.0.0.0:8000"
gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
fi

View file

@ -11,36 +11,39 @@ version: '3.7'
services:
archivebox:
# build: .
# build: . # for developers working on archivebox
image: ${DOCKER_IMAGE:-archivebox/archivebox:latest}
command: server 0.0.0.0:8000
command: server --quick-init 0.0.0.0:8000
stdin_open: true
tty: true
ports:
- 8000:8000
environment:
- USE_COLOR=True
- SHOW_PROGRESS=False
- SEARCH_BACKEND_ENGINE=sonic
- SEARCH_BACKEND_HOST_NAME=sonic
- SEARCH_BACKEND_PASSWORD=SecretPassword
- ALLOWED_HOSTS=* # add any config options you want as env vars
- MEDIA_MAX_SIZE=750m
# - SHOW_PROGRESS=False
# - SEARCH_BACKEND_ENGINE=sonic # uncomment these if you enable sonic below
# - SEARCH_BACKEND_HOST_NAME=sonic
# - SEARCH_BACKEND_PASSWORD=SecretPassword
volumes:
- ./data:/data
depends_on:
- sonic
# - ./archivebox:/app/archivebox # for developers working on archivebox
# Run sonic search backend
sonic:
image: valeriansaliou/sonic:v1.3.0
ports:
- 1491:1491
environment:
- SEARCH_BACKEND_PASSWORD=SecretPassword
volumes:
- ./etc/sonic/config.cfg:/etc/sonic.cfg
- ./data:/var/lib/sonic/store/
# To run the Sonic full-text search backend, create an ./etc/sonic folder
# and download the sonic config file from here into that folder:
# https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg
# sonic:
# image: valeriansaliou/sonic:v1.3.0
# expose:
# - 1491
# environment:
# - SEARCH_BACKEND_PASSWORD=SecretPassword
# volumes:
# - ./etc/sonic/config.cfg:/etc/sonic.cfg
# - ./data/sonic:/var/lib/sonic/store
# Optional Addons: tweak these examples as needed for your specific use case
### Optional Addons: tweak these examples as needed for your specific use case
# Example: Run scheduled imports in a Docker container instead of using cron on the
# host machine, add tasks and see more info with archivebox schedule --help
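Regarding the Sonic setup comment above, a minimal sketch of that one-time download step using only the Python standard library (a curl/wget one-liner works equally well; the path and URL are the ones given in the comment):
from pathlib import Path
from urllib.request import urlopen
# fetch the example Sonic config into ./etc/sonic next to docker-compose.yml
cfg_dir = Path('etc/sonic')
cfg_dir.mkdir(parents=True, exist_ok=True)
url = 'https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg'
(cfg_dir / 'config.cfg').write_bytes(urlopen(url).read())
After that, the commented-out sonic service and the SEARCH_BACKEND_* environment variables above can be re-enabled.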

View file

@ -1,6 +1,6 @@
{
"name": "archivebox",
"version": "0.5.6",
"version": "0.6.0",
"description": "ArchiveBox: The self-hosted internet archive",
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
"license": "MIT",

View file

@ -27,6 +27,49 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME
README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
PYTHON_REQUIRES = ">=3.7"
SETUP_REQUIRES = ["wheel"]
INSTALL_REQUIRES = [
# only add things here that have corresponding apt python3-packages available
# anything added here also needs to be added to our package dependencies in
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# if there is no apt python3-package equivalent, then vendor it instead in
# ./archivebox/vendor/
"requests>=2.24.0",
"atomicwrites>=1.4.0",
"mypy-extensions>=0.4.3",
"django>=3.1.3",
"django-extensions>=3.0.3",
"dateparser",
"ipython",
"youtube-dl",
"python-crontab>=2.5.1",
"croniter>=0.3.34",
"w3lib>=1.22.0",
]
EXTRAS_REQUIRE = {
'sonic': [
"sonic-client>=0.0.5",
],
'dev': [
"setuptools",
"twine",
"wheel",
"flake8",
"ipdb",
"mypy",
"django-stubs",
"sphinx",
"sphinx-rtd-theme",
"recommonmark",
"pytest",
"bottle",
"stdeb",
"django-debug-toolbar",
"djdt_flamegraph",
],
}
# To see when setup.py gets called (uncomment for debugging):
# import sys
# print(PACKAGE_DIR, f" (v{VERSION})")
@ -36,7 +79,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['versio
class DisabledTestCommand(test):
def run(self):
# setup.py test is deprecated, disable it here by force so stdeb doesn't run it
print('Use the ./bin/test.sh script to run tests, not setup.py test.')
print()
print('[X] Running tests via setup.py test is deprecated.')
print(' Hint: Use the ./bin/test.sh script or pytest instead')
setuptools.setup(
@ -50,45 +95,10 @@ setuptools.setup(
long_description_content_type="text/markdown",
url=REPO_URL,
project_urls=PROJECT_URLS,
python_requires=">=3.7",
setup_requires=[
"wheel",
],
install_requires=[
# only add things here that have corresponding apt python3-packages available
# anything added here also needs to be added to our package dependencies in
# stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
# if there is no apt python3-package equivalent, then vendor it instead in
# ./archivebox/vendor/
"requests==2.24.0",
"atomicwrites==1.4.0",
"mypy-extensions==0.4.3",
"django==3.1.3",
"django-extensions==3.0.3",
"dateparser",
"ipython",
"youtube-dl",
"python-crontab==2.5.1",
"croniter==0.3.34",
"w3lib==1.22.0",
],
extras_require={
'dev': [
"setuptools",
"twine",
"wheel",
"flake8",
"ipdb",
"mypy",
"django-stubs",
"sphinx",
"sphinx-rtd-theme",
"recommonmark",
"pytest",
"bottle",
"stdeb",
],
},
python_requires=PYTHON_REQUIRES,
setup_requires=SETUP_REQUIRES,
install_requires=INSTALL_REQUIRES,
extras_require=EXTRAS_REQUIRE,
packages=[PKG_NAME],
include_package_data=True, # see MANIFEST.in
entry_points={

View file

@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extrac
)
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
with open(archived_item_path / "index.json", "r") as f:
with open(archived_item_path / "index.json", "r", encoding='utf-8') as f:
output_json = json.load(f)
assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"
@ -79,7 +79,7 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_di
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
with open(archived_item_path / "index.json", "r") as f:
with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
output_json = json.load(f)
assert output_json["history"] != {}
@ -90,4 +90,4 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
assert (archived_item_path / "warc").exists()
assert not (archived_item_path / "singlefile.html").exists()
assert not (archived_item_path / "singlefile.html").exists()

View file

@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
output_file = archived_item_path / "headers.json"
assert output_file.exists()
headers_file = archived_item_path / 'headers.json'
with open(headers_file) as f:
with open(headers_file, 'r', encoding='utf-8') as f:
headers = pyjson.load(f)
assert headers['Content-Language'] == 'en'
assert headers['Content-Script-Type'] == 'text/javascript'
@ -98,7 +98,7 @@ def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
capture_output=True, env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
output_file = archived_item_path / "headers.json"
with open(output_file) as f:
with open(output_file, 'r', encoding='utf-8') as f:
headers = pyjson.load(f)
assert headers['Content-Language'] == 'en'
assert headers['Content-Script-Type'] == 'text/javascript'
@ -110,6 +110,6 @@ def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
capture_output=True, env=disable_extractors_dict)
archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
output_file = archived_item_path / "headers.json"
with open(output_file) as f:
with open(output_file, 'r', encoding='utf-8') as f:
headers = pyjson.load(f)
assert headers["Status-Code"] == "200"
assert headers["Status-Code"] == "200"

View file

@ -12,12 +12,12 @@ from archivebox.config import OUTPUT_PERMISSIONS
from .fixtures import *
def test_init(tmp_path, process):
assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8")
assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")
def test_update(tmp_path, process):
os.chdir(tmp_path)
update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
assert "updating existing ArchiveBox" in update_process.stdout.decode("utf-8")
def test_add_link(tmp_path, process, disable_extractors_dict):
disable_extractors_dict.update({"USE_WGET": "true"})
@ -28,11 +28,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict):
assert "index.json" in [x.name for x in archived_item_path.iterdir()]
with open(archived_item_path / "index.json", "r") as f:
with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
output_json = json.load(f)
assert "Example Domain" == output_json['history']['title'][0]['output']
with open(archived_item_path / "index.html", "r") as f:
with open(archived_item_path / "index.html", "r", encoding="utf-8") as f:
output_html = f.read()
assert "Example Domain" in output_html
@ -47,7 +47,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):
assert "index.json" in [x.name for x in archived_item_path.iterdir()]
with open(archived_item_path / "index.json", "r") as f:
with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
output_json = json.load(f)
assert "Example Domain" == output_json['history']['title'][0]['output']
@ -75,11 +75,11 @@ def test_collision_urls_different_timestamps(tmp_path, process, disable_extracto
first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
json_index = str(first_archive / "index.json")
with open(json_index, "r") as f:
with open(json_index, "r", encoding="utf-8") as f:
link_details = json.loads(f.read())
link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
with open(json_index, "w") as f:
with open(json_index, "w", encoding="utf-8") as f:
json.dump(link_details, f)
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@ -98,12 +98,12 @@ def test_collision_timestamps_different_urls(tmp_path, process, disable_extracto
archive_folders.remove(first_archive.name)
json_index = str(first_archive / "index.json")
with open(json_index, "r") as f:
with open(json_index, "r", encoding="utf-8") as f:
link_details = json.loads(f.read())
link_details["timestamp"] = archive_folders[0]
with open(json_index, "w") as f:
with open(json_index, "w", encoding="utf-8") as f:
json.dump(link_details, f)
init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)
@ -173,4 +173,4 @@ def test_tags_migration(tmp_path, disable_extractors_dict):
snapshot_id = tag["id"]
tag_name = tag["name"]
# Check each tag migrated is in the previous field
assert tag_name in snapshots_dict[snapshot_id]
assert tag_name in snapshots_dict[snapshot_id]

View file

@ -100,16 +100,18 @@ def test_remove_before(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
conn.commit()
conn.close()
before = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
lowerts = lowerts[0].split(".")[0]
higherts = higherts[0].split(".")[0]
subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', str(before[1])], capture_output=True)
# before is less than, so only the lower snapshot gets deleted
subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', higherts], capture_output=True)
assert (tmp_path / "archive" / timestamp[0][0]).exists()
assert not (tmp_path / "archive" / timestamp[1][0]).exists()
assert not (tmp_path / "archive" / lowerts).exists()
assert (tmp_path / "archive" / higherts).exists()
def test_remove_after(tmp_path, process, disable_extractors_dict):
subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)
@ -118,13 +120,15 @@ def test_remove_after(tmp_path, process, disable_extractors_dict):
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
conn.commit()
conn.close()
after = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
lowerts = lowerts[0].split(".")[0]
higherts = higherts[0].split(".")[0]
subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', str(after[1])], capture_output=True)
# after is greater than or equal to, so both snapshots get deleted
subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', lowerts], capture_output=True)
assert (tmp_path / "archive" / timestamp[1][0]).exists()
assert not (tmp_path / "archive" / timestamp[0][0]).exists()
assert not (tmp_path / "archive" / lowerts).exists()
assert not (tmp_path / "archive" / higherts).exists()

uwsgi.ini Normal file
View file

@ -0,0 +1,13 @@
[uwsgi]
socket = 127.0.0.1:3031
chdir = ../
http = 0.0.0.0:8001
env = OUTPUT_DIR=./data
wsgi-file = archivebox/core/wsgi.py
processes = 4
threads = 1
stats = 127.0.0.1:9191
static-map /static=./archivebox/templates/static
harakiri = 172800
post-buffering = 1
disable-logging = True