mirror of https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-23 12:43:10 +00:00

Merge pull request #655 from ArchiveBox/debug-toolbar

commit 6fb7bbf2fb

65 changed files with 1811 additions and 599 deletions

Dockerfile — 29 lines changed
@@ -50,13 +50,6 @@ RUN apt-get update -qq \
        fontconfig fonts-ipafont-gothic fonts-wqy-zenhei fonts-thai-tlwg fonts-kacst fonts-symbola fonts-noto fonts-freefont-ttf \
        && rm -rf /var/lib/apt/lists/*

    # Install apt development dependencies
    # RUN apt-get install -qq \
    #     && apt-get install -qq -y --no-install-recommends \
    #     python3 python3-dev python3-pip python3-venv python3-all \
    #     dh-python debhelper devscripts dput software-properties-common \
    #     python3-distutils python3-setuptools python3-wheel python3-stdeb

# Install Node environment
RUN curl -s https://deb.nodesource.com/gpgkey/nodesource.gpg.key | apt-key add - \
    && echo 'deb https://deb.nodesource.com/node_15.x buster main' >> /etc/apt/sources.list \

@@ -79,17 +72,26 @@ WORKDIR "$CODE_DIR"
ENV PATH="${PATH}:$VENV_PATH/bin"
RUN python -m venv --clear --symlinks "$VENV_PATH" \
    && pip install --upgrade --quiet pip setuptools
ADD ./pip_dist/archivebox.egg-info/requires.txt "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt"
ADD "./setup.py" "$CODE_DIR/"
ADD "./README.md" "./package.json" "$CODE_DIR/archivebox/"
RUN apt-get update -qq \
    && apt-get install -qq -y --no-install-recommends \
        build-essential python-dev python3-dev \
    # && pip install --upgrade pip \
    && grep -B 1000 -E '^$' "$CODE_DIR/pip_dist/archivebox.egg-info/requires.txt" | pip install --quiet -r /dev/stdin \
    && pip install --quiet "sonic-client==0.0.5" \
    && python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.install_requires + result.extras_require["sonic"]))' > /tmp/requirements.txt \
    && pip install --quiet -r /tmp/requirements.txt \
    && apt-get purge -y build-essential python-dev python3-dev \
    && apt-get autoremove -y \
    && rm -rf /var/lib/apt/lists/*

# Install apt development dependencies
# RUN apt-get install -qq \
#     && apt-get install -qq -y --no-install-recommends \
#     python3 python3-dev python3-pip python3-venv python3-all \
#     dh-python debhelper devscripts dput software-properties-common \
#     python3-distutils python3-setuptools python3-wheel python3-stdeb
# RUN python3 -c 'from distutils.core import run_setup; result = run_setup("./setup.py", stop_after="init"); print("\n".join(result.extras_require["dev"]))' > /tmp/dev_requirements.txt \
#     && pip install --quiet -r /tmp/dev_requirements.txt

# Install ArchiveBox Python package and its dependencies
WORKDIR "$CODE_DIR"
ADD . "$CODE_DIR"

@@ -115,5 +117,8 @@ RUN /app/bin/docker_entrypoint.sh archivebox version
VOLUME "$DATA_DIR"
EXPOSE 8000

HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
    CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1

ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "0.0.0.0:8000"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
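Note on the `grep -B 1000 -E '^$' requires.txt | pip install -r /dev/stdin` line: an egg-info requires.txt lists the core install_requires entries first, followed by optional [extra] sections that are each introduced by a blank line, so the intent here is to feed pip only the core dependency block that precedes the first blank line. A rough Python sketch of that intent (helper name is ours, for illustration):

import sys
from pathlib import Path

def core_requirements(requires_txt='pip_dist/archivebox.egg-info/requires.txt'):
    """Collect dependency lines up to the first blank line; the optional
    [extra] sections that follow it are skipped."""
    core = []
    for line in Path(requires_txt).read_text().splitlines():
        if not line.strip():
            break    # blank line marks the start of the [extra] sections
        core.append(line)
    return core

if __name__ == '__main__':
    sys.stdout.write('\n'.join(core_requirements()))   # pipe into `pip install -r /dev/stdin`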
archivebox/cli/__init__.py
@@ -63,7 +63,11 @@ def run_subcommand(subcommand: str,

    if subcommand not in meta_cmds:
        from ..config import setup_django
        setup_django(in_memory_db=subcommand in fake_db, check_db=subcommand in archive_cmds)

        cmd_requires_db = subcommand in archive_cmds
        init_pending = '--init' in subcommand_args or '--quick-init' in subcommand_args

        setup_django(in_memory_db=subcommand in fake_db, check_db=cmd_requires_db and not init_pending)

    module = import_module('.archivebox_{}'.format(subcommand), __package__)
    module.main(args=subcommand_args, stdin=stdin, pwd=pwd)    # type: ignore
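Note: the net effect of this change is that the database-exists check is deferred whenever the subcommand was itself asked to create the database, so a command like `archivebox server --quick-init 0.0.0.0:8000` can bootstrap an empty collection directory instead of failing the up-front check_db assertion.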
archivebox/cli/archivebox_add.py
@@ -22,6 +22,12 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        add_help=True,
        formatter_class=SmartFormatter,
    )
    parser.add_argument(
        '--tag', '-t',
        type=str,
        default='',
        help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
    )
    parser.add_argument(
        '--update-all', #'-n',
        action='store_true',

@@ -75,7 +81,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
    )
    command = parser.parse_args(args or ())
    urls = command.urls
    stdin_urls = accept_stdin(stdin)

    stdin_urls = ''
    if not urls:
        stdin_urls = accept_stdin(stdin)

    if (stdin_urls and urls) or (not stdin and not urls):
        stderr(
            '[X] You must pass URLs/paths to add via stdin or CLI arguments.\n',

@@ -85,6 +95,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
    add(
        urls=stdin_urls or urls,
        depth=command.depth,
        tag=command.tag,
        update_all=command.update_all,
        index_only=command.index_only,
        overwrite=command.overwrite,
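Note: this change (and the matching ones in config, oneshot, remove, and update below) all follow the same pattern: stdin is only consumed when no values were passed as CLI arguments, so a command invoked with arguments never blocks waiting on an unclosed pipe. A minimal sketch of the pattern (names are ours, for illustration):

import sys

def resolve_input(cli_values, stdin=None):
    """Prefer CLI arguments; fall back to reading the pipe only when none are given."""
    stdin_values = ''
    if not cli_values:
        stdin_values = (stdin or sys.stdin).read().strip()
    if (stdin_values and cli_values) or (not stdin_values and not cli_values):
        raise SystemExit('[X] Pass input via stdin or CLI arguments (and not both).')
    return stdin_values or cli_values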
archivebox/cli/archivebox_config.py
@@ -45,7 +45,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        help='KEY or KEY=VALUE formatted config values to get or set',
    )
    command = parser.parse_args(args or ())
    config_options_str = accept_stdin(stdin)

    config_options_str = ''
    if not command.config_options:
        config_options_str = accept_stdin(stdin)

    config(
        config_options_str=config_options_str,
archivebox/cli/archivebox_init.py
@@ -27,11 +27,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        action='store_true',
        help='Ignore unrecognized files in current directory and initialize anyway',
    )
    parser.add_argument(
        '--quick', '-q',
        action='store_true',
        help='Run any updates or migrations without rechecking all snapshot dirs',
    )
    command = parser.parse_args(args or ())
    reject_stdin(__command__, stdin)

    init(
        force=command.force,
        quick=command.quick,
        out_dir=pwd or OUTPUT_DIR,
    )
archivebox/cli/archivebox_list.py
@@ -12,6 +12,7 @@ from ..main import list_all
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
    LINK_FILTERS,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,

@@ -23,7 +24,7 @@ from ..index import (
    get_corrupted_folders,
    get_unrecognized_folders,
)
from ..logging_util import SmartFormatter, accept_stdin, stderr
from ..logging_util import SmartFormatter, reject_stdin, stderr


@docstring(list_all.__doc__)

@@ -44,7 +45,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
    group.add_argument(
        '--json', #'-j',
        action='store_true',
        help="Print the output in JSON format with all columns included.",
        help="Print the output in JSON format with all columns included",
    )
    group.add_argument(
        '--html',

@@ -59,19 +60,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
    parser.add_argument(
        '--sort', #'-s',
        type=str,
        help="List the links sorted using the given key, e.g. timestamp or updated.",
        help="List the links sorted using the given key, e.g. timestamp or updated",
        default=None,
    )
    parser.add_argument(
        '--before', #'-b',
        type=float,
        help="List only links bookmarked before the given timestamp.",
        help="List only links bookmarked before (less than) the given timestamp",
        default=None,
    )
    parser.add_argument(
        '--after', #'-a',
        type=float,
        help="List only links bookmarked after the given timestamp.",
        help="List only links bookmarked after (greater than or equal to) the given timestamp",
        default=None,
    )
    parser.add_argument(

@@ -96,9 +97,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        )
    )
    parser.add_argument(
        '--filter-type',
        '--filter-type', '-t',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )

@@ -107,20 +108,19 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        nargs='*',
        type=str,
        default=None,
        help='List only URLs matching these filter patterns.'
        help='List only URLs matching these filter patterns'
    )
    command = parser.parse_args(args or ())
    filter_patterns_str = accept_stdin(stdin)
    reject_stdin(stdin)

    if command.with_headers and not (command.json or command.html or command.csv):
        stderr(
            '[X] --with-headers can only be used with --json, --html or --csv options.\n',
            '[X] --with-headers can only be used with --json, --html or --csv options\n',
            color='red',
        )
        raise SystemExit(2)

    matching_folders = list_all(
        filter_patterns_str=filter_patterns_str,
        filter_patterns=command.filter_patterns,
        filter_type=command.filter_type,
        status=command.status,
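Note: deriving the CLI choices from LINK_FILTERS means any new filter type registered in the index package shows up in `archivebox list`, `archivebox update`, and `archivebox remove` automatically; only 'search' is special-cased, since it is handled by the search backend rather than a link filter. Assuming LINK_FILTERS is the dict mapping filter-type names to predicates in archivebox/index/__init__.py, the new expression expands to roughly the old hardcoded tuple:

choices = (*LINK_FILTERS.keys(), 'search')
# == ('exact', 'substring', 'domain', 'regex', 'tag', 'search')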
archivebox/cli/archivebox_oneshot.py
@@ -50,8 +50,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        help= "Path to save the single archive folder to, e.g. ./example.com_archive"
    )
    command = parser.parse_args(args or ())
    stdin_url = None
    url = command.url
    stdin_url = accept_stdin(stdin)
    if not url:
        stdin_url = accept_stdin(stdin)

    if (stdin_url and url) or (not stdin and not url):
        stderr(
            '[X] You must pass a URL/path to add via stdin or CLI arguments.\n',
archivebox/cli/archivebox_remove.py
@@ -61,7 +61,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        help='URLs matching this filter pattern will be removed from the index.'
    )
    command = parser.parse_args(args or ())
    filter_str = accept_stdin(stdin)

    filter_str = None
    if not command.filter_patterns:
        filter_str = accept_stdin(stdin)

    remove(
        filter_str=filter_str,
archivebox/cli/archivebox_server.py
@@ -38,10 +38,20 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        action='store_true',
        help='Enable DEBUG=True mode with more verbose errors',
    )
    parser.add_argument(
        '--nothreading',
        action='store_true',
        help='Force runserver to run in single-threaded mode',
    )
    parser.add_argument(
        '--init',
        action='store_true',
        help='Run archivebox init before starting the server',
        help='Run a full archivebox init/upgrade before starting the server',
    )
    parser.add_argument(
        '--quick-init', '-i',
        action='store_true',
        help='Run quick archivebox init/upgrade before starting the server',
    )
    parser.add_argument(
        '--createsuperuser',

@@ -52,10 +62,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
    reject_stdin(__command__, stdin)

    server(
        runserver_args=command.runserver_args,
        runserver_args=command.runserver_args + (['--nothreading'] if command.nothreading else []),
        reload=command.reload,
        debug=command.debug,
        init=command.init,
        quick_init=command.quick_init,
        createsuperuser=command.createsuperuser,
        out_dir=pwd or OUTPUT_DIR,
    )
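Note: this is the hook for the PR's headline feature. `--nothreading` is forwarded into Django's runserver arguments, and core/settings.py (further down in this diff) only enables django-debug-toolbar when it sees `--nothreading` in sys.argv, because the toolbar does not work reliably in multithreaded runserver mode.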
archivebox/cli/archivebox_update.py
@@ -12,6 +12,7 @@ from ..main import update
from ..util import docstring
from ..config import OUTPUT_DIR
from ..index import (
    LINK_FILTERS,
    get_indexed_folders,
    get_archived_folders,
    get_unarchived_folders,

@@ -89,9 +90,9 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        )
    )
    parser.add_argument(
        '--filter-type',
        '--filter-type', '-t',
        type=str,
        choices=('exact', 'substring', 'domain', 'regex', 'tag', 'search'),
        choices=(*LINK_FILTERS.keys(), 'search'),
        default='exact',
        help='Type of pattern matching to use when filtering URLs',
    )

@@ -110,7 +111,10 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
        default=""
    )
    command = parser.parse_args(args or ())
    filter_patterns_str = accept_stdin(stdin)

    filter_patterns_str = None
    if not command.filter_patterns:
        filter_patterns_str = accept_stdin(stdin)

    update(
        resume=command.resume,
archivebox/cli/tests.py — 227 lines (new file)
@@ -0,0 +1,227 @@
#!/usr/bin/env python3

__package__ = 'archivebox.cli'


import os
import sys
import shutil
import unittest
from pathlib import Path

from contextlib import contextmanager

TEST_CONFIG = {
    'USE_COLOR': 'False',
    'SHOW_PROGRESS': 'False',

    'OUTPUT_DIR': 'data.tests',

    'SAVE_ARCHIVE_DOT_ORG': 'False',
    'SAVE_TITLE': 'False',

    'USE_CURL': 'False',
    'USE_WGET': 'False',
    'USE_GIT': 'False',
    'USE_CHROME': 'False',
    'USE_YOUTUBEDL': 'False',
}

OUTPUT_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)

from ..main import init
from ..index import load_main_index
from ..config import (
    SQL_INDEX_FILENAME,
    JSON_INDEX_FILENAME,
    HTML_INDEX_FILENAME,
)

from . import (
    archivebox_init,
    archivebox_add,
    archivebox_remove,
)

HIDE_CLI_OUTPUT = True

test_urls = '''
https://example1.com/what/is/happening.html?what=1#how-about-this=1
https://example2.com/what/is/happening/?what=1#how-about-this=1
HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
https://example4.com/what/is/happening.html
https://example5.com/
https://example6.com

<test>http://example7.com</test>
[https://example8.com/what/is/this.php?what=1]
[and http://example9.com?what=1&other=3#and-thing=2]
<what>https://example10.com#and-thing=2 "</about>
abc<this["https://subb.example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
sdflkf[what](https://subb.example12.com/who/what.php?whoami=1#whatami=2)?am=hi
example13.bada
and example14.badb
<or>htt://example15.badc</that>
'''

stdout = sys.stdout
stderr = sys.stderr


@contextmanager
def output_hidden(show_failing=True):
    if not HIDE_CLI_OUTPUT:
        yield
        return

    sys.stdout = open('stdout.txt', 'w+', encoding='utf-8')
    sys.stderr = open('stderr.txt', 'w+', encoding='utf-8')
    try:
        yield
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = stdout
        sys.stderr = stderr
    except Exception:
        sys.stdout.close()
        sys.stderr.close()
        sys.stdout = stdout
        sys.stderr = stderr
        if show_failing:
            with open('stdout.txt', 'r', encoding='utf-8') as f:
                print(f.read())
            with open('stderr.txt', 'r', encoding='utf-8') as f:
                print(f.read())
        raise
    finally:
        os.remove('stdout.txt')
        os.remove('stderr.txt')


class TestInit(unittest.TestCase):
    def setUp(self):
        os.makedirs(OUTPUT_DIR, exist_ok=True)

    def tearDown(self):
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

    def test_basic_init(self):
        with output_hidden():
            archivebox_init.main([])

        assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
        assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
        assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
        assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0

    def test_conflicting_init(self):
        with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+', encoding='utf-8') as f:
            f.write('test')

        try:
            with output_hidden(show_failing=False):
                archivebox_init.main([])
            assert False, 'Init should have exited with an exception'
        except SystemExit:
            pass

        assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
        assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
        assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
        try:
            load_main_index(out_dir=OUTPUT_DIR)
            assert False, 'load_main_index should raise an exception when no index is present'
        except Exception:
            pass

    def test_no_dirty_state(self):
        with output_hidden():
            init()
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)
        with output_hidden():
            init()


class TestAdd(unittest.TestCase):
    def setUp(self):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with output_hidden():
            init()

    def tearDown(self):
        shutil.rmtree(OUTPUT_DIR, ignore_errors=True)

    def test_add_arg_url(self):
        with output_hidden():
            archivebox_add.main(['https://getpocket.com/users/nikisweeting/feed/all'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 30

    def test_add_arg_file(self):
        test_file = Path(OUTPUT_DIR) / 'test.txt'
        with open(test_file, 'w+', encoding='utf') as f:
            f.write(test_urls)

        with output_hidden():
            archivebox_add.main([test_file])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 12
        os.remove(test_file)

    def test_add_stdin_url(self):
        with output_hidden():
            archivebox_add.main([], stdin=test_urls)

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 12


class TestRemove(unittest.TestCase):
    def setUp(self):
        os.makedirs(OUTPUT_DIR, exist_ok=True)
        with output_hidden():
            init()
            archivebox_add.main([], stdin=test_urls)

    # def tearDown(self):
    #     shutil.rmtree(OUTPUT_DIR, ignore_errors=True)


    def test_remove_exact(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', 'https://example5.com/'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 11

    def test_remove_regex(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=regex', r'http(s)?:\/\/(.+\.)?(example\d\.com)'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 4

    def test_remove_domain(self):
        with output_hidden():
            archivebox_remove.main(['--yes', '--delete', '--filter-type=domain', 'example5.com', 'example6.com'])

        all_links = load_main_index(out_dir=OUTPUT_DIR)
        assert len(all_links) == 10

    def test_remove_none(self):
        try:
            with output_hidden(show_failing=False):
                archivebox_remove.main(['--yes', '--delete', 'https://doesntexist.com'])
            assert False, 'Should raise if no URLs match'
        except Exception:
            pass


if __name__ == '__main__':
    if '--verbose' in sys.argv or '-v' in sys.argv:
        HIDE_CLI_OUTPUT = False

    unittest.main()
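Note: these tests run against a throwaway `data.tests` collection with every extractor disabled, so they exercise only the index/CLI plumbing. Judging by the `__main__` block, they can be run directly with `python -m archivebox.cli.tests -v`, where `-v`/`--verbose` also un-hides the captured stdout/stderr. test_add_arg_url depends on a live getpocket.com feed returning exactly 30 entries, so it can fail offline.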
archivebox/config.py
@@ -29,10 +29,12 @@ import json
import getpass
import platform
import shutil
import sqlite3
import django

from hashlib import md5
from pathlib import Path
from datetime import datetime
from typing import Optional, Type, Tuple, Dict, Union, List
from subprocess import run, PIPE, DEVNULL
from configparser import ConfigParser

@@ -77,6 +79,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        'PUBLIC_SNAPSHOTS': {'type': bool, 'default': True},
        'PUBLIC_ADD_VIEW': {'type': bool, 'default': False},
        'FOOTER_INFO': {'type': str, 'default': 'Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.'},
        'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
    },

    'ARCHIVE_METHOD_TOGGLES': {

@@ -99,8 +102,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

    'ARCHIVE_METHOD_OPTIONS': {
        'RESOLUTION': {'type': str, 'default': '1440,2000', 'aliases': ('SCREENSHOT_RESOLUTION',)},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com'},
        'GIT_DOMAINS': {'type': str, 'default': 'github.com,bitbucket.org,gitlab.com,gist.github.com'},
        'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
        'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

        'CURL_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
        'WGET_USER_AGENT': {'type': str, 'default': 'ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},

@@ -111,7 +115,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {

    'CHROME_HEADLESS': {'type': bool, 'default': True},
    'CHROME_SANDBOX': {'type': bool, 'default': lambda c: not c['IN_DOCKER']},
    'YOUTUBEDL_ARGS': {'type': list, 'default': ['--write-description',
    'YOUTUBEDL_ARGS': {'type': list, 'default': lambda c: ['--write-description',
        '--write-info-json',
        '--write-annotations',
        '--write-thumbnail',

@@ -122,7 +126,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
        '--ignore-errors',
        '--geo-bypass',
        '--add-metadata',
        '--max-filesize=750m',
        '--max-filesize={}'.format(c['MEDIA_MAX_SIZE']),
    ]},


@@ -287,7 +291,6 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {

    'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
    'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text().strip())['version']},
    'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},

    'PYTHON_BINARY': {'default': lambda c: sys.executable},
    'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},

@@ -459,7 +462,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
    config_file.optionxform = str
    config_file.read(config_path)

    with open(config_path, 'r') as old:
    with open(config_path, 'r', encoding='utf-8') as old:
        atomic_write(f'{config_path}.bak', old.read())

    find_section = lambda key: [name for name, opts in CONFIG_SCHEMA.items() if key in opts][0]

@@ -480,14 +483,14 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:

    if (not existing_secret_key) or ('not a valid secret' in existing_secret_key):
        from django.utils.crypto import get_random_string
        chars = 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.'
        chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789_'
        random_secret_key = get_random_string(50, chars)
        if 'SERVER_CONFIG' in config_file:
            config_file['SERVER_CONFIG']['SECRET_KEY'] = random_secret_key
        else:
            config_file['SERVER_CONFIG'] = {'SECRET_KEY': random_secret_key}

    with open(config_path, 'w+') as new:
    with open(config_path, 'w+', encoding='utf-8') as new:
        config_file.write(new)

    try:

@@ -499,7 +502,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
        }
    except:
        # something went horribly wrong, rever to the previous version
        with open(f'{config_path}.bak', 'r') as old:
        with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
            atomic_write(config_path, old.read())

        if Path(f'{config_path}.bak').exists():

@@ -1062,23 +1065,72 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,

    try:
        import django
        from django.core.management import call_command

        sys.path.append(str(config['PACKAGE_DIR']))
        os.environ.setdefault('OUTPUT_DIR', str(output_dir))
        assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')

        # Check to make sure JSON extension is available in our Sqlite3 instance
        try:
            cursor = sqlite3.connect(':memory:').cursor()
            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
        except sqlite3.OperationalError as exc:
            stderr('[X] Your SQLite3 version is missing the required JSON1 extension', color='red')
            hint([
                'Upgrade your Python version or install the extension manually:',
                'https://code.djangoproject.com/wiki/JSON1Extension'
            ])

        if in_memory_db:
            # Put the db in memory and run migrations in case any command requires it
            from django.core.management import call_command
            # some commands (e.g. oneshot) dont store a long-lived sqlite3 db file on disk.
            # in those cases we create a temporary in-memory db and run the migrations
            # immediately to get a usable in-memory-database at startup
            os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
            django.setup()
            call_command("migrate", interactive=False, verbosity=0)
        else:
            # Otherwise use default sqlite3 file-based database and initialize django
            # without running migrations automatically (user runs them manually by calling init)
            django.setup()


        from django.conf import settings

        # log startup message to the error log
        with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
            command = ' '.join(sys.argv)
            ts = datetime.now().strftime('%Y-%m-%d__%H:%M:%S')
            f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")


        if check_db:
            # Enable WAL mode in sqlite3
            from django.db import connection
            with connection.cursor() as cursor:
                current_mode = cursor.execute("PRAGMA journal_mode")
                if current_mode != 'wal':
                    cursor.execute("PRAGMA journal_mode=wal;")

            # Create cache table in DB if needed
            try:
                from django.core.cache import cache
                cache.get('test', None)
            except django.db.utils.OperationalError:
                call_command("createcachetable", verbosity=0)


            # if archivebox gets imported multiple times, we have to close
            # the sqlite3 whenever we init from scratch to avoid multiple threads
            # sharing the same connection by accident
            from django.db import connections
            for conn in connections.all():
                conn.close_if_unusable_or_obsolete()

            sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
            assert sql_index_path.exists(), (
                f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')

    except KeyboardInterrupt:
        raise SystemExit(2)
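Note: the in-memory JSON() probe above exists because the new ArchiveResult.cmd column is a JSONField, which requires SQLite compiled with the JSON1 extension (absent from some older system Python builds). A standalone version of the same check, as a hedged sketch:

import sqlite3

def sqlite_has_json1() -> bool:
    """Return True if the stdlib sqlite3 was compiled with the JSON1 extension."""
    try:
        sqlite3.connect(':memory:').execute('SELECT JSON(\'{"a": "b"}\')')
        return True
    except sqlite3.OperationalError:
        return False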
archivebox/core/admin.py
@@ -1,6 +1,7 @@
__package__ = 'archivebox.core'

from io import StringIO
from pathlib import Path
from contextlib import redirect_stdout

from django.contrib import admin

@@ -13,15 +14,15 @@ from django import forms

from ..util import htmldecode, urldecode, ansi_to_html

from core.models import Snapshot, Tag
from core.forms import AddLinkForm, TagField
from core.models import Snapshot, ArchiveResult, Tag
from core.forms import AddLinkForm

from core.mixins import SearchResultsAdminMixin

from index.html import snapshot_icons
from logging_util import printable_filesize
from main import add, remove
from config import OUTPUT_DIR
from config import OUTPUT_DIR, SNAPSHOTS_PER_PAGE
from extractors import archive_links

# Admin URLs

@@ -36,77 +37,34 @@ from extractors import archive_links

# TODO: https://stackoverflow.com/questions/40760880/add-custom-button-to-django-admin-panel

def update_snapshots(modeladmin, request, queryset):
    archive_links([
        snapshot.as_link()
        for snapshot in queryset
    ], out_dir=OUTPUT_DIR)
update_snapshots.short_description = "Archive"

def update_titles(modeladmin, request, queryset):
    archive_links([
        snapshot.as_link()
        for snapshot in queryset
    ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
update_titles.short_description = "Pull title"
class ArchiveResultInline(admin.TabularInline):
    model = ArchiveResult

def overwrite_snapshots(modeladmin, request, queryset):
    archive_links([
        snapshot.as_link()
        for snapshot in queryset
    ], overwrite=True, out_dir=OUTPUT_DIR)
overwrite_snapshots.short_description = "Re-archive (overwrite)"
class TagInline(admin.TabularInline):
    model = Snapshot.tags.through

def verify_snapshots(modeladmin, request, queryset):
    for snapshot in queryset:
        print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))

verify_snapshots.short_description = "Check"

def delete_snapshots(modeladmin, request, queryset):
    remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)

delete_snapshots.short_description = "Delete"
from django.contrib.admin.helpers import ActionForm


class SnapshotAdminForm(forms.ModelForm):
    tags = TagField(required=False)

    class Meta:
        model = Snapshot
        fields = "__all__"

    def save(self, commit=True):
        # Based on: https://stackoverflow.com/a/49933068/3509554

        # Get the unsave instance
        instance = forms.ModelForm.save(self, False)
        tags = self.cleaned_data.pop("tags")

        #update save_m2m
        def new_save_m2m():
            instance.save_tags(tags)

        # Do we need to save all changes now?
        self.save_m2m = new_save_m2m
        if commit:
            instance.save()

        return instance
class SnapshotActionForm(ActionForm):
    tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)


class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
    list_display = ('added', 'title_str', 'url_str', 'files', 'size')
    sort_fields = ('title_str', 'url_str', 'added')
    readonly_fields = ('id', 'url', 'timestamp', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
    readonly_fields = ('uuid', 'num_outputs', 'is_archived', 'url_hash', 'added', 'updated')
    search_fields = ['url__icontains', 'timestamp', 'title', 'tags__name']
    fields = (*readonly_fields, 'title', 'tags')
    fields = ('timestamp', 'url', 'title', 'tags', *readonly_fields)
    list_filter = ('added', 'updated', 'tags')
    ordering = ['-added']
    actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots]
    actions_template = 'admin/actions_as_select.html'
    form = SnapshotAdminForm
    list_per_page = 40
    actions = ['delete_snapshots', 'overwrite_snapshots', 'update_snapshots', 'update_titles', 'verify_snapshots', 'add_tag', 'remove_tag']
    autocomplete_fields = ['tags']
    inlines = [ArchiveResultInline]
    list_per_page = SNAPSHOTS_PER_PAGE

    action_form = SnapshotActionForm

    def get_urls(self):
        urls = super().get_urls()

@@ -116,21 +74,46 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        return custom_urls + urls

    def get_queryset(self, request):
        self.request = request
        return super().get_queryset(request).prefetch_related('tags')

    def tag_list(self, obj):
        return ', '.join(obj.tags.values_list('name', flat=True))

    def id_str(self, obj):
    # TODO: figure out a different way to do this, you cant nest forms so this doenst work
    # def action(self, obj):
    #     # csrfmiddlewaretoken: Wa8UcQ4fD3FJibzxqHN3IYrrjLo4VguWynmbzzcPYoebfVUnDovon7GEMYFRgsh0
    #     # action: update_snapshots
    #     # select_across: 0
    #     # _selected_action: 76d29b26-2a88-439e-877c-a7cca1b72bb3
    #     return format_html(
    #         '''
    #         <form action="/admin/core/snapshot/" method="post" onsubmit="e => e.stopPropagation()">
    #             <input type="hidden" name="csrfmiddlewaretoken" value="{}">
    #             <input type="hidden" name="_selected_action" value="{}">
    #             <button name="update_snapshots">Check</button>
    #             <button name="update_titles">Pull title + favicon</button>
    #             <button name="update_snapshots">Update</button>
    #             <button name="overwrite_snapshots">Re-Archive (overwrite)</button>
    #             <button name="delete_snapshots">Permanently delete</button>
    #         </form>
    #         ''',
    #         csrf.get_token(self.request),
    #         obj.id,
    #     )

    def uuid(self, obj):
        return format_html(
            '<code style="font-size: 10px">{}</code>',
            obj.url_hash[:8],
            '<code style="font-size: 10px">{}</code><br/><a href="/archive/{}">View index ➡️</a> <a href="/admin/core/snapshot/?id__exact={}">View actions ⚙️</a>',
            obj.id,
            obj.timestamp,
            obj.id,
        )

    def title_str(self, obj):
        canon = obj.as_link().canonical_outputs()
        tags = ''.join(
            format_html('<a href="/admin/core/snapshot/?tags__id__exact={}"><span class="tag">{}</span></a> ', tag.id, tag)
            format_html('<a href="/admin/core/snapshot/?id__startswith={}"><span class="tag">{}</span></a> ', tag.id, tag)
            for tag in obj.tags.all()
            if str(tag).strip()
        )

@@ -152,7 +135,7 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        return snapshot_icons(obj)

    def size(self, obj):
        archive_size = obj.archive_size
        archive_size = (Path(obj.link_dir) / 'index.html').exists() and obj.archive_size
        if archive_size:
            size_txt = printable_filesize(archive_size)
            if archive_size > 52428800:

@@ -190,28 +173,136 @@ class SnapshotAdmin(SearchResultsAdminMixin, admin.ModelAdmin):
        rendered_response = self.changelist_view(request)

        # Restore values
        self.change_list_template = saved_change_list_template
        self.change_list_template = saved_change_list_template
        self.list_per_page = saved_list_per_page
        self.list_max_show_all = saved_list_max_show_all

        return rendered_response


    def update_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], out_dir=OUTPUT_DIR)
    update_snapshots.short_description = "Archive"

    def update_titles(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, methods=('title','favicon'), out_dir=OUTPUT_DIR)
    update_titles.short_description = "Pull title"

    def overwrite_snapshots(self, request, queryset):
        archive_links([
            snapshot.as_link()
            for snapshot in queryset
        ], overwrite=True, out_dir=OUTPUT_DIR)
    overwrite_snapshots.short_description = "Re-archive (overwrite)"

    def verify_snapshots(self, request, queryset):
        for snapshot in queryset:
            print(snapshot.timestamp, snapshot.url, snapshot.is_archived, snapshot.archive_size, len(snapshot.history))

    verify_snapshots.short_description = "Check"

    def delete_snapshots(self, request, queryset):
        remove(snapshots=queryset, yes=True, delete=True, out_dir=OUTPUT_DIR)

    delete_snapshots.short_description = "Delete"

    def add_tag(self, request, queryset):
        if tag and tag.id:
        tag = request.POST['tag']
        for obj in queryset:
            obj.tags.add(tag)

    add_tag.short_description = "Add tag"

    def remove_tag(self, request, queryset):
        tag = request.POST['tag']
        for obj in queryset:
            obj.tags.remove(tag)

    remove_tag.short_description = "Remove tag"



    id_str.short_description = 'ID'
    title_str.short_description = 'Title'
    url_str.short_description = 'Original URL'

    id_str.admin_order_field = 'id'
    title_str.admin_order_field = 'title'
    url_str.admin_order_field = 'url'



class TagAdmin(admin.ModelAdmin):
    list_display = ('slug', 'name', 'id')
    list_display = ('slug', 'name', 'num_snapshots', 'snapshots', 'id')
    sort_fields = ('id', 'name', 'slug')
    readonly_fields = ('id',)
    readonly_fields = ('id', 'num_snapshots', 'snapshots')
    search_fields = ('id', 'name', 'slug')
    fields = (*readonly_fields, 'name', 'slug')
    actions = ['delete_selected']
    ordering = ['-id']

    def num_snapshots(self, obj):
        return format_html(
            '<a href="/admin/core/snapshot/?tags__id__exact={}">{} total</a>',
            obj.id,
            obj.snapshot_set.count(),
        )

    def snapshots(self, obj):
        total_count = obj.snapshot_set.count()
        return mark_safe('<br/>'.join(
            format_html(
                '{} <code><a href="/admin/core/snapshot/{}/change"><b>[{}]</b></a> {}</code>',
                snap.updated.strftime('%Y-%m-%d %H:%M') if snap.updated else 'pending...',
                snap.id,
                snap.timestamp,
                snap.url,
            )
            for snap in obj.snapshot_set.order_by('-updated')[:10]
        ) + (f'<br/><a href="/admin/core/snapshot/?tags__id__exact={obj.id}">and {total_count-10} more...<a>' if obj.snapshot_set.count() > 10 else ''))


class ArchiveResultAdmin(admin.ModelAdmin):
    list_display = ('id', 'start_ts', 'extractor', 'snapshot_str', 'cmd_str', 'status', 'output_str')
    sort_fields = ('start_ts', 'extractor', 'status')
    readonly_fields = ('id', 'uuid', 'snapshot_str')
    search_fields = ('id', 'uuid', 'snapshot__url', 'extractor', 'output', 'cmd_version', 'cmd', 'snapshot__timestamp')
    fields = (*readonly_fields, 'snapshot', 'snapshot__tags', 'extractor', 'status', 'start_ts', 'end_ts', 'pwd', 'cmd', 'cmd_version', 'output')
    autocomplete_fields = ['snapshot']

    list_filter = ('status', 'extractor', 'start_ts', 'cmd_version')
    ordering = ['-start_ts']
    list_per_page = SNAPSHOTS_PER_PAGE

    def snapshot_str(self, obj):
        return format_html(
            '<a href="/archive/{}/index.html"><b><code>[{}]</code></b></a><br/>'
            '<small>{}</small>',
            obj.snapshot.timestamp,
            obj.snapshot.timestamp,
            obj.snapshot.url[:128],
        )

    def cmd_str(self, obj):
        return format_html(
            '<pre>{}</pre>',
            ' '.join(obj.cmd) if isinstance(obj.cmd, list) else str(obj.cmd),
        )

    def output_str(self, obj):
        return format_html(
            '<a href="/archive/{}/{}" class="output-link">↗️</a><pre>{}</pre>',
            obj.snapshot.timestamp,
            obj.output if (obj.status == 'succeeded') and obj.extractor not in ('title', 'archive_org') else 'index.html',
            obj.output,
        )

    snapshot_str.short_description = 'snapshot'

class ArchiveBoxAdmin(admin.AdminSite):
    site_header = 'ArchiveBox'

@@ -266,4 +357,5 @@ admin.site = ArchiveBoxAdmin()
admin.site.register(get_user_model())
admin.site.register(Snapshot, SnapshotAdmin)
admin.site.register(Tag, TagAdmin)
admin.site.register(ArchiveResult, ArchiveResultAdmin)
admin.site.disable_action('delete_selected')
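Note: the structural change here is that the old module-level admin actions became SnapshotAdmin methods (referenced by name in `actions`), and `action_form = SnapshotActionForm` adds a tag dropdown next to the admin action select, whose submitted value the add_tag/remove_tag actions read back out of request.POST. A hedged sketch of that Django pattern (class names are stand-ins, and the POST lookup is placed before the guard, which is presumably the order the committed add_tag intends):

from django import forms
from django.contrib import admin
from django.contrib.admin.helpers import ActionForm

from core.models import Snapshot, Tag

class TagActionForm(ActionForm):                   # illustrative stand-in for SnapshotActionForm
    tag = forms.ModelChoiceField(queryset=Tag.objects.all(), required=False)

class ExampleSnapshotAdmin(admin.ModelAdmin):      # illustrative stand-in for SnapshotAdmin
    action_form = TagActionForm
    actions = ['add_tag']

    def add_tag(self, request, queryset):
        tag = request.POST.get('tag')              # pk submitted by the ActionForm field
        if tag:
            for obj in queryset:
                obj.tags.add(tag)                  # RelatedManager.add() accepts pks
    add_tag.short_description = 'Add tag'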
archivebox/core/forms.py
@@ -20,7 +20,8 @@ ARCHIVE_METHODS = [

class AddLinkForm(forms.Form):
    url = forms.RegexField(label="URLs (one per line)", regex=URL_REGEX, min_length='6', strip=True, widget=forms.Textarea, required=True)
    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, widget=forms.RadioSelect, initial='0')
    tag = forms.CharField(label="Tags (comma separated tag1,tag2,tag3)", strip=True, required=False)
    depth = forms.ChoiceField(label="Archive depth", choices=CHOICES, initial='0', widget=forms.RadioSelect(attrs={"class": "depth-selection"}))
    archive_methods = forms.MultipleChoiceField(
        label="Archive methods (select at least 1, otherwise all will be used by default)",
        required=False,
archivebox/core/migrations/0009_auto_20210216_1038.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-16 10:38

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0008_auto_20210105_1421'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='updated',
            field=models.DateTimeField(auto_now=True, db_index=True, null=True),
        ),
    ]

archivebox/core/migrations/0010_auto_20210216_1055.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-16 10:55

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0009_auto_20210216_1038'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='start_ts',
            field=models.DateTimeField(db_index=True),
        ),
    ]

archivebox/core/migrations/0011_auto_20210216_1331.py — 24 lines (new file)
@@ -0,0 +1,24 @@
# Generated by Django 3.1.3 on 2021-02-16 13:31

from django.db import migrations, models
import uuid


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0010_auto_20210216_1055'),
    ]

    operations = [
        migrations.AddField(
            model_name='archiveresult',
            name='uuid',
            field=models.UUIDField(default=uuid.uuid4, editable=False),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
        ),
    ]

archivebox/core/migrations/0012_auto_20210216_1425.py — 23 lines (new file)
@@ -0,0 +1,23 @@
# Generated by Django 3.1.3 on 2021-02-16 14:25

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0011_auto_20210216_1331'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='cmd_version',
            field=models.CharField(blank=True, default=None, max_length=128, null=True),
        ),
        migrations.AlterField(
            model_name='archiveresult',
            name='output',
            field=models.CharField(max_length=1024),
        ),
    ]

archivebox/core/migrations/0013_auto_20210218_0729.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:29

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0012_auto_20210216_1425'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=256, null=True),
        ),
    ]

archivebox/core/migrations/0014_auto_20210218_0729.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:29

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0013_auto_20210218_0729'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=1024, null=True),
        ),
    ]

archivebox/core/migrations/0015_auto_20210218_0730.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 07:30

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0014_auto_20210218_0729'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='title',
            field=models.CharField(blank=True, db_index=True, max_length=512, null=True),
        ),
    ]

archivebox/core/migrations/0016_auto_20210218_1204.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-18 12:04

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0015_auto_20210218_0730'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='tags',
            field=models.ManyToManyField(blank=True, to='core.Tag'),
        ),
    ]

archivebox/core/migrations/0017_auto_20210219_0211.py — 18 lines (new file)
@@ -0,0 +1,18 @@
# Generated by Django 3.1.3 on 2021-02-19 02:11

from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('core', '0016_auto_20210218_1204'),
    ]

    operations = [
        migrations.AlterField(
            model_name='tag',
            name='slug',
            field=models.SlugField(blank=True, max_length=100, unique=True, verbose_name='slug'),
        ),
    ]
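Note: reading 0013-0015 together, Snapshot.title was resized in quick succession (256 → 1024 → 512) while this branch was being developed; 512 is the value that sticks, matching max_length=512 on the model below. 0016 and 0017 make tags optional and let Tag.slug be blank so it can be auto-set on save.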
archivebox/core/models.py
@@ -2,12 +2,15 @@ __package__ = 'archivebox.core'

import uuid

from django.db import models, transaction
from django.db import models
from django.utils.functional import cached_property
from django.utils.text import slugify
from django.core.cache import cache
from django.db.models import Case, When, Value, IntegerField

from ..util import parse_date
from ..config import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from ..system import get_dir_size
from ..util import parse_date, base_url, hashurl
from ..index.schema import Link
from ..extractors import get_default_archive_methods, ARCHIVE_METHODS_INDEXING_PRECEDENCE


@@ -29,8 +32,11 @@ class Tag(models.Model):
    """
    Based on django-taggit model
    """
    name = models.CharField(verbose_name="name", unique=True, blank=False, max_length=100)
    slug = models.SlugField(verbose_name="slug", unique=True, max_length=100)
    name = models.CharField(unique=True, blank=False, max_length=100)

    # slug is autoset on save from name, never set it manually
    slug = models.SlugField(unique=True, blank=True, max_length=100)


    class Meta:
        verbose_name = "Tag"

@@ -49,20 +55,21 @@ class Tag(models.Model):
        if self._state.adding and not self.slug:
            self.slug = self.slugify(self.name)

            with transaction.atomic():
                slugs = set(
                    type(self)
                    ._default_manager.filter(slug__startswith=self.slug)
                    .values_list("slug", flat=True)
                )
            # if name is different but slug conficts with another tags slug, append a counter
            # with transaction.atomic():
            slugs = set(
                type(self)
                ._default_manager.filter(slug__startswith=self.slug)
                .values_list("slug", flat=True)
            )

            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i+1
            i = None
            while True:
                slug = self.slugify(self.name, i)
                if slug not in slugs:
                    self.slug = slug
                    return super().save(*args, **kwargs)
                i = 1 if i is None else i+1
        else:
            return super().save(*args, **kwargs)
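Note: the slug de-duplication above (adapted from django-taggit) fetches every existing slug sharing the prefix in a single query, then appends a counter until it finds a free slug. A self-contained sketch of the same algorithm (helper name is ours; taggit's slugify(tag, i) builds the counter suffix itself):

from django.utils.text import slugify

def unique_slug(name, existing_slugs):
    """Append _1, _2, ... to the slugified name until it no longer collides."""
    i = None
    while True:
        slug = slugify(name) + (f'_{i}' if i is not None else '')
        if slug not in existing_slugs:
            return slug
        i = 1 if i is None else i + 1

# unique_slug('My Tag', {'my-tag'}) -> 'my-tag_1'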
@@ -73,11 +80,11 @@ class Snapshot(models.Model):
    url = models.URLField(unique=True)
    timestamp = models.CharField(max_length=32, unique=True, db_index=True)

    title = models.CharField(max_length=128, null=True, blank=True, db_index=True)
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)

    added = models.DateTimeField(auto_now_add=True, db_index=True)
    updated = models.DateTimeField(null=True, blank=True, db_index=True)
    tags = models.ManyToManyField(Tag)
    updated = models.DateTimeField(auto_now=True, blank=True, null=True, db_index=True)
    tags = models.ManyToManyField(Tag, blank=True)

    keys = ('url', 'timestamp', 'title', 'tags', 'updated')

@@ -109,13 +116,24 @@ class Snapshot(models.Model):
        from ..index import load_link_details
        return load_link_details(self.as_link())

    def tags_str(self) -> str:
        return ','.join(self.tags.order_by('name').values_list('name', flat=True))
    def tags_str(self, nocache=True) -> str:
        cache_key = f'{self.id}-{(self.updated or self.added).timestamp()}-tags'
        calc_tags_str = lambda: ','.join(self.tags.order_by('name').values_list('name', flat=True))
        if nocache:
            tags_str = calc_tags_str()
            cache.set(cache_key, tags_str)
            return tags_str
        return cache.get_or_set(cache_key, calc_tags_str)

    @cached_property
    def bookmarked(self):
        return parse_date(self.timestamp)

    @cached_property
    def bookmarked_date(self):
        # TODO: remove this
        return self.bookmarked

    @cached_property
    def is_archived(self):
        return self.as_link().is_archived

@@ -126,23 +144,31 @@ class Snapshot(models.Model):

    @cached_property
    def url_hash(self):
        return self.as_link().url_hash
        return hashurl(self.url)

    @cached_property
    def base_url(self):
        return self.as_link().base_url
        return base_url(self.url)

    @cached_property
    def link_dir(self):
        return self.as_link().link_dir
        return str(ARCHIVE_DIR / self.timestamp)

    @cached_property
    def archive_path(self):
        return self.as_link().archive_path
        return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp)

    @cached_property
    def archive_size(self):
        return self.as_link().archive_size
        cache_key = f'{str(self.id)[:12]}-{(self.updated or self.added).timestamp()}-size'

        def calc_dir_size():
            try:
                return get_dir_size(self.link_dir)[0]
            except Exception:
                return 0

        return cache.get_or_set(cache_key, calc_dir_size)

    @cached_property
    def history(self):

@@ -151,17 +177,40 @@ class Snapshot(models.Model):

    @cached_property
    def latest_title(self):
        if ('title' in self.history
            and self.history['title']
            and (self.history['title'][-1].status == 'succeeded')
            and self.history['title'][-1].output.strip()):
            return self.history['title'][-1].output.strip()
        if self.title:
            return self.title # whoopdedoo that was easy

        try:
            # take longest successful title from ArchiveResult db history
            return sorted(
                self.archiveresult_set\
                    .filter(extractor='title', status='succeeded', output__isnull=False)\
                    .values_list('output', flat=True),
                key=lambda r: len(r),
            )[-1]
        except IndexError:
            pass

        try:
            # take longest successful title from Link json index file history
            return sorted(
                (
                    result.output.strip()
                    for result in self.history['title']
                    if result.status == 'succeeded' and result.output.strip()
                ),
                key=lambda r: len(r),
            )[-1]
        except (KeyError, IndexError):
            pass

        return None

    def save_tags(self, tags=()):
        tags_id = []
        for tag in tags:
            tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
            if tag.strip():
                tags_id.append(Tag.objects.get_or_create(name=tag)[0].id)
        self.tags.clear()
        self.tags.add(*tags_id)


@@ -178,15 +227,18 @@ class ArchiveResultManager(models.Manager):


class ArchiveResult(models.Model):
    id = models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')
    uuid = models.UUIDField(default=uuid.uuid4, editable=False)

    snapshot = models.ForeignKey(Snapshot, on_delete=models.CASCADE)
    extractor = models.CharField(choices=EXTRACTORS, max_length=32)
    cmd = JSONField()
    pwd = models.CharField(max_length=256)
    cmd_version = models.CharField(max_length=32, default=None, null=True, blank=True)
    output = models.CharField(max_length=512)
    start_ts = models.DateTimeField()
    cmd_version = models.CharField(max_length=128, default=None, null=True, blank=True)
    output = models.CharField(max_length=1024)
    start_ts = models.DateTimeField(db_index=True)
    end_ts = models.DateTimeField()
    status = models.CharField(max_length=16, choices=STATUS_CHOICES)
    extractor = models.CharField(choices=EXTRACTORS, max_length=32)

    objects = ArchiveResultManager()
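Note: tags_str() and archive_size above use the same cache-invalidation trick: the Django cache key embeds (self.updated or self.added).timestamp(), so any save() that bumps updated (now auto_now=True per migration 0009) points readers at a fresh key, and the stale entry simply expires unread. A minimal sketch of the pattern (helper name is ours, not the actual code):

from django.core.cache import cache

def cached_per_version(obj, suffix, compute):
    """Cache a derived value; the key changes whenever the row is re-saved,
    so stale entries are never read again."""
    key = f'{obj.id}-{(obj.updated or obj.added).timestamp()}-{suffix}'
    return cache.get_or_set(key, compute)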
@@ -2,6 +2,9 @@ __package__ = 'archivebox.core'

import os
import sys
import re
import logging
import tempfile

from pathlib import Path
from django.utils.crypto import get_random_string

@@ -14,6 +17,7 @@ from ..config import (
    TEMPLATES_DIR_NAME,
    SQL_INDEX_FILENAME,
    OUTPUT_DIR,
    LOGS_DIR,
)


@@ -62,6 +66,40 @@ AUTHENTICATION_BACKENDS = [
    'django.contrib.auth.backends.ModelBackend',
]

# only enable debug toolbar when in DEBUG mode with --nothreading (it doesn't work in multithreaded mode)
DEBUG_TOOLBAR = DEBUG and ('--nothreading' in sys.argv) and ('--reload' not in sys.argv)
if DEBUG_TOOLBAR:
    try:
        import debug_toolbar   # noqa
        DEBUG_TOOLBAR = True
    except ImportError:
        DEBUG_TOOLBAR = False

if DEBUG_TOOLBAR:
    INSTALLED_APPS = [*INSTALLED_APPS, 'debug_toolbar']
    INTERNAL_IPS = ['0.0.0.0', '127.0.0.1', '*']
    DEBUG_TOOLBAR_CONFIG = {
        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
        "RENDER_PANELS": True,
    }
    DEBUG_TOOLBAR_PANELS = [
        'debug_toolbar.panels.history.HistoryPanel',
        'debug_toolbar.panels.versions.VersionsPanel',
        'debug_toolbar.panels.timer.TimerPanel',
        'debug_toolbar.panels.settings.SettingsPanel',
        'debug_toolbar.panels.headers.HeadersPanel',
        'debug_toolbar.panels.request.RequestPanel',
        'debug_toolbar.panels.sql.SQLPanel',
        'debug_toolbar.panels.staticfiles.StaticFilesPanel',
        # 'debug_toolbar.panels.templates.TemplatesPanel',
        'debug_toolbar.panels.cache.CachePanel',
        'debug_toolbar.panels.signals.SignalsPanel',
        'debug_toolbar.panels.logging.LoggingPanel',
        'debug_toolbar.panels.redirects.RedirectsPanel',
        'debug_toolbar.panels.profiling.ProfilingPanel',
        'djdt_flamegraph.FlamegraphPanel',
    ]
    MIDDLEWARE = [*MIDDLEWARE, 'debug_toolbar.middleware.DebugToolbarMiddleware']

################################################################################
### Staticfile and Template Settings

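A note on the SHOW_TOOLBAR_CALLBACK override above: django-debug-toolbar's default callback only shows the toolbar when the request's REMOTE_ADDR is listed in INTERNAL_IPS, which likely breaks behind Docker's NAT; returning True unconditionally sidesteps the IP check. A minimal sketch of just that piece (not the commit's full config):

    DEBUG_TOOLBAR_CONFIG = {
        # bypass the default INTERNAL_IPS check; safe here because the whole
        # block is already gated on DEBUG_TOOLBAR being enabled
        "SHOW_TOOLBAR_CALLBACK": lambda request: True,
    }
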
@@ -107,6 +145,22 @@ DATABASES = {
    'default': {
        'ENGINE': 'django.db.backends.sqlite3',
        'NAME': DATABASE_NAME,
        'OPTIONS': {
            'timeout': 60,
            'check_same_thread': False,
        },
        # DB setup is sometimes modified at runtime by setup_django() in config.py
    }
}

CACHE_BACKEND = 'django.core.cache.backends.locmem.LocMemCache'
# CACHE_BACKEND = 'django.core.cache.backends.db.DatabaseCache'
# CACHE_BACKEND = 'django.core.cache.backends.dummy.DummyCache'

CACHES = {
    'default': {
        'BACKEND': CACHE_BACKEND,
        'LOCATION': 'django_cache_default',
    }
}


@@ -117,7 +171,7 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
### Security Settings
################################################################################

SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789-_+!.')
SECRET_KEY = SECRET_KEY or get_random_string(50, 'abcdefghijklmnopqrstuvwxyz0123456789_')

ALLOWED_HOSTS = ALLOWED_HOSTS.split(',')

@@ -131,6 +185,8 @@ SESSION_COOKIE_AGE = 1209600  # 2 weeks
SESSION_EXPIRE_AT_BROWSER_CLOSE = False
SESSION_SAVE_EVERY_REQUEST = True

SESSION_ENGINE = "django.contrib.sessions.backends.db"

AUTH_PASSWORD_VALIDATORS = [
    {'NAME': 'django.contrib.auth.password_validation.UserAttributeSimilarityValidator'},
    {'NAME': 'django.contrib.auth.password_validation.MinimumLengthValidator'},

@@ -163,3 +219,73 @@ USE_TZ = False

DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'


################################################################################
### Logging Settings
################################################################################

IGNORABLE_404_URLS = [
    re.compile(r'apple-touch-icon.*\.png$'),
    re.compile(r'favicon\.ico$'),
    re.compile(r'robots\.txt$'),
    re.compile(r'.*\.(css|js)\.map$'),
]

class NoisyRequestsFilter(logging.Filter):
    def filter(self, record):
        logline = record.getMessage()

        # ignore harmless 404s for the patterns in IGNORABLE_404_URLS
        for ignorable_url_pattern in IGNORABLE_404_URLS:
            ignorable_log_pattern = re.compile(f'^"GET /.*/?{ignorable_url_pattern.pattern[:-1]} HTTP/.*" (200|30.|404) .+$', re.I | re.M)
            if ignorable_log_pattern.match(logline):
                return 0

        # ignore staticfile requests that 200 or 30*
        ignoreable_200_log_pattern = re.compile(r'"GET /static/.* HTTP/.*" (200|30.) .+', re.I | re.M)
        if ignoreable_200_log_pattern.match(logline):
            return 0

        return 1

if LOGS_DIR.exists():
    ERROR_LOG = (LOGS_DIR / 'errors.log')
else:
    # meh, too many edge cases here around creating the log dir with correct permissions,
    # can't be bothered; just trash the log and let them figure it out via stdout/stderr
    ERROR_LOG = tempfile.NamedTemporaryFile().name

LOGGING = {
    'version': 1,
    'disable_existing_loggers': False,
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
        },
        'logfile': {
            'level': 'ERROR',
            'class': 'logging.handlers.RotatingFileHandler',
            'filename': ERROR_LOG,
            'maxBytes': 1024 * 1024 * 25,  # 25 MB
            'backupCount': 10,
        },
    },
    'filters': {
        'noisyrequestsfilter': {
            '()': NoisyRequestsFilter,
        }
    },
    'loggers': {
        'django': {
            'handlers': ['console', 'logfile'],
            'level': 'INFO',
            'filters': ['noisyrequestsfilter'],
        },
        'django.server': {
            'handlers': ['console', 'logfile'],
            'level': 'INFO',
            'filters': ['noisyrequestsfilter'],
        }
    },
}

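A logging filter's return value decides whether a record is emitted (0/False drops it, 1/True keeps it), which is all NoisyRequestsFilter does. A quick standalone sketch of the same mechanism with a made-up logger and pattern:

    import logging

    class DropPings(logging.Filter):
        def filter(self, record):
            # returning 0 suppresses the record, 1 lets it through
            return 0 if '/ping' in record.getMessage() else 1

    log = logging.getLogger('demo')
    log.addHandler(logging.StreamHandler())
    log.addFilter(DropPings())
    log.warning('"GET /ping HTTP/1.1" 200 2')      # suppressed
    log.warning('"GET /archive/ HTTP/1.1" 200 5')  # printed
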
@@ -2,6 +2,7 @@ from django.contrib import admin

from django.urls import path, include
from django.views import static
from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView


@@ -13,8 +14,8 @@ from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
urlpatterns = [
    path('public/', PublicIndexView.as_view(), name='public-index'),

    path('robots.txt', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'robots.txt'}),
    path('favicon.ico', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'favicon.ico'}),
    path('robots.txt', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'robots.txt'}),
    path('favicon.ico', static.serve, {'document_root': settings.STATICFILES_DIRS[0], 'path': 'favicon.ico'}),

    path('docs/', RedirectView.as_view(url='https://github.com/ArchiveBox/ArchiveBox/wiki'), name='Docs'),

@@ -35,35 +36,43 @@ urlpatterns = [
    path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
    path('', HomepageView.as_view(), name='Home'),
]
urlpatterns += staticfiles_urlpatterns()

# # Proposed UI URLs spec
# path('', HomepageView)
# path('/add', AddView)
# path('/public', PublicIndexView)
# path('/snapshot/:slug', SnapshotView)

# path('/admin', admin.site.urls)
# path('/accounts', django.contrib.auth.urls)
if settings.DEBUG_TOOLBAR:
    import debug_toolbar
    urlpatterns += [
        path('__debug__/', include(debug_toolbar.urls)),
    ]

# # Proposed REST API spec
# # :slugs can be uuid, short_uuid, or any of the unique index_fields
# path('api/v1/'),
# path('api/v1/core/' [GET])
# path('api/v1/core/snapshot/', [GET, POST, PUT]),
# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/archiveresult', [GET, POST, PUT]),
# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/tag/', [GET, POST, PUT]),
# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),

# path('api/v1/cli/', [GET])
# path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode
# # Proposed FUTURE URLs spec
# path('', HomepageView)
# path('/add', AddView)
# path('/public', PublicIndexView)
# path('/snapshot/:slug', SnapshotView)

# path('api/v1/extractors/', [GET])
# path('api/v1/extractors/:extractor/', [GET]),
# path('api/v1/extractors/:extractor/:func', [GET, POST]),  # pass query as args directly to chosen function
# path('/admin', admin.site.urls)
# path('/accounts', django.contrib.auth.urls)

# future, just an idea:
# path('api/v1/scheduler/', [GET])
# path('api/v1/scheduler/task/', [GET, POST, PUT]),
# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),
# # Proposed REST API spec
# # :slugs can be uuid, short_uuid, or any of the unique index_fields
# path('api/v1/'),
# path('api/v1/core/' [GET])
# path('api/v1/core/snapshot/', [GET, POST, PUT]),
# path('api/v1/core/snapshot/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/archiveresult', [GET, POST, PUT]),
# path('api/v1/core/archiveresult/:slug', [GET, PATCH, DELETE]),
# path('api/v1/core/tag/', [GET, POST, PUT]),
# path('api/v1/core/tag/:slug', [GET, PATCH, DELETE]),

# path('api/v1/cli/', [GET])
# path('api/v1/cli/{add,list,config,...}', [POST]),  # pass query as kwargs directly to `run_subcommand` and return stdout, stderr, exitcode

# path('api/v1/extractors/', [GET])
# path('api/v1/extractors/:extractor/', [GET]),
# path('api/v1/extractors/:extractor/:func', [GET, POST]),  # pass query as args directly to chosen function

# future, just an idea:
# path('api/v1/scheduler/', [GET])
# path('api/v1/scheduler/task/', [GET, POST, PUT]),
# path('api/v1/scheduler/task/:slug', [GET, PATCH, DELETE]),

@@ -4,8 +4,8 @@ from io import StringIO
from contextlib import redirect_stdout

from django.shortcuts import render, redirect

from django.http import HttpResponse
from django.http import HttpResponse, Http404
from django.utils.html import format_html, mark_safe
from django.views import View, static
from django.views.generic.list import ListView
from django.views.generic import FormView

@@ -22,6 +22,7 @@ from ..config import (
    PUBLIC_ADD_VIEW,
    VERSION,
    FOOTER_INFO,
    SNAPSHOTS_PER_PAGE,
)
from main import add
from ..util import base_url, ansi_to_html

@@ -43,10 +44,6 @@ class SnapshotView(View):
    # render static html index from filesystem archive/<timestamp>/index.html

    def get(self, request, path):
        # missing trailing slash -> redirect to index
        if '/' not in path:
            return redirect(f'{path}/index.html')

        if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS:
            return redirect(f'/admin/login/?next={request.path}')


@@ -55,46 +52,163 @@ class SnapshotView(View):
        except (IndexError, ValueError):
            slug, archivefile = path.split('/', 1)[0], 'index.html'

        all_pages = list(Snapshot.objects.all())

        # slug is a timestamp
        by_ts = {page.timestamp: page for page in all_pages}
        try:
            # print('SERVING STATICFILE', by_ts[slug].link_dir, request.path, path)
            response = static.serve(request, archivefile, document_root=by_ts[slug].link_dir, show_indexes=True)
            response["Link"] = f'<{by_ts[slug].url}>; rel="canonical"'
            return response
        except KeyError:
            pass
        if slug.replace('.','').isdigit():

        # slug is a hash
        by_hash = {page.url_hash: page for page in all_pages}
        try:
            timestamp = by_hash[slug].timestamp
            return redirect(f'/archive/{timestamp}/{archivefile}')
        except KeyError:
            pass
            # missing trailing slash -> redirect to index
            if '/' not in path:
                return redirect(f'{path}/index.html')

            try:
                try:
                    snapshot = Snapshot.objects.get(Q(timestamp=slug) | Q(id__startswith=slug))
                    response = static.serve(request, archivefile, document_root=snapshot.link_dir, show_indexes=True)
                    response["Link"] = f'<{snapshot.url}>; rel="canonical"'
                    return response
                except Snapshot.DoesNotExist:
                    if Snapshot.objects.filter(timestamp__startswith=slug).exists():
                        raise Snapshot.MultipleObjectsReturned
                    else:
                        raise
            except Snapshot.DoesNotExist:
                # Snapshot does not exist
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            'No Snapshot directories match the given timestamp or UUID: <code>{}</code><br/><br/>'
                            'You can <a href="/add/" target="_top">add a new Snapshot</a>, or return to the <a href="/" target="_top">Main Index</a>'
                            '</center>'
                        ),
                        slug,
                        path,
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Snapshot.MultipleObjectsReturned:
                snapshot_hrefs = mark_safe('<br/>').join(
                    format_html(
                        '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                        snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                        snap.timestamp,
                        snap.timestamp,
                        snap.url,
                        snap.title or '',
                    )
                    for snap in Snapshot.objects.filter(timestamp__startswith=slug).only('url', 'timestamp', 'title', 'added').order_by('-added')
                )
                return HttpResponse(
                    format_html(
                        (
                            'Multiple Snapshots match the given timestamp/UUID <code>{}</code><br/><pre>'
                        ),
                        slug,
                    ) + snapshot_hrefs + format_html(
                        (
                            '</pre><br/>'
                            'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                        )
                    ),
                    content_type="text/html",
                    status=404,
                )
            except Http404:
                # Snapshot dir exists but file within does not e.g. 124235.324234/screenshot.png
                return HttpResponse(
                    format_html(
                        (
                            '<center><br/><br/><br/>'
                            f'Snapshot <a href="/archive/{snapshot.timestamp}/index.html" target="_top"><b><code>[{snapshot.timestamp}]</code></b></a> exists in DB, but resource <b><code>{snapshot.timestamp}/'
                            '{}'
                            f'</code></b> does not exist in <a href="/archive/{snapshot.timestamp}/" target="_top">snapshot dir</a> yet.<br/><br/>'
                            'Maybe this resource type is not available for this Snapshot,<br/>or the archiving process has not completed yet?<br/>'
                            f'<pre><code># run this cmd to finish archiving this Snapshot<br/>archivebox update -t timestamp {snapshot.timestamp}</code></pre><br/><br/>'
                            '<div class="text-align: left; width: 100%; max-width: 400px">'
                            '<i><b>Next steps:</b></i><br/>'
                            f'- list all the <a href="/archive/{snapshot.timestamp}/" target="_top">Snapshot files <code>.*</code></a><br/>'
                            f'- view the <a href="/archive/{snapshot.timestamp}/index.html" target="_top">Snapshot <code>./index.html</code></a><br/>'
                            f'- go to the <a href="/admin/core/snapshot/{snapshot.id}/change/" target="_top">Snapshot admin</a> to edit<br/>'
                            f'- go to the <a href="/admin/core/snapshot/?id__startswith={snapshot.id}" target="_top">Snapshot actions</a> to re-archive<br/>'
                            '- or return to <a href="/" target="_top">the main index...</a></div>'
                            '</center>'
                        ),
                        archivefile,
                    ),
                    content_type="text/html",
                    status=404,
                )

        # slug is a URL
        by_url = {page.base_url: page for page in all_pages}
        try:
            # TODO: add multiple snapshot support by showing index of all snapshots
            # for given url instead of redirecting to timestamp index
            timestamp = by_url[base_url(path)].timestamp
            return redirect(f'/archive/{timestamp}/index.html')
        except KeyError:
            pass

        return HttpResponse(
            'No archived link matches the given timestamp or hash.',
            content_type="text/plain",
            status=404,
        )
        try:
            # try exact match on full url first
            snapshot = Snapshot.objects.get(
                Q(url='http://' + path) | Q(url='https://' + path) | Q(id__startswith=path)
            )
        except Snapshot.DoesNotExist:
            # fall back to match on exact base_url
            try:
                snapshot = Snapshot.objects.get(
                    Q(url='http://' + base_url(path)) | Q(url='https://' + base_url(path))
                )
            except Snapshot.DoesNotExist:
                # fall back to matching base_url as prefix
                snapshot = Snapshot.objects.get(
                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                )
            return redirect(f'/archive/{snapshot.timestamp}/index.html')
        except Snapshot.DoesNotExist:
            return HttpResponse(
                format_html(
                    (
                        '<center><br/><br/><br/>'
                        'No Snapshots match the given url: <code>{}</code><br/><br/><br/>'
                        'Return to the <a href="/" target="_top">Main Index</a>, or:<br/><br/>'
                        '+ <i><a href="/add/?url={}" target="_top">Add a new Snapshot for <code>{}</code></a><br/><br/></i>'
                        '</center>'
                    ),
                    base_url(path),
                    path if '://' in path else f'https://{path}',
                    path,
                ),
                content_type="text/html",
                status=404,
            )
        except Snapshot.MultipleObjectsReturned:
            snapshot_hrefs = mark_safe('<br/>').join(
                format_html(
                    '{} <a href="/archive/{}/index.html"><b><code>{}</code></b></a> {} <b>{}</b>',
                    snap.added.strftime('%Y-%m-%d %H:%M:%S'),
                    snap.timestamp,
                    snap.timestamp,
                    snap.url,
                    snap.title or '',
                )
                for snap in Snapshot.objects.filter(
                    Q(url__startswith='http://' + base_url(path)) | Q(url__startswith='https://' + base_url(path))
                ).only('url', 'timestamp', 'title', 'added').order_by('-added')
            )
            return HttpResponse(
                format_html(
                    (
                        'Multiple Snapshots match the given URL <code>{}</code><br/><pre>'
                    ),
                    base_url(path),
                ) + snapshot_hrefs + format_html(
                    (
                        '</pre><br/>'
                        'Choose a Snapshot to proceed or go back to the <a href="/" target="_top">Main Index</a>'
                    )
                ),
                content_type="text/html",
                status=404,
            )


class PublicIndexView(ListView):
    template_name = 'public_index.html'
    model = Snapshot
    paginate_by = 100
    paginate_by = SNAPSHOTS_PER_PAGE
    ordering = ['title']

    def get_context_data(self, **kwargs):

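The rewritten view replaces the old in-memory scan over every Snapshot with a cascade of ORM lookups; a condensed sketch of the resolution order (resolve_snapshot is a hypothetical condensation, with the error pages omitted):

    from django.db.models import Q

    def resolve_snapshot(slug_or_url):
        # 1. numeric slugs: exact timestamp, or UUID prefix
        if slug_or_url.replace('.', '').isdigit():
            return Snapshot.objects.get(Q(timestamp=slug_or_url) | Q(id__startswith=slug_or_url))
        try:
            # 2. exact full URL (with either scheme), or UUID prefix
            return Snapshot.objects.get(
                Q(url='http://' + slug_or_url) | Q(url='https://' + slug_or_url) | Q(id__startswith=slug_or_url)
            )
        except Snapshot.DoesNotExist:
            try:
                # 3. exact base_url match
                return Snapshot.objects.get(
                    Q(url='http://' + base_url(slug_or_url)) | Q(url='https://' + base_url(slug_or_url))
                )
            except Snapshot.DoesNotExist:
                # 4. base_url as a prefix (may raise MultipleObjectsReturned -> disambiguation page)
                return Snapshot.objects.get(
                    Q(url__startswith='http://' + base_url(slug_or_url)) | Q(url__startswith='https://' + base_url(slug_or_url))
                )
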
@@ -105,12 +219,14 @@ class PublicIndexView(ListView):
        }

    def get_queryset(self, **kwargs):
        qs = super().get_queryset(**kwargs)
        qs = super().get_queryset(**kwargs)
        query = self.request.GET.get('q')
        if query:
            qs = qs.filter(Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query))

        for snapshot in qs:
            snapshot.icons = snapshot_icons(snapshot)
            # lazy load snapshot icons, otherwise it will load icons for entire index at once
            snapshot.icons = lambda: snapshot_icons(snapshot)
        return qs

    def get(self, *args, **kwargs):

@@ -130,9 +246,9 @@ class AddView(UserPassesTestMixin, FormView):
        if self.request.method == 'GET':
            url = self.request.GET.get('url', None)
            if url:
                return {'url': url}
            else:
                return super().get_initial()
                return {'url': url if '://' in url else f'https://{url}'}

        return super().get_initial()

    def test_func(self):
        return PUBLIC_ADD_VIEW or self.request.user.is_authenticated

@@ -145,15 +261,18 @@ class AddView(UserPassesTestMixin, FormView):
            'absolute_add_path': self.request.build_absolute_uri(self.request.path),
            'VERSION': VERSION,
            'FOOTER_INFO': FOOTER_INFO,
            'stdout': '',
        }

    def form_valid(self, form):
        url = form.cleaned_data["url"]
        print(f'[+] Adding URL: {url}')
        tag = form.cleaned_data["tag"]
        depth = 0 if form.cleaned_data["depth"] == "0" else 1
        extractors = ','.join(form.cleaned_data["archive_methods"])
        input_kwargs = {
            "urls": url,
            "tag": tag,
            "depth": depth,
            "update_all": False,
            "out_dir": OUTPUT_DIR,

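One caveat worth noting about the lazy-icons change above: a bare lambda in a for loop closes over the loop variable, so by the time the template engine calls it, every row's lambda sees the last snapshot in the queryset. A sketch of the usual default-argument fix (an observation about the pattern, not code from this commit):

    for snapshot in qs:
        # bind the current value at definition time instead of at call time
        snapshot.icons = lambda snapshot=snapshot: snapshot_icons(snapshot)
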
@@ -7,10 +7,10 @@ For more information on this file, see
https://docs.djangoproject.com/en/2.1/howto/deployment/wsgi/
"""

import os

from archivebox.config import setup_django
setup_django(in_memory_db=False, check_db=True)

from django.core.wsgi import get_wsgi_application

os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')

application = get_wsgi_application()

@@ -44,16 +44,16 @@ def get_default_archive_methods():
    return [
        ('title', should_save_title, save_title),
        ('favicon', should_save_favicon, save_favicon),
        ('wget', should_save_wget, save_wget),
        ('headers', should_save_headers, save_headers),
        ('singlefile', should_save_singlefile, save_singlefile),
        ('pdf', should_save_pdf, save_pdf),
        ('screenshot', should_save_screenshot, save_screenshot),
        ('dom', should_save_dom, save_dom),
        ('readability', should_save_readability, save_readability),  #keep readability below wget and singlefile, as it depends on them
        ('wget', should_save_wget, save_wget),
        ('readability', should_save_readability, save_readability),  # keep readability below wget and singlefile, as it depends on them
        ('mercury', should_save_mercury, save_mercury),
        ('git', should_save_git, save_git),
        ('media', should_save_media, save_media),
        ('headers', should_save_headers, save_headers),
        ('archive_org', should_save_archive_dot_org, save_archive_dot_org),
    ]


@@ -115,6 +115,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
                ArchiveResult.objects.create(snapshot=snapshot, extractor=method_name, cmd=result.cmd, cmd_version=result.cmd_version,
                                             output=result.output, pwd=result.pwd, start_ts=result.start_ts, end_ts=result.end_ts, status=result.status)


                # bump the updated time on the main Snapshot here, this is critical
                # to be able to cache summaries of the ArchiveResults for a given
                # snapshot without having to load all the results from the DB each time.
                # (we use {Snapshot.id}-{Snapshot.updated} as the cache key and assume
                # ArchiveResults are unchanged as long as the updated timestamp is unchanged)
                snapshot.save()
            else:
                # print('{black}    X {}{reset}'.format(method_name, **ANSI))
                stats['skipped'] += 1

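The cache-key comment above is the contract that snapshot_icons in index/html.py relies on; a sketch of the key construction and why snapshot.save() invalidates it (assuming Snapshot.updated is refreshed on save, e.g. via auto_now):

    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'
    # after snapshot.save() bumps snapshot.updated, the next render computes a
    # different key, so the stale cached entry is simply never read again --
    # no explicit cache.delete() is needed
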
@@ -31,7 +31,7 @@ def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwr
    out_dir = out_dir or Path(link.link_dir)
    if not overwrite and (out_dir / 'archive.org.txt').exists():
        # if open(path, 'r').read().strip() != 'None':
        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
        return False

    return SAVE_ARCHIVE_DOT_ORG

@@ -54,11 +54,13 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
    out_dir = Path(out_dir or link.link_dir)
    output_folder = out_dir.absolute() / "mercury"
    output = str(output_folder)
    output = "mercury"

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix='      ')
    try:
        output_folder.mkdir(exist_ok=True)

        # Get plain text version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],

@@ -71,6 +73,11 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        if article_text.get('failed'):
            raise ArchiveError('Mercury was not able to get article text from the URL')

        atomic_write(str(output_folder / "content.txt"), article_text["content"])

        # Get HTML version of article
        cmd = [
            DEPENDENCIES['MERCURY_BINARY']['path'],

@@ -82,9 +89,10 @@ def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT)
        except json.JSONDecodeError:
            raise ShellError(cmd, result)

        output_folder.mkdir(exist_ok=True)
        if article_text.get('failed'):
            raise ArchiveError('Mercury was not able to get article HTML from the URL')

        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
        atomic_write(str(output_folder / "content.txt"), article_text["content"])
        atomic_write(str(output_folder / "article.json"), article_json)

        # Check for common failure cases

@@ -35,7 +35,7 @@ def get_html(link: Link, path: Path) -> str:
    document = None
    for source in sources:
        try:
            with open(abs_path / source, "r") as f:
            with open(abs_path / source, "r", encoding="utf-8") as f:
                document = f.read()
                break
        except (FileNotFoundError, TypeError):

@@ -63,7 +63,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
    out_dir = Path(out_dir or link.link_dir)
    output_folder = out_dir.absolute() / "readability"
    output = str(output_folder)
    output = "readability"

    # Readability Docs: https://github.com/mozilla/readability


@@ -81,13 +81,20 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
    temp_doc.write(document.encode("utf-8"))
    temp_doc.close()

    if not document or len(document) < 10:
        raise ArchiveError('Readability could not find HTML to parse for article text')

    cmd = [
        DEPENDENCIES['READABILITY_BINARY']['path'],
        temp_doc.name
        temp_doc.name,
    ]

    result = run(cmd, cwd=out_dir, timeout=timeout)
    result_json = json.loads(result.stdout)
    try:
        result_json = json.loads(result.stdout)
    except json.JSONDecodeError:
        raise ArchiveError('Readability was not able to archive the page', result.stdout + result.stderr)

    output_folder.mkdir(exist_ok=True)
    readability_content = result_json.pop("textContent")
    atomic_write(str(output_folder / "content.html"), result_json.pop("content"))

@@ -112,6 +119,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
    except (Exception, OSError) as err:
        status = 'failed'
        output = err
        cmd = [cmd[0], './{singlefile,dom}.html']
    finally:
        timer.end()


@@ -121,6 +129,6 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
        cmd_version=READABILITY_VERSION,
        output=output,
        status=status,
        index_texts= [readability_content] if readability_content else [],
        index_texts=[readability_content] if readability_content else [],
        **timer.stats,
    )

@@ -356,6 +356,7 @@ LINK_FILTERS = {
    'regex': lambda pattern: Q(url__iregex=pattern),
    'domain': lambda pattern: Q(url__istartswith=f"http://{pattern}") | Q(url__istartswith=f"https://{pattern}") | Q(url__istartswith=f"ftp://{pattern}"),
    'tag': lambda pattern: Q(tags__name=pattern),
    'timestamp': lambda pattern: Q(timestamp=pattern),
}

@enforce_types

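A quick usage sketch of the new 'tag' filter (the 'news' tag is made up for illustration; roughly what a tag filter from the CLI resolves to):

    from django.db.models import Q

    q = LINK_FILTERS['tag']('news')       # -> Q(tags__name='news')
    matches = Snapshot.objects.filter(q)  # all snapshots tagged "news"
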
@@ -1,11 +1,12 @@
__package__ = 'archivebox.index'

from datetime import datetime
from typing import List, Optional, Iterator, Mapping
from pathlib import Path
from datetime import datetime
from collections import defaultdict
from typing import List, Optional, Iterator, Mapping

from django.utils.html import format_html, mark_safe
from collections import defaultdict
from django.core.cache import cache

from .schema import Link
from ..system import atomic_write

@@ -20,7 +21,6 @@ from ..util import (
from ..config import (
    OUTPUT_DIR,
    VERSION,
    GIT_SHA,
    FOOTER_INFO,
    HTML_INDEX_FILENAME,
    SAVE_ARCHIVE_DOT_ORG,

@@ -60,7 +60,7 @@ def main_index_template(links: List[Link], template: str=MAIN_INDEX_TEMPLATE) ->

    return render_django_template(template, {
        'version': VERSION,
        'git_sha': GIT_SHA,
        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
        'num_links': str(len(links)),
        'date_updated': datetime.now().strftime('%Y-%m-%d'),
        'time_updated': datetime.now().strftime('%Y-%m-%d %H:%M'),

@@ -116,71 +116,78 @@ def render_django_template(template: str, context: Mapping[str, str]) -> str:


def snapshot_icons(snapshot) -> str:
    from core.models import EXTRACTORS
    cache_key = f'{str(snapshot.id)[:12]}-{(snapshot.updated or snapshot.added).timestamp()}-snapshot-icons'

    def calc_snapshot_icons():
        from core.models import EXTRACTORS
        # start = datetime.now()

    # start = datetime.now()
        archive_results = snapshot.archiveresult_set.filter(status="succeeded", output__isnull=False)
        link = snapshot.as_link()
        path = link.archive_path
        canon = link.canonical_outputs()
        output = ""
        output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
        icons = {
            "singlefile": "❶",
            "wget": "🆆",
            "dom": "🅷",
            "pdf": "📄",
            "screenshot": "💻",
            "media": "📼",
            "git": "🅶",
            "archive_org": "🏛",
            "readability": "🆁",
            "mercury": "🅼",
            "warc": "📦"
        }
        exclude = ["favicon", "title", "headers", "archive_org"]
        # Missing specific entry for WARC

    archive_results = snapshot.archiveresult_set.filter(status="succeeded")
    link = snapshot.as_link()
    path = link.archive_path
    canon = link.canonical_outputs()
    output = ""
    output_template = '<a href="/{}/{}" class="exists-{}" title="{}">{}</a> '
    icons = {
        "singlefile": "❶",
        "wget": "🆆",
        "dom": "🅷",
        "pdf": "📄",
        "screenshot": "💻",
        "media": "📼",
        "git": "🅶",
        "archive_org": "🏛",
        "readability": "🆁",
        "mercury": "🅼",
        "warc": "📦"
    }
    exclude = ["favicon", "title", "headers", "archive_org"]
    # Missing specific entry for WARC
        extractor_outputs = defaultdict(lambda: None)
        for extractor, _ in EXTRACTORS:
            for result in archive_results:
                if result.extractor == extractor and result:
                    extractor_outputs[extractor] = result

    extractor_outputs = defaultdict(lambda: None)
    for extractor, _ in EXTRACTORS:
        for result in archive_results:
            if result.extractor == extractor and result:
                extractor_outputs[extractor] = result
    for extractor, _ in EXTRACTORS:
        if extractor not in exclude:
            existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
            # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
            # if existing:
            #     existing = (Path(path) / existing)
            #     if existing.is_file():
            #         existing = True
            #     elif existing.is_dir():
            #         existing = any(existing.glob('*.*'))
            output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
                                  extractor, icons.get(extractor, "?"))
        if extractor == "wget":
            # warc isn't technically its own extractor, so we have to add it after wget

            # get from db (faster but less truthful)
            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
            # get from filesystem (slower but more accurate)
            # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
            output += format_html(output_template, path, canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))

        for extractor, _ in EXTRACTORS:
            if extractor not in exclude:
                existing = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # Check filesystem to see if anything is actually present (too slow, needs optimization/caching)
                # if existing:
                #     existing = (Path(path) / existing)
                #     if existing.is_file():
                #         existing = True
                #     elif existing.is_dir():
                #         existing = any(existing.glob('*.*'))
                output += format_html(output_template, path, canon[f"{extractor}_path"], str(bool(existing)),
                                      extractor, icons.get(extractor, "?"))
            if extractor == "wget":
                # warc isn't technically its own extractor, so we have to add it after wget

                # get from db (faster but less truthful)
                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # get from filesystem (slower but more accurate)
                # exists = list((Path(path) / canon["warc_path"]).glob("*.warc.gz"))
                output += format_html(output_template, 'warc/', canon["warc_path"], str(bool(exists)), "warc", icons.get("warc", "?"))
        if extractor == "archive_org":
            # The check for archive_org is different, so it has to be handled separately

            if extractor == "archive_org":
                # The check for archive_org is different, so it has to be handled separately
                # get from db (faster)
                exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
                # get from filesystem (slower)
                # target_path = Path(path) / "archive.org.txt"
                # exists = target_path.exists()
                output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
                                                                                     "archive_org", icons.get("archive_org", "?"))

            # get from db (faster)
            exists = extractor_outputs[extractor] and extractor_outputs[extractor].status == 'succeeded' and extractor_outputs[extractor].output
            # get from filesystem (slower)
            # target_path = Path(path) / "archive.org.txt"
            # exists = target_path.exists()
            output += '<a href="{}" class="exists-{}" title="{}">{}</a> '.format(canon["archive_org_path"], str(exists),
                                                                                 "archive_org", icons.get("archive_org", "?"))
    result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
    # end = datetime.now()
    # print(((end - start).total_seconds()*1000) // 1, 'ms')
    return result

        result = format_html('<span class="files-icons" style="font-size: 1.1em; opacity: 0.8; min-width: 240px; display: inline-block">{}</span>', mark_safe(output))
        # end = datetime.now()
        # print(((end - start).total_seconds()*1000) // 1, 'ms')
        return result
    return cache.get_or_set(cache_key, calc_snapshot_icons)
    # return calc_snapshot_icons()

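The commented-out filesystem checks above trade speed for accuracy; a sketch of the slower variant for the WARC case (assumes path and canon as defined inside calc_snapshot_icons):

    from pathlib import Path

    # accurate but hits the disk once per index row, which is why the commit
    # keeps the faster DB-based check and caches the rendered result instead
    exists = bool(list((Path(path) / canon["warc_path"]).glob("*.warc.gz")))
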
@@ -15,7 +15,6 @@ from ..config import (
    VERSION,
    OUTPUT_DIR,
    FOOTER_INFO,
    GIT_SHA,
    DEPENDENCIES,
    JSON_INDEX_FILENAME,
    ARCHIVE_DIR_NAME,

@@ -30,7 +29,7 @@ MAIN_INDEX_HEADER = {
    'meta': {
        'project': 'ArchiveBox',
        'version': VERSION,
        'git_sha': GIT_SHA,
        'git_sha': VERSION,  # not used anymore, but kept for backwards compatibility
        'website': 'https://ArchiveBox.io',
        'docs': 'https://github.com/ArchiveBox/ArchiveBox/wiki',
        'source': 'https://github.com/ArchiveBox/ArchiveBox',

@@ -16,6 +16,7 @@ from typing import List, Dict, Any, Optional, Union

from dataclasses import dataclass, asdict, field, fields

from django.utils.functional import cached_property

from ..system import get_dir_size

@@ -133,7 +134,6 @@ class Link:
    updated: Optional[datetime] = None
    schema: str = 'Link'


    def __str__(self) -> str:
        return f'[{self.timestamp}] {self.url} "{self.title}"'

@@ -190,6 +190,7 @@ class Link:
        }
        if extended:
            info.update({
                'snapshot_id': self.snapshot_id,
                'link_dir': self.link_dir,
                'archive_path': self.archive_path,

@@ -201,6 +202,9 @@ class Link:
                'basename': self.basename,
                'extension': self.extension,
                'is_static': self.is_static,

                'tags_str': self.tags,  # only used to render static index in index/html.py, remove if no longer needed there
                'icons': None,          # only used to render static index in index/html.py, remove if no longer needed there

                'bookmarked_date': self.bookmarked_date,
                'updated_date': self.updated_date,

@@ -255,6 +259,11 @@ class Link:
        return to_csv(self, cols=cols or self.field_names(), separator=separator, ljust=ljust)

    @cached_property
    def snapshot_id(self):
        from core.models import Snapshot
        return str(Snapshot.objects.only('id').get(url=self.url).id)

    @classmethod
    def field_names(cls):
        return [f.name for f in fields(cls)]

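cached_property memoizes per instance, so snapshot_id costs at most one query per Link object no matter how many times the index templates read it; a sketch (assuming an existing Link named link whose URL is already in the DB):

    first = link.snapshot_id   # one SELECT, result stored on the instance
    second = link.snapshot_id  # no query, served from the memoized attribute
    assert first == second
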
@@ -7,7 +7,7 @@ from django.db.models import QuerySet
from django.db import transaction

from .schema import Link
from ..util import enforce_types
from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR


@@ -23,13 +23,15 @@ def parse_sql_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
    )

@enforce_types
def remove_from_sql_main_index(snapshots: QuerySet, out_dir: Path=OUTPUT_DIR) -> None:
    with transaction.atomic():
        snapshots.delete()
def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
    if atomic:
        with transaction.atomic():
            return snapshots.delete()
    return snapshots.delete()

@enforce_types
def write_link_to_sql_index(link: Link):
    from core.models import Snapshot
    from core.models import Snapshot, ArchiveResult
    info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
    tags = info.pop("tags")
    if tags is None:

@@ -41,36 +43,74 @@ def write_link_to_sql_index(link: Link):
    while Snapshot.objects.filter(timestamp=info["timestamp"]).exists():
        info["timestamp"] = str(float(info["timestamp"]) + 1.0)

    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
    snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
    snapshot.save_tags(tags)

    for extractor, entries in link.history.items():
        for entry in entries:
            if isinstance(entry, dict):
                result, _ = ArchiveResult.objects.get_or_create(
                    snapshot_id=snapshot.id,
                    extractor=extractor,
                    start_ts=parse_date(entry['start_ts']),
                    defaults={
                        'end_ts': parse_date(entry['end_ts']),
                        'cmd': entry['cmd'],
                        'output': entry['output'],
                        'cmd_version': entry.get('cmd_version') or 'unknown',
                        'pwd': entry['pwd'],
                        'status': entry['status'],
                    }
                )
            else:
                result, _ = ArchiveResult.objects.update_or_create(
                    snapshot_id=snapshot.id,
                    extractor=extractor,
                    start_ts=parse_date(entry.start_ts),
                    defaults={
                        'end_ts': parse_date(entry.end_ts),
                        'cmd': entry.cmd,
                        'output': entry.output,
                        'cmd_version': entry.cmd_version or 'unknown',
                        'pwd': entry.pwd,
                        'status': entry.status,
                    }
                )

    return snapshot


@enforce_types
def write_sql_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
    with transaction.atomic():
        for link in links:
            write_link_to_sql_index(link)
    for link in links:
        # with transaction.atomic():
        #     write_link_to_sql_index(link)
        write_link_to_sql_index(link)


@enforce_types
def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
    from core.models import Snapshot

    with transaction.atomic():
        try:
            snap = Snapshot.objects.get(url=link.url)
        except Snapshot.DoesNotExist:
            snap = write_link_to_sql_index(link)
        snap.title = link.title
    # with transaction.atomic():
    #     try:
    #         snap = Snapshot.objects.get(url=link.url)
    #     except Snapshot.DoesNotExist:
    #         snap = write_link_to_sql_index(link)
    #     snap.title = link.title
    try:
        snap = Snapshot.objects.get(url=link.url)
    except Snapshot.DoesNotExist:
        snap = write_link_to_sql_index(link)
    snap.title = link.title

        tag_set = (
            set(tag.strip() for tag in (link.tags or '').split(','))
        )
        tag_list = list(tag_set) or []
    tag_set = (
        set(tag.strip() for tag in (link.tags or '').split(','))
    )
    tag_list = list(tag_set) or []

        snap.save()
        snap.save_tags(tag_list)
    snap.save()
    snap.save_tags(tag_list)

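A note on the two ORM calls in the history import above: both look up by (snapshot_id, extractor, start_ts), but on a hit, get_or_create returns the existing row untouched while update_or_create rewrites it with the values in defaults. A minimal sketch of that difference, assuming the row already exists (some_ts is a placeholder datetime):

    row, created = ArchiveResult.objects.get_or_create(
        snapshot_id=snapshot.id, extractor='wget', start_ts=some_ts,
        defaults={'status': 'succeeded'},
    )
    # created is False and row.status is unchanged

    row, created = ArchiveResult.objects.update_or_create(
        snapshot_id=snapshot.id, extractor='wget', start_ts=some_ts,
        defaults={'status': 'failed'},
    )
    # the existing row's status is overwritten with 'failed'
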
@@ -3,6 +3,7 @@ __package__ = 'archivebox'
import re
import os
import sys
import stat
import time
import argparse
from math import log

@@ -11,18 +12,21 @@ from pathlib import Path

from datetime import datetime
from dataclasses import dataclass
from typing import Optional, List, Dict, Union, IO, TYPE_CHECKING
from typing import Any, Optional, List, Dict, Union, IO, TYPE_CHECKING

if TYPE_CHECKING:
    from .index.schema import Link, ArchiveResult

from .system import get_dir_size
from .util import enforce_types
from .config import (
    ConfigDict,
    OUTPUT_DIR,
    PYTHON_ENCODING,
    VERSION,
    ANSI,
    IS_TTY,
    IN_DOCKER,
    TERM_WIDTH,
    SHOW_PROGRESS,
    SOURCES_DIR_NAME,

@@ -50,6 +54,37 @@ class RuntimeStats:
_LAST_RUN_STATS = RuntimeStats()


def debug_dict_summary(obj: Dict[Any, Any]) -> None:
    stderr(' '.join(f'{key}={str(val).ljust(6)}' for key, val in obj.items()))


def get_fd_info(fd) -> Dict[str, Any]:
    NAME = fd.name[1:-1]
    FILENO = fd.fileno()
    MODE = os.fstat(FILENO).st_mode
    IS_TTY = hasattr(fd, 'isatty') and fd.isatty()
    IS_PIPE = stat.S_ISFIFO(MODE)
    IS_FILE = stat.S_ISREG(MODE)
    IS_TERMINAL = not (IS_PIPE or IS_FILE)
    IS_LINE_BUFFERED = fd.line_buffering
    IS_READABLE = fd.readable()
    return {
        'NAME': NAME, 'FILENO': FILENO, 'MODE': MODE,
        'IS_TTY': IS_TTY, 'IS_PIPE': IS_PIPE, 'IS_FILE': IS_FILE,
        'IS_TERMINAL': IS_TERMINAL, 'IS_LINE_BUFFERED': IS_LINE_BUFFERED,
        'IS_READABLE': IS_READABLE,
    }


# # Log debug information about stdin, stdout, and stderr
# sys.stdout.write('[>&1] this is python stdout\n')
# sys.stderr.write('[>&2] this is python stderr\n')

# debug_dict_summary(get_fd_info(sys.stdin))
# debug_dict_summary(get_fd_info(sys.stdout))
# debug_dict_summary(get_fd_info(sys.stderr))



class SmartFormatter(argparse.HelpFormatter):
    """Patched formatter that prints newlines in argparse help strings"""

@@ -62,22 +97,40 @@ class SmartFormatter(argparse.HelpFormatter):
def reject_stdin(caller: str, stdin: Optional[IO]=sys.stdin) -> None:
    """Tell the user they passed stdin to a command that doesn't accept it"""

    if stdin and not stdin.isatty():
        stdin_raw_text = stdin.read().strip()
    if not stdin:
        return None

    if IN_DOCKER:
        # when TTY is disabled in docker we can't tell if stdin is being piped in or not
        # if we try to read stdin when it's not piped we will hang indefinitely waiting for it
        return None

    if not stdin.isatty():
        # stderr('READING STDIN TO REJECT...')
        stdin_raw_text = stdin.read()
        if stdin_raw_text:
            # stderr('GOT STDIN!', len(stdin_str))
            stderr(f'[X] The "{caller}" command does not accept stdin.', color='red')
            stderr(f'    Run archivebox "{caller} --help" to see usage and examples.')
            stderr()
            raise SystemExit(1)
    return None


def accept_stdin(stdin: Optional[IO]=sys.stdin) -> Optional[str]:
    """accept any standard input and return it as a string or None"""

    if not stdin:
        return None
    elif stdin and not stdin.isatty():
        stdin_str = stdin.read().strip()
        return stdin_str or None

    if not stdin.isatty():
        # stderr('READING STDIN TO ACCEPT...')
        stdin_str = stdin.read()

        if stdin_str:
            # stderr('GOT STDIN...', len(stdin_str))
            return stdin_str

    return None


@@ -174,7 +227,6 @@ def progress_bar(seconds: int, prefix: str='') -> None:


def log_cli_command(subcommand: str, subcommand_args: List[str], stdin: Optional[str], pwd: str):
    from .config import VERSION, ANSI
    cmd = ' '.join(('archivebox', subcommand, *subcommand_args))
    stderr('{black}[i] [{now}] ArchiveBox v{VERSION}: {cmd}{reset}'.format(
        now=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),

@@ -233,11 +285,11 @@ def log_indexing_process_finished():

def log_indexing_started(out_path: str):
    if IS_TTY:
        sys.stdout.write(f'    > {out_path}')
        sys.stdout.write(f'    > ./{Path(out_path).relative_to(OUTPUT_DIR)}')


def log_indexing_finished(out_path: str):
    print(f'\r    √ {out_path}')
    print(f'\r    √ ./{Path(out_path).relative_to(OUTPUT_DIR)}')


### Archiving Stage

@@ -272,8 +324,6 @@ def log_archiving_paused(num_links: int, idx: int, timestamp: str):
        total=num_links,
    ))
    print()
    print('    {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
    print('        archivebox server  # then visit http://127.0.0.1:8000')
    print('    Continue archiving where you left off by running:')
    print('        archivebox update --resume={}'.format(timestamp))

@@ -331,6 +381,9 @@ def log_link_archiving_finished(link: "Link", link_dir: str, is_new: bool, stats
    else:
        _LAST_RUN_STATS.succeeded += 1

    size = get_dir_size(link_dir)
    print('        {black}{} files ({}){reset}'.format(size[2], printable_filesize(size[0]), **ANSI))


def log_archive_method_started(method: str):
    print('      > {}'.format(method))

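The IN_DOCKER early-return above matters because reading a non-TTY stdin that has no pipe attached blocks forever; a standalone sketch of the same guard (reading IN_DOCKER from an environment variable here is an assumption, standing in for the real config flag):

    import os
    import sys

    IN_DOCKER = os.environ.get('IN_DOCKER', '') in ('1', 'true', 'True')

    def read_stdin_safely():
        if IN_DOCKER:
            # without a TTY we can't distinguish "piped input" from "no input",
            # and stdin.read() on the latter would hang indefinitely
            return None
        if not sys.stdin.isatty():
            return sys.stdin.read() or None
        return None
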
@ -67,6 +67,7 @@ from .config import (
|
|||
ConfigDict,
|
||||
ANSI,
|
||||
IS_TTY,
|
||||
DEBUG,
|
||||
IN_DOCKER,
|
||||
USER,
|
||||
ARCHIVEBOX_BINARY,
|
||||
|
@ -76,6 +77,7 @@ from .config import (
|
|||
ARCHIVE_DIR,
|
||||
LOGS_DIR,
|
||||
CONFIG_FILE,
|
||||
CONFIG_FILENAME,
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
|
@ -84,6 +86,7 @@ from .config import (
|
|||
SQL_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
SEARCH_BACKEND_ENGINE,
|
||||
check_dependencies,
|
||||
check_data_folder,
|
||||
write_config_file,
|
||||
|
@ -125,14 +128,19 @@ ALLOWED_IN_OUTPUT_DIR = {
|
|||
'node_modules',
|
||||
'package-lock.json',
|
||||
'static',
|
||||
'sonic',
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f'{SQL_INDEX_FILENAME}-wal',
|
||||
f'{SQL_INDEX_FILENAME}-shm',
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f'{CONFIG_FILENAME}.bak',
|
||||
}
|
||||
|
||||
@enforce_types
|
||||
|
@ -214,9 +222,23 @@ def version(quiet: bool=False,
|
|||
if quiet:
|
||||
print(VERSION)
|
||||
else:
|
||||
# ArchiveBox v0.5.6
|
||||
# Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
|
||||
print('ArchiveBox v{}'.format(VERSION))
|
||||
p = platform.uname()
|
||||
print(sys.implementation.name.title(), p.system, platform.platform(), p.machine, '(in Docker)' if IN_DOCKER else '(not in Docker)')
|
||||
print(
|
||||
sys.implementation.name.title(),
|
||||
p.system,
|
||||
platform.platform(),
|
||||
p.machine,
|
||||
)
|
||||
print(
|
||||
f'IN_DOCKER={IN_DOCKER}',
|
||||
f'DEBUG={DEBUG}',
|
||||
f'IS_TTY={IS_TTY}',
|
||||
f'TZ={os.environ.get("TZ", "UTC")}',
|
||||
f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
|
||||
)
|
||||
print()
|
||||
|
||||
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
|
||||
|
@ -261,7 +283,7 @@ def run(subcommand: str,
|
|||
|
||||
|
||||
@enforce_types
|
||||
def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
|
||||
def init(force: bool=False, quick: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
|
||||
"""Initialize a new ArchiveBox collection in the current directory"""
|
||||
|
||||
from core.models import Snapshot
|
||||
|
@ -276,13 +298,12 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
existing_index = (Path(out_dir) / SQL_INDEX_FILENAME).exists()
|
||||
|
||||
if is_empty and not existing_index:
|
||||
print('{green}[+] Initializing a new ArchiveBox collection in this folder...{reset}'.format(**ANSI))
|
||||
print(f' {out_dir}')
|
||||
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
print('{green}[+] Initializing a new ArchiveBox v{} collection...{reset}'.format(VERSION, **ANSI))
|
||||
print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
elif existing_index:
|
||||
print('{green}[*] Updating existing ArchiveBox collection in this folder...{reset}'.format(**ANSI))
|
||||
print(f' {out_dir}')
|
||||
print('{green}------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
# TODO: properly detect and print the existing version in current index as well
|
||||
print('{green}[^] Verifying and updating existing ArchiveBox collection to v{}...{reset}'.format(VERSION, **ANSI))
|
||||
print('{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
|
||||
else:
|
||||
if force:
|
||||
stderr('[!] This folder appears to already have files in it, but no index.sqlite3 is present.', color='lightyellow')
|
||||
|
@ -303,30 +324,25 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
else:
|
||||
print('\n{green}[+] Building archive folder structure...{reset}'.format(**ANSI))
|
||||
|
||||
print(f' + ./{ARCHIVE_DIR.relative_to(OUTPUT_DIR)}, ./{SOURCES_DIR.relative_to(OUTPUT_DIR)}, ./{LOGS_DIR.relative_to(OUTPUT_DIR)}...')
|
||||
Path(SOURCES_DIR).mkdir(exist_ok=True)
|
||||
print(f' √ {SOURCES_DIR}')
|
||||
|
||||
Path(ARCHIVE_DIR).mkdir(exist_ok=True)
|
||||
print(f' √ {ARCHIVE_DIR}')
|
||||
|
||||
Path(LOGS_DIR).mkdir(exist_ok=True)
|
||||
print(f' √ {LOGS_DIR}')
|
||||
|
||||
print(f' + ./{CONFIG_FILE.relative_to(OUTPUT_DIR)}...')
|
||||
write_config_file({}, out_dir=out_dir)
|
||||
print(f' √ {CONFIG_FILE}')
|
||||
|
||||
if (Path(out_dir) / SQL_INDEX_FILENAME).exists():
|
||||
print('\n{green}[*] Verifying main SQL index and running migrations...{reset}'.format(**ANSI))
|
||||
print('\n{green}[*] Verifying main SQL index and running any migrations needed...{reset}'.format(**ANSI))
|
||||
else:
|
||||
print('\n{green}[+] Building main SQL index and running migrations...{reset}'.format(**ANSI))
|
||||
print('\n{green}[+] Building main SQL index and running initial migrations...{reset}'.format(**ANSI))
|
||||
|
||||
DATABASE_FILE = Path(out_dir) / SQL_INDEX_FILENAME
|
||||
print(f' √ {DATABASE_FILE}')
|
||||
print()
|
||||
for migration_line in apply_migrations(out_dir):
|
||||
print(f' {migration_line}')
|
||||
|
||||
|
||||
assert DATABASE_FILE.exists()
|
||||
print()
|
||||
print(f' √ ./{DATABASE_FILE.relative_to(OUTPUT_DIR)}')
|
||||
|
||||
# from django.contrib.auth.models import User
|
||||
# if IS_TTY and not User.objects.filter(is_superuser=True).exists():
|
||||
|
@ -334,7 +350,7 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
|
|||
# call_command("createsuperuser", interactive=True)
|
||||
|
||||
print()
|
||||
print('{green}[*] Collecting links from any existing indexes and archive folders...{reset}'.format(**ANSI))
|
||||
print('{green}[*] Checking links from indexes and archive folders (safe to Ctrl+C)...{reset}'.format(**ANSI))
|
||||
|
||||
all_links = Snapshot.objects.none()
|
||||
 pending_links: Dict[str, Link] = {}

@@ -343,63 +359,77 @@ def init(force: bool=False, out_dir: Path=OUTPUT_DIR) -> None:
         all_links = load_main_index(out_dir=out_dir, warn=False)
         print(' √ Loaded {} links from existing main index.'.format(all_links.count()))

-    # Links in data folders that dont match their timestamp
-    fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
-    if fixed:
-        print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
-    if cant_fix:
-        print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))
+    if quick:
+        print(' > Skipping full snapshot directory check (quick mode)')
+    else:
+        try:
+            # Links in data folders that dont match their timestamp
+            fixed, cant_fix = fix_invalid_folder_locations(out_dir=out_dir)
+            if fixed:
+                print(' {lightyellow}√ Fixed {} data directory locations that didn\'t match their link timestamps.{reset}'.format(len(fixed), **ANSI))
+            if cant_fix:
+                print(' {lightyellow}! Could not fix {} data directory locations due to conflicts with existing folders.{reset}'.format(len(cant_fix), **ANSI))

-    # Links in JSON index but not in main index
-    orphaned_json_links = {
-        link.url: link
-        for link in parse_json_main_index(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_json_links:
-        pending_links.update(orphaned_json_links)
-        print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))
+            # Links in JSON index but not in main index
+            orphaned_json_links = {
+                link.url: link
+                for link in parse_json_main_index(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_json_links:
+                pending_links.update(orphaned_json_links)
+                print(' {lightyellow}√ Added {} orphaned links from existing JSON index...{reset}'.format(len(orphaned_json_links), **ANSI))

-    # Links in data dir indexes but not in main index
-    orphaned_data_dir_links = {
-        link.url: link
-        for link in parse_json_links_details(out_dir)
-        if not all_links.filter(url=link.url).exists()
-    }
-    if orphaned_data_dir_links:
-        pending_links.update(orphaned_data_dir_links)
-        print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))
+            # Links in data dir indexes but not in main index
+            orphaned_data_dir_links = {
+                link.url: link
+                for link in parse_json_links_details(out_dir)
+                if not all_links.filter(url=link.url).exists()
+            }
+            if orphaned_data_dir_links:
+                pending_links.update(orphaned_data_dir_links)
+                print(' {lightyellow}√ Added {} orphaned links from existing archive directories.{reset}'.format(len(orphaned_data_dir_links), **ANSI))

-    # Links in invalid/duplicate data dirs
-    invalid_folders = {
-        folder: link
-        for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
-    }
-    if invalid_folders:
-        print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
-        print(' X ' + '\n X '.join(f'{folder} {link}' for folder, link in invalid_folders.items()))
-        print()
-        print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
-        print(' archivebox status')
-        print(' archivebox list --status=invalid')
+            # Links in invalid/duplicate data dirs
+            invalid_folders = {
+                folder: link
+                for folder, link in get_invalid_folders(all_links, out_dir=out_dir).items()
+            }
+            if invalid_folders:
+                print(' {lightyellow}! Skipped adding {} invalid link data directories.{reset}'.format(len(invalid_folders), **ANSI))
+                print(' X ' + '\n X '.join(f'./{Path(folder).relative_to(OUTPUT_DIR)} {link}' for folder, link in invalid_folders.items()))
+                print()
+                print(' {lightred}Hint:{reset} For more information about the link data directories that were skipped, run:'.format(**ANSI))
+                print(' archivebox status')
+                print(' archivebox list --status=invalid')
+
+        except (KeyboardInterrupt, SystemExit):
+            stderr()
+            stderr('[x] Stopped checking archive directories due to Ctrl-C/SIGTERM', color='red')
+            stderr(' Your archive data is safe, but you should re-run `archivebox init` to finish the process later.')
+            stderr()
+            stderr(' {lightred}Hint:{reset} In the future you can run a quick init without checking dirs like so:'.format(**ANSI))
+            stderr(' archivebox init --quick')
+            raise SystemExit(1)

     write_main_index(list(pending_links.values()), out_dir=out_dir)

-    print('\n{green}------------------------------------------------------------------{reset}'.format(**ANSI))
+    print('\n{green}----------------------------------------------------------------------{reset}'.format(**ANSI))
     if existing_index:
         print('{green}[√] Done. Verified and updated the existing ArchiveBox collection.{reset}'.format(**ANSI))
     else:
-        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links), **ANSI))
-    print()
-    print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
-    print(' archivebox server # then visit http://127.0.0.1:8000')
-    print()
-    print(' To add new links, you can run:')
-    print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
-    print()
-    print(' For more usage and examples, run:')
-    print(' archivebox help')
+        print('{green}[√] Done. A new ArchiveBox collection was initialized ({} links).{reset}'.format(len(all_links) + len(pending_links), **ANSI))
+
+    if Snapshot.objects.count() < 25:     # hide the hints for experienced users
+        print()
+        print(' {lightred}Hint:{reset} To view your archive index, run:'.format(**ANSI))
+        print(' archivebox server # then visit http://127.0.0.1:8000')
+        print()
+        print(' To add new links, you can run:')
+        print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
+        print()
+        print(' For more usage and examples, run:')
+        print(' archivebox help')

     json_index = Path(out_dir) / JSON_INDEX_FILENAME
     html_index = Path(out_dir) / HTML_INDEX_FILENAME
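The net effect of the hunk above: `archivebox init` grows a `--quick` mode that skips the potentially slow snapshot-directory verification pass, and the full pass becomes interruptible with a clean hint. A minimal sketch of exercising the new flag, written in the same subprocess style the test suite later in this diff uses (the assertions and tmp_path fixture are illustrative, not part of this commit):

    import subprocess

    def test_quick_init_skips_dir_check(tmp_path):
        # full init: loads the index and verifies every snapshot data directory
        full = subprocess.run(['archivebox', 'init'], cwd=tmp_path, capture_output=True)
        assert b'Initializing a new ArchiveBox' in full.stdout

        # quick re-init: skips the folder-location / orphan / invalid-dir checks entirely
        quick = subprocess.run(['archivebox', 'init', '--quick'], cwd=tmp_path, capture_output=True)
        assert b'Skipping full snapshot directory check (quick mode)' in quick.stdout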
@@ -531,6 +561,7 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):

 @enforce_types
 def add(urls: Union[str, List[str]],
+        tag: str='',
         depth: int=0,
         update_all: bool=not ONLY_NEW,
         index_only: bool=False,

@@ -540,6 +571,8 @@ def add(urls: Union[str, List[str]],
         out_dir: Path=OUTPUT_DIR) -> List[Link]:
     """Add a new URL or list of URLs to your archive"""

+    from core.models import Tag
+
     assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'

     extractors = extractors.split(",") if extractors else []
@@ -572,26 +605,48 @@ def add(urls: Union[str, List[str]],
             new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)

     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())

     new_links = dedupe_links(all_links, imported_links)

     write_main_index(links=new_links, out_dir=out_dir)
     all_links = load_main_index(out_dir=out_dir)

     if index_only:
-        return all_links
+        # mock archive all the links using the fake index_only extractor method in order to update their state
+        if overwrite:
+            archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir)
+        else:
+            archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir)
+    else:
+        # fully run the archive extractor methods for each link
+        archive_kwargs = {
+            "out_dir": out_dir,
+        }
+        if extractors:
+            archive_kwargs["methods"] = extractors
+
+        if update_all:
+            archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+        elif overwrite:
+            archive_links(imported_links, overwrite=True, **archive_kwargs)
+        elif new_links:
+            archive_links(new_links, overwrite=False, **archive_kwargs)
+
+    # add any tags to imported links
+    tags = [
+        Tag.objects.get_or_create(name=name.strip())[0]
+        for name in tag.split(',')
+        if name.strip()
+    ]
+    if tags:
+        for link in imported_links:
+            snapshot = link.as_snapshot()
+            snapshot.tags.add(*tags)
+            snapshot.tags_str(nocache=True)
+            snapshot.save()
+        # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')

-    # Run the archive methods for each link
-    archive_kwargs = {
-        "out_dir": out_dir,
-    }
-    if extractors:
-        archive_kwargs["methods"] = extractors
-    if update_all:
-        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
-    elif overwrite:
-        archive_links(imported_links, overwrite=True, **archive_kwargs)
-    elif new_links:
-        archive_links(new_links, overwrite=False, **archive_kwargs)

     return all_links
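The new `tag` parameter is a single comma-separated string; each name is normalized through `Tag.objects.get_or_create`, so repeated imports reuse existing Tag rows, and the tags are attached to every imported link's Snapshot. A minimal sketch of calling the Python API directly (URL and tag names are hypothetical; assumes an initialized collection as the working directory):

    from archivebox.main import add

    # one comma-separated string, not a list; blank entries such as a
    # trailing comma are dropped by the `if name.strip()` filter above
    add('https://example.com', tag='news,python', index_only=True)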
@@ -811,11 +866,15 @@ def list_links(snapshots: Optional[QuerySet]=None,
         all_snapshots = load_main_index(out_dir=out_dir)

     if after is not None:
-        all_snapshots = all_snapshots.filter(timestamp__lt=after)
+        all_snapshots = all_snapshots.filter(timestamp__gte=after)
     if before is not None:
-        all_snapshots = all_snapshots.filter(timestamp__gt=before)
+        all_snapshots = all_snapshots.filter(timestamp__lt=before)
     if filter_patterns:
         all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
+
+    if not all_snapshots:
+        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
+
     return all_snapshots

 @enforce_types
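This hunk fixes inverted range lookups: previously `after` kept snapshots with `timestamp__lt` (strictly older) and `before` kept strictly newer ones, which is exactly backwards. The corrected semantics are inclusive `>=` for `after` and exclusive `<` for `before`. A tiny sketch of the intended behavior (hypothetical timestamps):

    # given snapshots at timestamps 1000, 2000, 3000:
    # --after 2000  -> timestamp >= 2000 -> keeps 2000 and 3000
    # --before 2000 -> timestamp <  2000 -> keeps only 1000
    assert [ts for ts in (1000, 2000, 3000) if ts >= 2000] == [2000, 3000]
    assert [ts for ts in (1000, 2000, 3000) if ts < 2000] == [1000]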
@@ -1061,6 +1120,7 @@ def server(runserver_args: Optional[List[str]]=None,
            reload: bool=False,
            debug: bool=False,
            init: bool=False,
+           quick_init: bool=False,
            createsuperuser: bool=False,
            out_dir: Path=OUTPUT_DIR) -> None:
     """Run the ArchiveBox HTTP server"""

@@ -1069,9 +1129,14 @@ def server(runserver_args: Optional[List[str]]=None,

     if init:
         run_subcommand('init', stdin=None, pwd=out_dir)
+        print()
+    elif quick_init:
+        run_subcommand('init', subcommand_args=['--quick'], stdin=None, pwd=out_dir)
+        print()

     if createsuperuser:
         run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
+        print()

     # setup config for django runserver
     from . import config

@@ -1083,12 +1148,9 @@ def server(runserver_args: Optional[List[str]]=None,
     from django.core.management import call_command
     from django.contrib.auth.models import User

-    admin_user = User.objects.filter(is_superuser=True).order_by('date_joined').only('username').last()
-
     print('{green}[+] Starting ArchiveBox webserver...{reset}'.format(**ANSI))
-    if admin_user:
-        hint('The admin username is{lightblue} {}{reset}\n'.format(admin_user.username, **ANSI))
-    else:
+    print(' > Logging errors to ./logs/errors.log')
+    if not User.objects.filter(is_superuser=True).exists():
         print('{lightyellow}[!] No admin users exist yet, you will not be able to edit links in the UI.{reset}'.format(**ANSI))
         print()
         print(' To create an admin user, run:')

@@ -1106,7 +1168,6 @@ def server(runserver_args: Optional[List[str]]=None,
         config.SHOW_PROGRESS = False
         config.DEBUG = config.DEBUG or debug
-

         call_command("runserver", *runserver_args)
@@ -68,7 +68,6 @@ def parse_links_memory(urls: List[str], root_url: Optional[str]=None):
     """
     parse a list of URLS without touching the filesystem
     """
-    check_url_parsing_invariants()

     timer = TimedProgress(TIMEOUT * 4)
     #urls = list(map(lambda x: x + "\n", urls))

@@ -89,8 +88,6 @@ def parse_links(source_file: str, root_url: Optional[str]=None) -> Tuple[List[Link], str]:
     RSS feed, bookmarks export, or text file
     """
-
-    check_url_parsing_invariants()

     timer = TimedProgress(TIMEOUT * 4)
     with open(source_file, 'r', encoding='utf-8') as file:
         links, parser = run_parser_functions(file, timer, root_url=root_url)
@@ -173,31 +170,48 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt') -> str:
     return source_path


-def check_url_parsing_invariants() -> None:
-    """Check that plain text regex URL parsing works as expected"""
-
-    # this is last-line-of-defense to make sure the URL_REGEX isn't
-    # misbehaving, as the consequences could be disastrous and lead to many
-    # incorrect/badly parsed links being added to the archive
-
-    test_urls = '''
-    https://example1.com/what/is/happening.html?what=1#how-about-this=1
-    https://example2.com/what/is/happening/?what=1#how-about-this=1
-    HTtpS://example3.com/what/is/happening/?what=1#how-about-this=1f
-    https://example4.com/what/is/happening.html
-    https://example5.com/
-    https://example6.com
-
-    <test>http://example7.com</test>
-    [https://example8.com/what/is/this.php?what=1]
-    [and http://example9.com?what=1&other=3#and-thing=2]
-    <what>https://example10.com#and-thing=2 "</about>
-    abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def
-    sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi
-    example13.bada
-    and example14.badb
-    <or>htt://example15.badc</that>
-    '''
-    # print('\n'.join(re.findall(URL_REGEX, test_urls)))
-    assert len(re.findall(URL_REGEX, test_urls)) == 12
+# Check that plain text regex URL parsing works as expected
+# this is last-line-of-defense to make sure the URL_REGEX isn't
+# misbehaving due to some OS-level or environment level quirks (e.g. bad regex lib)
+# the consequences of bad URL parsing could be disastrous and lead to many
+# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
+_test_url_strs = {
+    'example.com': 0,
+    '/example.com': 0,
+    '//example.com': 0,
+    ':/example.com': 0,
+    '://example.com': 0,
+    'htt://example8.com': 0,
+    '/htt://example.com': 0,
+    'https://example': 1,
+    'https://localhost/2345': 1,
+    'https://localhost:1234/123': 1,
+    '://': 0,
+    'https://': 0,
+    'http://': 0,
+    'ftp://': 0,
+    'ftp://example.com': 0,
+    'https://example.com': 1,
+    'https://example.com/': 1,
+    'https://a.example.com': 1,
+    'https://a.example.com/': 1,
+    'https://a.example.com/what/is/happening.html': 1,
+    'https://a.example.com/what/ís/happening.html': 1,
+    'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
+    'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
+    'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
+    'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
+    'https://example.com?what=1#how-about-this=1&2%20baf': 1,
+    '<test>http://example7.com</test>': 1,
+    '[https://example8.com/what/is/this.php?what=1]': 1,
+    '[and http://example9.com?what=1&other=3#and-thing=2]': 1,
+    '<what>https://example10.com#and-thing=2 "</about>': 1,
+    'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
+    'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
+    '<or>http://examplehttp://15.badc</that>': 2,
+    'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
+    '[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
+}
+for url_str, num_urls in _test_url_strs.items():
+    assert len(re.findall(URL_REGEX, url_str)) == num_urls, (
+        f'{url_str} does not contain {num_urls} urls')
@@ -16,7 +16,7 @@ def get_file_result_content(res, extra_path, use_pwd=False):
     if extra_path:
         fpath = f'{fpath}/{extra_path}'

-    with open(fpath, 'r') as file:
+    with open(fpath, 'r', encoding='utf-8') as file:
         data = file.read()
     if data:
         return [data]
@@ -37,10 +37,11 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
     """Safe atomic write to filesystem by writing to temp file + atomic rename"""

     mode = 'wb+' if isinstance(contents, bytes) else 'w'
+    encoding = None if isinstance(contents, bytes) else 'utf-8'  # enforce utf-8 on all text writes

     # print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
     try:
-        with lib_atomic_write(path, mode=mode, overwrite=overwrite) as f:
+        with lib_atomic_write(path, mode=mode, overwrite=overwrite, encoding=encoding) as f:
            if isinstance(contents, dict):
                dump(contents, f, indent=4, sort_keys=True, cls=ExtendedEncoder)
            elif isinstance(contents, (bytes, str)):
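For context, `lib_atomic_write` here is the `atomic_write` context manager from the third-party `atomicwrites` package, which forwards extra keyword arguments to the underlying `open()` call, so `encoding=None` for bytes and `'utf-8'` for text are both valid. A minimal standalone sketch of the same pattern (file name and contents are illustrative):

    from atomicwrites import atomic_write

    # text is written to a temp file first, then atomically renamed into place,
    # so readers never observe a half-written index file
    with atomic_write('index.json', mode='w', overwrite=True, encoding='utf-8') as f:
        f.write('{"links": []}')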
@@ -1 +0,0 @@
-actions_as_select
@@ -20,7 +20,7 @@
 <body class="{% if is_popup %}popup {% endif %}{% block bodyclass %}{% endblock %}"
   data-admin-utc-offset="{% now "Z" %}">

-<style nonce="{{nonce}}">
+<style>
     /* Loading Progress Bar */
     #progress {
         position: absolute;

@@ -89,7 +89,7 @@
 <a href="{% url 'admin:Add' %}">Add ➕</a> /
 <a href="{% url 'Home' %}">Snapshots</a> /
 <a href="/admin/core/tag/">Tags</a> /
 <a href="/admin/auth/user/">Users</a> /
 <a href="/admin/">Admin</a> /
 <a href="{% url 'Docs' %}">Docs</a>

 {% block welcome-msg %}

@@ -157,15 +157,15 @@
         function fix_actions() {
             var container = $('div.actions');

-            if (container.find('option').length < 10) {
-                container.find('label, button').hide();
+            if (container.find('select[name=action] option').length < 10) {
+                container.find('label:nth-child(1), button[value=0]').hide();

                 var buttons = $('<div></div>')
-                    .prependTo(container)
+                    .appendTo(container)
                     .css('display', 'inline')
                     .addClass('class', 'action-buttons');

-                container.find('option:gt(0)').reverse().each(function () {
+                container.find('select[name=action] option:gt(0)').reverse().each(function () {
                     const name = this.value
                     $('<button>')
                         .appendTo(buttons)
@@ -15,7 +15,7 @@
 {% endblock %}

 {% block body %}
-    <div style="max-width: 550px; margin: auto; float: none">
+    <div style="max-width: 1440px; margin: auto; float: none">
         <br/><br/>
         {% if stdout %}
             <h1>Add new URLs to your archive: results</h1>
@@ -38,7 +38,7 @@
 <div id="header">
     <div id="branding">
         <h1 id="site-name">
-            <a href="{% url 'public-index' %}" class="header-archivebox" title="Last updated: {{updated}}">
+            <a href="{% url 'public-index' %}" class="header-archivebox">
                 <img src="{% static 'archive.png' %}" alt="Logo" style="height: 30px"/>
                 ArchiveBox
             </a>
@@ -70,7 +70,7 @@
 <center>
     <small>
         Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a> version
-        <a href="https://github.com/ArchiveBox/ArchiveBox/releases" title="Releases">v{{VERSION}}</a>.
+        <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{VERSION}}" title="Releases">v{{VERSION}}</a>.
         <br/><br/>
         {{FOOTER_INFO}}
     </small>
@@ -10,7 +10,7 @@
 {% endif %}

 <a href="archive/{{link.timestamp}}/index.html" title="{{link.title|default:'Not yet archived...'}}">
-    <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'}}</span>
+    <span data-title-for="{{link.url}}" data-archived="{{link.is_archived}}">{{link.title|default:'Loading...'|truncatechars:128}}</span>
     {% if link.tags_str %}
         <span class="tags" style="float: right; border-radius: 5px; background-color: #bfdfff; padding: 2px 5px; margin-left: 4px; margin-top: 1px;">
             {% if link.tags_str != None %}

@@ -33,5 +33,5 @@
     {% endif %}
     </span>
 </td>
-<td style="text-align:left"><a href="{{link.url}}">{{link.url}}</a></td>
+<td style="text-align:left; word-wrap: anywhere;"><a href="{{link.url}}">{{link.url|truncatechars:128}}</a></td>
 </tr>
@@ -4,7 +4,7 @@
 <title>Archived Sites</title>
 <meta charset="utf-8" name="viewport" content="width=device-width, initial-scale=1">
 </head>
-<body data-status="{{status}}">
+<body>
 <table id="table-bookmarks">
     <thead>
         <tr class="thead-tr">
@@ -2,6 +2,11 @@
 {% load static %}

 {% block body %}
+    <style>
+        #table-bookmarks_info {
+            display: none;
+        }
+    </style>
     <div id="toolbar">
         <form id="changelist-search" action="{% url 'public-index' %}" method="get">
             <div>

@@ -21,7 +26,7 @@
     <thead>
         <tr>
             <th style="width: 100px;">Bookmarked</th>
-            <th style="width: 26vw;">Snapshot ({{object_list|length}})</th>
+            <th style="width: 26vw;">Snapshot ({{page_obj.paginator.count}})</th>
             <th style="width: 140px">Files</th>
             <th style="width: 16vw;whitespace:nowrap;overflow-x:hidden;">Original URL</th>
         </tr>
@@ -33,26 +38,26 @@
     </tbody>
 </table>
 <center>
+    <br/>
+    Showing {{ page_obj.start_index }}-{{ page_obj.end_index }} of {{ page_obj.paginator.count }} total
+    <br/>
     <span class="step-links">
         {% if page_obj.has_previous %}
             <a href="{% url 'public-index' %}?page=1">« first</a>
             <a href="{% url 'public-index' %}?page={{ page_obj.previous_page_number }}">previous</a>
         {% endif %}

         <span class="current">
-            Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}.
+            Page {{ page_obj.number }} of {{ page_obj.paginator.num_pages }}
         </span>

         {% if page_obj.has_next %}
             <a href="{% url 'public-index' %}?page={{ page_obj.next_page_number }}">next </a>
             <a href="{% url 'public-index' %}?page={{ page_obj.paginator.num_pages }}">last »</a>
         {% endif %}
     </span>
+    <br>
 </center>
@@ -279,7 +279,7 @@
 <div class="col-lg-8">
     <img src="favicon.ico" onerror="this.style.opacity=0" alt="Favicon">

-    {{title}}
+    {{title|safe}}

     <a href="#" class="header-toggle">▾</a>
     <br/>
@@ -335,20 +335,21 @@
     </div>
     <div class="col-lg-4">
         <div class="info-chunk">
-            <h5>🗃 Files</h5>
+            <h5>🗃 Snapshot ID: <a href="/admin/core/snapshot/{{snapshot_id}}/change/"><code style="color: rgba(255,255,255,0.6); font-weight: 200; font-size: 12px; background-color: #1a1a1a"><b>[{{timestamp}}]</b> <small>{{snapshot_id|truncatechars:24}}</small></code></a></h5>
             <a href="index.json" title="JSON summary of archived link.">JSON</a> |
             <a href="warc/" title="Any WARC archives for the page">WARC</a> |
             <a href="media/" title="Audio, Video, and Subtitle files.">Media</a> |
             <a href="git/" title="Any git repos at the url">Git</a> |
             <a href="favicon.ico" title="Any git repos at the url">Favicon</a> |
-            <a href="." title="Webserver-provided index of files directory.">See all...</a>
+            <a href="/admin/core/snapshot/?id__startswith={{snapshot_id}}" title="Go to the Snapshot admin to update, overwrite, or delete this Snapshot">Actions</a> |
+            <a href="/admin/core/snapshot/{{snapshot_id}}/change/" title="Edit this snapshot in the Admin UI">Admin</a> |
+            <a href="." title="Webserver-provided index of files directory.">See all files...</a><br/>
         </div>
     </div>
 </div>
 <div class="row header-bottom-frames">
     <div class="col-lg-2">
         <div class="card selected-card">
-            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+            <iframe class="card-img-top" src="{{singlefile_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
             <div class="card-body">
                 <a href="{{singlefile_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                     <p class="card-text"><code>./singlefile.html</code></p>
@@ -381,7 +382,7 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{archive_url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{archive_url}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./{{domain}}</code></p>

@@ -393,30 +394,30 @@
 {% if SAVE_ARCHIVE_DOT_ORG %}
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{archive_org_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{archive_org_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>🌐 web.archive.org/web/...</code></p>
             </a>
-            <a href="{{archive_org_path}}" target="preview"><h4 class="card-title">Archive.Org</h4></a>
+            <a href="{{archive_org_path}}" target="preview" id="archive_dot_org-btn"><h4 class="card-title">Archive.Org</h4></a>
         </div>
     </div>
 </div>
 {% endif %}
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{url}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>🌐 {{domain}}</code></p>
             </a>
-            <a href="{{url}}" target="preview"><h4 class="card-title">Original</h4></a>
+            <a href="{{url}}" target="preview" id="original-btn"><h4 class="card-title">Original</h4></a>
         </div>
     </div>
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{headers_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./headers.json</code></p>

@@ -427,7 +428,7 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{dom_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{dom_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./output.html</code></p>

@@ -438,7 +439,7 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{readability_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{readability_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./readability/content.html</code></p>

@@ -450,7 +451,7 @@
     <br/>
     <div class="col-lg-2">
         <div class="card">
-            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+            <iframe class="card-img-top" src="{{mercury_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
            <div class="card-body">
                <a href="{{mercury_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                    <p class="card-text"><code>./mercury/content.html</code></p>

@@ -461,7 +462,7 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{media_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{media_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./media/*.mp4</code></p>

@@ -472,7 +473,7 @@
 </div>
 <div class="col-lg-2">
     <div class="card">
-        <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-scripts allow-forms" scrolling="no"></iframe>
+        <iframe class="card-img-top" src="{{git_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no"></iframe>
         <div class="card-body">
             <a href="{{git_path}}" title="Open in new tab..." target="_blank" rel="noopener">
                 <p class="card-text"><code>./git/*.git</code></p>

@@ -484,7 +485,7 @@
     </div>
 </div>
 </header>
-<iframe sandbox="allow-same-origin allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>
+<iframe sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" class="full-page-iframe" src="{{singlefile_path}}" name="preview"></iframe>

 <script
     src="https://code.jquery.com/jquery-3.2.1.slim.min.js"
@@ -493,6 +494,16 @@
 <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.0.0-alpha.6/js/bootstrap.min.js" integrity="sha384-vBWWzlZJ8ea9aCX4pEW3rVHjgjt7zpkNpZk+02D9phzyeVkE+jo0ieGizqPLForn" crossorigin="anonymous"></script>

 <script>
+    function getPreviewTypeFromPath(link) {
+        if (link.id == 'original-btn') {
+            return 'original'
+        }
+        if (link.id == 'archive_dot_org-btn') {
+            return 'archive_dot_org'
+        }
+        return link.pathname.split('/').filter(a => a.length).slice(-1)[0].toLowerCase()
+    }
+
     // show selected file in iframe when preview card is clicked
     jQuery('.card').on('click', function(e) {
         jQuery('.selected-card').removeClass('selected-card')

@@ -502,11 +513,26 @@
         if (e.currentTarget.href.endsWith('.pdf')) {
             jQuery('.full-page-iframe')[0].removeAttribute('sandbox')
         } else {
-            jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms"
+            jQuery('.full-page-iframe')[0].sandbox = "allow-same-origin allow-scripts allow-forms allow-top-navigation-by-user-activation"
         }
+        window.location.hash = getPreviewTypeFromPath(e.currentTarget)
         return true
     })

+    // check URL for hash e.g. #git and load relevant preview
+    jQuery(document).ready(function() {
+        if (window.location.hash) {
+            for (const link of jQuery('a[target=preview]')) {
+                console.log(link.pathname)
+                if (getPreviewTypeFromPath(link) == window.location.hash.slice(1).toLowerCase()) {
+                    jQuery(link).closest('.card').click()
+                    jQuery(link).click()
+                    link.click()
+                }
+            }
+        }
+    })
+
     // un-sandbox iframes showing pdfs (required to display pdf viewer)
     jQuery('iframe').map(function() {
         if (this.src.endsWith('.pdf')) {
@@ -209,7 +209,7 @@
 <div class="header-top container-fluid">
     <div class="row nav">
         <div class="col-sm-2">
-            <a href="/" class="header-archivebox" title="Last updated: {{updated}}">
+            <a href="/" class="header-archivebox">
                 <img src="{% static 'archive.png' %}" alt="Logo"/>
                 ArchiveBox: Index
             </a>

@@ -243,7 +243,7 @@
 <center>
     <small>
         Archive created using <a href="https://github.com/ArchiveBox/ArchiveBox" title="Github">ArchiveBox</a>
-        version <a href="https://github.com/ArchiveBox/ArchiveBox/tree/v{{version}}" title="Git commit">v{{version}}</a> |
+        version <a href="https://github.com/ArchiveBox/ArchiveBox/releases/tag/v{{version}}" title="View source code and release info">v{{version}}</a> |
         Download index as <a href="index.json" title="JSON summary of archived links.">JSON</a>
         <br/><br/>
         {{FOOTER_INFO}}
@@ -42,7 +42,7 @@ header {
     background-color: #f5dd5d;
 }
 #stdout {
-    background-color: #ded;
+    background-color: #fbfbfb;
     padding: 10px 10px;
     border-radius: 4px;
     white-space: normal;
@@ -237,3 +237,40 @@ body.model-snapshot.change-list #content .object-tools {
     opacity: 0.1;
     filter: grayscale(100%);
 }
+
+
+#result_list tbody td.field-cmd_str pre,
+#result_list tbody td.field-output_str pre {
+    max-width: 22vw;
+    word-wrap: anywhere;
+    white-space: break-spaces;
+    max-height: 40px;
+    overflow: hidden;
+    margin: 2px;
+    background-color: rgba(0,0,0,0.05);
+    padding: 1px 4px 16px 8px;
+    border-radius: 4px;
+}
+
+#result_list tbody td.field-extractor {
+    font-weight: 800;
+    font-variant: small-caps;
+}
+
+#result_list tbody td.field-status {
+    font-variant: small-caps;
+}
+
+.inline-group .tabular td.original p {
+    margin-top: -33px;
+}
+
+tbody .output-link {
+    float: right;
+    margin-bottom: -25px;
+    margin-right: -3px;
+    margin-top: -4px;
+    opacity: 0.4;
+    box-shadow: 4px 4px 4px rgba(0,0,0,0.1);
+}
+tbody .output-link:hover {opacity: 1;}
archivebox/templates/static/favicon.ico (BIN, new file, 15 KiB; binary file not shown)

archivebox/templates/static/robots.txt (new file, 2 lines)

@@ -0,0 +1,2 @@
+User-agent: *
+Disallow: /
@@ -56,11 +56,13 @@ ts_to_iso = lambda ts: ts and parse_date(ts).isoformat()


 URL_REGEX = re.compile(
+    r'(?=('
     r'http[s]?://'                    # start matching from allowed schemes
     r'(?:[a-zA-Z]|[0-9]'              # followed by allowed alphanum characters
     r'|[$-_@.&+]|[!*\(\),]'           # or allowed symbols
     r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'  # or allowed unicode bytes
-    r'[^\]\[\(\)<>"\'\s]+',           # stop parsing at these symbols
+    r'[^\]\[\(\)<>"\'\s]+'            # stop parsing at these symbols
+    r'))',
     re.IGNORECASE,
 )
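The regex is now wrapped in a lookahead group `(?=(...))`, so `re.findall` can report overlapping candidates: the lookahead consumes zero characters, letting the scan restart inside a previous match. That is what makes invariants like `'<or>http://examplehttp://15.badc</that>': 2` in the table earlier in this diff hold. A small self-contained demonstration (regex copied from the hunk above):

    import re

    URL_REGEX = re.compile(
        r'(?=('
        r'http[s]?://'
        r'(?:[a-zA-Z]|[0-9]'
        r'|[$-_@.&+]|[!*\(\),]'
        r'|(?:%[0-9a-fA-F][0-9a-fA-F]))'
        r'[^\]\[\(\)<>"\'\s]+'
        r'))',
        re.IGNORECASE,
    )

    # the second URL starts in the middle of the first one; a plain (consuming)
    # pattern would swallow it, but the zero-width lookahead finds both
    matches = re.findall(URL_REGEX, '<or>http://examplehttp://15.badc</that>')
    assert matches == ['http://examplehttp://15.badc', 'http://15.badc']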
@@ -3,6 +3,7 @@

 DATA_DIR="${DATA_DIR:-/data}"
 ARCHIVEBOX_USER="${ARCHIVEBOX_USER:-archivebox}"

+
 # Set the archivebox user UID & GID
 if [[ -n "$PUID" && "$PUID" != 0 ]]; then
     usermod -u "$PUID" "$ARCHIVEBOX_USER" > /dev/null 2>&1

@@ -11,6 +12,7 @@ if [[ -n "$PGID" && "$PGID" != 0 ]]; then
     groupmod -g "$PGID" "$ARCHIVEBOX_USER" > /dev/null 2>&1
 fi

+
 # Set the permissions of the data dir to match the archivebox user
 if [[ -d "$DATA_DIR/archive" ]]; then
     # check data directory permissions

@@ -33,11 +35,11 @@ if [[ "$1" == /* || "$1" == "echo" || "$1" == "archivebox" ]]; then
     # e.g. "archivebox init"
     #      "/bin/bash"
     #      "echo"
-    gosu "$ARCHIVEBOX_USER" bash -c "$*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "$*"
 else
     # no command given, assume args were meant to be passed to archivebox cmd
     # e.g. "add https://example.com"
     #      "manage createsupseruser"
     #      "server 0.0.0.0:8000"
-    gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
+    exec gosu "$ARCHIVEBOX_USER" bash -c "archivebox $*"
 fi
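Adding `exec` here matters for container lifecycle: without it, the entrypoint shell stays alive as the parent process, so `docker stop` signals the shell rather than ArchiveBox; with `exec`, gosu (and ultimately the server) replaces the shell and receives SIGTERM directly. A rough Python equivalent of the same process-replacement idea (purely illustrative, not part of this diff):

    import os

    # os.execvp replaces the current process image in place, like `exec` in bash:
    # nothing after this line runs, and signals to this PID now reach archivebox
    os.execvp('archivebox', ['archivebox', 'server', '0.0.0.0:8000'])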
@@ -11,36 +11,39 @@ version: '3.7'

 services:
     archivebox:
-        # build: .
+        # build: .                          # for developers working on archivebox
         image: ${DOCKER_IMAGE:-archivebox/archivebox:latest}
-        command: server 0.0.0.0:8000
+        command: server --quick-init 0.0.0.0:8000
         stdin_open: true
         tty: true
         ports:
             - 8000:8000
         environment:
-            - USE_COLOR=True
-            - SHOW_PROGRESS=False
-            - SEARCH_BACKEND_ENGINE=sonic
-            - SEARCH_BACKEND_HOST_NAME=sonic
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
+            - ALLOWED_HOSTS=*                 # add any config options you want as env vars
+            - MEDIA_MAX_SIZE=750m
+            # - SHOW_PROGRESS=False
+            # - SEARCH_BACKEND_ENGINE=sonic   # uncomment these if you enable sonic below
+            # - SEARCH_BACKEND_HOST_NAME=sonic
+            # - SEARCH_BACKEND_PASSWORD=SecretPassword
         volumes:
             - ./data:/data
-        depends_on:
-            - sonic
+            # - ./archivebox:/app/archivebox  # for developers working on archivebox

-    # Run sonic search backend
-    sonic:
-        image: valeriansaliou/sonic:v1.3.0
-        ports:
-            - 1491:1491
-        environment:
-            - SEARCH_BACKEND_PASSWORD=SecretPassword
-        volumes:
-            - ./etc/sonic/config.cfg:/etc/sonic.cfg
-            - ./data:/var/lib/sonic/store/
+    # To run the Sonic full-text search backend, create an ./etc/sonic folder
+    # and download the sonic config file from here into that folder:
+    # https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/master/etc/sonic/config.cfg
+    # sonic:
+    #    image: valeriansaliou/sonic:v1.3.0
+    #    expose:
+    #        - 1491
+    #    environment:
+    #        - SEARCH_BACKEND_PASSWORD=SecretPassword
+    #    volumes:
+    #        - ./etc/sonic/config.cfg:/etc/sonic.cfg
+    #        - ./data/sonic:/var/lib/sonic/store

-    # Optional Addons: tweak these examples as needed for your specific use case
+    ### Optional Addons: tweak these examples as needed for your specific use case

     # Example: Run scheduled imports in a docker instead of using cron on the
     # host machine, add tasks and see more info with archivebox schedule --help
|
|||
{
|
||||
"name": "archivebox",
|
||||
"version": "0.5.6",
|
||||
"version": "0.6.0",
|
||||
"description": "ArchiveBox: The self-hosted internet archive",
|
||||
"author": "Nick Sweeting <archivebox-npm@sweeting.me>",
|
||||
"license": "MIT",
|
||||
|
|
setup.py (90 lines changed)

@@ -27,6 +27,49 @@ PACKAGE_DIR = ROOT_DIR / PKG_NAME
 README = (PACKAGE_DIR / "README.md").read_text(encoding='utf-8', errors='ignore')
 VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']

+PYTHON_REQUIRES = ">=3.7"
+SETUP_REQUIRES = ["wheel"]
+INSTALL_REQUIRES = [
+    # only add things here that have corresponding apt python3-packages available
+    # anything added here also needs to be added to our package dependencies in
+    # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
+    # if there is no apt python3-package equivalent, then vendor it instead in
+    # ./archivebox/vendor/
+    "requests>=2.24.0",
+    "atomicwrites>=1.4.0",
+    "mypy-extensions>=0.4.3",
+    "django>=3.1.3",
+    "django-extensions>=3.0.3",
+    "dateparser",
+    "ipython",
+    "youtube-dl",
+    "python-crontab>=2.5.1",
+    "croniter>=0.3.34",
+    "w3lib>=1.22.0",
+]
+EXTRAS_REQUIRE = {
+    'sonic': [
+        "sonic-client>=0.0.5",
+    ],
+    'dev': [
+        "setuptools",
+        "twine",
+        "wheel",
+        "flake8",
+        "ipdb",
+        "mypy",
+        "django-stubs",
+        "sphinx",
+        "sphinx-rtd-theme",
+        "recommonmark",
+        "pytest",
+        "bottle",
+        "stdeb",
+        "django-debug-toolbar",
+        "djdt_flamegraph",
+    ],
+}
+
 # To see when setup.py gets called (uncomment for debugging):
 # import sys
 # print(PACKAGE_DIR, f" (v{VERSION})")

@@ -36,7 +79,9 @@ VERSION = json.loads((PACKAGE_DIR / "package.json").read_text().strip())['version']
 class DisabledTestCommand(test):
     def run(self):
         # setup.py test is deprecated, disable it here by force so stdeb doesnt run it
-        print('Use the ./bin/test.sh script to run tests, not setup.py test.')
+        print()
+        print('[X] Running tests via setup.py test is deprecated.')
+        print('    Hint: Use the ./bin/test.sh script or pytest instead')


 setuptools.setup(

@@ -50,45 +95,10 @@ setuptools.setup(
     long_description_content_type="text/markdown",
     url=REPO_URL,
     project_urls=PROJECT_URLS,
-    python_requires=">=3.7",
-    setup_requires=[
-        "wheel",
-    ],
-    install_requires=[
-        # only add things here that have corresponding apt python3-packages available
-        # anything added here also needs to be added to our package dependencies in
-        # stdeb.cfg (apt), archivebox.rb (brew), Dockerfile, etc.
-        # if there is no apt python3-package equivalent, then vendor it instead in
-        # ./archivebox/vendor/
-        "requests==2.24.0",
-        "atomicwrites==1.4.0",
-        "mypy-extensions==0.4.3",
-        "django==3.1.3",
-        "django-extensions==3.0.3",
-        "dateparser",
-        "ipython",
-        "youtube-dl",
-        "python-crontab==2.5.1",
-        "croniter==0.3.34",
-        "w3lib==1.22.0",
-    ],
-    extras_require={
-        'dev': [
-            "setuptools",
-            "twine",
-            "wheel",
-            "flake8",
-            "ipdb",
-            "mypy",
-            "django-stubs",
-            "sphinx",
-            "sphinx-rtd-theme",
-            "recommonmark",
-            "pytest",
-            "bottle",
-            "stdeb",
-        ],
-    },
+    python_requires=PYTHON_REQUIRES,
+    setup_requires=SETUP_REQUIRES,
+    install_requires=INSTALL_REQUIRES,
+    extras_require=EXTRAS_REQUIRE,
     packages=[PKG_NAME],
     include_package_data=True,  # see MANIFEST.in
     entry_points={
@@ -33,7 +33,7 @@ def test_depth_flag_0_crawls_only_the_arg_page(tmp_path, process, disable_extractors_dict):
     )

     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]
-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding='utf-8') as f:
         output_json = json.load(f)
     assert output_json["base_url"] == "127.0.0.1:8080/static/example.com.html"

@@ -79,7 +79,7 @@ def test_add_updates_history_json_index(tmp_path, process, disable_extractors_dict):

     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert output_json["history"] != {}

@@ -90,4 +90,4 @@ def test_extract_input_uses_only_passed_extractors(tmp_path, process):
     archived_item_path = list(tmp_path.glob('archive/**/*'))[0]

     assert (archived_item_path / "warc").exists()
-    assert not (archived_item_path / "singlefile.html").exists()
\ No newline at end of file
+    assert not (archived_item_path / "singlefile.html").exists()
@@ -86,7 +86,7 @@ def test_headers_retrieved(tmp_path, process, disable_extractors_dict):
     output_file = archived_item_path / "headers.json"
     assert output_file.exists()
     headers_file = archived_item_path / 'headers.json'
-    with open(headers_file) as f:
+    with open(headers_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'

@@ -98,7 +98,7 @@ def test_headers_redirect_chain(tmp_path, process, disable_extractors_dict):
                                capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json"
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
     assert headers['Content-Language'] == 'en'
     assert headers['Content-Script-Type'] == 'text/javascript'

@@ -110,6 +110,6 @@ def test_headers_400_plus(tmp_path, process, disable_extractors_dict):
                                capture_output=True, env=disable_extractors_dict)
     archived_item_path = list(tmp_path.glob("archive/**/*"))[0]
     output_file = archived_item_path / "headers.json"
-    with open(output_file) as f:
+    with open(output_file, 'r', encoding='utf-8') as f:
         headers = pyjson.load(f)
-    assert headers["Status-Code"] == "200"
\ No newline at end of file
+    assert headers["Status-Code"] == "200"
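A pattern worth noting across all of these test hunks: `open()` without an `encoding` argument uses the platform's locale encoding (for example cp1252 on Windows), so index files written as UTF-8 elsewhere can fail to decode. Pinning `encoding='utf-8'` on every text open makes the tests deterministic across platforms. A minimal sketch of the failure mode this guards against (hypothetical file and data, for illustration):

    import json

    data = {'title': 'ís / höw / happéning'}  # non-ASCII titles occur in real indexes

    with open('index.json', 'w', encoding='utf-8') as f:  # explicit on write...
        json.dump(data, f)

    with open('index.json', 'r', encoding='utf-8') as f:  # ...and on read
        assert json.load(f) == data  # decodes correctly regardless of system locale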
@@ -12,12 +12,12 @@ from archivebox.config import OUTPUT_PERMISSIONS
 from .fixtures import *

 def test_init(tmp_path, process):
-    assert "Initializing a new ArchiveBox collection in this folder..." in process.stdout.decode("utf-8")
+    assert "Initializing a new ArchiveBox" in process.stdout.decode("utf-8")

 def test_update(tmp_path, process):
     os.chdir(tmp_path)
     update_process = subprocess.run(['archivebox', 'init'], capture_output=True)
-    assert "Updating existing ArchiveBox collection in this folder" in update_process.stdout.decode("utf-8")
+    assert "updating existing ArchiveBox" in update_process.stdout.decode("utf-8")

 def test_add_link(tmp_path, process, disable_extractors_dict):
     disable_extractors_dict.update({"USE_WGET": "true"})

@@ -28,11 +28,11 @@ def test_add_link(tmp_path, process, disable_extractors_dict):

     assert "index.json" in [x.name for x in archived_item_path.iterdir()]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']

-    with open(archived_item_path / "index.html", "r") as f:
+    with open(archived_item_path / "index.html", "r", encoding="utf-8") as f:
         output_html = f.read()
     assert "Example Domain" in output_html

@@ -47,7 +47,7 @@ def test_add_link_support_stdin(tmp_path, process, disable_extractors_dict):

     assert "index.json" in [x.name for x in archived_item_path.iterdir()]

-    with open(archived_item_path / "index.json", "r") as f:
+    with open(archived_item_path / "index.json", "r", encoding="utf-8") as f:
         output_json = json.load(f)
     assert "Example Domain" == output_json['history']['title'][0]['output']

@@ -75,11 +75,11 @@ def test_collision_urls_different_timestamps(tmp_path, process, disable_extractors_dict):

     first_archive = tmp_path / "archive" / str(min([float(folder) for folder in archive_folders]))
     json_index = str(first_archive / "index.json")
-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())

     link_details["url"] = "http://127.0.0.1:8080/static/iana.org.html"
-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)

     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)

@@ -98,12 +98,12 @@ def test_collision_timestamps_different_urls(tmp_path, process, disable_extractors_dict):
     archive_folders.remove(first_archive.name)
     json_index = str(first_archive / "index.json")

-    with open(json_index, "r") as f:
+    with open(json_index, "r", encoding="utf-8") as f:
         link_details = json.loads(f.read())

     link_details["timestamp"] = archive_folders[0]

-    with open(json_index, "w") as f:
+    with open(json_index, "w", encoding="utf-8") as f:
         json.dump(link_details, f)

     init_process = subprocess.run(['archivebox', 'init'], capture_output=True, env=disable_extractors_dict)

@@ -173,4 +173,4 @@ def test_tags_migration(tmp_path, disable_extractors_dict):
         snapshot_id = tag["id"]
         tag_name = tag["name"]
         # Check each tag migrated is in the previous field
-        assert tag_name in snapshots_dict[snapshot_id]
\ No newline at end of file
+        assert tag_name in snapshots_dict[snapshot_id]
@@ -100,16 +100,18 @@ def test_remove_before(tmp_path, process, disable_extractors_dict):

     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()

-    before = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0].split(".")[0]
+    higherts = higherts[0].split(".")[0]

-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', str(before[1])], capture_output=True)
+    # before is less than, so only the lower snapshot gets deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--before', higherts], capture_output=True)

-    assert (tmp_path / "archive" / timestamp[0][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[1][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert (tmp_path / "archive" / higherts).exists()

 def test_remove_after(tmp_path, process, disable_extractors_dict):
     subprocess.run(['archivebox', 'add', 'http://127.0.0.1:8080/static/example.com.html'], capture_output=True, env=disable_extractors_dict)

@@ -118,13 +120,15 @@ def test_remove_after(tmp_path, process, disable_extractors_dict):

     conn = sqlite3.connect("index.sqlite3")
     c = conn.cursor()
-    timestamp = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp ASC").fetchall()
+    higherts, lowerts = c.execute("SELECT timestamp FROM core_snapshot ORDER BY timestamp DESC").fetchall()
     conn.commit()
     conn.close()

-    after = list(map(lambda x: int(x[0].split(".")[0]), timestamp))
+    lowerts = lowerts[0].split(".")[0]
+    higherts = higherts[0].split(".")[0]

-    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', str(after[1])], capture_output=True)
+    # after is greater than or equal to, so both snapshots get deleted
+    subprocess.run(['archivebox', 'remove', '--filter-type=regex', '.*', '--yes', '--delete', '--after', lowerts], capture_output=True)

-    assert (tmp_path / "archive" / timestamp[1][0]).exists()
-    assert not (tmp_path / "archive" / timestamp[0][0]).exists()
+    assert not (tmp_path / "archive" / lowerts).exists()
+    assert not (tmp_path / "archive" / higherts).exists()
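To make the boundary semantics concrete: per the comments in the updated tests, `remove --before X` deletes snapshots with timestamps strictly less than X, while `remove --after X` deletes those greater than or equal to X. A tiny sketch with two hypothetical snapshot timestamps:

    snapshots = [1600000001, 1600000002]  # lowerts, higherts (hypothetical)

    # remove --before higherts: strictly-less-than, so only lowerts matches
    assert [ts for ts in snapshots if ts < 1600000002] == [1600000001]

    # remove --after lowerts: greater-or-equal, so both match and both are deleted
    assert [ts for ts in snapshots if ts >= 1600000001] == snapshots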
uwsgi.ini (new file, 13 lines)

@@ -0,0 +1,13 @@
+[uwsgi]
+socket = 127.0.0.1:3031
+chdir = ../
+http = 0.0.0.0:8001
+env = OUTPUT_DIR=./data
+wsgi-file = archivebox/core/wsgi.py
+processes = 4
+threads = 1
+stats = 127.0.0.1:9191
+static-map /static=./archivebox/templates/static
+harakiri = 172800
+post-buffering = 1
+disable-logging = True