diff --git a/.dockerignore b/.dockerignore
index 08408d22..27ad7a81 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -28,4 +28,5 @@ assets/
docker/
data/
+data*/
output/
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 7224eee9..d90ccf6c 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -30,5 +30,4 @@ formats:
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
python:
install:
- - requirements: requirements.txt
- - requirements: docs/requirements.txt
\ No newline at end of file
+ - requirements: docs/requirements.txt
diff --git a/Dockerfile b/Dockerfile
index 72259949..4a6cc4b5 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -294,9 +294,8 @@ WORKDIR "$DATA_DIR"
VOLUME "$DATA_DIR"
EXPOSE 8000
-# Optional:
-# HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
-# CMD curl --silent 'http://localhost:8000/admin/login/' || exit 1
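+# mark the container unhealthy when /health/ does not respond with 'OK' (grep -q exits non-zero on no match)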
+HEALTHCHECK --interval=30s --timeout=20s --retries=15 \
+ CMD curl --silent 'http://localhost:8000/health/' | grep -q 'OK'
ENTRYPOINT ["dumb-init", "--", "/app/bin/docker_entrypoint.sh"]
CMD ["archivebox", "server", "--quick-init", "0.0.0.0:8000"]
diff --git a/README.md b/README.md
index f298e392..a961cb47 100644
--- a/README.md
+++ b/README.md
@@ -408,7 +408,7 @@ See below for usage examples using the CLI, W
> *Warning: These are contributed by external volunteers and may lag behind the official `pip` channel.*
-- TrueNAS
+- TrueNAS: Official ArchiveBox TrueChart / Custom App Guide
- UnRaid
- Yunohost
- Cloudron
@@ -1441,23 +1441,62 @@ archivebox init --setup
-#### Make migrations or enter a django shell
+#### Make DB migrations, enter a Django shell, and run other dev helper commands
Click to expand...
-Make sure to run this whenever you change things in `models.py`.
-
```bash
+# generate the database migrations after changes to models.py
cd archivebox/
./manage.py makemigrations
+# enter a python shell or a SQL shell
cd path/to/test/data/
archivebox shell
archivebox manage dbshell
+
+# generate a graph of the ORM models
+brew install graphviz
+pip install pydot graphviz
+archivebox manage graph_models -a -o orm.png
+open orm.png
+
+# list all models with their db field info and methods
+archivebox manage list_model_info --all --signature --db-type --field-class
+
+# print all django settings
+archivebox manage print_settings
+archivebox manage print_settings --format=yaml # pip install pyyaml
+
+# autogenerate an admin.py from given app models
+archivebox manage admin_generator core > core/admin.py
+
+# dump db data to a script that re-populates it
+archivebox manage dumpscript core > scripts/testdata.py
+archivebox manage reset core
+archivebox manage runscript testdata
+
+# reset the db and clear all data!
+archivebox manage reset_db
+
+# use django-tui to interactively explore commands
+pip install django-tui
+# make sure django-tui is listed in INSTALLED_APPS in core/settings.py
+archivebox manage tui
+
+# show python and JS package dependency trees
+pdm list --tree
+npm ls --all
```
-(uses `pytest -s`)
-https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
+
+
+- https://django-extensions.readthedocs.io/en/latest/command_extensions.html
+- https://stackoverflow.com/questions/1074212/how-can-i-see-the-raw-sql-queries-django-is-running
+- https://github.com/anze3db/django-tui (explore `manage.py` commands as TUI)
+- https://github.com/bloomberg/memray (advanced python profiler)
+- https://github.com/laixintao/flameshow (display flamegraphs in terminal)
+- https://github.com/taliraj/django-migrations-tui (explore migrations as TUI)
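+
+A minimal sketch of poking at the ORM from `archivebox shell` (assumes the `Snapshot`/`ArchiveResult` models in `core.models`):
+
+```python
+# inside `archivebox shell`, a Django shell with the project settings already loaded
+from core.models import Snapshot, ArchiveResult
+
+print(Snapshot.objects.count())          # total snapshots in this collection
+snap = Snapshot.objects.first()
+if snap:
+    # how many extractor results exist for this snapshot
+    print(snap.url, ArchiveResult.objects.filter(snapshot=snap).count())
+```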
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 2515b8fd..cb1c6841 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -131,7 +131,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
link = load_link_details(link, out_dir=out_dir)
write_link_details(link, out_dir=out_dir, skip_sql_index=False)
- log_link_archiving_started(link, out_dir, is_new)
+ log_link_archiving_started(link, str(out_dir), is_new)
link = link.overwrite(updated=datetime.now(timezone.utc))
stats = {'skipped': 0, 'succeeded': 0, 'failed': 0}
start_ts = datetime.now(timezone.utc)
@@ -165,16 +165,6 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
- # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
- # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
- # are fixed.
- """
- raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
- method_name,
- link.url,
- )) from e
- """
- # Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
@@ -186,6 +176,13 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
ts
) + "\n" + str(e) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
+
+ # print(f' ERROR: {method_name} {e.__class__.__name__}: {e} {getattr(e, "hints", "")}', ts, link.url, command)
+ raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
+ method_name,
+ link.url,
+ )) from e
+
# print(' ', stats)
@@ -218,7 +215,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
if type(all_links) is QuerySet:
num_links: int = all_links.count()
- get_link = lambda x: x.as_link()
+ get_link = lambda x: x.as_link_with_details()
all_links = all_links.iterator()
else:
num_links: int = len(all_links)
diff --git a/archivebox/extractors/htmltotext.py b/archivebox/extractors/htmltotext.py
index 18722f13..0686f76e 100644
--- a/archivebox/extractors/htmltotext.py
+++ b/archivebox/extractors/htmltotext.py
@@ -121,9 +121,11 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
out_dir = Path(out_dir or link.link_dir)
output = "htmltotext.txt"
+ cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
timer = TimedProgress(timeout, prefix=' ')
extracted_text = None
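+ # default to 'failed'; only flipped to 'succeeded' after the text is extracted and written out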
+ status = 'failed'
try:
extractor = HTMLTextExtractor()
document = get_html(link, out_dir)
@@ -136,10 +138,9 @@ def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
extracted_text = str(extractor)
atomic_write(str(out_dir / output), extracted_text)
+ status = 'succeeded'
except (Exception, OSError) as err:
- status = 'failed'
output = err
- cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
finally:
timer.end()
diff --git a/archivebox/extractors/singlefile.py b/archivebox/extractors/singlefile.py
index e3860527..e50b3932 100644
--- a/archivebox/extractors/singlefile.py
+++ b/archivebox/extractors/singlefile.py
@@ -77,6 +77,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
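+ # result stays None if run() raises before returning; checked before reading stdout/stderr in the except block below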
+ result = None
try:
result = run(cmd, cwd=str(out_dir), timeout=timeout)
@@ -84,7 +85,7 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# "Downloaded: 76 files, 4.0M in 1.6s (2.52 MB/s)"
output_tail = [
line.strip()
- for line in (result.stdout + result.stderr).decode().rsplit('\n', 3)[-3:]
+ for line in (result.stdout + result.stderr).decode().rsplit('\n', 5)[-5:]
if line.strip()
]
hints = (
@@ -94,12 +95,13 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# Check for common failure cases
if (result.returncode > 0) or not (out_dir / output).is_file():
- raise ArchiveError('SingleFile was not able to archive the page', hints)
+ raise ArchiveError(f'SingleFile was not able to archive the page (status={result.returncode})', hints)
chmod_file(output, cwd=str(out_dir))
except (Exception, OSError) as err:
status = 'failed'
# TODO: Make this prettier. This is necessary to run the command (escape JSON internal quotes).
cmd[2] = browser_args.replace('"', "\\\"")
+ err.hints = (result.stdout + result.stderr).decode().split('\n') if result else []
output = err
finally:
timer.end()
diff --git a/archivebox/extractors/title.py b/archivebox/extractors/title.py
index 3505e03f..6b0e37f6 100644
--- a/archivebox/extractors/title.py
+++ b/archivebox/extractors/title.py
@@ -75,7 +75,7 @@ def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
- except (FileNotFoundError, TypeError):
+ except (FileNotFoundError, TypeError, UnicodeDecodeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index b9d57aeb..9912b4c7 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -407,7 +407,7 @@ def snapshot_filter(snapshots: QuerySet, filter_patterns: List[str], filter_type
def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links without checking archive status or data directory validity"""
- links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+ links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in links
@@ -415,7 +415,7 @@ def get_indexed_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Option
def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are archived with a valid data directory"""
- links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+ links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_archived, links)
@@ -423,7 +423,7 @@ def get_archived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optio
def get_unarchived_folders(snapshots, out_dir: Path=OUTPUT_DIR) -> Dict[str, Optional[Link]]:
"""indexed links that are unarchived with no data directory or an empty data directory"""
- links = [snapshot.as_link_with_details() for snapshot in snapshots.iterator()]
+ links = (snapshot.as_link() for snapshot in snapshots.iterator())
return {
link.link_dir: link
for link in filter(is_unarchived, links)
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 3c688a3c..933214b9 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -432,12 +432,14 @@ def log_archive_method_finished(result: "ArchiveResult"):
**ANSI,
),
]
+
+ # import pudb; pudb.set_trace()
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
- hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+ hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
diff --git a/archivebox/main.py b/archivebox/main.py
index 76b204b8..7389c032 100755
--- a/archivebox/main.py
+++ b/archivebox/main.py
@@ -791,6 +791,8 @@ def update(resume: Optional[float]=None,
out_dir: Path=OUTPUT_DIR) -> List[Link]:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
+ from core.models import ArchiveResult
+
check_data_folder(out_dir=out_dir)
check_dependencies()
new_links: List[Link] = [] # TODO: Remove input argument: only_new
@@ -798,19 +800,23 @@ def update(resume: Optional[float]=None,
extractors = extractors.split(",") if extractors else []
# Step 1: Filter for selected_links
+ print('[*] Finding matching Snapshots to update...')
+ print(f' - Filtering by {" ".join(filter_patterns or ())} ({filter_type}) {before=} {after=} {status=}...')
matching_snapshots = list_links(
filter_patterns=filter_patterns,
filter_type=filter_type,
before=before,
after=after,
)
-
+ print(f' - Checking {matching_snapshots.count()} snapshot folders for existing data with {status=}...')
matching_folders = list_folders(
links=matching_snapshots,
status=status,
out_dir=out_dir,
)
- all_links = [link for link in matching_folders.values() if link]
+ all_links = (link for link in matching_folders.values() if link)
+ print(' - Sorting by most unfinished -> least unfinished + date archived...')
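+ # fewest ArchiveResults first (most unfinished), then oldest timestamp first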
+ all_links = sorted(all_links, key=lambda link: (ArchiveResult.objects.filter(snapshot__url=link.url).count(), link.timestamp))
if index_only:
for link in all_links:
@@ -836,6 +842,7 @@ def update(resume: Optional[float]=None,
if extractors:
archive_kwargs["methods"] = extractors
+
archive_links(to_archive, overwrite=overwrite, **archive_kwargs)
# Step 4: Re-write links index with updated titles, icons, and resources
diff --git a/archivebox/util.py b/archivebox/util.py
index faa720b5..f2f75ae3 100644
--- a/archivebox/util.py
+++ b/archivebox/util.py
@@ -179,7 +179,11 @@ def download_url(url: str, timeout: int=None) -> str:
if encoding is not None:
response.encoding = encoding
- return response.text
+ try:
+ return response.text
+ except UnicodeDecodeError:
+ # if the response is non-text (e.g. an image or other binary file), just return the filename instead
+ return url.rsplit('/', 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int=None) -> str:
diff --git a/etc/sonic.cfg b/etc/sonic.cfg
index 10fbda53..0018c87c 100644
--- a/etc/sonic.cfg
+++ b/etc/sonic.cfg
@@ -6,7 +6,8 @@
[server]
-log_level = "debug"
+# log_level = "debug"
+log_level = "warn"
[channel]