- [Chromium Install](https://github.com/pirate/ArchiveBox/wiki/Install-Chromium)
- [Security Overview](https://github.com/pirate/ArchiveBox/wiki/Security-Overview)
- [Troubleshooting](https://github.com/pirate/ArchiveBox/wiki/Troubleshooting)
+ - [Python API](https://docs.archivebox.io/en/latest/modules.html)
+ - REST API (coming soon...)
## More Info
+ - [Tickets](https://github.com/pirate/ArchiveBox/issues)
- [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap)
- [Changelog](https://github.com/pirate/ArchiveBox/wiki/Changelog)
- [Donations](https://github.com/pirate/ArchiveBox/wiki/Donations)
- [Background & Motivation](https://github.com/pirate/ArchiveBox#background--motivation)
- [Web Archiving Community](https://github.com/pirate/ArchiveBox/wiki/Web-Archiving-Community)
+ ---
+
+ # ArchiveBox Development
+
+ All contributions to ArchiveBox are welcome! Check our [issues](https://github.com/pirate/ArchiveBox/issues) and [Roadmap](https://github.com/pirate/ArchiveBox/wiki/Roadmap) for things to work on, and please open an issue to discuss your proposed implementation before you start working on it. Otherwise we may have to close your PR if it doesn't align with our roadmap.
+
+ ### Set up the dev environment
+
+ ```bash
+ git clone https://github.com/pirate/ArchiveBox
+ cd ArchiveBox
+ git checkout master # or the branch you want to test
+ git pull
+
+ # Install ArchiveBox + python dependencies
+ python3 -m venv .venv && source .venv/bin/activate && pip install -e .[dev]
+ # or
+ pipenv install --dev && pipenv shell
+
+ # Install node dependencies
+ npm install
+
+ # Optional: install the extractor dependencies
+ ./bin/setup.sh
+ ```
+
+ ### Common development tasks
+
+ See the `./bin/` folder and read the source of the bash scripts within.
+ You can also run all these in Docker. For more examples see the GitHub Actions CI/CD tests that are run: `.github/workflows/*.yaml`.
+
+ #### Run the linters
+
+ ```bash
+ ./bin/lint.sh
+ ```
+ (uses `flake8` and `mypy`)
+
+ #### Run the integration tests
+
+ ```bash
+ ./bin/test.sh
+ ```
+ (uses `pytest -s`)
+
+ #### Build the docs, pip package, and docker image
+
+ ```bash
+ ./bin/build.sh
+
+ # or individually:
+ ./bin/build_docs.sh
+ ./bin/build_pip.sh
+ ./bin/build_docker.sh
+ ```
+
+ #### Roll a release
+
+ ```bash
+ ./bin/release.sh
+ ```
+ (bumps the version, builds, and pushes a release to PyPI, Docker Hub, and GitHub Packages)
+
+
---
diff --git a/archivebox.egg-info/SOURCES.txt b/archivebox.egg-info/SOURCES.txt
index d186b2fb..5c78bd8c 100644
--- a/archivebox.egg-info/SOURCES.txt
+++ b/archivebox.egg-info/SOURCES.txt
@@ -45,6 +45,7 @@ archivebox/core/models.py
archivebox/core/settings.py
archivebox/core/tests.py
archivebox/core/urls.py
+archivebox/core/utils.py
archivebox/core/views.py
archivebox/core/welcome_message.py
archivebox/core/wsgi.py
@@ -60,7 +61,9 @@ archivebox/extractors/archive_org.py
archivebox/extractors/dom.py
archivebox/extractors/favicon.py
archivebox/extractors/git.py
+archivebox/extractors/headers.py
archivebox/extractors/media.py
+archivebox/extractors/mercury.py
archivebox/extractors/pdf.py
archivebox/extractors/readability.py
archivebox/extractors/screenshot.py
@@ -88,7 +91,10 @@ archivebox/themes/admin/app_index.html
archivebox/themes/admin/base.html
archivebox/themes/admin/login.html
archivebox/themes/default/add_links.html
+archivebox/themes/default/base.html
archivebox/themes/default/main_index.html
+archivebox/themes/default/core/snapshot_list.html
+archivebox/themes/default/static/add.css
archivebox/themes/default/static/admin.css
archivebox/themes/default/static/archive.png
archivebox/themes/default/static/bootstrap.min.css
@@ -103,6 +109,7 @@ archivebox/themes/default/static/spinner.gif
archivebox/themes/legacy/favicon.ico
archivebox/themes/legacy/link_details.html
archivebox/themes/legacy/main_index.html
+archivebox/themes/legacy/main_index_minimal.html
archivebox/themes/legacy/main_index_row.html
archivebox/themes/legacy/robots.txt
archivebox/themes/legacy/static/archive.png
diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index aa26715b..83055e8e 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -6,12 +6,13 @@ import sys
import argparse
from typing import Optional, Dict, List, IO
+from pathlib import Path
from ..config import OUTPUT_DIR
from importlib import import_module
-CLI_DIR = os.path.dirname(os.path.abspath(__file__))
+CLI_DIR = Path(__file__).resolve().parent
# these common commands will appear sorted before any others for ease-of-use
meta_cmds = ('help', 'version')
diff --git a/archivebox/cli/tests.py b/archivebox/cli/tests.py
index 1f44784d..4d7016aa 100755
--- a/archivebox/cli/tests.py
+++ b/archivebox/cli/tests.py
@@ -7,6 +7,7 @@ import os
import sys
import shutil
import unittest
+from pathlib import Path
from contextlib import contextmanager
@@ -109,13 +110,13 @@ class TestInit(unittest.TestCase):
with output_hidden():
archivebox_init.main([])
- assert os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
- assert os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
- assert os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+ assert (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+ assert (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+ assert (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
assert len(load_main_index(out_dir=OUTPUT_DIR)) == 0
def test_conflicting_init(self):
- with open(os.path.join(OUTPUT_DIR, 'test_conflict.txt'), 'w+') as f:
+ with open(Path(OUTPUT_DIR) / 'test_conflict.txt', 'w+') as f:
f.write('test')
try:
@@ -125,9 +126,9 @@ class TestInit(unittest.TestCase):
except SystemExit:
pass
- assert not os.path.exists(os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME))
- assert not os.path.exists(os.path.join(OUTPUT_DIR, JSON_INDEX_FILENAME))
- assert not os.path.exists(os.path.join(OUTPUT_DIR, HTML_INDEX_FILENAME))
+ assert not (Path(OUTPUT_DIR) / SQL_INDEX_FILENAME).exists()
+ assert not (Path(OUTPUT_DIR) / JSON_INDEX_FILENAME).exists()
+ assert not (Path(OUTPUT_DIR) / HTML_INDEX_FILENAME).exists()
try:
load_main_index(out_dir=OUTPUT_DIR)
assert False, 'load_main_index should raise an exception when no index is present'
@@ -159,7 +160,7 @@ class TestAdd(unittest.TestCase):
assert len(all_links) == 30
def test_add_arg_file(self):
- test_file = os.path.join(OUTPUT_DIR, 'test.txt')
+ test_file = Path(OUTPUT_DIR) / 'test.txt'
with open(test_file, 'w+') as f:
f.write(test_urls)
diff --git a/archivebox/config/__init__.py b/archivebox/config/__init__.py
index 4cd78609..3d7e3730 100644
--- a/archivebox/config/__init__.py
+++ b/archivebox/config/__init__.py
@@ -431,7 +431,7 @@ def write_config_file(config: Dict[str, str], out_dir: str=None) -> ConfigDict:
with open(f'{config_path}.bak', 'r') as old:
atomic_write(config_path, old.read())
- if os.path.exists(f'{config_path}.bak'):
+ if Path(f'{config_path}.bak').exists():
os.remove(f'{config_path}.bak')
return {}
@@ -540,7 +540,7 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
if node_modules_bin.exists():
return str(node_modules_bin.resolve())
- return shutil.which(os.path.expanduser(binary)) or binary
+ return shutil.which(Path(binary).expanduser()) or binary
def bin_hash(binary: Optional[str]) -> Optional[str]:
if binary is None:
@@ -634,17 +634,17 @@ def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
}
def get_external_locations(config: ConfigDict) -> ConfigValue:
- abspath = lambda path: None if path is None else os.path.abspath(path)
+ abspath = lambda path: None if path is None else Path(path).resolve()
return {
'CHROME_USER_DATA_DIR': {
'path': abspath(config['CHROME_USER_DATA_DIR']),
'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
- 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')),
+ 'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
},
'COOKIES_FILE': {
'path': abspath(config['COOKIES_FILE']),
'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
- 'is_valid': False if config['COOKIES_FILE'] is None else os.path.exists(config['COOKIES_FILE']),
+ 'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
},
}
@@ -828,7 +828,7 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
# stderr('[i] Using Chrome binary: {}'.format(shutil.which(CHROME_BINARY) or CHROME_BINARY))
# stderr('[i] Using Chrome data dir: {}'.format(os.path.abspath(CHROME_USER_DATA_DIR)))
if config['CHROME_USER_DATA_DIR'] is not None:
- if not os.path.exists(os.path.join(config['CHROME_USER_DATA_DIR'], 'Default')):
+ if not (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists():
stderr('[X] Could not find profile "Default" in CHROME_USER_DATA_DIR.', color='red')
stderr(f' {config["CHROME_USER_DATA_DIR"]}')
stderr(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py
index 14b3b369..44065de4 100644
--- a/archivebox/core/settings.py
+++ b/archivebox/core/settings.py
@@ -2,6 +2,7 @@ __package__ = 'archivebox.core'
import os
import sys
+from pathlib import Path
from django.utils.crypto import get_random_string
@@ -49,9 +50,9 @@ TEMPLATES = [
{
'BACKEND': 'django.template.backends.django.DjangoTemplates',
'DIRS': [
- os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME),
- os.path.join(PYTHON_DIR, 'themes', 'default'),
- os.path.join(PYTHON_DIR, 'themes'),
+ str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME),
+ str(Path(PYTHON_DIR) / 'themes' / 'default'),
+ str(Path(PYTHON_DIR) / 'themes'),
],
'APP_DIRS': True,
'OPTIONS': {
@@ -70,7 +71,7 @@ WSGI_APPLICATION = 'core.wsgi.application'
DATABASES = {
'default': {
'ENGINE': 'django.db.backends.sqlite3',
- 'NAME': os.path.join(OUTPUT_DIR, SQL_INDEX_FILENAME),
+ 'NAME': str(Path(OUTPUT_DIR) / SQL_INDEX_FILENAME),
}
}
@@ -105,7 +106,7 @@ SHELL_PLUS_PRINT_SQL = False
IPYTHON_ARGUMENTS = ['--no-confirm-exit', '--no-banner']
IPYTHON_KERNEL_DISPLAY_NAME = 'ArchiveBox Django Shell'
if IS_SHELL:
- os.environ['PYTHONSTARTUP'] = os.path.join(PYTHON_DIR, 'core', 'welcome_message.py')
+ os.environ['PYTHONSTARTUP'] = str(Path(PYTHON_DIR) / 'core' / 'welcome_message.py')
LANGUAGE_CODE = 'en-us'
@@ -122,6 +123,6 @@ EMAIL_BACKEND = 'django.core.mail.backends.console.EmailBackend'
STATIC_URL = '/static/'
STATICFILES_DIRS = [
- os.path.join(PYTHON_DIR, 'themes', ACTIVE_THEME, 'static'),
- os.path.join(PYTHON_DIR, 'themes', 'default', 'static'),
+ str(Path(PYTHON_DIR) / 'themes' / ACTIVE_THEME / 'static'),
+ str(Path(PYTHON_DIR) / 'themes' / 'default' / 'static'),
]
diff --git a/archivebox/core/utils.py b/archivebox/core/utils.py
index 902eef01..1c24fe4d 100644
--- a/archivebox/core/utils.py
+++ b/archivebox/core/utils.py
@@ -14,11 +14,11 @@ def get_icons(snapshot: Snapshot) -> str:
return format_html(
'
'
- '🌐 '
+ '🌐 '
'📄 '
'🖥 '
'🅷 '
- '🆆 '
+ '🆆 '
'🗜 '
'📼 '
'📦 '
diff --git a/archivebox/core/views.py b/archivebox/core/views.py
index 4144b2db..7cd8b104 100644
--- a/archivebox/core/views.py
+++ b/archivebox/core/views.py
@@ -114,12 +114,23 @@ class AddView(UserPassesTestMixin, FormView):
template_name = "add_links.html"
form_class = AddLinkForm
+ def get_initial(self):
+ """Prefill the AddLinkForm with the 'url' GET parameter"""
+ if self.request.method == 'GET':
+ url = self.request.GET.get('url', None)
+ if url:
+ return {'url': url}
+ else:
+ return super().get_initial()
+
def test_func(self):
return PUBLIC_ADD_VIEW or self.request.user.is_authenticated
def get_context_data(self, *args, **kwargs):
context = super().get_context_data(*args, **kwargs)
context["title"] = "Add URLs"
+ # We can't just call request.build_absolute_uri in the template, because it would include query parameters
+ context["absolute_add_path"] = self.request.build_absolute_uri(self.request.path)
return context
def form_valid(self, form):
diff --git a/archivebox/extractors/__init__.py b/archivebox/extractors/__init__.py
index 53a77941..60f20adf 100644
--- a/archivebox/extractors/__init__.py
+++ b/archivebox/extractors/__init__.py
@@ -75,7 +75,7 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
out_dir = out_dir or Path(link.link_dir)
try:
- is_new = not os.path.exists(out_dir)
+ is_new = not Path(out_dir).exists()
if is_new:
os.makedirs(out_dir)
diff --git a/archivebox/extractors/favicon.py b/archivebox/extractors/favicon.py
index fe8895a5..86d2c506 100644
--- a/archivebox/extractors/favicon.py
+++ b/archivebox/extractors/favicon.py
@@ -1,6 +1,5 @@
__package__ = 'archivebox.extractors'
-import os
from pathlib import Path
from typing import Optional
@@ -22,7 +21,7 @@ from ..logging_util import TimedProgress
@enforce_types
def should_save_favicon(link: Link, out_dir: Optional[str]=None) -> bool:
out_dir = out_dir or link.link_dir
- if os.path.exists(os.path.join(out_dir, 'favicon.ico')):
+ if (Path(out_dir) / 'favicon.ico').exists():
return False
return SAVE_FAVICON
diff --git a/archivebox/extractors/wget.py b/archivebox/extractors/wget.py
index dac0bdd3..da88dc5f 100644
--- a/archivebox/extractors/wget.py
+++ b/archivebox/extractors/wget.py
@@ -179,7 +179,7 @@ def wget_output_path(link: Link) -> Optional[str]:
if re.search(".+\\.[Ss]?[Hh][Tt][Mm][Ll]?$", str(f), re.I | re.M)
]
if html_files:
- return str(html_files[0])
+ return str(html_files[0].relative_to(link.link_dir))
# Move up one directory level
search_dir = search_dir.parent
diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py
index 06832dbc..a496e03c 100644
--- a/archivebox/index/__init__.py
+++ b/archivebox/index/__init__.py
@@ -575,7 +575,7 @@ def is_archived(link: Link) -> bool:
return is_valid(link) and link.is_archived
def is_unarchived(link: Link) -> bool:
- if not os.path.exists(link.link_dir):
+ if not Path(link.link_dir).exists():
return True
return not link.is_archived
diff --git a/archivebox/index/html.py b/archivebox/index/html.py
index a46611d6..793a60af 100644
--- a/archivebox/index/html.py
+++ b/archivebox/index/html.py
@@ -1,7 +1,5 @@
__package__ = 'archivebox.index'
-import os
-
from string import Template
from datetime import datetime
from typing import List, Optional, Iterator, Mapping
@@ -30,11 +28,10 @@ from ..config import (
FAVICON_FILENAME,
)
-join = lambda *paths: os.path.join(*paths)
-MAIN_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index.html')
-MINIMAL_INDEX_TEMPLATE = join(TEMPLATES_DIR, 'main_index_minimal.html')
-MAIN_INDEX_ROW_TEMPLATE = join(TEMPLATES_DIR, 'main_index_row.html')
-LINK_DETAILS_TEMPLATE = join(TEMPLATES_DIR, 'link_details.html')
+MAIN_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index.html')
+MINIMAL_INDEX_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_minimal.html')
+MAIN_INDEX_ROW_TEMPLATE = str(Path(TEMPLATES_DIR) / 'main_index_row.html')
+LINK_DETAILS_TEMPLATE = str(Path(TEMPLATES_DIR) / 'link_details.html')
TITLE_LOADING_MSG = 'Not yet archived...'
@@ -44,8 +41,8 @@ TITLE_LOADING_MSG = 'Not yet archived...'
def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
"""parse an archive index html file and return the list of urls"""
- index_path = join(out_dir, HTML_INDEX_FILENAME)
- if os.path.exists(index_path):
+ index_path = Path(out_dir) / HTML_INDEX_FILENAME
+ if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f:
for line in f:
if 'class="link-url"' in line:
@@ -56,12 +53,12 @@ def parse_html_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[str]:
def write_html_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR, finished: bool=False) -> None:
"""write the html link index to a given path"""
- copy_and_overwrite(join(TEMPLATES_DIR, FAVICON_FILENAME), join(out_dir, FAVICON_FILENAME))
- copy_and_overwrite(join(TEMPLATES_DIR, ROBOTS_TXT_FILENAME), join(out_dir, ROBOTS_TXT_FILENAME))
- copy_and_overwrite(join(TEMPLATES_DIR, STATIC_DIR_NAME), join(out_dir, STATIC_DIR_NAME))
+ copy_and_overwrite(str(Path(TEMPLATES_DIR) / FAVICON_FILENAME), str(out_dir / FAVICON_FILENAME))
+ copy_and_overwrite(str(Path(TEMPLATES_DIR) / ROBOTS_TXT_FILENAME), str(out_dir / ROBOTS_TXT_FILENAME))
+ copy_and_overwrite(str(Path(TEMPLATES_DIR) / STATIC_DIR_NAME), str(out_dir / STATIC_DIR_NAME))
rendered_html = main_index_template(links, finished=finished)
- atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
+ atomic_write(str(out_dir / HTML_INDEX_FILENAME), rendered_html)
@enforce_types
@@ -100,7 +97,7 @@ def main_index_row_template(link: Link) -> str:
# before pages are finished archiving, show fallback loading favicon
'favicon_url': (
- join(ARCHIVE_DIR_NAME, link.timestamp, 'favicon.ico')
+ str(Path(ARCHIVE_DIR_NAME) / link.timestamp / 'favicon.ico')
# if link['is_archived'] else ''
),
@@ -119,7 +116,7 @@ def write_html_link_details(link: Link, out_dir: Optional[str]=None) -> None:
out_dir = out_dir or link.link_dir
rendered_html = link_details_template(link)
- atomic_write(join(out_dir, HTML_INDEX_FILENAME), rendered_html)
+ atomic_write(str(Path(out_dir) / HTML_INDEX_FILENAME), rendered_html)
@enforce_types
diff --git a/archivebox/index/json.py b/archivebox/index/json.py
index 76e6ec80..36c5ccdb 100644
--- a/archivebox/index/json.py
+++ b/archivebox/index/json.py
@@ -45,8 +45,8 @@ MAIN_INDEX_HEADER = {
def parse_json_main_index(out_dir: Path=OUTPUT_DIR) -> Iterator[Link]:
"""parse an archive index json file and return the list of links"""
- index_path = os.path.join(out_dir, JSON_INDEX_FILENAME)
- if os.path.exists(index_path):
+ index_path = Path(out_dir) / JSON_INDEX_FILENAME
+ if index_path.exists():
with open(index_path, 'r', encoding='utf-8') as f:
links = pyjson.load(f)['links']
for link_json in links:
@@ -86,7 +86,7 @@ def write_json_main_index(links: List[Link], out_dir: Path=OUTPUT_DIR) -> None:
'last_run_cmd': sys.argv,
'links': links,
}
- atomic_write(os.path.join(out_dir, JSON_INDEX_FILENAME), main_index_json)
+ atomic_write(str(Path(out_dir) / JSON_INDEX_FILENAME), main_index_json)
### Link Details Index
@@ -96,15 +96,15 @@ def write_json_link_details(link: Link, out_dir: Optional[str]=None) -> None:
"""write a json file with some info about the link"""
out_dir = out_dir or link.link_dir
- path = os.path.join(out_dir, JSON_INDEX_FILENAME)
- atomic_write(path, link._asdict(extended=True))
+ path = Path(out_dir) / JSON_INDEX_FILENAME
+ atomic_write(str(path), link._asdict(extended=True))
@enforce_types
def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=False) -> Optional[Link]:
"""load the json link index from a given directory"""
- existing_index = os.path.join(out_dir, JSON_INDEX_FILENAME)
- if os.path.exists(existing_index):
+ existing_index = Path(out_dir) / JSON_INDEX_FILENAME
+ if existing_index.exists():
with open(existing_index, 'r', encoding='utf-8') as f:
try:
link_json = pyjson.load(f)
@@ -118,9 +118,9 @@ def parse_json_link_details(out_dir: Union[Path, str], guess: Optional[bool]=Fal
def parse_json_links_details(out_dir: Union[Path, str]) -> Iterator[Link]:
"""read through all the archive data folders and return the parsed links"""
- for entry in os.scandir(os.path.join(out_dir, ARCHIVE_DIR_NAME)):
+ for entry in os.scandir(Path(out_dir) / ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
- if os.path.exists(os.path.join(entry.path, 'index.json')):
+ if (Path(entry.path) / 'index.json').exists():
try:
link = parse_json_link_details(entry.path)
except KeyError:
diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py
index 7508890d..68d840a2 100644
--- a/archivebox/index/schema.py
+++ b/archivebox/index/schema.py
@@ -1,6 +1,5 @@
__package__ = 'archivebox.index'
-import os
from pathlib import Path
from datetime import datetime, timedelta
@@ -250,7 +249,7 @@ class Link:
@property
def link_dir(self) -> str:
from ..config import CONFIG
- return os.path.join(CONFIG['ARCHIVE_DIR'], self.timestamp)
+ return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp)
@property
def archive_path(self) -> str:
@@ -369,7 +368,7 @@ class Link:
)
return any(
- os.path.exists(os.path.join(ARCHIVE_DIR, self.timestamp, path))
+ (Path(ARCHIVE_DIR) / self.timestamp / path).exists()
for path in output_paths
)
diff --git a/archivebox/logging_util.py b/archivebox/logging_util.py
index 089d49ab..eef0c30e 100644
--- a/archivebox/logging_util.py
+++ b/archivebox/logging_util.py
@@ -390,7 +390,7 @@ def log_list_finished(links):
def log_removal_started(links: List["Link"], yes: bool, delete: bool):
print('{lightyellow}[i] Found {} matching URLs to remove.{reset}'.format(len(links), **ANSI))
if delete:
- file_counts = [link.num_outputs for link in links if os.path.exists(link.link_dir)]
+ file_counts = [link.num_outputs for link in links if Path(link.link_dir).exists()]
print(
f' {len(links)} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
@@ -445,9 +445,9 @@ def log_shell_welcome_msg():
@enforce_types
def pretty_path(path: Union[Path, str]) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
- pwd = os.path.abspath('.')
+ pwd = Path('.').resolve()
# parent = os.path.abspath(os.path.join(pwd, os.path.pardir))
- return str(path).replace(pwd + '/', './')
+ return str(path).replace(str(pwd) + '/', './')
@enforce_types
@@ -518,11 +518,11 @@ def printable_folder_status(name: str, folder: Dict) -> str:
color, symbol, note, num_files = 'lightyellow', '-', 'disabled', '-'
if folder['path']:
- if os.path.exists(folder['path']):
+ if Path(folder['path']).exists():
num_files = (
f'{len(os.listdir(folder["path"]))} files'
- if os.path.isdir(folder['path']) else
- printable_filesize(os.path.getsize(folder['path']))
+ if Path(folder['path']).is_dir() else
+ printable_filesize(Path(folder['path']).stat().st_size)
)
else:
num_files = 'missing'
diff --git a/archivebox/parsers/__init__.py b/archivebox/parsers/__init__.py
index 520b9609..5d0d5ca5 100644
--- a/archivebox/parsers/__init__.py
+++ b/archivebox/parsers/__init__.py
@@ -8,7 +8,6 @@ For examples of supported import formats see tests/.
__package__ = 'archivebox.parsers'
import re
-import os
from io import StringIO
from typing import IO, Tuple, List, Optional
@@ -128,7 +127,7 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None)
@enforce_types
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
ts = str(datetime.now().timestamp()).split('.', 1)[0]
- source_path = os.path.join(out_dir, SOURCES_DIR_NAME, filename.format(ts=ts))
+ source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
atomic_write(source_path, raw_text)
log_source_saved(source_file=source_path)
return source_path
@@ -138,7 +137,7 @@ def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir:
def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{basename}.txt', out_dir: Path=OUTPUT_DIR) -> str:
"""download a given url's content into output/sources/domain-.txt"""
ts = str(datetime.now().timestamp()).split('.', 1)[0]
- source_path = os.path.join(OUTPUT_DIR, SOURCES_DIR_NAME, filename.format(basename=basename(path), ts=ts))
+ source_path = str(OUTPUT_DIR / SOURCES_DIR_NAME / filename.format(basename=basename(path), ts=ts))
if any(path.startswith(s) for s in ('http://', 'https://', 'ftp://')):
# Source is a URL that needs to be downloaded
diff --git a/archivebox/system.py b/archivebox/system.py
index f7d1d41c..e07c69c7 100644
--- a/archivebox/system.py
+++ b/archivebox/system.py
@@ -64,7 +64,7 @@ def chmod_file(path: str, cwd: str='.', permissions: str=OUTPUT_PERMISSIONS) ->
@enforce_types
def copy_and_overwrite(from_path: str, to_path: str):
"""copy a given file or directory to a given path, overwriting the destination"""
- if os.path.isdir(from_path):
+ if Path(from_path).is_dir():
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
diff --git a/archivebox/themes/default/add_links.html b/archivebox/themes/default/add_links.html
index cb6f4341..0b384f5c 100644
--- a/archivebox/themes/default/add_links.html
+++ b/archivebox/themes/default/add_links.html
@@ -49,6 +49,12 @@
(it's safe to leave this page, adding will continue in the background)