diff --git a/archivebox/core/views.py b/archivebox/core/views.py index 0964696e..e323eddc 100644 --- a/archivebox/core/views.py +++ b/archivebox/core/views.py @@ -2,7 +2,6 @@ __package__ = 'archivebox.core' from typing import Callable -import threading from pathlib import Path from django.shortcuts import render, redirect @@ -12,6 +11,7 @@ from django.views import View from django.views.generic.list import ListView from django.views.generic import FormView from django.db.models import Q +from django.conf import settings from django.contrib import messages from django.contrib.auth.mixins import UserPassesTestMixin from django.views.decorators.csrf import csrf_exempt @@ -20,6 +20,8 @@ from django.utils.decorators import method_decorator from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink +import archivebox +from archivebox.constants import CONSTANTS from core.models import Snapshot from core.forms import AddLinkForm @@ -27,28 +29,17 @@ from core.admin import result_url from queues.tasks import bg_add +from ..plugins_sys.config.apps import SHELL_CONFIG, SERVER_CONFIG +from ..plugins_extractor.archivedotorg.apps import ARCHIVEDOTORG_CONFIG + from ..config import ( - OUTPUT_DIR, - PUBLIC_INDEX, - PUBLIC_SNAPSHOTS, - PUBLIC_ADD_VIEW, - VERSION, - COMMIT_HASH, - FOOTER_INFO, - SNAPSHOTS_PER_PAGE, - CONFIG, CONFIG_SCHEMA, DYNAMIC_CONFIG_SCHEMA, USER_CONFIG, - SAVE_ARCHIVE_DOT_ORG, - PREVIEW_ORIGINALS, - CONSTANTS, ) from ..logging_util import printable_filesize -from ..main import add -from ..util import base_url, ansi_to_html, htmlencode, urldecode, urlencode, ts_to_date_str +from ..util import base_url, htmlencode, ts_to_date_str from ..search import query_search_index -from ..extractors.wget import wget_output_path from .serve_static import serve_static_with_byterange_support @@ -57,7 +48,7 @@ class HomepageView(View): if request.user.is_authenticated: return redirect('/admin/core/snapshot/') - if PUBLIC_INDEX: + if SERVER_CONFIG.PUBLIC_INDEX: return redirect('/public') return redirect(f'/admin/login/?next={request.path}') @@ -166,8 +157,8 @@ class SnapshotView(View): 'status_color': 'success' if link.is_archived else 'danger', 'oldest_archive_date': ts_to_date_str(link.oldest_archive_date), 'warc_path': warc_path, - 'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG, - 'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS, + 'SAVE_ARCHIVE_DOT_ORG': ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG, + 'PREVIEW_ORIGINALS': SERVER_CONFIG.PREVIEW_ORIGINALS, 'archiveresults': sorted(archiveresults.values(), key=lambda r: all_types.index(r['name']) if r['name'] in all_types else -r['size']), 'best_result': best_result, # 'tags_str': 'somealskejrewlkrjwer,werlmwrwlekrjewlkrjwer324m532l,4m32,23m324234', @@ -176,7 +167,7 @@ class SnapshotView(View): def get(self, request, path): - if not request.user.is_authenticated and not PUBLIC_SNAPSHOTS: + if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS: return redirect(f'/admin/login/?next={request.path}') snapshot = None @@ -381,15 +372,15 @@ class SnapshotView(View): class PublicIndexView(ListView): template_name = 'public_index.html' model = Snapshot - paginate_by = SNAPSHOTS_PER_PAGE + paginate_by = SERVER_CONFIG.SNAPSHOTS_PER_PAGE ordering = ['-bookmarked_at', '-created_at'] def get_context_data(self, **kwargs): return { **super().get_context_data(**kwargs), - 'VERSION': VERSION, - 'COMMIT_HASH': COMMIT_HASH, - 'FOOTER_INFO': FOOTER_INFO, + 'VERSION': archivebox.VERSION, + 'COMMIT_HASH': SHELL_CONFIG.COMMIT_HASH, + 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, } def get_queryset(self, **kwargs): @@ -428,7 +419,7 @@ class PublicIndexView(ListView): return qs.distinct() def get(self, *args, **kwargs): - if PUBLIC_INDEX or self.request.user.is_authenticated: + if SERVER_CONFIG.PUBLIC_INDEX or self.request.user.is_authenticated: response = super().get(*args, **kwargs) return response else: @@ -449,7 +440,7 @@ class AddView(UserPassesTestMixin, FormView): return super().get_initial() def test_func(self): - return PUBLIC_ADD_VIEW or self.request.user.is_authenticated + return SERVER_CONFIG.PUBLIC_ADD_VIEW or self.request.user.is_authenticated def get_context_data(self, **kwargs): return { @@ -457,8 +448,8 @@ class AddView(UserPassesTestMixin, FormView): 'title': "Add URLs", # We can't just call request.build_absolute_uri in the template, because it would include query parameters 'absolute_add_path': self.request.build_absolute_uri(self.request.path), - 'VERSION': VERSION, - 'FOOTER_INFO': FOOTER_INFO, + 'VERSION': archivebox.VERSION, + 'FOOTER_INFO': SERVER_CONFIG.FOOTER_INFO, 'stdout': '', } @@ -475,7 +466,7 @@ class AddView(UserPassesTestMixin, FormView): "depth": depth, "parser": parser, "update_all": False, - "out_dir": OUTPUT_DIR, + "out_dir": archivebox.DATA_DIR, "created_by_id": self.request.user.pk, } if extractors: diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index bcf48fc9..1c16c3bd 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -9,8 +9,6 @@ These are the old types we used to use before ArchiveBox v0.4 (before we switche __package__ = 'archivebox.index' -from pathlib import Path - from datetime import datetime, timezone, timedelta from typing import List, Dict, Any, Optional, Union @@ -19,9 +17,13 @@ from dataclasses import dataclass, asdict, field, fields from django.utils.functional import cached_property +from archivebox.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME + +from plugins_extractor.favicon.apps import FAVICON_CONFIG + from ..system import get_dir_size from ..util import ts_to_date_str, parse_date -from ..config import OUTPUT_DIR, ARCHIVE_DIR_NAME, FAVICON_PROVIDER + class ArchiveError(Exception): def __init__(self, message, hints=None): @@ -88,7 +90,7 @@ class ArchiveResult: info['start_ts'] = parse_date(info['start_ts']) info['end_ts'] = parse_date(info['end_ts']) if "pwd" not in keys: - info["pwd"] = str(Path(OUTPUT_DIR) / ARCHIVE_DIR_NAME / json_info["timestamp"]) + info["pwd"] = str(ARCHIVE_DIR / json_info["timestamp"]) if "cmd_version" not in keys: info["cmd_version"] = "Undefined" if "cmd" not in keys: @@ -281,12 +283,10 @@ class Link: @property def link_dir(self) -> str: - from ..config import CONFIG - return str(Path(CONFIG['ARCHIVE_DIR']) / self.timestamp) + return str(ARCHIVE_DIR / self.timestamp) @property def archive_path(self) -> str: - from ..config import ARCHIVE_DIR_NAME return '{}/{}'.format(ARCHIVE_DIR_NAME, self.timestamp) @property @@ -385,7 +385,6 @@ class Link: @property def is_archived(self) -> bool: - from ..config import ARCHIVE_DIR from ..util import domain output_paths = ( @@ -402,7 +401,7 @@ class Link: ) return any( - (Path(ARCHIVE_DIR) / self.timestamp / path).exists() + (ARCHIVE_DIR / self.timestamp / path).exists() for path in output_paths ) @@ -438,7 +437,7 @@ class Link: canonical = { 'index_path': 'index.html', 'favicon_path': 'favicon.ico', - 'google_favicon_path': FAVICON_PROVIDER.format(self.domain), + 'google_favicon_path': FAVICON_CONFIG.FAVICON_PROVIDER.format(self.domain), 'wget_path': wget_output_path(self), 'warc_path': 'warc/', 'singlefile_path': 'singlefile.html', diff --git a/archivebox/plugantic/views.py b/archivebox/plugantic/views.py index 05567641..6025cba9 100644 --- a/archivebox/plugantic/views.py +++ b/archivebox/plugantic/views.py @@ -12,6 +12,8 @@ from django.utils.html import format_html, mark_safe from admin_data_views.typing import TableContext, ItemContext from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink +import archivebox + from ..config_stubs import AttrDict from ..util import parse_date @@ -378,9 +380,8 @@ def worker_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: def log_list_view(request: HttpRequest, **kwargs) -> TableContext: assert request.user.is_superuser, "Must be a superuser to view configuration settings." - from django.conf import settings - log_files = settings.CONFIG.LOGS_DIR.glob("*.log") + log_files = archivebox.CONSTANTS.LOGS_DIR.glob("*.log") log_files = sorted(log_files, key=os.path.getmtime)[::-1] rows = { @@ -418,7 +419,7 @@ def log_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext: from django.conf import settings - log_file = [logfile for logfile in settings.CONFIG.LOGS_DIR.glob('*.log') if key in logfile.name][0] + log_file = [logfile for logfile in archivebox.CONSTANTS.LOGS_DIR.glob('*.log') if key in logfile.name][0] log_text = log_file.read_text() log_stat = log_file.stat() diff --git a/archivebox/plugins_search/ripgrep/apps.py b/archivebox/plugins_search/ripgrep/apps.py index 006a049a..ba398762 100644 --- a/archivebox/plugins_search/ripgrep/apps.py +++ b/archivebox/plugins_search/ripgrep/apps.py @@ -37,7 +37,7 @@ class RipgrepConfig(BaseConfigSet): '--files-with-matches', '--regexp', ]) - RIPGREP_SEARCH_DIR: str = Field(default=lambda: str(settings.ARCHIVE_DIR)) + RIPGREP_SEARCH_DIR: Path = archivebox.CONSTANTS.ARCHIVE_DIR RIPGREP_CONFIG = RipgrepConfig() @@ -81,7 +81,7 @@ class RipgrepSearchBackend(BaseSearchBackend): ripgrep_binary.abspath, *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT, text, - RIPGREP_CONFIG.RIPGREP_SEARCH_DIR, + str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR), ] proc = run(cmd, timeout=SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT, capture_output=True, text=True) timestamps = set() diff --git a/archivebox/util.py b/archivebox/util.py index 33409c3c..4db47a85 100644 --- a/archivebox/util.py +++ b/archivebox/util.py @@ -18,13 +18,19 @@ from requests.exceptions import RequestException, ReadTimeout from base32_crockford import encode as base32_encode # type: ignore from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding - try: import chardet detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"] except ImportError: detect_encoding = lambda rawdata: "utf-8" + +from archivebox.constants import STATICFILE_EXTENSIONS +from archivebox.plugins_sys.config.apps import ARCHIVING_CONFIG + +from .misc.logging import COLOR_DICT + + ### Parsing Helpers # All of these are (str) -> str @@ -114,7 +120,6 @@ def find_all_urls(urls_str: str): def is_static_file(url: str): # TODO: the proper way is with MIME type detection + ext, not only extension - from .config import STATICFILE_EXTENSIONS return extension(url).lower() in STATICFILE_EXTENSIONS @@ -206,25 +211,20 @@ def parse_date(date: Any) -> Optional[datetime]: @enforce_types def download_url(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the text""" - from .config import ( - TIMEOUT, - CHECK_SSL_VALIDITY, - WGET_USER_AGENT, - COOKIES_FILE, - ) - timeout = timeout or TIMEOUT + + timeout = timeout or ARCHIVING_CONFIG.TIMEOUT session = requests.Session() - if COOKIES_FILE and Path(COOKIES_FILE).is_file(): - cookie_jar = http.cookiejar.MozillaCookieJar(COOKIES_FILE) + if ARCHIVING_CONFIG.COOKIES_FILE and Path(ARCHIVING_CONFIG.COOKIES_FILE).is_file(): + cookie_jar = http.cookiejar.MozillaCookieJar(ARCHIVING_CONFIG.COOKIES_FILE) cookie_jar.load(ignore_discard=True, ignore_expires=True) for cookie in cookie_jar: session.cookies.set(cookie.name, cookie.value, domain=cookie.domain, path=cookie.path) response = session.get( url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, + headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, timeout=timeout, ) @@ -243,14 +243,13 @@ def download_url(url: str, timeout: int=None) -> str: @enforce_types def get_headers(url: str, timeout: int=None) -> str: """Download the contents of a remote url and return the headers""" - from .config import TIMEOUT, CHECK_SSL_VALIDITY, WGET_USER_AGENT - timeout = timeout or TIMEOUT + timeout = timeout or ARCHIVING_CONFIG.TIMEOUT try: response = requests.head( url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, + headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, timeout=timeout, allow_redirects=True, ) @@ -261,8 +260,8 @@ def get_headers(url: str, timeout: int=None) -> str: except RequestException: response = requests.get( url, - headers={'User-Agent': WGET_USER_AGENT}, - verify=CHECK_SSL_VALIDITY, + headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT}, + verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY, timeout=timeout, stream=True ) @@ -285,7 +284,6 @@ def ansi_to_html(text: str) -> str: """ Based on: https://stackoverflow.com/questions/19212665/python-converting-ansi-color-codes-to-html """ - from .config import COLOR_DICT TEMPLATE = '
' text = text.replace('[m', '
')