mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2025-02-16 13:28:29 +00:00
move util.py into misc folder
This commit is contained in:
parent
dfca4b13b2
commit
363a499289
68 changed files with 136 additions and 161 deletions
|
@ -11,7 +11,7 @@ from uuid import UUID
|
|||
from typeid import TypeID # type: ignore[import-untyped]
|
||||
from datetime import datetime
|
||||
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
|
||||
ABID_PREFIX_LEN = 4
|
||||
|
|
|
@ -13,7 +13,7 @@ from django_object_actions import DjangoObjectActions, action
|
|||
|
||||
from api.auth import get_or_create_api_token
|
||||
|
||||
from ..util import parse_date
|
||||
from archivebox.misc.util import parse_date
|
||||
from .abid import ABID
|
||||
|
||||
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):
|
||||
|
|
|
@ -25,7 +25,7 @@ class BaseQueue(BaseHook):
|
|||
|
||||
@property
|
||||
def tasks(self) -> Dict[str, 'TaskWrapper']:
|
||||
"""Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file."""
|
||||
"""Return a dict of all the background worker tasks defined in the plugin's tasks.py file."""
|
||||
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
|
||||
|
||||
all_tasks = {}
|
||||
|
@ -83,7 +83,7 @@ class BaseQueue(BaseHook):
|
|||
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
|
||||
|
||||
# Update settings.WORKERS to include this worker
|
||||
settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({})
|
||||
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
|
||||
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
|
||||
|
||||
return worker
|
||||
|
|
|
@ -12,7 +12,7 @@ from ..main import (
|
|||
list_all,
|
||||
schedule,
|
||||
)
|
||||
from ..util import ansi_to_html
|
||||
from archivebox.misc.util import ansi_to_html
|
||||
from ..config.legacy import ONLY_NEW
|
||||
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import List, Optional, IO
|
||||
|
||||
from ..main import add
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..parsers import PARSERS
|
||||
from ..config.legacy import OUTPUT_DIR, ONLY_NEW
|
||||
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import config
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import help
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import init
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import list_all
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..index import (
|
||||
LINK_FILTERS,
|
||||
|
|
|
@ -8,7 +8,7 @@ import sys
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import manage
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
|
||||
|
||||
|
|
|
@ -10,7 +10,7 @@ from pathlib import Path
|
|||
from typing import List, Optional, IO
|
||||
|
||||
from ..main import oneshot
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, accept_stdin, stderr
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import remove
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, accept_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import schedule
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import server
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR, BIND_ADDR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import setup
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import shell
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import status
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import List, Optional, IO
|
||||
|
||||
from ..main import update
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..index import (
|
||||
LINK_FILTERS,
|
||||
|
|
|
@ -9,7 +9,7 @@ import argparse
|
|||
from typing import Optional, List, IO
|
||||
|
||||
from ..main import version
|
||||
from ..util import docstring
|
||||
from archivebox.misc.util import docstring
|
||||
from ..config.legacy import OUTPUT_DIR
|
||||
from ..logging_util import SmartFormatter, reject_stdin
|
||||
|
||||
|
|
|
@ -9,17 +9,12 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
|
|||
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
|
||||
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
|
||||
|
||||
# class AttrDict(dict):
|
||||
# def __init__(self, *args, **kwargs):
|
||||
# super().__init__(*args, **kwargs)
|
||||
# self.__dict__ = self
|
||||
AttrDict = benedict # https://github.com/fabiocaccamo/python-benedict/
|
||||
|
||||
|
||||
class BaseConfig(TypedDict):
|
||||
pass
|
||||
|
||||
class ConfigDict(BaseConfig, AttrDict, total=False):
|
||||
class ConfigDict(BaseConfig, benedict, total=False):
|
||||
"""
|
||||
# Regenerate by pasting this quine into `archivebox shell` 🥚
|
||||
from archivebox.config import ConfigDict, CONFIG_DEFAULTS
|
||||
|
|
|
@ -173,7 +173,7 @@ class ConstantsDict(Mapping):
|
|||
# actually empty so that we don't clobber someone's home directory or desktop by accident.
|
||||
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
||||
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
||||
ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
|
||||
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
|
||||
*INGORED_PATHS,
|
||||
*PIP_RELATED_NAMES,
|
||||
*NPM_RELATED_NAMES,
|
||||
|
@ -212,7 +212,7 @@ class ConstantsDict(Mapping):
|
|||
})
|
||||
|
||||
DATA_LOCATIONS = benedict({
|
||||
"OUTPUT_DIR": {
|
||||
"DATA_DIR": {
|
||||
"path": DATA_DIR.resolve(),
|
||||
"enabled": True,
|
||||
"is_valid": DATABASE_FILE.exists(),
|
||||
|
|
|
@ -23,7 +23,7 @@ from signal_webhooks.utils import get_webhook_model
|
|||
|
||||
from archivebox.config import VERSION
|
||||
|
||||
from ..util import htmldecode, urldecode
|
||||
from archivebox.misc.util import htmldecode, urldecode
|
||||
|
||||
from core.models import Snapshot, ArchiveResult, Tag
|
||||
from core.mixins import SearchResultsAdminMixin
|
||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
|
|||
|
||||
from django import forms
|
||||
|
||||
from ..util import URL_REGEX
|
||||
from archivebox.misc.util import URL_REGEX
|
||||
from ..parsers import PARSERS
|
||||
from taggit.utils import edit_string_for_tags, parse_tags
|
||||
|
||||
|
|
|
@ -23,7 +23,7 @@ from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
|
|||
from queues.tasks import bg_archive_snapshot
|
||||
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from ..util import parse_date, base_url
|
||||
from archivebox.misc.util import parse_date, base_url
|
||||
from ..index.schema import Link
|
||||
from ..index.html import snapshot_icons
|
||||
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
|
||||
|
@ -231,7 +231,7 @@ class Snapshot(ABIDModel):
|
|||
|
||||
@cached_property
|
||||
def extension(self) -> str:
|
||||
from ..util import extension
|
||||
from archivebox.misc.util import extension
|
||||
return extension(self.url)
|
||||
|
||||
@cached_property
|
||||
|
|
|
@ -37,7 +37,7 @@ from ..config.legacy import (
|
|||
CONFIG,
|
||||
)
|
||||
from ..logging_util import printable_filesize
|
||||
from ..util import base_url, htmlencode, ts_to_date_str
|
||||
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
|
||||
from ..search import query_search_index
|
||||
from .serve_static import serve_static_with_byterange_support
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ from ..index import (
|
|||
load_link_details,
|
||||
write_link_details,
|
||||
)
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
from ..logging_util import (
|
||||
log_archiving_started,
|
||||
log_archiving_paused,
|
||||
|
|
|
@ -7,7 +7,7 @@ from collections import defaultdict
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file, atomic_write
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from archivebox.misc.system import chmod_file, run
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
domain,
|
||||
dedupe,
|
||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
domain,
|
||||
|
|
|
@ -6,7 +6,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
get_headers,
|
||||
dedupe,
|
||||
|
|
|
@ -13,7 +13,7 @@ from ..config.legacy import (
|
|||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from ..logging_util import TimedProgress
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import enforce_types, is_static_file, dedupe
|
||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
|
|
@ -8,7 +8,7 @@ import json
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from archivebox.misc.system import run, atomic_write
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
dedupe,
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
is_static_file,
|
||||
)
|
||||
|
|
|
@ -8,7 +8,7 @@ import json
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from archivebox.misc.system import run, atomic_write
|
||||
from ..util import enforce_types, is_static_file
|
||||
from archivebox.misc.util import enforce_types, is_static_file
|
||||
from ..logging_util import TimedProgress
|
||||
from .title import get_html
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import Optional
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import enforce_types, is_static_file
|
||||
from archivebox.misc.util import enforce_types, is_static_file
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
|
|
@ -7,7 +7,7 @@ import json
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import enforce_types, is_static_file, dedupe
|
||||
from archivebox.misc.util import enforce_types, is_static_file, dedupe
|
||||
from ..logging_util import TimedProgress
|
||||
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from pathlib import Path
|
|||
from typing import Optional
|
||||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
download_url,
|
||||
htmldecode,
|
||||
|
|
|
@ -8,7 +8,7 @@ from datetime import datetime, timezone
|
|||
|
||||
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
|
||||
from archivebox.misc.system import run, chmod_file
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
without_fragment,
|
||||
without_query,
|
||||
|
|
|
@ -13,7 +13,7 @@ from django.db.models import QuerySet, Q
|
|||
|
||||
|
||||
from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
scheme,
|
||||
enforce_types,
|
||||
ExtendedEncoder,
|
||||
|
|
|
@ -2,7 +2,7 @@ __package__ = 'archivebox.index'
|
|||
|
||||
from typing import List, Optional, Any
|
||||
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
from .schema import Link
|
||||
|
||||
|
||||
|
|
|
@ -11,7 +11,7 @@ from django.core.cache import cache
|
|||
from .schema import Link
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..logging_util import printable_filesize
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
ts_to_date_str,
|
||||
urlencode,
|
||||
|
|
|
@ -12,7 +12,7 @@ from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL
|
|||
|
||||
from .schema import Link
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -22,7 +22,7 @@ from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
|
|||
from plugins_extractor.favicon.apps import FAVICON_CONFIG
|
||||
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from ..util import ts_to_date_str, parse_date
|
||||
from archivebox.misc.util import ts_to_date_str, parse_date
|
||||
|
||||
|
||||
class ArchiveError(Exception):
|
||||
|
@ -67,7 +67,7 @@ class ArchiveResult:
|
|||
|
||||
@classmethod
|
||||
def guess_ts(_cls, dict_info):
|
||||
from ..util import parse_date
|
||||
from archivebox.misc.util import parse_date
|
||||
parsed_timestamp = parse_date(dict_info["timestamp"])
|
||||
start_ts = parsed_timestamp
|
||||
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
|
||||
|
@ -75,7 +75,7 @@ class ArchiveResult:
|
|||
|
||||
@classmethod
|
||||
def from_json(cls, json_info, guess=False):
|
||||
from ..util import parse_date
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
info = {
|
||||
key: val
|
||||
|
@ -231,7 +231,7 @@ class Link:
|
|||
|
||||
@classmethod
|
||||
def from_json(cls, json_info, guess=False):
|
||||
from ..util import parse_date
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
info = {
|
||||
key: val
|
||||
|
@ -299,38 +299,38 @@ class Link:
|
|||
### URL Helpers
|
||||
@property
|
||||
def url_hash(self):
|
||||
from ..util import hashurl
|
||||
from archivebox.misc.util import hashurl
|
||||
|
||||
return hashurl(self.url)
|
||||
|
||||
@property
|
||||
def scheme(self) -> str:
|
||||
from ..util import scheme
|
||||
from archivebox.misc.util import scheme
|
||||
return scheme(self.url)
|
||||
|
||||
@property
|
||||
def extension(self) -> str:
|
||||
from ..util import extension
|
||||
from archivebox.misc.util import extension
|
||||
return extension(self.url)
|
||||
|
||||
@property
|
||||
def domain(self) -> str:
|
||||
from ..util import domain
|
||||
from archivebox.misc.util import domain
|
||||
return domain(self.url)
|
||||
|
||||
@property
|
||||
def path(self) -> str:
|
||||
from ..util import path
|
||||
from archivebox.misc.util import path
|
||||
return path(self.url)
|
||||
|
||||
@property
|
||||
def basename(self) -> str:
|
||||
from ..util import basename
|
||||
from archivebox.misc.util import basename
|
||||
return basename(self.url)
|
||||
|
||||
@property
|
||||
def base_url(self) -> str:
|
||||
from ..util import base_url
|
||||
from archivebox.misc.util import base_url
|
||||
return base_url(self.url)
|
||||
|
||||
### Pretty Printing Helpers
|
||||
|
@ -380,12 +380,12 @@ class Link:
|
|||
|
||||
@property
|
||||
def is_static(self) -> bool:
|
||||
from ..util import is_static_file
|
||||
from archivebox.misc.util import is_static_file
|
||||
return is_static_file(self.url)
|
||||
|
||||
@property
|
||||
def is_archived(self) -> bool:
|
||||
from ..util import domain
|
||||
from archivebox.misc.util import domain
|
||||
|
||||
output_paths = (
|
||||
domain(self.url),
|
||||
|
|
|
@ -9,7 +9,7 @@ from django.db.models import QuerySet
|
|||
from django.db import transaction
|
||||
|
||||
from .schema import Link
|
||||
from ..util import enforce_types, parse_date
|
||||
from archivebox.misc.util import enforce_types, parse_date
|
||||
from ..config.legacy import (
|
||||
OUTPUT_DIR,
|
||||
TAG_SEPARATOR_PATTERN,
|
||||
|
|
|
@ -23,8 +23,8 @@ from rich.panel import Panel
|
|||
|
||||
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from .util import enforce_types
|
||||
from .misc.logging import ANSI, stderr
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.logging import ANSI, stderr
|
||||
|
||||
@dataclass
|
||||
class RuntimeStats:
|
||||
|
|
|
@ -28,10 +28,10 @@ from .parsers import (
|
|||
save_file_as_source,
|
||||
parse_links_memory,
|
||||
)
|
||||
from .index.schema import Link
|
||||
from .util import enforce_types # type: ignore
|
||||
from archivebox.misc.util import enforce_types # type: ignore
|
||||
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
|
||||
from archivebox.misc.system import run as run_shell
|
||||
from .index.schema import Link
|
||||
from .index import (
|
||||
load_main_index,
|
||||
parse_links_from_source,
|
||||
|
@ -61,14 +61,12 @@ from .index.sql import (
|
|||
apply_migrations,
|
||||
remove_from_sql_main_index,
|
||||
)
|
||||
from .index.html import (
|
||||
generate_index_from_links,
|
||||
)
|
||||
from .index.html import generate_index_from_links
|
||||
from .index.csv import links_to_csv
|
||||
from .extractors import archive_links, archive_link, ignore_methods
|
||||
from .misc.logging import stderr, hint
|
||||
from .misc.checks import check_data_folder
|
||||
from .config.legacy import (
|
||||
from archivebox.misc.logging import stderr, hint
|
||||
from archivebox.misc.checks import check_data_folder
|
||||
from archivebox.config.legacy import (
|
||||
write_config_file,
|
||||
DEPENDENCIES,
|
||||
load_all_config,
|
||||
|
@ -194,7 +192,7 @@ def version(quiet: bool=False,
|
|||
f'PLATFORM={platform.platform()}',
|
||||
f'PYTHON={sys.implementation.name.title()}',
|
||||
)
|
||||
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['DATA_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
|
||||
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
|
||||
print(
|
||||
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
|
||||
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
|
||||
|
@ -221,7 +219,7 @@ def version(quiet: bool=False,
|
|||
|
||||
print()
|
||||
print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI))
|
||||
for name, binary in settings.BINARIES.items():
|
||||
for name, binary in reversed(list(settings.BINARIES.items())):
|
||||
err = None
|
||||
try:
|
||||
loaded_bin = binary.load()
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
__package__ = 'archivebox'
|
||||
__package__ = 'archivebox.misc'
|
||||
|
||||
|
||||
import os
|
||||
|
@ -14,8 +14,8 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
|
|||
from crontab import CronTab
|
||||
from atomicwrites import atomic_write as lib_atomic_write
|
||||
|
||||
from .util import enforce_types, ExtendedEncoder
|
||||
from .config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
|
||||
from archivebox.config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
|
||||
from archivebox.misc.util import enforce_types, ExtendedEncoder
|
||||
|
||||
|
||||
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
|
||||
|
|
|
@ -317,22 +317,6 @@ def dedupe(options: List[str]) -> List[str]:
|
|||
return list(deduped.values())
|
||||
|
||||
|
||||
class AttributeDict(dict):
|
||||
"""Helper to allow accessing dict values via Example.key or Example['key']"""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
# Recursively convert nested dicts to AttributeDicts (optional):
|
||||
# for key, val in self.items():
|
||||
# if isinstance(val, dict) and type(val) is not AttributeDict:
|
||||
# self[key] = AttributeDict(val)
|
||||
|
||||
def __getattr__(self, attr: str) -> Any:
|
||||
return dict.__getitem__(self, attr)
|
||||
|
||||
def __setattr__(self, attr: str, value: Any) -> None:
|
||||
return dict.__setitem__(self, attr, value)
|
||||
|
||||
|
||||
class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
"""
|
|
@ -22,7 +22,7 @@ from ..config.legacy import (
|
|||
stderr,
|
||||
hint,
|
||||
)
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
basename,
|
||||
htmldecode,
|
||||
download_url,
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
|
|||
from datetime import datetime, timezone
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
find_all_urls,
|
||||
|
|
|
@ -6,7 +6,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime, timezone
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
|
|
@ -5,7 +5,7 @@ import json
|
|||
from typing import IO, Iterable
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
)
|
||||
|
||||
|
|
|
@ -6,7 +6,7 @@ from time import mktime
|
|||
from feedparser import parse as feedparser
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types
|
||||
)
|
||||
|
|
|
@ -6,7 +6,7 @@ from datetime import datetime, timezone
|
|||
from pathlib import Path
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
find_all_urls,
|
||||
|
|
|
@ -7,7 +7,7 @@ from datetime import datetime
|
|||
from xml.etree import ElementTree
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
|
|
@ -6,7 +6,7 @@ from time import mktime
|
|||
from feedparser import parse as feedparser
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types
|
||||
)
|
||||
|
|
|
@ -11,7 +11,7 @@ from pocket import Pocket
|
|||
from archivebox.config import CONSTANTS
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..config.legacy import (
|
||||
POCKET_CONSUMER_KEY,
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
)
|
||||
|
|
|
@ -11,7 +11,7 @@ from configparser import ConfigParser
|
|||
from archivebox.config import CONSTANTS
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.system import atomic_write
|
||||
from ..config.legacy import READWISE_READER_TOKENS
|
||||
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
str_between,
|
||||
|
|
|
@ -7,7 +7,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime, timezone
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
enforce_types,
|
||||
URL_REGEX,
|
||||
)
|
||||
|
|
|
@ -5,7 +5,7 @@ from typing import IO, Iterable
|
|||
from datetime import datetime
|
||||
|
||||
from ..index.schema import Link
|
||||
from ..util import (
|
||||
from archivebox.misc.util import (
|
||||
htmldecode,
|
||||
enforce_types,
|
||||
str_between,
|
||||
|
|
|
@ -3,7 +3,7 @@ __package__ = 'archivebox.plugins_extractor.chrome'
|
|||
import sys
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, ClassVar
|
||||
from typing import List, Optional, Dict
|
||||
|
||||
# Depends on other PyPI/vendor packages:
|
||||
from rich import print
|
||||
|
@ -29,7 +29,7 @@ from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
|
|||
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
|
||||
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
from ...util import dedupe
|
||||
from archivebox.misc.util import dedupe
|
||||
|
||||
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
__package__ = 'archivebox.search'
|
||||
|
||||
from typing import List, Union
|
||||
from pathlib import Path
|
||||
|
||||
|
@ -5,12 +7,53 @@ from django.db.models import QuerySet
|
|||
from django.conf import settings
|
||||
|
||||
from archivebox.index.schema import Link
|
||||
from archivebox.util import enforce_types
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.logging import stderr
|
||||
from archivebox.config.legacy import ANSI
|
||||
|
||||
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
|
||||
|
||||
from .utils import get_indexable_content, log_index_started
|
||||
|
||||
def log_index_started(url):
|
||||
print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
|
||||
print( )
|
||||
|
||||
def get_file_result_content(res, extra_path, use_pwd=False):
|
||||
if use_pwd:
|
||||
fpath = f'{res.pwd}/{res.output}'
|
||||
else:
|
||||
fpath = f'{res.output}'
|
||||
|
||||
if extra_path:
|
||||
fpath = f'{fpath}/{extra_path}'
|
||||
|
||||
with open(fpath, 'r', encoding='utf-8') as file:
|
||||
data = file.read()
|
||||
if data:
|
||||
return [data]
|
||||
return []
|
||||
|
||||
|
||||
# This should be abstracted by a plugin interface for extractors
|
||||
@enforce_types
|
||||
def get_indexable_content(results: QuerySet):
|
||||
if not results:
|
||||
return []
|
||||
# Only use the first method available
|
||||
res, method = results.first(), results.first().extractor
|
||||
if method not in ('readability', 'singlefile', 'dom', 'wget'):
|
||||
return []
|
||||
# This should come from a plugin interface
|
||||
|
||||
# TODO: banish this duplication and get these from the extractor file
|
||||
if method == 'readability':
|
||||
return get_file_result_content(res, 'content.txt', use_pwd=True)
|
||||
elif method == 'singlefile':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
||||
elif method == 'dom':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
||||
elif method == 'wget':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
||||
|
||||
|
||||
def import_backend():
|
||||
|
|
|
@ -1,45 +0,0 @@
|
|||
from django.db.models import QuerySet
|
||||
|
||||
from archivebox.util import enforce_types
|
||||
from archivebox.config.legacy import ANSI
|
||||
|
||||
def log_index_started(url):
|
||||
print('{green}[*] Indexing url: {} in the search index {reset}'.format(url, **ANSI))
|
||||
print( )
|
||||
|
||||
def get_file_result_content(res, extra_path, use_pwd=False):
|
||||
if use_pwd:
|
||||
fpath = f'{res.pwd}/{res.output}'
|
||||
else:
|
||||
fpath = f'{res.output}'
|
||||
|
||||
if extra_path:
|
||||
fpath = f'{fpath}/{extra_path}'
|
||||
|
||||
with open(fpath, 'r', encoding='utf-8') as file:
|
||||
data = file.read()
|
||||
if data:
|
||||
return [data]
|
||||
return []
|
||||
|
||||
|
||||
# This should be abstracted by a plugin interface for extractors
|
||||
@enforce_types
|
||||
def get_indexable_content(results: QuerySet):
|
||||
if not results:
|
||||
return []
|
||||
# Only use the first method available
|
||||
res, method = results.first(), results.first().extractor
|
||||
if method not in ('readability', 'singlefile', 'dom', 'wget'):
|
||||
return []
|
||||
# This should come from a plugin interface
|
||||
|
||||
# TODO: banish this duplication and get these from the extractor file
|
||||
if method == 'readability':
|
||||
return get_file_result_content(res, 'content.txt', use_pwd=True)
|
||||
elif method == 'singlefile':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
||||
elif method == 'dom':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
||||
elif method == 'wget':
|
||||
return get_file_result_content(res, '', use_pwd=True)
|
Loading…
Add table
Reference in a new issue