move util.py into misc folder

This commit is contained in:
Nick Sweeting 2024-09-30 17:25:15 -07:00
parent dfca4b13b2
commit 363a499289
No known key found for this signature in database
68 changed files with 136 additions and 161 deletions

View file

@ -11,7 +11,7 @@ from uuid import UUID
from typeid import TypeID # type: ignore[import-untyped]
from datetime import datetime
from ..util import enforce_types
from archivebox.misc.util import enforce_types
ABID_PREFIX_LEN = 4

View file

@ -13,7 +13,7 @@ from django_object_actions import DjangoObjectActions, action
from api.auth import get_or_create_api_token
from ..util import parse_date
from archivebox.misc.util import parse_date
from .abid import ABID
def highlight_diff(display_val: Any, compare_val: Any, invert: bool=False, color_same: str | None=None, color_diff: str | None=None):

View file

@ -25,7 +25,7 @@ class BaseQueue(BaseHook):
@property
def tasks(self) -> Dict[str, 'TaskWrapper']:
"""Return an AttrDict of all the background worker tasks defined in the plugin's tasks.py file."""
"""Return an dict of all the background worker tasks defined in the plugin's tasks.py file."""
tasks = importlib.import_module(f"{self.plugin_module}.tasks")
all_tasks = {}
@ -83,7 +83,7 @@ class BaseQueue(BaseHook):
worker = start_worker(supervisor, self.get_supervisord_config(settings), lazy=lazy)
# Update settings.WORKERS to include this worker
settings.WORKERS = getattr(settings, "WORKERS", None) or AttrDict({})
settings.WORKERS = getattr(settings, "WORKERS", None) or benedict({})
settings.WORKERS[self.id] = self.start_supervisord_worker(settings, lazy=True)
return worker

View file

@ -12,7 +12,7 @@ from ..main import (
list_all,
schedule,
)
from ..util import ansi_to_html
from archivebox.misc.util import ansi_to_html
from ..config.legacy import ONLY_NEW

View file

@ -9,7 +9,7 @@ import argparse
from typing import List, Optional, IO
from ..main import add
from ..util import docstring
from archivebox.misc.util import docstring
from ..parsers import PARSERS
from ..config.legacy import OUTPUT_DIR, ONLY_NEW
from ..logging_util import SmartFormatter, accept_stdin, stderr

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import config
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import help
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import init
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import list_all
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..index import (
LINK_FILTERS,

View file

@ -8,7 +8,7 @@ import sys
from typing import Optional, List, IO
from ..main import manage
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR

View file

@ -10,7 +10,7 @@ from pathlib import Path
from typing import List, Optional, IO
from ..main import oneshot
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin, stderr

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import remove
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, accept_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import schedule
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import server
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR, BIND_ADDR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import setup
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import shell
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import status
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,7 +9,7 @@ import argparse
from typing import List, Optional, IO
from ..main import update
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..index import (
LINK_FILTERS,

View file

@ -9,7 +9,7 @@ import argparse
from typing import Optional, List, IO
from ..main import version
from ..util import docstring
from archivebox.misc.util import docstring
from ..config.legacy import OUTPUT_DIR
from ..logging_util import SmartFormatter, reject_stdin

View file

@ -9,17 +9,12 @@ SimpleConfigValueDict = Dict[str, SimpleConfigValue]
SimpleConfigValueGetter = Callable[[], SimpleConfigValue]
ConfigValue = Union[SimpleConfigValue, SimpleConfigValueDict, SimpleConfigValueGetter]
# class AttrDict(dict):
# def __init__(self, *args, **kwargs):
# super().__init__(*args, **kwargs)
# self.__dict__ = self
AttrDict = benedict # https://github.com/fabiocaccamo/python-benedict/
class BaseConfig(TypedDict):
pass
class ConfigDict(BaseConfig, AttrDict, total=False):
class ConfigDict(BaseConfig, benedict, total=False):
"""
# Regenerate by pasting this quine into `archivebox shell` 🥚
from archivebox.config import ConfigDict, CONFIG_DEFAULTS

View file

@ -173,7 +173,7 @@ class ConstantsDict(Mapping):
# actually empty so that we dont clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR: frozenset[str] = frozenset((
ALLOWED_IN_DATA_DIR: frozenset[str] = frozenset((
*INGORED_PATHS,
*PIP_RELATED_NAMES,
*NPM_RELATED_NAMES,
@ -212,7 +212,7 @@ class ConstantsDict(Mapping):
})
DATA_LOCATIONS = benedict({
"OUTPUT_DIR": {
"DATA_DIR": {
"path": DATA_DIR.resolve(),
"enabled": True,
"is_valid": DATABASE_FILE.exists(),

View file

@ -23,7 +23,7 @@ from signal_webhooks.utils import get_webhook_model
from archivebox.config import VERSION
from ..util import htmldecode, urldecode
from archivebox.misc.util import htmldecode, urldecode
from core.models import Snapshot, ArchiveResult, Tag
from core.mixins import SearchResultsAdminMixin

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.core'
from django import forms
from ..util import URL_REGEX
from archivebox.misc.util import URL_REGEX
from ..parsers import PARSERS
from taggit.utils import edit_string_for_tags, parse_tags

View file

@ -23,7 +23,7 @@ from abid_utils.models import ABIDModel, ABIDField, AutoDateTimeField
from queues.tasks import bg_archive_snapshot
from archivebox.misc.system import get_dir_size
from ..util import parse_date, base_url
from archivebox.misc.util import parse_date, base_url
from ..index.schema import Link
from ..index.html import snapshot_icons
from ..extractors import ARCHIVE_METHODS_INDEXING_PRECEDENCE, EXTRACTORS
@ -231,7 +231,7 @@ class Snapshot(ABIDModel):
@cached_property
def extension(self) -> str:
from ..util import extension
from archivebox.misc.util import extension
return extension(self.url)
@cached_property

View file

@ -37,7 +37,7 @@ from ..config.legacy import (
CONFIG,
)
from ..logging_util import printable_filesize
from ..util import base_url, htmlencode, ts_to_date_str
from archivebox.misc.util import base_url, htmlencode, ts_to_date_str
from ..search import query_search_index
from .serve_static import serve_static_with_byterange_support

View file

@ -20,7 +20,7 @@ from ..index import (
load_link_details,
write_link_details,
)
from ..util import enforce_types
from archivebox.misc.util import enforce_types
from ..logging_util import (
log_archiving_started,
log_archiving_paused,

View file

@ -7,7 +7,7 @@ from collections import defaultdict
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
dedupe,

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file, atomic_write
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
)

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import chmod_file, run
from ..util import (
from archivebox.misc.util import (
enforce_types,
domain,
dedupe,

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
domain,

View file

@ -6,7 +6,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput
from archivebox.misc.system import atomic_write
from ..util import (
from archivebox.misc.util import (
enforce_types,
get_headers,
dedupe,

View file

@ -13,7 +13,7 @@ from ..config.legacy import (
from ..index.schema import Link, ArchiveResult, ArchiveError
from ..logging_util import TimedProgress
from archivebox.misc.system import atomic_write
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
)

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file, dedupe
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress

View file

@ -8,7 +8,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
dedupe,

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import (
from archivebox.misc.util import (
enforce_types,
is_static_file,
)

View file

@ -8,7 +8,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, atomic_write
from ..util import enforce_types, is_static_file
from archivebox.misc.util import enforce_types, is_static_file
from ..logging_util import TimedProgress
from .title import get_html

View file

@ -5,7 +5,7 @@ from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file
from archivebox.misc.util import enforce_types, is_static_file
from ..logging_util import TimedProgress

View file

@ -7,7 +7,7 @@ import json
from ..index.schema import Link, ArchiveResult, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import enforce_types, is_static_file, dedupe
from archivebox.misc.util import enforce_types, is_static_file, dedupe
from ..logging_util import TimedProgress

View file

@ -6,7 +6,7 @@ from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..util import (
from archivebox.misc.util import (
enforce_types,
download_url,
htmldecode,

View file

@ -8,7 +8,7 @@ from datetime import datetime, timezone
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file
from ..util import (
from archivebox.misc.util import (
enforce_types,
without_fragment,
without_query,

View file

@ -13,7 +13,7 @@ from django.db.models import QuerySet, Q
from archivebox.config import DATA_DIR, CONSTANTS, SEARCH_BACKEND_CONFIG
from ..util import (
from archivebox.misc.util import (
scheme,
enforce_types,
ExtendedEncoder,

View file

@ -2,7 +2,7 @@ __package__ = 'archivebox.index'
from typing import List, Optional, Any
from ..util import enforce_types
from archivebox.misc.util import enforce_types
from .schema import Link

View file

@ -11,7 +11,7 @@ from django.core.cache import cache
from .schema import Link
from archivebox.misc.system import atomic_write
from ..logging_util import printable_filesize
from ..util import (
from archivebox.misc.util import (
enforce_types,
ts_to_date_str,
urlencode,

View file

@ -12,7 +12,7 @@ from archivebox.config import VERSION, DATA_DIR, CONSTANTS, SERVER_CONFIG, SHELL
from .schema import Link
from archivebox.misc.system import atomic_write
from ..util import enforce_types
from archivebox.misc.util import enforce_types

View file

@ -22,7 +22,7 @@ from archivebox.config.constants import ARCHIVE_DIR, ARCHIVE_DIR_NAME
from plugins_extractor.favicon.apps import FAVICON_CONFIG
from archivebox.misc.system import get_dir_size
from ..util import ts_to_date_str, parse_date
from archivebox.misc.util import ts_to_date_str, parse_date
class ArchiveError(Exception):
@ -67,7 +67,7 @@ class ArchiveResult:
@classmethod
def guess_ts(_cls, dict_info):
from ..util import parse_date
from archivebox.misc.util import parse_date
parsed_timestamp = parse_date(dict_info["timestamp"])
start_ts = parsed_timestamp
end_ts = parsed_timestamp + timedelta(seconds=int(dict_info["duration"]))
@ -75,7 +75,7 @@ class ArchiveResult:
@classmethod
def from_json(cls, json_info, guess=False):
from ..util import parse_date
from archivebox.misc.util import parse_date
info = {
key: val
@ -231,7 +231,7 @@ class Link:
@classmethod
def from_json(cls, json_info, guess=False):
from ..util import parse_date
from archivebox.misc.util import parse_date
info = {
key: val
@ -299,38 +299,38 @@ class Link:
### URL Helpers
@property
def url_hash(self):
from ..util import hashurl
from archivebox.misc.util import hashurl
return hashurl(self.url)
@property
def scheme(self) -> str:
from ..util import scheme
from archivebox.misc.util import scheme
return scheme(self.url)
@property
def extension(self) -> str:
from ..util import extension
from archivebox.misc.util import extension
return extension(self.url)
@property
def domain(self) -> str:
from ..util import domain
from archivebox.misc.util import domain
return domain(self.url)
@property
def path(self) -> str:
from ..util import path
from archivebox.misc.util import path
return path(self.url)
@property
def basename(self) -> str:
from ..util import basename
from archivebox.misc.util import basename
return basename(self.url)
@property
def base_url(self) -> str:
from ..util import base_url
from archivebox.misc.util import base_url
return base_url(self.url)
### Pretty Printing Helpers
@ -380,12 +380,12 @@ class Link:
@property
def is_static(self) -> bool:
from ..util import is_static_file
from archivebox.misc.util import is_static_file
return is_static_file(self.url)
@property
def is_archived(self) -> bool:
from ..util import domain
from archivebox.misc.util import domain
output_paths = (
domain(self.url),

View file

@ -9,7 +9,7 @@ from django.db.models import QuerySet
from django.db import transaction
from .schema import Link
from ..util import enforce_types, parse_date
from archivebox.misc.util import enforce_types, parse_date
from ..config.legacy import (
OUTPUT_DIR,
TAG_SEPARATOR_PATTERN,

View file

@ -23,8 +23,8 @@ from rich.panel import Panel
from archivebox.config import CONSTANTS, DATA_DIR, VERSION, SHELL_CONFIG
from archivebox.misc.system import get_dir_size
from .util import enforce_types
from .misc.logging import ANSI, stderr
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI, stderr
@dataclass
class RuntimeStats:

View file

@ -28,10 +28,10 @@ from .parsers import (
save_file_as_source,
parse_links_memory,
)
from .index.schema import Link
from .util import enforce_types # type: ignore
from archivebox.misc.util import enforce_types # type: ignore
from archivebox.misc.system import get_dir_size, dedupe_cron_jobs, CRON_COMMENT
from archivebox.misc.system import run as run_shell
from .index.schema import Link
from .index import (
load_main_index,
parse_links_from_source,
@ -61,14 +61,12 @@ from .index.sql import (
apply_migrations,
remove_from_sql_main_index,
)
from .index.html import (
generate_index_from_links,
)
from .index.html import generate_index_from_links
from .index.csv import links_to_csv
from .extractors import archive_links, archive_link, ignore_methods
from .misc.logging import stderr, hint
from .misc.checks import check_data_folder
from .config.legacy import (
from archivebox.misc.logging import stderr, hint
from archivebox.misc.checks import check_data_folder
from archivebox.config.legacy import (
write_config_file,
DEPENDENCIES,
load_all_config,
@ -194,7 +192,7 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}',
)
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS['DATA_DIR']['is_mount'] or CONSTANTS.DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
OUTPUT_IS_REMOTE_FS = CONSTANTS.DATA_LOCATIONS.DATA_DIR.is_mount or CONSTANTS.DATA_LOCATIONS.ARCHIVE_DIR.is_mount
print(
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
@ -221,7 +219,7 @@ def version(quiet: bool=False,
print()
print('{white}[i] New dependency versions:{reset}'.format(**SHELL_CONFIG.ANSI))
for name, binary in settings.BINARIES.items():
for name, binary in reversed(list(settings.BINARIES.items())):
err = None
try:
loaded_bin = binary.load()

View file

@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = 'archivebox.misc'
import os
@ -14,8 +14,8 @@ from subprocess import _mswindows, PIPE, Popen, CalledProcessError, CompletedPro
from crontab import CronTab
from atomicwrites import atomic_write as lib_atomic_write
from .util import enforce_types, ExtendedEncoder
from .config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
from archivebox.config.legacy import OUTPUT_PERMISSIONS, DIR_OUTPUT_PERMISSIONS, ENFORCE_ATOMIC_WRITES
from archivebox.misc.util import enforce_types, ExtendedEncoder
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):

View file

@ -317,22 +317,6 @@ def dedupe(options: List[str]) -> List[str]:
return list(deduped.values())
class AttributeDict(dict):
    """Helper to allow accessing dict values via Example.key or Example['key']"""

    def __init__(self, *args, **kwargs):
        """Accept the same arguments as ``dict`` and store them unchanged."""
        super().__init__(*args, **kwargs)
        # Nested dicts are left as plain dicts. To convert them recursively (optional):
        # for key, val in self.items():
        #     if isinstance(val, dict) and type(val) is not AttributeDict:
        #         self[key] = AttributeDict(val)

    def __getattr__(self, attr: str) -> Any:
        """Return the value stored under the key *attr*.

        Raises:
            AttributeError: if *attr* is not a key of the dict. Attribute
                lookup must fail with AttributeError (not KeyError) so that
                ``hasattr()`` and ``getattr(obj, name, default)`` work.
        """
        try:
            return dict.__getitem__(self, attr)
        except KeyError as err:
            raise AttributeError(
                f'{type(self).__name__!r} object has no attribute {attr!r}'
            ) from err

    def __setattr__(self, attr: str, value: Any) -> None:
        """Store *value* under the key *attr* (same as ``self[attr] = value``)."""
        return dict.__setitem__(self, attr, value)
class ExtendedEncoder(pyjson.JSONEncoder):
"""

View file

@ -22,7 +22,7 @@ from ..config.legacy import (
stderr,
hint,
)
from ..util import (
from archivebox.misc.util import (
basename,
htmldecode,
download_url,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable, Optional
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
find_all_urls,

View file

@ -6,7 +6,7 @@ from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
)

View file

@ -5,7 +5,7 @@ import json
from typing import IO, Iterable
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
enforce_types,
)

View file

@ -6,7 +6,7 @@ from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types
)

View file

@ -6,7 +6,7 @@ from datetime import datetime, timezone
from pathlib import Path
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
find_all_urls,

View file

@ -7,7 +7,7 @@ from datetime import datetime
from xml.etree import ElementTree
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
)

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
)

View file

@ -6,7 +6,7 @@ from time import mktime
from feedparser import parse as feedparser
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types
)

View file

@ -11,7 +11,7 @@ from pocket import Pocket
from archivebox.config import CONSTANTS
from ..index.schema import Link
from ..util import enforce_types
from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write
from ..config.legacy import (
POCKET_CONSUMER_KEY,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
)

View file

@ -11,7 +11,7 @@ from configparser import ConfigParser
from archivebox.config import CONSTANTS
from ..index.schema import Link
from ..util import enforce_types
from archivebox.misc.util import enforce_types
from archivebox.misc.system import atomic_write
from ..config.legacy import READWISE_READER_TOKENS

View file

@ -5,7 +5,7 @@ from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
str_between,

View file

@ -7,7 +7,7 @@ from typing import IO, Iterable
from datetime import datetime, timezone
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
enforce_types,
URL_REGEX,
)

View file

@ -5,7 +5,7 @@ from typing import IO, Iterable
from datetime import datetime
from ..index.schema import Link
from ..util import (
from archivebox.misc.util import (
htmldecode,
enforce_types,
str_between,

View file

@ -3,7 +3,7 @@ __package__ = 'archivebox.plugins_extractor.chrome'
import sys
import platform
from pathlib import Path
from typing import List, Optional, Dict, ClassVar
from typing import List, Optional, Dict
# Depends on other PyPI/vendor packages:
from rich import print
@ -29,7 +29,7 @@ from archivebox.config import CONSTANTS, ARCHIVING_CONFIG, SHELL_CONFIG
from plugins_pkg.puppeteer.apps import PUPPETEER_BINPROVIDER
from plugins_pkg.playwright.apps import PLAYWRIGHT_BINPROVIDER
from ...util import dedupe
from archivebox.misc.util import dedupe
CHROMIUM_BINARY_NAMES_LINUX = [

View file

@ -1,3 +1,5 @@
__package__ = 'archivebox.search'
from typing import List, Union
from pathlib import Path
@ -5,12 +7,53 @@ from django.db.models import QuerySet
from django.conf import settings
from archivebox.index.schema import Link
from archivebox.util import enforce_types
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.legacy import ANSI
# from archivebox.archivebox.config import settings.CONFIGS.SearchBackendConfig
from .utils import get_indexable_content, log_index_started
def log_index_started(url):
    """Print a notice that *url* is being indexed, followed by an empty line.

    The message is formatted with the entries of ``ANSI`` (its ``green`` and
    ``reset`` values wrap the text).
    """
    template = '{green}[*] Indexing url: {} in the search index {reset}'
    print(template.format(url, **ANSI))
    print()
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the text file that an extractor result points at.

    The path starts from ``res.output``; it is prefixed with ``res.pwd`` when
    *use_pwd* is true, and *extra_path* is appended when it is non-empty.

    Returns:
        A one-element list holding the file's text (decoded as UTF-8), or an
        empty list when the file is empty.
    """
    location = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
    if extra_path:
        location = f'{location}/{extra_path}'

    with open(location, 'r', encoding='utf-8') as handle:
        text = handle.read()

    return [text] if text else []
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first extractor result in *results*.

    Only the first result is consulted. Its ``extractor`` name selects which
    file under the result's output is read via ``get_file_result_content``.

    Returns:
        A list with the file's text, or an empty list when *results* is empty,
        the extractor is not one of the supported methods, or the file is empty.
    """
    if not results:
        return []

    # Only use the first method available. Look the row up once and reuse it
    # instead of calling .first() twice.
    res = results.first()
    if res is None:
        return []

    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    extra_path_by_method = {
        'readability': 'content.txt',
        'singlefile': '',
        'dom': '',
        'wget': '',
    }
    method = res.extractor
    if method not in extra_path_by_method:
        return []

    return get_file_result_content(res, extra_path_by_method[method], use_pwd=True)
def import_backend():

View file

@ -1,45 +0,0 @@
from django.db.models import QuerySet
from archivebox.util import enforce_types
from archivebox.config.legacy import ANSI
def log_index_started(url):
    """Print a notice that *url* is being indexed, followed by an empty line.

    The message is formatted with the entries of ``ANSI`` (its ``green`` and
    ``reset`` values wrap the text).
    """
    message = '{green}[*] Indexing url: {} in the search index {reset}'.format(
        url,
        **ANSI,
    )
    print(message)
    print()
def get_file_result_content(res, extra_path, use_pwd=False):
    """Read the text file that an extractor result points at.

    The path starts from ``res.output``; it is prefixed with ``res.pwd`` when
    *use_pwd* is true, and *extra_path* is appended when it is non-empty.

    Returns:
        A one-element list holding the file's text (decoded as UTF-8), or an
        empty list when the file is empty.
    """
    target = f'{res.pwd}/{res.output}' if use_pwd else f'{res.output}'
    if extra_path:
        target = f'{target}/{extra_path}'

    with open(target, 'r', encoding='utf-8') as src:
        contents = src.read()

    if not contents:
        return []
    return [contents]
# This should be abstracted by a plugin interface for extractors
@enforce_types
def get_indexable_content(results: QuerySet):
    """Return the indexable text for the first extractor result in *results*.

    Only the first result is consulted. Its ``extractor`` name selects which
    file under the result's output is read via ``get_file_result_content``.

    Returns:
        A list with the file's text, or an empty list when *results* is empty,
        the extractor is not one of the supported methods, or the file is empty.
    """
    if not results:
        return []

    # Only use the first method available. Look the row up once and reuse it
    # instead of calling .first() twice.
    res = results.first()
    if res is None:
        return []

    # This should come from a plugin interface
    # TODO: banish this duplication and get these from the extractor file
    extra_path_by_method = {
        'readability': 'content.txt',
        'singlefile': '',
        'dom': '',
        'wget': '',
    }
    method = res.extractor
    if method not in extra_path_by_method:
        return []

    return get_file_result_content(res, extra_path_by_method[method], use_pwd=True)