From caf4660ac86153632c76de247b6ac8579d06de31 Mon Sep 17 00:00:00 2001 From: JDC Date: Mon, 23 Nov 2020 15:51:59 -0500 Subject: [PATCH] Add indexing to update command and utilities --- archivebox/main.py | 3 ++- archivebox/search/__init__.py | 16 +++++++++++++++ archivebox/search/utils.py | 38 +++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 archivebox/search/utils.py diff --git a/archivebox/main.py b/archivebox/main.py index 73278702..bb24d124 100644 --- a/archivebox/main.py +++ b/archivebox/main.py @@ -115,7 +115,7 @@ from .logging_util import ( printable_dependency_version, ) -from .search import flush_search_index +from .search import flush_search_index, index_links ALLOWED_IN_OUTPUT_DIR = { 'lost+found', @@ -711,6 +711,7 @@ def update(resume: Optional[float]=None, if index_only: for link in all_links: write_link_details(link, out_dir=out_dir, skip_sql_index=True) + index_links(all_links, out_dir=out_dir) return all_links # Step 2: Run the archive methods for each link diff --git a/archivebox/search/__init__.py b/archivebox/search/__init__.py index fdf19a89..537fa1ff 100644 --- a/archivebox/search/__init__.py +++ b/archivebox/search/__init__.py @@ -8,6 +8,8 @@ from archivebox.index.schema import Link from archivebox.util import enforce_types from archivebox.config import setup_django,stderr, OUTPUT_DIR, USE_INDEXING_BACKEND, USE_SEARCHING_BACKEND, SEARCH_BACKEND_ENGINE +from .utils import get_indexable_content + def indexing_enabled(): return USE_INDEXING_BACKEND @@ -83,3 +85,17 @@ def flush_search_index(snapshots: QuerySet): f'[X] The search backend threw an exception={err}:', color='red', ) + +@enforce_types +def index_links(links: Union[List[Link],None], out_dir: Path=OUTPUT_DIR): + if not links: + return + + setup_django(out_dir=out_dir, check_db=True) + from core.models import Snapshot, ArchiveResult + + for link in links: + if snap := Snapshot.objects.filter(url=link.url).first(): + results = ArchiveResult.objects.indexable().filter(snapshot=snap) + texts = get_indexable_content(results) + write_search_index(link,texts,out_dir=out_dir) diff --git a/archivebox/search/utils.py b/archivebox/search/utils.py new file mode 100644 index 00000000..f2d86b2c --- /dev/null +++ b/archivebox/search/utils.py @@ -0,0 +1,38 @@ +from django.db.models import QuerySet + +from archivebox.util import enforce_types + +def get_file_result_content(res, extra_path, use_pwd=False): + if use_pwd: + fpath = f'{res.pwd}/{res.output}' + else: + fpath = f'{res.output}' + + if extra_path: + fpath = f'{fpath}/{extra_path}' + + with open(fpath, 'r') as file: + data = file.read().replace('\n', '') + if data: + return [data] + return [] + + +# This should be abstracted by a plugin interface for extractors +@enforce_types
+def get_indexable_content(results: QuerySet): + if not results: + return [] + # Only use the first method available + res, method = results.first(), results.first().extractor + if method not in ('readability', 'singlefile', 'dom', 'wget'): + return [] + # This should come from a plugin interface + if method == 'readability': + return get_file_result_content(res, 'content.txt') + elif method == 'singlefile': + return get_file_result_content(res, '') + elif method == 'dom': + return get_file_result_content(res,'',use_pwd=True) + elif method == 'wget': + return get_file_result_content(res,'',use_pwd=True)