mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 06:34:16 +00:00
Fix the readability extractor's indexing process and enforce a maximum total character length on indexed content
This commit is contained in:
parent
b3a89172ab
commit
f67a5a215a
2 changed files with 12 additions and 6 deletions
|
@ -5,13 +5,19 @@ from sonic import IngestClient, SearchClient
|
||||||
from archivebox.util import enforce_types
|
from archivebox.util import enforce_types
|
||||||
from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
|
from archivebox.config import SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD, SONIC_BUCKET, SONIC_COLLECTION
|
||||||
|
|
||||||
# Limits applied when pushing extracted page text into the Sonic search index.
# Each text is truncated to TOTAL_LENGTH, then split into CHUNK_LENGTH pieces
# because Sonic rejects overly long push payloads.
MAX_SONIC_TEXT_TOTAL_LENGTH = 100000000  # don't index more than 100 million characters per text
MAX_SONIC_TEXT_CHUNK_LENGTH = 2000       # don't index more than 2000 characters per chunk
@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Push each text in *texts* into the Sonic search index under *snapshot_id*.

    Each text is capped at MAX_SONIC_TEXT_TOTAL_LENGTH characters and split into
    MAX_SONIC_TEXT_CHUNK_LENGTH-sized chunks, since Sonic limits the size of a
    single push payload.
    """
    with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
        for text in texts:
            # Lazily slice the (length-capped) text into Sonic-sized chunks;
            # a generator avoids materializing up to 100M chars of slices at once.
            chunks = (
                text[i:i + MAX_SONIC_TEXT_CHUNK_LENGTH]
                for i in range(0, min(len(text), MAX_SONIC_TEXT_TOTAL_LENGTH), MAX_SONIC_TEXT_CHUNK_LENGTH)
            )
            for chunk in chunks:
                ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
|
|
|
@ -36,10 +36,10 @@ def get_indexable_content(results: QuerySet):
|
||||||
|
|
||||||
# TODO: banish this duplication and get these from the extractor file
|
# TODO: banish this duplication and get these from the extractor file
|
||||||
if method == 'readability':
|
if method == 'readability':
|
||||||
return get_file_result_content(res, 'content.txt')
|
return get_file_result_content(res, 'content.txt', use_pwd=True)
|
||||||
elif method == 'singlefile':
|
elif method == 'singlefile':
|
||||||
return get_file_result_content(res,'',use_pwd=True)
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
elif method == 'dom':
|
elif method == 'dom':
|
||||||
return get_file_result_content(res,'',use_pwd=True)
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
elif method == 'wget':
|
elif method == 'wget':
|
||||||
return get_file_result_content(res,'',use_pwd=True)
|
return get_file_result_content(res, '', use_pwd=True)
|
||||||
|
|
Loading…
Reference in a new issue