mirror of
https://github.com/ArchiveBox/ArchiveBox
synced 2024-11-10 14:44:18 +00:00
refactor: Remove if LENGTH and use text chunker for every input
This commit is contained in:
parent
5a6b814c79
commit
172197ae01
1 changed files with 4 additions and 6 deletions
|
@ -11,12 +11,10 @@ MAX_SONIC_TEXT_LENGTH = 20000
|
||||||
def index(snapshot_id: str, texts: List[str]):
|
def index(snapshot_id: str, texts: List[str]):
|
||||||
with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
|
with IngestClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as ingestcl:
|
||||||
for text in texts:
|
for text in texts:
|
||||||
if len(text) < MAX_SONIC_TEXT_LENGTH:
|
|
||||||
ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(text))
|
|
||||||
else:
|
|
||||||
chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
|
chunks = [text[i:i+MAX_SONIC_TEXT_LENGTH] for i in range(0, len(text), MAX_SONIC_TEXT_LENGTH)]
|
||||||
for chunk in chunks:
|
for chunk in chunks:
|
||||||
ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
|
ingestcl.push(SONIC_COLLECTION, SONIC_BUCKET, snapshot_id, str(chunk))
|
||||||
|
|
||||||
@enforce_types
|
@enforce_types
|
||||||
def search(text: str) -> List[str]:
|
def search(text: str) -> List[str]:
|
||||||
with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
|
with SearchClient(SEARCH_BACKEND_HOST_NAME, SEARCH_BACKEND_PORT, SEARCH_BACKEND_PASSWORD) as querycl:
|
||||||
|
|
Loading…
Reference in a new issue