From d65ce55453b196f0a86a9636a98222a3d68e4572 Mon Sep 17 00:00:00 2001
From: Matthew Woodcraft
Date: Sun, 22 May 2022 13:37:19 +0100
Subject: [PATCH] When creating the search index, omit words longer than 80 characters

This avoids creating deeply nested objects in searchindex.json
---
 src/renderer/html_handlebars/search.rs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/renderer/html_handlebars/search.rs b/src/renderer/html_handlebars/search.rs
index 0a59ffe9..b39569d4 100644
--- a/src/renderer/html_handlebars/search.rs
+++ b/src/renderer/html_handlebars/search.rs
@@ -13,6 +13,8 @@ use crate::utils;
 
 use serde::Serialize;
 
+const MAX_WORD_LENGTH_TO_INDEX: usize = 80;
+
 /// Creates all files required for search.
 pub fn create_files(search_config: &Search, destination: &Path, book: &Book) -> Result<()> {
     let mut index = Index::new(&["title", "body", "breadcrumbs"]);
@@ -44,6 +46,15 @@ pub fn create_files(search_config: &Search, destination: &Path, book: &Book) ->
     Ok(())
 }
 
+/// Tokenizes in the same way as elasticlunr-rs (for English), but also drops long tokens.
+fn tokenize(text: &str) -> Vec<String> {
+    text.split(|c: char| c.is_whitespace() || c == '-')
+        .filter(|s| !s.is_empty())
+        .map(|s| s.trim().to_lowercase())
+        .filter(|s| s.len() <= MAX_WORD_LENGTH_TO_INDEX)
+        .collect()
+}
+
 /// Uses the given arguments to construct a search document, then inserts it to the given index.
 fn add_doc(
     index: &mut Index,
@@ -62,7 +73,7 @@ fn add_doc(
     doc_urls.push(url.into());
 
     let items = items.iter().map(|&x| utils::collapse_whitespace(x.trim()));
-    index.add_doc(&doc_ref, items);
+    index.add_doc_with_tokenizer(&doc_ref, items, tokenize);
 }
 
 /// Renders markdown into flat unformatted text and adds it to the search index.