From 16c123aa2089f4aba2825e83eb7dd7e15335b590 Mon Sep 17 00:00:00 2001 From: Tim Schumacher Date: Wed, 2 Jun 2021 09:18:39 +0200 Subject: [PATCH] Include path in the search index with include_path (#1509) --- Cargo.lock | 45 ++++++------ components/config/src/config/search.rs | 3 + components/search/src/lib.rs | 69 +++++++++++++++++-- .../getting-started/configuration.md | 2 + 4 files changed, 89 insertions(+), 30 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 9f29bca9..cec75dda 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -155,9 +155,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.6.1" +version = "3.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "63396b8a4b9de3f4fdfb320ab6080762242f66a8ef174c49d8e19b674db4cdbe" +checksum = "9c59e7af012c713f529e7a3ee57ce9b31ddd858d4b512923602f74608b009631" [[package]] name = "byte-tools" @@ -207,9 +207,9 @@ checksum = "b700ce4376041dcd0a327fd0097c41095743c4c8af8887265942faf1100bd040" [[package]] name = "cc" -version = "1.0.67" +version = "1.0.68" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e3c69b077ad434294d3ce9f1f6143a2a4b89a8a2d54ef813d85003a4fd1137fd" +checksum = "4a72c244c1ff497a746a7e1fb3d14bd08420ecda70c8f25c7112f2781652d787" dependencies = [ "jobserver", ] @@ -348,9 +348,9 @@ dependencies = [ [[package]] name = "crossbeam-epoch" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "52fb27eab85b17fbb9f6fd667089e07d6a2eb8743d02639ee7f6a7a7729c9c94" +checksum = "4ec02e091aa634e2c3ada4a392989e7c3116673ef0ac5b72232439094d73b7fd" dependencies = [ "cfg-if 1.0.0", "crossbeam-utils", @@ -361,11 +361,10 @@ dependencies = [ [[package]] name = "crossbeam-utils" -version = "0.8.4" +version = "0.8.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4feb231f0d4d6af81aed15928e58ecf5816aa62a2393e2c82f46973e92a9a278" +checksum = 
"d82cfc11ce7f2c3faef78d8a684447b40d503d9681acebed6cb728d45940c4db" dependencies = [ - "autocfg", "cfg-if 1.0.0", "lazy_static", ] @@ -462,9 +461,9 @@ checksum = "e78d4f1cc4ae33bbfc157ed5d5a5ef3bc29227303d595861deb238fcec4e9457" [[package]] name = "elasticlunr-rs" -version = "2.3.11" +version = "2.3.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "959fbc9a6ebced545cbe365fdce5e25c6ab7683f2ca4ecc9fb9d0db663bf73d5" +checksum = "2f8cf73b19a7aece6942f5745a2fc1ae3c8b0533569707d596b5d6baa7d6c600" dependencies = [ "jieba-rs", "lazy_static", @@ -922,9 +921,9 @@ checksum = "f3a87b616e37e93c22fb19bcd386f02f3af5ea98a25670ad0fce773de23c5e68" [[package]] name = "httpdate" -version = "1.0.0" +version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05842d0d43232b23ccb7060ecb0f0626922c21f30012e97b767b30afd4a5d4b9" +checksum = "6456b8a6c8f33fee7d958fcd1b60d55b11940a79e63ae87013e6d22e26034440" [[package]] name = "humansize" @@ -934,9 +933,9 @@ checksum = "02296996cb8796d7c6e3bc2d9211b7802812d36999a51bb754123ead7d37d026" [[package]] name = "hyper" -version = "0.14.7" +version = "0.14.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e5f105c494081baa3bf9e200b279e27ec1623895cd504c7dbef8d0b080fcf54" +checksum = "d3f71a7eea53a3f8257a7b4795373ff886397178cd634430ea94e12d7fe4fe34" dependencies = [ "bytes 1.0.1", "futures-channel", @@ -1177,9 +1176,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.94" +version = "0.2.95" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "18794a8ad5b29321f790b55d93dfba91e125cb1a9edbd4f8e3150acc771c1a5e" +checksum = "789da6d93f1b866ffe175afc5322a4d76c038605a1c3319bb57b06967ca98a36" [[package]] name = "library" @@ -1364,9 +1363,9 @@ checksum = "b16bd47d9e329435e309c58469fe0791c2d0d1ba96ec0954152a5ae2b04387dc" [[package]] name = "memoffset" -version = "0.6.3" +version = "0.6.4" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "f83fb6581e8ed1f85fd45c116db8405483899489e38406156c25eb743554361d" +checksum = "59accc507f1338036a0477ef61afdae33cde60840f4dfe481319ce3ad116ddf9" dependencies = [ "autocfg", ] @@ -2719,9 +2718,9 @@ checksum = "cda74da7e1a664f795bb1f8a87ec406fb89a02522cf6e50620d016add6dbbf5c" [[package]] name = "tokio" -version = "1.6.0" +version = "1.6.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd3076b5c8cc18138b8f8814895c11eb4de37114a5d127bafdc5e55798ceef37" +checksum = "0a38d31d7831c6ed7aad00aa4c12d9375fd225a6dd77da1d25b707346319a975" dependencies = [ "autocfg", "bytes 1.0.1", @@ -2898,9 +2897,9 @@ dependencies = [ [[package]] name = "unicode-normalization" -version = "0.1.17" +version = "0.1.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "07fbfce1c8a97d547e8b5334978438d9d6ec8c20e38f56d4a4374d181493eaef" +checksum = "33717dca7ac877f497014e10d73f3acf948c342bee31b5ca7892faf94ccc6b49" dependencies = [ "tinyvec", ] diff --git a/components/config/src/config/search.rs b/components/config/src/config/search.rs index 82d46c7a..71ff325d 100644 --- a/components/config/src/config/search.rs +++ b/components/config/src/config/search.rs @@ -13,6 +13,8 @@ pub struct Search { /// Includes the description in the search index. When the site becomes too large, you can switch /// to that instead. `false` by default pub include_description: bool, + /// Include the path of the page in the search index. `false` by default. 
+ pub include_path: bool, } impl Default for Search { @@ -21,6 +23,7 @@ impl Default for Search { include_title: true, include_content: true, include_description: false, + include_path: false, truncate_content_length: None, } } diff --git a/components/search/src/lib.rs b/components/search/src/lib.rs index 4daaa761..a91f9587 100644 --- a/components/search/src/lib.rs +++ b/components/search/src/lib.rs @@ -1,6 +1,8 @@ use std::collections::{HashMap, HashSet}; use elasticlunr::{Index, Language}; +use elasticlunr::pipeline; +use elasticlunr::pipeline::TokenizerFn; use lazy_static::lazy_static; use config::{Config, Search}; @@ -36,6 +38,10 @@ fn build_fields(search_config: &Search) -> Vec<String> { fields.push("description".to_owned()); } + if search_config.include_path { + fields.push("path".to_owned()); + } + if search_config.include_content { fields.push("body".to_owned()); } @@ -43,10 +49,46 @@ fn build_fields(search_config: &Search) -> Vec<String> { fields } +fn path_tokenizer(text: &str) -> Vec<String> { + text.split(|c: char| c.is_whitespace() || c == '-' || c == '/') + .filter(|s| !s.is_empty()) + .map(|s| s.trim().to_lowercase()) + .collect() +} + +fn build_tokenizers(search_config: &Search, language: Language) -> Vec<TokenizerFn> { + let text_tokenizer = match language { + #[cfg(feature = "indexing-zh")] + Language::Chinese => pipeline::tokenize_chinese, + #[cfg(feature = "indexing-ja")] + Language::Japanese => pipeline::tokenize_japanese, + _ => pipeline::tokenize, + }; + let mut tokenizers: Vec<TokenizerFn> = vec![]; + if search_config.include_title { + tokenizers.push(text_tokenizer); + } + + if search_config.include_description { + tokenizers.push(text_tokenizer); + } + + if search_config.include_path { + tokenizers.push(path_tokenizer); + } + + if search_config.include_content { + tokenizers.push(text_tokenizer); + } + + tokenizers +} + fn fill_index( search_config: &Search, title: &Option<String>, description: &Option<String>, + path: &str, content: &str, ) -> Vec<String> { let mut row = vec![]; @@ -59,6 +101,10 @@ fn
fill_index( row.push(description.clone().unwrap_or_default()); } + if search_config.include_path { + row.push(path.to_string()); + } + if search_config.include_content { let body = AMMONIA.clean(&content).to_string(); if let Some(truncate_len) = search_config.truncate_content_length { @@ -90,9 +136,11 @@ pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result, ) { if !section.meta.in_search_index { return; @@ -111,14 +160,16 @@ fn add_section_to_index( // Don't index redirecting sections if section.meta.redirect_to.is_none() { - index.add_doc( + index.add_doc_with_tokenizers( &section.permalink, &fill_index( search_config, &section.meta.title, &section.meta.description, + &section.path, &section.content, ), + tokenizers.clone(), ); } @@ -128,9 +179,10 @@ fn add_section_to_index( continue; } - index.add_doc( + index.add_doc_with_tokenizers( &page.permalink, - &fill_index(search_config, &page.meta.title, &page.meta.description, &page.content), + &fill_index(search_config, &page.meta.title, &page.meta.description, &page.path, &page.content), + tokenizers.clone(), ); } } @@ -166,9 +218,10 @@ mod tests { let config = Config::default(); let title = Some("A title".to_string()); let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); let content = "Some content".to_string(); - let res = fill_index(&config.search, &title, &description, &content); + let res = fill_index(&config.search, &title, &description, &path, &content); assert_eq!(res.len(), 2); assert_eq!(res[0], title.unwrap()); assert_eq!(res[1], content); @@ -180,9 +233,10 @@ mod tests { config.search.include_description = true; let title = Some("A title".to_string()); let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); let content = "Some content".to_string(); - let res = fill_index(&config.search, &title, &description, &content); + let res = fill_index(&config.search, &title, &description, &path, &content); assert_eq!(res.len(), 3);
assert_eq!(res[0], title.unwrap()); assert_eq!(res[1], description.unwrap()); @@ -195,9 +249,10 @@ mod tests { config.search.truncate_content_length = Some(5); let title = Some("A title".to_string()); let description = Some("A description".to_string()); + let path = "/a/page/".to_string(); let content = "Some content".to_string(); - let res = fill_index(&config.search, &title, &description, &content); + let res = fill_index(&config.search, &title, &description, &path, &content); assert_eq!(res.len(), 2); assert_eq!(res[0], title.unwrap()); assert_eq!(res[1], content[..5]); diff --git a/docs/content/documentation/getting-started/configuration.md b/docs/content/documentation/getting-started/configuration.md index 8673dc54..7d4a4332 100644 --- a/docs/content/documentation/getting-started/configuration.md +++ b/docs/content/documentation/getting-started/configuration.md @@ -151,6 +151,8 @@ build_search_index = false include_title = true # Whether to include the description of the page/section in the index include_description = false +# Whether to include the path of the page/section in the index +include_path = false # Whether to include the rendered content of the page/section in the index include_content = true # At which character to truncate the content to. Useful if you have a lot of pages and the index would