Add support for Fuse.js search format (#2507)

* initial "just barely works" Fuse.js support

* implement FuseJavascript; refactor index_for_lang

* support search config

* move fuse index building to its own file

* update doc of Search.index_format

* update config docs

* update search documentation

* use &str where possible

* use libs::serde_json

remember to commit Cargo.lock

* move extension logic to IndexFormat

* move the entire filename logic inside IndexFormat

* move elasticlunr to its own module

* only create elasticlunr.min.js if we're actually using elasticlunr

* move ELASTICLUNR_JS to elasticlunr.js

* hide the details of search's submodules

* optionally include path

* explain include_path better

* remove references to stork

* replace if with match

* support include_description

* specify "permalink"

* move body cleaning and truncation to a function

* update truncate_content_length docs to specify *code points*
This commit is contained in:
Connor K 2024-05-31 06:29:26 -04:00 committed by Vincent Prouillet
parent 0d0036e14a
commit 26f6677bfb
10 changed files with 496 additions and 348 deletions

161
Cargo.lock generated
View file

@ -150,9 +150,9 @@ checksum = "70033777eb8b5124a81a1889416543dddef2de240019b674c81285a2635a7e1e"
[[package]]
name = "anyhow"
version = "1.0.83"
version = "1.0.86"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25bdb32cbbdce2b519a9cd7df3a678443100e265d5e25ca763b7572a5104f5f3"
checksum = "b3d1d046238990b9cf5bcde22a3fb3584ee5cf65fb2765f454ed428c7a0063da"
[[package]]
name = "arbitrary"
@ -168,7 +168,7 @@ checksum = "0ae92a5119aa49cdbcf6b9f893fe4e1d98b04ccbf82ee0584ad948a44a734dea"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -402,9 +402,9 @@ checksum = "5ce89b21cab1437276d2650d57e971f9d548a2d9037cc231abdc0562b97498ce"
[[package]]
name = "bytemuck"
version = "1.15.0"
version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5d6d68c57235a3a081186990eca2867354726650f42f7516ca50c28d6281fd15"
checksum = "78834c15cb5d5efe3452d58b1e8ba890dd62d21907f867f383358198e56ebca5"
[[package]]
name = "byteorder"
@ -436,9 +436,9 @@ checksum = "514de17de45fdb8dc022b1a7975556c53c86f9f0aa5f534b98977b171857c2c9"
[[package]]
name = "camino"
version = "1.1.6"
version = "1.1.7"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c59e92b5a388f549b863a7bea62612c09f24c8393560709a54558a9abdfb3b9c"
checksum = "e0ec6b951b160caa93cc0c7b209e5a3bff7aae9062213451ac99493cd844c239"
dependencies = [
"serde",
]
@ -467,9 +467,9 @@ dependencies = [
[[package]]
name = "cc"
version = "1.0.97"
version = "1.0.98"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "099a5357d84c4c61eb35fc8eafa9a79a902c2f76911e5747ced4e032edd8d9b4"
checksum = "41c270e7540d725e65ac7f1b212ac8ce349719624d7bcff99f8e2e488e8cf03f"
dependencies = [
"jobserver",
"libc",
@ -618,7 +618,7 @@ dependencies = [
"heck 0.5.0",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -765,9 +765,9 @@ dependencies = [
[[package]]
name = "crc32fast"
version = "1.4.0"
version = "1.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b3855a8a784b474f333699ef2bbca9db2c4a1f6d9088a90a2d25b1eb53111eaa"
checksum = "a97769d94ddab943e4510d138150169a2758b5ef3eb191a9ee688de3e23ef7b3"
dependencies = [
"cfg-if 1.0.0",
]
@ -802,9 +802,9 @@ dependencies = [
[[package]]
name = "crossbeam-utils"
version = "0.8.19"
version = "0.8.20"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "248e3bacc7dc6baa3b21e405ee045c3047101a49145e7e9eca583ab4c2ca5345"
checksum = "22ec99545bb0ed0ea7bb9b8e1e9122ea386ff8a48c0922e43f36d45ab09e0e80"
[[package]]
name = "crunchy"
@ -851,7 +851,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
dependencies = [
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -924,9 +924,9 @@ dependencies = [
[[package]]
name = "deunicode"
version = "1.4.4"
version = "1.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "322ef0094744e63628e6f0eb2295517f79276a5b342a4c2ff3042566ca181d4e"
checksum = "339544cc9e2c4dc3fc7149fd630c5f22263a4fdf18a98afd0075784968b5cf00"
[[package]]
name = "digest"
@ -955,7 +955,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -981,9 +981,9 @@ dependencies = [
[[package]]
name = "either"
version = "1.11.0"
version = "1.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a47c1c47d2f5964e29c61246e81db715514cd532db6b5116a25ea3c03d6780a2"
checksum = "3dca9240753cf90908d7e4aac30f630662b02aebaa1b58a3cadabdb23385b58b"
[[package]]
name = "elasticlunr-rs"
@ -1367,8 +1367,10 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c4567c8db10ae91089c99af84c68c38da3ec2f087c3f82960bcdbf3656b6f4d7"
dependencies = [
"cfg-if 1.0.0",
"js-sys",
"libc",
"wasi 0.11.0+wasi-snapshot-preview1",
"wasm-bindgen",
]
[[package]]
@ -1429,18 +1431,19 @@ dependencies = [
[[package]]
name = "grass"
version = "0.13.2"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b89786a806d5b192cf4e573f9831c847a455a142d000c922bdfc1e5edad14303"
checksum = "a46def7216d331efa51a6aa796ef777bfdfe9605378382827a553344b7e5eefc"
dependencies = [
"getrandom 0.2.15",
"grass_compiler",
]
[[package]]
name = "grass_compiler"
version = "0.13.2"
version = "0.13.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cf7d155dd7cef20195016d01005033a5521aad307033f0f8e8bf0a02f5f7554"
checksum = "f39216c1843182f78541276fec96f88406861f16aa19cc9f8add70f8e67b7577"
dependencies = [
"codemap",
"indexmap 2.2.6",
@ -1536,7 +1539,7 @@ dependencies = [
"markup5ever",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -1782,9 +1785,9 @@ dependencies = [
[[package]]
name = "insta"
version = "1.38.0"
version = "1.39.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3eab73f58e59ca6526037208f0e98851159ec1633cf17b6cd2e1f2c3fd5d53cc"
checksum = "810ae6042d48e2c9e9215043563a58a80b877bc863228a74cf10c49d4620a6f5"
dependencies = [
"console 0.15.8",
"lazy_static",
@ -1800,7 +1803,7 @@ checksum = "c34819042dc3d3971c46c2190835914dfbe0c3c13f61449b2997f4e9722dfa60"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -1983,9 +1986,9 @@ dependencies = [
[[package]]
name = "libc"
version = "0.2.154"
version = "0.2.155"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae743338b92ff9146ce83992f766a31066a91a8c84a45e0e9f21e7cf6de6d346"
checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
[[package]]
name = "libfuzzer-sys"
@ -2042,7 +2045,7 @@ dependencies = [
"tera",
"termcolor",
"time",
"toml 0.8.12",
"toml 0.8.13",
"unic-langid",
"unicode-segmentation",
"url",
@ -2062,9 +2065,9 @@ dependencies = [
[[package]]
name = "lightningcss"
version = "1.0.0-alpha.55"
version = "1.0.0-alpha.56"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3bd5bed3814fb631bfc1e24c2be6f7e86a9837c660909acab79a38374dcb8798"
checksum = "668e9f1774a4dda9e2233ad0f78c6987878bcf4201d2085bc3517a7f84d0ee92"
dependencies = [
"ahash 0.8.11",
"bitflags 2.5.0",
@ -2074,6 +2077,7 @@ dependencies = [
"dashmap",
"data-encoding",
"getrandom 0.2.15",
"indexmap 2.2.6",
"itertools 0.10.5",
"lazy_static",
"parcel_selectors",
@ -2267,9 +2271,9 @@ checksum = "0717cef1bc8b636c6e1c1bbdefc09e6322da8a9321966e8928ef80d20f7f770f"
[[package]]
name = "linux-raw-sys"
version = "0.4.13"
version = "0.4.14"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c"
checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89"
[[package]]
name = "lock_api"
@ -2443,9 +2447,9 @@ checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
[[package]]
name = "miniz_oxide"
version = "0.7.2"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "9d811f3e15f28568be3407c8e7fdb6514c1cda3cb30683f15b6a1a1dc4ea14a7"
checksum = "87dfd01fe195c66b572b37921ad8803d010623c0aca821bea2302239d155cdae"
dependencies = [
"adler",
"simd-adler32",
@ -2694,7 +2698,7 @@ checksum = "ed3955f1a9c7c0c15e092f9c887db08b1fc683305fdf6eb6684f22555355e202"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -2790,9 +2794,9 @@ checksum = "2839e79665f131bdb5782e51f2c6c9599c133c6098982a54c794358bf432529c"
[[package]]
name = "open"
version = "5.1.2"
version = "5.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "449f0ff855d85ddbf1edd5b646d65249ead3f5e422aaa86b7d2d0b049b103e32"
checksum = "2eb49fbd5616580e9974662cb96a3463da4476e649a7e4b258df0de065db0657"
dependencies = [
"is-wsl",
"libc",
@ -2822,7 +2826,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -2857,9 +2861,9 @@ checksum = "7f222829ae9293e33a9f5e9f440c6760a3d450a64affe1846486b140db81c1f4"
[[package]]
name = "parcel_selectors"
version = "0.26.4"
version = "0.26.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "05d74befe2d076330d9a58bf9ca2da424568724ab278adf15fb5718253133887"
checksum = "ce9c47a67c66fee4a5a42756f9784d92941bd0ab2b653539a9e90521a44b66f0"
dependencies = [
"bitflags 2.5.0",
"cssparser",
@ -2985,7 +2989,7 @@ dependencies = [
"pest_meta",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -3068,7 +3072,7 @@ dependencies = [
"phf_shared 0.11.2",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -3178,9 +3182,9 @@ dependencies = [
[[package]]
name = "proc-macro2"
version = "1.0.82"
version = "1.0.83"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8ad3d49ab951a01fbaafe34f2ec74122942fe18a3f9814c3268f1bb72042131b"
checksum = "0b33eb56c327dec362a9e55b3ad14f9d2f0904fb5a5b03b513ab5465399e9f43"
dependencies = [
"unicode-ident",
]
@ -3201,7 +3205,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8021cf59c8ec9c432cfc2526ac6b8aa508ecaf29cd415f271b8406c1b851c3fd"
dependencies = [
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -3749,6 +3753,7 @@ dependencies = [
"content",
"errors",
"libs",
"serde",
]
[[package]]
@ -3785,22 +3790,22 @@ dependencies = [
[[package]]
name = "serde"
version = "1.0.201"
version = "1.0.202"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "780f1cebed1629e4753a1a38a3c72d30b97ec044f0aef68cb26650a3c5cf363c"
checksum = "226b61a0d411b2ba5ff6d7f73a476ac4f8bb900373459cd00fab8512828ba395"
dependencies = [
"serde_derive",
]
[[package]]
name = "serde_derive"
version = "1.0.201"
version = "1.0.202"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c5e405930b9796f1c00bee880d03fc7e0bb4b9a11afc776885ffe84320da2865"
checksum = "6048858004bcff69094cd972ed40a32500f153bd3be9f716b2eed2e8217c4838"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -3817,9 +3822,9 @@ dependencies = [
[[package]]
name = "serde_spanned"
version = "0.6.5"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eb3622f419d1296904700073ea6cc23ad690adbd66f13ea683df73298736f0c1"
checksum = "79e674e01f999af37c49f70a6ede167a8a60b2503e56c5599532a65baa5969a0"
dependencies = [
"serde",
]
@ -4056,9 +4061,9 @@ dependencies = [
[[package]]
name = "syn"
version = "2.0.61"
version = "2.0.65"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c993ed8ccba56ae856363b1845da7266a7cb78e1d146c8a32d54b45a8b831fc9"
checksum = "d2863d96a84c6439701d7a38f9de935ec562c8832cc55d1dde0f513b52fad106"
dependencies = [
"proc-macro2",
"quote",
@ -4123,7 +4128,7 @@ dependencies = [
"cfg-expr",
"heck 0.5.0",
"pkg-config",
"toml 0.8.12",
"toml 0.8.13",
"version-compare",
]
@ -4237,7 +4242,7 @@ dependencies = [
"cfg-if 1.0.0",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -4248,7 +4253,7 @@ checksum = "5c89e72a01ed4c579669add59014b9a524d609c0c88c6a585ce37485879f6ffb"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
"test-case-core",
]
@ -4260,22 +4265,22 @@ checksum = "23d434d3f8967a09480fb04132ebe0a3e088c173e6d0ee7897abbdf4eab0f8b9"
[[package]]
name = "thiserror"
version = "1.0.60"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "579e9083ca58dd9dcf91a9923bb9054071b9ebbd800b342194c9feb0ee89fc18"
checksum = "c546c80d6be4bc6a00c0f01730c08df82eaa7a7a61f11d656526506112cc1709"
dependencies = [
"thiserror-impl",
]
[[package]]
name = "thiserror-impl"
version = "1.0.60"
version = "1.0.61"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e2470041c06ec3ac1ab38d0356a6119054dedaea53e12fbefc0de730a1c08524"
checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]
@ -4405,9 +4410,9 @@ dependencies = [
[[package]]
name = "toml"
version = "0.8.12"
version = "0.8.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e9dd1545e8208b4a5af1aa9bbd0b4cf7e9ea08fabc5d0a5c67fcaafa17433aa3"
checksum = "a4e43f8cc456c9704c851ae29c67e17ef65d2c30017c17a9765b89c382dc8bba"
dependencies = [
"serde",
"serde_spanned",
@ -4417,18 +4422,18 @@ dependencies = [
[[package]]
name = "toml_datetime"
version = "0.6.5"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3550f4e9685620ac18a50ed434eb3aec30db8ba93b0287467bca5826ea25baf1"
checksum = "4badfd56924ae69bcc9039335b2e017639ce3f9b001c393c1b2d1ef846ce2cbf"
dependencies = [
"serde",
]
[[package]]
name = "toml_edit"
version = "0.22.12"
version = "0.22.13"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d3328d4f68a705b2a4498da1d580585d39a6510f98318a2cec3018a7ec61ddef"
checksum = "c127785850e8c20836d49732ae6abfa47616e60bf9d9f57c43c250361a9db96c"
dependencies = [
"indexmap 2.2.6",
"serde",
@ -4503,18 +4508,18 @@ checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
[[package]]
name = "unic-langid"
version = "0.9.4"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "238722e6d794ed130f91f4ea33e01fcff4f188d92337a21297892521c72df516"
checksum = "23dd9d1e72a73b25e07123a80776aae3e7b0ec461ef94f9151eed6ec88005a44"
dependencies = [
"unic-langid-impl",
]
[[package]]
name = "unic-langid-impl"
version = "0.9.4"
version = "0.9.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4bd55a2063fdea4ef1f8633243a7b0524cbeef1905ae04c31a1c9b9775c55bc6"
checksum = "0a5422c1f65949306c99240b81de9f3f15929f5a8bfe05bb44b034cc8bf593e5"
dependencies = [
"tinystr",
]
@ -4722,7 +4727,7 @@ dependencies = [
"once_cell",
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
"wasm-bindgen-shared",
]
@ -4756,7 +4761,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
"wasm-bindgen-backend",
"wasm-bindgen-shared",
]
@ -5104,7 +5109,7 @@ checksum = "15e934569e47891f7d9411f1a451d947a60e000ab3bd24fbb970f000387d1b3b"
dependencies = [
"proc-macro2",
"quote",
"syn 2.0.61",
"syn 2.0.65",
]
[[package]]

View file

@ -7,6 +7,23 @@ pub enum IndexFormat {
ElasticlunrJson,
#[default]
ElasticlunrJavascript,
FuseJson,
FuseJavascript,
}
impl IndexFormat {
/// file extension which ought to be used for this index format.
fn extension(&self) -> &'static str {
match *self {
IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js",
IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json",
}
}
/// the filename which ought to be used for this format and language `lang`
pub fn filename(&self, lang: &str) -> String {
format!("search_index.{}.{}", lang, self.extension())
}
}
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@ -17,7 +34,7 @@ pub struct Search {
/// Includes the whole content in the search index. Ok for small sites but becomes
/// too big on large sites. `true` by default.
pub include_content: bool,
/// Optionally truncate the content down to `n` chars. This might cut content in a word
/// Optionally truncate the content down to `n` code points. This might cut content in a word
pub truncate_content_length: Option<usize>,
/// Includes the description in the search index. When the site becomes too large, you can switch
/// to that instead. `false` by default
@ -26,7 +43,7 @@ pub struct Search {
pub include_date: bool,
/// Include the path of the page in the search index. `false` by default.
pub include_path: bool,
/// Foramt of the search index to be produced. Javascript by default
/// Format of the search index to be produced. 'elasticlunr_javascript' by default.
pub index_format: IndexFormat,
}

View file

@ -8,3 +8,4 @@ errors = { path = "../errors" }
content = { path = "../content" }
config = { path = "../config" }
libs = { path = "../libs" }
serde = { version = "1.0", features = ["derive"] }

View file

@ -0,0 +1,236 @@
use config::{Config, Search};
use content::{Library, Section};
use errors::{bail, Result};
use libs::elasticlunr::{lang, Index, IndexBuilder};
use libs::time::format_description::well_known::Rfc3339;
use libs::time::OffsetDateTime;
use crate::clean_and_truncate_body;
/// Contents of the `elasticlunr.min.js` file that sits next to this source file,
/// embedded into the binary at compile time by `include_str!`.
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
/// Registers on `index` one field for every `include_*` option enabled in `search_config`.
///
/// Fields are added in a fixed order: `title`, `description`, `date`, `path`, `body`.
/// The `path` field is registered with `path_tokenizer` as its tokenizer.
fn build_fields(search_config: &Search, index: IndexBuilder) -> IndexBuilder {
    let index = if search_config.include_title { index.add_field("title") } else { index };
    let index =
        if search_config.include_description { index.add_field("description") } else { index };
    let index = if search_config.include_date { index.add_field("date") } else { index };
    let index = if search_config.include_path {
        index.add_field_with_tokenizer("path", Box::new(path_tokenizer))
    } else {
        index
    };

    if search_config.include_content {
        index.add_field("body")
    } else {
        index
    }
}
/// Splits a path into lowercase tokens.
///
/// Whitespace, `-` and `/` all act as separators; empty pieces (for example the ones
/// produced by leading, trailing or repeated separators) are discarded.
fn path_tokenizer(text: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for piece in text.split(|c: char| c == '/' || c == '-' || c.is_whitespace()) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}
/// Builds the row of values for a single document, in the same order in which
/// `build_fields` registers the fields: title, description, date, path, body.
///
/// Only the fields enabled in `search_config` are emitted, and every enabled field
/// always contributes exactly one value so that the row stays aligned with the
/// registered fields (the index matches values to fields by position). A missing
/// title, description or date is therefore represented by an empty string.
///
/// * `title` / `description` - optional front-matter values of the document.
/// * `datetime` - optional date of the document, formatted as RFC 3339 when present.
/// * `path` - path of the document, pushed as-is.
/// * `content` - raw content; cleaned and truncated through `clean_and_truncate_body`
///   using `search_config.truncate_content_length`.
fn fill_index(
    search_config: &Search,
    title: &Option<String>,
    description: &Option<String>,
    datetime: &Option<OffsetDateTime>,
    path: &str,
    content: &str,
) -> Vec<String> {
    let mut row = vec![];

    if search_config.include_title {
        row.push(title.clone().unwrap_or_default());
    }

    if search_config.include_description {
        row.push(description.clone().unwrap_or_default());
    }

    if search_config.include_date {
        // Always push a value, even without a (formattable) date: skipping it would
        // shift the path/body values into the wrong columns for undated documents
        // such as sections.
        let formatted = datetime.as_ref().and_then(|date| date.format(&Rfc3339).ok());
        row.push(formatted.unwrap_or_default());
    }

    if search_config.include_path {
        row.push(path.to_string());
    }

    if search_config.include_content {
        row.push(clean_and_truncate_body(search_config.truncate_content_length, content));
    }

    row
}
/// Builds the Elasticlunr index for `lang` and returns it serialized as JSON.
///
/// Every section of `library` whose language is `lang` is handed to
/// `add_section_to_index`, using the search settings of that language from `config`.
///
/// Errors if Elasticlunr has no support for the language code `lang`.
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
    let language = if let Some(found) = lang::from_code(lang) {
        found
    } else {
        bail!("Tried to build search index for language {} which is not supported", lang);
    };

    let search_config = &config.languages[lang].search;
    let builder = build_fields(search_config, IndexBuilder::with_language(language));
    let mut index = builder.build();

    for (_, section) in &library.sections {
        if section.lang != lang {
            continue;
        }
        add_section_to_index(&mut index, section, library, search_config);
    }

    Ok(index.to_json())
}
/// Adds `section` and its pages to `index`.
///
/// Nothing is added when the section opts out through `in_search_index`. Otherwise the
/// section itself is added unless it redirects elsewhere, and each of its pages that has
/// `in_search_index` set is added as well. Documents are keyed by their permalink.
fn add_section_to_index(
    index: &mut Index,
    section: &Section,
    library: &Library,
    search_config: &Search,
) {
    if section.meta.in_search_index {
        // A redirecting section has no document of its own; its pages are still indexed.
        if section.meta.redirect_to.is_none() {
            let row = fill_index(
                search_config,
                &section.meta.title,
                &section.meta.description,
                &None,
                &section.path,
                &section.content,
            );
            index.add_doc(&section.permalink, &row);
        }

        let indexed_pages = section
            .pages
            .iter()
            .map(|key| &library.pages[key])
            .filter(|page| page.meta.in_search_index);

        for page in indexed_pages {
            let row = fill_index(
                search_config,
                &page.meta.title,
                &page.meta.description,
                &page.meta.datetime,
                &page.path,
                &page.content,
            );
            index.add_doc(&page.permalink, &row);
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;
    use config::Config;
    use libs::elasticlunr::IndexBuilder;

    /// Sample document shared by the `fill_index` tests:
    /// `(title, description, path, content)`.
    fn sample_doc() -> (Option<String>, Option<String>, String, String) {
        (
            Some("A title".to_string()),
            Some("A description".to_string()),
            "/a/page/".to_string(),
            "Some content".to_string(),
        )
    }

    #[test]
    fn can_build_fields() {
        let mut config = Config::default();

        // Default configuration: title and body only.
        let index = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(index.get_fields(), vec!["title", "body"]);

        // Description instead of content.
        config.search.include_content = false;
        config.search.include_description = true;
        let index = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(index.get_fields(), vec!["title", "description"]);

        // Description and content together.
        config.search.include_content = true;
        let index = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(index.get_fields(), vec!["title", "description", "body"]);

        // Title disabled.
        config.search.include_title = false;
        let index = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(index.get_fields(), vec!["description", "body"]);
    }

    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let (title, description, path, content) = sample_doc();

        let res = fill_index(&config.search, &title, &description, &None, &path, &content);

        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content);
    }

    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let (title, description, path, content) = sample_doc();

        let res = fill_index(&config.search, &title, &description, &None, &path, &content);

        assert_eq!(res.len(), 3);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], description.unwrap());
        assert_eq!(res[2], content);
    }

    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let (title, description, path, content) = sample_doc();

        let res = fill_index(&config.search, &title, &description, &None, &path, &content);

        assert_eq!(res.len(), 2);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], content[..5]);
    }

    #[test]
    fn can_fill_index_date() {
        let mut config = Config::default();
        config.search.include_date = true;
        let (title, description, path, content) = sample_doc();
        let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap());

        let res = fill_index(&config.search, &title, &description, &datetime, &path, &content);

        assert_eq!(res.len(), 3);
        assert_eq!(res[0], title.unwrap());
        assert_eq!(res[1], "2023-01-31T00:00:00Z");
        assert_eq!(res[2], content);
    }
}

View file

@ -0,0 +1,76 @@
use config::Search;
use content::Library;
use errors::Result;
use libs::serde_json;
use crate::clean_and_truncate_body;
/// Builds the search index for language `lang` in the format consumed by Fuse.js:
/// a JSON array holding one object per indexed section or page.
///
/// Each object always carries the `url` (permalink) of the document. `title`,
/// `description`, `body` and `path` are filled in only when the matching `include_*`
/// option of `config` is enabled and are serialized as `null` otherwise; a missing
/// title or description is emitted as an empty string.
///
/// A section is skipped together with its pages when it is in another language or has
/// `in_search_index` disabled. A redirecting section gets no entry of its own, but its
/// pages are still indexed, matching the Elasticlunr index builder.
///
/// Errors if the items cannot be serialized to JSON.
pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<String> {
    #[derive(serde::Serialize)]
    struct Item<'a> {
        url: &'a str,
        title: Option<&'a str>,
        description: Option<&'a str>,
        body: Option<String>, // AMMONIA.clean has to allocate anyway
        path: Option<&'a str>,
    }

    /// Builds the entry of one section or page, honouring the `include_*` options.
    fn item<'a>(
        config: &Search,
        url: &'a str,
        title: &'a Option<String>,
        description: &'a Option<String>,
        content: &str,
        path: &'a str,
    ) -> Item<'a> {
        Item {
            url,
            title: config.include_title.then(|| title.as_deref().unwrap_or_default()),
            description: config
                .include_description
                .then(|| description.as_deref().unwrap_or_default()),
            body: config
                .include_content
                .then(|| clean_and_truncate_body(config.truncate_content_length, content)),
            path: config.include_path.then(|| path),
        }
    }

    let mut items: Vec<Item> = Vec::new();
    for (_, section) in &library.sections {
        if section.lang != lang || !section.meta.in_search_index {
            continue;
        }

        // Don't index redirecting sections themselves, only their pages.
        if section.meta.redirect_to.is_none() {
            items.push(item(
                config,
                &section.permalink,
                &section.meta.title,
                &section.meta.description,
                &section.content,
                &section.path,
            ));
        }

        for key in &section.pages {
            let page = &library.pages[key];
            if page.meta.in_search_index {
                items.push(item(
                    config,
                    &page.permalink,
                    &page.meta.title,
                    &page.meta.description,
                    &page.content,
                    &page.path,
                ));
            }
        }
    }

    Ok(serde_json::to_string(&items)?)
}

View file

@ -1,16 +1,12 @@
use std::collections::{HashMap, HashSet};
mod elasticlunr;
mod fuse;
use libs::ammonia;
use libs::elasticlunr::{lang, Index, IndexBuilder};
use libs::once_cell::sync::Lazy;
use libs::time::format_description::well_known::Rfc3339;
use libs::time::OffsetDateTime;
use std::collections::{HashMap, HashSet};
use config::{Config, Search};
use content::{Library, Section};
use errors::{bail, Result};
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
pub use elasticlunr::{build_index as build_elasticlunr, ELASTICLUNR_JS};
pub use fuse::build_index as build_fuse;
static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| {
let mut clean_content = HashSet::new();
@ -28,238 +24,25 @@ static AMMONIA: Lazy<ammonia::Builder<'static>> = Lazy::new(|| {
builder
});
fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder {
if search_config.include_title {
index = index.add_field("title");
}
if search_config.include_description {
index = index.add_field("description");
}
if search_config.include_date {
index = index.add_field("date")
}
if search_config.include_path {
index = index.add_field_with_tokenizer("path", Box::new(path_tokenizer));
}
if search_config.include_content {
index = index.add_field("body")
}
index
}
fn path_tokenizer(text: &str) -> Vec<String> {
text.split(|c: char| c.is_whitespace() || c == '-' || c == '/')
.filter(|s| !s.is_empty())
.map(|s| s.trim().to_lowercase())
.collect()
}
fn fill_index(
search_config: &Search,
title: &Option<String>,
description: &Option<String>,
datetime: &Option<OffsetDateTime>,
path: &str,
content: &str,
) -> Vec<String> {
let mut row = vec![];
if search_config.include_title {
row.push(title.clone().unwrap_or_default());
}
if search_config.include_description {
row.push(description.clone().unwrap_or_default());
}
if search_config.include_date {
if let Some(date) = datetime {
if let Ok(d) = date.format(&Rfc3339) {
row.push(d);
}
}
}
if search_config.include_path {
row.push(path.to_string());
}
if search_config.include_content {
let body = AMMONIA.clean(content).to_string();
if let Some(truncate_len) = search_config.truncate_content_length {
// Not great for unicode
// TODO: fix it like the truncate in Tera
match body.char_indices().nth(truncate_len) {
None => row.push(body),
Some((idx, _)) => row.push((body[..idx]).to_string()),
};
} else {
row.push(body);
};
}
row
}
/// Returns the generated JSON index with all the documents of the site added using
/// the language given
/// Errors if the language given is not available in Elasticlunr
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
let language = match lang::from_code(lang) {
Some(l) => l,
None => {
bail!("Tried to build search index for language {} which is not supported", lang);
}
};
let language_options = &config.languages[lang];
let mut index = IndexBuilder::with_language(language);
index = build_fields(&language_options.search, index);
let mut index = index.build();
for (_, section) in &library.sections {
if section.lang == lang {
add_section_to_index(&mut index, section, library, &language_options.search);
}
}
Ok(index.to_json())
}
fn add_section_to_index(
index: &mut Index,
section: &Section,
library: &Library,
search_config: &Search,
) {
if !section.meta.in_search_index {
return;
}
// Don't index redirecting sections
if section.meta.redirect_to.is_none() {
index.add_doc(
&section.permalink,
&fill_index(
search_config,
&section.meta.title,
&section.meta.description,
&None,
&section.path,
&section.content,
),
);
}
for key in &section.pages {
let page = &library.pages[key];
if !page.meta.in_search_index {
continue;
}
index.add_doc(
&page.permalink,
&fill_index(
search_config,
&page.meta.title,
&page.meta.description,
&page.meta.datetime,
&page.path,
&page.content,
),
);
/// Cleans `body` with the shared `AMMONIA` builder and, when `truncate_content_length`
/// is set, keeps at most that many code points of the cleaned text.
///
/// Text that is already short enough is returned untouched.
pub fn clean_and_truncate_body(truncate_content_length: Option<usize>, body: &str) -> String {
    let mut cleaned = AMMONIA.clean(body).to_string();

    if let Some(max_chars) = truncate_content_length {
        // `String::truncate` takes a byte offset, so look up where the code point
        // numbered `max_chars` starts; if there is none the text is short enough.
        if let Some((byte_idx, _)) = cleaned.char_indices().nth(max_chars) {
            cleaned.truncate(byte_idx);
        }
    }

    cleaned
}
#[cfg(test)]
mod tests {
use super::*;
use config::Config;
#[test]
fn can_build_fields() {
let mut config = Config::default();
let index = build_fields(&config.search, IndexBuilder::new()).build();
assert_eq!(index.get_fields(), vec!["title", "body"]);
config.search.include_content = false;
config.search.include_description = true;
let index = build_fields(&config.search, IndexBuilder::new()).build();
assert_eq!(index.get_fields(), vec!["title", "description"]);
config.search.include_content = true;
let index = build_fields(&config.search, IndexBuilder::new()).build();
assert_eq!(index.get_fields(), vec!["title", "description", "body"]);
config.search.include_title = false;
let index = build_fields(&config.search, IndexBuilder::new()).build();
assert_eq!(index.get_fields(), vec!["description", "body"]);
}
#[test]
fn can_fill_index_default() {
let config = Config::default();
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &None, &path, &content);
assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content);
}
#[test]
fn can_fill_index_description() {
let mut config = Config::default();
config.search.include_description = true;
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &None, &path, &content);
assert_eq!(res.len(), 3);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], description.unwrap());
assert_eq!(res[2], content);
}
#[test]
fn can_fill_index_truncated_content() {
let mut config = Config::default();
config.search.truncate_content_length = Some(5);
let title = Some("A title".to_string());
let description = Some("A description".to_string());
let path = "/a/page/".to_string();
let content = "Some content".to_string();
let res = fill_index(&config.search, &title, &description, &None, &path, &content);
assert_eq!(res.len(), 2);
assert_eq!(res[0], title.unwrap());
assert_eq!(res[1], content[..5]);
}
#[test]
fn can_fill_index_date() {
    // Turning on `include_date` should add the RFC 3339 rendering of the
    // datetime between the title and the body.
    let mut config = Config::default();
    config.search.include_date = true;

    let page_title = Some(String::from("A title"));
    let page_description = Some(String::from("A description"));
    let page_path = String::from("/a/page/");
    let body = String::from("Some content");
    let published = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap());

    let fields =
        fill_index(&config.search, &page_title, &page_description, &published, &page_path, &body);

    assert_eq!(fields.len(), 3);
    assert_eq!(fields[0], page_title.unwrap());
    assert_eq!(fields[1], "2023-01-31T00:00:00Z");
    assert_eq!(fields[2], body);
}
#[test]
fn clean_and_truncate_body_test() {
    // Each case is (truncation limit, raw body, expected output).
    let cases = [
        (None, "hello world", "hello world"),
        // NOTE(review): the expected string has a single space between the
        // words; if the sanitizer drops only the `<script>` element, both
        // surrounding spaces would remain. Confirm against the real output.
        (None, "hello <script>alert('xss')</script> world", "hello world"),
        (Some(100), "hello", "hello"),
        (Some(2), "hello", "he"),
        // U+202E takes several bytes in UTF-8, so the `Some(7)` case checks
        // that the limit is counted in code points rather than bytes.
        (Some(6), "hello \u{202E} world", "hello "),
        (Some(7), "hello \u{202E} world", "hello \u{202e}"),
    ];

    for &(limit, raw, expected) in &cases {
        assert_eq!(clean_and_truncate_body(limit, raw), expected);
    }
}

View file

@ -799,19 +799,26 @@ impl Site {
}
fn index_for_lang(&self, lang: &str) -> Result<()> {
let index_json = search::build_index(lang, &self.library.read().unwrap(), &self.config)?;
let (path, content) = match &self.config.search.index_format {
IndexFormat::ElasticlunrJson => {
let path = self.output_path.join(format!("search_index.{}.json", lang));
(path, index_json)
let path = &self.output_path.join(self.config.search.index_format.filename(lang));
let library = self.library.read().unwrap();
let content = match &self.config.search.index_format {
IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => {
search::build_elasticlunr(lang, &library, &self.config)?
}
IndexFormat::ElasticlunrJavascript => {
let path = self.output_path.join(format!("search_index.{}.js", lang));
let content = format!("window.searchIndex = {};", index_json);
(path, content)
IndexFormat::FuseJson | IndexFormat::FuseJavascript => {
search::build_fuse(lang, &library, &self.config.search)?
}
};
create_file(&path, &content)
drop(library); // no need to hold on to this guard while writing
create_file(
path,
match self.config.search.index_format {
IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => content,
IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => {
format!("window.searchIndex = {}", content)
}
},
)
}
pub fn build_search_index(&self) -> Result<()> {
@ -827,8 +834,13 @@ impl Site {
}
}
// then elasticlunr.min.js
create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?;
match self.config.search.index_format {
IndexFormat::ElasticlunrJavascript | IndexFormat::ElasticlunrJson => {
// then elasticlunr.min.js
create_file(&self.output_path.join("elasticlunr.min.js"), search::ELASTICLUNR_JS)?;
}
_ => {}
}
Ok(())
}

View file

@ -28,11 +28,12 @@ fn create_parent(path: &Path) -> Result<()> {
}
/// Create a file with the content given
pub fn create_file(path: &Path, content: &str) -> Result<()> {
/// `content` can be `&str`, `String`, `&String`, or anything else implementing `AsRef<str>`
pub fn create_file(path: &Path, content: impl AsRef<str>) -> Result<()> {
create_parent(path)?;
let mut file =
File::create(path).with_context(|| format!("Failed to create file {}", path.display()))?;
file.write_all(content.as_bytes())?;
file.write_all(content.as_ref().as_bytes())?;
Ok(())
}

View file

@ -4,7 +4,7 @@ weight = 100
+++
Zola can build a search index from the sections and pages content to
be used by a JavaScript library such as [elasticlunr](http://elasticlunr.com/).
be used by a JavaScript library such as [elasticlunr](http://elasticlunr.com/) or [fuse](https://www.fusejs.io).
To enable it, you only need to set `build_search_index = true` in your `config.toml` and Zola will
generate an index for the `default_language` set for all pages not excluded from the search index.
@ -12,21 +12,36 @@ generate an index for the `default_language` set for all pages not excluded from
It is very important to set the `default_language` in your `config.toml` if you are writing a site not in
English; the index building pipelines are very different depending on the language.
After `zola build` or `zola serve`, you should see two files in your public directory:
- `search_index.${default_language}.js`: so `search_index.en.js` for a default setup
- `elasticlunr.min.js`
If you set `index_format = "elasticlunr_json"` in your `config.toml`, a `search_index.${default_language}.json` is generated
instead of the default `search_index.${default_language}.js`.
As each site will be different, Zola makes no assumptions about your search function and doesn't provide
the JavaScript/CSS code to do an actual search and display results. You can look at how this site
implements it to get an idea: [search.js](https://github.com/getzola/zola/tree/master/docs/static/search.js).
implements it (using elasticlunr) to get an idea: [search.js](https://github.com/getzola/zola/tree/master/docs/static/search.js).
If you are using a language other than English, you will also need to include the corresponding JavaScript stemmer file.
See <https://github.com/weixsong/lunr-languages#in-a-web-browser> for details.
## Configuring the search index
In some cases, the default indexing strategy is not suitable. You can customize which fields to include and whether
to truncate the content in the [search configuration](@/documentation/getting-started/configuration.md).
## Index Formats
### Elasticlunr
Compatible with [elasticlunr](http://elasticlunr.com/). Also produces `elasticlunr.min.js`.
```toml
# config.toml
[search]
index_format = "elasticlunr_javascript" # or "elasticlunr_json"
```
If you are using a language other than English, you will also need to include the corresponding JavaScript stemmer file.
See <https://github.com/weixsong/lunr-languages#in-a-web-browser> for details.
### Fuse
Compatible with [fuse.js](https://www.fusejs.io/) and [tinysearch](https://github.com/tinysearch/tinysearch).
```toml
# config.toml
[search]
index_format = "fuse_javascript" # or "fuse_json"
```

View file

@ -174,16 +174,18 @@ include_title = true
include_description = false
# Whether to include the RFC3339 datetime of the page in the search index
include_date = false
# Whether to include the path of the page/section in the index
# Whether to include the path of the page/section in the index (the permalink is always included)
include_path = false
# Whether to include the rendered content of the page/section in the index
include_content = true
# At which character to truncate the content to. Useful if you have a lot of pages and the index would
# At which code point to truncate the content to. Useful if you have a lot of pages and the index would
# become too big to load on the site. Defaults to not being set.
# truncate_content_length = 100
# Whether to produce the search index as a javascript file or as a JSON file
# Accepted value "elasticlunr_javascript" or "elasticlunr_json"
# Accepted values:
# - "elasticlunr_javascript", "elasticlunr_json"
# - "fuse_javascript", "fuse_json"
index_format = "elasticlunr_javascript"
# Optional translation object for the default language