Skip to content

Commit 26f6677

Browse files
SIGSTACKFAULT authored and Keats committed
Add support for Fuse.js search format (#2507)
* initial "just barely works" Fuse.js support * implement FuseJavascript; refactor index_for_lang * support search config * move fuse index building to its own file * update doc of Search.index_format * update config docs * update search documentation * use &str where possible * use libs::serde_json; remember to commit Cargo.lock * move extension logic to IndexFormat * move the entire filename logic inside IndexFormat * move elasticlunr to its own module * only create elasticlunr.min.js if we're actually using elasticlunr * move ELASTICLUNR_JS to elasticlunr.rs * hide the details of search's submodules * optionally include path * explain include_path better * remove references to stork * replace if with match * support include_description * specify "permalink" * move body cleaning and truncation to a function * update truncate_content_length docs to specify *code points*
1 parent 0d0036e commit 26f6677

File tree

10 files changed

+496
-348
lines changed

10 files changed

+496
-348
lines changed

Cargo.lock

Lines changed: 83 additions & 78 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

components/config/src/config/search.rs

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,23 @@ pub enum IndexFormat {
77
ElasticlunrJson,
88
#[default]
99
ElasticlunrJavascript,
10+
FuseJson,
11+
FuseJavascript,
12+
}
13+
14+
impl IndexFormat {
15+
/// file extension which ought to be used for this index format.
16+
fn extension(&self) -> &'static str {
17+
match *self {
18+
IndexFormat::ElasticlunrJavascript | IndexFormat::FuseJavascript => "js",
19+
IndexFormat::ElasticlunrJson | IndexFormat::FuseJson => "json",
20+
}
21+
}
22+
23+
/// the filename which ought to be used for this format and language `lang`
24+
pub fn filename(&self, lang: &str) -> String {
25+
format!("search_index.{}.{}", lang, self.extension())
26+
}
1027
}
1128

1229
#[derive(Clone, Debug, PartialEq, Eq, Serialize, Deserialize)]
@@ -17,7 +34,7 @@ pub struct Search {
1734
/// Includes the whole content in the search index. Ok for small sites but becomes
1835
/// too big on large sites. `true` by default.
1936
pub include_content: bool,
20-
/// Optionally truncate the content down to `n` chars. This might cut content in a word
37+
/// Optionally truncate the content down to `n` code points. This might cut content in a word
2138
pub truncate_content_length: Option<usize>,
2239
/// Includes the description in the search index. When the site becomes too large, you can switch
2340
/// to that instead. `false` by default
@@ -26,7 +43,7 @@ pub struct Search {
2643
pub include_date: bool,
2744
/// Include the path of the page in the search index. `false` by default.
2845
pub include_path: bool,
29-
/// Foramt of the search index to be produced. Javascript by default
46+
/// Format of the search index to be produced. 'elasticlunr_javascript' by default.
3047
pub index_format: IndexFormat,
3148
}
3249

components/search/Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ errors = { path = "../errors" }
88
content = { path = "../content" }
99
config = { path = "../config" }
1010
libs = { path = "../libs" }
11+
serde = { version = "1.0", features = ["derive"] }

components/search/src/elasticlunr.rs

Lines changed: 236 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,236 @@
1+
use config::{Config, Search};
2+
use content::{Library, Section};
3+
use errors::{bail, Result};
4+
use libs::elasticlunr::{lang, Index, IndexBuilder};
5+
use libs::time::format_description::well_known::Rfc3339;
6+
use libs::time::OffsetDateTime;
7+
8+
use crate::clean_and_truncate_body;
9+
10+
/// Contents of `elasticlunr.min.js` (located next to this source file),
/// embedded into the binary at compile time via `include_str!`.
pub const ELASTICLUNR_JS: &str = include_str!("elasticlunr.min.js");
11+
12+
fn build_fields(search_config: &Search, mut index: IndexBuilder) -> IndexBuilder {
13+
if search_config.include_title {
14+
index = index.add_field("title");
15+
}
16+
17+
if search_config.include_description {
18+
index = index.add_field("description");
19+
}
20+
21+
if search_config.include_date {
22+
index = index.add_field("date")
23+
}
24+
25+
if search_config.include_path {
26+
index = index.add_field_with_tokenizer("path", Box::new(path_tokenizer));
27+
}
28+
29+
if search_config.include_content {
30+
index = index.add_field("body")
31+
}
32+
33+
index
34+
}
35+
36+
/// Splits a path into lowercase tokens.
///
/// Whitespace, `-` and `/` all act as separators; empty pieces (e.g. from
/// leading, trailing or repeated separators) are discarded.
fn path_tokenizer(text: &str) -> Vec<String> {
    let is_separator = |c: char| matches!(c, '-' | '/') || c.is_whitespace();

    let mut tokens = Vec::new();
    for piece in text.split(is_separator) {
        if piece.is_empty() {
            continue;
        }
        tokens.push(piece.trim().to_lowercase());
    }
    tokens
}
42+
43+
fn fill_index(
44+
search_config: &Search,
45+
title: &Option<String>,
46+
description: &Option<String>,
47+
datetime: &Option<OffsetDateTime>,
48+
path: &str,
49+
content: &str,
50+
) -> Vec<String> {
51+
let mut row = vec![];
52+
53+
if search_config.include_title {
54+
row.push(title.clone().unwrap_or_default());
55+
}
56+
57+
if search_config.include_description {
58+
row.push(description.clone().unwrap_or_default());
59+
}
60+
61+
if search_config.include_date {
62+
if let Some(date) = datetime {
63+
if let Ok(d) = date.format(&Rfc3339) {
64+
row.push(d);
65+
}
66+
}
67+
}
68+
69+
if search_config.include_path {
70+
row.push(path.to_string());
71+
}
72+
73+
if search_config.include_content {
74+
row.push(clean_and_truncate_body(search_config.truncate_content_length, content));
75+
}
76+
row
77+
}
78+
79+
/// Returns the generated JSON index with all the documents of the site added using
80+
/// the language given
81+
/// Errors if the language given is not available in Elasticlunr
82+
/// TODO: is making `in_search_index` apply to subsections of a `false` section useful?
83+
pub fn build_index(lang: &str, library: &Library, config: &Config) -> Result<String> {
84+
let language = match lang::from_code(lang) {
85+
Some(l) => l,
86+
None => {
87+
bail!("Tried to build search index for language {} which is not supported", lang);
88+
}
89+
};
90+
let language_options = &config.languages[lang];
91+
let mut index = IndexBuilder::with_language(language);
92+
index = build_fields(&language_options.search, index);
93+
let mut index = index.build();
94+
95+
for (_, section) in &library.sections {
96+
if section.lang == lang {
97+
add_section_to_index(&mut index, section, library, &language_options.search);
98+
}
99+
}
100+
101+
Ok(index.to_json())
102+
}
103+
104+
fn add_section_to_index(
105+
index: &mut Index,
106+
section: &Section,
107+
library: &Library,
108+
search_config: &Search,
109+
) {
110+
if !section.meta.in_search_index {
111+
return;
112+
}
113+
114+
// Don't index redirecting sections
115+
if section.meta.redirect_to.is_none() {
116+
index.add_doc(
117+
&section.permalink,
118+
&fill_index(
119+
search_config,
120+
&section.meta.title,
121+
&section.meta.description,
122+
&None,
123+
&section.path,
124+
&section.content,
125+
),
126+
);
127+
}
128+
129+
for key in &section.pages {
130+
let page = &library.pages[key];
131+
if !page.meta.in_search_index {
132+
continue;
133+
}
134+
135+
index.add_doc(
136+
&page.permalink,
137+
&fill_index(
138+
search_config,
139+
&page.meta.title,
140+
&page.meta.description,
141+
&page.meta.datetime,
142+
&page.path,
143+
&page.content,
144+
),
145+
);
146+
}
147+
}
148+
149+
#[cfg(test)]
mod tests {
    use super::*;
    use config::Config;
    use libs::elasticlunr::IndexBuilder;

    /// Title, description, path and content shared by the `fill_index` tests.
    fn fixture() -> (Option<String>, Option<String>, String, String) {
        (
            Some("A title".to_string()),
            Some("A description".to_string()),
            "/a/page/".to_string(),
            "Some content".to_string(),
        )
    }

    // Fields must follow the `include_*` switches, in registration order.
    #[test]
    fn can_build_fields() {
        let mut config = Config::default();
        let built = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(built.get_fields(), vec!["title", "body"]);

        config.search.include_content = false;
        config.search.include_description = true;
        let built = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(built.get_fields(), vec!["title", "description"]);

        config.search.include_content = true;
        let built = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(built.get_fields(), vec!["title", "description", "body"]);

        config.search.include_title = false;
        let built = build_fields(&config.search, IndexBuilder::new()).build();
        assert_eq!(built.get_fields(), vec!["description", "body"]);
    }

    // Default config: title and body only.
    #[test]
    fn can_fill_index_default() {
        let config = Config::default();
        let (title, description, path, content) = fixture();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), content]);
    }

    // The description sits between title and body when enabled.
    #[test]
    fn can_fill_index_description() {
        let mut config = Config::default();
        config.search.include_description = true;
        let (title, description, path, content) = fixture();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), description.unwrap(), content]);
    }

    // The body is cut down to `truncate_content_length`.
    #[test]
    fn can_fill_index_truncated_content() {
        let mut config = Config::default();
        config.search.truncate_content_length = Some(5);
        let (title, description, path, content) = fixture();

        let row = fill_index(&config.search, &title, &description, &None, &path, &content);
        assert_eq!(row, vec![title.unwrap(), content[..5].to_string()]);
    }

    // The date is emitted in RFC 3339 form between title and body.
    #[test]
    fn can_fill_index_date() {
        let mut config = Config::default();
        config.search.include_date = true;
        let (title, description, path, content) = fixture();
        let datetime = Some(OffsetDateTime::parse("2023-01-31T00:00:00Z", &Rfc3339).unwrap());

        let row = fill_index(&config.search, &title, &description, &datetime, &path, &content);
        assert_eq!(row, vec![title.unwrap(), "2023-01-31T00:00:00Z".to_string(), content]);
    }
}

components/search/src/fuse.rs

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
use config::Search;
2+
use content::Library;
3+
use errors::Result;
4+
use libs::serde_json;
5+
6+
use crate::clean_and_truncate_body;
7+
8+
/// build index in Fuse.js format.
9+
pub fn build_index(lang: &str, library: &Library, config: &Search) -> Result<String> {
10+
#[derive(serde::Serialize)]
11+
struct Item<'a> {
12+
url: &'a str,
13+
title: Option<&'a str>,
14+
description: Option<&'a str>,
15+
body: Option<String>, // AMMONIA.clean has to allocate anyway
16+
path: Option<&'a str>,
17+
}
18+
let mut items: Vec<Item> = Vec::new();
19+
for (_, section) in &library.sections {
20+
if section.lang == lang
21+
&& section.meta.redirect_to.is_none()
22+
&& section.meta.in_search_index
23+
{
24+
items.push(Item {
25+
url: &section.permalink,
26+
title: match config.include_title {
27+
true => Some(&section.meta.title.as_deref().unwrap_or_default()),
28+
false => None,
29+
},
30+
description: match config.include_description {
31+
true => Some(&section.meta.description.as_deref().unwrap_or_default()),
32+
false => None,
33+
},
34+
body: match config.include_content {
35+
true => Some(clean_and_truncate_body(
36+
config.truncate_content_length,
37+
&section.content,
38+
)),
39+
false => None,
40+
},
41+
path: match config.include_path {
42+
true => Some(&section.path),
43+
false => None,
44+
},
45+
});
46+
for page in &section.pages {
47+
let page = &library.pages[page];
48+
if page.meta.in_search_index {
49+
items.push(Item {
50+
url: &page.permalink,
51+
title: match config.include_title {
52+
true => Some(&page.meta.title.as_deref().unwrap_or_default()),
53+
false => None,
54+
},
55+
description: match config.include_description {
56+
true => Some(&page.meta.description.as_deref().unwrap_or_default()),
57+
false => None,
58+
},
59+
body: match config.include_content {
60+
true => Some(super::clean_and_truncate_body(
61+
config.truncate_content_length,
62+
&page.content,
63+
)),
64+
false => None,
65+
},
66+
path: match config.include_path {
67+
true => Some(&page.path),
68+
false => None,
69+
},
70+
})
71+
}
72+
}
73+
}
74+
}
75+
Ok(serde_json::to_string(&items)?)
76+
}

0 commit comments

Comments
 (0)