Skip to content

Commit c3a4015

Browse files
committed
Read list of JSON fields encoded in dictionary
add method to get list of fields on InvertedIndexReader
1 parent c520ac4 commit c3a4015

File tree

4 files changed

+59
-3
lines changed

4 files changed

+59
-3
lines changed

Cargo.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ common = { version= "0.6", path = "./common/", package = "tantivy-common" }
6363
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
6464
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
6565
futures-util = { version = "0.3.28", optional = true }
66+
fnv = "1.0.7"
6667

6768
[target.'cfg(windows)'.dependencies]
6869
winapi = "0.3.9"

src/core/inverted_index_reader.rs

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
use std::io;
22

33
use common::BinarySerializable;
4+
use fnv::FnvHashSet;
45

56
use crate::directory::FileSlice;
67
use crate::positions::PositionReader;
78
use crate::postings::{BlockSegmentPostings, SegmentPostings, TermInfo};
8-
use crate::schema::{IndexRecordOption, Term};
9+
use crate::schema::{IndexRecordOption, Term, JSON_END_OF_PATH};
910
use crate::termdict::TermDictionary;
1011

1112
/// The inverted index reader is in charge of accessing
@@ -69,6 +70,26 @@ impl InvertedIndexReader {
6970
&self.termdict
7071
}
7172

73+
/// Return the fields encoded in the dictionary in lexicographic oder.
74+
/// Only valid on JSON fields.
75+
///
76+
/// Notice: This requires a full scan and therefore **very expensive**.
77+
pub fn list_fields(&self) -> io::Result<Vec<String>> {
78+
let mut stream = self.termdict.stream()?;
79+
let mut fields = Vec::new();
80+
let mut fields_set = FnvHashSet::default();
81+
while let Some((term, _term_info)) = stream.next() {
82+
if let Some(index) = term.iter().position(|&byte| byte == JSON_END_OF_PATH) {
83+
if !fields_set.contains(&term[..index]) {
84+
fields_set.insert(term[..index].to_vec());
85+
fields.push(String::from_utf8_lossy(&term[..index]).to_string());
86+
}
87+
}
88+
}
89+
90+
Ok(fields)
91+
}
92+
7293
/// Resets the block segment to another position of the postings
7394
/// file.
7495
///

src/indexer/mod.rs

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -133,4 +133,37 @@ mod tests_mmap {
133133
assert_eq!(num_docs, 1);
134134
}
135135
}
136+
137+
/// Indexes two JSON documents into a single in-RAM segment and checks that
/// `list_fields` reports every distinct JSON path exactly once, in order.
#[test]
fn test_json_field_list_fields() {
    // One JSON text field; dots inside keys are expanded into path segments.
    let mut schema_builder = Schema::builder();
    let json_field = schema_builder.add_json_field(
        "json",
        JsonObjectOptions::from(TEXT).set_expand_dots_enabled(),
    );
    let index = Index::create_in_ram(schema_builder.build());

    // Both documents share the flat keys; only the nested object name differs.
    let documents = [
        serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "sub": {"a": 1, "b": 2}}),
        serde_json::json!({"k8s.container.name": "prometheus", "val": "hello", "suber": {"a": 1, "b": 2}}),
    ];
    let mut index_writer = index.writer_for_tests().unwrap();
    for document in documents {
        index_writer.add_document(doc!(json_field=>document)).unwrap();
    }
    index_writer.commit().unwrap();

    let index_reader = index.reader().unwrap();
    let searcher = index_reader.searcher();
    assert_eq!(searcher.num_docs(), 2);

    let segment_reader = &searcher.segment_readers()[0];
    let inverted_index = segment_reader.inverted_index(json_field).unwrap();
    let listed_fields = inverted_index.list_fields().unwrap();
    let expected_fields = [
        "k8s\u{1}container\u{1}name",
        "sub\u{1}a",
        "sub\u{1}b",
        "suber\u{1}a",
        "suber\u{1}b",
        "val",
    ];
    assert_eq!(listed_fields, expected_fields);
}
136169
}

src/schema/field_type.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -313,9 +313,10 @@ impl FieldType {
313313

314314
/// Parses a field value from json, given the target FieldType.
315315
///
316-
/// Tantivy will not try to cast values.
316+
/// Tantivy will try to cast values only with the coerce option.
317317
/// For instance, if the json value is the integer `3` and the
318-
/// target field is a `Str`, this method will return an Error.
318+
/// target field is a `Str`, this method will return an Error if `coerce`
319+
/// is not enabled.
319320
pub fn value_from_json(&self, json: JsonValue) -> Result<Value, ValueParsingError> {
320321
match json {
321322
JsonValue::String(field_text) => {

0 commit comments

Comments
 (0)