Skip to content

Commit 3d1c4b3

Browse files
authored
support ff range queries on json fields (#2456)
* support ff range queries on json fields
* fix term date truncation
* use inverted index range query for phrase prefix queries
* rename to InvertedIndexRangeQuery
* fix column filter, add mixed column test
1 parent 0d4e319 commit 3d1c4b3

File tree

15 files changed

+1317
-284
lines changed

15 files changed

+1317
-284
lines changed

common/src/bounds.rs

Lines changed: 130 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,130 @@
1+
use std::io;
2+
use std::ops::Bound;
3+
4+
#[derive(Clone, Debug)]
5+
pub struct BoundsRange<T> {
6+
pub lower_bound: Bound<T>,
7+
pub upper_bound: Bound<T>,
8+
}
9+
impl<T> BoundsRange<T> {
10+
pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
11+
BoundsRange {
12+
lower_bound,
13+
upper_bound,
14+
}
15+
}
16+
pub fn is_unbounded(&self) -> bool {
17+
matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
18+
}
19+
pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
20+
BoundsRange {
21+
lower_bound: map_bound(&self.lower_bound, &transform),
22+
upper_bound: map_bound(&self.upper_bound, &transform),
23+
}
24+
}
25+
26+
pub fn map_bound_res<TTo, Err>(
27+
&self,
28+
transform: impl Fn(&T) -> Result<TTo, Err>,
29+
) -> Result<BoundsRange<TTo>, Err> {
30+
Ok(BoundsRange {
31+
lower_bound: map_bound_res(&self.lower_bound, &transform)?,
32+
upper_bound: map_bound_res(&self.upper_bound, &transform)?,
33+
})
34+
}
35+
36+
pub fn transform_inner<TTo>(
37+
&self,
38+
transform_lower: impl Fn(&T) -> TransformBound<TTo>,
39+
transform_upper: impl Fn(&T) -> TransformBound<TTo>,
40+
) -> BoundsRange<TTo> {
41+
BoundsRange {
42+
lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
43+
upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
44+
}
45+
}
46+
47+
/// Returns the first set inner value
48+
pub fn get_inner(&self) -> Option<&T> {
49+
inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
50+
}
51+
}
52+
53+
/// Result of transforming the inner value of a [`Bound`].
pub enum TransformBound<T> {
    /// Replace the whole bound (variant and value).
    NewBound(Bound<T>),
    /// Keep the existing bound variant, with this new inner value.
    Existing(T),
}
59+
60+
/// Takes a bound and transforms the inner value into a new bound via a closure.
61+
/// The bound variant may change by the value returned value from the closure.
62+
pub fn transform_bound_inner_res<TFrom, TTo>(
63+
bound: &Bound<TFrom>,
64+
transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
65+
) -> io::Result<Bound<TTo>> {
66+
use self::Bound::*;
67+
Ok(match bound {
68+
Excluded(ref from_val) => match transform(from_val)? {
69+
TransformBound::NewBound(new_val) => new_val,
70+
TransformBound::Existing(new_val) => Excluded(new_val),
71+
},
72+
Included(ref from_val) => match transform(from_val)? {
73+
TransformBound::NewBound(new_val) => new_val,
74+
TransformBound::Existing(new_val) => Included(new_val),
75+
},
76+
Unbounded => Unbounded,
77+
})
78+
}
79+
80+
/// Takes a bound and transforms the inner value into a new bound via a closure.
81+
/// The bound variant may change by the value returned value from the closure.
82+
pub fn transform_bound_inner<TFrom, TTo>(
83+
bound: &Bound<TFrom>,
84+
transform: impl Fn(&TFrom) -> TransformBound<TTo>,
85+
) -> Bound<TTo> {
86+
use self::Bound::*;
87+
match bound {
88+
Excluded(ref from_val) => match transform(from_val) {
89+
TransformBound::NewBound(new_val) => new_val,
90+
TransformBound::Existing(new_val) => Excluded(new_val),
91+
},
92+
Included(ref from_val) => match transform(from_val) {
93+
TransformBound::NewBound(new_val) => new_val,
94+
TransformBound::Existing(new_val) => Included(new_val),
95+
},
96+
Unbounded => Unbounded,
97+
}
98+
}
99+
100+
/// Returns a reference to the inner value of a `Bound`, or `None` for
/// `Unbounded`.
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
    if let Bound::Included(inner) | Bound::Excluded(inner) = val {
        Some(inner)
    } else {
        None
    }
}
107+
108+
/// Maps the inner value of a `Bound` through `transform`, preserving the
/// bound variant.
pub fn map_bound<TFrom, TTo>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> TTo,
) -> Bound<TTo> {
    match bound {
        Bound::Excluded(from_val) => Bound::Excluded(transform(from_val)),
        Bound::Included(from_val) => Bound::Included(transform(from_val)),
        Bound::Unbounded => Bound::Unbounded,
    }
}
119+
120+
/// Fallible version of [`map_bound`]: maps the inner value of a `Bound`
/// through `transform`, short-circuiting on the first error.
pub fn map_bound_res<TFrom, TTo, Err>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> Result<TTo, Err>,
) -> Result<Bound<TTo>, Err> {
    let mapped = match bound {
        Bound::Excluded(from_val) => Bound::Excluded(transform(from_val)?),
        Bound::Included(from_val) => Bound::Included(transform(from_val)?),
        Bound::Unbounded => Bound::Unbounded,
    };
    Ok(mapped)
}

common/src/lib.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ use std::ops::Deref;
55
pub use byteorder::LittleEndian as Endianness;
66

77
mod bitset;
8+
pub mod bounds;
89
mod byte_count;
910
mod datetime;
1011
pub mod file_slice;

src/core/json_utils.rs

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;
44

55
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
66
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
7-
use crate::schema::Type;
7+
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
88
use crate::time::format_description::well_known::Rfc3339;
99
use crate::time::{OffsetDateTime, UtcOffset};
1010
use crate::tokenizer::TextAnalyzer;
@@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
189189
ctx.path_to_unordered_id
190190
.get_or_allocate_unordered_id(json_path_writer.as_str()),
191191
);
192+
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
192193
term_buffer.append_type_and_fast_value(val);
193194
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
194195
}
@@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
239240
/// Tries to infer a JSON type from a string and append it to the term.
240241
///
241242
/// The term must be json + JSON path.
242-
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
243+
pub fn convert_to_fast_value_and_append_to_json_term(
244+
mut term: Term,
245+
phrase: &str,
246+
truncate_date_for_search: bool,
247+
) -> Option<Term> {
243248
assert_eq!(
244249
term.value()
245250
.as_json_value_bytes()
@@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
250255
"JSON value bytes should be empty"
251256
);
252257
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
253-
let dt_utc = dt.to_offset(UtcOffset::UTC);
254-
term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
258+
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
259+
if truncate_date_for_search {
260+
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
261+
}
262+
term.append_type_and_fast_value(dt);
255263
return Some(term);
256264
}
257265
if let Ok(i64_val) = str::parse::<i64>(phrase) {

src/indexer/merger.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ mod tests {
673673
]
674674
);
675675
assert_eq!(
676-
get_doc_ids(vec![Term::from_field_date(
676+
get_doc_ids(vec![Term::from_field_date_for_search(
677677
date_field,
678678
DateTime::from_utc(curr_time)
679679
)])?,

src/indexer/segment_writer.rs

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -64,9 +64,9 @@ impl SegmentWriter {
6464
///
6565
/// The arguments are defined as follows
6666
///
67-
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
68-
/// is stored in a memory arena. This makes it possible for the user to define
69-
/// the flushing behavior as a memory limit.
67+
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
68+
/// stored in a memory arena. This makes it possible for the user to define the flushing
69+
/// behavior as a memory limit.
7070
/// - segment: The segment being written
7171
/// - schema
7272
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
@@ -431,7 +431,7 @@ mod tests {
431431
use crate::query::{PhraseQuery, QueryParser};
432432
use crate::schema::{
433433
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
434-
STORED, STRING, TEXT,
434+
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
435435
};
436436
use crate::store::{Compressor, StoreReader, StoreWriter};
437437
use crate::time::format_description::well_known::Rfc3339;
@@ -651,7 +651,8 @@ mod tests {
651651
set_fast_val(
652652
DateTime::from_utc(
653653
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
654-
),
654+
)
655+
.truncate(DATE_TIME_PRECISION_INDEXED),
655656
term
656657
)
657658
.serialized_value_bytes()

src/query/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
5454
pub use self::phrase_query::PhraseQuery;
5555
pub use self::query::{EnableScoring, Query, QueryClone};
5656
pub use self::query_parser::{QueryParser, QueryParserError};
57-
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
57+
pub use self::range_query::*;
5858
pub use self::regex_query::RegexQuery;
5959
pub use self::reqopt_scorer::RequiredOptionalScorer;
6060
pub use self::score_combiner::{

src/query/more_like_this/more_like_this.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ impl MoreLikeThis {
241241
let timestamp = value.as_datetime().ok_or_else(|| {
242242
TantivyError::InvalidArgument("invalid value".to_string())
243243
})?;
244-
let term = Term::from_field_date(field, timestamp);
244+
let term = Term::from_field_date_for_search(field, timestamp);
245245
*term_frequencies.entry(term).or_insert(0) += 1;
246246
}
247247
}

src/query/phrase_prefix_query/phrase_prefix_query.rs

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ use std::ops::Bound;
22

33
use super::{prefix_end, PhrasePrefixWeight};
44
use crate::query::bm25::Bm25Weight;
5-
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
5+
use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight};
66
use crate::schema::{Field, IndexRecordOption, Term};
77

88
const DEFAULT_MAX_EXPANSIONS: u32 = 50;
@@ -145,9 +145,15 @@ impl Query for PhrasePrefixQuery {
145145
Bound::Unbounded
146146
};
147147

148-
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
149-
range_query.limit(self.max_expansions as u64);
150-
range_query.weight(enable_scoring)
148+
let lower_bound = Bound::Included(self.prefix.1.clone());
149+
let upper_bound = end_term;
150+
151+
Ok(Box::new(InvertedIndexRangeWeight::new(
152+
self.field,
153+
&lower_bound,
154+
&upper_bound,
155+
Some(self.max_expansions as u64),
156+
)))
151157
}
152158
}
153159

src/query/query_parser/query_parser.rs

Lines changed: 33 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
137137
/// so-called default fields (as set up in the constructor).
138138
///
139139
/// Assuming that the default fields are `body` and `title`, and the query parser is set with
140-
/// conjunction as a default, our query will be interpreted as.
140+
/// conjunction as a default, our query will be interpreted as.
141141
/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
142142
/// By default, all tokenized and indexed fields are default fields.
143143
///
@@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
148148
/// `body:Barack OR (body:Barack OR text:Obama)` .
149149
///
150150
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
151-
/// interpreted
152-
/// as `(a AND b) OR c`.
151+
/// interpreted as `(a AND b) OR c`.
153152
///
154153
/// * In addition to the boolean operators, the `-`, `+` can help define. These operators are
155154
/// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
@@ -272,8 +271,7 @@ impl QueryParser {
272271

273272
/// Creates a `QueryParser`, given
274273
/// * an index
275-
/// * a set of default fields used to search if no field is specifically defined
276-
/// in the query.
274+
/// * a set of default fields used to search if no field is specifically defined in the query.
277275
pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
278276
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
279277
}
@@ -482,16 +480,33 @@ impl QueryParser {
482480
});
483481
if terms.len() != 1 {
484482
return Err(QueryParserError::UnsupportedQuery(format!(
485-
"Range query boundary cannot have multiple tokens: {phrase:?}."
483+
"Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]."
486484
)));
487485
}
488486
Ok(terms.into_iter().next().unwrap())
489487
}
490-
FieldType::JsonObject(_) => {
491-
// Json range are not supported.
492-
Err(QueryParserError::UnsupportedQuery(
493-
"Range query are not supported on json field.".to_string(),
494-
))
488+
FieldType::JsonObject(ref json_options) => {
489+
let get_term_with_path = || {
490+
Term::from_field_json_path(
491+
field,
492+
json_path,
493+
json_options.is_expand_dots_enabled(),
494+
)
495+
};
496+
if let Some(term) =
497+
// Try to convert the phrase to a fast value
498+
convert_to_fast_value_and_append_to_json_term(
499+
get_term_with_path(),
500+
phrase,
501+
false,
502+
)
503+
{
504+
Ok(term)
505+
} else {
506+
let mut term = get_term_with_path();
507+
term.append_type_and_str(phrase);
508+
Ok(term)
509+
}
495510
}
496511
FieldType::Facet(_) => match Facet::from_text(phrase) {
497512
Ok(facet) => Ok(Term::from_facet(field, &facet)),
@@ -553,7 +568,7 @@ impl QueryParser {
553568
}
554569
FieldType::Date(_) => {
555570
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
556-
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
571+
let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
557572
Ok(vec![LogicalLiteral::Term(dt_term)])
558573
}
559574
FieldType::Str(ref str_options) => {
@@ -685,8 +700,8 @@ impl QueryParser {
685700
///
686701
/// The terms are identified by a triplet:
687702
/// - tantivy field
688-
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
689-
/// object by naturally extending the json field name with a "." separated field_path
703+
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
704+
/// naturally extending the json field name with a "." separated field_path
690705
/// - field_phrase: the phrase that is being searched.
691706
///
692707
/// The literal identifies the targeted field by a so-called *full field path*,
@@ -949,7 +964,8 @@ fn generate_literals_for_json_object(
949964
|| Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled());
950965

951966
// Try to convert the phrase to a fast value
952-
if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase)
967+
if let Some(term) =
968+
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
953969
{
954970
logical_literals.push(LogicalLiteral::Term(term));
955971
}
@@ -1123,8 +1139,8 @@ mod test {
11231139
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
11241140
assert_eq!(
11251141
format!("{query:?}"),
1126-
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
1127-
Included(Term(field=0, type=Str, \"b\")), limit: None }"
1142+
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \
1143+
\"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }"
11281144
);
11291145
}
11301146

0 commit comments

Comments
 (0)