Skip to content

Commit 6a0e9a1

Browse files
committed
fix term date truncation
1 parent ba33a2a commit 6a0e9a1

File tree

9 files changed

+180
-54
lines changed

9 files changed

+180
-54
lines changed

src/core/json_utils.rs

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;
44

55
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
66
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
7-
use crate::schema::Type;
7+
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
88
use crate::time::format_description::well_known::Rfc3339;
99
use crate::time::{OffsetDateTime, UtcOffset};
1010
use crate::tokenizer::TextAnalyzer;
@@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
189189
ctx.path_to_unordered_id
190190
.get_or_allocate_unordered_id(json_path_writer.as_str()),
191191
);
192+
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
192193
term_buffer.append_type_and_fast_value(val);
193194
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
194195
}

src/indexer/merger.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ mod tests {
673673
]
674674
);
675675
assert_eq!(
676-
get_doc_ids(vec![Term::from_field_date(
676+
get_doc_ids(vec![Term::from_field_date_for_search(
677677
date_field,
678678
DateTime::from_utc(curr_time)
679679
)])?,

src/indexer/segment_writer.rs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ mod tests {
431431
use crate::query::{PhraseQuery, QueryParser};
432432
use crate::schema::{
433433
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
434-
STORED, STRING, TEXT,
434+
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
435435
};
436436
use crate::store::{Compressor, StoreReader, StoreWriter};
437437
use crate::time::format_description::well_known::Rfc3339;
@@ -651,7 +651,8 @@ mod tests {
651651
set_fast_val(
652652
DateTime::from_utc(
653653
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
654-
),
654+
)
655+
.truncate(DATE_TIME_PRECISION_INDEXED),
655656
term
656657
)
657658
.serialized_value_bytes()

src/query/more_like_this/more_like_this.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ impl MoreLikeThis {
241241
let timestamp = value.as_datetime().ok_or_else(|| {
242242
TantivyError::InvalidArgument("invalid value".to_string())
243243
})?;
244-
let term = Term::from_field_date(field, timestamp);
244+
let term = Term::from_field_date_for_search(field, timestamp);
245245
*term_frequencies.entry(term).or_insert(0) += 1;
246246
}
247247
}

src/query/query_parser/query_parser.rs

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
137137
/// so-called default fields (as set up in the constructor).
138138
///
139139
/// Assuming that the default fields are `body` and `title`, and the query parser is set with
140-
/// conjunction as a default, our query will be interpreted as.
140+
/// conjunction as a default, our query will be interpreted as.
141141
/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
142142
/// By default, all tokenized and indexed fields are default fields.
143143
///
@@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
148148
/// `body:Barack OR (body:Barack OR text:Obama)` .
149149
///
150150
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
151-
/// interpreted
152-
/// as `(a AND b) OR c`.
151+
/// interpreted as `(a AND b) OR c`.
153152
///
154153
/// * In addition to the boolean operators, the `-`, `+` operators can help define queries. These operators are
155154
/// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
@@ -273,7 +272,7 @@ impl QueryParser {
273272
/// Creates a `QueryParser`, given
274273
/// * an index
275274
/// * a set of default fields used to search if no field is specifically defined
276-
/// in the query.
275+
/// in the query.
277276
pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
278277
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
279278
}
@@ -569,7 +568,7 @@ impl QueryParser {
569568
}
570569
FieldType::Date(_) => {
571570
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
572-
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
571+
let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
573572
Ok(vec![LogicalLiteral::Term(dt_term)])
574573
}
575574
FieldType::Str(ref str_options) => {
@@ -701,8 +700,8 @@ impl QueryParser {
701700
///
702701
/// The terms are identified by a triplet:
703702
/// - tantivy field
704-
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
705-
/// object by naturally extending the json field name with a "." separated field_path
703+
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
704+
/// naturally extending the json field name with a "." separated field_path
706705
/// - field_phrase: the phrase that is being searched.
707706
///
708707
/// The literal identifies the targeted field by a so-called *full field path*,

src/query/range_query/range_query.rs

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -116,10 +116,7 @@ impl Query for RangeQuery {
116116
let field_type = schema.get_field_entry(self.field()).field_type();
117117

118118
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
119-
Ok(Box::new(FastFieldRangeWeight::new(
120-
self.field(),
121-
self.bounds.clone(),
122-
)))
119+
Ok(Box::new(FastFieldRangeWeight::new(self.bounds.clone())))
123120
} else {
124121
if field_type.is_json() {
125122
return Err(crate::TantivyError::InvalidArgument(

src/query/range_query/range_query_u64_fastfield.rs

Lines changed: 148 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -10,24 +10,22 @@ use columnar::{
1010
StrColumn,
1111
};
1212
use common::bounds::{BoundsRange, TransformBound};
13-
use common::BinarySerializable;
1413

1514
use super::fast_field_range_doc_set::RangeDocSet;
1615
use crate::query::{AllScorer, ConstScorer, EmptyScorer, Explanation, Query, Scorer, Weight};
17-
use crate::schema::{Field, Type, ValueBytes};
16+
use crate::schema::{Type, ValueBytes};
1817
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError, Term};
1918

2019
/// `FastFieldRangeWeight` uses the fast field to execute range queries.
2120
#[derive(Clone, Debug)]
2221
pub struct FastFieldRangeWeight {
2322
bounds: BoundsRange<Term>,
24-
field: Field,
2523
}
2624

2725
impl FastFieldRangeWeight {
2826
/// Create a new FastFieldRangeWeight
29-
pub(crate) fn new(field: Field, bounds: BoundsRange<Term>) -> Self {
30-
Self { bounds, field }
27+
pub(crate) fn new(bounds: BoundsRange<Term>) -> Self {
28+
Self { bounds }
3129
}
3230
}
3331

@@ -46,12 +44,12 @@ impl Weight for FastFieldRangeWeight {
4644
if self.bounds.is_unbounded() {
4745
return Ok(Box::new(AllScorer::new(reader.max_doc())));
4846
}
49-
let field_type = reader.schema().get_field_entry(self.field).field_type();
5047

5148
let term = self
5249
.bounds
5350
.get_inner()
5451
.expect("At least one bound must be set");
52+
let field_type = reader.schema().get_field_entry(term.field()).field_type();
5553
assert_eq!(
5654
term.typ(),
5755
field_type.value_type(),
@@ -62,10 +60,6 @@ impl Weight for FastFieldRangeWeight {
6260
let field_name = term.get_full_path(reader.schema());
6361

6462
let get_value_bytes = |term: &Term| term.value().value_bytes_payload();
65-
let get_term_u64_internal_representation = |term: &Term| {
66-
let bytes = term.value().value_bytes_payload();
67-
u64::from_be(BinarySerializable::deserialize(&mut &bytes[..]).unwrap())
68-
};
6963

7064
let term_value = term.value();
7165
if field_type.is_json() {
@@ -175,11 +169,35 @@ impl Weight for FastFieldRangeWeight {
175169
field_type
176170
);
177171

178-
let bounds = self.bounds.map_bound(get_term_u64_internal_representation);
172+
let bounds = self.bounds.map_bound_res(|term| {
173+
let value = term.value();
174+
let val = if let Some(val) = value.as_u64() {
175+
val
176+
} else if let Some(val) = value.as_i64() {
177+
val.to_u64()
178+
} else if let Some(val) = value.as_f64() {
179+
val.to_u64()
180+
} else if let Some(val) = value.as_date() {
181+
val.to_u64()
182+
} else {
183+
return Err(TantivyError::InvalidArgument(format!(
184+
"Expected term with u64, i64, f64 or date, but got {:?}",
185+
term
186+
)));
187+
};
188+
Ok(val)
189+
})?;
179190

180191
let fast_field_reader = reader.fast_fields();
181-
let Some((column, _col_type)) =
182-
fast_field_reader.u64_lenient_for_type(None, &field_name)?
192+
let Some((column, _col_type)) = fast_field_reader.u64_lenient_for_type(
193+
Some(&[
194+
ColumnType::U64,
195+
ColumnType::I64,
196+
ColumnType::F64,
197+
ColumnType::DateTime,
198+
]),
199+
&field_name,
200+
)?
183201
else {
184202
return Ok(Box::new(EmptyScorer));
185203
};
@@ -212,7 +230,7 @@ fn search_on_json_numerical_field(
212230
boost: Score,
213231
) -> crate::Result<Box<dyn Scorer>> {
214232
// Since we don't know which type was interpolated for the internal column we
215-
// have to check for all types (only one exists)
233+
// have to check for all numeric types (only one exists)
216234
let allowed_column_types: Option<&[ColumnType]> =
217235
Some(&[ColumnType::F64, ColumnType::I64, ColumnType::U64]);
218236
let fast_field_reader = reader.fast_fields();
@@ -455,7 +473,8 @@ pub mod tests {
455473
use crate::query::range_query::range_query_u64_fastfield::FastFieldRangeWeight;
456474
use crate::query::{QueryParser, RangeQuery, Weight};
457475
use crate::schema::{
458-
Field, NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING, TEXT,
476+
DateOptions, Field, NumericOptions, Schema, SchemaBuilder, FAST, INDEXED, STORED, STRING,
477+
TEXT,
459478
};
460479
use crate::{Index, IndexWriter, Term, TERMINATED};
461480

@@ -518,6 +537,89 @@ pub mod tests {
518537
Ok(())
519538
}
520539

540+
#[test]
541+
fn test_date_range_query() {
542+
let mut schema_builder = Schema::builder();
543+
let options = DateOptions::default()
544+
.set_precision(common::DateTimePrecision::Microseconds)
545+
.set_fast();
546+
let date_field = schema_builder.add_date_field("date", options);
547+
let schema = schema_builder.build();
548+
549+
let index = Index::create_in_ram(schema.clone());
550+
{
551+
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
552+
// This is added as a `DateTime` value (not a string) on a date field configured with `set_fast()`.
553+
index_writer
554+
.add_document(doc!(date_field => DateTime::from_utc(
555+
OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap(),
556+
)))
557+
.unwrap();
558+
index_writer
559+
.add_document(doc!(date_field => DateTime::from_utc(
560+
OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap(),
561+
)))
562+
.unwrap();
563+
index_writer
564+
.add_document(doc!(date_field => DateTime::from_utc(
565+
OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(),
566+
)))
567+
.unwrap();
568+
index_writer.commit().unwrap();
569+
}
570+
571+
// Date field
572+
let dt1 =
573+
DateTime::from_utc(OffsetDateTime::parse("2022-12-01T00:00:01Z", &Rfc3339).unwrap());
574+
let dt2 =
575+
DateTime::from_utc(OffsetDateTime::parse("2023-12-01T00:00:01Z", &Rfc3339).unwrap());
576+
let dt3 = DateTime::from_utc(
577+
OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(),
578+
);
579+
let dt4 = DateTime::from_utc(
580+
OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(),
581+
);
582+
583+
let reader = index.reader().unwrap();
584+
let searcher = reader.searcher();
585+
let count = |range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
586+
assert_eq!(
587+
count(RangeQuery::new(
588+
Bound::Included(Term::from_field_date(date_field, dt3)),
589+
Bound::Excluded(Term::from_field_date(date_field, dt4)),
590+
)),
591+
1
592+
);
593+
assert_eq!(
594+
count(RangeQuery::new(
595+
Bound::Included(Term::from_field_date(date_field, dt3)),
596+
Bound::Included(Term::from_field_date(date_field, dt4)),
597+
)),
598+
1
599+
);
600+
assert_eq!(
601+
count(RangeQuery::new(
602+
Bound::Included(Term::from_field_date(date_field, dt1)),
603+
Bound::Included(Term::from_field_date(date_field, dt2)),
604+
)),
605+
2
606+
);
607+
assert_eq!(
608+
count(RangeQuery::new(
609+
Bound::Included(Term::from_field_date(date_field, dt1)),
610+
Bound::Excluded(Term::from_field_date(date_field, dt2)),
611+
)),
612+
1
613+
);
614+
assert_eq!(
615+
count(RangeQuery::new(
616+
Bound::Excluded(Term::from_field_date(date_field, dt1)),
617+
Bound::Excluded(Term::from_field_date(date_field, dt2)),
618+
)),
619+
0
620+
);
621+
}
622+
521623
fn get_json_term<T: FastValue>(field: Field, path: &str, value: T) -> Term {
522624
let mut term = Term::from_field_json_path(field, path, true);
523625
term.append_type_and_fast_value(value);
@@ -548,6 +650,10 @@ pub mod tests {
548650
"date": "2023-12-01T00:00:01Z"
549651
});
550652
index_writer.add_document(doc!(json_field => doc)).unwrap();
653+
let doc = json!({
654+
"date": "2015-02-01T00:00:00.001Z"
655+
});
656+
index_writer.add_document(doc!(json_field => doc)).unwrap();
551657

552658
index_writer.commit().unwrap();
553659
}
@@ -631,6 +737,13 @@ pub mod tests {
631737
)),
632738
2
633739
);
740+
assert_eq!(
741+
count(RangeQuery::new(
742+
Bound::Included(get_json_term(json_field, "id_i64", 1000i64)),
743+
Bound::Excluded(get_json_term(json_field, "id_i64", 1001i64)),
744+
)),
745+
1
746+
);
634747

635748
// u64 on i64
636749
assert_eq!(
@@ -718,6 +831,18 @@ pub mod tests {
718831
)),
719832
0
720833
);
834+
// Date precision test. We don't want to truncate the precision
835+
let dt3 = DateTime::from_utc(
836+
OffsetDateTime::parse("2015-02-01T00:00:00.001Z", &Rfc3339).unwrap(),
837+
);
838+
let dt4 = DateTime::from_utc(
839+
OffsetDateTime::parse("2015-02-01T00:00:00.002Z", &Rfc3339).unwrap(),
840+
);
841+
let query = RangeQuery::new(
842+
Bound::Included(get_json_term(json_field, "date", dt3)),
843+
Bound::Excluded(get_json_term(json_field, "date", dt4)),
844+
);
845+
assert_eq!(count(query), 1);
721846
}
722847

723848
#[derive(Clone, Debug)]
@@ -796,13 +921,10 @@ pub mod tests {
796921
writer.add_document(doc!(field=>52_000u64)).unwrap();
797922
writer.commit().unwrap();
798923
let searcher = index.reader().unwrap().searcher();
799-
let range_query = FastFieldRangeWeight::new(
800-
field,
801-
BoundsRange::new(
802-
Bound::Included(Term::from_field_u64(field, 50_000)),
803-
Bound::Included(Term::from_field_u64(field, 50_002)),
804-
),
805-
);
924+
let range_query = FastFieldRangeWeight::new(BoundsRange::new(
925+
Bound::Included(Term::from_field_u64(field, 50_000)),
926+
Bound::Included(Term::from_field_u64(field, 50_002)),
927+
));
806928
let scorer = range_query
807929
.scorer(searcher.segment_reader(0), 1.0f32)
808930
.unwrap();
@@ -1158,13 +1280,10 @@ pub mod ip_range_tests {
11581280
}
11591281
writer.commit().unwrap();
11601282
let searcher = index.reader().unwrap().searcher();
1161-
let range_weight = FastFieldRangeWeight::new(
1162-
ips_field,
1163-
BoundsRange::new(
1164-
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])),
1165-
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])),
1166-
),
1167-
);
1283+
let range_weight = FastFieldRangeWeight::new(BoundsRange::new(
1284+
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[1])),
1285+
Bound::Included(Term::from_field_ip_addr(ips_field, ip_addrs[2])),
1286+
));
11681287

11691288
let count =
11701289
crate::query::weight::Weight::count(&range_weight, searcher.segment_reader(0)).unwrap();

0 commit comments

Comments
 (0)