Commit e9f969b

eeeebbbbrrrr authored and philippemnoel committed
adjust Dictionary::sorted_ords_to_term_cb() to allow duplicates (#8)
Use Levenshtein distance to score documents in fuzzy term queries

Fix managed paths (#5)

add RegexPhraseQuery (quickwit-oss#2516)

* add RegexPhraseQuery

RegexPhraseQuery supports phrase queries with regex. It supports regex and wildcards. E.g. a query with wildcards: "b* b* wolf" matches "big bad wolf". Slop is supported as well: "b* wolf"~2 matches "big bad wolf".

Regex queries may match a lot of terms, and we still need to keep track of which term hit in order to load its positions. The phrase query algorithm groups terms by their frequency in the union, so that groups can be prefiltered early.

This PR comes with some new data structures:

SimpleUnion - A union docset for a list of docsets. It doesn't do any caching and is therefore well suited for use cases with lots of skipping (phrase search, but intersections in general).

LoadedPostings - Like SegmentPostings, but all docs and positions are loaded in memory. SegmentPostings uses 1840 bytes per instance with its caches, which is equivalent to 460 doc ids. LoadedPostings is used for terms that have fewer than 100 docs, and only to reduce memory consumption.

BitSetPostingUnion - Creates a `Posting` that uses a bitset for doc-id hits and the docsets for positions. The bitset is the precalculated union of the docsets.

In the RegexPhraseQuery there is a size limit of 512 docsets per PreAggregatedUnion before a new one is created.

Renamed Union to BufferedUnionScorer.

Added proptests to test the different union types.

* cleanup
* use Box instead of Vec
* use RefCell instead of term_freq(&mut)
* remove wildcard mode
* move RefCell to outer
* clippy

clippy (quickwit-oss#2527)

* clippy
* clippy
* clippy
* clippy
* convert allow to expect and remove unused
* cargo fmt
* cleanup
* export sample
* clippy

chore: Fix merge conflict (#11)
1 parent 0cc66a2 commit e9f969b
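
The SimpleUnion described in the commit message is an uncached union over sorted docsets that recomputes its minimum on every advance, which keeps skipping cheap. Below is a rough, self-contained sketch of that idea; the struct name matches the description, but the fields and methods are illustrative assumptions, not tantivy's actual implementation.

/// Illustrative sketch of an uncached union over sorted doc-id lists.
/// Unlike a buffered union, it recomputes the minimum on every call,
/// so frequent seeking/skipping stays cheap.
struct SimpleUnion {
    docsets: Vec<Vec<u32>>, // each inner Vec is a sorted list of doc ids
    cursors: Vec<usize>,    // per-docset read position
}

impl SimpleUnion {
    fn new(docsets: Vec<Vec<u32>>) -> Self {
        let cursors = vec![0; docsets.len()];
        SimpleUnion { docsets, cursors }
    }

    /// Returns the smallest doc id under any cursor, advancing every
    /// cursor that points at it. None once all docsets are exhausted.
    fn next_doc(&mut self) -> Option<u32> {
        let min = self
            .docsets
            .iter()
            .zip(&self.cursors)
            .filter_map(|(docs, &cursor)| docs.get(cursor).copied())
            .min()?;
        for (docs, cursor) in self.docsets.iter().zip(self.cursors.iter_mut()) {
            if docs.get(*cursor).copied() == Some(min) {
                *cursor += 1;
            }
        }
        Some(min)
    }
}

fn main() {
    let mut union = SimpleUnion::new(vec![vec![1, 4, 7], vec![2, 4, 9]]);
    while let Some(doc) = union.next_doc() {
        print!("{doc} "); // prints: 1 2 4 7 9
    }
}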

File tree

4 files changed: +35 −27 lines changed


src/query/automaton_weight.rs

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ where
             scorers.push(scorer);
         }

-        let scorer = Union::build(scorers, SumCombiner::default);
+        let scorer = BufferedUnionScorer::build(scorers, SumCombiner::default);
         Ok(Box::new(scorer))
     }

src/termdict/fst_termdict/streamer.rs

Lines changed: 9 additions & 5 deletions
@@ -11,14 +11,16 @@ use crate::termdict::TermOrdinal;
 /// `TermStreamerBuilder` is a helper object used to define
 /// a range of terms that should be streamed.
 pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
-where A: Automaton
+where
+    A: Automaton,
 {
     fst_map: &'a TermDictionary,
     stream_builder: StreamBuilder<'a, A>,
 }

 impl<'a, A> TermStreamerBuilder<'a, A>
-where A: Automaton
+where
+    A: Automaton,
 {
     pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
         TermStreamerBuilder {
@@ -73,7 +75,8 @@ where A: Automaton
 /// `TermStreamer` acts as a cursor over a range of terms of a segment.
 /// Terms are guaranteed to be sorted.
 pub struct TermStreamer<'a, A = AlwaysMatch>
-where A: Automaton
+where
+    A: Automaton,
 {
     pub(crate) fst_map: &'a TermDictionary,
     pub(crate) stream: Stream<'a, A>,
@@ -82,8 +85,9 @@ where A: Automaton
     current_value: TermInfo,
 }

-impl<'a, A> TermStreamer<'a, A>
-where A: Automaton
+impl<A> TermStreamer<'_, A>
+where
+    A: Automaton,
 {
     /// Advance position the stream on the next item.
     /// Before the first call to `.advance()`, the stream

sstable/src/dictionary.rs

Lines changed: 19 additions & 9 deletions
@@ -443,16 +443,26 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
         let mut current_sstable_delta_reader =
             self.sstable_delta_reader_block(current_block_addr.clone())?;
         let mut current_ordinal = 0;
+        let mut prev_ord = None;
         for ord in ord {
-            assert!(ord >= current_ordinal);
-            // check if block changed for new term_ord
-            let new_block_addr = self.sstable_index.get_block_with_ord(ord);
-            if new_block_addr != current_block_addr {
-                current_block_addr = new_block_addr;
-                current_ordinal = current_block_addr.first_ordinal;
-                current_sstable_delta_reader =
-                    self.sstable_delta_reader_block(current_block_addr.clone())?;
-                bytes.clear();
+
+            // only advance forward if the new ord is different than the one we just processed
+            //
+            // this allows the input TermOrdinal iterator to contain duplicates, so long as it's
+            // still sorted
+            if Some(ord) != prev_ord {
+                assert!(ord >= current_ordinal);
+                // check if block changed for new term_ord
+                let new_block_addr = self.sstable_index.get_block_with_ord(ord);
+                if new_block_addr != current_block_addr {
+                    current_block_addr = new_block_addr;
+                    current_ordinal = current_block_addr.first_ordinal;
+                    current_sstable_delta_reader =
+                        self.sstable_delta_reader_block(current_block_addr.clone())?;
+                    bytes.clear();
+                }
+
+                prev_ord = Some(ord);
             }

             // move to ord inside that block
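
The hunk above is the whole behavioral change: remember the last ordinal processed and only advance the sstable cursor when the ordinal actually changes. A minimal standalone sketch of that skip-if-equal-to-previous pattern, with a plain slice standing in for the dictionary's term blocks (the function and its signature are illustrative, not the real Dictionary API):

/// Walks a sorted iterator of term ordinals, invoking `cb` once per input
/// ordinal. Consecutive duplicates reuse the term decoded for the previous
/// ordinal instead of advancing the cursor.
fn sorted_ords_to_term_cb(
    terms: &[&str],                      // stand-in for the sstable's term blocks
    ords: impl Iterator<Item = u64>,
    mut cb: impl FnMut(&str),
) {
    let mut prev_ord: Option<u64> = None;
    let mut current: &str = "";
    for ord in ords {
        // only advance when the ordinal changed; this lets the input contain
        // duplicates, as long as it is still sorted
        if Some(ord) != prev_ord {
            assert!(prev_ord.map_or(true, |p| ord >= p), "input must be sorted");
            current = terms[ord as usize];
            prev_ord = Some(ord);
        }
        cb(current);
    }
}

fn main() {
    let terms = ["apple", "banana", "cherry"];
    // the duplicated ordinal 1 is now accepted
    sorted_ords_to_term_cb(&terms, [0, 1, 1, 2].into_iter(), |t| println!("{t}"));
}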

tests/fuzzy_scoring.rs

Lines changed: 6 additions & 12 deletions
@@ -3,8 +3,8 @@ mod test {
     use maplit::hashmap;
     use tantivy::collector::TopDocs;
     use tantivy::query::FuzzyTermQuery;
-    use tantivy::schema::{Schema, STORED, TEXT};
-    use tantivy::{doc, Index, Term};
+    use tantivy::schema::{Schema, Value, STORED, TEXT};
+    use tantivy::{doc, Index, TantivyDocument, Term};

     #[test]
     pub fn test_fuzzy_term() {
@@ -100,8 +100,8 @@ mod test {

         // Print out the scores and documents retrieved by the search.
         for (score, adr) in &top_docs {
-            let doc = searcher.doc(*adr).expect("document");
-            println!("{score}, {:?}", doc.field_values().first().unwrap().value);
+            let doc: TantivyDocument = searcher.doc(*adr).expect("document");
+            println!("{score}, {:?}", doc.field_values().next().unwrap().1);
         }

         // Assert that 17 documents match the fuzzy query criteria.
@@ -111,14 +111,8 @@ mod test {

         // Check the scores of the returned documents against the expected scores.
         for (score, adr) in &top_docs {
-            let doc = searcher.doc(*adr).expect("document");
-            let doc_text = doc
-                .field_values()
-                .first()
-                .unwrap()
-                .value()
-                .as_text()
-                .unwrap();
+            let doc: TantivyDocument = searcher.doc(*adr).expect("document");
+            let doc_text = doc.field_values().next().unwrap().1.as_str().unwrap();

             // Ensure the retrieved score for each document is close to the expected score.
             assert!(
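
The updated test reads the stored text through whichever field comes first in field_values(). A hedged alternative sketch that names the field explicitly, assuming the test's schema registers a text field (hypothetically called "title" here) and the get_first accessor of the typed-document API used in this diff:

// hypothetical variant of the loop above; `schema`, `searcher`, and
// `top_docs` come from the surrounding test, and `Value` must be in
// scope for `as_str()`
let title = schema.get_field("title").expect("field in schema"); // field name is an assumption
for (score, adr) in &top_docs {
    let doc: TantivyDocument = searcher.doc(*adr).expect("document");
    // looking the field up by handle avoids depending on iteration order
    let doc_text = doc.get_first(title).and_then(|v| v.as_str()).unwrap_or("");
    println!("{score}, {doc_text}");
}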
