Commit e9f969b

eeeebbbbrrrr authored and philippemnoel committed
adjust Dictionary::sorted_ords_to_term_cb() to allow duplicates (#8)
Use Levenshtein distance to score documents in fuzzy term queries

Fix managed paths (#5)

add RegexPhraseQuery (quickwit-oss#2516)

* add RegexPhraseQuery

RegexPhraseQuery supports phrase queries with regex. It supports regex and wildcards. E.g. a query with wildcards: "b* b* wolf" matches "big bad wolf". Slop is supported as well: "b* wolf"~2 matches "big bad wolf".

Regex queries may match a lot of terms, and we still need to keep track of which term hit in order to load its positions. The phrase query algorithm groups terms by their frequency in the union, so that groups can be prefiltered early.

This PR comes with some new data structures:

SimpleUnion - A union docset for a list of docsets. It doesn't do any caching and is therefore well suited for use cases with lots of skipping (phrase search, but intersections in general).

LoadedPostings - Like SegmentPostings, but all docs and positions are loaded in memory. SegmentPostings uses 1840 bytes per instance with its caches, which is equivalent to 460 doc ids. LoadedPostings is used for terms that have fewer than 100 docs, and only to reduce memory consumption.

BitSetPostingUnion - Creates a `Posting` that uses a bitset for doc-id hits and the docsets for positions. The bitset is the precalculated union of the docsets.

In the RegexPhraseQuery there is a size limit of 512 docsets per PreAggregatedUnion before a new one is created.

Renamed Union to BufferedUnionScorer.

Added proptests to test the different union types.

* cleanup
* use Box instead of Vec
* use RefCell instead of term_freq(&mut)
* remove wildcard mode
* move RefCell to outer
* clippy

clippy (quickwit-oss#2527)

* clippy
* clippy
* clippy
* clippy
* convert allow to expect and remove unused
* cargo fmt
* cleanup
* export sample
* clippy

chore: Fix merge conflict (#11)
1 parent 0cc66a2 commit e9f969b
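
The SimpleUnion described in the commit message is an uncached union over sorted docsets that recomputes its minimum on every advance, which keeps skipping cheap. Below is a rough, self-contained sketch of that idea; the struct name matches the description, but the fields and methods are illustrative assumptions, not tantivy's actual implementation.

/// Illustrative sketch of an uncached union over sorted doc-id lists.
/// Unlike a buffered union, it recomputes the minimum on every call,
/// so frequent seeking/skipping stays cheap.
struct SimpleUnion {
    docsets: Vec<Vec<u32>>, // each inner Vec is a sorted list of doc ids
    cursors: Vec<usize>,    // per-docset read position
}

impl SimpleUnion {
    fn new(docsets: Vec<Vec<u32>>) -> Self {
        let cursors = vec![0; docsets.len()];
        SimpleUnion { docsets, cursors }
    }

    /// Returns the smallest doc id under any cursor, advancing every
    /// cursor that points at it. None once all docsets are exhausted.
    fn next_doc(&mut self) -> Option<u32> {
        let min = self
            .docsets
            .iter()
            .zip(&self.cursors)
            .filter_map(|(docs, &cursor)| docs.get(cursor).copied())
            .min()?;
        for (docs, cursor) in self.docsets.iter().zip(self.cursors.iter_mut()) {
            if docs.get(*cursor).copied() == Some(min) {
                *cursor += 1;
            }
        }
        Some(min)
    }
}

fn main() {
    let mut union = SimpleUnion::new(vec![vec![1, 4, 7], vec![2, 4, 9]]);
    while let Some(doc) = union.next_doc() {
        print!("{doc} "); // prints: 1 2 4 7 9
    }
}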

File tree

4 files changed: +35 −27 lines changed


src/query/automaton_weight.rs

Lines changed: 1 addition & 1 deletion
@@ -101,7 +101,7 @@ where
             scorers.push(scorer);
         }

-        let scorer = Union::build(scorers, SumCombiner::default);
+        let scorer = BufferedUnionScorer::build(scorers, SumCombiner::default);
         Ok(Box::new(scorer))
     }

src/termdict/fst_termdict/streamer.rs

Lines changed: 9 additions & 5 deletions
@@ -11,14 +11,16 @@ use crate::termdict::TermOrdinal;
 /// `TermStreamerBuilder` is a helper object used to define
 /// a range of terms that should be streamed.
 pub struct TermStreamerBuilder<'a, A = AlwaysMatch>
-where A: Automaton
+where
+    A: Automaton,
 {
     fst_map: &'a TermDictionary,
     stream_builder: StreamBuilder<'a, A>,
 }

 impl<'a, A> TermStreamerBuilder<'a, A>
-where A: Automaton
+where
+    A: Automaton,
 {
     pub(crate) fn new(fst_map: &'a TermDictionary, stream_builder: StreamBuilder<'a, A>) -> Self {
         TermStreamerBuilder {
@@ -73,7 +75,8 @@ where A: Automaton
 /// `TermStreamer` acts as a cursor over a range of terms of a segment.
 /// Terms are guaranteed to be sorted.
 pub struct TermStreamer<'a, A = AlwaysMatch>
-where A: Automaton
+where
+    A: Automaton,
 {
     pub(crate) fst_map: &'a TermDictionary,
     pub(crate) stream: Stream<'a, A>,
@@ -82,8 +85,9 @@ where A: Automaton
     current_value: TermInfo,
 }

-impl<'a, A> TermStreamer<'a, A>
-where A: Automaton
+impl<A> TermStreamer<'_, A>
+where
+    A: Automaton,
 {
     /// Advance position the stream on the next item.
     /// Before the first call to `.advance()`, the stream

sstable/src/dictionary.rs

Lines changed: 19 additions & 9 deletions
@@ -443,16 +443,26 @@ impl<TSSTable: SSTable> Dictionary<TSSTable> {
         let mut current_sstable_delta_reader =
             self.sstable_delta_reader_block(current_block_addr.clone())?;
         let mut current_ordinal = 0;
+        let mut prev_ord = None;
         for ord in ord {
-            assert!(ord >= current_ordinal);
-            // check if block changed for new term_ord
-            let new_block_addr = self.sstable_index.get_block_with_ord(ord);
-            if new_block_addr != current_block_addr {
-                current_block_addr = new_block_addr;
-                current_ordinal = current_block_addr.first_ordinal;
-                current_sstable_delta_reader =
-                    self.sstable_delta_reader_block(current_block_addr.clone())?;
-                bytes.clear();
+
+            // only advance forward if the new ord is different than the one we just processed
+            //
+            // this allows the input TermOrdinal iterator to contain duplicates, so long as it's
+            // still sorted
+            if Some(ord) != prev_ord {
+                assert!(ord >= current_ordinal);
+                // check if block changed for new term_ord
+                let new_block_addr = self.sstable_index.get_block_with_ord(ord);
+                if new_block_addr != current_block_addr {
+                    current_block_addr = new_block_addr;
+                    current_ordinal = current_block_addr.first_ordinal;
+                    current_sstable_delta_reader =
+                        self.sstable_delta_reader_block(current_block_addr.clone())?;
+                    bytes.clear();
+                }
+
+                prev_ord = Some(ord);
             }

             // move to ord inside that block
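
The hunk above is the whole behavioral change: remember the last ordinal processed and only advance the sstable cursor when the ordinal actually changes. A minimal standalone sketch of that skip-if-equal-to-previous pattern, with a plain slice standing in for the dictionary's term blocks (the function and its signature are illustrative, not the real Dictionary API):

/// Walks a sorted iterator of term ordinals, invoking `cb` once per input
/// ordinal. Consecutive duplicates reuse the term decoded for the previous
/// ordinal instead of advancing the cursor.
fn sorted_ords_to_term_cb(
    terms: &[&str],                      // stand-in for the sstable's term blocks
    ords: impl Iterator<Item = u64>,
    mut cb: impl FnMut(&str),
) {
    let mut prev_ord: Option<u64> = None;
    let mut current: &str = "";
    for ord in ords {
        // only advance when the ordinal changed; this lets the input contain
        // duplicates, as long as it is still sorted
        if Some(ord) != prev_ord {
            assert!(prev_ord.map_or(true, |p| ord >= p), "input must be sorted");
            current = terms[ord as usize];
            prev_ord = Some(ord);
        }
        cb(current);
    }
}

fn main() {
    let terms = ["apple", "banana", "cherry"];
    // the duplicated ordinal 1 is now accepted
    sorted_ords_to_term_cb(&terms, [0, 1, 1, 2].into_iter(), |t| println!("{t}"));
}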

tests/fuzzy_scoring.rs

Lines changed: 6 additions & 12 deletions
@@ -3,8 +3,8 @@ mod test {
     use maplit::hashmap;
     use tantivy::collector::TopDocs;
     use tantivy::query::FuzzyTermQuery;
-    use tantivy::schema::{Schema, STORED, TEXT};
-    use tantivy::{doc, Index, Term};
+    use tantivy::schema::{Schema, Value, STORED, TEXT};
+    use tantivy::{doc, Index, TantivyDocument, Term};

     #[test]
     pub fn test_fuzzy_term() {
@@ -100,8 +100,8 @@ mod test {

         // Print out the scores and documents retrieved by the search.
         for (score, adr) in &top_docs {
-            let doc = searcher.doc(*adr).expect("document");
-            println!("{score}, {:?}", doc.field_values().first().unwrap().value);
+            let doc: TantivyDocument = searcher.doc(*adr).expect("document");
+            println!("{score}, {:?}", doc.field_values().next().unwrap().1);
         }

         // Assert that 17 documents match the fuzzy query criteria.
@@ -111,14 +111,8 @@ mod test {

         // Check the scores of the returned documents against the expected scores.
         for (score, adr) in &top_docs {
-            let doc = searcher.doc(*adr).expect("document");
-            let doc_text = doc
-                .field_values()
-                .first()
-                .unwrap()
-                .value()
-                .as_text()
-                .unwrap();
+            let doc: TantivyDocument = searcher.doc(*adr).expect("document");
+            let doc_text = doc.field_values().next().unwrap().1.as_str().unwrap();

             // Ensure the retrieved score for each document is close to the expected score.
             assert!(
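
The updated test reads the stored text through whichever field comes first in field_values(). A hedged alternative sketch that names the field explicitly, assuming the test's schema registers a text field (hypothetically called "title" here) and the get_first accessor of the typed-document API used in this diff:

// hypothetical variant of the loop above; `schema`, `searcher`, and
// `top_docs` come from the surrounding test, and `Value` must be in
// scope for `as_str()`
let title = schema.get_field("title").expect("field in schema"); // field name is an assumption
for (score, adr) in &top_docs {
    let doc: TantivyDocument = searcher.doc(*adr).expect("document");
    // looking the field up by handle avoids depending on iteration order
    let doc_text = doc.get_first(title).and_then(|v| v.as_str()).unwrap_or("");
    println!("{score}, {doc_text}");
}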
