Fixing PhraseQuery, broken due to the reordering of the intersection clauses.

Closes #234
2026-06-05 01:50:42 +00:00 · 2018-01-12 21:01:28 +09:00
parent 143a143cde
commit 137906ff29
3 changed files with 117 additions and 19 deletions
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -6,6 +6,7 @@ pub use self::phrase_query::PhraseQuery;
 pub use self::phrase_weight::PhraseWeight;
 pub use self::phrase_scorer::PhraseScorer;

+
 #[cfg(test)]
 mod tests {

@@ -74,4 +75,47 @@ mod tests {
        assert_eq!(test_query(vec!["g", "a"]), empty_vec);
    }

+
+
+    #[test] // motivated by #234
+    pub fn test_phrase_query_docfreq_order() {
+        let mut schema_builder = SchemaBuilder::default();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
+            {
+                // 0
+                let doc = doc!(text_field=>"b");
+                index_writer.add_document(doc);
+            }
+            {   // 1
+                let doc = doc!(text_field=>"a b");
+                index_writer.add_document(doc);
+            }
+            {   // 2
+                let doc = doc!(text_field=>"b a");
+                index_writer.add_document(doc);
+            }
+            assert!(index_writer.commit().is_ok());
+        }
+
+        index.load_searchers().unwrap();
+        let searcher = index.searcher();
+        let test_query = |texts: Vec<&str>| {
+            let mut test_collector = TestCollector::default();
+            let terms: Vec<Term> = texts
+                .iter()
+                .map(|text| Term::from_field_text(text_field, text))
+                .collect();
+            let phrase_query = PhraseQuery::from(terms);
+            searcher
+                .search(&phrase_query, &mut test_collector)
+                .expect("search should succeed");
+            test_collector.docs()
+        };
+        assert_eq!(test_query(vec!["a", "b"]), vec![1]);
+        assert_eq!(test_query(vec!["b", "a"]), vec![2]);
+    }
 }
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -1,21 +1,79 @@
 use query::Scorer;
-use DocSet;
-use postings::SegmentPostings;
-use postings::Postings;
-use postings::IntersectionDocSet;
 use DocId;
+use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings};
+
+struct PostingsWithOffset {
+    offset: u32,
+    segment_postings: SegmentPostings,
+}
+
+impl PostingsWithOffset {
+    pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
+        PostingsWithOffset {
+            offset,
+            segment_postings
+        }
+    }
+}
+
+impl Postings for PostingsWithOffset {
+    fn term_freq(&self) -> u32 {
+        self.segment_postings.term_freq()
+    }
+
+    fn positions(&self) -> &[u32] {
+        self.segment_postings.positions()
+    }
+}
+
+impl DocSet for PostingsWithOffset {
+    fn advance(&mut self) -> bool {
+        self.segment_postings.advance()
+    }
+
+    fn doc(&self) -> DocId {
+        self.segment_postings.doc()
+    }
+
+    fn size_hint(&self) -> usize {
+        self.segment_postings.size_hint()
+    }
+
+    fn skip_next(&mut self, target: DocId) -> SkipResult {
+        self.segment_postings.skip_next(target)
+    }
+}

 pub struct PhraseScorer {
-    pub intersection_docset: IntersectionDocSet<SegmentPostings>,
+    intersection_docset: IntersectionDocSet<PostingsWithOffset>,
 }

 impl PhraseScorer {
-    fn phrase_match(&self) -> bool {
-        let mut positions_arr: Vec<&[u32]> = self.intersection_docset
-            .docsets()
-            .iter()
-            .map(|posting| posting.positions())
+
+    pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
+        let postings_with_offsets: Vec<_> = term_postings
+            .into_iter()
+            .enumerate()
+            .map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32))
            .collect();
+        PhraseScorer {
+            intersection_docset: IntersectionDocSet::from(postings_with_offsets)
+        }
+    }
+
+    fn phrase_match(&self) -> bool {
+
+        // TODO maybe we could avoid decoding positions lazily for all terms
+        // when there is > 2 terms.
+        //
+        // For instance for the query "A B C", the position of "C" do not need
+        // to be decoded if "A B" had no match.
+        let docsets = self.intersection_docset.docsets();
+        let mut positions_arr: Vec<&[u32]> = vec![&[]; docsets.len()];
+        for docset in docsets {
+            positions_arr[docset.offset as usize] = docset.positions();
+        }
+

        let num_postings = positions_arr.len() as u32;

--- a/src/query/phrase_query/phrase_weight.rs
+++ b/src/query/phrase_query/phrase_weight.rs
@@ -4,7 +4,6 @@ use schema::Term;
 use schema::IndexRecordOption;
 use core::SegmentReader;
 use super::PhraseScorer;
-use postings::IntersectionDocSet;
 use query::EmptyScorer;
 use Result;

@@ -22,17 +21,14 @@ impl Weight for PhraseWeight {
    fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
        let mut term_postings_list = Vec::new();
        for term in &self.phrase_terms {
-            let inverted_index = reader.inverted_index(term.field());
-            let term_postings_option =
-                inverted_index.read_postings(term, IndexRecordOption::WithFreqsAndPositions);
-            if let Some(term_postings) = term_postings_option {
-                term_postings_list.push(term_postings);
+            if let Some(postings) = reader
+                .inverted_index(term.field())
+                .read_postings(term, IndexRecordOption::WithFreqsAndPositions) {
+                term_postings_list.push(postings);
            } else {
                return Ok(box EmptyScorer);
            }
        }
-        Ok(box PhraseScorer {
-            intersection_docset: IntersectionDocSet::from(term_postings_list),
-        })
+        Ok(box PhraseScorer::new(term_postings_list))
    }
 }