From 137906ff29f4373280282a41c37de0be59fd464a Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 12 Jan 2018 21:01:28 +0900 Subject: [PATCH] Fixing PhraseQuery, broken due to the reordering of the intersection clauses. Closes #234 --- src/query/phrase_query/mod.rs | 44 ++++++++++++++ src/query/phrase_query/phrase_scorer.rs | 78 +++++++++++++++++++++---- src/query/phrase_query/phrase_weight.rs | 14 ++--- 3 files changed, 117 insertions(+), 19 deletions(-) diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs index b4ef40380..8e78394cb 100644 --- a/src/query/phrase_query/mod.rs +++ b/src/query/phrase_query/mod.rs @@ -6,6 +6,7 @@ pub use self::phrase_query::PhraseQuery; pub use self::phrase_weight::PhraseWeight; pub use self::phrase_scorer::PhraseScorer; + #[cfg(test)] mod tests { @@ -74,4 +75,47 @@ mod tests { assert_eq!(test_query(vec!["g", "a"]), empty_vec); } + + + #[test] // motivated by #234 + pub fn test_phrase_query_docfreq_order() { + let mut schema_builder = SchemaBuilder::default(); + let text_field = schema_builder.add_text_field("text", TEXT); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema); + { + let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap(); + { + // 0 + let doc = doc!(text_field=>"b"); + index_writer.add_document(doc); + } + { // 1 + let doc = doc!(text_field=>"a b"); + index_writer.add_document(doc); + } + { // 2 + let doc = doc!(text_field=>"b a"); + index_writer.add_document(doc); + } + assert!(index_writer.commit().is_ok()); + } + + index.load_searchers().unwrap(); + let searcher = index.searcher(); + let test_query = |texts: Vec<&str>| { + let mut test_collector = TestCollector::default(); + let terms: Vec = texts + .iter() + .map(|text| Term::from_field_text(text_field, text)) + .collect(); + let phrase_query = PhraseQuery::from(terms); + searcher + .search(&phrase_query, &mut test_collector) + .expect("search should succeed"); + test_collector.docs() + }; + assert_eq!(test_query(vec!["a", "b"]), vec![1]); + assert_eq!(test_query(vec!["b", "a"]), vec![2]); + } } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 97ef11372..4b7a26095 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -1,21 +1,79 @@ use query::Scorer; -use DocSet; -use postings::SegmentPostings; -use postings::Postings; -use postings::IntersectionDocSet; use DocId; +use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings}; + +struct PostingsWithOffset { + offset: u32, + segment_postings: SegmentPostings, +} + +impl PostingsWithOffset { + pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset { + PostingsWithOffset { + offset, + segment_postings + } + } +} + +impl Postings for PostingsWithOffset { + fn term_freq(&self) -> u32 { + self.segment_postings.term_freq() + } + + fn positions(&self) -> &[u32] { + self.segment_postings.positions() + } +} + +impl DocSet for PostingsWithOffset { + fn advance(&mut self) -> bool { + self.segment_postings.advance() + } + + fn doc(&self) -> DocId { + self.segment_postings.doc() + } + + fn size_hint(&self) -> usize { + self.segment_postings.size_hint() + } + + fn skip_next(&mut self, target: DocId) -> SkipResult { + self.segment_postings.skip_next(target) + } +} pub struct PhraseScorer { - pub intersection_docset: IntersectionDocSet, + intersection_docset: IntersectionDocSet, } impl PhraseScorer { - fn phrase_match(&self) -> bool { - let mut positions_arr: Vec<&[u32]> = self.intersection_docset - .docsets() - .iter() - .map(|posting| posting.positions()) + + pub fn new(term_postings: Vec) -> PhraseScorer { + let postings_with_offsets: Vec<_> = term_postings + .into_iter() + .enumerate() + .map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32)) .collect(); + PhraseScorer { + intersection_docset: IntersectionDocSet::from(postings_with_offsets) + } + } + + fn phrase_match(&self) -> bool { + + // TODO maybe we could avoid decoding positions lazily for all terms + // when there is > 2 terms. + // + // For instance for the query "A B C", the position of "C" do not need + // to be decoded if "A B" had no match. + let docsets = self.intersection_docset.docsets(); + let mut positions_arr: Vec<&[u32]> = vec![&[]; docsets.len()]; + for docset in docsets { + positions_arr[docset.offset as usize] = docset.positions(); + } + let num_postings = positions_arr.len() as u32; diff --git a/src/query/phrase_query/phrase_weight.rs b/src/query/phrase_query/phrase_weight.rs index 3135526e1..5c73bd2a0 100644 --- a/src/query/phrase_query/phrase_weight.rs +++ b/src/query/phrase_query/phrase_weight.rs @@ -4,7 +4,6 @@ use schema::Term; use schema::IndexRecordOption; use core::SegmentReader; use super::PhraseScorer; -use postings::IntersectionDocSet; use query::EmptyScorer; use Result; @@ -22,17 +21,14 @@ impl Weight for PhraseWeight { fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result> { let mut term_postings_list = Vec::new(); for term in &self.phrase_terms { - let inverted_index = reader.inverted_index(term.field()); - let term_postings_option = - inverted_index.read_postings(term, IndexRecordOption::WithFreqsAndPositions); - if let Some(term_postings) = term_postings_option { - term_postings_list.push(term_postings); + if let Some(postings) = reader + .inverted_index(term.field()) + .read_postings(term, IndexRecordOption::WithFreqsAndPositions) { + term_postings_list.push(postings); } else { return Ok(box EmptyScorer); } } - Ok(box PhraseScorer { - intersection_docset: IntersectionDocSet::from(term_postings_list), - }) + Ok(box PhraseScorer::new(term_postings_list)) } }