Fixing PhraseQuery, broken due to the reordering of the intersection clauses.

Closes #234
This commit is contained in:
Paul Masurel
2018-01-12 21:01:28 +09:00
parent 143a143cde
commit 137906ff29
3 changed files with 117 additions and 19 deletions

View File

@@ -6,6 +6,7 @@ pub use self::phrase_query::PhraseQuery;
pub use self::phrase_weight::PhraseWeight;
pub use self::phrase_scorer::PhraseScorer;
#[cfg(test)]
mod tests {
@@ -74,4 +75,47 @@ mod tests {
assert_eq!(test_query(vec!["g", "a"]), empty_vec);
}
#[test] // motivated by #234
pub fn test_phrase_query_docfreq_order() {
let mut schema_builder = SchemaBuilder::default();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
{
// 0
let doc = doc!(text_field=>"b");
index_writer.add_document(doc);
}
{ // 1
let doc = doc!(text_field=>"a b");
index_writer.add_document(doc);
}
{ // 2
let doc = doc!(text_field=>"b a");
index_writer.add_document(doc);
}
assert!(index_writer.commit().is_ok());
}
index.load_searchers().unwrap();
let searcher = index.searcher();
let test_query = |texts: Vec<&str>| {
let mut test_collector = TestCollector::default();
let terms: Vec<Term> = texts
.iter()
.map(|text| Term::from_field_text(text_field, text))
.collect();
let phrase_query = PhraseQuery::from(terms);
searcher
.search(&phrase_query, &mut test_collector)
.expect("search should succeed");
test_collector.docs()
};
assert_eq!(test_query(vec!["a", "b"]), vec![1]);
assert_eq!(test_query(vec!["b", "a"]), vec![2]);
}
}

View File

@@ -1,21 +1,79 @@
use query::Scorer;
use DocSet;
use postings::SegmentPostings;
use postings::Postings;
use postings::IntersectionDocSet;
use DocId;
use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings};
struct PostingsWithOffset {
offset: u32,
segment_postings: SegmentPostings,
}
impl PostingsWithOffset {
pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
PostingsWithOffset {
offset,
segment_postings
}
}
}
impl Postings for PostingsWithOffset {
fn term_freq(&self) -> u32 {
self.segment_postings.term_freq()
}
fn positions(&self) -> &[u32] {
self.segment_postings.positions()
}
}
impl DocSet for PostingsWithOffset {
fn advance(&mut self) -> bool {
self.segment_postings.advance()
}
fn doc(&self) -> DocId {
self.segment_postings.doc()
}
fn size_hint(&self) -> usize {
self.segment_postings.size_hint()
}
fn skip_next(&mut self, target: DocId) -> SkipResult {
self.segment_postings.skip_next(target)
}
}
pub struct PhraseScorer {
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
intersection_docset: IntersectionDocSet<PostingsWithOffset>,
}
impl PhraseScorer {
fn phrase_match(&self) -> bool {
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
.docsets()
.iter()
.map(|posting| posting.positions())
pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
let postings_with_offsets: Vec<_> = term_postings
.into_iter()
.enumerate()
.map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32))
.collect();
PhraseScorer {
intersection_docset: IntersectionDocSet::from(postings_with_offsets)
}
}
fn phrase_match(&self) -> bool {
// TODO maybe we could avoid decoding positions lazily for all terms
// when there is > 2 terms.
//
// For instance for the query "A B C", the position of "C" do not need
// to be decoded if "A B" had no match.
let docsets = self.intersection_docset.docsets();
let mut positions_arr: Vec<&[u32]> = vec![&[]; docsets.len()];
for docset in docsets {
positions_arr[docset.offset as usize] = docset.positions();
}
let num_postings = positions_arr.len() as u32;

View File

@@ -4,7 +4,6 @@ use schema::Term;
use schema::IndexRecordOption;
use core::SegmentReader;
use super::PhraseScorer;
use postings::IntersectionDocSet;
use query::EmptyScorer;
use Result;
@@ -22,17 +21,14 @@ impl Weight for PhraseWeight {
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
let mut term_postings_list = Vec::new();
for term in &self.phrase_terms {
let inverted_index = reader.inverted_index(term.field());
let term_postings_option =
inverted_index.read_postings(term, IndexRecordOption::WithFreqsAndPositions);
if let Some(term_postings) = term_postings_option {
term_postings_list.push(term_postings);
if let Some(postings) = reader
.inverted_index(term.field())
.read_postings(term, IndexRecordOption::WithFreqsAndPositions) {
term_postings_list.push(postings);
} else {
return Ok(box EmptyScorer);
}
}
Ok(box PhraseScorer {
intersection_docset: IntersectionDocSet::from(term_postings_list),
})
Ok(box PhraseScorer::new(term_postings_list))
}
}