mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Fixing PhraseQuery, broken due to the reordering of the intersection clauses.
Closes #234
This commit is contained in:
@@ -6,6 +6,7 @@ pub use self::phrase_query::PhraseQuery;
|
||||
pub use self::phrase_weight::PhraseWeight;
|
||||
pub use self::phrase_scorer::PhraseScorer;
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -74,4 +75,47 @@ mod tests {
|
||||
assert_eq!(test_query(vec!["g", "a"]), empty_vec);
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[test] // motivated by #234
|
||||
pub fn test_phrase_query_docfreq_order() {
|
||||
let mut schema_builder = SchemaBuilder::default();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 40_000_000).unwrap();
|
||||
{
|
||||
// 0
|
||||
let doc = doc!(text_field=>"b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 1
|
||||
let doc = doc!(text_field=>"a b");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
{ // 2
|
||||
let doc = doc!(text_field=>"b a");
|
||||
index_writer.add_document(doc);
|
||||
}
|
||||
assert!(index_writer.commit().is_ok());
|
||||
}
|
||||
|
||||
index.load_searchers().unwrap();
|
||||
let searcher = index.searcher();
|
||||
let test_query = |texts: Vec<&str>| {
|
||||
let mut test_collector = TestCollector::default();
|
||||
let terms: Vec<Term> = texts
|
||||
.iter()
|
||||
.map(|text| Term::from_field_text(text_field, text))
|
||||
.collect();
|
||||
let phrase_query = PhraseQuery::from(terms);
|
||||
searcher
|
||||
.search(&phrase_query, &mut test_collector)
|
||||
.expect("search should succeed");
|
||||
test_collector.docs()
|
||||
};
|
||||
assert_eq!(test_query(vec!["a", "b"]), vec![1]);
|
||||
assert_eq!(test_query(vec!["b", "a"]), vec![2]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,21 +1,79 @@
|
||||
use query::Scorer;
|
||||
use DocSet;
|
||||
use postings::SegmentPostings;
|
||||
use postings::Postings;
|
||||
use postings::IntersectionDocSet;
|
||||
use DocId;
|
||||
use postings::{SkipResult, IntersectionDocSet, DocSet, Postings, SegmentPostings};
|
||||
|
||||
struct PostingsWithOffset {
|
||||
offset: u32,
|
||||
segment_postings: SegmentPostings,
|
||||
}
|
||||
|
||||
impl PostingsWithOffset {
|
||||
pub fn new(segment_postings: SegmentPostings, offset: u32) -> PostingsWithOffset {
|
||||
PostingsWithOffset {
|
||||
offset,
|
||||
segment_postings
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Postings for PostingsWithOffset {
|
||||
fn term_freq(&self) -> u32 {
|
||||
self.segment_postings.term_freq()
|
||||
}
|
||||
|
||||
fn positions(&self) -> &[u32] {
|
||||
self.segment_postings.positions()
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for PostingsWithOffset {
|
||||
fn advance(&mut self) -> bool {
|
||||
self.segment_postings.advance()
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.segment_postings.doc()
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> usize {
|
||||
self.segment_postings.size_hint()
|
||||
}
|
||||
|
||||
fn skip_next(&mut self, target: DocId) -> SkipResult {
|
||||
self.segment_postings.skip_next(target)
|
||||
}
|
||||
}
|
||||
|
||||
pub struct PhraseScorer {
|
||||
pub intersection_docset: IntersectionDocSet<SegmentPostings>,
|
||||
intersection_docset: IntersectionDocSet<PostingsWithOffset>,
|
||||
}
|
||||
|
||||
impl PhraseScorer {
|
||||
fn phrase_match(&self) -> bool {
|
||||
let mut positions_arr: Vec<&[u32]> = self.intersection_docset
|
||||
.docsets()
|
||||
.iter()
|
||||
.map(|posting| posting.positions())
|
||||
|
||||
pub fn new(term_postings: Vec<SegmentPostings>) -> PhraseScorer {
|
||||
let postings_with_offsets: Vec<_> = term_postings
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(offset, postings)| PostingsWithOffset::new(postings, offset as u32))
|
||||
.collect();
|
||||
PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(postings_with_offsets)
|
||||
}
|
||||
}
|
||||
|
||||
fn phrase_match(&self) -> bool {
|
||||
|
||||
// TODO maybe we could avoid decoding positions lazily for all terms
|
||||
// when there is > 2 terms.
|
||||
//
|
||||
// For instance for the query "A B C", the position of "C" do not need
|
||||
// to be decoded if "A B" had no match.
|
||||
let docsets = self.intersection_docset.docsets();
|
||||
let mut positions_arr: Vec<&[u32]> = vec![&[]; docsets.len()];
|
||||
for docset in docsets {
|
||||
positions_arr[docset.offset as usize] = docset.positions();
|
||||
}
|
||||
|
||||
|
||||
let num_postings = positions_arr.len() as u32;
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@ use schema::Term;
|
||||
use schema::IndexRecordOption;
|
||||
use core::SegmentReader;
|
||||
use super::PhraseScorer;
|
||||
use postings::IntersectionDocSet;
|
||||
use query::EmptyScorer;
|
||||
use Result;
|
||||
|
||||
@@ -22,17 +21,14 @@ impl Weight for PhraseWeight {
|
||||
fn scorer<'a>(&'a self, reader: &'a SegmentReader) -> Result<Box<Scorer + 'a>> {
|
||||
let mut term_postings_list = Vec::new();
|
||||
for term in &self.phrase_terms {
|
||||
let inverted_index = reader.inverted_index(term.field());
|
||||
let term_postings_option =
|
||||
inverted_index.read_postings(term, IndexRecordOption::WithFreqsAndPositions);
|
||||
if let Some(term_postings) = term_postings_option {
|
||||
term_postings_list.push(term_postings);
|
||||
if let Some(postings) = reader
|
||||
.inverted_index(term.field())
|
||||
.read_postings(term, IndexRecordOption::WithFreqsAndPositions) {
|
||||
term_postings_list.push(postings);
|
||||
} else {
|
||||
return Ok(box EmptyScorer);
|
||||
}
|
||||
}
|
||||
Ok(box PhraseScorer {
|
||||
intersection_docset: IntersectionDocSet::from(term_postings_list),
|
||||
})
|
||||
Ok(box PhraseScorer::new(term_postings_list))
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user