From 533ad99cd5b077d804fe2f9df10aabb7bc5e6b99 Mon Sep 17 00:00:00 2001
From: trinity-1686a
Date: Wed, 22 Feb 2023 11:18:33 +0100
Subject: [PATCH] add PhrasePrefixQuery (#1842)

* add PhrasePrefixQuery
---
 src/query/mod.rs                              |   2 +
 src/query/phrase_prefix_query/mod.rs          |  38 +++
 .../phrase_prefix_query.rs                    | 168 +++++++++++
 .../phrase_prefix_scorer.rs                   | 211 ++++++++++++++
 .../phrase_prefix_weight.rs                   | 265 ++++++++++++++++++
 src/query/phrase_query/mod.rs                 |   1 +
 src/query/phrase_query/phrase_scorer.rs       |  26 +-
 7 files changed, 709 insertions(+), 2 deletions(-)
 create mode 100644 src/query/phrase_prefix_query/mod.rs
 create mode 100644 src/query/phrase_prefix_query/phrase_prefix_query.rs
 create mode 100644 src/query/phrase_prefix_query/phrase_prefix_scorer.rs
 create mode 100644 src/query/phrase_prefix_query/phrase_prefix_weight.rs

diff --git a/src/query/mod.rs b/src/query/mod.rs
index dd606693f..aea485456 100644
--- a/src/query/mod.rs
+++ b/src/query/mod.rs
@@ -12,6 +12,7 @@ mod explanation;
 mod fuzzy_query;
 mod intersection;
 mod more_like_this;
+mod phrase_prefix_query;
 mod phrase_query;
 mod query;
 mod query_parser;
@@ -47,6 +48,7 @@ pub(crate) use self::fuzzy_query::DfaWrapper;
 pub use self::fuzzy_query::FuzzyTermQuery;
 pub use self::intersection::{intersect_scorers, Intersection};
 pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
+pub use self::phrase_prefix_query::PhrasePrefixQuery;
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{EnableScoring, Query, QueryClone};
 pub use self::query_parser::{QueryParser, QueryParserError};
diff --git a/src/query/phrase_prefix_query/mod.rs b/src/query/phrase_prefix_query/mod.rs
new file mode 100644
index 000000000..260e43311
--- /dev/null
+++ b/src/query/phrase_prefix_query/mod.rs
@@ -0,0 +1,38 @@
+mod phrase_prefix_query;
+mod phrase_prefix_scorer;
+mod phrase_prefix_weight;
+
+pub use phrase_prefix_query::PhrasePrefixQuery;
+pub use phrase_prefix_scorer::PhrasePrefixScorer;
+pub use phrase_prefix_weight::PhrasePrefixWeight;
+
+/// Computes the exclusive upper bound of the byte range of keys starting with `prefix_start`.
+///
+/// Trailing `0xff` bytes are dropped and the last remaining byte is incremented. Returns
+/// `None` when every byte is `0xff` (or the prefix is empty), i.e. the range has no upper bound.
+fn prefix_end(prefix_start: &[u8]) -> Option<Vec<u8>> {
+    let mut res = prefix_start.to_owned();
+    while !res.is_empty() {
+        let end = res.len() - 1;
+        if res[end] == u8::MAX {
+            res.pop();
+        } else {
+            res[end] += 1;
+            return Some(res);
+        }
+    }
+    None
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_prefix_end() {
+        assert_eq!(prefix_end(b"aaa"), Some(b"aab".to_vec()));
+        assert_eq!(prefix_end(b"aa\xff"), Some(b"ab".to_vec()));
+        assert_eq!(prefix_end(b"a\xff\xff"), Some(b"b".to_vec()));
+        assert_eq!(prefix_end(b"\xff\xff\xff"), None);
+    }
+}
diff --git a/src/query/phrase_prefix_query/phrase_prefix_query.rs b/src/query/phrase_prefix_query/phrase_prefix_query.rs
new file mode 100644
index 000000000..43cc33739
--- /dev/null
+++ b/src/query/phrase_prefix_query/phrase_prefix_query.rs
@@ -0,0 +1,168 @@
+use std::ops::Bound;
+
+use super::{prefix_end, PhrasePrefixWeight};
+use crate::query::bm25::Bm25Weight;
+use crate::query::{EnableScoring, Query, RangeQuery, Weight};
+use crate::schema::{Field, IndexRecordOption, Term};
+
+const DEFAULT_MAX_EXPANSIONS: u32 = 50;
+
+/// `PhrasePrefixQuery` matches a specific sequence of words followed by term of which only a
+/// prefix is known.
+///
+/// For instance the phrase prefix query for `"part t"` will match
+/// the sentence
+///
+/// **Alan just got a part time job.**
+///
+/// On the other hand it will not match the sentence.
+///
+/// **This is my favorite part of the job.**
+///
+/// Using a `PhrasePrefixQuery` on a field requires positions
+/// to be indexed for this field.
+#[derive(Clone, Debug)]
+pub struct PhrasePrefixQuery {
+    field: Field,
+    phrase_terms: Vec<(usize, Term)>,
+    prefix: (usize, Term),
+    max_expansions: u32,
+}
+
+impl PhrasePrefixQuery {
+    /// Creates a new `PhrasePrefixQuery` given a list of terms.
+    ///
+    /// There must be at least two terms, and all terms
+    /// must belong to the same field.
+    /// Offset for each term will be same as index in the Vector
+    /// The last Term is a prefix and not a full value
+    pub fn new(terms: Vec<Term>) -> PhrasePrefixQuery {
+        let terms_with_offset = terms.into_iter().enumerate().collect();
+        PhrasePrefixQuery::new_with_offset(terms_with_offset)
+    }
+
+    /// Creates a new `PhrasePrefixQuery` given a list of terms and their offsets.
+    ///
+    /// Can be used to provide custom offset for each term.
+    pub fn new_with_offset(mut terms: Vec<(usize, Term)>) -> PhrasePrefixQuery {
+        assert!(
+            !terms.is_empty(),
+            "A phrase prefix query is required to have at least one term."
+        );
+        terms.sort_by_key(|&(offset, _)| offset);
+        let field = terms[0].1.field();
+        assert!(
+            terms[1..].iter().all(|term| term.1.field() == field),
+            "All terms from a phrase query must belong to the same field"
+        );
+        PhrasePrefixQuery {
+            field,
+            prefix: terms.pop().unwrap(),
+            phrase_terms: terms,
+            max_expansions: DEFAULT_MAX_EXPANSIONS,
+        }
+    }
+
+    /// Maximum number of terms to which the last provided term will expand.
+    pub fn set_max_expansions(&mut self, value: u32) {
+        self.max_expansions = value;
+    }
+
+    /// The [`Field`] this `PhrasePrefixQuery` is targeting.
+    pub fn field(&self) -> Field {
+        self.field
+    }
+
+    /// `Term`s in the phrase without the associated offsets.
+    pub fn phrase_terms(&self) -> Vec<Term> {
+        // TODO should we include the last term too?
+        self.phrase_terms
+            .iter()
+            .map(|(_, term)| term.clone())
+            .collect::<Vec<Term>>()
+    }
+
+    /// Returns the [`PhrasePrefixWeight`] for the given phrase query given a specific `searcher`.
+    ///
+    /// This function is the same as [`Query::weight()`] except it returns
+    /// a specialized type [`PhrasePrefixWeight`] instead of a Boxed trait.
+    /// If the query was only one term long, this returns `None` whereas [`Query::weight`]
+    /// returns a boxed [`RangeWeight`]
+    ///
+    /// Returns `None`, if phrase_terms is empty, which happens if the phrase prefix query was
+    /// built with a single term.
+    pub(crate) fn phrase_prefix_query_weight(
+        &self,
+        enable_scoring: EnableScoring<'_>,
+    ) -> crate::Result<Option<PhrasePrefixWeight>> {
+        if self.phrase_terms.is_empty() {
+            return Ok(None);
+        }
+        let schema = enable_scoring.schema();
+        let field_entry = schema.get_field_entry(self.field);
+        let has_positions = field_entry
+            .field_type()
+            .get_index_record_option()
+            .map(IndexRecordOption::has_positions)
+            .unwrap_or(false);
+        if !has_positions {
+            let field_name = field_entry.name();
+            return Err(crate::TantivyError::SchemaError(format!(
+                "Applied phrase query on field {:?}, which does not have positions indexed",
+                field_name
+            )));
+        }
+        let terms = self.phrase_terms();
+        let bm25_weight_opt = match enable_scoring {
+            EnableScoring::Enabled { searcher, .. } => {
+                Some(Bm25Weight::for_terms(searcher, &terms)?)
+            }
+            EnableScoring::Disabled { .. } => None,
+        };
+        let weight = PhrasePrefixWeight::new(
+            self.phrase_terms.clone(),
+            self.prefix.clone(),
+            bm25_weight_opt,
+            self.max_expansions,
+        );
+        Ok(Some(weight))
+    }
+}
+
+impl Query for PhrasePrefixQuery {
+    /// Create the weight associated with a query.
+    ///
+    /// See [`Weight`].
+    fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
+        if let Some(phrase_weight) = self.phrase_prefix_query_weight(enable_scoring)? {
+            Ok(Box::new(phrase_weight))
+        } else {
+            // There is no phrase before the prefix: match every term starting with the prefix.
+            let end_term = if let Some(end_value) = prefix_end(&self.prefix.1.value_bytes()) {
+                let mut end_term = Term::with_capacity(end_value.len());
+                end_term.set_field_and_type(self.field, self.prefix.1.typ());
+                end_term.append_bytes(&end_value);
+                Bound::Excluded(end_term)
+            } else {
+                Bound::Unbounded
+            };
+
+            RangeQuery::new_term_bounds(
+                enable_scoring
+                    .schema()
+                    .get_field_name(self.field)
+                    .to_owned(),
+                self.prefix.1.typ(),
+                &Bound::Included(self.prefix.1.clone()),
+                &end_term,
+            )
+            .weight(enable_scoring)
+        }
+    }
+
+    fn query_terms<'a>(&'a self, visitor: &mut dyn FnMut(&'a Term, bool)) {
+        for (_, term) in &self.phrase_terms {
+            visitor(term, true);
+        }
+    }
+}
diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
new file mode 100644
index 000000000..7f561e39e
--- /dev/null
+++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs
@@ -0,0 +1,211 @@
+use crate::docset::{DocSet, TERMINATED};
+use crate::fieldnorm::FieldNormReader;
+use crate::postings::Postings;
+use crate::query::bm25::Bm25Weight;
+use crate::query::phrase_query::{intersection_count, PhraseScorer};
+use crate::query::Scorer;
+use crate::{DocId, Score};
+
+/// Scorer for the part of the phrase preceding the prefix term: either the postings of a
+/// single term, or a [`PhraseScorer`] over several terms.
+enum PhraseKind<TPostings: Postings> {
+    SinglePrefix {
+        position_offset: u32,
+        postings: TPostings,
+        positions: Vec<u32>,
+    },
+    MultiPrefix(PhraseScorer<TPostings>),
+}
+
+impl<TPostings: Postings> PhraseKind<TPostings> {
+    fn get_intersection(&mut self) -> &[u32] {
+        match self {
+            PhraseKind::SinglePrefix {
+                position_offset,
+                postings,
+                positions,
+            } => {
+                if positions.is_empty() {
+                    postings.positions_with_offset(*position_offset, positions);
+                }
+                positions
+            }
+            PhraseKind::MultiPrefix(postings) => postings.get_intersection(),
+        }
+    }
+}
+
+impl<TPostings: Postings> DocSet for PhraseKind<TPostings> {
+    fn advance(&mut self) -> DocId {
+        match self {
+            PhraseKind::SinglePrefix {
+                postings,
+                positions,
+                ..
+            } => {
+                positions.clear();
+                postings.advance()
+            }
+            PhraseKind::MultiPrefix(postings) => postings.advance(),
+        }
+    }
+
+    fn doc(&self) -> DocId {
+        match self {
+            PhraseKind::SinglePrefix { postings, .. } => postings.doc(),
+            PhraseKind::MultiPrefix(postings) => postings.doc(),
+        }
+    }
+
+    fn size_hint(&self) -> u32 {
+        match self {
+            PhraseKind::SinglePrefix { postings, .. } => postings.size_hint(),
+            PhraseKind::MultiPrefix(postings) => postings.size_hint(),
+        }
+    }
+
+    fn seek(&mut self, target: DocId) -> DocId {
+        match self {
+            PhraseKind::SinglePrefix {
+                postings,
+                positions,
+                ..
+            } => {
+                positions.clear();
+                postings.seek(target)
+            }
+            PhraseKind::MultiPrefix(postings) => postings.seek(target),
+        }
+    }
+}
+
+impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
+    fn score(&mut self) -> Score {
+        match self {
+            PhraseKind::SinglePrefix { positions, .. } => {
+                if positions.is_empty() {
+                    0.0
+                } else {
+                    1.0
+                }
+            }
+            PhraseKind::MultiPrefix(postings) => postings.score(),
+        }
+    }
+}
+
+/// Scorer keeping the documents of `phrase_scorer` in which at least one of the `suffixes`
+/// (postings of the terms the prefix expands to) occurs at the expected position.
+pub struct PhrasePrefixScorer<TPostings: Postings> {
+    phrase_scorer: PhraseKind<TPostings>,
+    suffixes: Vec<TPostings>,
+    suffix_offset: u32,
+    phrase_count: u32,
+}
+
+impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
+    // If similarity_weight is None, then scoring is disabled.
+    pub fn new(
+        mut term_postings: Vec<(usize, TPostings)>,
+        similarity_weight_opt: Option<Bm25Weight>,
+        fieldnorm_reader: FieldNormReader,
+        suffixes: Vec<TPostings>,
+        suffix_pos: usize,
+    ) -> PhrasePrefixScorer<TPostings> {
+        // correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
+        let max_offset = term_postings
+            .iter()
+            .map(|(pos, _)| *pos)
+            .chain(std::iter::once(suffix_pos))
+            .max()
+            .unwrap();
+
+        let phrase_scorer = if term_postings.len() > 1 {
+            PhraseKind::MultiPrefix(PhraseScorer::new_with_offset(
+                term_postings,
+                similarity_weight_opt,
+                fieldnorm_reader,
+                0,
+                1,
+            ))
+        } else {
+            let (pos, postings) = term_postings
+                .pop()
+                .expect("PhrasePrefixScorer must have at least two terms");
+            let offset = suffix_pos - pos;
+            PhraseKind::SinglePrefix {
+                position_offset: offset as u32,
+                postings,
+                positions: Vec::with_capacity(100),
+            }
+        };
+        let mut phrase_prefix_scorer = PhrasePrefixScorer {
+            phrase_scorer,
+            suffixes,
+            suffix_offset: (max_offset - suffix_pos) as u32,
+            phrase_count: 0,
+        };
+        if !phrase_prefix_scorer.matches_prefix() {
+            phrase_prefix_scorer.advance();
+        }
+        phrase_prefix_scorer
+    }
+
+    pub fn phrase_count(&self) -> u32 {
+        self.phrase_count
+    }
+
+    fn matches_prefix(&mut self) -> bool {
+        let mut count = 0;
+        let mut positions = Vec::new();
+        let current_doc = self.doc();
+        let pos_matching = self.phrase_scorer.get_intersection();
+        for suffix in &mut self.suffixes {
+            if suffix.doc() > current_doc {
+                continue;
+            }
+            let doc = suffix.seek(current_doc);
+            if doc == current_doc {
+                suffix.positions_with_offset(self.suffix_offset, &mut positions);
+                count += intersection_count(pos_matching, &positions);
+            }
+        }
+        self.phrase_count = count as u32;
+        count != 0
+    }
+}
+
+impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
+    fn advance(&mut self) -> DocId {
+        loop {
+            let doc = self.phrase_scorer.advance();
+            if doc == TERMINATED || self.matches_prefix() {
+                return doc;
+            }
+        }
+    }
+
+    fn seek(&mut self, target: DocId) -> DocId {
+        // Seek the phrase part exactly once, then check the prefix expansions on that doc.
+        let doc = self.phrase_scorer.seek(target);
+        if doc == TERMINATED || self.matches_prefix() {
+            return doc;
+        }
+        self.advance()
+    }
+
+    fn doc(&self) -> DocId {
+        self.phrase_scorer.doc()
+    }
+
+    fn size_hint(&self) -> u32 {
+        self.phrase_scorer.size_hint()
+    }
+}
+
+impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
+    fn score(&mut self) -> Score {
+        // TODO modify score??
+        self.phrase_scorer.score()
+    }
+}
diff --git a/src/query/phrase_prefix_query/phrase_prefix_weight.rs b/src/query/phrase_prefix_query/phrase_prefix_weight.rs
new file mode 100644
index 000000000..f9292a327
--- /dev/null
+++ b/src/query/phrase_prefix_query/phrase_prefix_weight.rs
@@ -0,0 +1,265 @@
+use super::{prefix_end, PhrasePrefixScorer};
+use crate::core::SegmentReader;
+use crate::fieldnorm::FieldNormReader;
+use crate::postings::SegmentPostings;
+use crate::query::bm25::Bm25Weight;
+use crate::query::explanation::does_not_match;
+use crate::query::{EmptyScorer, Explanation, Scorer, Weight};
+use crate::schema::{IndexRecordOption, Term};
+use crate::{DocId, DocSet, Score};
+
+/// [`Weight`] for a phrase prefix query: phrase terms, prefix term and expansion limit.
+pub struct PhrasePrefixWeight {
+    phrase_terms: Vec<(usize, Term)>,
+    prefix: (usize, Term),
+    similarity_weight_opt: Option<Bm25Weight>,
+    max_expansions: u32,
+}
+
+impl PhrasePrefixWeight {
+    /// Creates a new phrase weight.
+    /// If `similarity_weight_opt` is None, then scoring is disabled
+    pub fn new(
+        phrase_terms: Vec<(usize, Term)>,
+        prefix: (usize, Term),
+        similarity_weight_opt: Option<Bm25Weight>,
+        max_expansions: u32,
+    ) -> PhrasePrefixWeight {
+        PhrasePrefixWeight {
+            phrase_terms,
+            prefix,
+            similarity_weight_opt,
+            max_expansions,
+        }
+    }
+
+    fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
+        let field = self.phrase_terms[0].1.field();
+        if self.similarity_weight_opt.is_some() {
+            if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(field)? {
+                return Ok(fieldnorm_reader);
+            }
+        }
+        Ok(FieldNormReader::constant(reader.max_doc(), 1))
+    }
+
+    /// Builds the scorer for the segment read by `reader`.
+    ///
+    /// Returns `Ok(None)` if one of the phrase terms has no postings in this segment.
+    /// The prefix is expanded to at most `max_expansions` terms of the term dictionary.
+    pub(crate) fn phrase_scorer(
+        &self,
+        reader: &SegmentReader,
+        boost: Score,
+    ) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
+        let similarity_weight_opt = self
+            .similarity_weight_opt
+            .as_ref()
+            .map(|similarity_weight| similarity_weight.boost_by(boost));
+        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
+        let mut term_postings_list = Vec::new();
+        if reader.has_deletes() {
+            for &(offset, ref term) in &self.phrase_terms {
+                if let Some(postings) = reader
+                    .inverted_index(term.field())?
+                    .read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
+                {
+                    term_postings_list.push((offset, postings));
+                } else {
+                    return Ok(None);
+                }
+            }
+        } else {
+            for &(offset, ref term) in &self.phrase_terms {
+                if let Some(postings) = reader
+                    .inverted_index(term.field())?
+                    .read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
+                {
+                    term_postings_list.push((offset, postings));
+                } else {
+                    return Ok(None);
+                }
+            }
+        }
+
+        let inv_index = reader.inverted_index(self.prefix.1.field())?;
+        let mut stream = inv_index.terms().range().ge(self.prefix.1.value_bytes());
+        if let Some(end) = prefix_end(self.prefix.1.value_bytes()) {
+            stream = stream.lt(&end);
+        }
+
+        #[cfg(feature = "quickwit")]
+        {
+            // We don't have this on the fst, hence we end up needing a feature flag.
+            //
+            // This is not a problem however as we enforce the limit below too.
+            // The point of `stream.limit` is to limit the number of term dictionary
+            // blocks being downloaded.
+            stream = stream.limit(self.max_expansions as u64);
+        }
+
+        let mut stream = stream.into_stream()?;
+
+        let mut suffixes = Vec::with_capacity(self.max_expansions as usize);
+        let mut new_term = self.prefix.1.clone();
+        while stream.advance() && (suffixes.len() as u32) < self.max_expansions {
+            new_term.clear_with_type(new_term.typ());
+            new_term.append_bytes(stream.key());
+            if reader.has_deletes() {
+                if let Some(postings) =
+                    inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
+                {
+                    suffixes.push(postings);
+                }
+            } else {
+                if let Some(postings) = inv_index
+                    .read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
+                {
+                    suffixes.push(postings);
+                }
+            }
+        }
+
+        Ok(Some(PhrasePrefixScorer::new(
+            term_postings_list,
+            similarity_weight_opt,
+            fieldnorm_reader,
+            suffixes,
+            self.prefix.0,
+        )))
+    }
+}
+
+impl Weight for PhrasePrefixWeight {
+    fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
+        if let Some(scorer) = self.phrase_scorer(reader, boost)? {
+            Ok(Box::new(scorer))
+        } else {
+            Ok(Box::new(EmptyScorer))
+        }
+    }
+
+    fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
+        let scorer_opt = self.phrase_scorer(reader, 1.0)?;
+        if scorer_opt.is_none() {
+            return Err(does_not_match(doc));
+        }
+        let mut scorer = scorer_opt.unwrap();
+        if scorer.seek(doc) != doc {
+            return Err(does_not_match(doc));
+        }
+        let fieldnorm_reader = self.fieldnorm_reader(reader)?;
+        let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
+        let phrase_count = scorer.phrase_count();
+        let mut explanation = Explanation::new("Phrase Prefix Scorer", scorer.score());
+        if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
+            explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
+        }
+        Ok(explanation)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use crate::core::Index;
+    use crate::docset::TERMINATED;
+    use crate::query::{EnableScoring, PhrasePrefixQuery, Query};
+    use crate::schema::{Schema, TEXT};
+    use crate::{DocSet, Term};
+
+    pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
+        let mut schema_builder = Schema::builder();
+        let text_field = schema_builder.add_text_field("text", TEXT);
+        let schema = schema_builder.build();
+        let index = Index::create_in_ram(schema);
+        {
+            let mut index_writer = index.writer_for_tests()?;
+            for &text in texts {
+                let doc = doc!(text_field=>text);
+                index_writer.add_document(doc)?;
+            }
+            index_writer.commit()?;
+        }
+        Ok(index)
+    }
+
+    #[test]
+    pub fn test_phrase_count_long() -> crate::Result<()> {
+        let index = create_index(&[
+            "aa bb dd cc",
+            "aa aa bb c dd aa bb cc aa bb dc",
+            " aa bb cd",
+        ])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader()?.searcher();
+        let phrase_query = PhrasePrefixQuery::new(vec![
+            Term::from_field_text(text_field, "aa"),
+            Term::from_field_text(text_field, "bb"),
+            Term::from_field_text(text_field, "c"),
+        ]);
+        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
+        let phrase_weight = phrase_query
+            .phrase_prefix_query_weight(enable_scoring)
+            .unwrap()
+            .unwrap();
+        let mut phrase_scorer = phrase_weight
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .unwrap();
+        assert_eq!(phrase_scorer.doc(), 1);
+        assert_eq!(phrase_scorer.phrase_count(), 2);
+        assert_eq!(phrase_scorer.advance(), 2);
+        assert_eq!(phrase_scorer.doc(), 2);
+        assert_eq!(phrase_scorer.phrase_count(), 1);
+        assert_eq!(phrase_scorer.advance(), TERMINATED);
+        Ok(())
+    }
+
+    #[test]
+    pub fn test_phrase_count_mid() -> crate::Result<()> {
+        let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader()?.searcher();
+        let phrase_query = PhrasePrefixQuery::new(vec![
+            Term::from_field_text(text_field, "aa"),
+            Term::from_field_text(text_field, "b"),
+        ]);
+        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
+        let phrase_weight = phrase_query
+            .phrase_prefix_query_weight(enable_scoring)
+            .unwrap()
+            .unwrap();
+        let mut phrase_scorer = phrase_weight
+            .phrase_scorer(searcher.segment_reader(0u32), 1.0)?
+            .unwrap();
+        assert_eq!(phrase_scorer.doc(), 1);
+        assert_eq!(phrase_scorer.phrase_count(), 2);
+        assert_eq!(phrase_scorer.advance(), 2);
+        assert_eq!(phrase_scorer.doc(), 2);
+        assert_eq!(phrase_scorer.phrase_count(), 1);
+        assert_eq!(phrase_scorer.advance(), TERMINATED);
+        Ok(())
+    }
+
+    #[test]
+    pub fn test_phrase_count_short() -> crate::Result<()> {
+        let index = create_index(&["aa dd", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
+        let schema = index.schema();
+        let text_field = schema.get_field("text").unwrap();
+        let searcher = index.reader()?.searcher();
+        let phrase_query = PhrasePrefixQuery::new(vec![Term::from_field_text(text_field, "c")]);
+        let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
+        assert!(phrase_query
+            .phrase_prefix_query_weight(enable_scoring)
+            .unwrap()
+            .is_none());
+        let weight = phrase_query.weight(enable_scoring).unwrap();
+        let mut phrase_scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
+        assert_eq!(phrase_scorer.doc(), 1);
+        assert_eq!(phrase_scorer.advance(), 2);
+        assert_eq!(phrase_scorer.doc(), 2);
+        assert_eq!(phrase_scorer.advance(), TERMINATED);
+        Ok(())
+    }
+}
diff --git a/src/query/phrase_query/mod.rs b/src/query/phrase_query/mod.rs
index 984659fc9..4a0bd8298 100644
--- a/src/query/phrase_query/mod.rs
+++ b/src/query/phrase_query/mod.rs
@@ -3,6 +3,7 @@ mod phrase_scorer;
 mod phrase_weight;
 
 pub use self::phrase_query::PhraseQuery;
+pub(crate) use self::phrase_scorer::intersection_count;
 pub use self::phrase_scorer::PhraseScorer;
 pub use self::phrase_weight::PhraseWeight;
 
diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs
index fde78d15e..6a5c641ec 100644
--- a/src/query/phrase_query/phrase_scorer.rs
+++ b/src/query/phrase_query/phrase_scorer.rs
@@ -76,7 +76,7 @@ fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
     false
 }
 
-fn intersection_count(left: &[u32], right: &[u32]) -> usize {
+pub(crate) fn intersection_count(left: &[u32], right: &[u32]) -> usize {
     let mut left_index = 0;
     let mut right_index = 0;
     let mut count = 0;
@@ -250,12 +250,29 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
         similarity_weight_opt: Option<Bm25Weight>,
         fieldnorm_reader: FieldNormReader,
         slop: u32,
+    ) -> PhraseScorer<TPostings> {
+        Self::new_with_offset(
+            term_postings,
+            similarity_weight_opt,
+            fieldnorm_reader,
+            slop,
+            0,
+        )
+    }
+
+    pub(crate) fn new_with_offset(
+        term_postings: Vec<(usize, TPostings)>,
+        similarity_weight_opt: Option<Bm25Weight>,
+        fieldnorm_reader: FieldNormReader,
+        slop: u32,
+        offset: usize,
     ) -> PhraseScorer<TPostings> {
         let max_offset = term_postings
             .iter()
             .map(|&(offset, _)| offset)
             .max()
-            .unwrap_or(0);
+            .unwrap_or(0)
+            + offset;
         let num_docsets = term_postings.len();
         let postings_with_offsets = term_postings
             .into_iter()
@@ -283,6 +300,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
         self.phrase_count
     }
 
+    pub(crate) fn get_intersection(&mut self) -> &[u32] {
+        let len = intersection(&mut self.left, &self.right);
+        &self.left[..len]
+    }
+
     fn phrase_match(&mut self) -> bool {
         if self.similarity_weight_opt.is_some() {
             let count = self.compute_phrase_count();