mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
Compare commits
1 Commits
columnar_o
...
prefix-phr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2a45af77e0 |
@@ -2,7 +2,7 @@ use crate::docset::{DocSet, TERMINATED};
|
|||||||
use crate::fieldnorm::FieldNormReader;
|
use crate::fieldnorm::FieldNormReader;
|
||||||
use crate::postings::Postings;
|
use crate::postings::Postings;
|
||||||
use crate::query::bm25::Bm25Weight;
|
use crate::query::bm25::Bm25Weight;
|
||||||
use crate::query::phrase_query::{intersection_count, PhraseScorer};
|
use crate::query::phrase_query::{intersection_count, intersection_exists, PhraseScorer};
|
||||||
use crate::query::Scorer;
|
use crate::query::Scorer;
|
||||||
use crate::{DocId, Score};
|
use crate::{DocId, Score};
|
||||||
|
|
||||||
@@ -92,14 +92,17 @@ impl<TPostings: Postings> Scorer for PhraseKind<TPostings> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct PhrasePrefixScorer<TPostings: Postings> {
|
pub struct PhrasePrefixScorer<TPostings: Postings, const SCORING_ENABLED: bool> {
|
||||||
phrase_scorer: PhraseKind<TPostings>,
|
phrase_scorer: PhraseKind<TPostings>,
|
||||||
suffixes: Vec<TPostings>,
|
suffixes: Vec<TPostings>,
|
||||||
suffix_offset: u32,
|
suffix_offset: u32,
|
||||||
phrase_count: u32,
|
phrase_count: u32,
|
||||||
|
suffix_position_buffer: Vec<u32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
impl<TPostings: Postings, const SCORING_ENABLED: bool>
|
||||||
|
PhrasePrefixScorer<TPostings, SCORING_ENABLED>
|
||||||
|
{
|
||||||
// If similarity_weight is None, then scoring is disabled.
|
// If similarity_weight is None, then scoring is disabled.
|
||||||
pub fn new(
|
pub fn new(
|
||||||
mut term_postings: Vec<(usize, TPostings)>,
|
mut term_postings: Vec<(usize, TPostings)>,
|
||||||
@@ -107,7 +110,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
fieldnorm_reader: FieldNormReader,
|
fieldnorm_reader: FieldNormReader,
|
||||||
suffixes: Vec<TPostings>,
|
suffixes: Vec<TPostings>,
|
||||||
suffix_pos: usize,
|
suffix_pos: usize,
|
||||||
) -> PhrasePrefixScorer<TPostings> {
|
) -> PhrasePrefixScorer<TPostings, SCORING_ENABLED> {
|
||||||
// correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
|
// correct indices so we can merge with our suffix term the PhraseScorer doesn't know about
|
||||||
let max_offset = term_postings
|
let max_offset = term_postings
|
||||||
.iter()
|
.iter()
|
||||||
@@ -140,6 +143,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
suffixes,
|
suffixes,
|
||||||
suffix_offset: (max_offset - suffix_pos) as u32,
|
suffix_offset: (max_offset - suffix_pos) as u32,
|
||||||
phrase_count: 0,
|
phrase_count: 0,
|
||||||
|
suffix_position_buffer: Vec::with_capacity(100),
|
||||||
};
|
};
|
||||||
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
|
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
|
||||||
phrase_prefix_scorer.advance();
|
phrase_prefix_scorer.advance();
|
||||||
@@ -153,7 +157,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
|
|
||||||
fn matches_prefix(&mut self) -> bool {
|
fn matches_prefix(&mut self) -> bool {
|
||||||
let mut count = 0;
|
let mut count = 0;
|
||||||
let mut positions = Vec::new();
|
|
||||||
let current_doc = self.doc();
|
let current_doc = self.doc();
|
||||||
let pos_matching = self.phrase_scorer.get_intersection();
|
let pos_matching = self.phrase_scorer.get_intersection();
|
||||||
for suffix in &mut self.suffixes {
|
for suffix in &mut self.suffixes {
|
||||||
@@ -162,16 +165,27 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
|
|||||||
}
|
}
|
||||||
let doc = suffix.seek(current_doc);
|
let doc = suffix.seek(current_doc);
|
||||||
if doc == current_doc {
|
if doc == current_doc {
|
||||||
suffix.positions_with_offset(self.suffix_offset, &mut positions);
|
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
|
||||||
count += intersection_count(pos_matching, &positions);
|
if SCORING_ENABLED {
|
||||||
|
count += intersection_count(pos_matching, &self.suffix_position_buffer);
|
||||||
|
} else {
|
||||||
|
if intersection_exists(pos_matching, &self.suffix_position_buffer) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
if !SCORING_ENABLED {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
self.phrase_count = count as u32;
|
self.phrase_count = count as u32;
|
||||||
count != 0
|
count != 0
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
impl<TPostings: Postings, const SCORING_ENABLED: bool> DocSet
|
||||||
|
for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
|
||||||
|
{
|
||||||
fn advance(&mut self) -> DocId {
|
fn advance(&mut self) -> DocId {
|
||||||
loop {
|
loop {
|
||||||
let doc = self.phrase_scorer.advance();
|
let doc = self.phrase_scorer.advance();
|
||||||
@@ -198,9 +212,15 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<TPostings: Postings> Scorer for PhrasePrefixScorer<TPostings> {
|
impl<TPostings: Postings, const SCORING_ENABLED: bool> Scorer
|
||||||
|
for PhrasePrefixScorer<TPostings, SCORING_ENABLED>
|
||||||
|
{
|
||||||
fn score(&mut self) -> Score {
|
fn score(&mut self) -> Score {
|
||||||
|
if SCORING_ENABLED {
|
||||||
|
self.phrase_scorer.score()
|
||||||
|
} else {
|
||||||
|
1.0f32
|
||||||
|
}
|
||||||
// TODO modify score??
|
// TODO modify score??
|
||||||
self.phrase_scorer.score()
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,11 +42,11 @@ impl PhrasePrefixWeight {
|
|||||||
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
Ok(FieldNormReader::constant(reader.max_doc(), 1))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn phrase_scorer(
|
pub(crate) fn phrase_prefix_scorer<const SCORING_ENABLED: bool>(
|
||||||
&self,
|
&self,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
boost: Score,
|
boost: Score,
|
||||||
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings>>> {
|
) -> crate::Result<Option<PhrasePrefixScorer<SegmentPostings, SCORING_ENABLED>>> {
|
||||||
let similarity_weight_opt = self
|
let similarity_weight_opt = self
|
||||||
.similarity_weight_opt
|
.similarity_weight_opt
|
||||||
.as_ref()
|
.as_ref()
|
||||||
@@ -128,15 +128,20 @@ impl PhrasePrefixWeight {
|
|||||||
|
|
||||||
impl Weight for PhrasePrefixWeight {
|
impl Weight for PhrasePrefixWeight {
|
||||||
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
|
||||||
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
|
if self.similarity_weight_opt.is_some() {
|
||||||
Ok(Box::new(scorer))
|
if let Some(scorer) = self.phrase_prefix_scorer::<true>(reader, boost)? {
|
||||||
|
return Ok(Box::new(scorer));
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
Ok(Box::new(EmptyScorer))
|
if let Some(scorer) = self.phrase_prefix_scorer::<false>(reader, boost)? {
|
||||||
|
return Ok(Box::new(scorer));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
Ok(Box::new(EmptyScorer))
|
||||||
}
|
}
|
||||||
|
|
||||||
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
|
||||||
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
|
let scorer_opt = self.phrase_prefix_scorer::<true>(reader, 1.0)?;
|
||||||
if scorer_opt.is_none() {
|
if scorer_opt.is_none() {
|
||||||
return Err(does_not_match(doc));
|
return Err(does_not_match(doc));
|
||||||
}
|
}
|
||||||
@@ -200,7 +205,7 @@ mod tests {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut phrase_scorer = phrase_weight
|
let mut phrase_scorer = phrase_weight
|
||||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
.phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(phrase_scorer.doc(), 1);
|
assert_eq!(phrase_scorer.doc(), 1);
|
||||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||||
@@ -211,6 +216,38 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
pub fn test_phrase_no_count() -> crate::Result<()> {
|
||||||
|
let index = create_index(&[
|
||||||
|
"aa bb dd cc",
|
||||||
|
"aa aa bb c dd aa bb cc aa bb dc",
|
||||||
|
" aa bb cd",
|
||||||
|
])?;
|
||||||
|
let schema = index.schema();
|
||||||
|
let text_field = schema.get_field("text").unwrap();
|
||||||
|
let searcher = index.reader()?.searcher();
|
||||||
|
let phrase_query = PhrasePrefixQuery::new(vec![
|
||||||
|
Term::from_field_text(text_field, "aa"),
|
||||||
|
Term::from_field_text(text_field, "bb"),
|
||||||
|
Term::from_field_text(text_field, "c"),
|
||||||
|
]);
|
||||||
|
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
|
||||||
|
let phrase_weight = phrase_query
|
||||||
|
.phrase_prefix_query_weight(enable_scoring)
|
||||||
|
.unwrap()
|
||||||
|
.unwrap();
|
||||||
|
let mut phrase_scorer = phrase_weight
|
||||||
|
.phrase_prefix_scorer::<false>(searcher.segment_reader(0u32), 1.0)?
|
||||||
|
.unwrap();
|
||||||
|
assert_eq!(phrase_scorer.doc(), 1);
|
||||||
|
assert_eq!(phrase_scorer.phrase_count(), 0);
|
||||||
|
assert_eq!(phrase_scorer.advance(), 2);
|
||||||
|
assert_eq!(phrase_scorer.doc(), 2);
|
||||||
|
assert_eq!(phrase_scorer.phrase_count(), 0);
|
||||||
|
assert_eq!(phrase_scorer.advance(), TERMINATED);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
pub fn test_phrase_count_mid() -> crate::Result<()> {
|
pub fn test_phrase_count_mid() -> crate::Result<()> {
|
||||||
let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
|
let index = create_index(&["aa dd cc", "aa aa bb c dd aa bb cc aa dc", " aa bb cd"])?;
|
||||||
@@ -227,7 +264,7 @@ mod tests {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let mut phrase_scorer = phrase_weight
|
let mut phrase_scorer = phrase_weight
|
||||||
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
|
.phrase_prefix_scorer::<true>(searcher.segment_reader(0u32), 1.0)?
|
||||||
.unwrap();
|
.unwrap();
|
||||||
assert_eq!(phrase_scorer.doc(), 1);
|
assert_eq!(phrase_scorer.doc(), 1);
|
||||||
assert_eq!(phrase_scorer.phrase_count(), 2);
|
assert_eq!(phrase_scorer.phrase_count(), 2);
|
||||||
|
|||||||
@@ -3,8 +3,8 @@ mod phrase_scorer;
|
|||||||
mod phrase_weight;
|
mod phrase_weight;
|
||||||
|
|
||||||
pub use self::phrase_query::PhraseQuery;
|
pub use self::phrase_query::PhraseQuery;
|
||||||
pub(crate) use self::phrase_scorer::intersection_count;
|
|
||||||
pub use self::phrase_scorer::PhraseScorer;
|
pub use self::phrase_scorer::PhraseScorer;
|
||||||
|
pub(crate) use self::phrase_scorer::{intersection_count, intersection_exists};
|
||||||
pub use self::phrase_weight::PhraseWeight;
|
pub use self::phrase_weight::PhraseWeight;
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ pub struct PhraseScorer<TPostings: Postings> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns true if and only if the two sorted arrays contain a common element
|
/// Returns true if and only if the two sorted arrays contain a common element
|
||||||
fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
|
pub(crate) fn intersection_exists(left: &[u32], right: &[u32]) -> bool {
|
||||||
let mut left_index = 0;
|
let mut left_index = 0;
|
||||||
let mut right_index = 0;
|
let mut right_index = 0;
|
||||||
while left_index < left.len() && right_index < right.len() {
|
while left_index < left.len() && right_index < right.len() {
|
||||||
|
|||||||
Reference in New Issue
Block a user