Generic TermScorer

This commit is contained in:
Paul Masurel
2026-01-15 18:06:26 +01:00
parent 0955b44ce1
commit 3e57eb9add
6 changed files with 116 additions and 39 deletions

View File

@@ -101,6 +101,19 @@ impl Postings for LoadedPostings {
output.push(*pos + offset);
}
}
/// Block-max seeking is not implemented for `LoadedPostings`.
///
/// # Panics
///
/// Always panics: this implementation is an `unimplemented!` stub.
fn seek_block(
    &mut self,
    _target_doc: crate::DocId,
    _fieldnorm_reader: &crate::fieldnorm::FieldNormReader,
    _similarity_weight: &crate::query::Bm25Weight,
) -> crate::Score {
    // The arguments are underscore-prefixed because the stub never reads them
    // (avoids `unused_variables` warnings).
    unimplemented!("LoadedPostings does not implement seek_block")
}
/// Always reports `FreqReadingOption::ReadFreq`.
fn freq_reading_option(&self) -> super::FreqReadingOption {
super::FreqReadingOption::ReadFreq
}
}
#[cfg(test)]

View File

@@ -1,4 +1,8 @@
use crate::docset::DocSet;
use crate::fieldnorm::FieldNormReader;
use crate::postings::FreqReadingOption;
use crate::query::{Bm25Weight, Scorer};
use crate::{DocId, Score};
/// Postings (also called inverted list)
///
@@ -11,6 +15,8 @@ use crate::docset::DocSet;
/// but other implementations mocking `SegmentPostings` exist,
/// for merging segments or for testing.
pub trait Postings: DocSet + 'static {
/// Takes ownership of the boxed postings and returns a boxed [`Scorer`].
///
/// The fieldnorm reader and BM25 weight are supplied by the caller.
/// NOTE(review): presumably they are what the returned scorer uses to compute scores, and
/// each implementation picks its own concrete scorer type — confirm against implementors.
fn new_term_scorer(self: Box<Self>, fieldnorm_reader: &FieldNormReader, similarity_weight: &Bm25Weight) -> Box<dyn Scorer>;
/// The number of times the term appears in the document.
fn term_freq(&self) -> u32;
@@ -31,6 +37,30 @@ pub trait Postings: DocSet + 'static {
fn positions(&mut self, output: &mut Vec<u32>) {
    // Delegates to `positions_with_offset` with an offset of zero.
    let no_offset: u32 = 0;
    self.positions_with_offset(no_offset, output);
}
/// Returns `true` if these postings support the block-max operations (`seek_block`,
/// `last_doc_in_block`) used for Block-WAND.
///
/// The default implementation returns `false`.
fn supports_block_max(&self) -> bool {
false
}
/// Block-max seek: implementations that support block max position themselves on the block
/// for `target_doc` and return a score for that block, computed from `fieldnorm_reader` and
/// `similarity_weight`.
///
/// Only allowed when `supports_block_max` returns `true`.
///
/// # Panics
///
/// The default implementation always panics.
fn seek_block(
    &mut self,
    target_doc: crate::DocId,
    fieldnorm_reader: &FieldNormReader,
    similarity_weight: &Bm25Weight,
) -> Score {
    // The default never reads its arguments. Consume them explicitly so the parameter names
    // stay readable in the trait signature without triggering `unused_variables`.
    let _ = (target_doc, fieldnorm_reader, similarity_weight);
    unimplemented!("seek_block is only available on postings that support block max")
}
/// Returns the last `DocId` of a block.
/// NOTE(review): presumably the block the postings are currently positioned on — confirm.
///
/// Only allowed when `supports_block_max` returns `true`.
///
/// # Panics
///
/// The default implementation always panics.
fn last_doc_in_block(&self) -> crate::DocId {
    unimplemented!("last_doc_in_block is only available on postings that support block max")
}
/// Returns the [`FreqReadingOption`] that applies to these postings.
///
/// NOTE(review): presumably this tells callers whether term frequencies are actually read —
/// confirm against the `FreqReadingOption` documentation.
fn freq_reading_option(&self) -> FreqReadingOption;
}
impl Postings for Box<dyn Postings> {
@@ -41,4 +71,25 @@ impl Postings for Box<dyn Postings> {
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
(**self).append_positions_with_offset(offset, output);
}
/// Forwards to the boxed postings.
fn supports_block_max(&self) -> bool {
    let inner = &**self;
    inner.supports_block_max()
}
/// Forwards to the boxed postings, passing every argument through unchanged.
fn seek_block(
    &mut self,
    target_doc: crate::DocId,
    fieldnorm_reader: &FieldNormReader,
    similarity_weight: &Bm25Weight,
) -> Score {
    let inner = &mut **self;
    inner.seek_block(target_doc, fieldnorm_reader, similarity_weight)
}
/// Forwards to the boxed postings.
fn last_doc_in_block(&self) -> crate::DocId {
    let inner = &**self;
    inner.last_doc_in_block()
}
/// Forwards to the boxed postings.
fn freq_reading_option(&self) -> FreqReadingOption {
    let inner = &**self;
    inner.freq_reading_option()
}
}

View File

@@ -3,10 +3,12 @@ use common::HasLen;
use crate::codec::postings::PostingsReader;
use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::fieldnorm::FieldNormReader;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
use crate::postings::{BlockSegmentPostings, FreqReadingOption, Postings};
use crate::query::Bm25Weight;
use crate::{DocId, Score, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with
/// a term in a `Segment`.
@@ -252,6 +254,29 @@ impl Postings for SegmentPostings {
}
}
}
/// `SegmentPostings` reports support for block max: this always returns `true`.
fn supports_block_max(&self) -> bool {
true
}
/// Seeks the block cursor to `target_doc`, then returns the block-max score reported by the
/// cursor for the given fieldnorm reader and BM25 weight.
fn seek_block(
    &mut self,
    target_doc: crate::DocId,
    fieldnorm_reader: &FieldNormReader,
    similarity_weight: &Bm25Weight,
) -> Score {
    self.block_cursor.seek_block(target_doc);
    // Both arguments are already references; forward them as-is instead of re-borrowing.
    self.block_cursor
        .block_max_score(fieldnorm_reader, similarity_weight)
}
/// Returns the last doc of the block, as reported by the block cursor's skip reader.
fn last_doc_in_block(&self) -> crate::DocId {
    let skip_reader = self.block_cursor.skip_reader();
    skip_reader.last_doc_in_block()
}
/// Returns the `FreqReadingOption` reported by the block cursor.
fn freq_reading_option(&self) -> FreqReadingOption {
    let cursor = &self.block_cursor;
    cursor.freq_reading_option()
}
}
#[cfg(test)]

View File

@@ -7,18 +7,18 @@ use crate::query::{Explanation, Scorer};
use crate::{DocId, Score};
#[derive(Clone)]
pub struct TermScorer {
postings: SegmentPostings,
pub struct TermScorer<TPostings: Postings = SegmentPostings> {
postings: TPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
}
impl TermScorer {
impl<TPostings: Postings> TermScorer<TPostings> {
pub fn new(
postings: SegmentPostings,
postings: TPostings,
fieldnorm_reader: FieldNormReader,
similarity_weight: Bm25Weight,
) -> TermScorer {
) -> TermScorer<TPostings> {
TermScorer {
postings,
fieldnorm_reader,
@@ -26,11 +26,6 @@ impl TermScorer {
}
}
pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
self.postings.block_cursor.seek_block(target_doc);
self.block_max_score()
}
#[cfg(test)]
pub fn create_for_test(
doc_and_tfs: &[(DocId, u32)],
@@ -54,27 +49,7 @@ impl TermScorer {
/// See `FreqReadingOption`.
pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
self.postings.block_cursor.freq_reading_option()
}
/// Returns the maximum score for the current block.
///
/// In some rare cases, the result may not be exact. In such cases a lower value is returned,
/// (and may lead us to return a lesser document).
///
/// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the
/// score assuming the average fieldnorm computed on this segment.
///
/// Though extremely rare, it is theoretically possible that the actual average fieldnorm
/// is different enough from the current segment average fieldnorm that the maximum over a
/// specific block is achieved on a different document.
///
/// (The result is on the other hand guaranteed to be correct if there is only one segment).
#[inline(always)]
fn block_max_score(&mut self) -> Score {
self.postings
.block_cursor
.block_max_score(&self.fieldnorm_reader, &self.similarity_weight)
self.postings.freq_reading_option()
}
pub fn term_freq(&self) -> u32 {
@@ -96,11 +71,16 @@ impl TermScorer {
}
pub fn last_doc_in_block(&self) -> DocId {
self.postings.block_cursor.skip_reader().last_doc_in_block()
self.postings.last_doc_in_block()
}
/// Calls `seek_block` on the underlying postings for `target_doc`, supplying this scorer's
/// fieldnorm reader and similarity weight, and returns the resulting score.
pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
    let fieldnorms = &self.fieldnorm_reader;
    let weight = &self.similarity_weight;
    self.postings.seek_block(target_doc, fieldnorms, weight)
}
}
impl DocSet for TermScorer {
impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
#[inline]
fn advance(&mut self) -> DocId {
self.postings.advance()
@@ -282,8 +262,8 @@ mod tests {
{
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
for d in docs {
term_scorer.seek_block(d);
block_max_scores_b.push(term_scorer.block_max_score());
let block_max_score = term_scorer.seek_block(d);
block_max_scores_b.push(block_max_score);
}
}
for (l, r) in block_max_scores

View File

@@ -1,7 +1,7 @@
use std::cell::RefCell;
use crate::docset::DocSet;
use crate::postings::Postings;
use crate::postings::{FreqReadingOption, Postings};
use crate::query::BitSetDocSet;
use crate::DocId;
@@ -46,6 +46,10 @@ impl<TDocSet: Postings> Postings for BitSetPostingUnion<TDocSet> {
term_freq
}
/// Always reports `FreqReadingOption::ReadFreq`.
// NOTE(review): the value is hard-coded and does not consult the underlying docsets —
// confirm this is intended.
fn freq_reading_option(&self) -> FreqReadingOption {
FreqReadingOption::ReadFreq
}
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let curr_doc = self.bitset.doc();
let mut docsets = self.docsets.borrow_mut();

View File

@@ -1,5 +1,5 @@
use crate::docset::{DocSet, TERMINATED};
use crate::postings::Postings;
use crate::postings::{FreqReadingOption, Postings};
use crate::DocId;
/// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`.
@@ -56,6 +56,10 @@ impl<TDocSet: Postings> Postings for SimpleUnion<TDocSet> {
term_freq
}
/// Always reports `FreqReadingOption::ReadFreq`.
// NOTE(review): the value is hard-coded and does not consult the underlying docsets —
// confirm this is intended.
fn freq_reading_option(&self) -> FreqReadingOption {
FreqReadingOption::ReadFreq
}
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
for docset in &mut self.docsets {
let doc = docset.doc();