Generic TermScorer

2026-06-01 08:00:41 +00:00 · 2026-01-15 18:06:26 +01:00
parent 0955b44ce1
commit 3e57eb9add
6 changed files with 116 additions and 39 deletions
--- a/src/postings/loaded_postings.rs
+++ b/src/postings/loaded_postings.rs
@@ -101,6 +101,19 @@ impl Postings for LoadedPostings {
            output.push(*pos + offset);
        }
    }
+
+    fn seek_block(
+        &mut self,
+        _target_doc: crate::DocId,
+        _fieldnorm_reader: &crate::fieldnorm::FieldNormReader,
+        _similarity_weight: &crate::query::Bm25Weight,
+    ) -> crate::Score {
+        unimplemented!("block-max is not supported by `LoadedPostings`")
+    }
+
+    fn freq_reading_option(&self) -> super::FreqReadingOption {
+        super::FreqReadingOption::ReadFreq
+    }
 }

 #[cfg(test)]
--- a/src/postings/postings.rs
+++ b/src/postings/postings.rs
@@ -1,4 +1,8 @@
 use crate::docset::DocSet;
+use crate::fieldnorm::FieldNormReader;
+use crate::postings::FreqReadingOption;
+use crate::query::{Bm25Weight, Scorer};
+use crate::{DocId, Score};

 /// Postings (also called inverted list)
 ///
@@ -11,6 +15,8 @@ use crate::docset::DocSet;
 /// but other implementations mocking `SegmentPostings` exist,
 /// for merging segments or for testing.
 pub trait Postings: DocSet + 'static {
+    fn new_term_scorer(self: Box<Self>, fieldnorm_reader: &FieldNormReader, similarity_weight: &Bm25Weight) -> Box<dyn Scorer>;
+
    /// The number of times the term appears in the document.
    fn term_freq(&self) -> u32;

@@ -31,6 +37,30 @@ pub trait Postings: DocSet + 'static {
    fn positions(&mut self, output: &mut Vec<u32>) {
        self.positions_with_offset(0u32, output);
    }
+
+    /// Returns `true` if these postings support block-max (Block-WAND); `false` by default.
+    fn supports_block_max(&self) -> bool {
+        false
+    }
+
+    /// Seeks block-wise to `target_doc` and returns the max score of the block reached.
+    /// Only allowed if `supports_block_max()` is `true`: this default implementation panics.
+    fn seek_block(
+        &mut self,
+        _target_doc: DocId,
+        _fieldnorm_reader: &FieldNormReader,
+        _similarity_weight: &Bm25Weight,
+    ) -> Score {
+        unimplemented!()
+    }
+
+    /// Returns the last `DocId` in the current block.
+    /// Only allowed if `supports_block_max()` is `true`: this default implementation panics.
+    fn last_doc_in_block(&self) -> crate::DocId {
+        unimplemented!()
+    }
+
+    fn freq_reading_option(&self) -> FreqReadingOption;
 }

 impl Postings for Box<dyn Postings> {
@@ -41,4 +71,25 @@ impl Postings for Box<dyn Postings> {
    fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        (**self).append_positions_with_offset(offset, output);
    }
+
+    fn supports_block_max(&self) -> bool {
+        (**self).supports_block_max()
+    }
+
+    fn seek_block(
+        &mut self,
+        target_doc: crate::DocId,
+        fieldnorm_reader: &FieldNormReader,
+        similarity_weight: &Bm25Weight,
+    ) -> Score {
+        (**self).seek_block(target_doc, fieldnorm_reader, similarity_weight)
+    }
+
+    fn last_doc_in_block(&self) -> crate::DocId {
+        (**self).last_doc_in_block()
+    }
+
+    fn freq_reading_option(&self) -> FreqReadingOption {
+        (**self).freq_reading_option()
+    }
 }
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -3,10 +3,12 @@ use common::HasLen;
 use crate::codec::postings::PostingsReader;
 use crate::docset::DocSet;
 use crate::fastfield::AliveBitSet;
+use crate::fieldnorm::FieldNormReader;
 use crate::positions::PositionReader;
 use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
-use crate::postings::{BlockSegmentPostings, Postings};
-use crate::{DocId, TERMINATED};
+use crate::postings::{BlockSegmentPostings, FreqReadingOption, Postings};
+use crate::query::Bm25Weight;
+use crate::{DocId, Score, TERMINATED};

 /// `SegmentPostings` represents the inverted list or postings associated with
 /// a term in a `Segment`.
@@ -252,6 +254,29 @@ impl Postings for SegmentPostings {
            }
        }
    }
+
+    fn supports_block_max(&self) -> bool {
+        true
+    }
+
+    /// Seeks the block cursor to `target_doc` and returns that block's max score.
+    fn seek_block(
+        &mut self,
+        target_doc: DocId,
+        fieldnorm_reader: &FieldNormReader,
+        similarity_weight: &Bm25Weight,
+    ) -> Score {
+        self.block_cursor.seek_block(target_doc);
+        self.block_cursor.block_max_score(fieldnorm_reader, similarity_weight)
+    }
+
+    fn last_doc_in_block(&self) -> crate::DocId {
+        self.block_cursor.skip_reader().last_doc_in_block()
+    }
+
+    fn freq_reading_option(&self) -> FreqReadingOption {
+        self.block_cursor.freq_reading_option()
+    }
 }

 #[cfg(test)]
--- a/src/query/term_query/term_scorer.rs
+++ b/src/query/term_query/term_scorer.rs
@@ -7,18 +7,18 @@ use crate::query::{Explanation, Scorer};
 use crate::{DocId, Score};

 #[derive(Clone)]
-pub struct TermScorer {
-    postings: SegmentPostings,
+pub struct TermScorer<TPostings: Postings = SegmentPostings> {
+    postings: TPostings,
    fieldnorm_reader: FieldNormReader,
    similarity_weight: Bm25Weight,
 }

-impl TermScorer {
+impl<TPostings: Postings> TermScorer<TPostings> {
    pub fn new(
-        postings: SegmentPostings,
+        postings: TPostings,
        fieldnorm_reader: FieldNormReader,
        similarity_weight: Bm25Weight,
-    ) -> TermScorer {
+    ) -> TermScorer<TPostings> {
        TermScorer {
            postings,
            fieldnorm_reader,
@@ -26,11 +26,6 @@ impl TermScorer {
        }
    }

-    pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
-        self.postings.block_cursor.seek_block(target_doc);
-        self.block_max_score()
-    }
-
    #[cfg(test)]
    pub fn create_for_test(
        doc_and_tfs: &[(DocId, u32)],
@@ -54,27 +49,7 @@ impl TermScorer {

    /// See `FreqReadingOption`.
    pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
-        self.postings.block_cursor.freq_reading_option()
-    }
-
-    /// Returns the maximum score for the current block.
-    ///
-    /// In some rare case, the result may not be exact. In this case a lower value is returned,
-    /// (and may lead us to return a lesser document).
-    ///
-    /// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the
-    /// score assuming the average fieldnorm computed on this segment.
-    ///
-    /// Though extremely rare, it is theoretically possible that the actual average fieldnorm
-    /// is different enough from the current segment average fieldnorm that the maximum over a
-    /// specific is achieved on a different document.
-    ///
-    /// (The result is on the other hand guaranteed to be correct if there is only one segment).
-    #[inline(always)]
-    fn block_max_score(&mut self) -> Score {
-        self.postings
-            .block_cursor
-            .block_max_score(&self.fieldnorm_reader, &self.similarity_weight)
+        self.postings.freq_reading_option()
    }

    pub fn term_freq(&self) -> u32 {
@@ -96,11 +71,16 @@ impl TermScorer {
    }

    pub fn last_doc_in_block(&self) -> DocId {
-        self.postings.block_cursor.skip_reader().last_doc_in_block()
+        self.postings.last_doc_in_block()
+    }
+
+    pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
+        self.postings
+            .seek_block(target_doc, &self.fieldnorm_reader, &self.similarity_weight)
    }
 }

-impl DocSet for TermScorer {
+impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
    #[inline]
    fn advance(&mut self) -> DocId {
        self.postings.advance()
@@ -282,8 +262,8 @@ mod tests {
            {
                let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
                for d in docs {
-                    term_scorer.seek_block(d);
-                    block_max_scores_b.push(term_scorer.block_max_score());
+                    let block_max_score = term_scorer.seek_block(d);
+                    block_max_scores_b.push(block_max_score);
                }
            }
            for (l, r) in block_max_scores
--- a/src/query/union/bitset_union.rs
+++ b/src/query/union/bitset_union.rs
@@ -1,7 +1,7 @@
 use std::cell::RefCell;

 use crate::docset::DocSet;
-use crate::postings::Postings;
+use crate::postings::{FreqReadingOption, Postings};
 use crate::query::BitSetDocSet;
 use crate::DocId;

@@ -46,6 +46,10 @@ impl<TDocSet: Postings> Postings for BitSetPostingUnion<TDocSet> {
        term_freq
    }

+    fn freq_reading_option(&self) -> FreqReadingOption {
+        FreqReadingOption::ReadFreq
+    }
+
    fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        let curr_doc = self.bitset.doc();
        let mut docsets = self.docsets.borrow_mut();
--- a/src/query/union/simple_union.rs
+++ b/src/query/union/simple_union.rs
@@ -1,5 +1,5 @@
 use crate::docset::{DocSet, TERMINATED};
-use crate::postings::Postings;
+use crate::postings::{FreqReadingOption, Postings};
 use crate::DocId;

 /// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`.
@@ -56,6 +56,10 @@ impl<TDocSet: Postings> Postings for SimpleUnion<TDocSet> {
        term_freq
    }

+    fn freq_reading_option(&self) -> FreqReadingOption {
+        FreqReadingOption::ReadFreq
+    }
+
    fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
        for docset in &mut self.docsets {
            let doc = docset.doc();