mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 08:00:41 +00:00
Generic TermScorer
This commit is contained in:
@@ -101,6 +101,19 @@ impl Postings for LoadedPostings {
|
||||
output.push(*pos + offset);
|
||||
}
|
||||
}
|
||||
|
||||
fn seek_block(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &crate::fieldnorm::FieldNormReader,
|
||||
similarity_weight: &crate::query::Bm25Weight,
|
||||
) -> crate::Score {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> super::FreqReadingOption {
|
||||
super::FreqReadingOption::ReadFreq
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -1,4 +1,8 @@
|
||||
use crate::docset::DocSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::FreqReadingOption;
|
||||
use crate::query::{Bm25Weight, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
/// Postings (also called inverted list)
|
||||
///
|
||||
@@ -11,6 +15,8 @@ use crate::docset::DocSet;
|
||||
/// but other implementations mocking `SegmentPostings` exist,
|
||||
/// for merging segments or for testing.
|
||||
pub trait Postings: DocSet + 'static {
|
||||
fn new_term_scorer(self: Box<Self>, fieldnorm_reader: &FieldNormReader, similarity_weight: &Bm25Weight) -> Box<dyn Scorer>;
|
||||
|
||||
/// The number of times the term appears in the document.
|
||||
fn term_freq(&self) -> u32;
|
||||
|
||||
@@ -31,6 +37,30 @@ pub trait Postings: DocSet + 'static {
|
||||
fn positions(&mut self, output: &mut Vec<u32>) {
|
||||
self.positions_with_offset(0u32, output);
|
||||
}
|
||||
|
||||
// supports Block-Wand
|
||||
fn supports_block_max(&self) -> bool {
|
||||
false
|
||||
}
|
||||
|
||||
// TODO document
|
||||
// Only allowed for block max.
|
||||
fn seek_block(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
similarity_weight: &Bm25Weight,
|
||||
) -> Score {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
// TODO
|
||||
// Only allowed for block max.
|
||||
fn last_doc_in_block(&self) -> crate::DocId {
|
||||
unimplemented!()
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> FreqReadingOption;
|
||||
}
|
||||
|
||||
impl Postings for Box<dyn Postings> {
|
||||
@@ -41,4 +71,25 @@ impl Postings for Box<dyn Postings> {
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
(**self).append_positions_with_offset(offset, output);
|
||||
}
|
||||
|
||||
fn supports_block_max(&self) -> bool {
|
||||
(**self).supports_block_max()
|
||||
}
|
||||
|
||||
fn seek_block(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
similarity_weight: &Bm25Weight,
|
||||
) -> Score {
|
||||
(**self).seek_block(target_doc, fieldnorm_reader, similarity_weight)
|
||||
}
|
||||
|
||||
fn last_doc_in_block(&self) -> crate::DocId {
|
||||
(**self).last_doc_in_block()
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
(**self).freq_reading_option()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,10 +3,12 @@ use common::HasLen;
|
||||
use crate::codec::postings::PostingsReader;
|
||||
use crate::docset::DocSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::{BlockSegmentPostings, Postings};
|
||||
use crate::{DocId, TERMINATED};
|
||||
use crate::postings::{BlockSegmentPostings, FreqReadingOption, Postings};
|
||||
use crate::query::Bm25Weight;
|
||||
use crate::{DocId, Score, TERMINATED};
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated with
|
||||
/// a term in a `Segment`.
|
||||
@@ -252,6 +254,29 @@ impl Postings for SegmentPostings {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn supports_block_max(&self) -> bool {
|
||||
true
|
||||
}
|
||||
|
||||
fn seek_block(
|
||||
&mut self,
|
||||
target_doc: crate::DocId,
|
||||
fieldnorm_reader: &FieldNormReader,
|
||||
similarity_weight: &Bm25Weight,
|
||||
) -> Score {
|
||||
self.block_cursor.seek_block(target_doc);
|
||||
self.block_cursor
|
||||
.block_max_score(&fieldnorm_reader, &similarity_weight)
|
||||
}
|
||||
|
||||
fn last_doc_in_block(&self) -> crate::DocId {
|
||||
self.block_cursor.skip_reader().last_doc_in_block()
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
self.block_cursor.freq_reading_option()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
||||
@@ -7,18 +7,18 @@ use crate::query::{Explanation, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct TermScorer {
|
||||
postings: SegmentPostings,
|
||||
pub struct TermScorer<TPostings: Postings = SegmentPostings> {
|
||||
postings: TPostings,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: Bm25Weight,
|
||||
}
|
||||
|
||||
impl TermScorer {
|
||||
impl<TPostings: Postings> TermScorer<TPostings> {
|
||||
pub fn new(
|
||||
postings: SegmentPostings,
|
||||
postings: TPostings,
|
||||
fieldnorm_reader: FieldNormReader,
|
||||
similarity_weight: Bm25Weight,
|
||||
) -> TermScorer {
|
||||
) -> TermScorer<TPostings> {
|
||||
TermScorer {
|
||||
postings,
|
||||
fieldnorm_reader,
|
||||
@@ -26,11 +26,6 @@ impl TermScorer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
|
||||
self.postings.block_cursor.seek_block(target_doc);
|
||||
self.block_max_score()
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub fn create_for_test(
|
||||
doc_and_tfs: &[(DocId, u32)],
|
||||
@@ -54,27 +49,7 @@ impl TermScorer {
|
||||
|
||||
/// See `FreqReadingOption`.
|
||||
pub(crate) fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
self.postings.block_cursor.freq_reading_option()
|
||||
}
|
||||
|
||||
/// Returns the maximum score for the current block.
|
||||
///
|
||||
/// In some rare case, the result may not be exact. In this case a lower value is returned,
|
||||
/// (and may lead us to return a lesser document).
|
||||
///
|
||||
/// At index time, we store the (fieldnorm_id, term frequency) pair that maximizes the
|
||||
/// score assuming the average fieldnorm computed on this segment.
|
||||
///
|
||||
/// Though extremely rare, it is theoretically possible that the actual average fieldnorm
|
||||
/// is different enough from the current segment average fieldnorm that the maximum over a
|
||||
/// specific is achieved on a different document.
|
||||
///
|
||||
/// (The result is on the other hand guaranteed to be correct if there is only one segment).
|
||||
#[inline(always)]
|
||||
fn block_max_score(&mut self) -> Score {
|
||||
self.postings
|
||||
.block_cursor
|
||||
.block_max_score(&self.fieldnorm_reader, &self.similarity_weight)
|
||||
self.postings.freq_reading_option()
|
||||
}
|
||||
|
||||
pub fn term_freq(&self) -> u32 {
|
||||
@@ -96,11 +71,16 @@ impl TermScorer {
|
||||
}
|
||||
|
||||
pub fn last_doc_in_block(&self) -> DocId {
|
||||
self.postings.block_cursor.skip_reader().last_doc_in_block()
|
||||
self.postings.last_doc_in_block()
|
||||
}
|
||||
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) -> Score {
|
||||
self.postings
|
||||
.seek_block(target_doc, &self.fieldnorm_reader, &self.similarity_weight)
|
||||
}
|
||||
}
|
||||
|
||||
impl DocSet for TermScorer {
|
||||
impl<TPostings: Postings> DocSet for TermScorer<TPostings> {
|
||||
#[inline]
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.postings.advance()
|
||||
@@ -282,8 +262,8 @@ mod tests {
|
||||
{
|
||||
let mut term_scorer = term_weight.term_scorer_for_test(reader, 1.0)?.unwrap();
|
||||
for d in docs {
|
||||
term_scorer.seek_block(d);
|
||||
block_max_scores_b.push(term_scorer.block_max_score());
|
||||
let block_max_score = term_scorer.seek_block(d);
|
||||
block_max_scores_b.push(block_max_score);
|
||||
}
|
||||
}
|
||||
for (l, r) in block_max_scores
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
use std::cell::RefCell;
|
||||
|
||||
use crate::docset::DocSet;
|
||||
use crate::postings::Postings;
|
||||
use crate::postings::{FreqReadingOption, Postings};
|
||||
use crate::query::BitSetDocSet;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -46,6 +46,10 @@ impl<TDocSet: Postings> Postings for BitSetPostingUnion<TDocSet> {
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
FreqReadingOption::ReadFreq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
let curr_doc = self.bitset.doc();
|
||||
let mut docsets = self.docsets.borrow_mut();
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::Postings;
|
||||
use crate::postings::{FreqReadingOption, Postings};
|
||||
use crate::DocId;
|
||||
|
||||
/// A `SimpleUnion` is a `DocSet` that is the union of multiple `DocSet`.
|
||||
@@ -56,6 +56,10 @@ impl<TDocSet: Postings> Postings for SimpleUnion<TDocSet> {
|
||||
term_freq
|
||||
}
|
||||
|
||||
fn freq_reading_option(&self) -> FreqReadingOption {
|
||||
FreqReadingOption::ReadFreq
|
||||
}
|
||||
|
||||
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
|
||||
for docset in &mut self.docsets {
|
||||
let doc = docset.doc();
|
||||
|
||||
Reference in New Issue
Block a user