From 799e88adbd0de888929b387ed8ca675cb174b4bc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 13 Jan 2026 20:21:22 +0100 Subject: [PATCH] blop --- src/codec/postings/mod.rs | 7 ++++ src/codec/standard/postings/mod.rs | 8 ++-- ...ostings.rs => standard_postings_reader.rs} | 39 +++++++++++++++++++ ...zer.rs => standard_postings_serializer.rs} | 0 src/postings/block_segment_postings.rs | 18 ++++----- src/postings/mod.rs | 2 +- src/postings/segment_postings.rs | 1 - 7 files changed, 58 insertions(+), 17 deletions(-) rename src/codec/standard/postings/{block_segment_postings.rs => standard_postings_reader.rs} (91%) rename src/codec/standard/postings/{postings_serializer.rs => standard_postings_serializer.rs} (100%) diff --git a/src/codec/postings/mod.rs b/src/codec/postings/mod.rs index ca09c9886..1a547eea1 100644 --- a/src/codec/postings/mod.rs +++ b/src/codec/postings/mod.rs @@ -4,6 +4,7 @@ use common::OwnedBytes; use crate::fieldnorm::FieldNormReader; use crate::postings::FreqReadingOption; +use crate::query::Bm25Weight; use crate::schema::IndexRecordOption; use crate::{DocId, Score}; @@ -70,4 +71,10 @@ pub trait PostingsReader: Sized { // TODO Move to the codec and use the serializer. 
fn empty() -> Self; + + fn block_max_score( + &mut self, + fieldnorm_reader: &FieldNormReader, + bm25_weight: &Bm25Weight, + ) -> Score; } diff --git a/src/codec/standard/postings/mod.rs b/src/codec/standard/postings/mod.rs index 745e6b507..74baa746c 100644 --- a/src/codec/standard/postings/mod.rs +++ b/src/codec/standard/postings/mod.rs @@ -4,12 +4,12 @@ use crate::schema::IndexRecordOption; use crate::Score; mod block; -mod block_segment_postings; -mod postings_serializer; +mod standard_postings_reader; +mod standard_postings_serializer; mod skip; -pub use block_segment_postings::StandardPostingsReader; -pub use postings_serializer::StandardPostingsSerializer; +pub use standard_postings_reader::StandardPostingsReader; +pub use standard_postings_serializer::StandardPostingsSerializer; pub struct StandardPostingsCodec; diff --git a/src/codec/standard/postings/block_segment_postings.rs b/src/codec/standard/postings/standard_postings_reader.rs similarity index 91% rename from src/codec/standard/postings/block_segment_postings.rs rename to src/codec/standard/postings/standard_postings_reader.rs index d6a7b4633..d9f2b9ca4 100644 --- a/src/codec/standard/postings/block_segment_postings.rs +++ b/src/codec/standard/postings/standard_postings_reader.rs @@ -269,6 +269,45 @@ impl PostingsReader for StandardPostingsReader { skip_reader: SkipReader::new(OwnedBytes::empty(), 0, IndexRecordOption::Basic), } } + + /// Returns the block_max_score for the current block. + /// It does not require the block to be loaded. For instance, it is ok to call this method + /// after having called `.shallow_advance(..)`. + /// + /// See `TermScorer::block_max_score(..)` for more information. 
+ fn block_max_score( + &mut self, + fieldnorm_reader: &FieldNormReader, + bm25_weight: &Bm25Weight, + ) -> Score { + if let Some(score) = self.block_max_score_cache { + return score; + } + if let Some(skip_reader_max_score) = self.skip_reader.block_max_score(bm25_weight) { + // if we are on a full block, the skip reader should have the block max information + // for us + self.block_max_score_cache = Some(skip_reader_max_score); + return skip_reader_max_score; + } + // this is the last block of the segment posting list. + // If it is actually loaded, we can compute block max manually. + if self.block_loaded { + let docs = self.doc_decoder.output_array().iter().cloned(); + let freqs = self.freq_decoder.output_array().iter().cloned(); + let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| { + let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc); + bm25_weight.score(fieldnorm_id, term_freq) + }); + let block_max_score = max_score(bm25_scores).unwrap_or(0.0); + self.block_max_score_cache = Some(block_max_score); + return block_max_score; + } + // We do not have access to any good block max value. We return bm25_weight.max_score() + // as it is a valid upper bound. + // + // We do not cache it however, so that it gets computed once the block is loaded. + bm25_weight.max_score() + } } impl StandardPostingsReader { diff --git a/src/codec/standard/postings/postings_serializer.rs b/src/codec/standard/postings/standard_postings_serializer.rs similarity index 100% rename from src/codec/standard/postings/postings_serializer.rs rename to src/codec/standard/postings/standard_postings_serializer.rs diff --git a/src/postings/block_segment_postings.rs b/src/postings/block_segment_postings.rs index 26646cdbd..bdbf9418d 100644 --- a/src/postings/block_segment_postings.rs +++ b/src/postings/block_segment_postings.rs @@ -159,7 +159,7 @@ impl BlockSegmentPostings { } // this is the last block of the segment posting list. 
// If it is actually loaded, we can compute block max manually. - if self.block_is_loaded() { + if self.block_loaded { let docs = self.doc_decoder.output_array().iter().cloned(); let freqs = self.freq_decoder.output_array().iter().cloned(); let bm25_scores = docs.zip(freqs).map(|(doc, term_freq)| { @@ -222,7 +222,7 @@ impl BlockSegmentPostings { /// returned by `.docs()` is empty. #[inline] pub fn docs(&self) -> &[DocId] { - debug_assert!(self.block_is_loaded()); + debug_assert!(self.block_loaded); self.doc_decoder.output_array() } @@ -235,14 +235,14 @@ impl BlockSegmentPostings { /// Return the array of `term freq` in the block. #[inline] pub fn freqs(&self) -> &[u32] { - debug_assert!(self.block_is_loaded()); + debug_assert!(self.block_loaded); self.freq_decoder.output_array() } /// Return the frequency at index `idx` of the block. #[inline] pub fn freq(&self, idx: usize) -> u32 { - debug_assert!(self.block_is_loaded()); + debug_assert!(self.block_loaded); self.freq_decoder.output(idx) } @@ -253,7 +253,7 @@ impl BlockSegmentPostings { /// of any number between 1 and `NUM_DOCS_PER_BLOCK - 1` #[inline] pub fn block_len(&self) -> usize { - debug_assert!(self.block_is_loaded()); + debug_assert!(self.block_loaded); self.doc_decoder.output_len } @@ -297,13 +297,9 @@ impl BlockSegmentPostings { } } - pub(crate) fn block_is_loaded(&self) -> bool { - self.block_loaded - } - - pub(crate) fn load_block(&mut self) { + fn load_block(&mut self) { let offset = self.skip_reader.byte_offset(); - if self.block_is_loaded() { + if self.block_loaded { return; } match self.skip_reader.block_info() { diff --git a/src/postings/mod.rs b/src/postings/mod.rs index 57ad2e0de..74723f7e0 100644 --- a/src/postings/mod.rs +++ b/src/postings/mod.rs @@ -35,7 +35,7 @@ pub use self::term_info::TermInfo; #[expect(clippy::enum_variant_names)] #[derive(Debug, PartialEq, Clone, Copy, Eq)] -pub(crate) enum FreqReadingOption { +pub enum FreqReadingOption { NoFreq, SkipFreq, ReadFreq, diff --git 
a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 7638561ff..f2ebf5587 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -166,7 +166,6 @@ impl DocSet for SegmentPostings { // next needs to be called a first time to point to the correct element. #[inline] fn advance(&mut self) -> DocId { - debug_assert!(self.block_cursor.block_is_loaded()); if self.cur == COMPRESSION_BLOCK_SIZE - 1 { self.cur = 0; self.block_cursor.advance();