refactor postings (#2709)

rename shallow_seek to seek_block
remove full_block from public postings API

This is as preparation to optionally handle Bitsets in the postings
This commit is contained in:
PSeitz
2025-10-08 16:55:25 +02:00
committed by GitHub
parent 714366d3b9
commit 270ca5123c
5 changed files with 36 additions and 48 deletions

View File

@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
/// Returns a full block, regardless of whether the block is complete or incomplete (
/// as it happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block.
#[inline]
pub fn doc(&self, idx: usize) -> u32 {
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
///
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last an incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) {
self.shallow_seek(target_doc);
pub fn seek(&mut self, target_doc: DocId) -> usize {
// Move to the block that might contain our document.
self.seek_block(target_doc);
self.load_block();
// At this point we are on the block that might contain our document.
let doc = self.doc_decoder.seek_within_block(target_doc);
// The last block is not full and padded with TERMINATED,
// so we are guaranteed to have at least one value (real or padding)
// that is >= target_doc.
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target_doc`.
// If all docs are smaller than target, the current block is incomplete and padded
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
doc
}
pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset()
}
/// Dangerous API! This calls seek on the skip list,
/// Dangerous API! This calls seeks the next block on the skip list,
/// but does not `.load_block()` afterwards.
///
/// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last an incomplete VInt block.
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None;
self.block_loaded = false;

View File

@@ -151,9 +151,11 @@ impl BlockDecoder {
&self.output[..self.output_len]
}
/// Return in-block index of first value >= `target`.
/// Uses the padded buffer to enable branchless search.
#[inline]
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
&self.output
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
crate::postings::branchless_binary_search(&self.output, target)
}
#[inline]

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
use crate::postings::{BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
return self.doc();
}
self.block_cursor.seek(target);
// At this point we are on the block, that might contain our document.
let output = self.block_cursor.full_block();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
// that is greater or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`
// If all docs are smaller than target the current block should be incomplemented and padded
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output[self.cur];
// Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
self.cur = self.block_cursor.seek(target);
let doc = self.doc();
debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc
}

View File

@@ -167,7 +167,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut()
.map(|scorer| {
scorer.shallow_seek(pivot_doc);
scorer.seek_block(pivot_doc);
scorer.block_max_score()
})
.sum();
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
return;
}
doc = last_doc_in_block + 1;
scorer.shallow_seek(doc);
scorer.seek_block(doc);
}
// Seek will effectively load that block.
doc = scorer.seek(doc);
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
}
}
doc += 1;
scorer.shallow_seek(doc);
scorer.seek_block(doc);
}
}

View File

@@ -25,8 +25,8 @@ impl TermScorer {
}
}
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
self.postings.block_cursor.shallow_seek(target_doc);
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
self.postings.block_cursor.seek_block(target_doc);
}
#[cfg(test)]
@@ -175,7 +175,7 @@ mod tests {
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.shallow_seek(1289);
term_scorer.seek_block(1289);
assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1289);
assert_eq!(term_scorer.doc(), 1290);
@@ -242,9 +242,9 @@ mod tests {
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.shallow_seek(135);
docs.seek_block(135);
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
docs.shallow_seek(256);
docs.seek_block(256);
// the block is not loaded yet.
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
assert_eq!(256, docs.seek(256));
@@ -275,7 +275,7 @@ mod tests {
{
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
for d in docs {
term_scorer.shallow_seek(d);
term_scorer.seek_block(d);
block_max_scores_b.push(term_scorer.block_max_score());
}
}