refactor postings (#2709)

rename shallow_seek to seek_block
remove full_block from public postings API

This is in preparation for optionally handling Bitsets in the postings.
This commit is contained in:
PSeitz
2025-10-08 16:55:25 +02:00
committed by GitHub
parent 714366d3b9
commit 270ca5123c
5 changed files with 36 additions and 48 deletions

View File

@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array() self.doc_decoder.output_array()
} }
/// Returns a full block, regardless of whether the block is complete or incomplete (
/// as it happens for the last block of the posting list).
///
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
///
/// This method is useful to run SSE2 linear search.
#[inline]
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
debug_assert!(self.block_is_loaded());
self.doc_decoder.full_output()
}
/// Return the document at index `idx` of the block. /// Return the document at index `idx` of the block.
#[inline] #[inline]
pub fn doc(&self, idx: usize) -> u32 { pub fn doc(&self, idx: usize) -> u32 {
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
/// ///
/// If all docs are smaller than target, the block loaded may be empty, /// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block. /// or be the last, incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) { pub fn seek(&mut self, target_doc: DocId) -> usize {
self.shallow_seek(target_doc); // Move to the block that might contain our document.
self.seek_block(target_doc);
self.load_block(); self.load_block();
// At this point we are on the block that might contain our document.
let doc = self.doc_decoder.seek_within_block(target_doc);
// The last block is not full and padded with TERMINATED,
// so we are guaranteed to have at least one value (real or padding)
// that is >= target_doc.
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target_doc`.
// If all docs are smaller than target, the current block is incomplete and padded
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
doc
} }
pub(crate) fn position_offset(&self) -> u64 { pub(crate) fn position_offset(&self) -> u64 {
self.skip_reader.position_offset() self.skip_reader.position_offset()
} }
/// Dangerous API! This calls seek on the skip list, /// Dangerous API! This seeks the next block on the skip list,
/// but does not `.load_block()` afterwards. /// but does not `.load_block()` afterwards.
/// ///
/// `.load_block()` needs to be called manually afterwards. /// `.load_block()` needs to be called manually afterwards.
/// If all docs are smaller than target, the block loaded may be empty, /// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block. /// or be the last, incomplete VInt block.
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) { pub(crate) fn seek_block(&mut self, target_doc: DocId) {
if self.skip_reader.seek(target_doc) { if self.skip_reader.seek(target_doc) {
self.block_max_score_cache = None; self.block_max_score_cache = None;
self.block_loaded = false; self.block_loaded = false;

View File

@@ -151,9 +151,11 @@ impl BlockDecoder {
&self.output[..self.output_len] &self.output[..self.output_len]
} }
/// Return in-block index of first value >= `target`.
/// Uses the padded buffer to enable branchless search.
#[inline] #[inline]
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] { pub(crate) fn seek_within_block(&self, target: u32) -> usize {
&self.output crate::postings::branchless_binary_search(&self.output, target)
} }
#[inline] #[inline]

View File

@@ -4,7 +4,7 @@ use crate::docset::DocSet;
use crate::fastfield::AliveBitSet; use crate::fastfield::AliveBitSet;
use crate::positions::PositionReader; use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE; use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings}; use crate::postings::{BlockSegmentPostings, Postings};
use crate::{DocId, TERMINATED}; use crate::{DocId, TERMINATED};
/// `SegmentPostings` represents the inverted list or postings associated with /// `SegmentPostings` represents the inverted list or postings associated with
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
return self.doc(); return self.doc();
} }
self.block_cursor.seek(target); // Delegate block-local search to BlockSegmentPostings::seek, which returns
// the in-block index of the first doc >= target.
// At this point we are on the block, that might contain our document. self.cur = self.block_cursor.seek(target);
let output = self.block_cursor.full_block(); let doc = self.doc();
self.cur = branchless_binary_search(output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
// that is greater or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`
// If all docs are smaller than target the current block should be incomplemented and padded
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output[self.cur];
debug_assert!(doc >= target); debug_assert!(doc >= target);
debug_assert_eq!(doc, self.doc());
doc doc
} }

View File

@@ -167,7 +167,7 @@ pub fn block_wand(
let block_max_score_upperbound: Score = scorers[..pivot_len] let block_max_score_upperbound: Score = scorers[..pivot_len]
.iter_mut() .iter_mut()
.map(|scorer| { .map(|scorer| {
scorer.shallow_seek(pivot_doc); scorer.seek_block(pivot_doc);
scorer.block_max_score() scorer.block_max_score()
}) })
.sum(); .sum();
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
return; return;
} }
doc = last_doc_in_block + 1; doc = last_doc_in_block + 1;
scorer.shallow_seek(doc); scorer.seek_block(doc);
} }
// Seek will effectively load that block. // Seek will effectively load that block.
doc = scorer.seek(doc); doc = scorer.seek(doc);
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
} }
} }
doc += 1; doc += 1;
scorer.shallow_seek(doc); scorer.seek_block(doc);
} }
} }

View File

@@ -25,8 +25,8 @@ impl TermScorer {
} }
} }
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) { pub(crate) fn seek_block(&mut self, target_doc: DocId) {
self.postings.block_cursor.shallow_seek(target_doc); self.postings.block_cursor.seek_block(target_doc);
} }
#[cfg(test)] #[cfg(test)]
@@ -175,7 +175,7 @@ mod tests {
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect(); let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight); let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
assert_eq!(term_scorer.doc(), 0u32); assert_eq!(term_scorer.doc(), 0u32);
term_scorer.shallow_seek(1289); term_scorer.seek_block(1289);
assert_eq!(term_scorer.doc(), 0u32); assert_eq!(term_scorer.doc(), 0u32);
term_scorer.seek(1289); term_scorer.seek(1289);
assert_eq!(term_scorer.doc(), 1290); assert_eq!(term_scorer.doc(), 1290);
@@ -242,9 +242,9 @@ mod tests {
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0); let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight); let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
assert_nearly_equals!(docs.block_max_score(), 2.5161593); assert_nearly_equals!(docs.block_max_score(), 2.5161593);
docs.shallow_seek(135); docs.seek_block(135);
assert_nearly_equals!(docs.block_max_score(), 3.4597192); assert_nearly_equals!(docs.block_max_score(), 3.4597192);
docs.shallow_seek(256); docs.seek_block(256);
// the block is not loaded yet. // the block is not loaded yet.
assert_nearly_equals!(docs.block_max_score(), 5.2971773); assert_nearly_equals!(docs.block_max_score(), 5.2971773);
assert_eq!(256, docs.seek(256)); assert_eq!(256, docs.seek(256));
@@ -275,7 +275,7 @@ mod tests {
{ {
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?; let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
for d in docs { for d in docs {
term_scorer.shallow_seek(d); term_scorer.seek_block(d);
block_max_scores_b.push(term_scorer.block_max_score()); block_max_scores_b.push(term_scorer.block_max_score());
} }
} }