mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-23 02:29:57 +00:00
refactor postings (#2709)
Rename shallow_seek to seek_block and remove full_block from the public postings API. This is in preparation for optionally handling Bitsets in the postings.
This commit is contained in:
@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
/// Returns a full block, regardless of whether the block is complete or incomplete (
|
||||
/// as it happens for the last block of the posting list).
|
||||
///
|
||||
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
|
||||
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
|
||||
///
|
||||
/// This method is useful to run SSE2 linear search.
|
||||
#[inline]
|
||||
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
|
||||
debug_assert!(self.block_is_loaded());
|
||||
self.doc_decoder.full_output()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
|
||||
///
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) {
|
||||
self.shallow_seek(target_doc);
|
||||
pub fn seek(&mut self, target_doc: DocId) -> usize {
|
||||
// Move to the block that might contain our document.
|
||||
self.seek_block(target_doc);
|
||||
self.load_block();
|
||||
|
||||
// At this point we are on the block that might contain our document.
|
||||
let doc = self.doc_decoder.seek_within_block(target_doc);
|
||||
|
||||
// The last block is not full and padded with TERMINATED,
|
||||
// so we are guaranteed to have at least one value (real or padding)
|
||||
// that is >= target_doc.
|
||||
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target_doc`.
|
||||
// If all docs are smaller than target, the current block is incomplete and padded
|
||||
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
|
||||
doc
|
||||
}
|
||||
|
||||
pub(crate) fn position_offset(&self) -> u64 {
|
||||
self.skip_reader.position_offset()
|
||||
}
|
||||
|
||||
/// Dangerous API! This calls seek on the skip list,
|
||||
/// Dangerous API! This seeks to the next block on the skip list,
|
||||
/// but does not `.load_block()` afterwards.
|
||||
///
|
||||
/// `.load_block()` needs to be called manually afterwards.
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||
if self.skip_reader.seek(target_doc) {
|
||||
self.block_max_score_cache = None;
|
||||
self.block_loaded = false;
|
||||
|
||||
@@ -151,9 +151,11 @@ impl BlockDecoder {
|
||||
&self.output[..self.output_len]
|
||||
}
|
||||
|
||||
/// Return in-block index of first value >= `target`.
|
||||
/// Uses the padded buffer to enable branchless search.
|
||||
#[inline]
|
||||
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
|
||||
&self.output
|
||||
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
|
||||
crate::postings::branchless_binary_search(&self.output, target)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -4,7 +4,7 @@ use crate::docset::DocSet;
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::positions::PositionReader;
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
|
||||
use crate::postings::{BlockSegmentPostings, Postings};
|
||||
use crate::{DocId, TERMINATED};
|
||||
|
||||
/// `SegmentPostings` represents the inverted list or postings associated with
|
||||
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
|
||||
return self.doc();
|
||||
}
|
||||
|
||||
self.block_cursor.seek(target);
|
||||
|
||||
// At this point we are on the block, that might contain our document.
|
||||
let output = self.block_cursor.full_block();
|
||||
self.cur = branchless_binary_search(output, target);
|
||||
|
||||
// The last block is not full and padded with the value TERMINATED,
|
||||
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
|
||||
// that is greater or equal to the target.
|
||||
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
|
||||
// If all docs are smaller than target the current block should be incomplete and padded
|
||||
// with the value `TERMINATED`.
|
||||
//
|
||||
// After the search, the cursor should point to the first value of TERMINATED.
|
||||
let doc = output[self.cur];
|
||||
// Delegate block-local search to BlockSegmentPostings::seek, which returns
|
||||
// the in-block index of the first doc >= target.
|
||||
self.cur = self.block_cursor.seek(target);
|
||||
let doc = self.doc();
|
||||
debug_assert!(doc >= target);
|
||||
debug_assert_eq!(doc, self.doc());
|
||||
doc
|
||||
}
|
||||
|
||||
|
||||
@@ -167,7 +167,7 @@ pub fn block_wand(
|
||||
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
||||
.iter_mut()
|
||||
.map(|scorer| {
|
||||
scorer.shallow_seek(pivot_doc);
|
||||
scorer.seek_block(pivot_doc);
|
||||
scorer.block_max_score()
|
||||
})
|
||||
.sum();
|
||||
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
|
||||
return;
|
||||
}
|
||||
doc = last_doc_in_block + 1;
|
||||
scorer.shallow_seek(doc);
|
||||
scorer.seek_block(doc);
|
||||
}
|
||||
// Seek will effectively load that block.
|
||||
doc = scorer.seek(doc);
|
||||
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
|
||||
}
|
||||
}
|
||||
doc += 1;
|
||||
scorer.shallow_seek(doc);
|
||||
scorer.seek_block(doc);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -25,8 +25,8 @@ impl TermScorer {
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
||||
self.postings.block_cursor.shallow_seek(target_doc);
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||
self.postings.block_cursor.seek_block(target_doc);
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
@@ -175,7 +175,7 @@ mod tests {
|
||||
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
|
||||
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
term_scorer.shallow_seek(1289);
|
||||
term_scorer.seek_block(1289);
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
term_scorer.seek(1289);
|
||||
assert_eq!(term_scorer.doc(), 1290);
|
||||
@@ -242,9 +242,9 @@ mod tests {
|
||||
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
|
||||
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
|
||||
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
|
||||
docs.shallow_seek(135);
|
||||
docs.seek_block(135);
|
||||
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
|
||||
docs.shallow_seek(256);
|
||||
docs.seek_block(256);
|
||||
// the block is not loaded yet.
|
||||
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
|
||||
assert_eq!(256, docs.seek(256));
|
||||
@@ -275,7 +275,7 @@ mod tests {
|
||||
{
|
||||
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
|
||||
for d in docs {
|
||||
term_scorer.shallow_seek(d);
|
||||
term_scorer.seek_block(d);
|
||||
block_max_scores_b.push(term_scorer.block_max_score());
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user