mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-26 20:19:57 +00:00
refactor postings (#2709)
Rename shallow_seek to seek_block and remove full_block from the public postings API. This is in preparation for optionally handling Bitsets in the postings.
This commit is contained in:
@@ -227,19 +227,6 @@ impl BlockSegmentPostings {
|
|||||||
self.doc_decoder.output_array()
|
self.doc_decoder.output_array()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a full block, regardless of whether the block is complete or incomplete (
|
|
||||||
/// as it happens for the last block of the posting list).
|
|
||||||
///
|
|
||||||
/// In the latter case, the block is guaranteed to be padded with the sentinel value:
|
|
||||||
/// `TERMINATED`. The array is also guaranteed to be aligned on 16 bytes = 128 bits.
|
|
||||||
///
|
|
||||||
/// This method is useful to run SSE2 linear search.
|
|
||||||
#[inline]
|
|
||||||
pub(crate) fn full_block(&self) -> &[DocId; COMPRESSION_BLOCK_SIZE] {
|
|
||||||
debug_assert!(self.block_is_loaded());
|
|
||||||
self.doc_decoder.full_output()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Return the document at index `idx` of the block.
|
/// Return the document at index `idx` of the block.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub fn doc(&self, idx: usize) -> u32 {
|
pub fn doc(&self, idx: usize) -> u32 {
|
||||||
@@ -275,22 +262,36 @@ impl BlockSegmentPostings {
|
|||||||
///
|
///
|
||||||
/// If all docs are smaller than target, the block loaded may be empty,
|
/// If all docs are smaller than target, the block loaded may be empty,
|
||||||
/// or be the last, incomplete VInt block.
|
/// or be the last, incomplete VInt block.
|
||||||
pub fn seek(&mut self, target_doc: DocId) {
|
pub fn seek(&mut self, target_doc: DocId) -> usize {
|
||||||
self.shallow_seek(target_doc);
|
// Move to the block that might contain our document.
|
||||||
|
self.seek_block(target_doc);
|
||||||
self.load_block();
|
self.load_block();
|
||||||
|
|
||||||
|
// At this point we are on the block that might contain our document.
|
||||||
|
let doc = self.doc_decoder.seek_within_block(target_doc);
|
||||||
|
|
||||||
|
// The last block is not full and padded with TERMINATED,
|
||||||
|
// so we are guaranteed to have at least one value (real or padding)
|
||||||
|
// that is >= target_doc.
|
||||||
|
debug_assert!(doc < COMPRESSION_BLOCK_SIZE);
|
||||||
|
|
||||||
|
// `doc` is now the first element >= `target_doc`.
|
||||||
|
// If all docs are smaller than target, the current block is incomplete and padded
|
||||||
|
// with TERMINATED. After the search, the cursor points to the first TERMINATED.
|
||||||
|
doc
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn position_offset(&self) -> u64 {
|
pub(crate) fn position_offset(&self) -> u64 {
|
||||||
self.skip_reader.position_offset()
|
self.skip_reader.position_offset()
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Dangerous API! This calls seek on the skip list,
|
/// Dangerous API! This seeks the next block on the skip list,
|
||||||
/// but does not `.load_block()` afterwards.
|
/// but does not `.load_block()` afterwards.
|
||||||
///
|
///
|
||||||
/// `.load_block()` needs to be called manually afterwards.
|
/// `.load_block()` needs to be called manually afterwards.
|
||||||
/// If all docs are smaller than target, the block loaded may be empty,
|
/// If all docs are smaller than target, the block loaded may be empty,
|
||||||
/// or be the last, incomplete VInt block.
|
/// or be the last, incomplete VInt block.
|
||||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||||
if self.skip_reader.seek(target_doc) {
|
if self.skip_reader.seek(target_doc) {
|
||||||
self.block_max_score_cache = None;
|
self.block_max_score_cache = None;
|
||||||
self.block_loaded = false;
|
self.block_loaded = false;
|
||||||
|
|||||||
@@ -151,9 +151,11 @@ impl BlockDecoder {
|
|||||||
&self.output[..self.output_len]
|
&self.output[..self.output_len]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Return in-block index of first value >= `target`.
|
||||||
|
/// Uses the padded buffer to enable branchless search.
|
||||||
#[inline]
|
#[inline]
|
||||||
pub(crate) fn full_output(&self) -> &[u32; COMPRESSION_BLOCK_SIZE] {
|
pub(crate) fn seek_within_block(&self, target: u32) -> usize {
|
||||||
&self.output
|
crate::postings::branchless_binary_search(&self.output, target)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use crate::docset::DocSet;
|
|||||||
use crate::fastfield::AliveBitSet;
|
use crate::fastfield::AliveBitSet;
|
||||||
use crate::positions::PositionReader;
|
use crate::positions::PositionReader;
|
||||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||||
use crate::postings::{branchless_binary_search, BlockSegmentPostings, Postings};
|
use crate::postings::{BlockSegmentPostings, Postings};
|
||||||
use crate::{DocId, TERMINATED};
|
use crate::{DocId, TERMINATED};
|
||||||
|
|
||||||
/// `SegmentPostings` represents the inverted list or postings associated with
|
/// `SegmentPostings` represents the inverted list or postings associated with
|
||||||
@@ -175,26 +175,11 @@ impl DocSet for SegmentPostings {
|
|||||||
return self.doc();
|
return self.doc();
|
||||||
}
|
}
|
||||||
|
|
||||||
self.block_cursor.seek(target);
|
// Delegate block-local search to BlockSegmentPostings::seek, which returns
|
||||||
|
// the in-block index of the first doc >= target.
|
||||||
// At this point we are on the block, that might contain our document.
|
self.cur = self.block_cursor.seek(target);
|
||||||
let output = self.block_cursor.full_block();
|
let doc = self.doc();
|
||||||
self.cur = branchless_binary_search(output, target);
|
|
||||||
|
|
||||||
// The last block is not full and padded with the value TERMINATED,
|
|
||||||
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
|
|
||||||
// that is greater or equal to the target.
|
|
||||||
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
|
|
||||||
|
|
||||||
// `doc` is now the first element >= `target`
|
|
||||||
|
|
||||||
// If all docs are smaller than target the current block should be incomplemented and padded
|
|
||||||
// with the value `TERMINATED`.
|
|
||||||
//
|
|
||||||
// After the search, the cursor should point to the first value of TERMINATED.
|
|
||||||
let doc = output[self.cur];
|
|
||||||
debug_assert!(doc >= target);
|
debug_assert!(doc >= target);
|
||||||
debug_assert_eq!(doc, self.doc());
|
|
||||||
doc
|
doc
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -167,7 +167,7 @@ pub fn block_wand(
|
|||||||
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
let block_max_score_upperbound: Score = scorers[..pivot_len]
|
||||||
.iter_mut()
|
.iter_mut()
|
||||||
.map(|scorer| {
|
.map(|scorer| {
|
||||||
scorer.shallow_seek(pivot_doc);
|
scorer.seek_block(pivot_doc);
|
||||||
scorer.block_max_score()
|
scorer.block_max_score()
|
||||||
})
|
})
|
||||||
.sum();
|
.sum();
|
||||||
@@ -234,7 +234,7 @@ pub fn block_wand_single_scorer(
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
doc = last_doc_in_block + 1;
|
doc = last_doc_in_block + 1;
|
||||||
scorer.shallow_seek(doc);
|
scorer.seek_block(doc);
|
||||||
}
|
}
|
||||||
// Seek will effectively load that block.
|
// Seek will effectively load that block.
|
||||||
doc = scorer.seek(doc);
|
doc = scorer.seek(doc);
|
||||||
@@ -256,7 +256,7 @@ pub fn block_wand_single_scorer(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
doc += 1;
|
doc += 1;
|
||||||
scorer.shallow_seek(doc);
|
scorer.seek_block(doc);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -25,8 +25,8 @@ impl TermScorer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn shallow_seek(&mut self, target_doc: DocId) {
|
pub(crate) fn seek_block(&mut self, target_doc: DocId) {
|
||||||
self.postings.block_cursor.shallow_seek(target_doc);
|
self.postings.block_cursor.seek_block(target_doc);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -175,7 +175,7 @@ mod tests {
|
|||||||
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
|
let fieldnorms: Vec<u32> = std::iter::repeat_n(10u32, 3_000).collect();
|
||||||
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
|
let mut term_scorer = TermScorer::create_for_test(&doc_and_tfs, &fieldnorms, bm25_weight);
|
||||||
assert_eq!(term_scorer.doc(), 0u32);
|
assert_eq!(term_scorer.doc(), 0u32);
|
||||||
term_scorer.shallow_seek(1289);
|
term_scorer.seek_block(1289);
|
||||||
assert_eq!(term_scorer.doc(), 0u32);
|
assert_eq!(term_scorer.doc(), 0u32);
|
||||||
term_scorer.seek(1289);
|
term_scorer.seek(1289);
|
||||||
assert_eq!(term_scorer.doc(), 1290);
|
assert_eq!(term_scorer.doc(), 1290);
|
||||||
@@ -242,9 +242,9 @@ mod tests {
|
|||||||
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
|
let bm25_weight = Bm25Weight::for_one_term(10, 129, 20.0);
|
||||||
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
|
let mut docs = TermScorer::create_for_test(&doc_tfs[..], &fieldnorms[..], bm25_weight);
|
||||||
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
|
assert_nearly_equals!(docs.block_max_score(), 2.5161593);
|
||||||
docs.shallow_seek(135);
|
docs.seek_block(135);
|
||||||
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
|
assert_nearly_equals!(docs.block_max_score(), 3.4597192);
|
||||||
docs.shallow_seek(256);
|
docs.seek_block(256);
|
||||||
// the block is not loaded yet.
|
// the block is not loaded yet.
|
||||||
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
|
assert_nearly_equals!(docs.block_max_score(), 5.2971773);
|
||||||
assert_eq!(256, docs.seek(256));
|
assert_eq!(256, docs.seek(256));
|
||||||
@@ -275,7 +275,7 @@ mod tests {
|
|||||||
{
|
{
|
||||||
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
|
let mut term_scorer = term_weight.specialized_scorer(reader, 1.0)?;
|
||||||
for d in docs {
|
for d in docs {
|
||||||
term_scorer.shallow_seek(d);
|
term_scorer.seek_block(d);
|
||||||
block_max_scores_b.push(term_scorer.block_max_score());
|
block_max_scores_b.push(term_scorer.block_max_score());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user