mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-26 05:00:41 +00:00
Simplification of the segment postings seek implementation. (#834)
This commit is contained in:
@@ -346,7 +346,7 @@ impl IndexWriter {
|
||||
|
||||
fn drop_sender(&mut self) {
|
||||
let (sender, _receiver) = channel::bounded(1);
|
||||
mem::replace(&mut self.operation_sender, sender);
|
||||
self.operation_sender = sender;
|
||||
}
|
||||
|
||||
/// If there are some merging threads, blocks until they all finish their work and
|
||||
|
||||
@@ -87,6 +87,7 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
|
||||
(begin, end)
|
||||
}
|
||||
|
||||
#[inline(never)]
|
||||
fn galloping(block_docs: &[u32], target: u32) -> usize {
|
||||
let (start, end) = exponential_search(&block_docs, target);
|
||||
start + linear_search(&block_docs[start..end], target)
|
||||
@@ -133,19 +134,14 @@ impl BlockSearcher {
|
||||
///
|
||||
/// Indeed, if the block is not full, the remaining items are TERMINATED.
|
||||
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
|
||||
pub(crate) fn search_in_block(
|
||||
self,
|
||||
block_docs: &AlignedBuffer,
|
||||
start: usize,
|
||||
target: u32,
|
||||
) -> usize {
|
||||
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
|
||||
#[cfg(target_arch = "x86_64")]
|
||||
{
|
||||
if self == BlockSearcher::SSE2 {
|
||||
return sse2::linear_search_sse2_128(block_docs, target);
|
||||
}
|
||||
}
|
||||
start + galloping(&block_docs.0[start..], target)
|
||||
galloping(&block_docs.0[..], target)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -199,12 +195,10 @@ mod tests {
|
||||
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
|
||||
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
|
||||
output_buffer[..block.len()].copy_from_slice(block);
|
||||
for i in 0..cursor {
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(&AlignedBuffer(output_buffer), i, target),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
assert_eq!(
|
||||
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
|
||||
cursor
|
||||
);
|
||||
}
|
||||
|
||||
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {
|
||||
|
||||
@@ -139,12 +139,13 @@ impl BlockSegmentPostings {
|
||||
self.doc_decoder.output_array()
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
|
||||
self.doc_decoder.output_aligned()
|
||||
}
|
||||
|
||||
/// Return the document at index `idx` of the block.
|
||||
#[inline]
|
||||
#[inline(always)]
|
||||
pub fn doc(&self, idx: usize) -> u32 {
|
||||
self.doc_decoder.output(idx)
|
||||
}
|
||||
@@ -177,23 +178,12 @@ impl BlockSegmentPostings {
|
||||
|
||||
/// Position on a block that may contains `target_doc`.
|
||||
///
|
||||
/// If the current block last element is greater or equal to `target_doc`, return true.
|
||||
///
|
||||
/// Returns true if a block that has an element greater or equal to the target is found.
|
||||
/// Returning true does not guarantee that the smallest element of the block is smaller
|
||||
/// than the target. It only guarantees that the last element is greater or equal.
|
||||
///
|
||||
/// Returns false iff all of the document remaining are smaller than
|
||||
/// `doc_id`. In that case, all of these document are consumed.
|
||||
pub fn seek(&mut self, target_doc: DocId) -> bool {
|
||||
self.skip_reader.seek(target_doc);
|
||||
self.read_block();
|
||||
|
||||
// The last block last doc may actually stop before the target.
|
||||
self.docs()
|
||||
.last()
|
||||
.map(|last_doc| *last_doc >= target_doc)
|
||||
.unwrap_or(false)
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last an incomplete VInt block.
|
||||
pub fn seek(&mut self, target_doc: DocId) {
|
||||
if self.skip_reader.seek(target_doc) {
|
||||
self.read_block();
|
||||
}
|
||||
}
|
||||
|
||||
fn read_block(&mut self) {
|
||||
@@ -263,6 +253,7 @@ mod tests {
|
||||
use crate::common::HasLen;
|
||||
use crate::core::Index;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::postings::postings::Postings;
|
||||
use crate::postings::SegmentPostings;
|
||||
use crate::schema::IndexRecordOption;
|
||||
@@ -373,17 +364,6 @@ mod tests {
|
||||
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip() {
|
||||
for i in 0..4 {
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(block_postings.seek(i), true);
|
||||
assert_eq!(block_postings.seek(i), true);
|
||||
}
|
||||
let mut block_postings = build_block_postings(&[3]);
|
||||
assert_eq!(block_postings.seek(4u32), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_block_segment_postings_skip2() {
|
||||
let mut docs = vec![0];
|
||||
@@ -392,13 +372,13 @@ mod tests {
|
||||
}
|
||||
let mut block_postings = build_block_postings(&docs[..]);
|
||||
for i in vec![0, 424, 10000] {
|
||||
assert!(block_postings.seek(i));
|
||||
block_postings.seek(i);
|
||||
let docs = block_postings.docs();
|
||||
assert!(docs[0] <= i);
|
||||
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
|
||||
}
|
||||
assert!(!block_postings.seek(100_000));
|
||||
assert!(!block_postings.seek(101_000));
|
||||
block_postings.seek(100_000);
|
||||
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use crate::common::HasLen;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::docset::DocSet;
|
||||
use crate::positions::PositionReader;
|
||||
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
@@ -90,61 +90,43 @@ impl DocSet for SegmentPostings {
|
||||
// next needs to be called a first time to point to the correct element.
|
||||
#[inline]
|
||||
fn advance(&mut self) -> DocId {
|
||||
self.cur += 1;
|
||||
if self.cur >= self.block_cursor.block_len() {
|
||||
if self.block_cursor.advance() {
|
||||
self.cur = 0;
|
||||
} else {
|
||||
self.cur = COMPRESSION_BLOCK_SIZE - 1;
|
||||
return TERMINATED;
|
||||
}
|
||||
if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
|
||||
self.cur = 0;
|
||||
self.block_cursor.advance();
|
||||
} else {
|
||||
self.cur += 1;
|
||||
}
|
||||
self.doc()
|
||||
}
|
||||
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
let doc = self.doc();
|
||||
if doc >= target {
|
||||
return doc;
|
||||
}
|
||||
|
||||
// skip blocks until one that might contain the target
|
||||
// check if we need to go to the next block
|
||||
if self
|
||||
.block_cursor
|
||||
.docs()
|
||||
.last()
|
||||
.map(|&doc| doc < target)
|
||||
.unwrap_or(true)
|
||||
{
|
||||
// We are not in the right block.
|
||||
if !self.block_cursor.seek(target) {
|
||||
self.block_cursor.doc_decoder.clear();
|
||||
self.cur = 0;
|
||||
return TERMINATED;
|
||||
}
|
||||
self.cur = 0;
|
||||
if self.doc() == target {
|
||||
return target;
|
||||
}
|
||||
self.block_cursor.seek(target);
|
||||
|
||||
// At this point we are on the block, that might contain our document.
|
||||
|
||||
let cur = self.cur;
|
||||
|
||||
let output = self.block_cursor.docs_aligned();
|
||||
let new_cur = self.block_searcher.search_in_block(&output, cur, target);
|
||||
self.cur = new_cur;
|
||||
|
||||
self.cur = self.block_searcher.search_in_block(&output, target);
|
||||
|
||||
// The last block is not full and padded with the value TERMINATED,
|
||||
// so that we are guaranteed to have at least doc in the block (a real one or the padding)
|
||||
// that is greater or equal to the target.
|
||||
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
|
||||
|
||||
// `doc` is now the first element >= `target`
|
||||
let doc = output.0[new_cur];
|
||||
|
||||
// If all docs are smaller than target the current block should be incomplemented and padded
|
||||
// with the value `TERMINATED`.
|
||||
//
|
||||
// After the search, the cursor should point to the first value of TERMINATED.
|
||||
let doc = output.0[self.cur];
|
||||
debug_assert!(doc >= target);
|
||||
doc
|
||||
}
|
||||
|
||||
/// Return the current document's `DocId`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// Will panics if called without having called advance before.
|
||||
#[inline]
|
||||
fn doc(&self) -> DocId {
|
||||
self.block_cursor.doc(self.cur)
|
||||
|
||||
@@ -102,7 +102,8 @@ impl SkipReader {
|
||||
self.remaining_docs = doc_freq;
|
||||
}
|
||||
|
||||
pub fn doc(&self) -> DocId {
|
||||
#[inline(always)]
|
||||
pub(crate) fn last_doc_in_block(&self) -> DocId {
|
||||
self.last_doc_in_block
|
||||
}
|
||||
|
||||
@@ -158,10 +159,17 @@ impl SkipReader {
|
||||
/// If the target is larger than all documents, the skip_reader
|
||||
/// then advance to the last Variable In block.
|
||||
///
|
||||
/// Returns true if the last block is reached.
|
||||
pub fn seek(&mut self, target: DocId) {
|
||||
while self.doc() < target {
|
||||
/// Returns true if the skip reader had to advance,
|
||||
/// false if it was already positionned on the right block.
|
||||
pub fn seek(&mut self, target: DocId) -> bool {
|
||||
if self.last_doc_in_block() >= target {
|
||||
return false;
|
||||
}
|
||||
loop {
|
||||
self.advance();
|
||||
if self.last_doc_in_block() >= target {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -218,7 +226,7 @@ mod tests {
|
||||
IndexRecordOption::WithFreqs,
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 1u32);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
@@ -228,7 +236,7 @@ mod tests {
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 5u32);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
@@ -257,7 +265,7 @@ mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 1u32);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
@@ -267,7 +275,7 @@ mod tests {
|
||||
}
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 5u32);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
@@ -295,7 +303,7 @@ mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
assert!(skip_reader.advance());
|
||||
assert_eq!(skip_reader.doc(), 1u32);
|
||||
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
|
||||
assert_eq!(
|
||||
skip_reader.block_info(),
|
||||
BlockInfo::BitPacked {
|
||||
|
||||
@@ -112,11 +112,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
debug_assert_eq!(left.doc(), right.doc());
|
||||
// test the remaining scorers;
|
||||
for docset in self.others.iter_mut() {
|
||||
// `candidate_ord` is already at the
|
||||
// right position.
|
||||
//
|
||||
// Calling `skip_next` would advance this docset
|
||||
// and miss it.
|
||||
let seek_doc = docset.seek(candidate);
|
||||
if seek_doc > candidate {
|
||||
candidate = left.seek(seek_doc);
|
||||
|
||||
@@ -198,6 +198,18 @@ where
|
||||
|
||||
// TODO Also implement `count` with deletes efficiently.
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
|
||||
fn count_including_deleted(&mut self) -> u32 {
|
||||
if self.doc == TERMINATED {
|
||||
return 0;
|
||||
@@ -219,18 +231,6 @@ where
|
||||
self.cursor = HORIZON_NUM_TINYBITSETS;
|
||||
count
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
self.doc
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
self.docsets
|
||||
.iter()
|
||||
.map(|docset| docset.size_hint())
|
||||
.max()
|
||||
.unwrap_or(0u32)
|
||||
}
|
||||
}
|
||||
|
||||
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>
|
||||
|
||||
Reference in New Issue
Block a user