Simplification of the segment postings seek implementation. (#834)

This commit is contained in:
Paul Masurel
2020-05-27 08:49:47 +09:00
committed by GitHub
parent 7275ebdf3c
commit baf015fc57
7 changed files with 71 additions and 112 deletions

View File

@@ -346,7 +346,7 @@ impl IndexWriter {
/// Swaps `self.operation_sender` for the sending half of a channel freshly
/// created with `channel::bounded(1)`.
///
/// The receiving half is bound to `_receiver` and never handed out, so it is
/// dropped when this function returns. The sender previously stored in
/// `self.operation_sender` is dropped as a consequence of being replaced.
fn drop_sender(&mut self) {
let (sender, _receiver) = channel::bounded(1);
// NOTE(review): the two statements below both install `sender` into
// `self.operation_sender`; they read like the before/after forms of the
// same line in this change listing (`mem::replace` with its result
// discarded vs. a plain assignment). Confirm which one the target file
// actually contains — `sender` can only be moved once.
mem::replace(&mut self.operation_sender, sender);
self.operation_sender = sender;
}
/// If there are some merging threads, blocks until they all finish their work and

View File

@@ -87,6 +87,7 @@ fn exponential_search(arr: &[u32], target: u32) -> (usize, usize) {
(begin, end)
}
#[inline(never)]
fn galloping(block_docs: &[u32], target: u32) -> usize {
let (start, end) = exponential_search(&block_docs, target);
start + linear_search(&block_docs[start..end], target)
@@ -133,19 +134,14 @@ impl BlockSearcher {
///
/// Indeed, if the block is not full, the remaining items are TERMINATED.
/// It is surprisingly faster, most likely because of the lack of branch misprediction.
pub(crate) fn search_in_block(
self,
block_docs: &AlignedBuffer,
start: usize,
target: u32,
) -> usize {
// NOTE(review): a second header for the same function follows. The form
// above takes an extra `start` offset, the form below does not. They look
// like the before/after signatures of this change listing — confirm.
pub(crate) fn search_in_block(self, block_docs: &AlignedBuffer, target: u32) -> usize {
#[cfg(target_arch = "x86_64")]
{
// Only compiled on x86_64: the SSE2 variant hands the whole block to the
// SSE2 linear search and returns its result directly.
if self == BlockSearcher::SSE2 {
return sse2::linear_search_sse2_128(block_docs, target);
}
}
// Every other case goes through `galloping`.
// The first expression searches `block_docs.0[start..]` and adds `start`
// back so that the result is an index into the full buffer; the second
// expression searches the entire buffer.
start + galloping(&block_docs.0[start..], target)
galloping(&block_docs.0[..], target)
}
}
@@ -199,12 +195,10 @@ mod tests {
assert!(block.len() < COMPRESSION_BLOCK_SIZE);
let mut output_buffer = [TERMINATED; COMPRESSION_BLOCK_SIZE];
output_buffer[..block.len()].copy_from_slice(block);
for i in 0..cursor {
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), i, target),
cursor
);
}
assert_eq!(
block_searcher.search_in_block(&AlignedBuffer(output_buffer), target),
cursor
);
}
fn util_test_search_in_block_all(block_searcher: BlockSearcher, block: &[u32]) {

View File

@@ -139,12 +139,13 @@ impl BlockSegmentPostings {
self.doc_decoder.output_array()
}
/// Returns the doc decoder's output as a reference to an `AlignedBuffer`,
/// i.e. whatever `self.doc_decoder.output_aligned()` yields.
#[inline(always)]
pub(crate) fn docs_aligned(&self) -> &AlignedBuffer {
self.doc_decoder.output_aligned()
}
/// Returns the document at index `idx` of the block.
///
/// This is a thin wrapper around `self.doc_decoder.output(idx)`.
// NOTE(review): both `#[inline]` and `#[inline(always)]` appear below; they
// look like the before/after forms of the same attribute — confirm which one
// the target file keeps.
#[inline]
#[inline(always)]
pub fn doc(&self, idx: usize) -> u32 {
self.doc_decoder.output(idx)
}
@@ -177,23 +178,12 @@ impl BlockSegmentPostings {
/// Positions the cursor on a block that may contain `target_doc`.
///
/// If the last element of the current block is greater than or equal to `target_doc`, returns true.
///
/// Returns true if a block that has an element greater than or equal to the target is found.
/// Returning true does not guarantee that the smallest element of the block is smaller
/// than the target. It only guarantees that the last element is greater or equal.
///
/// Returns false iff all of the remaining documents are smaller than
/// `doc_id`. In that case, all of these documents are consumed.
// NOTE(review): two versions of `seek` are interleaved from here on: one
// returning `bool`, one returning nothing. They look like the before/after
// forms of this change listing — confirm which one the target file contains.
pub fn seek(&mut self, target_doc: DocId) -> bool {
// This form always moves the skip reader and then decodes a block,
// whether or not the skip reader actually changed position.
self.skip_reader.seek(target_doc);
self.read_block();
// The last block last doc may actually stop before the target.
// An empty block (no last doc) is reported as `false`.
self.docs()
.last()
.map(|last_doc| *last_doc >= target_doc)
.unwrap_or(false)
/// If all docs are smaller than target, the block loaded may be empty,
/// or be the last, incomplete VInt block.
pub fn seek(&mut self, target_doc: DocId) {
// This form decodes a block only when `skip_reader.seek` returns true.
if self.skip_reader.seek(target_doc) {
self.read_block();
}
}
fn read_block(&mut self) {
@@ -263,6 +253,7 @@ mod tests {
use crate::common::HasLen;
use crate::core::Index;
use crate::docset::{DocSet, TERMINATED};
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
use crate::postings::postings::Postings;
use crate::postings::SegmentPostings;
use crate::schema::IndexRecordOption;
@@ -373,17 +364,6 @@ mod tests {
inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic)
}
// Checks the boolean returned by `seek` on block postings built from `&[3]`
// (presumably a posting list holding the single doc id 3 — see
// `build_block_postings`).
#[test]
fn test_block_segment_postings_skip() {
// Every target in 0..=3 must report a hit, and seeking the same target a
// second time on the same postings must report a hit again.
for i in 0..4 {
let mut block_postings = build_block_postings(&[3]);
assert_eq!(block_postings.seek(i), true);
assert_eq!(block_postings.seek(i), true);
}
// A target greater than 3 must report a miss.
let mut block_postings = build_block_postings(&[3]);
assert_eq!(block_postings.seek(4u32), false);
}
#[test]
fn test_block_segment_postings_skip2() {
let mut docs = vec![0];
@@ -392,13 +372,13 @@ mod tests {
}
let mut block_postings = build_block_postings(&docs[..]);
for i in vec![0, 424, 10000] {
assert!(block_postings.seek(i));
block_postings.seek(i);
let docs = block_postings.docs();
assert!(docs[0] <= i);
assert!(docs.last().cloned().unwrap_or(0u32) >= i);
}
assert!(!block_postings.seek(100_000));
assert!(!block_postings.seek(101_000));
block_postings.seek(100_000);
assert_eq!(block_postings.doc(COMPRESSION_BLOCK_SIZE - 1), TERMINATED);
}
#[test]

View File

@@ -1,6 +1,6 @@
use crate::common::HasLen;
use crate::docset::{DocSet, TERMINATED};
use crate::docset::DocSet;
use crate::positions::PositionReader;
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
@@ -90,61 +90,43 @@ impl DocSet for SegmentPostings {
// next needs to be called a first time to point to the correct element.
//
// Moves to the next document and returns its doc id.
//
// NOTE(review): two alternative bodies are interleaved below; they look like
// the before/after forms of this change listing — confirm which one the
// target file contains.
#[inline]
fn advance(&mut self) -> DocId {
// First form: bump the in-block cursor, and once it runs past the block
// length try to move the block cursor to the next block. On success the
// in-block cursor restarts at 0; on failure it is parked on the last slot
// of a block and `TERMINATED` is returned.
self.cur += 1;
if self.cur >= self.block_cursor.block_len() {
if self.block_cursor.advance() {
self.cur = 0;
} else {
self.cur = COMPRESSION_BLOCK_SIZE - 1;
return TERMINATED;
}
// Second form: when the in-block cursor sits on the last slot of a block,
// restart it at 0 and advance the block cursor (its return value is not
// inspected); otherwise just bump the in-block cursor.
if self.cur == COMPRESSION_BLOCK_SIZE - 1 {
self.cur = 0;
self.block_cursor.advance();
} else {
self.cur += 1;
}
// Both forms end by returning whatever doc the cursor now points to.
self.doc()
}
// Moves to the first document whose id is greater than or equal to `target`
// and returns that id.
//
// NOTE(review): two alternative bodies are interleaved below (one calling
// `search_in_block` with a start cursor, one without); they look like the
// before/after forms of this change listing — confirm which one the target
// file contains.
fn seek(&mut self, target: DocId) -> DocId {
// Fast path: the current document already satisfies the request.
let doc = self.doc();
if doc >= target {
return doc;
}
// skip blocks until one that might contain the target
// check if we need to go to the next block
// An empty block (no last doc) is treated as "need to move on".
if self
.block_cursor
.docs()
.last()
.map(|&doc| doc < target)
.unwrap_or(true)
{
// We are not in the right block.
// When the block cursor reports that no block can hold the target,
// the decoder is cleared and the postings are reported as exhausted.
if !self.block_cursor.seek(target) {
self.block_cursor.doc_decoder.clear();
self.cur = 0;
return TERMINATED;
}
self.cur = 0;
if self.doc() == target {
return target;
}
self.block_cursor.seek(target);
// At this point we are on the block that might contain our document.
let cur = self.cur;
let output = self.block_cursor.docs_aligned();
let new_cur = self.block_searcher.search_in_block(&output, cur, target);
self.cur = new_cur;
self.cur = self.block_searcher.search_in_block(&output, target);
// The last block is not full and padded with the value TERMINATED,
// so that we are guaranteed to have at least one doc in the block (a real one or the padding)
// that is greater or equal to the target.
debug_assert!(self.cur < COMPRESSION_BLOCK_SIZE);
// `doc` is now the first element >= `target`
let doc = output.0[new_cur];
// If all docs are smaller than target the current block should be incomplete and padded
// with the value `TERMINATED`.
//
// After the search, the cursor should point to the first value of TERMINATED.
let doc = output.0[self.cur];
debug_assert!(doc >= target);
doc
}
/// Return the current document's `DocId`.
///
/// # Panics
///
/// Will panic if called without having called advance before.
#[inline]
fn doc(&self) -> DocId {
self.block_cursor.doc(self.cur)

View File

@@ -102,7 +102,8 @@ impl SkipReader {
self.remaining_docs = doc_freq;
}
/// Returns the value of the `last_doc_in_block` field.
// NOTE(review): two headers are listed for this single body (`doc` and
// `last_doc_in_block`); they look like the before/after names of the same
// accessor in this change listing — confirm which one the target file keeps.
pub fn doc(&self) -> DocId {
#[inline(always)]
pub(crate) fn last_doc_in_block(&self) -> DocId {
self.last_doc_in_block
}
@@ -158,10 +159,17 @@ impl SkipReader {
/// If the target is larger than all documents, the skip_reader
/// then advances to the last Variable Int block.
///
/// Returns true if the last block is reached.
// NOTE(review): two versions of `seek` are interleaved from here on (one
// returning nothing and looping on `self.doc()`, one returning `bool`); they
// look like the before/after forms of this change listing — confirm which one
// the target file contains.
pub fn seek(&mut self, target: DocId) {
while self.doc() < target {
/// Returns true if the skip reader had to advance,
/// false if it was already positioned on the right block.
pub fn seek(&mut self, target: DocId) -> bool {
// Already on a block whose last doc reaches the target: nothing to do.
if self.last_doc_in_block() >= target {
return false;
}
// Otherwise keep advancing until the current block's last doc reaches the
// target.
// NOTE(review): the value returned by `self.advance()` is not inspected, so
// termination relies on `last_doc_in_block()` eventually being >= `target`
// — presumably guaranteed by the final block; confirm.
loop {
self.advance();
if self.last_doc_in_block() >= target {
return true;
}
}
}
@@ -218,7 +226,7 @@ mod tests {
IndexRecordOption::WithFreqs,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.doc(), 1u32);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
@@ -228,7 +236,7 @@ mod tests {
}
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.doc(), 5u32);
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
@@ -257,7 +265,7 @@ mod tests {
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.doc(), 1u32);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
@@ -267,7 +275,7 @@ mod tests {
}
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.doc(), 5u32);
assert_eq!(skip_reader.last_doc_in_block(), 5u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {
@@ -295,7 +303,7 @@ mod tests {
IndexRecordOption::Basic,
);
assert!(skip_reader.advance());
assert_eq!(skip_reader.doc(), 1u32);
assert_eq!(skip_reader.last_doc_in_block(), 1u32);
assert_eq!(
skip_reader.block_info(),
BlockInfo::BitPacked {

View File

@@ -112,11 +112,6 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
debug_assert_eq!(left.doc(), right.doc());
// test the remaining scorers;
for docset in self.others.iter_mut() {
// `candidate_ord` is already at the
// right position.
//
// Calling `skip_next` would advance this docset
// and miss it.
let seek_doc = docset.seek(candidate);
if seek_doc > candidate {
candidate = left.seek(seek_doc);

View File

@@ -198,6 +198,18 @@ where
// TODO Also implement `count` with deletes efficiently.
/// Returns the value currently held in the `doc` field.
fn doc(&self) -> DocId {
self.doc
}
/// Returns the largest `size_hint()` reported by the entries of
/// `self.docsets`, or 0 when `self.docsets` yields no entry.
fn size_hint(&self) -> u32 {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.max()
// `max()` is `None` for an empty iterator; report 0 in that case.
.unwrap_or(0u32)
}
fn count_including_deleted(&mut self) -> u32 {
if self.doc == TERMINATED {
return 0;
@@ -219,18 +231,6 @@ where
self.cursor = HORIZON_NUM_TINYBITSETS;
count
}
/// Returns the value currently held in the `doc` field.
fn doc(&self) -> DocId {
self.doc
}
/// Returns the largest `size_hint()` reported by the entries of
/// `self.docsets`, or 0 when `self.docsets` yields no entry.
fn size_hint(&self) -> u32 {
self.docsets
.iter()
.map(|docset| docset.size_hint())
.max()
// `max()` is `None` for an empty iterator; report 0 in that case.
.unwrap_or(0u32)
}
}
impl<TScorer, TScoreCombiner> Scorer for Union<TScorer, TScoreCombiner>