add API contract verification

This commit is contained in:
Pascal Seitz
2024-11-13 11:49:54 +09:00
committed by Pascal Seitz
parent 6c756117f8
commit e5bf84c7d0
3 changed files with 28 additions and 5 deletions

View File

@@ -40,6 +40,8 @@ pub trait DocSet: Send {
/// of `DocSet` should support it.
///
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
///
/// `target` has to be greater than or equal to `.doc()` when calling `seek`.
fn seek(&mut self, target: DocId) -> DocId {
let mut doc = self.doc();
debug_assert!(doc <= target);
@@ -58,11 +60,22 @@ pub trait DocSet: Send {
///
/// ## API Behaviour
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
/// which may be lower than target.
/// If `seek_exact` is returning false, a call to `doc()` may return any doc and should not be
/// used until `seek_exact` returns true again. The DocSet is considered to be in an invalid
/// state until `seek_exact` returns true again.
///
/// `target` needs to be greater than or equal to `doc()` when in a valid state.
///
/// Consecutive calls are not allowed to have decreasing `target` values.
///
/// # Warning
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
fn seek_exact(&mut self, target: DocId) -> bool {
// NOTE(review): this diff view carries no +/- markers. The next two lines look like the
// pre-commit body (always seek, then compare) and the lines after them like its
// replacement; confirm against the raw diff.
let doc = self.seek(target);
doc == target
// Only advance when the current doc is strictly behind `target`. The default `seek`
// debug-asserts `doc <= target`, so a docset already at or past `target` is left untouched.
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
// True only if the docset now sits exactly on `target`.
self.doc() == target
}
/// Fills a given mutable buffer with the next doc ids from the

View File

@@ -13,6 +13,9 @@ use crate::{DocId, Score};
/// For better performance, the function uses a
/// specialized implementation if the two
/// shortest scorers are `TermScorer`s.
///
/// num_docs_segment is the number of documents in the segment. It is used for estimating the
/// `size_hint` of the intersection.
pub fn intersect_scorers(
mut scorers: Vec<Box<dyn Scorer>>,
num_docs_segment: u32,

View File

@@ -194,7 +194,14 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
fn cost(&self) -> u64 {
// Advancing the docset is pretty expensive since it scans the whole column, there is no
// index currently (will change with a k-d tree)
// NOTE(review): the next line appears to be the pre-commit wording of the sentence that
// follows it in this diff view; confirm against the raw diff.
// Since we use SIMD to scan the fast field range query we lower the cost a little bit.
// Since we use SIMD to scan the fast field range query we lower the cost a little bit,
// assuming that we hit 10% of the docs like in size_hint.
//
// If we would return a cost higher than num_docs, we would never choose ff range query as
// the driver in a DocSet, when intersecting a term query with a fast field. But
// it's the faster choice when the term query has a lot of docids and the range
// query has not.
//
// Ideally this would take the fast field codec into account
// The estimate is 80% of the column's document count; the `as u64` cast drops the
// fractional part.
(self.column.num_docs() as f64 * 0.8) as u64
}