diff --git a/src/docset.rs b/src/docset.rs index 44396e437..e8799b2e9 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -40,6 +40,8 @@ pub trait DocSet: Send { /// of `DocSet` should support it. /// /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`. + /// + /// `target` has to be greater than or equal to `.doc()` when calling `seek`. fn seek(&mut self, target: DocId) -> DocId { let mut doc = self.doc(); debug_assert!(doc <= target); @@ -58,11 +60,22 @@ pub trait DocSet: Send { /// /// ## API Behaviour /// If `seek_exact` is returning true, a call to `doc()` has to return target. - /// If `seek_exact` is returning false, a call to `doc()` may return the previous doc, - /// which may be lower than target. + /// If `seek_exact` is returning false, a call to `doc()` may return any doc and should not be + /// used until `seek_exact` returns true again. The DocSet is considered to be in an invalid + /// state until `seek_exact` returns true again. + /// + /// `target` needs to be greater than or equal to `doc()` when in a valid state. + /// + /// Consecutive calls are not allowed to have decreasing `target` values. + /// + /// # Warning + /// This is an advanced API used by intersection. The API contract is tricky; avoid using it. fn seek_exact(&mut self, target: DocId) -> bool { - let doc = self.seek(target); - doc == target + let current_doc = self.doc(); + if current_doc < target { + self.seek(target); + } + self.doc() == target } /// Fills a given mutable buffer with the next doc ids from the diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 419f2c5ff..148c80f05 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -13,6 +13,9 @@ use crate::{DocId, Score}; /// For better performance, the function uses a /// specialized implementation if the two /// shortest scorers are `TermScorer`s. +/// +/// `num_docs_segment` is the number of documents in the segment. 
It is used for estimating the +/// `size_hint` of the intersection. pub fn intersect_scorers( mut scorers: Vec>, num_docs_segment: u32, diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index ec8e97ef3..c736f7b08 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -194,7 +194,14 @@ impl DocSet for RangeDocSe fn cost(&self) -> u64 { // Advancing the docset is pretty expensive since it scans the whole column, there is no // index currently (will change with an kd-tree) - // Since we use SIMD to scan the fast field range query we lower the cost a little bit. + // Since we use SIMD to scan the fast field range query we lower the cost a little bit, + // assuming that we hit 10% of the docs like in `size_hint`. + // + // If we returned a cost higher than num_docs, the fast field range query would never be + // chosen as the driver in a DocSet when intersecting a term query with a fast field. But + // it's the faster choice when the term query has a lot of docids and the range + // query does not. + // + // Ideally this would take the fast field codec into account (self.column.num_docs() as f64 * 0.8) as u64 }