use std::borrow::{Borrow, BorrowMut}; use crate::fastfield::AliveBitSet; use crate::DocId; /// Sentinel value returned when a [`DocSet`] has been entirely consumed. /// /// This is not `u32::MAX` as one would have expected, due to the lack of SSE2 instructions /// to compare `[u32; 4]`. pub const TERMINATED: DocId = i32::MAX as u32; /// The collect_block method on `SegmentCollector` uses a buffer of this size. /// Passed results to `collect_block` will not exceed this size and will be /// exactly this size as long as we can fill the buffer. pub const COLLECT_BLOCK_BUFFER_LEN: usize = 64; /// Represents an iterable set of sorted doc ids. pub trait DocSet: Send { /// Goes to the next element. /// /// The DocId of the next element is returned. /// In other words we should always have : /// ```compile_fail /// let doc = docset.advance(); /// assert_eq!(doc, docset.doc()); /// ``` /// /// If we reached the end of the `DocSet`, [`TERMINATED`] should be returned. /// /// Calling `.advance()` on a terminated `DocSet` should be supported, and [`TERMINATED`] should /// be returned. fn advance(&mut self) -> DocId; /// Advances the `DocSet` forward until reaching the target, or going to the /// lowest [`DocId`] greater than the target. /// /// If the end of the `DocSet` is reached, [`TERMINATED`] is returned. /// /// Calling `.seek(target)` on a terminated `DocSet` is legal. Implementation /// of `DocSet` should support it. /// /// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`. /// /// `target` has to be larger or equal to `.doc()` when calling `seek`. fn seek(&mut self, target: DocId) -> DocId { let mut doc = self.doc(); debug_assert!(doc <= target); while doc < target { doc = self.advance(); } doc } /// !!!Dragons ahead!!! /// In spirit, this is an approximate and dangerous version of `seek`. /// /// It can leave the DocSet in an `invalid` state and might return a /// lower bound of what the result of Seek would have been. /// /// /// More accurately it returns either: /// - Found if the target is in the docset. In that case, the DocSet is left in a valid state. /// - SeekLowerBound(seek_lower_bound) if the target is not in the docset. In that case, The /// DocSet can be the left in a invalid state. The DocSet should then only receives call to /// `seek_danger(..)` until it returns `Found`, and get back to a valid state. /// /// `seek_lower_bound` can be any `DocId` (in the docset or not) as long as it is in /// `(target .. seek_result] U {TERMINATED}` where `seek_result` is the first document in the /// docset greater than to `target`. /// /// `seek_danger` may return `SeekLowerBound(TERMINATED)`. /// /// Calling `seek_danger` with TERMINATED as a target is allowed, /// and should always return NewTarget(TERMINATED) or anything larger as TERMINATED is NOT in /// the DocSet. /// /// DocSets that already have an efficient `seek` method don't need to implement /// `seek_danger`. /// /// Consecutive calls to seek_danger are guaranteed to have strictly increasing `target` /// values. fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { if target >= TERMINATED { debug_assert!(target == TERMINATED); // No need to advance. return SeekDangerResult::SeekLowerBound(target); } // The default implementation does not include any // `danger zone` behavior. // // It does not leave the scorer in an invalid state. // For this reason, we can safely call `self.doc()`. let mut doc = self.doc(); if doc < target { doc = self.seek(target); } if doc == target { SeekDangerResult::Found } else { SeekDangerResult::SeekLowerBound(doc) } } /// Fills a given mutable buffer with the next doc ids from the /// `DocSet` /// /// If that many `DocId`s are available, the method should /// fill the entire buffer and return the length of the buffer. /// /// If we reach the end of the `DocSet` before filling /// it entirely, then the buffer is filled up to this point, and /// return value is the number of elements that were filled. /// /// # Warning /// /// This method is only here for specific high-performance /// use case where batching. The normal way to /// go through the `DocId`'s is to call `.advance()`. fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { if self.doc() == TERMINATED { return 0; } for (i, buffer_val) in buffer.iter_mut().enumerate() { *buffer_val = self.doc(); if self.advance() == TERMINATED { return i + 1; } } buffer.len() } /// Returns the current document /// Right after creating a new `DocSet`, the docset points to the first document. /// /// If the `DocSet` is empty, `.doc()` should return [`TERMINATED`]. fn doc(&self) -> DocId; /// Returns a best-effort hint of the /// length of the docset. fn size_hint(&self) -> u32; /// Returns a best-effort hint of the cost to consume the entire docset. /// /// Consuming means calling advance until [`TERMINATED`] is returned. /// The cost should be relative to the cost of driving a Term query, /// which would be the number of documents in the DocSet. /// /// By default this returns `size_hint()`. /// /// DocSets may have vastly different cost depending on their type, /// e.g. an intersection with 10 hits is much cheaper than /// a phrase search with 10 hits, since it needs to load positions. /// /// ### Future Work /// We may want to differentiate `DocSet` costs more more granular, e.g. /// creation_cost, advance_cost, seek_cost on to get a good estimation /// what query types to choose. fn cost(&self) -> u64 { self.size_hint() as u64 } /// Returns the number documents matching. /// Calling this method consumes the `DocSet`. fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { if alive_bitset.is_alive(doc) { count += 1u32; } doc = self.advance(); } count } /// Returns the count of documents, deleted or not. /// Calling this method consumes the `DocSet`. /// /// Of course, the result is an upper bound of the result /// given by `count()`. fn count_including_deleted(&mut self) -> u32 { let mut count = 0u32; let mut doc = self.doc(); while doc != TERMINATED { count += 1u32; doc = self.advance(); } count } } #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub enum SeekDangerResult { /// The target was found in the DocSet. Found, /// The target was not found in the DocSet. /// We return a range in which the value could be. /// The given target can be any DocId, that is <= than the first document /// in the docset after the target. SeekLowerBound(DocId), } impl DocSet for &mut dyn DocSet { fn advance(&mut self) -> u32 { (**self).advance() } fn seek(&mut self, target: DocId) -> DocId { (**self).seek(target) } fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { (**self).seek_danger(target) } fn doc(&self) -> u32 { (**self).doc() } fn size_hint(&self) -> u32 { (**self).size_hint() } fn cost(&self) -> u64 { (**self).cost() } fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { (**self).count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { (**self).count_including_deleted() } } impl DocSet for Box { fn advance(&mut self) -> DocId { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.advance() } fn seek(&mut self, target: DocId) -> DocId { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.seek(target) } fn seek_danger(&mut self, target: DocId) -> SeekDangerResult { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.seek_danger(target) } fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.fill_buffer(buffer) } fn doc(&self) -> DocId { let unboxed: &TDocSet = self.borrow(); unboxed.doc() } fn size_hint(&self) -> u32 { let unboxed: &TDocSet = self.borrow(); unboxed.size_hint() } fn cost(&self) -> u64 { let unboxed: &TDocSet = self.borrow(); unboxed.cost() } fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.count(alive_bitset) } fn count_including_deleted(&mut self) -> u32 { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.count_including_deleted() } }