mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-03 09:00:42 +00:00
Adds seek into the danger zone for fastfield range docsets.
This commit is contained in:
@@ -60,7 +60,7 @@ pub trait DocSet: Send {
|
||||
/// ## API Behaviour
|
||||
/// If `seek_into_the_danger_zone` returns `Found`, a call to `doc()` has to return target.
|
||||
/// If `seek_into_the_danger_zone` returns `NewTarget(_)`, a call to `doc()` may return any doc
|
||||
/// between the last doc that matched and target or a doc that is a valid next hit after
|
||||
/// greater than the last doc that matched and target or a doc that is a valid next hit after
|
||||
/// target. The DocSet is considered to be in an invalid state until
|
||||
/// `seek_into_the_danger_zone` returns `Found` again.
|
||||
///
|
||||
@@ -70,12 +70,16 @@ pub trait DocSet: Send {
|
||||
///
|
||||
/// # Warning
|
||||
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
let current_doc = self.doc();
|
||||
if current_doc < target {
|
||||
self.seek(target);
|
||||
}
|
||||
self.doc() == target
|
||||
if self.doc() == target {
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
} else {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(self.doc())
|
||||
}
|
||||
}
|
||||
|
||||
/// Fills a given mutable buffer with the next doc ids from the
|
||||
@@ -166,6 +170,12 @@ pub trait DocSet: Send {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum SeekIntoTheDangerZoneResult {
|
||||
Found,
|
||||
NewTarget(DocId),
|
||||
}
|
||||
|
||||
impl DocSet for &mut dyn DocSet {
|
||||
fn advance(&mut self) -> u32 {
|
||||
(**self).advance()
|
||||
@@ -175,7 +185,7 @@ impl DocSet for &mut dyn DocSet {
|
||||
(**self).seek(target)
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
(**self).seek_into_the_danger_zone(target)
|
||||
}
|
||||
|
||||
@@ -211,7 +221,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
|
||||
unboxed.seek(target)
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
let unboxed: &mut TDocSet = self.borrow_mut();
|
||||
unboxed.seek_into_the_danger_zone(target)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::fmt;
|
||||
|
||||
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
|
||||
use crate::docset::{SeekIntoTheDangerZoneResult, COLLECT_BLOCK_BUFFER_LEN};
|
||||
use crate::fastfield::AliveBitSet;
|
||||
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
|
||||
use crate::{DocId, DocSet, Score, SegmentReader, Term};
|
||||
@@ -104,7 +104,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
|
||||
fn seek(&mut self, target: DocId) -> DocId {
|
||||
self.underlying.seek(target)
|
||||
}
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
self.underlying.seek_into_the_danger_zone(target)
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::BinaryHeap;
|
||||
|
||||
use crate::docset::SeekIntoTheDangerZoneResult;
|
||||
use crate::query::score_combiner::DoNothingCombiner;
|
||||
use crate::query::{ScoreCombiner, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
@@ -67,9 +68,16 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
|
||||
self.current_doc = doc_id;
|
||||
doc_id
|
||||
}
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
let found = self.scorer.seek_into_the_danger_zone(target);
|
||||
self.current_doc = self.scorer.doc();
|
||||
match found {
|
||||
crate::docset::SeekIntoTheDangerZoneResult::Found => {
|
||||
self.current_doc = self.scorer.doc();
|
||||
}
|
||||
crate::docset::SeekIntoTheDangerZoneResult::NewTarget(current_doc) => {
|
||||
self.current_doc = current_doc;
|
||||
}
|
||||
}
|
||||
found
|
||||
}
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use super::size_hint::estimate_intersection;
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::{EmptyScorer, Scorer};
|
||||
use crate::{DocId, Score};
|
||||
@@ -117,14 +117,15 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
// of the two rarest `DocSet` in the intersection.
|
||||
|
||||
loop {
|
||||
if right.seek_into_the_danger_zone(candidate) {
|
||||
let SeekIntoTheDangerZoneResult::NewTarget(right_new_target) =
|
||||
right.seek_into_the_danger_zone(candidate)
|
||||
else {
|
||||
break;
|
||||
}
|
||||
let right_doc = right.doc();
|
||||
};
|
||||
// TODO: Think about which value would make sense here
|
||||
// It depends on the DocSet implementation, when a seek would outweigh an advance.
|
||||
if right_doc > candidate.wrapping_add(100) {
|
||||
candidate = left.seek(right_doc);
|
||||
if right_new_target > candidate.wrapping_add(100) {
|
||||
candidate = left.seek(right_new_target);
|
||||
} else {
|
||||
candidate = left.advance();
|
||||
}
|
||||
@@ -135,17 +136,20 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
|
||||
debug_assert_eq!(left.doc(), right.doc());
|
||||
// test the remaining scorers
|
||||
if self
|
||||
.others
|
||||
.iter_mut()
|
||||
.all(|docset| docset.seek_into_the_danger_zone(candidate))
|
||||
{
|
||||
debug_assert_eq!(candidate, self.left.doc());
|
||||
debug_assert_eq!(candidate, self.right.doc());
|
||||
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
|
||||
return candidate;
|
||||
for other in &mut self.others {
|
||||
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = other.seek_into_the_danger_zone(candidate) {
|
||||
if new_target > candidate.wrapping_add(100) {
|
||||
candidate = left.seek(new_target);
|
||||
} else {
|
||||
candidate = left.advance();
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
candidate = left.advance();
|
||||
debug_assert_eq!(candidate, self.left.doc());
|
||||
debug_assert_eq!(candidate, self.right.doc());
|
||||
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
|
||||
return candidate;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -165,13 +169,25 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
|
||||
///
|
||||
/// Some implementations may choose to advance past the target if beneficial for performance.
|
||||
/// The return value is `SeekIntoTheDangerZoneResult::Found` if the target is in the docset, and `SeekIntoTheDangerZoneResult::NewTarget` otherwise.
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
self.left.seek_into_the_danger_zone(target)
|
||||
&& self.right.seek_into_the_danger_zone(target)
|
||||
&& self
|
||||
.others
|
||||
.iter_mut()
|
||||
.all(|docset| docset.seek_into_the_danger_zone(target))
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) =
|
||||
self.left.seek_into_the_danger_zone(target)
|
||||
{
|
||||
return SeekIntoTheDangerZoneResult::NewTarget(new_target);
|
||||
}
|
||||
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) =
|
||||
self.right.seek_into_the_danger_zone(target)
|
||||
{
|
||||
return SeekIntoTheDangerZoneResult::NewTarget(new_target);
|
||||
}
|
||||
for docset in &mut self.others {
|
||||
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) =
|
||||
docset.seek_into_the_danger_zone(target)
|
||||
{
|
||||
return SeekIntoTheDangerZoneResult::NewTarget(new_target);
|
||||
}
|
||||
}
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
@@ -193,11 +193,16 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
if self.phrase_scorer.seek_into_the_danger_zone(target) {
|
||||
self.matches_prefix()
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) =
|
||||
self.phrase_scorer.seek_into_the_danger_zone(target)
|
||||
{
|
||||
return SeekIntoTheDangerZoneResult::NewTarget(new_target);
|
||||
}
|
||||
if self.matches_prefix() {
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
} else {
|
||||
false
|
||||
SeekIntoTheDangerZoneResult::NewTarget(target)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::cmp::Ordering;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
|
||||
use crate::fieldnorm::FieldNormReader;
|
||||
use crate::postings::Postings;
|
||||
use crate::query::bm25::Bm25Weight;
|
||||
@@ -530,12 +530,18 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
|
||||
self.advance()
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
debug_assert!(target >= self.doc());
|
||||
if self.intersection_docset.seek_into_the_danger_zone(target) && self.phrase_match() {
|
||||
return true;
|
||||
match self.intersection_docset.seek_into_the_danger_zone(target) {
|
||||
SeekIntoTheDangerZoneResult::Found => {
|
||||
if self.phrase_match() {
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
} else {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(target)
|
||||
}
|
||||
}
|
||||
new_target => new_target,
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
fn doc(&self) -> DocId {
|
||||
|
||||
@@ -3,6 +3,7 @@ use std::ops::RangeInclusive;
|
||||
|
||||
use columnar::Column;
|
||||
|
||||
use crate::docset::SeekIntoTheDangerZoneResult;
|
||||
use crate::{DocId, DocSet, TERMINATED};
|
||||
|
||||
/// Helper to have a cursor over a vec of docids
|
||||
@@ -184,6 +185,34 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
|
||||
doc
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
if self.is_last_seek_distance_large(target) {
|
||||
self.reset_fetch_range();
|
||||
}
|
||||
let last_block: bool;
|
||||
if target > self.next_fetch_start {
|
||||
self.next_fetch_start = target;
|
||||
// Contrary to seek, we fetch at most a single block.
|
||||
last_block = self.fetch_horizon(DEFAULT_FETCH_HORIZON);
|
||||
} else {
|
||||
last_block = false;
|
||||
}
|
||||
while let Some(loaded_doc) = self.loaded_docs.next() {
|
||||
if loaded_doc < target {
|
||||
continue;
|
||||
} else if loaded_doc == target {
|
||||
return SeekIntoTheDangerZoneResult::Found;
|
||||
} else {
|
||||
return SeekIntoTheDangerZoneResult::NewTarget(loaded_doc);
|
||||
}
|
||||
}
|
||||
if last_block {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(TERMINATED)
|
||||
} else {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(target)
|
||||
}
|
||||
}
|
||||
|
||||
fn size_hint(&self) -> u32 {
|
||||
// TODO: Implement a better size hint
|
||||
self.column.num_docs() / 10
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use crate::docset::DocSet;
|
||||
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult};
|
||||
use crate::query::score_combiner::ScoreCombiner;
|
||||
use crate::query::Scorer;
|
||||
use crate::{DocId, Score};
|
||||
@@ -56,7 +56,7 @@ where
|
||||
self.req_scorer.seek(target)
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
self.score_cache = None;
|
||||
self.req_scorer.seek_into_the_danger_zone(target)
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use common::TinySet;
|
||||
|
||||
use crate::docset::{DocSet, TERMINATED};
|
||||
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
|
||||
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::size_hint::estimate_union;
|
||||
use crate::query::Scorer;
|
||||
@@ -223,25 +223,30 @@ where
|
||||
}
|
||||
}
|
||||
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
|
||||
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
|
||||
if self.is_in_horizon(target) {
|
||||
// Our value is within the buffered horizon and the docset may already have been
|
||||
// processed and removed, so we need to use seek, which uses the regular advance.
|
||||
self.seek(target) == target
|
||||
if self.seek(target) == target {
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
} else {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(self.doc())
|
||||
}
|
||||
} else {
|
||||
// The docsets are not in the buffered range, so we can use seek_into_the_danger_zone
|
||||
// of the underlying docsets
|
||||
let is_hit = self
|
||||
.docsets
|
||||
.iter_mut()
|
||||
.any(|docset| docset.seek_into_the_danger_zone(target));
|
||||
let is_hit = self.docsets.iter_mut().any(|docset| {
|
||||
docset.seek_into_the_danger_zone(target) == SeekIntoTheDangerZoneResult::Found
|
||||
});
|
||||
|
||||
// The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone`
|
||||
// returns `SeekIntoTheDangerZoneResult::Found`.
|
||||
if is_hit {
|
||||
self.seek(target);
|
||||
SeekIntoTheDangerZoneResult::Found
|
||||
} else {
|
||||
SeekIntoTheDangerZoneResult::NewTarget(target)
|
||||
}
|
||||
is_hit
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user