Adds seek into the danger zone for fastfield range docsets.

This commit is contained in:
Paul Masurel
2025-12-30 19:00:46 +01:00
parent 923f0508f2
commit f5939b2e4c
9 changed files with 132 additions and 52 deletions

View File

@@ -60,7 +60,7 @@ pub trait DocSet: Send {
/// ## API Behaviour
/// If `seek_into_the_danger_zone` returns `Found`, a call to `doc()` has to return target.
/// If `seek_into_the_danger_zone` returns `NewTarget`, a call to `doc()` may return any doc
/// between the last doc that matched and target or a doc that is a valid next hit after
/// greater than the last doc that matched and target or a doc that is a valid next hit after
/// target. The DocSet is considered to be in an invalid state until
/// `seek_into_the_danger_zone` returns `Found` again.
///
@@ -70,12 +70,16 @@ pub trait DocSet: Send {
///
/// # Warning
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
let current_doc = self.doc();
if current_doc < target {
self.seek(target);
}
self.doc() == target
if self.doc() == target {
SeekIntoTheDangerZoneResult::Found
} else {
SeekIntoTheDangerZoneResult::NewTarget(self.doc())
}
}
/// Fills a given mutable buffer with the next doc ids from the
@@ -166,6 +170,12 @@ pub trait DocSet: Send {
}
}
/// Outcome of a call to `DocSet::seek_into_the_danger_zone`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SeekIntoTheDangerZoneResult {
/// The requested target is a hit. The default `DocSet` implementation returns this
/// exactly when `doc() == target` after seeking.
Found,
/// The requested target was not confirmed as a hit. Carries a doc id the caller can
/// use to pick its next candidate; the default `DocSet` implementation reports the
/// docset's current `doc()`. Implementations may also return `target` itself or
/// `TERMINATED` here.
NewTarget(DocId),
}
impl DocSet for &mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()
@@ -175,7 +185,7 @@ impl DocSet for &mut dyn DocSet {
(**self).seek(target)
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
(**self).seek_into_the_danger_zone(target)
}
@@ -211,7 +221,7 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
unboxed.seek(target)
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
let unboxed: &mut TDocSet = self.borrow_mut();
unboxed.seek_into_the_danger_zone(target)
}

View File

@@ -1,6 +1,6 @@
use std::fmt;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::docset::{SeekIntoTheDangerZoneResult, COLLECT_BLOCK_BUFFER_LEN};
use crate::fastfield::AliveBitSet;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, Term};
@@ -104,7 +104,8 @@ impl<S: Scorer> DocSet for BoostScorer<S> {
// Seeking is delegated to the wrapped scorer; the doc id it reports is returned as is.
fn seek(&mut self, target: DocId) -> DocId {
self.underlying.seek(target)
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
self.underlying.seek_into_the_danger_zone(target)
}

View File

@@ -1,6 +1,7 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use crate::docset::SeekIntoTheDangerZoneResult;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ScoreCombiner, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
@@ -67,9 +68,16 @@ impl<T: Scorer> DocSet for ScorerWrapper<T> {
self.current_doc = doc_id;
doc_id
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
let found = self.scorer.seek_into_the_danger_zone(target);
self.current_doc = self.scorer.doc();
match found {
crate::docset::SeekIntoTheDangerZoneResult::Found => {
self.current_doc = self.scorer.doc();
}
crate::docset::SeekIntoTheDangerZoneResult::NewTarget(current_doc) => {
self.current_doc = current_doc;
}
}
found
}

View File

@@ -1,5 +1,5 @@
use super::size_hint::estimate_intersection;
use crate::docset::{DocSet, TERMINATED};
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
use crate::query::term_query::TermScorer;
use crate::query::{EmptyScorer, Scorer};
use crate::{DocId, Score};
@@ -117,14 +117,15 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
// of the two rarest `DocSet` in the intersection.
loop {
if right.seek_into_the_danger_zone(candidate) {
let SeekIntoTheDangerZoneResult::NewTarget(right_new_target) =
right.seek_into_the_danger_zone(candidate)
else {
break;
}
let right_doc = right.doc();
};
// TODO: Think about which value would make sense here
// It depends on the DocSet implementation, when a seek would outweigh an advance.
if right_doc > candidate.wrapping_add(100) {
candidate = left.seek(right_doc);
if right_new_target > candidate.wrapping_add(100) {
candidate = left.seek(right_new_target);
} else {
candidate = left.advance();
}
@@ -135,17 +136,20 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
debug_assert_eq!(left.doc(), right.doc());
// test the remaining scorers
if self
.others
.iter_mut()
.all(|docset| docset.seek_into_the_danger_zone(candidate))
{
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
// Probe each remaining docset with the current `candidate`. When one of them reports
// `NewTarget`, `left` is moved forward: via `seek` if the reported doc is more than
// 100 docs past the candidate, via `advance` otherwise.
for other in &mut self.others {
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = other.seek_into_the_danger_zone(candidate) {
if new_target > candidate.wrapping_add(100) {
candidate = left.seek(new_target);
} else {
candidate = left.advance();
}
// NOTE(review): an unlabeled `continue` applies to this `for`, not to the enclosing
// loop (whose header is outside this hunk). As written, the remaining `others` are
// probed with the new candidate while `right` and the docsets already visited are
// not re-checked. Presumably a labeled `continue` on the outer loop was intended --
// confirm.
continue;
}
}
candidate = left.advance();
debug_assert_eq!(candidate, self.left.doc());
debug_assert_eq!(candidate, self.right.doc());
debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate));
return candidate;
}
}
@@ -165,13 +169,25 @@ impl<TDocSet: DocSet, TOtherDocSet: DocSet> DocSet for Intersection<TDocSet, TOt
///
/// Some implementations may choose to advance past the target if beneficial for performance.
/// The return value is `Found` if the target is in the docset, and `NewTarget` otherwise.
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
self.left.seek_into_the_danger_zone(target)
&& self.right.seek_into_the_danger_zone(target)
&& self
.others
.iter_mut()
.all(|docset| docset.seek_into_the_danger_zone(target))
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
    // Probe `left`, then `right`, then every docset in `others`, in that order.
    // The first result that is not `Found` is handed back unchanged and the
    // docsets after it are not touched.
    let left_result = self.left.seek_into_the_danger_zone(target);
    if left_result != SeekIntoTheDangerZoneResult::Found {
        return left_result;
    }
    let right_result = self.right.seek_into_the_danger_zone(target);
    if right_result != SeekIntoTheDangerZoneResult::Found {
        return right_result;
    }
    // `map` is lazy and `find` stops at the first miss, so the probing order and
    // short-circuiting match an explicit loop with an early return.
    self.others
        .iter_mut()
        .map(|docset| docset.seek_into_the_danger_zone(target))
        .find(|result| *result != SeekIntoTheDangerZoneResult::Found)
        .unwrap_or(SeekIntoTheDangerZoneResult::Found)
}
fn doc(&self) -> DocId {

View File

@@ -1,4 +1,4 @@
use crate::docset::{DocSet, TERMINATED};
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
@@ -193,11 +193,16 @@ impl<TPostings: Postings> DocSet for PhrasePrefixScorer<TPostings> {
self.advance()
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
if self.phrase_scorer.seek_into_the_danger_zone(target) {
self.matches_prefix()
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
if let SeekIntoTheDangerZoneResult::NewTarget(new_target) =
self.phrase_scorer.seek_into_the_danger_zone(target)
{
return SeekIntoTheDangerZoneResult::NewTarget(new_target);
}
if self.matches_prefix() {
SeekIntoTheDangerZoneResult::Found
} else {
false
SeekIntoTheDangerZoneResult::NewTarget(target)
}
}

View File

@@ -1,6 +1,6 @@
use std::cmp::Ordering;
use crate::docset::{DocSet, TERMINATED};
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
use crate::fieldnorm::FieldNormReader;
use crate::postings::Postings;
use crate::query::bm25::Bm25Weight;
@@ -530,12 +530,18 @@ impl<TPostings: Postings> DocSet for PhraseScorer<TPostings> {
self.advance()
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
debug_assert!(target >= self.doc());
if self.intersection_docset.seek_into_the_danger_zone(target) && self.phrase_match() {
return true;
match self.intersection_docset.seek_into_the_danger_zone(target) {
SeekIntoTheDangerZoneResult::Found => {
if self.phrase_match() {
SeekIntoTheDangerZoneResult::Found
} else {
SeekIntoTheDangerZoneResult::NewTarget(target)
}
}
new_target => new_target,
}
false
}
fn doc(&self) -> DocId {

View File

@@ -3,6 +3,7 @@ use std::ops::RangeInclusive;
use columnar::Column;
use crate::docset::SeekIntoTheDangerZoneResult;
use crate::{DocId, DocSet, TERMINATED};
/// Helper to have a cursor over a vec of docids
@@ -184,6 +185,34 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
doc
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
    if self.is_last_seek_distance_large(target) {
        self.reset_fetch_range();
    }

    // Contrary to seek, we fetch at most a single block.
    // The value returned by `fetch_horizon` is only used further down, to decide
    // whether an exhausted cursor is reported as `TERMINATED`.
    let fetched_last_block = if target > self.next_fetch_start {
        self.next_fetch_start = target;
        self.fetch_horizon(DEFAULT_FETCH_HORIZON)
    } else {
        false
    };

    // Step through the loaded docs, ignoring everything before `target`. The first
    // doc at or past `target` decides the outcome.
    while let Some(loaded_doc) = self.loaded_docs.next() {
        if loaded_doc == target {
            return SeekIntoTheDangerZoneResult::Found;
        }
        if loaded_doc > target {
            return SeekIntoTheDangerZoneResult::NewTarget(loaded_doc);
        }
    }

    // The cursor ran dry without reaching `target`.
    let fallback = if fetched_last_block { TERMINATED } else { target };
    SeekIntoTheDangerZoneResult::NewTarget(fallback)
}
fn size_hint(&self) -> u32 {
// TODO: Implement a better size hint
self.column.num_docs() / 10

View File

@@ -1,6 +1,6 @@
use std::marker::PhantomData;
use crate::docset::DocSet;
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult};
use crate::query::score_combiner::ScoreCombiner;
use crate::query::Scorer;
use crate::{DocId, Score};
@@ -56,7 +56,7 @@ where
self.req_scorer.seek(target)
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
self.score_cache = None;
self.req_scorer.seek_into_the_danger_zone(target)
}

View File

@@ -1,6 +1,6 @@
use common::TinySet;
use crate::docset::{DocSet, TERMINATED};
use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED};
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::size_hint::estimate_union;
use crate::query::Scorer;
@@ -223,25 +223,30 @@ where
}
}
fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool {
fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult {
if self.is_in_horizon(target) {
// Our value is within the buffered horizon and the docset may already have been
// processed and removed, so we need to use seek, which uses the regular advance.
self.seek(target) == target
if self.seek(target) == target {
SeekIntoTheDangerZoneResult::Found
} else {
SeekIntoTheDangerZoneResult::NewTarget(self.doc())
}
} else {
// The docsets are not in the buffered range, so we can use seek_into_the_danger_zone
// of the underlying docsets
let is_hit = self
.docsets
.iter_mut()
.any(|docset| docset.seek_into_the_danger_zone(target));
let is_hit = self.docsets.iter_mut().any(|docset| {
docset.seek_into_the_danger_zone(target) == SeekIntoTheDangerZoneResult::Found
});
// The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone`
// returns `Found`.
if is_hit {
self.seek(target);
SeekIntoTheDangerZoneResult::Found
} else {
SeekIntoTheDangerZoneResult::NewTarget(target)
}
is_hit
}
}