From f5939b2e4c599909688369c58aa3ba7ea42dcc08 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 30 Dec 2025 19:00:46 +0100 Subject: [PATCH] Adds seek into the danger zone for fastfield range docsets. --- src/docset.rs | 20 ++++-- src/query/boost_query.rs | 5 +- src/query/disjunction.rs | 12 +++- src/query/intersection.rs | 62 ++++++++++++------- .../phrase_prefix_scorer.rs | 15 +++-- src/query/phrase_query/phrase_scorer.rs | 16 +++-- .../range_query/fast_field_range_doc_set.rs | 29 +++++++++ src/query/reqopt_scorer.rs | 4 +- src/query/union/buffered_union.rs | 21 ++++--- 9 files changed, 132 insertions(+), 52 deletions(-) diff --git a/src/docset.rs b/src/docset.rs index 01ea1125a..337cb40d2 100644 --- a/src/docset.rs +++ b/src/docset.rs @@ -60,7 +60,7 @@ pub trait DocSet: Send { /// ## API Behaviour /// If `seek_into_the_danger_zone` is returning true, a call to `doc()` has to return target. /// If `seek_into_the_danger_zone` is returning false, a call to `doc()` may return any doc - /// between the last doc that matched and target or a doc that is a valid next hit after + /// greater than the last doc that matched and target or a doc that is a valid next hit after /// target. The DocSet is considered to be in an invalid state until /// `seek_into_the_danger_zone` returns true again. /// @@ -70,12 +70,16 @@ pub trait DocSet: Send { /// /// # Warning /// This is an advanced API used by intersection. The API contract is tricky, avoid using it. - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { let current_doc = self.doc(); if current_doc < target { self.seek(target); } - self.doc() == target + if self.doc() == target { + SeekIntoTheDangerZoneResult::Found + } else { + SeekIntoTheDangerZoneResult::NewTarget(self.doc()) + } } /// Fills a given mutable buffer with the next doc ids from the @@ -166,6 +170,12 @@ pub trait DocSet: Send { } } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SeekIntoTheDangerZoneResult { + Found, + NewTarget(DocId), +} + impl DocSet for &mut dyn DocSet { fn advance(&mut self) -> u32 { (**self).advance() @@ -175,7 +185,7 @@ impl DocSet for &mut dyn DocSet { (**self).seek(target) } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { (**self).seek_into_the_danger_zone(target) } @@ -211,7 +221,7 @@ impl DocSet for Box { unboxed.seek(target) } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { let unboxed: &mut TDocSet = self.borrow_mut(); unboxed.seek_into_the_danger_zone(target) } diff --git a/src/query/boost_query.rs b/src/query/boost_query.rs index ecbf3d8d6..95224ab2f 100644 --- a/src/query/boost_query.rs +++ b/src/query/boost_query.rs @@ -1,6 +1,6 @@ use std::fmt; -use crate::docset::COLLECT_BLOCK_BUFFER_LEN; +use crate::docset::{SeekIntoTheDangerZoneResult, COLLECT_BLOCK_BUFFER_LEN}; use crate::fastfield::AliveBitSet; use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight}; use crate::{DocId, DocSet, Score, SegmentReader, Term}; @@ -104,7 +104,8 @@ impl DocSet for BoostScorer { fn seek(&mut self, target: DocId) -> DocId { self.underlying.seek(target) } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { self.underlying.seek_into_the_danger_zone(target) } diff --git a/src/query/disjunction.rs b/src/query/disjunction.rs index b2f1080fc..a959f80c0 100644 --- a/src/query/disjunction.rs +++ b/src/query/disjunction.rs @@ -1,6 +1,7 @@ use std::cmp::Ordering; use std::collections::BinaryHeap; +use crate::docset::SeekIntoTheDangerZoneResult; use crate::query::score_combiner::DoNothingCombiner; use crate::query::{ScoreCombiner, Scorer}; use crate::{DocId, DocSet, Score, TERMINATED}; @@ -67,9 +68,16 @@ impl DocSet for ScorerWrapper { self.current_doc = doc_id; doc_id } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { let found = self.scorer.seek_into_the_danger_zone(target); - self.current_doc = self.scorer.doc(); + match found { + crate::docset::SeekIntoTheDangerZoneResult::Found => { + self.current_doc = self.scorer.doc(); + } + crate::docset::SeekIntoTheDangerZoneResult::NewTarget(current_doc) => { + self.current_doc = current_doc; + } + } found } diff --git a/src/query/intersection.rs b/src/query/intersection.rs index 3e8677d98..6a6cd402e 100644 --- a/src/query/intersection.rs +++ b/src/query/intersection.rs @@ -1,5 +1,5 @@ use super::size_hint::estimate_intersection; -use crate::docset::{DocSet, TERMINATED}; +use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED}; use crate::query::term_query::TermScorer; use crate::query::{EmptyScorer, Scorer}; use crate::{DocId, Score}; @@ -117,14 +117,15 @@ impl DocSet for Intersection candidate.wrapping_add(100) { - candidate = left.seek(right_doc); + if right_new_target > candidate.wrapping_add(100) { + candidate = left.seek(right_new_target); } else { candidate = left.advance(); } @@ -135,17 +136,20 @@ impl DocSet for Intersection candidate.wrapping_add(100) { + candidate = left.seek(new_target); + } else { + candidate = left.advance(); + } + continue; + } } - candidate = left.advance(); + debug_assert_eq!(candidate, self.left.doc()); + debug_assert_eq!(candidate, self.right.doc()); + debug_assert!(self.others.iter().all(|docset| docset.doc() == candidate)); + return candidate; } } @@ -165,13 +169,25 @@ impl DocSet for Intersection bool { - self.left.seek_into_the_danger_zone(target) - && self.right.seek_into_the_danger_zone(target) - && self - .others - .iter_mut() - .all(|docset| docset.seek_into_the_danger_zone(target)) + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { + if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = + self.left.seek_into_the_danger_zone(target) + { + return SeekIntoTheDangerZoneResult::NewTarget(new_target); + } + if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = + self.right.seek_into_the_danger_zone(target) + { + return SeekIntoTheDangerZoneResult::NewTarget(new_target); + } + for docset in &mut self.others { + if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = + docset.seek_into_the_danger_zone(target) + { + return SeekIntoTheDangerZoneResult::NewTarget(new_target); + } + } + SeekIntoTheDangerZoneResult::Found } fn doc(&self) -> DocId { diff --git a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs index cc7bb7886..75e1025bd 100644 --- a/src/query/phrase_prefix_query/phrase_prefix_scorer.rs +++ b/src/query/phrase_prefix_query/phrase_prefix_scorer.rs @@ -1,4 +1,4 @@ -use crate::docset::{DocSet, TERMINATED}; +use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; @@ -193,11 +193,16 @@ impl DocSet for PhrasePrefixScorer { self.advance() } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { - if self.phrase_scorer.seek_into_the_danger_zone(target) { - self.matches_prefix() + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { + if let SeekIntoTheDangerZoneResult::NewTarget(new_target) = + self.phrase_scorer.seek_into_the_danger_zone(target) + { + return SeekIntoTheDangerZoneResult::NewTarget(new_target); + } + if self.matches_prefix() { + SeekIntoTheDangerZoneResult::Found } else { - false + SeekIntoTheDangerZoneResult::NewTarget(target) } } diff --git a/src/query/phrase_query/phrase_scorer.rs b/src/query/phrase_query/phrase_scorer.rs index 4f8541cd2..7322bb979 100644 --- a/src/query/phrase_query/phrase_scorer.rs +++ b/src/query/phrase_query/phrase_scorer.rs @@ -1,6 +1,6 @@ use std::cmp::Ordering; -use crate::docset::{DocSet, TERMINATED}; +use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED}; use crate::fieldnorm::FieldNormReader; use crate::postings::Postings; use crate::query::bm25::Bm25Weight; @@ -530,12 +530,18 @@ impl DocSet for PhraseScorer { self.advance() } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { debug_assert!(target >= self.doc()); - if self.intersection_docset.seek_into_the_danger_zone(target) && self.phrase_match() { - return true; + match self.intersection_docset.seek_into_the_danger_zone(target) { + SeekIntoTheDangerZoneResult::Found => { + if self.phrase_match() { + SeekIntoTheDangerZoneResult::Found + } else { + SeekIntoTheDangerZoneResult::NewTarget(target) + } + } + new_target => new_target, } - false } fn doc(&self) -> DocId { diff --git a/src/query/range_query/fast_field_range_doc_set.rs b/src/query/range_query/fast_field_range_doc_set.rs index 24d2b1fe3..0b742eee9 100644 --- a/src/query/range_query/fast_field_range_doc_set.rs +++ b/src/query/range_query/fast_field_range_doc_set.rs @@ -3,6 +3,7 @@ use std::ops::RangeInclusive; use columnar::Column; +use crate::docset::SeekIntoTheDangerZoneResult; use crate::{DocId, DocSet, TERMINATED}; /// Helper to have a cursor over a vec of docids @@ -184,6 +185,34 @@ impl DocSet for RangeDocSe doc } + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { + if self.is_last_seek_distance_large(target) { + self.reset_fetch_range(); + } + let last_block: bool; + if target > self.next_fetch_start { + self.next_fetch_start = target; + // Contrary to seek, we fetch at most a single block. + last_block = self.fetch_horizon(DEFAULT_FETCH_HORIZON); + } else { + last_block = false; + } + while let Some(loaded_doc) = self.loaded_docs.next() { + if loaded_doc < target { + continue; + } else if loaded_doc == target { + return SeekIntoTheDangerZoneResult::Found; + } else { + return SeekIntoTheDangerZoneResult::NewTarget(loaded_doc); + } + } + if last_block { + SeekIntoTheDangerZoneResult::NewTarget(TERMINATED) + } else { + SeekIntoTheDangerZoneResult::NewTarget(target) + } + } + fn size_hint(&self) -> u32 { // TODO: Implement a better size hint self.column.num_docs() / 10 diff --git a/src/query/reqopt_scorer.rs b/src/query/reqopt_scorer.rs index 45857567c..ce8992e8b 100644 --- a/src/query/reqopt_scorer.rs +++ b/src/query/reqopt_scorer.rs @@ -1,6 +1,6 @@ use std::marker::PhantomData; -use crate::docset::DocSet; +use crate::docset::{DocSet, SeekIntoTheDangerZoneResult}; use crate::query::score_combiner::ScoreCombiner; use crate::query::Scorer; use crate::{DocId, Score}; @@ -56,7 +56,7 @@ where self.req_scorer.seek(target) } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { self.score_cache = None; self.req_scorer.seek_into_the_danger_zone(target) } diff --git a/src/query/union/buffered_union.rs b/src/query/union/buffered_union.rs index 70299ad6f..128c0c670 100644 --- a/src/query/union/buffered_union.rs +++ b/src/query/union/buffered_union.rs @@ -1,6 +1,6 @@ use common::TinySet; -use crate::docset::{DocSet, TERMINATED}; +use crate::docset::{DocSet, SeekIntoTheDangerZoneResult, TERMINATED}; use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner}; use crate::query::size_hint::estimate_union; use crate::query::Scorer; @@ -223,25 +223,30 @@ where } } - fn seek_into_the_danger_zone(&mut self, target: DocId) -> bool { + fn seek_into_the_danger_zone(&mut self, target: DocId) -> SeekIntoTheDangerZoneResult { if self.is_in_horizon(target) { // Our value is within the buffered horizon and the docset may already have been // processed and removed, so we need to use seek, which uses the regular advance. - self.seek(target) == target + if self.seek(target) == target { + SeekIntoTheDangerZoneResult::Found + } else { + SeekIntoTheDangerZoneResult::NewTarget(self.doc()) + } } else { // The docsets are not in the buffered range, so we can use seek_into_the_danger_zone // of the underlying docsets - let is_hit = self - .docsets - .iter_mut() - .any(|docset| docset.seek_into_the_danger_zone(target)); + let is_hit = self.docsets.iter_mut().any(|docset| { + docset.seek_into_the_danger_zone(target) == SeekIntoTheDangerZoneResult::Found + }); // The API requires the DocSet to be in a valid state when `seek_into_the_danger_zone` // returns true. if is_hit { self.seek(target); + SeekIntoTheDangerZoneResult::Found + } else { + SeekIntoTheDangerZoneResult::NewTarget(target) } - is_hit } }