mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-21 18:50:42 +00:00
Suffix-sum pruning for multi-term intersection candidates
After scoring each secondary in Phase 2, check whether remaining secondaries' block_max scores can still beat the threshold. Skip to the next candidate early if impossible, avoiding expensive seeks into later secondaries. Improves three-term intersection by ~8% on the balanced benchmark while keeping two-term performance neutral. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -291,13 +291,12 @@ impl BlockSegmentPostings {
|
||||
/// `.load_block()` needs to be called manually afterwards.
|
||||
/// If all docs are smaller than target, the block loaded may be empty,
|
||||
/// or be the last, incomplete VInt block.
|
||||
#[inline]
|
||||
pub(crate) fn seek_block(&mut self, target_doc: DocId) -> bool {
|
||||
// Forward the seek to the skip reader. When it reports `true`, the cached
// block-max score is dropped and the block is flagged as not loaded.
// NOTE(review): presumably `seek` returns `true` when it moved to a different
// block — confirm against `SkipReader::seek`.
if self.skip_reader.seek(target_doc) {
|
||||
self.block_max_score_cache = None;
|
||||
self.block_loaded = false;
|
||||
}
|
||||
// NOTE(review): this text is a rendered diff. The two expressions below look like
// the before/after forms of the same return value (direct read of the
// `remaining_docs` field vs. the `has_remaining_docs()` accessor); presumably only
// one of them is present in the actual file.
self.skip_reader.remaining_docs != 0
|
||||
self.skip_reader.has_remaining_docs()
|
||||
}
|
||||
|
||||
pub(crate) fn block_is_loaded(&self) -> bool {
|
||||
|
||||
@@ -96,7 +96,7 @@ pub(crate) struct SkipReader {
|
||||
owned_read: OwnedBytes,
|
||||
skip_info: IndexRecordOption,
|
||||
byte_offset: usize,
|
||||
pub remaining_docs: u32, // number of docs remaining, including the
|
||||
remaining_docs: u32, // number of docs remaining, including the
|
||||
// documents in the current block.
|
||||
block_info: BlockInfo,
|
||||
|
||||
@@ -146,6 +146,11 @@ impl SkipReader {
|
||||
skip_reader
|
||||
}
|
||||
|
||||
/// Returns `true` if `remaining_docs` is non-zero, i.e. there are still docs
/// remaining (the count includes the documents in the current block).
#[inline(always)]
|
||||
pub fn has_remaining_docs(&self) -> bool {
|
||||
self.remaining_docs != 0
|
||||
}
|
||||
|
||||
pub fn reset(&mut self, data: OwnedBytes, doc_freq: u32) {
|
||||
self.last_doc_in_block = if doc_freq >= COMPRESSION_BLOCK_SIZE as u32 {
|
||||
0
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::Scorer;
|
||||
use crate::query::weight::for_each_pruning_scorer;
|
||||
use crate::query::{Intersection, Scorer};
|
||||
use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
|
||||
/// Block-max pruning for top-K over intersection of term scorers.
|
||||
@@ -15,13 +16,15 @@ use crate::{DocId, DocSet, Score, TERMINATED};
|
||||
///
|
||||
/// # Preconditions
|
||||
/// - `scorers` has at least 2 elements
|
||||
/// - `scorers` has less than 16 elements
|
||||
/// - All scorers read frequencies (`FreqReadingOption::ReadFreq`)
|
||||
pub fn block_wand_intersection(
|
||||
pub(crate) fn block_wand_intersection(
|
||||
mut scorers: Vec<TermScorer>,
|
||||
mut threshold: Score,
|
||||
callback: &mut dyn FnMut(DocId, Score) -> Score,
|
||||
) {
|
||||
assert!(scorers.len() >= 2);
|
||||
assert!(scorers.len() <= 16);
|
||||
|
||||
// Sort by cost (ascending). scorers[0] becomes the "leader" (rarest term).
|
||||
scorers.sort_by_key(TermScorer::size_hint);
|
||||
@@ -61,12 +64,28 @@ pub fn block_wand_intersection(
|
||||
let mut window_end: DocId = leader.last_doc_in_block();
|
||||
|
||||
let mut secondary_block_max_sum: Score = 0.0;
|
||||
for secondary in secondaries.iter_mut() {
|
||||
let mut secondary_block_max_scores = [0.0f32; 16];
|
||||
let num_secondaries = secondaries.len();
|
||||
for (idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
if !secondary.block_cursor().seek_block(doc) {
|
||||
return;
|
||||
}
|
||||
window_end = window_end.min(secondary.last_doc_in_block());
|
||||
secondary_block_max_sum += secondary.block_max_score();
|
||||
let bms = secondary.block_max_score();
|
||||
secondary_block_max_scores[idx] = bms;
|
||||
secondary_block_max_sum += bms;
|
||||
}
|
||||
|
||||
// Precompute suffix sums: suffix[i] = sum of block_max for secondaries[i+1..].
|
||||
// Used in Phase 2 to prune candidates that can't beat threshold even with
|
||||
// remaining secondaries contributing their block_max.
|
||||
let mut secondary_suffix_block_max = [0.0f32; 16];
|
||||
{
|
||||
let mut running = 0.0f32;
|
||||
for idx in (0..num_secondaries).rev() {
|
||||
secondary_suffix_block_max[idx] = running;
|
||||
running += secondary_block_max_scores[idx];
|
||||
}
|
||||
}
|
||||
|
||||
if leader_block_max + secondary_block_max_sum <= threshold {
|
||||
@@ -122,7 +141,7 @@ pub fn block_wand_intersection(
|
||||
let candidate_doc = candidate_doc_ids[candidate_idx];
|
||||
let mut total_score: Score = candidate_scores[candidate_idx];
|
||||
|
||||
for secondary in secondaries.iter_mut() {
|
||||
for (secondary_idx, secondary) in secondaries.iter_mut().enumerate() {
|
||||
// If a previous candidate already advanced this secondary past
|
||||
// candidate_doc, the candidate can't be in the intersection.
|
||||
if secondary.doc() > candidate_doc {
|
||||
@@ -133,6 +152,12 @@ pub fn block_wand_intersection(
|
||||
continue 'next_candidate;
|
||||
}
|
||||
total_score += secondary.score();
|
||||
|
||||
// Prune: even if all remaining secondaries score at their block max,
|
||||
// can we still beat the threshold?
|
||||
if total_score + secondary_suffix_block_max[secondary_idx] <= threshold {
|
||||
continue 'next_candidate;
|
||||
}
|
||||
}
|
||||
|
||||
// All secondaries matched.
|
||||
|
||||
@@ -9,8 +9,8 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
|
||||
use crate::query::term_query::TermScorer;
|
||||
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
|
||||
use crate::query::{
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
|
||||
RequiredOptionalScorer, Scorer, Weight,
|
||||
intersect_scorers, AllScorer, BufferedUnionScorer, EmptyScorer, Exclude, Explanation,
|
||||
Intersection, Occur, RequiredOptionalScorer, Scorer, Weight,
|
||||
};
|
||||
use crate::{DocId, Score};
|
||||
|
||||
@@ -574,7 +574,12 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
|
||||
super::block_wand(term_scorers, threshold, callback);
|
||||
}
|
||||
SpecializedScorer::TermIntersection(term_scorers) => {
|
||||
super::block_wand_intersection(term_scorers, threshold, callback);
|
||||
if term_scorers.len() >= 16 {
|
||||
let mut intersection = Intersection::new(term_scorers, reader.max_doc());
|
||||
for_each_pruning_scorer(&mut intersection, threshold, callback);
|
||||
} else {
|
||||
super::block_wand_intersection(term_scorers, threshold, callback);
|
||||
}
|
||||
}
|
||||
SpecializedScorer::Other(mut scorer) => {
|
||||
for_each_pruning_scorer(scorer.as_mut(), threshold, callback);
|
||||
|
||||
Reference in New Issue
Block a user