From 04627546731dbc5c057b769cb06ce044bf33399d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Massot?= Date: Mon, 1 Nov 2021 01:18:05 +0100 Subject: [PATCH] Optimize block wand for one and several TermScorer. (#1190) * Added optimisation using block wand for single TermScorer. A proptest was also added. * Fix block wand algorithm by taking the last doc id of scores until the pivot scorer (included). * In block wand, when block max score is lower than the threshold, advance the scorer with best score. * Fix wrong condition in block_wand_single_scorer and add debug_assert to have an equality check on doc to break the loop. --- src/query/boolean_query/block_wand.rs | 94 +++++++++++++++++++++++++-- src/query/boolean_query/mod.rs | 1 + src/query/term_query/term_weight.rs | 2 +- 3 files changed, 90 insertions(+), 7 deletions(-) diff --git a/src/query/boolean_query/block_wand.rs b/src/query/boolean_query/block_wand.rs index 9ba658fbf..020b3dc87 100644 --- a/src/query/boolean_query/block_wand.rs +++ b/src/query/boolean_query/block_wand.rs @@ -42,27 +42,39 @@ fn find_pivot_doc( Some((before_pivot_len, pivot_len, pivot_doc)) } -// Before and after calling this method, scorers need to be sorted by their `.doc()`. +/// Advance the scorer with best score among the scorers[..pivot_len] to +/// the next doc candidate defined by the min of `last_doc_in_block + 1` for +/// scorer in scorers[..pivot_len] and `scorer.doc()` for scorer in scorers[pivot_len..]. +/// Note: before and after calling this method, scorers need to be sorted by their `.doc()`. 
fn block_max_was_too_low_advance_one_scorer( scorers: &mut Vec, pivot_len: usize, ) { debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); let mut scorer_to_seek = pivot_len - 1; - let mut doc_to_seek_after = scorers[scorer_to_seek].doc(); + let mut global_max_score = scorers[scorer_to_seek].max_score; + let mut doc_to_seek_after = scorers[scorer_to_seek].last_doc_in_block(); for scorer_ord in (0..pivot_len - 1).rev() { let scorer = &scorers[scorer_ord]; if scorer.last_doc_in_block() <= doc_to_seek_after { doc_to_seek_after = scorer.last_doc_in_block(); + } + if scorers[scorer_ord].max_score > global_max_score { + global_max_score = scorers[scorer_ord].max_score; scorer_to_seek = scorer_ord; } } + // Add +1 to go to the next block unless we are already at the end. + if doc_to_seek_after != TERMINATED { + doc_to_seek_after += 1; + } for scorer in &scorers[pivot_len..] { if scorer.doc() <= doc_to_seek_after { doc_to_seek_after = scorer.doc(); } } - scorers[scorer_to_seek].seek(doc_to_seek_after + 1); + scorers[scorer_to_seek].seek(doc_to_seek_after); + restore_ordering(scorers, scorer_to_seek); debug_assert!(is_sorted(scorers.iter().map(|scorer| scorer.doc()))); } @@ -130,6 +142,9 @@ fn advance_all_scorers_on_pivot(term_scorers: &mut Vec, term_scorers.sort_by_key(|scorer| scorer.doc()); } +/// Implements the Block-Max WAND (Weak AND) algorithm for dynamic pruning +/// described in the paper "Faster Top-k Document Retrieval Using Block-Max Indexes". +/// Link: http://engineering.nyu.edu/~suel/papers/bmw.pdf pub fn block_wand( mut scorers: Vec, mut threshold: Score, @@ -187,6 +202,7 @@ pub fn block_wand( .iter_mut() .map(|scorer| scorer.score()) .sum(); + if score > threshold { threshold = callback(pivot_doc, score); } @@ -195,6 +211,56 @@ pub fn block_wand( } } +/// Specialized version of [`block_wand`] for a single scorer. +/// In this case, the algorithm is simpler, more readable and faster (~3x) +/// than the generic algorithm. 
+/// The algorithm behaves as follows: +/// - While we don't hit the end of the docset: +/// - While the block max score is under the `threshold`, go to the +/// next block. +/// - On a block, advance until the end and execute `callback` +/// when the doc score is strictly greater than the `threshold`. +pub fn block_wand_single_scorer( + mut scorer: TermScorer, + mut threshold: Score, + callback: &mut dyn FnMut(u32, Score) -> Score, +) { + let mut doc = scorer.doc(); + loop { + // We position the scorer on a block that can reach + // the threshold. + while scorer.block_max_score() < threshold { + let last_doc_in_block = scorer.last_doc_in_block(); + if last_doc_in_block == TERMINATED { + return; + } + doc = last_doc_in_block + 1; + scorer.shallow_seek(doc); + } + // Seek will effectively load that block. + doc = scorer.seek(doc); + if doc == TERMINATED { + break; + } + loop { + let score = scorer.score(); + if score > threshold { + threshold = callback(doc, score); + } + debug_assert!(doc <= scorer.last_doc_in_block()); + if doc == scorer.last_doc_in_block() { + break; + } + doc = scorer.advance(); + if doc == TERMINATED { + return; + } + } + doc += 1; + scorer.shallow_seek(doc); + } +} + struct TermScorerWithMaxScore<'a> { scorer: &'a mut TermScorer, max_score: Score, @@ -272,13 +338,14 @@ mod tests { } fn compute_checkpoints_for_each_pruning( - term_scorers: Vec, + mut term_scorers: Vec, n: usize, ) -> Vec<(DocId, Score)> { let mut heap: BinaryHeap = BinaryHeap::with_capacity(n); let mut checkpoints: Vec<(DocId, Score)> = Vec::new(); let mut limit: Score = 0.0; - super::block_wand(term_scorers, Score::MIN, &mut |doc, score| { + + let callback = &mut |doc, score| { heap.push(Float(score)); if heap.len() > n { heap.pop().unwrap(); @@ -290,7 +357,14 @@ mod tests { checkpoints.push((doc, score)); } limit - }); + }; + + if term_scorers.len() == 1 { + let scorer = term_scorers.pop().unwrap(); + super::block_wand_single_scorer(scorer, Score::MIN, callback); + } else { + 
super::block_wand(term_scorers, Score::MIN, callback); + } checkpoints } @@ -424,6 +498,14 @@ mod tests { } } + proptest! { + #![proptest_config(ProptestConfig::with_cases(500))] + #[test] + fn test_block_wand_single_term_scorer((posting_lists, fieldnorms) in gen_term_scorers(1)) { + test_block_wand_aux(&posting_lists[..], &fieldnorms[..]); + } + } + #[test] fn test_fn_reproduce_proptest() { let postings_lists = &[ diff --git a/src/query/boolean_query/mod.rs b/src/query/boolean_query/mod.rs index c2d80ae42..605a62e9f 100644 --- a/src/query/boolean_query/mod.rs +++ b/src/query/boolean_query/mod.rs @@ -3,6 +3,7 @@ mod boolean_query; mod boolean_weight; pub(crate) use self::block_wand::block_wand; +pub(crate) use self::block_wand::block_wand_single_scorer; pub use self::boolean_query::BooleanQuery; #[cfg(test)] diff --git a/src/query/term_query/term_weight.rs b/src/query/term_query/term_weight.rs index 51779124b..23222424c 100644 --- a/src/query/term_query/term_weight.rs +++ b/src/query/term_query/term_weight.rs @@ -79,7 +79,7 @@ impl Weight for TermWeight { callback: &mut dyn FnMut(DocId, Score) -> Score, ) -> crate::Result<()> { let scorer = self.specialized_scorer(reader, 1.0)?; - crate::query::boolean_query::block_wand(vec![scorer], threshold, callback); + crate::query::boolean_query::block_wand_single_scorer(scorer, threshold, callback); Ok(()) } }