From c0f21a45ae99a37996d1edaceb434d24ab3f057c Mon Sep 17 00:00:00 2001 From: Stu Hood Date: Thu, 18 Dec 2025 03:13:23 -0800 Subject: [PATCH] Use a strict comparison in TopNComputer (#2777) * Remove `(Partial)Ord` from `ComparableDoc`, and unify comparison between `TopNComputer` and `Comparator`. * Doc cleanups. * Require Ord for `ComparableDoc`. * Semantics are actually _ascending_ DocId order. * Adjust docs again for ascending DocId order. * minor change --------- Co-authored-by: Paul Masurel --- src/collector/sort_key/mod.rs | 32 +++++++--- src/collector/sort_key/order.rs | 3 +- src/collector/top_collector.rs | 58 +++--------------- src/collector/top_score_collector.rs | 91 ++++++++++++++++++++-------- src/indexer/delete_queue.rs | 18 ++++-- src/indexer/operation.rs | 6 ++ 6 files changed, 116 insertions(+), 92 deletions(-) diff --git a/src/collector/sort_key/mod.rs b/src/collector/sort_key/mod.rs index a66115633..3bfb3b1c8 100644 --- a/src/collector/sort_key/mod.rs +++ b/src/collector/sort_key/mod.rs @@ -11,7 +11,26 @@ pub use sort_by_string::SortByString; pub use sort_key_computer::{SegmentSortKeyComputer, SortKeyComputer}; #[cfg(test)] -mod tests { +pub(crate) mod tests { + + // By spec, regardless of whether ascending or descending order was requested, in presence of a + // tie, we sort by ascending doc id/doc address. + pub(crate) fn sort_hits( + hits: &mut [ComparableDoc], + order: Order, + ) { + if order.is_asc() { + hits.sort_by(|l, r| l.sort_key.cmp(&r.sort_key).then(l.doc.cmp(&r.doc))); + } else { + hits.sort_by(|l, r| { + l.sort_key + .cmp(&r.sort_key) + .reverse() // This is descending + .then(l.doc.cmp(&r.doc)) + }); + } + } + use std::collections::HashMap; use std::ops::Range; @@ -372,15 +391,10 @@ mod tests { // Using the TopDocs collector should always be equivalent to sorting, skipping the // offset, and then taking the limit. - let sorted_docs: Vec<_> = if order.is_desc() { - let mut comparable_docs: Vec> = + let sorted_docs: Vec<_> = { + let mut comparable_docs: Vec> = all_results.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc}).collect(); - comparable_docs.sort(); - comparable_docs.into_iter().map(|cd| (cd.sort_key, cd.doc)).collect() - } else { - let mut comparable_docs: Vec> = - all_results.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc}).collect(); - comparable_docs.sort(); + sort_hits(&mut comparable_docs, order); comparable_docs.into_iter().map(|cd| (cd.sort_key, cd.doc)).collect() }; let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::>(); diff --git a/src/collector/sort_key/order.rs b/src/collector/sort_key/order.rs index 923d5cb8e..40a718b90 100644 --- a/src/collector/sort_key/order.rs +++ b/src/collector/sort_key/order.rs @@ -30,7 +30,8 @@ impl Comparator for NaturalComparator { /// first. /// /// The ReverseComparator does not necessarily imply that the sort order is reversed compared -/// to the NaturalComparator. In presence of a tie, both version will retain the higher doc ids. +/// to the NaturalComparator. In presence of a tie on the sort key, documents will always be +/// sorted by ascending `DocId`/`DocAddress` in TopN results, regardless of the comparator. #[derive(Debug, Copy, Clone, Default, Serialize, Deserialize)] pub struct ReverseComparator; diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 6981c86c9..1990b3837 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -1,64 +1,22 @@ -use std::cmp::Ordering; - use serde::{Deserialize, Serialize}; /// Contains a feature (field, score, etc.) of a document along with the document address. /// -/// It guarantees stable sorting: in case of a tie on the feature, the document -/// address is used. -/// -/// The REVERSE_ORDER generic parameter controls whether the by-feature order -/// should be reversed, which is useful for achieving for example largest-first -/// semantics without having to wrap the feature in a `Reverse`. -#[derive(Clone, Default, Serialize, Deserialize)] -pub struct ComparableDoc { +/// Used only by TopNComputer, which implements the actual comparison via a `Comparator`. +#[derive(Clone, Default, Eq, PartialEq, Serialize, Deserialize)] +pub struct ComparableDoc { /// The feature of the document. In practice, this is - /// is any type that implements `PartialOrd`. + /// is a type which can be compared with a `Comparator`. pub sort_key: T, - /// The document address. In practice, this is any - /// type that implements `PartialOrd`, and is guaranteed - /// to be unique for each document. + /// The document address. In practice, this is either a `DocId` or `DocAddress`. pub doc: D, } -impl std::fmt::Debug - for ComparableDoc -{ + +impl std::fmt::Debug for ComparableDoc { fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result { - f.debug_struct(format!("ComparableDoc<_, _ {R}").as_str()) + f.debug_struct("ComparableDoc") .field("feature", &self.sort_key) .field("doc", &self.doc) .finish() } } - -impl PartialOrd for ComparableDoc { - fn partial_cmp(&self, other: &Self) -> Option { - Some(self.cmp(other)) - } -} - -impl Ord for ComparableDoc { - #[inline] - fn cmp(&self, other: &Self) -> Ordering { - let by_feature = self - .sort_key - .partial_cmp(&other.sort_key) - .map(|ord| if R { ord.reverse() } else { ord }) - .unwrap_or(Ordering::Equal); - - let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal); - - // In case of a tie on the feature, we sort by ascending - // `DocAddress` in order to ensure a stable sorting of the - // documents. - by_feature.then_with(lazy_by_doc_address) - } -} - -impl PartialEq for ComparableDoc { - fn eq(&self, other: &Self) -> bool { - self.cmp(other) == Ordering::Equal - } -} - -impl Eq for ComparableDoc {} diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 78c344dbe..3c3f1beb9 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -23,10 +23,9 @@ use crate::{DocAddress, DocId, Order, Score, SegmentReader}; /// The theoretical complexity for collecting the top `K` out of `N` documents /// is `O(N + K)`. /// -/// This collector does not guarantee a stable sorting in case of a tie on the -/// document score, for stable sorting `PartialOrd` needs to resolve on other fields -/// like docid in case of score equality. -/// Only then, it is suitable for pagination. +/// This collector guarantees a stable sorting in case of a tie on the +/// document score/sort key: The document address (`DocAddress`) is used as a tie breaker. +/// In case of a tie on the sort key, documents are always sorted by ascending `DocAddress`. /// /// ```rust /// use tantivy::collector::TopDocs; @@ -500,8 +499,13 @@ where /// /// For TopN == 0, it will be relative expensive. /// -/// When using the natural comparator, the top N computer returns the top N elements in -/// descending order, as expected for a top N. +/// The TopNComputer will tiebreak by using ascending `D` (DocId or DocAddress): +/// i.e., in case of a tie on the sort key, the `DocId|DocAddress` are always sorted in +/// ascending order, regardless of the `Comparator` used for the `Score` type. +/// +/// NOTE: Items must be `push`ed to the TopNComputer in ascending `DocId|DocAddress` order, as the +/// threshold used to eliminate docs does not include the `DocId` or `DocAddress`: this provides +/// the ascending `DocId|DocAddress` tie-breaking behavior without additional comparisons. #[derive(Serialize, Deserialize)] #[serde(from = "TopNComputerDeser")] pub struct TopNComputer { @@ -580,6 +584,18 @@ where } } +#[inline(always)] +fn compare_for_top_k>( + c: &C, + lhs: &ComparableDoc, + rhs: &ComparableDoc, +) -> std::cmp::Ordering { + c.compare(&lhs.sort_key, &rhs.sort_key) + .reverse() // Reverse here because we want top K. + .then_with(|| lhs.doc.cmp(&rhs.doc)) // Regardless of asc/desc, in presence of a tie, we + // sort by doc id +} + impl TopNComputer where D: Ord, @@ -600,10 +616,13 @@ where /// Push a new document to the top n. /// If the document is below the current threshold, it will be ignored. + /// + /// NOTE: `push` must be called in ascending `DocId`/`DocAddress` order. #[inline] pub fn push(&mut self, sort_key: TSortKey, doc: D) { if let Some(last_median) = &self.threshold { - if self.comparator.compare(&sort_key, last_median) == Ordering::Less { + // See the struct docs for an explanation of why this comparison is strict. + if self.comparator.compare(&sort_key, last_median) != Ordering::Greater { return; } } @@ -629,9 +648,7 @@ where fn truncate_top_n(&mut self) -> TSortKey { // Use select_nth_unstable to find the top nth score let (_, median_el, _) = self.buffer.select_nth_unstable_by(self.top_n, |lhs, rhs| { - self.comparator - .compare(&rhs.sort_key, &lhs.sort_key) - .then_with(|| lhs.doc.cmp(&rhs.doc)) + compare_for_top_k(&self.comparator, lhs, rhs) }); let median_score = median_el.sort_key.clone(); @@ -646,11 +663,8 @@ where if self.buffer.len() > self.top_n { self.truncate_top_n(); } - self.buffer.sort_unstable_by(|left, right| { - self.comparator - .compare(&right.sort_key, &left.sort_key) - .then_with(|| left.doc.cmp(&right.doc)) - }); + self.buffer + .sort_unstable_by(|lhs, rhs| compare_for_top_k(&self.comparator, lhs, rhs)); self.buffer } @@ -755,6 +769,33 @@ mod tests { ); } + #[test] + fn test_topn_computer_duplicates() { + let mut computer: TopNComputer = + TopNComputer::new_with_comparator(2, NaturalComparator); + + computer.push(1u32, 1u32); + computer.push(1u32, 2u32); + computer.push(1u32, 3u32); + computer.push(1u32, 4u32); + computer.push(1u32, 5u32); + + // In the presence of duplicates, DocIds are always ascending order. + assert_eq!( + computer.into_sorted_vec(), + &[ + ComparableDoc { + sort_key: 1u32, + doc: 1u32, + }, + ComparableDoc { + sort_key: 1u32, + doc: 2u32, + } + ] + ); + } + #[test] fn test_topn_computer_no_panic() { for top_n in 0..10 { @@ -772,14 +813,17 @@ mod tests { #[test] fn test_topn_computer_asc_prop( limit in 0..10_usize, - docs in proptest::collection::vec((0..100_u64, 0..100_u64), 0..100_usize), + mut docs in proptest::collection::vec((0..100_u64, 0..100_u64), 0..100_usize), ) { + // NB: TopNComputer must receive inputs in ascending DocId order. + docs.sort_by_key(|(_, doc_id)| *doc_id); let mut computer: TopNComputer<_, _, ReverseComparator> = TopNComputer::new_with_comparator(limit, ReverseComparator); for (feature, doc) in &docs { computer.push(*feature, *doc); } - let mut comparable_docs: Vec> = docs.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc }).collect::>(); - comparable_docs.sort(); + let mut comparable_docs: Vec> = + docs.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc }).collect(); + crate::collector::sort_key::tests::sort_hits(&mut comparable_docs, Order::Asc); comparable_docs.truncate(limit); prop_assert_eq!( computer.into_sorted_vec(), @@ -1406,15 +1450,10 @@ mod tests { // Using the TopDocs collector should always be equivalent to sorting, skipping the // offset, and then taking the limit. - let sorted_docs: Vec<_> = if order.is_desc() { - let mut comparable_docs: Vec> = + let sorted_docs: Vec<_> = { + let mut comparable_docs: Vec> = all_results.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc}).collect(); - comparable_docs.sort(); - comparable_docs.into_iter().map(|cd| (cd.sort_key, cd.doc)).collect() - } else { - let mut comparable_docs: Vec> = - all_results.into_iter().map(|(sort_key, doc)| ComparableDoc { sort_key, doc}).collect(); - comparable_docs.sort(); + crate::collector::sort_key::tests::sort_hits(&mut comparable_docs, order); comparable_docs.into_iter().map(|cd| (cd.sort_key, cd.doc)).collect() }; let expected_docs = sorted_docs.into_iter().skip(offset).take(limit).collect::>(); diff --git a/src/indexer/delete_queue.rs b/src/indexer/delete_queue.rs index 3aa9f0d85..25c84fb36 100644 --- a/src/indexer/delete_queue.rs +++ b/src/indexer/delete_queue.rs @@ -23,13 +23,18 @@ struct InnerDeleteQueue { last_block: Weak, } +/// The delete queue is a linked list storing delete operations. +/// +/// Several consumers can hold a reference to it. Delete operations +/// get dropped/gc'ed when no more consumers are holding a reference +/// to them. #[derive(Clone)] pub struct DeleteQueue { inner: Arc>, } impl DeleteQueue { - // Creates a new delete queue. + /// Creates a new empty delete queue. pub fn new() -> DeleteQueue { DeleteQueue { inner: Arc::default(), @@ -58,10 +63,10 @@ impl DeleteQueue { block } - // Creates a new cursor that makes it possible to - // consume future delete operations. - // - // Past delete operations are not accessible. + /// Creates a new cursor that makes it possible to + /// consume future delete operations. + /// + /// Past delete operations are not accessible. pub fn cursor(&self) -> DeleteCursor { let last_block = self.get_last_block(); let operations_len = last_block.operations.len(); @@ -71,7 +76,7 @@ impl DeleteQueue { } } - // Appends a new delete operations. + /// Appends a new delete operations. pub fn push(&self, delete_operation: DeleteOperation) { self.inner .write() @@ -169,6 +174,7 @@ struct Block { next: NextBlock, } +/// As we process delete operations, keeps track of our position. #[derive(Clone)] pub struct DeleteCursor { block: Arc, diff --git a/src/indexer/operation.rs b/src/indexer/operation.rs index 69bffec17..9316f6fa7 100644 --- a/src/indexer/operation.rs +++ b/src/indexer/operation.rs @@ -5,14 +5,20 @@ use crate::Opstamp; /// Timestamped Delete operation. pub struct DeleteOperation { + /// Operation stamp. + /// It is used to check whether the delete operation + /// applies to an added document operation. pub opstamp: Opstamp, + /// Weight is used to define the set of documents to be deleted. pub target: Box, } /// Timestamped Add operation. #[derive(Eq, PartialEq, Debug)] pub struct AddOperation { + /// Operation stamp. pub opstamp: Opstamp, + /// Document to be added. pub document: D, }