diff --git a/CHANGELOG.md b/CHANGELOG.md index a0df3a1c6..6a637d32f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Tantivy 0.11.0 - API change around `Box`. See detail in #629 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock) - Add footer with some metadata to index files. #605 (@fdb-hiroshima) +- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock) ## How to update? diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 9a9d2dca0..7290d032b 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -12,6 +12,9 @@ use std::collections::BinaryHeap; /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the /// default Rust heap is a max heap, whereas a min heap is needed. /// +/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document +/// address is used. +/// /// WARNING: equality is not what you would expect here. /// Two elements are equal if their feature is equal, and regardless of whether `doc` /// is equal. This should be perfectly fine for this usage, but let's make sure this @@ -21,29 +24,37 @@ struct ComparableDoc { doc: D, } -impl PartialOrd for ComparableDoc { +impl PartialOrd for ComparableDoc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ComparableDoc { +impl Ord for ComparableDoc { #[inline] fn cmp(&self, other: &Self) -> Ordering { - other + // Reversed to make BinaryHeap work as a min-heap + let by_feature = other .feature .partial_cmp(&self.feature) - .unwrap_or_else(|| Ordering::Equal) + .unwrap_or(Ordering::Equal); + + let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal); + + // In case of a tie on the feature, we sort by ascending + // `DocAddress` in order to ensure a stable sorting of the + // documents. + by_feature.then_with(lazy_by_doc_address) } } -impl PartialEq for ComparableDoc { +impl PartialEq for ComparableDoc { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl Eq for ComparableDoc {} +impl Eq for ComparableDoc {} pub(crate) struct TopCollector { limit: usize, @@ -214,4 +225,94 @@ mod tests { ] ); } + + #[test] + fn test_top_segment_collector_stable_ordering_for_equal_feature() { + // given that the documents are collected in ascending doc id order, + // when harvesting we have to guarantee stable sorting in case of a tie + // on the score + let doc_ids_collection = [4, 5, 6]; + let score = 3.14; + + let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2); + for id in &doc_ids_collection { + top_collector_limit_2.collect(*id, score); + } + + let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3); + for id in &doc_ids_collection { + top_collector_limit_3.collect(*id, score); + } + + assert_eq!( + top_collector_limit_2.harvest(), + top_collector_limit_3.harvest()[..2].to_vec(), + ); + } +} + +#[cfg(all(test, feature = "unstable"))] +mod bench { + use super::TopSegmentCollector; + use test::Bencher; + + #[bench] + fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) { + let mut top_collector = TopSegmentCollector::new(0, 400); + + b.iter(|| { + for i in 0..100 { + top_collector.collect(i, 0.8); + } + }); + } + + #[bench] + fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) { + let mut top_collector = TopSegmentCollector::new(0, 100); + + for i in 0..100 { + top_collector.collect(i, 0.8); + } + + b.iter(|| { + for i in 0..100 { + top_collector.collect(i, 0.8); + } + }); + } + + #[bench] + fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) { + b.iter(|| { + let mut top_collector = TopSegmentCollector::new(0, 100); + + for i in 0..100 { + top_collector.collect(i, 0.8); + } + + // it would be nice to be able to do the setup N times but still + // measure only harvest(). We can't since harvest() consumes + // the top_collector. + top_collector.harvest() + }); + } + + #[bench] + fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) { + b.iter(|| { + let mut top_collector = TopSegmentCollector::new(0, 100); + let mut score = 1.0; + + for i in 0..100 { + score += 1.0; + top_collector.collect(i, score); + } + + // it would be nice to be able to do the setup N times but still + // measure only harvest(). We can't since harvest() consumes + // the top_collector. + top_collector.harvest() + }); + } } diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 19cc48813..70b4a98aa 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -15,13 +15,16 @@ use crate::SegmentLocalId; use crate::SegmentReader; use std::fmt; -/// The Top Score Collector keeps track of the K documents +/// The `TopDocs` collector keeps track of the top `K` documents /// sorted by their score. /// /// The implementation is based on a `BinaryHeap`. /// The theorical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. /// +/// This collector guarantees a stable sorting in case of a tie on the +/// document score. As such, it is suitable to implement pagination. +/// /// ```rust /// use tantivy::collector::TopDocs; /// use tantivy::query::QueryParser; @@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector { mod tests { use super::TopDocs; use crate::collector::Collector; - use crate::query::{Query, QueryParser}; + use crate::query::{AllQuery, Query, QueryParser}; use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::DocAddress; use crate::Index; use crate::IndexWriter; use crate::Score; + use itertools::Itertools; fn make_index() -> Index { let mut schema_builder = Schema::builder(); @@ -494,6 +498,29 @@ mod tests { ); } + #[test] + fn test_top_collector_stable_sorting() { + let index = make_index(); + + // using AllQuery to get a constant score + let searcher = index.reader().unwrap().searcher(); + + let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap(); + + let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap(); + + // precondition for the test to be meaningful: we did get documents + // with the same score + assert!(page_1.iter().map(|result| result.0).all_equal()); + assert!(page_2.iter().map(|result| result.0).all_equal()); + + // sanity check since we're relying on make_index() + assert_eq!(page_1.len(), 2); + assert_eq!(page_2.len(), 3); + + assert_eq!(page_1, &page_2[..page_1.len()]); + } + #[test] #[should_panic] fn test_top_0() {