From 3a65dc84c88da7013ecfc4efa997f77d81d1dd4d Mon Sep 17 00:00:00 2001 From: Alberto Piai Date: Sat, 26 Oct 2019 08:27:25 +0200 Subject: [PATCH] TopDocs: ensure stable sorting on equal score (#675) * TopDocs: ensure stable sorting on equal score When selecting the top K documents by score, we need to ensure stable sorting. Until now, for documents with the same score, we were relying on the (arbitrary) order returned by the BinaryHeap used to implement the collectors. This patch fixes the problem by explicitly using the doc address when harvesting the `TopSegmentCollector` and when merging the results in `TopCollector::merge_fruits()`. This is important (for example) to implement pagination correctly using the TopDocs collector. If sorting isn't stable, documents that have the same score might be ranked in different positions depending on the specific K that was used, thus appearing in two different pages, or in none at all. Fixes gh-671 * TMP: alternative solution (see previous commit) If we add the constrait that D is also PartialOrd in ComparableDoc, then we can move the comparison by doc address directly in the cmp implementation of ComparableDoc. * TMP rebase as first commit: add benchmarks for TopSegmentCollector * fixup! TMP: alternative solution (see previous commit) * TMP add changelog entry * TMP run cargo fmt --- CHANGELOG.md | 1 + src/collector/top_collector.rs | 113 +++++++++++++++++++++++++-- src/collector/top_score_collector.rs | 31 +++++++- 3 files changed, 137 insertions(+), 8 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a0df3a1c6..6a637d32f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ Tantivy 0.11.0 - API change around `Box`. See detail in #629 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock) - Add footer with some metadata to index files. #605 (@fdb-hiroshima) +- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock) ## How to update? diff --git a/src/collector/top_collector.rs b/src/collector/top_collector.rs index 9a9d2dca0..7290d032b 100644 --- a/src/collector/top_collector.rs +++ b/src/collector/top_collector.rs @@ -12,6 +12,9 @@ use std::collections::BinaryHeap; /// It has a custom implementation of `PartialOrd` that reverses the order. This is because the /// default Rust heap is a max heap, whereas a min heap is needed. /// +/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document +/// address is used. +/// /// WARNING: equality is not what you would expect here. /// Two elements are equal if their feature is equal, and regardless of whether `doc` /// is equal. This should be perfectly fine for this usage, but let's make sure this @@ -21,29 +24,37 @@ struct ComparableDoc { doc: D, } -impl PartialOrd for ComparableDoc { +impl PartialOrd for ComparableDoc { fn partial_cmp(&self, other: &Self) -> Option { Some(self.cmp(other)) } } -impl Ord for ComparableDoc { +impl Ord for ComparableDoc { #[inline] fn cmp(&self, other: &Self) -> Ordering { - other + // Reversed to make BinaryHeap work as a min-heap + let by_feature = other .feature .partial_cmp(&self.feature) - .unwrap_or_else(|| Ordering::Equal) + .unwrap_or(Ordering::Equal); + + let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal); + + // In case of a tie on the feature, we sort by ascending + // `DocAddress` in order to ensure a stable sorting of the + // documents. + by_feature.then_with(lazy_by_doc_address) } } -impl PartialEq for ComparableDoc { +impl PartialEq for ComparableDoc { fn eq(&self, other: &Self) -> bool { self.cmp(other) == Ordering::Equal } } -impl Eq for ComparableDoc {} +impl Eq for ComparableDoc {} pub(crate) struct TopCollector { limit: usize, @@ -214,4 +225,94 @@ mod tests { ] ); } + + #[test] + fn test_top_segment_collector_stable_ordering_for_equal_feature() { + // given that the documents are collected in ascending doc id order, + // when harvesting we have to guarantee stable sorting in case of a tie + // on the score + let doc_ids_collection = [4, 5, 6]; + let score = 3.14; + + let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2); + for id in &doc_ids_collection { + top_collector_limit_2.collect(*id, score); + } + + let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3); + for id in &doc_ids_collection { + top_collector_limit_3.collect(*id, score); + } + + assert_eq!( + top_collector_limit_2.harvest(), + top_collector_limit_3.harvest()[..2].to_vec(), + ); + } +} + +#[cfg(all(test, feature = "unstable"))] +mod bench { + use super::TopSegmentCollector; + use test::Bencher; + + #[bench] + fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) { + let mut top_collector = TopSegmentCollector::new(0, 400); + + b.iter(|| { + for i in 0..100 { + top_collector.collect(i, 0.8); + } + }); + } + + #[bench] + fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) { + let mut top_collector = TopSegmentCollector::new(0, 100); + + for i in 0..100 { + top_collector.collect(i, 0.8); + } + + b.iter(|| { + for i in 0..100 { + top_collector.collect(i, 0.8); + } + }); + } + + #[bench] + fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) { + b.iter(|| { + let mut top_collector = TopSegmentCollector::new(0, 100); + + for i in 0..100 { + top_collector.collect(i, 0.8); + } + + // it would be nice to be able to do the setup N times but still + // measure only harvest(). We can't since harvest() consumes + // the top_collector. + top_collector.harvest() + }); + } + + #[bench] + fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) { + b.iter(|| { + let mut top_collector = TopSegmentCollector::new(0, 100); + let mut score = 1.0; + + for i in 0..100 { + score += 1.0; + top_collector.collect(i, score); + } + + // it would be nice to be able to do the setup N times but still + // measure only harvest(). We can't since harvest() consumes + // the top_collector. + top_collector.harvest() + }); + } } diff --git a/src/collector/top_score_collector.rs b/src/collector/top_score_collector.rs index 19cc48813..70b4a98aa 100644 --- a/src/collector/top_score_collector.rs +++ b/src/collector/top_score_collector.rs @@ -15,13 +15,16 @@ use crate::SegmentLocalId; use crate::SegmentReader; use std::fmt; -/// The Top Score Collector keeps track of the K documents +/// The `TopDocs` collector keeps track of the top `K` documents /// sorted by their score. /// /// The implementation is based on a `BinaryHeap`. /// The theorical complexity for collecting the top `K` out of `n` documents /// is `O(n log K)`. /// +/// This collector guarantees a stable sorting in case of a tie on the +/// document score. As such, it is suitable to implement pagination. +/// /// ```rust /// use tantivy::collector::TopDocs; /// use tantivy::query::QueryParser; @@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector { mod tests { use super::TopDocs; use crate::collector::Collector; - use crate::query::{Query, QueryParser}; + use crate::query::{AllQuery, Query, QueryParser}; use crate::schema::{Field, Schema, FAST, STORED, TEXT}; use crate::DocAddress; use crate::Index; use crate::IndexWriter; use crate::Score; + use itertools::Itertools; fn make_index() -> Index { let mut schema_builder = Schema::builder(); @@ -494,6 +498,29 @@ mod tests { ); } + #[test] + fn test_top_collector_stable_sorting() { + let index = make_index(); + + // using AllQuery to get a constant score + let searcher = index.reader().unwrap().searcher(); + + let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap(); + + let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap(); + + // precondition for the test to be meaningful: we did get documents + // with the same score + assert!(page_1.iter().map(|result| result.0).all_equal()); + assert!(page_2.iter().map(|result| result.0).all_equal()); + + // sanity check since we're relying on make_index() + assert_eq!(page_1.len(), 2); + assert_eq!(page_2.len(), 3); + + assert_eq!(page_1, &page_2[..page_1.len()]); + } + #[test] #[should_panic] fn test_top_0() {