TopDocs: ensure stable sorting on equal score (#675)

* TopDocs: ensure stable sorting on equal score

When selecting the top K documents by score, we need to ensure stable
sorting. Until now, for documents with the same score, we were relying
on the (arbitrary) order returned by the BinaryHeap used to implement
the collectors.

This patch fixes the problem by explicitly using the doc address when
harvesting the `TopSegmentCollector` and when merging the results in
`TopCollector::merge_fruits()`.

This is important (for example) to implement pagination correctly using
the TopDocs collector. If sorting isn't stable, documents that have the
same score might be ranked in different positions depending on the
specific K that was used, thus appearing in two different pages, or in
none at all.

Fixes gh-671

* TMP: alternative solution (see previous commit)

If we add the constrait that D is also PartialOrd in ComparableDoc<T,
D>, then we can move the comparison by doc address directly in the cmp
implementation of ComparableDoc.

* TMP rebase as first commit: add benchmarks for TopSegmentCollector

* fixup! TMP: alternative solution (see previous commit)

* TMP add changelog entry

* TMP run cargo fmt
This commit is contained in:
Alberto Piai
2019-10-26 08:27:25 +02:00
committed by Paul Masurel
parent ce42bbf5c9
commit 3a65dc84c8
3 changed files with 137 additions and 8 deletions

View File

@@ -9,6 +9,7 @@ Tantivy 0.11.0
- API change around `Box<BoxableTokenizer>`. See detail in #629
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
## How to update?

View File

@@ -12,6 +12,9 @@ use std::collections::BinaryHeap;
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
/// default Rust heap is a max heap, whereas a min heap is needed.
///
/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
/// address is used.
///
/// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
@@ -21,29 +24,37 @@ struct ComparableDoc<T, D> {
doc: D,
}
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
other
// Reversed to make BinaryHeap work as a min-heap
let by_feature = other
.feature
.partial_cmp(&self.feature)
.unwrap_or_else(|| Ordering::Equal)
.unwrap_or(Ordering::Equal);
let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
// In case of a tie on the feature, we sort by ascending
// `DocAddress` in order to ensure a stable sorting of the
// documents.
by_feature.then_with(lazy_by_doc_address)
}
}
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
pub(crate) struct TopCollector<T> {
limit: usize,
@@ -214,4 +225,94 @@ mod tests {
]
);
}
#[test]
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
// given that the documents are collected in ascending doc id order,
// when harvesting we have to guarantee stable sorting in case of a tie
// on the score
let doc_ids_collection = [4, 5, 6];
let score = 3.14;
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
for id in &doc_ids_collection {
top_collector_limit_2.collect(*id, score);
}
let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
for id in &doc_ids_collection {
top_collector_limit_3.collect(*id, score);
}
assert_eq!(
top_collector_limit_2.harvest(),
top_collector_limit_3.harvest()[..2].to_vec(),
);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::TopSegmentCollector;
use test::Bencher;
#[bench]
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 400);
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
let mut score = 1.0;
for i in 0..100 {
score += 1.0;
top_collector.collect(i, score);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
}

View File

@@ -15,13 +15,16 @@ use crate::SegmentLocalId;
use crate::SegmentReader;
use std::fmt;
/// The Top Score Collector keeps track of the K documents
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
///
/// The implementation is based on a `BinaryHeap`.
/// The theorical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector guarantees a stable sorting in case of a tie on the
/// document score. As such, it is suitable to implement pagination.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser;
@@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
mod tests {
use super::TopDocs;
use crate::collector::Collector;
use crate::query::{Query, QueryParser};
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Score;
use itertools::Itertools;
fn make_index() -> Index {
let mut schema_builder = Schema::builder();
@@ -494,6 +498,29 @@ mod tests {
);
}
#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();
// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
// precondition for the test to be meaningful: we did get documents
// with the same score
assert!(page_1.iter().map(|result| result.0).all_equal());
assert!(page_2.iter().map(|result| result.0).all_equal());
// sanity check since we're relying on make_index()
assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3);
assert_eq!(page_1, &page_2[..page_1.len()]);
}
#[test]
#[should_panic]
fn test_top_0() {