Compare commits

..

5 Commits

Author SHA1 Message Date
Paul Masurel
52c24e1c06 first stab 2019-11-01 09:05:35 +09:00
Alberto Piai
3a65dc84c8 TopDocs: ensure stable sorting on equal score (#675)
* TopDocs: ensure stable sorting on equal score

When selecting the top K documents by score, we need to ensure stable
sorting. Until now, for documents with the same score, we were relying
on the (arbitrary) order returned by the BinaryHeap used to implement
the collectors.

This patch fixes the problem by explicitly using the doc address when
harvesting the `TopSegmentCollector` and when merging the results in
`TopCollector::merge_fruits()`.

This is important (for example) to implement pagination correctly using
the TopDocs collector. If sorting isn't stable, documents that have the
same score might be ranked in different positions depending on the
specific K that was used, thus appearing in two different pages, or in
none at all.

Fixes gh-671

* TMP: alternative solution (see previous commit)

If we add the constraint that D is also PartialOrd in ComparableDoc<T,
D>, then we can move the comparison by doc address directly in the cmp
implementation of ComparableDoc.

* TMP rebase as first commit: add benchmarks for TopSegmentCollector

* fixup! TMP: alternative solution (see previous commit)

* TMP add changelog entry

* TMP run cargo fmt
2019-10-26 15:27:25 +09:00
dependabot-preview[bot]
ce42bbf5c9 Update base64 requirement from 0.10.0 to 0.11.0 (#676)
Updates the requirements on [base64](https://github.com/marshallpierce/rust-base64) to permit the latest version.
- [Release notes](https://github.com/marshallpierce/rust-base64/releases)
- [Changelog](https://github.com/marshallpierce/rust-base64/blob/master/RELEASE-NOTES.md)
- [Commits](https://github.com/marshallpierce/rust-base64/compare/v0.10.0...v0.11.0)

Signed-off-by: dependabot-preview[bot] <support@dependabot.com>
2019-10-26 15:24:47 +09:00
Paul Masurel
7b21b3f25a Refactoring around Field (#673)
* Refactoring around Field

Removing the contract about the order of the field, and the
field id allocation.

* Update delete_queue.rs

* Update field.rs
2019-10-25 09:06:44 +09:00
Paul Masurel
46caec1040 Updating uuid to 0.8 (#674) 2019-10-25 09:02:00 +09:00
9 changed files with 156 additions and 12 deletions

View File

@@ -9,6 +9,7 @@ Tantivy 0.11.0
- API change around `Box<BoxableTokenizer>`. See detail in #629
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
## How to update?

View File

@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
edition = "2018"
[dependencies]
base64 = "0.10.0"
base64 = "0.11.0"
byteorder = "1.0"
crc32fast = "1.2.0"
once_cell = "1.0"
@@ -34,7 +34,7 @@ itertools = "0.8"
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
notify = {version="4", optional=true}
bit-set = "0.5"
uuid = { version = "0.7.2", features = ["v4", "serde"] }
uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.7"
futures = "0.1"
futures-cpupool = "0.1"

View File

@@ -12,6 +12,9 @@ use std::collections::BinaryHeap;
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
/// default Rust heap is a max heap, whereas a min heap is needed.
///
/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
/// address is used.
///
/// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
@@ -21,29 +24,37 @@ struct ComparableDoc<T, D> {
doc: D,
}
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
#[inline]
fn cmp(&self, other: &Self) -> Ordering {
other
// Reversed to make BinaryHeap work as a min-heap
let by_feature = other
.feature
.partial_cmp(&self.feature)
.unwrap_or_else(|| Ordering::Equal)
.unwrap_or(Ordering::Equal);
let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
// In case of a tie on the feature, we sort by ascending
// `DocAddress` in order to ensure a stable sorting of the
// documents.
by_feature.then_with(lazy_by_doc_address)
}
}
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
fn eq(&self, other: &Self) -> bool {
self.cmp(other) == Ordering::Equal
}
}
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
pub(crate) struct TopCollector<T> {
limit: usize,
@@ -214,4 +225,94 @@ mod tests {
]
);
}
#[test]
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
// given that the documents are collected in ascending doc id order,
// when harvesting we have to guarantee stable sorting in case of a tie
// on the score
let doc_ids_collection = [4, 5, 6];
let score = 3.14;
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
for id in &doc_ids_collection {
top_collector_limit_2.collect(*id, score);
}
let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
for id in &doc_ids_collection {
top_collector_limit_3.collect(*id, score);
}
assert_eq!(
top_collector_limit_2.harvest(),
top_collector_limit_3.harvest()[..2].to_vec(),
);
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use super::TopSegmentCollector;
use test::Bencher;
#[bench]
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 400);
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
b.iter(|| {
for i in 0..100 {
top_collector.collect(i, 0.8);
}
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
for i in 0..100 {
top_collector.collect(i, 0.8);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
#[bench]
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
b.iter(|| {
let mut top_collector = TopSegmentCollector::new(0, 100);
let mut score = 1.0;
for i in 0..100 {
score += 1.0;
top_collector.collect(i, score);
}
// it would be nice to be able to do the setup N times but still
// measure only harvest(). We can't since harvest() consumes
// the top_collector.
top_collector.harvest()
});
}
}
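Both the commit message above and the new `ComparableDoc` doc comment describe the same rule: compare primarily by the feature (the score) and, on a tie, by ascending document address, so the top-`K` list is always a prefix of the top-`K+1` list. Below is a minimal standalone sketch of that comparator; the `Hit` type and `rank` helper are hypothetical illustrations, not tantivy's `ComparableDoc`.

use std::cmp::Ordering;

// Hypothetical stand-in for a scored hit.
struct Hit {
    score: f32,
    doc: u32, // stand-in for a DocAddress
}

// Sort by descending score; break ties by ascending doc so that
// any top-K prefix is stable regardless of K.
fn rank(hits: &mut [Hit]) {
    hits.sort_by(|a, b| {
        b.score
            .partial_cmp(&a.score)
            .unwrap_or(Ordering::Equal)
            .then_with(|| a.doc.cmp(&b.doc))
    });
}

fn main() {
    let mut hits = vec![
        Hit { score: 3.14, doc: 6 },
        Hit { score: 3.14, doc: 4 },
        Hit { score: 3.14, doc: 5 },
    ];
    rank(&mut hits);
    // With the tie-break, the top-2 is always a prefix of the top-3.
    assert_eq!(
        hits.iter().map(|h| h.doc).collect::<Vec<_>>(),
        vec![4, 5, 6]
    );
}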

View File

@@ -15,13 +15,16 @@ use crate::SegmentLocalId;
use crate::SegmentReader;
use std::fmt;
/// The Top Score Collector keeps track of the K documents
/// The `TopDocs` collector keeps track of the top `K` documents
/// sorted by their score.
///
/// The implementation is based on a `BinaryHeap`.
/// The theoretical complexity for collecting the top `K` out of `n` documents
/// is `O(n log K)`.
///
/// This collector guarantees a stable sorting in case of a tie on the
/// document score. As such, it is suitable to implement pagination.
///
/// ```rust
/// use tantivy::collector::TopDocs;
/// use tantivy::query::QueryParser;
@@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
mod tests {
use super::TopDocs;
use crate::collector::Collector;
use crate::query::{Query, QueryParser};
use crate::query::{AllQuery, Query, QueryParser};
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::DocAddress;
use crate::Index;
use crate::IndexWriter;
use crate::Score;
use itertools::Itertools;
fn make_index() -> Index {
let mut schema_builder = Schema::builder();
@@ -494,6 +498,29 @@ mod tests {
);
}
#[test]
fn test_top_collector_stable_sorting() {
let index = make_index();
// using AllQuery to get a constant score
let searcher = index.reader().unwrap().searcher();
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
// precondition for the test to be meaningful: we did get documents
// with the same score
assert!(page_1.iter().map(|result| result.0).all_equal());
assert!(page_2.iter().map(|result| result.0).all_equal());
// sanity check since we're relying on make_index()
assert_eq!(page_1.len(), 2);
assert_eq!(page_2.len(), 3);
assert_eq!(page_1, &page_2[..page_1.len()]);
}
#[test]
#[should_panic]
fn test_top_0() {
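The `TopDocs` doc comment and the `test_top_collector_stable_sorting` test above motivate the change with pagination. Below is a hedged sketch of that usage pattern, assuming only the API visible in this diff (`TopDocs::with_limit`, `Searcher::search` returning `Vec<(Score, DocAddress)>`); the `fetch_page` helper itself is hypothetical.

use tantivy::collector::TopDocs;
use tantivy::query::Query;
use tantivy::{DocAddress, Score, Searcher};

// Hypothetical helper: fetch the `page_idx`-th page of `page_size` hits
// (`page_size` must be >= 1, since `TopDocs::with_limit(0)` panics).
// With stable tie-breaking, a document never moves between pages when
// only `page_idx` changes.
fn fetch_page(
    searcher: &Searcher,
    query: &dyn Query,
    page_idx: usize,
    page_size: usize,
) -> tantivy::Result<Vec<(Score, DocAddress)>> {
    // Fetch enough hits to cover the requested page, then slice it out.
    let end = (page_idx + 1) * page_size;
    let hits = searcher.search(query, &TopDocs::with_limit(end))?;
    Ok(hits.into_iter().skip(page_idx * page_size).collect())
}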

View File

@@ -76,7 +76,7 @@ impl SegmentId {
}
/// Error type used when parsing a `SegmentId` from a string fails.
pub struct SegmentIdParseError(uuid::parser::ParseError);
pub struct SegmentIdParseError(uuid::Error);
impl Error for SegmentIdParseError {}

View File

@@ -732,7 +732,7 @@ impl IndexWriter {
}
UserOperation::Add(document) => {
let add_operation = AddOperation { opstamp, document };
adds.push(add_operation);
adds.push(add_operation);
}
}
}

View File

@@ -7,6 +7,10 @@ use std::collections::HashSet;
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
impl MergeOperationInventory {
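/// Number of merge operations currently tracked by this inventory,
/// i.e. merges that are scheduled or in flight.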
pub fn num_merge_operations(&self) -> usize {
self.0.list().len()
}
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
let mut segment_in_merge = HashSet::default();
for merge_op in self.0.list() {

View File

@@ -12,6 +12,10 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
Every time the list of segments changes, the segment updater
/// asks the merge policy if some segments should be merged.
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
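/// Upper bound on the number of merge operations that may run concurrently,
/// if the policy wants to cap it. `None` (the default) means no limit.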
fn maximum_num_threads(&self) -> Option<usize> {
None
}
/// Given the list of segment metas, returns the list of merge candidates.
///
/// This call happens on the segment updater thread, and will block
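As a hedged sketch of how a policy could use the new hook, here is a wrapper that delegates candidate selection to an inner policy while advertising a cap on concurrent merges. It assumes the trait's existing candidate-computation method is `compute_merge_candidates(&self, &[SegmentMeta]) -> Vec<MergeCandidate>` (its declaration is cut off in this hunk) and that `tantivy::merge_policy` and `tantivy::SegmentMeta` are the public paths; treat those names as assumptions.

use tantivy::merge_policy::{MergeCandidate, MergePolicy};
use tantivy::SegmentMeta;

// Hypothetical wrapper policy: same merge candidates as the inner policy,
// but caps concurrent merge operations via the `maximum_num_threads` hook
// added in this compare.
#[derive(Debug)]
struct CappedMergePolicy<P> {
    inner: P,
    max_merge_threads: usize,
}

impl<P: MergePolicy> MergePolicy for CappedMergePolicy<P> {
    fn maximum_num_threads(&self) -> Option<usize> {
        Some(self.max_merge_threads)
    }

    // Assumed signature; the declaration is not visible in this hunk.
    fn compute_merge_candidates(&self, segments: &[SegmentMeta]) -> Vec<MergeCandidate> {
        self.inner.compute_merge_candidates(segments)
    }
}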

View File

@@ -39,6 +39,7 @@ use std::sync::Arc;
use std::sync::RwLock;
use std::thread;
use std::thread::JoinHandle;
use std::time::Duration;
/// Save the index meta file.
/// This operation is atomic :
@@ -200,6 +201,12 @@ impl SegmentUpdater {
}
pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
let max_num_threads_opt = self.0.merge_policy.read().unwrap().maximum_num_threads();
if let Some(max_num_threads) = max_num_threads_opt {
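// Back-pressure: hold off adding the new segment while the number of
// in-flight merge operations is at or above the policy's limit.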
while self.0.merge_operations.num_merge_operations() >= max_num_threads {
std::thread::sleep(Duration::from_secs(1u64));
}
}
self.run_async(|segment_updater| {
segment_updater.0.segment_manager.add_segment(segment_entry);
segment_updater.consider_merge_options();