mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 17:42:55 +00:00
Compare commits
3 Commits
issue/680
...
refactorin
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
45d219b6e5 | ||
|
|
20aaaad04f | ||
|
|
ba0b89da36 |
@@ -9,7 +9,6 @@ Tantivy 0.11.0
|
|||||||
- API change around `Box<BoxableTokenizer>`. See detail in #629
|
- API change around `Box<BoxableTokenizer>`. See detail in #629
|
||||||
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
||||||
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
||||||
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
|
|
||||||
|
|
||||||
## How to update?
|
## How to update?
|
||||||
|
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
|
|||||||
edition = "2018"
|
edition = "2018"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
base64 = "0.11.0"
|
base64 = "0.10.0"
|
||||||
byteorder = "1.0"
|
byteorder = "1.0"
|
||||||
crc32fast = "1.2.0"
|
crc32fast = "1.2.0"
|
||||||
once_cell = "1.0"
|
once_cell = "1.0"
|
||||||
@@ -34,7 +34,7 @@ itertools = "0.8"
|
|||||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||||
notify = {version="4", optional=true}
|
notify = {version="4", optional=true}
|
||||||
bit-set = "0.5"
|
bit-set = "0.5"
|
||||||
uuid = { version = "0.8", features = ["v4", "serde"] }
|
uuid = { version = "0.7.2", features = ["v4", "serde"] }
|
||||||
crossbeam = "0.7"
|
crossbeam = "0.7"
|
||||||
futures = "0.1"
|
futures = "0.1"
|
||||||
futures-cpupool = "0.1"
|
futures-cpupool = "0.1"
|
||||||
|
|||||||
@@ -12,9 +12,6 @@ use std::collections::BinaryHeap;
|
|||||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||||
/// default Rust heap is a max heap, whereas a min heap is needed.
|
/// default Rust heap is a max heap, whereas a min heap is needed.
|
||||||
///
|
///
|
||||||
/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
|
|
||||||
/// address is used.
|
|
||||||
///
|
|
||||||
/// WARNING: equality is not what you would expect here.
|
/// WARNING: equality is not what you would expect here.
|
||||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
||||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
||||||
@@ -24,37 +21,29 @@ struct ComparableDoc<T, D> {
|
|||||||
doc: D,
|
doc: D,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
|
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
|
||||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||||
Some(self.cmp(other))
|
Some(self.cmp(other))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
|
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
// Reversed to make BinaryHeap work as a min-heap
|
other
|
||||||
let by_feature = other
|
|
||||||
.feature
|
.feature
|
||||||
.partial_cmp(&self.feature)
|
.partial_cmp(&self.feature)
|
||||||
.unwrap_or(Ordering::Equal);
|
.unwrap_or_else(|| Ordering::Equal)
|
||||||
|
|
||||||
let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
|
|
||||||
|
|
||||||
// In case of a tie on the feature, we sort by ascending
|
|
||||||
// `DocAddress` in order to ensure a stable sorting of the
|
|
||||||
// documents.
|
|
||||||
by_feature.then_with(lazy_by_doc_address)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
|
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.cmp(other) == Ordering::Equal
|
self.cmp(other) == Ordering::Equal
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
|
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
|
||||||
|
|
||||||
pub(crate) struct TopCollector<T> {
|
pub(crate) struct TopCollector<T> {
|
||||||
limit: usize,
|
limit: usize,
|
||||||
@@ -225,94 +214,4 @@ mod tests {
|
|||||||
]
|
]
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
|
|
||||||
// given that the documents are collected in ascending doc id order,
|
|
||||||
// when harvesting we have to guarantee stable sorting in case of a tie
|
|
||||||
// on the score
|
|
||||||
let doc_ids_collection = [4, 5, 6];
|
|
||||||
let score = 3.14;
|
|
||||||
|
|
||||||
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
|
|
||||||
for id in &doc_ids_collection {
|
|
||||||
top_collector_limit_2.collect(*id, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
|
|
||||||
for id in &doc_ids_collection {
|
|
||||||
top_collector_limit_3.collect(*id, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
assert_eq!(
|
|
||||||
top_collector_limit_2.harvest(),
|
|
||||||
top_collector_limit_3.harvest()[..2].to_vec(),
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(all(test, feature = "unstable"))]
|
|
||||||
mod bench {
|
|
||||||
use super::TopSegmentCollector;
|
|
||||||
use test::Bencher;
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
|
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 400);
|
|
||||||
|
|
||||||
b.iter(|| {
|
|
||||||
for i in 0..100 {
|
|
||||||
top_collector.collect(i, 0.8);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
|
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
|
||||||
|
|
||||||
for i in 0..100 {
|
|
||||||
top_collector.collect(i, 0.8);
|
|
||||||
}
|
|
||||||
|
|
||||||
b.iter(|| {
|
|
||||||
for i in 0..100 {
|
|
||||||
top_collector.collect(i, 0.8);
|
|
||||||
}
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
|
|
||||||
b.iter(|| {
|
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
|
||||||
|
|
||||||
for i in 0..100 {
|
|
||||||
top_collector.collect(i, 0.8);
|
|
||||||
}
|
|
||||||
|
|
||||||
// it would be nice to be able to do the setup N times but still
|
|
||||||
// measure only harvest(). We can't since harvest() consumes
|
|
||||||
// the top_collector.
|
|
||||||
top_collector.harvest()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
#[bench]
|
|
||||||
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
|
|
||||||
b.iter(|| {
|
|
||||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
|
||||||
let mut score = 1.0;
|
|
||||||
|
|
||||||
for i in 0..100 {
|
|
||||||
score += 1.0;
|
|
||||||
top_collector.collect(i, score);
|
|
||||||
}
|
|
||||||
|
|
||||||
// it would be nice to be able to do the setup N times but still
|
|
||||||
// measure only harvest(). We can't since harvest() consumes
|
|
||||||
// the top_collector.
|
|
||||||
top_collector.harvest()
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,16 +15,13 @@ use crate::SegmentLocalId;
|
|||||||
use crate::SegmentReader;
|
use crate::SegmentReader;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
|
|
||||||
/// The `TopDocs` collector keeps track of the top `K` documents
|
/// The Top Score Collector keeps track of the K documents
|
||||||
/// sorted by their score.
|
/// sorted by their score.
|
||||||
///
|
///
|
||||||
/// The implementation is based on a `BinaryHeap`.
|
/// The implementation is based on a `BinaryHeap`.
|
||||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||||
/// is `O(n log K)`.
|
/// is `O(n log K)`.
|
||||||
///
|
///
|
||||||
/// This collector guarantees a stable sorting in case of a tie on the
|
|
||||||
/// document score. As such, it is suitable to implement pagination.
|
|
||||||
///
|
|
||||||
/// ```rust
|
/// ```rust
|
||||||
/// use tantivy::collector::TopDocs;
|
/// use tantivy::collector::TopDocs;
|
||||||
/// use tantivy::query::QueryParser;
|
/// use tantivy::query::QueryParser;
|
||||||
@@ -431,13 +428,12 @@ impl SegmentCollector for TopScoreSegmentCollector {
|
|||||||
mod tests {
|
mod tests {
|
||||||
use super::TopDocs;
|
use super::TopDocs;
|
||||||
use crate::collector::Collector;
|
use crate::collector::Collector;
|
||||||
use crate::query::{AllQuery, Query, QueryParser};
|
use crate::query::{Query, QueryParser};
|
||||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||||
use crate::DocAddress;
|
use crate::DocAddress;
|
||||||
use crate::Index;
|
use crate::Index;
|
||||||
use crate::IndexWriter;
|
use crate::IndexWriter;
|
||||||
use crate::Score;
|
use crate::Score;
|
||||||
use itertools::Itertools;
|
|
||||||
|
|
||||||
fn make_index() -> Index {
|
fn make_index() -> Index {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
@@ -498,29 +494,6 @@ mod tests {
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn test_top_collector_stable_sorting() {
|
|
||||||
let index = make_index();
|
|
||||||
|
|
||||||
// using AllQuery to get a constant score
|
|
||||||
let searcher = index.reader().unwrap().searcher();
|
|
||||||
|
|
||||||
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
|
|
||||||
|
|
||||||
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
|
|
||||||
|
|
||||||
// precondition for the test to be meaningful: we did get documents
|
|
||||||
// with the same score
|
|
||||||
assert!(page_1.iter().map(|result| result.0).all_equal());
|
|
||||||
assert!(page_2.iter().map(|result| result.0).all_equal());
|
|
||||||
|
|
||||||
// sanity check since we're relying on make_index()
|
|
||||||
assert_eq!(page_1.len(), 2);
|
|
||||||
assert_eq!(page_2.len(), 3);
|
|
||||||
|
|
||||||
assert_eq!(page_1, &page_2[..page_1.len()]);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
#[should_panic]
|
#[should_panic]
|
||||||
fn test_top_0() {
|
fn test_top_0() {
|
||||||
|
|||||||
@@ -76,7 +76,7 @@ impl SegmentId {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Error type used when parsing a `SegmentId` from a string fails.
|
/// Error type used when parsing a `SegmentId` from a string fails.
|
||||||
pub struct SegmentIdParseError(uuid::Error);
|
pub struct SegmentIdParseError(uuid::parser::ParseError);
|
||||||
|
|
||||||
impl Error for SegmentIdParseError {}
|
impl Error for SegmentIdParseError {}
|
||||||
|
|
||||||
|
|||||||
@@ -732,7 +732,7 @@ impl IndexWriter {
|
|||||||
}
|
}
|
||||||
UserOperation::Add(document) => {
|
UserOperation::Add(document) => {
|
||||||
let add_operation = AddOperation { opstamp, document };
|
let add_operation = AddOperation { opstamp, document };
|
||||||
adds.push(add_operpation);
|
adds.push(add_operation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,10 +7,6 @@ use std::collections::HashSet;
|
|||||||
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
pub struct MergeOperationInventory(Inventory<InnerMergeOperation>);
|
||||||
|
|
||||||
impl MergeOperationInventory {
|
impl MergeOperationInventory {
|
||||||
pub fn num_merge_operations(&self) -> usize {
|
|
||||||
self.0.list().len()
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
|
pub fn segment_in_merge(&self) -> HashSet<SegmentId> {
|
||||||
let mut segment_in_merge = HashSet::default();
|
let mut segment_in_merge = HashSet::default();
|
||||||
for merge_op in self.0.list() {
|
for merge_op in self.0.list() {
|
||||||
|
|||||||
@@ -12,10 +12,6 @@ pub struct MergeCandidate(pub Vec<SegmentId>);
|
|||||||
/// Every time a the list of segments changes, the segment updater
|
/// Every time a the list of segments changes, the segment updater
|
||||||
/// asks the merge policy if some segments should be merged.
|
/// asks the merge policy if some segments should be merged.
|
||||||
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
|
pub trait MergePolicy: marker::Send + marker::Sync + Debug {
|
||||||
fn maximum_num_threads(&self) -> Option<usize> {
|
|
||||||
None
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Given the list of segment metas, returns the list of merge candidates.
|
/// Given the list of segment metas, returns the list of merge candidates.
|
||||||
///
|
///
|
||||||
/// This call happens on the segment updater thread, and will block
|
/// This call happens on the segment updater thread, and will block
|
||||||
|
|||||||
@@ -39,7 +39,6 @@ use std::sync::Arc;
|
|||||||
use std::sync::RwLock;
|
use std::sync::RwLock;
|
||||||
use std::thread;
|
use std::thread;
|
||||||
use std::thread::JoinHandle;
|
use std::thread::JoinHandle;
|
||||||
use std::time::Duration;
|
|
||||||
|
|
||||||
/// Save the index meta file.
|
/// Save the index meta file.
|
||||||
/// This operation is atomic :
|
/// This operation is atomic :
|
||||||
@@ -201,12 +200,6 @@ impl SegmentUpdater {
|
|||||||
}
|
}
|
||||||
|
|
||||||
pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
|
pub fn add_segment(&self, segment_entry: SegmentEntry) -> bool {
|
||||||
let max_num_threads_opt = self.0.merge_policy.read().unwrap().maximum_num_threads();
|
|
||||||
if let Some(max_num_threads) = max_num_threads_opt {
|
|
||||||
while self.0.merge_operations.num_merge_operations() >= max_num_threads_opt {
|
|
||||||
std::thread::sleep(Duration::from_secs(1u64));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
self.run_async(|segment_updater| {
|
self.run_async(|segment_updater| {
|
||||||
segment_updater.0.segment_manager.add_segment(segment_entry);
|
segment_updater.0.segment_manager.add_segment(segment_entry);
|
||||||
segment_updater.consider_merge_options();
|
segment_updater.consider_merge_options();
|
||||||
|
|||||||
Reference in New Issue
Block a user