mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-31 06:22:54 +00:00
Compare commits
6 Commits
refactorin
...
issue/681
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
97227a2938 | ||
|
|
6a8a8557d2 | ||
|
|
3a65dc84c8 | ||
|
|
ce42bbf5c9 | ||
|
|
7b21b3f25a | ||
|
|
46caec1040 |
@@ -9,7 +9,9 @@ Tantivy 0.11.0
|
||||
- API change around `Box<BoxableTokenizer>`. See detail in #629
|
||||
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
||||
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
||||
|
||||
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
|
||||
- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)
|
||||
|
||||
## How to update?
|
||||
|
||||
- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
|
||||
|
||||
@@ -13,7 +13,7 @@ keywords = ["search", "information", "retrieval"]
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
base64 = "0.10.0"
|
||||
base64 = "0.11.0"
|
||||
byteorder = "1.0"
|
||||
crc32fast = "1.2.0"
|
||||
once_cell = "1.0"
|
||||
@@ -34,7 +34,7 @@ itertools = "0.8"
|
||||
levenshtein_automata = {version="0.1", features=["fst_automaton"]}
|
||||
notify = {version="4", optional=true}
|
||||
bit-set = "0.5"
|
||||
uuid = { version = "0.7.2", features = ["v4", "serde"] }
|
||||
uuid = { version = "0.8", features = ["v4", "serde"] }
|
||||
crossbeam = "0.7"
|
||||
futures = "0.1"
|
||||
futures-cpupool = "0.1"
|
||||
|
||||
@@ -12,6 +12,9 @@ use std::collections::BinaryHeap;
|
||||
/// It has a custom implementation of `PartialOrd` that reverses the order. This is because the
|
||||
/// default Rust heap is a max heap, whereas a min heap is needed.
|
||||
///
|
||||
/// Additionally, it guarantees stable sorting: in case of a tie on the feature, the document
|
||||
/// address is used.
|
||||
///
|
||||
/// WARNING: equality is not what you would expect here.
|
||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
||||
@@ -21,29 +24,37 @@ struct ComparableDoc<T, D> {
|
||||
doc: D,
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D> PartialOrd for ComparableDoc<T, D> {
|
||||
impl<T: PartialOrd, D: PartialOrd> PartialOrd for ComparableDoc<T, D> {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D> Ord for ComparableDoc<T, D> {
|
||||
impl<T: PartialOrd, D: PartialOrd> Ord for ComparableDoc<T, D> {
|
||||
#[inline]
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other
|
||||
// Reversed to make BinaryHeap work as a min-heap
|
||||
let by_feature = other
|
||||
.feature
|
||||
.partial_cmp(&self.feature)
|
||||
.unwrap_or_else(|| Ordering::Equal)
|
||||
.unwrap_or(Ordering::Equal);
|
||||
|
||||
let lazy_by_doc_address = || self.doc.partial_cmp(&other.doc).unwrap_or(Ordering::Equal);
|
||||
|
||||
// In case of a tie on the feature, we sort by ascending
|
||||
// `DocAddress` in order to ensure a stable sorting of the
|
||||
// documents.
|
||||
by_feature.then_with(lazy_by_doc_address)
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D> PartialEq for ComparableDoc<T, D> {
|
||||
impl<T: PartialOrd, D: PartialOrd> PartialEq for ComparableDoc<T, D> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.cmp(other) == Ordering::Equal
|
||||
}
|
||||
}
|
||||
|
||||
impl<T: PartialOrd, D> Eq for ComparableDoc<T, D> {}
|
||||
impl<T: PartialOrd, D: PartialOrd> Eq for ComparableDoc<T, D> {}
|
||||
|
||||
pub(crate) struct TopCollector<T> {
|
||||
limit: usize,
|
||||
@@ -214,4 +225,94 @@ mod tests {
|
||||
]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_segment_collector_stable_ordering_for_equal_feature() {
|
||||
// given that the documents are collected in ascending doc id order,
|
||||
// when harvesting we have to guarantee stable sorting in case of a tie
|
||||
// on the score
|
||||
let doc_ids_collection = [4, 5, 6];
|
||||
let score = 3.14;
|
||||
|
||||
let mut top_collector_limit_2 = TopSegmentCollector::new(0, 2);
|
||||
for id in &doc_ids_collection {
|
||||
top_collector_limit_2.collect(*id, score);
|
||||
}
|
||||
|
||||
let mut top_collector_limit_3 = TopSegmentCollector::new(0, 3);
|
||||
for id in &doc_ids_collection {
|
||||
top_collector_limit_3.collect(*id, score);
|
||||
}
|
||||
|
||||
assert_eq!(
|
||||
top_collector_limit_2.harvest(),
|
||||
top_collector_limit_3.harvest()[..2].to_vec(),
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
use super::TopSegmentCollector;
|
||||
use test::Bencher;
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_not_at_capacity(b: &mut Bencher) {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 400);
|
||||
|
||||
b.iter(|| {
|
||||
for i in 0..100 {
|
||||
top_collector.collect(i, 0.8);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_at_capacity(b: &mut Bencher) {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
||||
|
||||
for i in 0..100 {
|
||||
top_collector.collect(i, 0.8);
|
||||
}
|
||||
|
||||
b.iter(|| {
|
||||
for i in 0..100 {
|
||||
top_collector.collect(i, 0.8);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_and_harvest_many_ties(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
||||
|
||||
for i in 0..100 {
|
||||
top_collector.collect(i, 0.8);
|
||||
}
|
||||
|
||||
// it would be nice to be able to do the setup N times but still
|
||||
// measure only harvest(). We can't since harvest() consumes
|
||||
// the top_collector.
|
||||
top_collector.harvest()
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_top_segment_collector_collect_and_harvest_no_tie(b: &mut Bencher) {
|
||||
b.iter(|| {
|
||||
let mut top_collector = TopSegmentCollector::new(0, 100);
|
||||
let mut score = 1.0;
|
||||
|
||||
for i in 0..100 {
|
||||
score += 1.0;
|
||||
top_collector.collect(i, score);
|
||||
}
|
||||
|
||||
// it would be nice to be able to do the setup N times but still
|
||||
// measure only harvest(). We can't since harvest() consumes
|
||||
// the top_collector.
|
||||
top_collector.harvest()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,13 +15,16 @@ use crate::SegmentLocalId;
|
||||
use crate::SegmentReader;
|
||||
use std::fmt;
|
||||
|
||||
/// The Top Score Collector keeps track of the K documents
|
||||
/// The `TopDocs` collector keeps track of the top `K` documents
|
||||
/// sorted by their score.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// This collector guarantees a stable sorting in case of a tie on the
|
||||
/// document score. As such, it is suitable to implement pagination.
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::collector::TopDocs;
|
||||
/// use tantivy::query::QueryParser;
|
||||
@@ -428,12 +431,13 @@ impl SegmentCollector for TopScoreSegmentCollector {
|
||||
mod tests {
|
||||
use super::TopDocs;
|
||||
use crate::collector::Collector;
|
||||
use crate::query::{Query, QueryParser};
|
||||
use crate::query::{AllQuery, Query, QueryParser};
|
||||
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
|
||||
use crate::DocAddress;
|
||||
use crate::Index;
|
||||
use crate::IndexWriter;
|
||||
use crate::Score;
|
||||
use itertools::Itertools;
|
||||
|
||||
fn make_index() -> Index {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -494,6 +498,29 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_top_collector_stable_sorting() {
|
||||
let index = make_index();
|
||||
|
||||
// using AllQuery to get a constant score
|
||||
let searcher = index.reader().unwrap().searcher();
|
||||
|
||||
let page_1 = searcher.search(&AllQuery, &TopDocs::with_limit(2)).unwrap();
|
||||
|
||||
let page_2 = searcher.search(&AllQuery, &TopDocs::with_limit(3)).unwrap();
|
||||
|
||||
// precondition for the test to be meaningful: we did get documents
|
||||
// with the same score
|
||||
assert!(page_1.iter().map(|result| result.0).all_equal());
|
||||
assert!(page_2.iter().map(|result| result.0).all_equal());
|
||||
|
||||
// sanity check since we're relying on make_index()
|
||||
assert_eq!(page_1.len(), 2);
|
||||
assert_eq!(page_2.len(), 3);
|
||||
|
||||
assert_eq!(page_1, &page_2[..page_1.len()]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic]
|
||||
fn test_top_0() {
|
||||
|
||||
@@ -150,6 +150,21 @@ impl SegmentMeta {
|
||||
self.num_deleted_docs() > 0
|
||||
}
|
||||
|
||||
/// Updates the `max_doc` value of this `SegmentMeta`.
|
||||
///
|
||||
/// This method is only used when updating `max_doc` from 0
|
||||
/// as we finalize a fresh new segment.
|
||||
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
|
||||
assert_eq!(self.tracked.max_doc, 0);
|
||||
assert!(self.tracked.deletes.is_none());
|
||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||
segment_id: inner_meta.segment_id,
|
||||
max_doc,
|
||||
deletes: None,
|
||||
});
|
||||
SegmentMeta { tracked }
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||
let delete_meta = DeleteMeta {
|
||||
|
||||
@@ -50,6 +50,17 @@ impl Segment {
|
||||
&self.meta
|
||||
}
|
||||
|
||||
/// Updates the max_doc value from the `SegmentMeta`.
|
||||
///
|
||||
/// This method is only used when updating `max_doc` from 0
|
||||
/// as we finalize a fresh new segment.
|
||||
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||
Segment {
|
||||
index: self.index,
|
||||
meta: self.meta.with_max_doc(max_doc),
|
||||
}
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||
Segment {
|
||||
|
||||
@@ -76,7 +76,7 @@ impl SegmentId {
|
||||
}
|
||||
|
||||
/// Error type used when parsing a `SegmentId` from a string fails.
|
||||
pub struct SegmentIdParseError(uuid::parser::ParseError);
|
||||
pub struct SegmentIdParseError(uuid::Error);
|
||||
|
||||
impl Error for SegmentIdParseError {}
|
||||
|
||||
|
||||
@@ -327,8 +327,7 @@ mod tests_mmap_specific {
|
||||
.unwrap();
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(managed_directory.exists(test_path2));
|
||||
let living_files: HashSet<PathBuf> =
|
||||
[test_path1.to_owned()].into_iter().cloned().collect();
|
||||
let living_files: HashSet<PathBuf> = [test_path1.to_owned()].iter().cloned().collect();
|
||||
managed_directory.garbage_collect(|| living_files);
|
||||
assert!(managed_directory.exists(test_path1));
|
||||
assert!(!managed_directory.exists(test_path2));
|
||||
|
||||
@@ -10,11 +10,14 @@ use std::io::Write;
|
||||
/// Write a delete `BitSet`
|
||||
///
|
||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
|
||||
let max_doc = delete_bitset.capacity();
|
||||
pub fn write_delete_bitset(
|
||||
delete_bitset: &BitSet,
|
||||
max_doc: u32,
|
||||
writer: &mut WritePtr,
|
||||
) -> io::Result<()> {
|
||||
let mut byte = 0u8;
|
||||
let mut shift = 0u8;
|
||||
for doc in 0..max_doc {
|
||||
for doc in 0..(max_doc as usize) {
|
||||
if delete_bitset.contains(doc) {
|
||||
byte |= 1 << shift;
|
||||
}
|
||||
@@ -86,18 +89,17 @@ mod tests {
|
||||
use bit_set::BitSet;
|
||||
use std::path::PathBuf;
|
||||
|
||||
fn test_delete_bitset_helper(bitset: &BitSet) {
|
||||
fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
|
||||
let test_path = PathBuf::from("test");
|
||||
let mut directory = RAMDirectory::create();
|
||||
{
|
||||
let mut writer = directory.open_write(&*test_path).unwrap();
|
||||
write_delete_bitset(bitset, &mut writer).unwrap();
|
||||
write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
|
||||
}
|
||||
{
|
||||
let source = directory.open_read(&test_path).unwrap();
|
||||
let delete_bitset = DeleteBitSet::open(source);
|
||||
let n = bitset.capacity();
|
||||
for doc in 0..n {
|
||||
for doc in 0..max_doc as usize {
|
||||
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
|
||||
}
|
||||
assert_eq!(delete_bitset.len(), bitset.len());
|
||||
@@ -110,7 +112,7 @@ mod tests {
|
||||
let mut bitset = BitSet::with_capacity(10);
|
||||
bitset.insert(1);
|
||||
bitset.insert(9);
|
||||
test_delete_bitset_helper(&bitset);
|
||||
test_delete_bitset_helper(&bitset, 10);
|
||||
}
|
||||
{
|
||||
let mut bitset = BitSet::with_capacity(8);
|
||||
@@ -119,7 +121,7 @@ mod tests {
|
||||
bitset.insert(3);
|
||||
bitset.insert(5);
|
||||
bitset.insert(7);
|
||||
test_delete_bitset_helper(&bitset);
|
||||
test_delete_bitset_helper(&bitset, 8);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -148,7 +148,6 @@ pub(crate) fn advance_deletes(
|
||||
};
|
||||
|
||||
let delete_cursor = segment_entry.delete_cursor();
|
||||
|
||||
compute_deleted_bitset(
|
||||
&mut delete_bitset,
|
||||
&segment_reader,
|
||||
@@ -168,7 +167,7 @@ pub(crate) fn advance_deletes(
|
||||
if num_deleted_docs > 0 {
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
||||
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
}
|
||||
@@ -178,13 +177,13 @@ pub(crate) fn advance_deletes(
|
||||
|
||||
fn index_documents(
|
||||
memory_budget: usize,
|
||||
segment: &Segment,
|
||||
segment: Segment,
|
||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||
segment_updater: &mut SegmentUpdater,
|
||||
mut delete_cursor: DeleteCursor,
|
||||
) -> Result<bool> {
|
||||
let schema = segment.schema();
|
||||
let segment_id = segment.id();
|
||||
|
||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||
for document_group in grouped_document_iterator {
|
||||
for doc in document_group {
|
||||
@@ -204,21 +203,30 @@ fn index_documents(
|
||||
return Ok(false);
|
||||
}
|
||||
|
||||
let num_docs = segment_writer.max_doc();
|
||||
let max_doc = segment_writer.max_doc();
|
||||
|
||||
// this is ensured by the call to peek before starting
|
||||
// the worker thread.
|
||||
assert!(num_docs > 0);
|
||||
assert!(max_doc > 0);
|
||||
|
||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||
let segment_meta = segment.index().new_segment_meta(segment_id, num_docs);
|
||||
|
||||
let segment_with_max_doc = segment.with_max_doc(max_doc);
|
||||
|
||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||
|
||||
let delete_bitset_opt =
|
||||
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
|
||||
let delete_bitset_opt = apply_deletes(
|
||||
&segment_with_max_doc,
|
||||
&mut delete_cursor,
|
||||
&doc_opstamps,
|
||||
last_docstamp,
|
||||
)?;
|
||||
|
||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
||||
let segment_entry = SegmentEntry::new(
|
||||
segment_with_max_doc.meta().clone(),
|
||||
delete_cursor,
|
||||
delete_bitset_opt,
|
||||
);
|
||||
Ok(segment_updater.add_segment(segment_entry))
|
||||
}
|
||||
|
||||
@@ -235,7 +243,9 @@ fn apply_deletes(
|
||||
}
|
||||
let segment_reader = SegmentReader::open(segment)?;
|
||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
|
||||
|
||||
let max_doc = segment.meta().max_doc();
|
||||
let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
|
||||
let may_have_deletes = compute_deleted_bitset(
|
||||
&mut deleted_bitset,
|
||||
&segment_reader,
|
||||
@@ -407,7 +417,7 @@ impl IndexWriter {
|
||||
let segment = index.new_segment();
|
||||
index_documents(
|
||||
mem_budget,
|
||||
&segment,
|
||||
segment,
|
||||
&mut document_iterator,
|
||||
&mut segment_updater,
|
||||
delete_cursor.clone(),
|
||||
|
||||
@@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter;
|
||||
|
||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||
pub type DefaultMergePolicy = LogMergePolicy;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::schema::{self, Schema};
|
||||
use crate::{Index, Term};
|
||||
#[test]
|
||||
fn test_advance_delete_bug() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
|
||||
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||
// there must be one deleted document in the segment
|
||||
index_writer.add_document(doc!(text_field=>"b"));
|
||||
index_writer.delete_term(Term::from_field_text(text_field, "b"));
|
||||
// we need enough data to trigger the bug (at least 32 documents)
|
||||
for _ in 0..32 {
|
||||
index_writer.add_document(doc!(text_field=>"c"));
|
||||
}
|
||||
index_writer.commit().unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -15,6 +15,7 @@ impl Field {
|
||||
}
|
||||
|
||||
/// Returns a `u32` that uniquely identifies a field within a schema.
|
||||
#[allow(clippy::trivially_copy_pass_by_ref)]
|
||||
pub fn field_id(&self) -> u32 {
|
||||
self.0
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user