mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
Compare commits
1 Commits
fix_open_b
...
issue/681
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
97227a2938 |
@@ -10,7 +10,8 @@ Tantivy 0.11.0
|
|||||||
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
- Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock)
|
||||||
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
- Add footer with some metadata to index files. #605 (@fdb-hiroshima)
|
||||||
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
|
- TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock)
|
||||||
|
- Fix crash when committing multiple times with deleted documents. #681 (@brainlock)
|
||||||
|
|
||||||
## How to update?
|
## How to update?
|
||||||
|
|
||||||
- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
|
- `Box<dyn BoxableTokenizer>` has been replaced by a `BoxedTokenizer` struct.
|
||||||
|
|||||||
@@ -150,6 +150,21 @@ impl SegmentMeta {
|
|||||||
self.num_deleted_docs() > 0
|
self.num_deleted_docs() > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Updates the max_doc value from the `SegmentMeta`.
|
||||||
|
///
|
||||||
|
/// This method is only used when updating `max_doc` from 0
|
||||||
|
/// as we finalize a fresh new segment.
|
||||||
|
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
|
||||||
|
assert_eq!(self.tracked.max_doc, 0);
|
||||||
|
assert!(self.tracked.deletes.is_none());
|
||||||
|
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||||
|
segment_id: inner_meta.segment_id,
|
||||||
|
max_doc,
|
||||||
|
deletes: None,
|
||||||
|
});
|
||||||
|
SegmentMeta { tracked }
|
||||||
|
}
|
||||||
|
|
||||||
#[doc(hidden)]
|
#[doc(hidden)]
|
||||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta {
|
||||||
let delete_meta = DeleteMeta {
|
let delete_meta = DeleteMeta {
|
||||||
|
|||||||
@@ -50,6 +50,17 @@ impl Segment {
|
|||||||
&self.meta
|
&self.meta
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Updates the max_doc value from the `SegmentMeta`.
|
||||||
|
///
|
||||||
|
/// This method is only used when updating `max_doc` from 0
|
||||||
|
/// as we finalize a fresh new segment.
|
||||||
|
pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment {
|
||||||
|
Segment {
|
||||||
|
index: self.index,
|
||||||
|
meta: self.meta.with_max_doc(max_doc),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[doc(hidden)]
|
#[doc(hidden)]
|
||||||
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment {
|
||||||
Segment {
|
Segment {
|
||||||
|
|||||||
@@ -10,11 +10,14 @@ use std::io::Write;
|
|||||||
/// Write a delete `BitSet`
|
/// Write a delete `BitSet`
|
||||||
///
|
///
|
||||||
/// where `delete_bitset` is the set of deleted `DocId`.
|
/// where `delete_bitset` is the set of deleted `DocId`.
|
||||||
pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> {
|
pub fn write_delete_bitset(
|
||||||
let max_doc = delete_bitset.capacity();
|
delete_bitset: &BitSet,
|
||||||
|
max_doc: u32,
|
||||||
|
writer: &mut WritePtr,
|
||||||
|
) -> io::Result<()> {
|
||||||
let mut byte = 0u8;
|
let mut byte = 0u8;
|
||||||
let mut shift = 0u8;
|
let mut shift = 0u8;
|
||||||
for doc in 0..max_doc {
|
for doc in 0..(max_doc as usize) {
|
||||||
if delete_bitset.contains(doc) {
|
if delete_bitset.contains(doc) {
|
||||||
byte |= 1 << shift;
|
byte |= 1 << shift;
|
||||||
}
|
}
|
||||||
@@ -86,18 +89,17 @@ mod tests {
|
|||||||
use bit_set::BitSet;
|
use bit_set::BitSet;
|
||||||
use std::path::PathBuf;
|
use std::path::PathBuf;
|
||||||
|
|
||||||
fn test_delete_bitset_helper(bitset: &BitSet) {
|
fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) {
|
||||||
let test_path = PathBuf::from("test");
|
let test_path = PathBuf::from("test");
|
||||||
let mut directory = RAMDirectory::create();
|
let mut directory = RAMDirectory::create();
|
||||||
{
|
{
|
||||||
let mut writer = directory.open_write(&*test_path).unwrap();
|
let mut writer = directory.open_write(&*test_path).unwrap();
|
||||||
write_delete_bitset(bitset, &mut writer).unwrap();
|
write_delete_bitset(bitset, max_doc, &mut writer).unwrap();
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let source = directory.open_read(&test_path).unwrap();
|
let source = directory.open_read(&test_path).unwrap();
|
||||||
let delete_bitset = DeleteBitSet::open(source);
|
let delete_bitset = DeleteBitSet::open(source);
|
||||||
let n = bitset.capacity();
|
for doc in 0..max_doc as usize {
|
||||||
for doc in 0..n {
|
|
||||||
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
|
assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId));
|
||||||
}
|
}
|
||||||
assert_eq!(delete_bitset.len(), bitset.len());
|
assert_eq!(delete_bitset.len(), bitset.len());
|
||||||
@@ -110,7 +112,7 @@ mod tests {
|
|||||||
let mut bitset = BitSet::with_capacity(10);
|
let mut bitset = BitSet::with_capacity(10);
|
||||||
bitset.insert(1);
|
bitset.insert(1);
|
||||||
bitset.insert(9);
|
bitset.insert(9);
|
||||||
test_delete_bitset_helper(&bitset);
|
test_delete_bitset_helper(&bitset, 10);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
let mut bitset = BitSet::with_capacity(8);
|
let mut bitset = BitSet::with_capacity(8);
|
||||||
@@ -119,7 +121,7 @@ mod tests {
|
|||||||
bitset.insert(3);
|
bitset.insert(3);
|
||||||
bitset.insert(5);
|
bitset.insert(5);
|
||||||
bitset.insert(7);
|
bitset.insert(7);
|
||||||
test_delete_bitset_helper(&bitset);
|
test_delete_bitset_helper(&bitset, 8);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -148,7 +148,6 @@ pub(crate) fn advance_deletes(
|
|||||||
};
|
};
|
||||||
|
|
||||||
let delete_cursor = segment_entry.delete_cursor();
|
let delete_cursor = segment_entry.delete_cursor();
|
||||||
|
|
||||||
compute_deleted_bitset(
|
compute_deleted_bitset(
|
||||||
&mut delete_bitset,
|
&mut delete_bitset,
|
||||||
&segment_reader,
|
&segment_reader,
|
||||||
@@ -168,7 +167,7 @@ pub(crate) fn advance_deletes(
|
|||||||
if num_deleted_docs > 0 {
|
if num_deleted_docs > 0 {
|
||||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||||
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
let mut delete_file = segment.open_write(SegmentComponent::DELETE)?;
|
||||||
write_delete_bitset(&delete_bitset, &mut delete_file)?;
|
write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?;
|
||||||
delete_file.terminate()?;
|
delete_file.terminate()?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -178,13 +177,13 @@ pub(crate) fn advance_deletes(
|
|||||||
|
|
||||||
fn index_documents(
|
fn index_documents(
|
||||||
memory_budget: usize,
|
memory_budget: usize,
|
||||||
segment: &Segment,
|
segment: Segment,
|
||||||
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
grouped_document_iterator: &mut dyn Iterator<Item = OperationGroup>,
|
||||||
segment_updater: &mut SegmentUpdater,
|
segment_updater: &mut SegmentUpdater,
|
||||||
mut delete_cursor: DeleteCursor,
|
mut delete_cursor: DeleteCursor,
|
||||||
) -> Result<bool> {
|
) -> Result<bool> {
|
||||||
let schema = segment.schema();
|
let schema = segment.schema();
|
||||||
let segment_id = segment.id();
|
|
||||||
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?;
|
||||||
for document_group in grouped_document_iterator {
|
for document_group in grouped_document_iterator {
|
||||||
for doc in document_group {
|
for doc in document_group {
|
||||||
@@ -204,21 +203,30 @@ fn index_documents(
|
|||||||
return Ok(false);
|
return Ok(false);
|
||||||
}
|
}
|
||||||
|
|
||||||
let num_docs = segment_writer.max_doc();
|
let max_doc = segment_writer.max_doc();
|
||||||
|
|
||||||
// this is ensured by the call to peek before starting
|
// this is ensured by the call to peek before starting
|
||||||
// the worker thread.
|
// the worker thread.
|
||||||
assert!(num_docs > 0);
|
assert!(max_doc > 0);
|
||||||
|
|
||||||
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?;
|
||||||
let segment_meta = segment.index().new_segment_meta(segment_id, num_docs);
|
|
||||||
|
let segment_with_max_doc = segment.with_max_doc(max_doc);
|
||||||
|
|
||||||
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap());
|
||||||
|
|
||||||
let delete_bitset_opt =
|
let delete_bitset_opt = apply_deletes(
|
||||||
apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?;
|
&segment_with_max_doc,
|
||||||
|
&mut delete_cursor,
|
||||||
|
&doc_opstamps,
|
||||||
|
last_docstamp,
|
||||||
|
)?;
|
||||||
|
|
||||||
let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt);
|
let segment_entry = SegmentEntry::new(
|
||||||
|
segment_with_max_doc.meta().clone(),
|
||||||
|
delete_cursor,
|
||||||
|
delete_bitset_opt,
|
||||||
|
);
|
||||||
Ok(segment_updater.add_segment(segment_entry))
|
Ok(segment_updater.add_segment(segment_entry))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -235,7 +243,9 @@ fn apply_deletes(
|
|||||||
}
|
}
|
||||||
let segment_reader = SegmentReader::open(segment)?;
|
let segment_reader = SegmentReader::open(segment)?;
|
||||||
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps);
|
||||||
let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize);
|
|
||||||
|
let max_doc = segment.meta().max_doc();
|
||||||
|
let mut deleted_bitset = BitSet::with_capacity(max_doc as usize);
|
||||||
let may_have_deletes = compute_deleted_bitset(
|
let may_have_deletes = compute_deleted_bitset(
|
||||||
&mut deleted_bitset,
|
&mut deleted_bitset,
|
||||||
&segment_reader,
|
&segment_reader,
|
||||||
@@ -407,7 +417,7 @@ impl IndexWriter {
|
|||||||
let segment = index.new_segment();
|
let segment = index.new_segment();
|
||||||
index_documents(
|
index_documents(
|
||||||
mem_budget,
|
mem_budget,
|
||||||
&segment,
|
segment,
|
||||||
&mut document_iterator,
|
&mut document_iterator,
|
||||||
&mut segment_updater,
|
&mut segment_updater,
|
||||||
delete_cursor.clone(),
|
delete_cursor.clone(),
|
||||||
|
|||||||
@@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter;
|
|||||||
|
|
||||||
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
/// Alias for the default merge policy, which is the `LogMergePolicy`.
|
||||||
pub type DefaultMergePolicy = LogMergePolicy;
|
pub type DefaultMergePolicy = LogMergePolicy;
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use crate::schema::{self, Schema};
|
||||||
|
use crate::{Index, Term};
|
||||||
|
#[test]
|
||||||
|
fn test_advance_delete_bug() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let text_field = schema_builder.add_text_field("text", schema::TEXT);
|
||||||
|
let index = Index::create_from_tempdir(schema_builder.build()).unwrap();
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap();
|
||||||
|
// there must be one deleted document in the segment
|
||||||
|
index_writer.add_document(doc!(text_field=>"b"));
|
||||||
|
index_writer.delete_term(Term::from_field_text(text_field, "b"));
|
||||||
|
// we need enough data to trigger the bug (at least 32 documents)
|
||||||
|
for _ in 0..32 {
|
||||||
|
index_writer.add_document(doc!(text_field=>"c"));
|
||||||
|
}
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ impl Field {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a u32 identifying uniquely a field within a schema.
|
/// Returns a u32 identifying uniquely a field within a schema.
|
||||||
|
#[allow(clippy::trivially_copy_pass_by_ref)]
|
||||||
pub fn field_id(&self) -> u32 {
|
pub fn field_id(&self) -> u32 {
|
||||||
self.0
|
self.0
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user