diff --git a/CHANGELOG.md b/CHANGELOG.md index 6a637d32f..268e89493 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,8 @@ Tantivy 0.11.0 - Avoid rebuilding Regex automaton whenever a regex query is reused. #639 (@brainlock) - Add footer with some metadata to index files. #605 (@fdb-hiroshima) - TopDocs collector: ensure stable sorting on equal score. #671 (@brainlock) - +- Fix crash when committing multiple times with deleted documents. #681 (@brainlock) + ## How to update? - `Box` has been replaced by a `BoxedTokenizer` struct. diff --git a/src/core/index_meta.rs b/src/core/index_meta.rs index a1be67fee..45b56179d 100644 --- a/src/core/index_meta.rs +++ b/src/core/index_meta.rs @@ -150,6 +150,21 @@ impl SegmentMeta { self.num_deleted_docs() > 0 } + /// Updates the max_doc value from the `SegmentMeta`. + /// + /// This method is only used when updating `max_doc` from 0 + /// as we finalize a fresh new segment. + pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta { + assert_eq!(self.tracked.max_doc, 0); + assert!(self.tracked.deletes.is_none()); + let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta { + segment_id: inner_meta.segment_id, + max_doc, + deletes: None, + }); + SegmentMeta { tracked } + } + #[doc(hidden)] pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> SegmentMeta { let delete_meta = DeleteMeta { diff --git a/src/core/segment.rs b/src/core/segment.rs index a5177cc66..41dc6c91e 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -50,6 +50,17 @@ impl Segment { &self.meta } + /// Updates the max_doc value from the `SegmentMeta`. + /// + /// This method is only used when updating `max_doc` from 0 + /// as we finalize a fresh new segment. 
+ pub(crate) fn with_max_doc(self, max_doc: u32) -> Segment { + Segment { + index: self.index, + meta: self.meta.with_max_doc(max_doc), + } + } + #[doc(hidden)] pub fn with_delete_meta(self, num_deleted_docs: u32, opstamp: Opstamp) -> Segment { Segment { diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index 19a30e999..faee047ba 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -10,11 +10,14 @@ use std::io::Write; /// Write a delete `BitSet` /// /// where `delete_bitset` is the set of deleted `DocId`. -pub fn write_delete_bitset(delete_bitset: &BitSet, writer: &mut WritePtr) -> io::Result<()> { - let max_doc = delete_bitset.capacity(); +pub fn write_delete_bitset( + delete_bitset: &BitSet, + max_doc: u32, + writer: &mut WritePtr, +) -> io::Result<()> { let mut byte = 0u8; let mut shift = 0u8; - for doc in 0..max_doc { + for doc in 0..(max_doc as usize) { if delete_bitset.contains(doc) { byte |= 1 << shift; } @@ -86,18 +89,17 @@ mod tests { use bit_set::BitSet; use std::path::PathBuf; - fn test_delete_bitset_helper(bitset: &BitSet) { + fn test_delete_bitset_helper(bitset: &BitSet, max_doc: u32) { let test_path = PathBuf::from("test"); let mut directory = RAMDirectory::create(); { let mut writer = directory.open_write(&*test_path).unwrap(); - write_delete_bitset(bitset, &mut writer).unwrap(); + write_delete_bitset(bitset, max_doc, &mut writer).unwrap(); } { let source = directory.open_read(&test_path).unwrap(); let delete_bitset = DeleteBitSet::open(source); - let n = bitset.capacity(); - for doc in 0..n { + for doc in 0..max_doc as usize { assert_eq!(bitset.contains(doc), delete_bitset.is_deleted(doc as DocId)); } assert_eq!(delete_bitset.len(), bitset.len()); @@ -110,7 +112,7 @@ mod tests { let mut bitset = BitSet::with_capacity(10); bitset.insert(1); bitset.insert(9); - test_delete_bitset_helper(&bitset); + test_delete_bitset_helper(&bitset, 10); } { let mut bitset = BitSet::with_capacity(8); @@ -119,7 +121,7 @@ mod tests { 
bitset.insert(3); bitset.insert(5); bitset.insert(7); - test_delete_bitset_helper(&bitset); + test_delete_bitset_helper(&bitset, 8); } } } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 7c7781560..3e79b1688 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -148,7 +148,6 @@ pub(crate) fn advance_deletes( }; let delete_cursor = segment_entry.delete_cursor(); - compute_deleted_bitset( &mut delete_bitset, &segment_reader, @@ -168,7 +167,7 @@ pub(crate) fn advance_deletes( if num_deleted_docs > 0 { segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp); let mut delete_file = segment.open_write(SegmentComponent::DELETE)?; - write_delete_bitset(&delete_bitset, &mut delete_file)?; + write_delete_bitset(&delete_bitset, max_doc, &mut delete_file)?; delete_file.terminate()?; } } @@ -178,13 +177,13 @@ fn index_documents( memory_budget: usize, - segment: &Segment, + segment: Segment, grouped_document_iterator: &mut dyn Iterator<Item = Vec<AddOperation>>, segment_updater: &mut SegmentUpdater, mut delete_cursor: DeleteCursor, ) -> Result<bool> { let schema = segment.schema(); - let segment_id = segment.id(); + let mut segment_writer = SegmentWriter::for_segment(memory_budget, segment.clone(), &schema)?; for document_group in grouped_document_iterator { for doc in document_group { @@ -204,21 +203,30 @@ fn index_documents( return Ok(false); } - let num_docs = segment_writer.max_doc(); + let max_doc = segment_writer.max_doc(); // this is ensured by the call to peek before starting // the worker thread. 
- assert!(num_docs > 0); + assert!(max_doc > 0); let doc_opstamps: Vec<Opstamp> = segment_writer.finalize()?; - let segment_meta = segment.index().new_segment_meta(segment_id, num_docs); + + let segment_with_max_doc = segment.with_max_doc(max_doc); let last_docstamp: Opstamp = *(doc_opstamps.last().unwrap()); - let delete_bitset_opt = - apply_deletes(&segment, &mut delete_cursor, &doc_opstamps, last_docstamp)?; + let delete_bitset_opt = apply_deletes( + &segment_with_max_doc, + &mut delete_cursor, + &doc_opstamps, + last_docstamp, + )?; - let segment_entry = SegmentEntry::new(segment_meta, delete_cursor, delete_bitset_opt); + let segment_entry = SegmentEntry::new( + segment_with_max_doc.meta().clone(), + delete_cursor, + delete_bitset_opt, + ); Ok(segment_updater.add_segment(segment_entry)) } @@ -235,7 +243,9 @@ fn apply_deletes( } let segment_reader = SegmentReader::open(segment)?; let doc_to_opstamps = DocToOpstampMapping::from(doc_opstamps); - let mut deleted_bitset = BitSet::with_capacity(segment_reader.max_doc() as usize); + + let max_doc = segment.meta().max_doc(); + let mut deleted_bitset = BitSet::with_capacity(max_doc as usize); let may_have_deletes = compute_deleted_bitset( &mut deleted_bitset, &segment_reader, @@ -407,7 +417,7 @@ impl IndexWriter { let segment = index.new_segment(); index_documents( mem_budget, - &segment, + segment, &mut document_iterator, &mut segment_updater, delete_cursor.clone(), diff --git a/src/indexer/mod.rs b/src/indexer/mod.rs index 0620230ad..ad945866c 100644 --- a/src/indexer/mod.rs +++ b/src/indexer/mod.rs @@ -28,3 +28,25 @@ pub use self::segment_writer::SegmentWriter; /// Alias for the default merge policy, which is the `LogMergePolicy`. 
pub type DefaultMergePolicy = LogMergePolicy; + +#[cfg(test)] +mod tests { + use crate::schema::{self, Schema}; + use crate::{Index, Term}; + #[test] + fn test_advance_delete_bug() { + let mut schema_builder = Schema::builder(); + let text_field = schema_builder.add_text_field("text", schema::TEXT); + let index = Index::create_from_tempdir(schema_builder.build()).unwrap(); + let mut index_writer = index.writer_with_num_threads(1, 3_000_000).unwrap(); + // there must be one deleted document in the segment + index_writer.add_document(doc!(text_field=>"b")); + index_writer.delete_term(Term::from_field_text(text_field, "b")); + // we need enough data to trigger the bug (at least 32 documents) + for _ in 0..32 { + index_writer.add_document(doc!(text_field=>"c")); + } + index_writer.commit().unwrap(); + index_writer.commit().unwrap(); + } +} diff --git a/src/schema/field.rs b/src/schema/field.rs index 6ab3ec359..c0d418561 100644 --- a/src/schema/field.rs +++ b/src/schema/field.rs @@ -15,6 +15,7 @@ impl Field { } /// Returns a u32 identifying uniquely a field within a schema. + #[allow(clippy::trivially_copy_pass_by_ref)] pub fn field_id(&self) -> u32 { self.0 }