diff --git a/src/core/segment_reader.rs b/src/core/segment_reader.rs index 0ade28f05..015d8c1ee 100644 --- a/src/core/segment_reader.rs +++ b/src/core/segment_reader.rs @@ -1,4 +1,3 @@ -use crate::common::HasLen; use crate::core::InvertedIndexReader; use crate::core::Segment; use crate::core::SegmentComponent; @@ -63,11 +62,9 @@ impl SegmentReader { self.max_doc } - /// Returns the number of documents. + /// Returns the number of alive documents. /// Deleted documents are not counted. /// - /// Today, `tantivy` does not handle deletes so max doc and - /// num_docs are the same. pub fn num_docs(&self) -> DocId { self.num_docs } @@ -81,7 +78,7 @@ impl SegmentReader { /// deleted in the segment. pub fn num_deleted_docs(&self) -> DocId { self.delete_bitset() - .map(|delete_set| delete_set.len() as DocId) + .map(|delete_set| delete_set.num_deleted() as DocId) .unwrap_or(0u32) } @@ -329,6 +326,32 @@ mod test { use crate::schema::{Schema, Term, STORED, TEXT}; use crate::DocId; + #[test] + fn test_num_alive() -> crate::Result<()> { + let mut schema_builder = Schema::builder(); + schema_builder.add_text_field("name", TEXT | STORED); + let schema = schema_builder.build(); + let index = Index::create_in_ram(schema.clone()); + let name = schema.get_field("name").unwrap(); + + { + let mut index_writer = index.writer_for_tests()?; + index_writer.add_document(doc!(name => "tantivy")); + index_writer.add_document(doc!(name => "horse")); + index_writer.add_document(doc!(name => "jockey")); + index_writer.add_document(doc!(name => "cap")); + // four docs have been added; two of them are deleted below + index_writer.delete_term(Term::from_field_text(name, "horse")); + index_writer.delete_term(Term::from_field_text(name, "cap")); + + // after the commit we expect 2 alive docs out of a max_doc of 4 + index_writer.commit()?; + } + let searcher = index.reader()?.searcher(); + assert_eq!(2, searcher.segment_reader(0).num_docs()); + assert_eq!(4, searcher.segment_reader(0).max_doc()); + Ok(()) + } #[test] fn 
test_alive_docs_iterator() -> crate::Result<()> { let mut schema_builder = Schema::builder(); diff --git a/src/fastfield/delete.rs b/src/fastfield/delete.rs index de23effac..a2f14aa7c 100644 --- a/src/fastfield/delete.rs +++ b/src/fastfield/delete.rs @@ -91,6 +91,10 @@ impl DeleteBitSet { b & (1u8 << shift) != 0 } + /// The number of deleted docs + pub fn num_deleted(&self) -> usize { + self.num_deleted + } /// Summarize total space usage of this bitset. pub fn space_usage(&self) -> ByteCount { self.data.len() diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index f89f7f60f..16f19556d 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -344,6 +344,11 @@ impl IndexMerger { }) .collect::>(); if let Some(doc_id_mapping) = doc_id_mapping { + let stats = FastFieldStats { + min_value, + max_value, + num_vals: doc_id_mapping.len() as u64, + }; #[derive(Clone)] struct SortedDocidFieldAccessProvider<'a> { doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>, @@ -355,11 +360,6 @@ impl IndexMerger { self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id) } } - let stats = FastFieldStats { - min_value, - max_value, - num_vals: doc_id_mapping.len() as u64, - }; let fastfield_accessor = SortedDocidFieldAccessProvider { doc_id_mapping, fast_field_readers: &fast_field_readers, @@ -378,31 +378,86 @@ impl IndexMerger { Ok(()) } else { - let u64_readers = self.readers.iter() - .filter(|reader|reader.max_doc() != reader.delete_bitset().map(|bit_set|bit_set.len() as u32).unwrap_or(0)) + let num_vals = self + .readers + .iter() + .map(|reader| reader.num_docs() as u64) + .sum(); + + let stats = FastFieldStats { + min_value, + max_value, + num_vals, + }; + #[derive(Clone)] + struct DocidFieldAccessProvider<'a> { + segment_and_field_readers: Vec<(&'a SegmentReader, DynamicFastFieldReader)>, + } + impl<'a> FastFieldDataAccess for DocidFieldAccessProvider<'a> { + fn get_val(&self, doc: u64) -> u64 { + // Find the reader which will contain 
the doc_id. + let mut num_docs_so_far = 0; + let reader_ordinal = self + .segment_and_field_readers + .iter() + .position(|(segment_reader, _)| { + num_docs_so_far += segment_reader.num_docs() as u64; + + num_docs_so_far > doc + }) + .unwrap(); + + let (segment_reader, reader) = + &self.segment_and_field_readers[reader_ordinal as usize]; + let pos_in_reader = doc - (num_docs_so_far - segment_reader.num_docs() as u64); + + let docid = segment_reader + .doc_ids_alive() + .nth(pos_in_reader as usize) + .expect(&format!( + "unexpected error, could not find doc id in alive list docid {}, number of docids in segment {} ", + pos_in_reader, + segment_reader.doc_ids_alive().count() + )); + reader.get(docid) + } + } + let segment_and_field_readers = self.readers.iter() .map(|reader|{ let u64_reader: DynamicFastFieldReader = reader .fast_fields() .typed_fast_field_reader(field) .expect("Failed to find a reader for single fast field. This is a tantivy bug and it should never happen."); - (reader.max_doc(), u64_reader, reader.delete_bitset()) + (reader, u64_reader) }).collect::>(); - let mut fast_single_field_serializer = - fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?; - for (max_doc, u64_reader, delete_bitset_opt) in u64_readers { - for doc_id in 0u32..max_doc { - let is_deleted = delete_bitset_opt - .map(|delete_bitset| delete_bitset.is_deleted(doc_id)) - .unwrap_or(false); - if !is_deleted { - let val = u64_reader.get(doc_id); - fast_single_field_serializer.add_val(val)?; - } - } - } + let iter1 = segment_and_field_readers + .iter() + .map(|(reader, u64_reader)| { + reader + .doc_ids_alive() + .map(move |doc_id| u64_reader.get(doc_id)) + }) + .flatten(); + let iter2 = segment_and_field_readers + .iter() + .map(|(reader, u64_reader)| { + reader + .doc_ids_alive() + .map(move |doc_id| u64_reader.get(doc_id)) + }) + .flatten(); - fast_single_field_serializer.close_field()?; + let fastfield_accessor = DocidFieldAccessProvider { + 
segment_and_field_readers: segment_and_field_readers.clone(), + }; + fast_field_serializer.create_auto_detect_u64_fast_field( + field, + stats, + fastfield_accessor, + iter1, + iter2, + )?; Ok(()) } } @@ -747,16 +802,21 @@ impl IndexMerger { min_value, }; if let Some(doc_id_mapping) = doc_id_mapping { - struct SortedDocidFieldAccessProvider<'a> { + struct SortedDocidMultiValueAccessProvider<'a> { doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>, fast_field_readers: &'a Vec>, offsets: Vec, } - impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> { + impl<'a> FastFieldDataAccess for SortedDocidMultiValueAccessProvider<'a> { fn get_val(&self, pos: u64) -> u64 { // use the offsets index to find the doc_id which will contain the position. // the offsets are stricly increasing so we can do a simple search on it. - let new_docid = self.offsets.iter().position(|&x| x > pos).unwrap() - 1; + let new_docid = self + .offsets + .iter() + .position(|&offset| offset > pos) + .unwrap() + - 1; // now we need to find the position of `pos` in the multivalued bucket let num_pos_covered_until_now = self.offsets[new_docid]; @@ -773,7 +833,7 @@ impl IndexMerger { vals[pos_in_values as usize] } } - let fastfield_accessor = SortedDocidFieldAccessProvider { + let fastfield_accessor = SortedDocidMultiValueAccessProvider { doc_id_mapping, fast_field_readers: &fast_field_readers, offsets,