diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index 534626812..dd7052f30 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -81,6 +81,10 @@ impl<'a> HashMap<'a> { (hash as usize) & self.mask } + pub fn is_saturated(&self) -> bool { + self.table.len() < self.occupied.len() * 3 + } + fn get_key(&self, bytes_ref: BytesRef) -> &[u8] { self.heap.get_slice(bytes_ref) } diff --git a/src/indexer/index_writer.rs b/src/indexer/index_writer.rs index 6759331dd..514670e54 100644 --- a/src/indexer/index_writer.rs +++ b/src/indexer/index_writer.rs @@ -36,7 +36,7 @@ use std::thread; // Size of the margin for the heap. A segment is closed when the remaining memory // in the heap goes below MARGIN_IN_BYTES. -pub const MARGIN_IN_BYTES: u32 = 10_000_000u32; +pub const MARGIN_IN_BYTES: u32 = 1_000_000u32; // We impose the memory per thread to be at least 30 MB. pub const HEAP_SIZE_LIMIT: u32 = MARGIN_IN_BYTES * 3u32; @@ -269,6 +269,11 @@ fn index_documents(heap: &mut Heap, segment_writer.max_doc()); break; } + if segment_writer.is_termdictionary_saturated() { + info!("Term dic saturated, flushing segment with maxdoc={}.", + segment_writer.max_doc()); + break; + } } let num_docs = segment_writer.max_doc(); diff --git a/src/indexer/segment_updater.rs b/src/indexer/segment_updater.rs index ccd0974fa..ea68a5664 100644 --- a/src/indexer/segment_updater.rs +++ b/src/indexer/segment_updater.rs @@ -309,8 +309,6 @@ impl SegmentUpdater { let merging_join_handle = thread::spawn(move || { // first we need to apply deletes to our segment. - info!("Start merge: {:?}", segment_ids_vec); - let merged_segment = segment_updater_clone.new_segment(); let merged_segment_id = merged_segment.id(); let merge_result = perform_merge(&segment_ids_vec, &segment_updater_clone, merged_segment, target_opstamp); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index fc1b6c678..c9150b099 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -97,6 +97,11 @@ impl<'a> SegmentWriter<'a> { self.heap.num_free_bytes() <= MARGIN_IN_BYTES } + pub fn is_termdictionary_saturated(&self,) -> bool { + self.multifield_postings.is_termdictionary_saturated() + } + + /// Indexes a new document /// /// As a user, you should rather use `IndexWriter`'s add_document. diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 0eb6b9f7e..d3f7f7583 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -66,7 +66,6 @@ impl<'a> MultiFieldPostingsWriter<'a> { } } - pub fn index_text(&mut self, doc: DocId, field: Field, @@ -123,6 +122,10 @@ impl<'a> MultiFieldPostingsWriter<'a> { } Ok(()) } + + pub fn is_termdictionary_saturated(&self) -> bool { + self.term_index.is_saturated() + } }