From fb1b2be782f81dd2393436f9649fbd3c1cf446a7 Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Fri, 12 May 2017 13:51:09 +0900
Subject: [PATCH 1/2] issue/136 Fix following CR

---
 src/datastruct/stacker/heap.rs | 10 ++++++++--
 src/indexer/merger.rs          |  7 ++++---
 src/indexer/segment_writer.rs  |  7 ++-----
 src/schema/term.rs             |  1 -
 4 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
index 8ba3739f5..694e45c8d 100644
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -80,8 +80,14 @@ impl Heap {
     pub fn set<Item: Copy>(&self, addr: u32, val: &Item) {
         self.inner().set(addr, val);
     }
-
-    /// Returns a mutable reference for an object at a given Item.
+
+    /// Returns a reference to an `Item` at a given `addr`.
+    #[cfg(test)]
+    pub fn get_ref<Item: Copy>(&self, addr: u32) -> &Item {
+        self.inner().get_mut_ref(addr)
+    }
+
+    /// Returns a mutable reference to an `Item` at a given `addr`.
     pub fn get_mut_ref<Item: Copy>(&self, addr: u32) -> &mut Item {
         self.inner().get_mut_ref(addr)
     }
diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs
index d63333344..44e47f5c6 100644
--- a/src/indexer/merger.rs
+++ b/src/indexer/merger.rs
@@ -202,7 +202,7 @@ impl IndexMerger {
             merged_doc_id_map.push(segment_local_map);
         }
 
-        let mut field = Field(u32::max_value());
+        let mut last_field: Option<Field> = None;
 
         while merged_terms.advance() {
             // Create the total list of doc ids
@@ -239,10 +239,11 @@ impl IndexMerger {
                 if let Some(remapped_doc_id) = old_to_new_doc_id[segment_postings.doc() as usize] {
                     if !term_written {
                         let current_field = term.field();
-                        if current_field != field {
+                        if last_field != Some(current_field) {
                             postings_serializer.new_field(current_field);
-                            field = current_field;
+                            last_field = Some(current_field);
                         }
+
                         // we make sure to only write the term iff
                         // there is at least one document.
                         postings_serializer.new_term(term.as_slice())?;
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 8878fd519..6502c2cb7 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -98,8 +98,7 @@ impl<'a> SegmentWriter<'a> {
     /// Return true if the term dictionary hashmap is reaching capacity.
     /// It is one of the condition that triggers a `SegmentWriter` to
     /// be finalized.
-    #[doc(hidden)]
-    pub fn is_termdic_saturated(&self,) -> bool {
+    pub(crate) fn is_termdic_saturated(&self,) -> bool {
         self.multifield_postings.is_termdic_saturated()
     }
 
@@ -195,12 +194,10 @@ fn write<'a>(
              mut serializer: SegmentSerializer) -> Result<()> {
     try!(multifield_postings.serialize(serializer.get_postings_serializer()));
-    // for per_field_postings_writer in per_field_postings_writers {
-    //     try!(per_field_postings_writer.serialize(serializer.get_postings_serializer(), heap));
-    // }
     try!(fast_field_writers.serialize(serializer.get_fast_field_serializer()));
     try!(fieldnorms_writer.serialize(serializer.get_fieldnorms_serializer()));
     try!(serializer.close());
+
     Ok(())
 }
 
diff --git a/src/schema/term.rs b/src/schema/term.rs
index a539d6e2b..aa7700cab 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -16,7 +16,6 @@ const INT_TERM_LEN: usize = 4 + 8;
 pub struct Term(Vec<u8>);
 
 /// Extract `field` from Term.
-#[doc(hidden)]
 pub(crate) fn extract_field_from_term_bytes(term_bytes: &[u8]) -> Field {
     Field(BigEndian::read_u32(&term_bytes[..4]))
 }

From ecbdd70c37563ea192c8c189b3c7227432a47f9d Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Fri, 12 May 2017 14:01:52 +0900
Subject: [PATCH 2/2] Removed the clunky linked list logic of the heap.

---
 src/datastruct/stacker/heap.rs | 78 ++++++++++------------------------
 1 file changed, 22 insertions(+), 56 deletions(-)

diff --git a/src/datastruct/stacker/heap.rs b/src/datastruct/stacker/heap.rs
index 694e45c8d..bf89b923a 100644
--- a/src/datastruct/stacker/heap.rs
+++ b/src/datastruct/stacker/heap.rs
@@ -96,9 +96,8 @@ impl Heap {
 
 struct InnerHeap {
     buffer: Vec<u8>,
-    buffer_len: u32,
     used: u32,
-    next_heap: Option<Box<InnerHeap>>,
+    has_been_resized: bool,
 }
 
@@ -108,15 +107,13 @@ impl InnerHeap {
     pub fn with_capacity(num_bytes: usize) -> InnerHeap {
         let buffer: Vec<u8> = vec![0u8; num_bytes];
         InnerHeap {
             buffer: buffer,
-            buffer_len: num_bytes as u32,
-            next_heap: None,
             used: 0u32,
+            has_been_resized: false,
         }
     }
 
     pub fn clear(&mut self) {
         self.used = 0u32;
-        self.next_heap = None;
     }
 
     pub fn capacity(&self,) -> u32 {
@@ -126,47 +123,31 @@ impl InnerHeap {
     // Returns the number of free bytes. If the buffer
     // has reached it's capacity and overflowed to another buffer, return 0.
     pub fn num_free_bytes(&self,) -> u32 {
-        if self.next_heap.is_some() {
+        if self.has_been_resized {
             0u32
         }
         else {
-            self.buffer_len - self.used
+            (self.buffer.len() as u32) - self.used
        }
     }
 
     pub fn allocate_space(&mut self, num_bytes: usize) -> u32 {
         let addr = self.used;
         self.used += num_bytes as u32;
-        if self.used <= self.buffer_len {
-            addr
+        let buffer_len = self.buffer.len();
+        if self.used > buffer_len as u32 {
+            self.buffer.resize(buffer_len * 2, 0u8);
+            self.has_been_resized = true
         }
-        else {
-            if self.next_heap.is_none() {
-                warn!("Exceeded heap size. The margin was apparently unsufficient. The segment will be committed right after indexing this very last document.");
-                self.next_heap = Some(Box::new(InnerHeap::with_capacity(self.buffer_len as usize)));
-            }
-            self.next_heap.as_mut().unwrap().allocate_space(num_bytes) + self.buffer_len
-        }
-
-
+        addr
     }
 
     fn get_slice(&self, start: u32, stop: u32) -> &[u8] {
-        if start >= self.buffer_len {
-            self.next_heap.as_ref().unwrap().get_slice(start - self.buffer_len, stop - self.buffer_len)
-        }
-        else {
-            &self.buffer[start as usize..stop as usize]
-        }
+        &self.buffer[start as usize..stop as usize]
     }
 
     fn get_mut_slice(&mut self, start: u32, stop: u32) -> &mut [u8] {
-        if start >= self.buffer_len {
-            self.next_heap.as_mut().unwrap().get_mut_slice(start - self.buffer_len, stop - self.buffer_len)
-        }
-        else {
-            &mut self.buffer[start as usize..stop as usize]
-        }
+        &mut self.buffer[start as usize..stop as usize]
     }
 
     fn allocate_and_set(&mut self, data: &[u8]) -> BytesRef {
@@ -180,38 +161,23 @@ impl InnerHeap {
     }
 
     fn get_mut(&mut self, addr: u32) -> *mut u8 {
-        if addr >= self.buffer_len {
-            self.next_heap.as_mut().unwrap().get_mut(addr - self.buffer_len)
-        }
-        else {
-            let addr_isize = addr as isize;
-            unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
-        }
+        let addr_isize = addr as isize;
+        unsafe { self.buffer.as_mut_ptr().offset(addr_isize) }
     }
 
     fn get_mut_ref<Item: Copy>(&mut self, addr: u32) -> &mut Item {
-        if addr >= self.buffer_len {
-            self.next_heap.as_mut().unwrap().get_mut_ref(addr - self.buffer_len)
-        }
-        else {
-            let v_ptr_u8 = self.get_mut(addr) as *mut u8;
-            let v_ptr = v_ptr_u8 as *mut Item;
-            unsafe { &mut *v_ptr }
-        }
+        let v_ptr_u8 = self.get_mut(addr) as *mut u8;
+        let v_ptr = v_ptr_u8 as *mut Item;
+        unsafe { &mut *v_ptr }
    }
 
     fn set<Item: Copy>(&mut self, addr: u32, val: &Item) {
-        if addr >= self.buffer_len {
-            self.next_heap.as_mut().unwrap().set(addr - self.buffer_len, val);
-        }
-        else {
-            let v_ptr: *const Item = val as *const Item;
-            let v_ptr_u8: *const u8 = v_ptr as *const u8;
-            debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
-            unsafe {
-                let dest_ptr: *mut u8 = self.get_mut(addr);
-                ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
-            }
+        let v_ptr: *const Item = val as *const Item;
+        let v_ptr_u8: *const u8 = v_ptr as *const u8;
+        debug_assert!(addr + mem::size_of::<Item>() as u32 <= self.used);
+        unsafe {
+            let dest_ptr: *mut u8 = self.get_mut(addr);
+            ptr::copy(v_ptr_u8, dest_ptr, mem::size_of::<Item>());
        }
     }
 }
\ No newline at end of file
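The second patch replaces the overflow-into-a-second-heap scheme with a single buffer that is doubled in place, plus a has_been_resized flag that makes num_free_bytes report zero so the segment writer finalizes soon afterwards. Below is a minimal standalone sketch of that resize-on-overflow behaviour; the BumpHeap name and the main driver are illustrative assumptions and not part of tantivy itself.

// Sketch, assuming the same fields and resize policy as the patch above.
struct BumpHeap {
    buffer: Vec<u8>,
    used: u32,
    has_been_resized: bool,
}

impl BumpHeap {
    fn with_capacity(num_bytes: usize) -> BumpHeap {
        BumpHeap {
            buffer: vec![0u8; num_bytes],
            used: 0u32,
            has_been_resized: false,
        }
    }

    // Once the planned capacity has been exceeded, report no free space so the
    // caller stops feeding the heap and finalizes as soon as possible.
    fn num_free_bytes(&self) -> u32 {
        if self.has_been_resized {
            0u32
        } else {
            (self.buffer.len() as u32) - self.used
        }
    }

    // Bump-allocates num_bytes and returns the start address. On overflow the
    // buffer is doubled in place; addresses stay valid because they are
    // offsets into the buffer, not raw pointers.
    fn allocate_space(&mut self, num_bytes: usize) -> u32 {
        let addr = self.used;
        self.used += num_bytes as u32;
        let buffer_len = self.buffer.len();
        if self.used > buffer_len as u32 {
            self.buffer.resize(buffer_len * 2, 0u8);
            self.has_been_resized = true;
        }
        addr
    }
}

fn main() {
    let mut heap = BumpHeap::with_capacity(16);
    let a = heap.allocate_space(10);
    let b = heap.allocate_space(10); // overflows: buffer doubles to 32 bytes
    assert_eq!((a, b), (0, 10));
    assert!(heap.has_been_resized);
    assert_eq!(heap.num_free_bytes(), 0);
    println!("buffer len = {}, used = {}", heap.buffer.len(), heap.used);
}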