diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs index e904c2e25..e52749935 100644 --- a/src/collector/facet_collector.rs +++ b/src/collector/facet_collector.rs @@ -379,8 +379,10 @@ impl FacetCollector { }) .sum(); if count > 0u64 { - let bytes = facet_merger.key().to_owned(); - facet_counts.insert(Facet::from_encoded(bytes), count); + let bytes: Vec = facet_merger.key().to_owned(); + // may create a corrupted facet if the term dictionary is corrupted + let facet = unsafe { Facet::from_encoded(bytes) }; + facet_counts.insert(facet, count); } } FacetCounts { facet_counts } @@ -452,9 +454,9 @@ impl FacetCounts { let right_bound = if facet.is_root() { Bound::Unbounded } else { - let mut facet_after_bytes = facet.encoded_bytes().to_owned(); + let mut facet_after_bytes: Vec = facet.encoded_bytes().to_owned(); facet_after_bytes.push(1u8); - let facet_after = Facet::from_encoded(facet_after_bytes); + let facet_after = unsafe { Facet::from_encoded(facet_after_bytes) }; // ok logic Bound::Excluded(facet_after) }; let underlying: btree_map::Range<_, _> = self.facet_counts.range((left_bound, right_bound)); diff --git a/src/datastruct/stacker/hashmap.rs b/src/datastruct/stacker/hashmap.rs index f872f31d8..00c05a71c 100644 --- a/src/datastruct/stacker/hashmap.rs +++ b/src/datastruct/stacker/hashmap.rs @@ -17,7 +17,7 @@ mod murmurhash2 { let num_blocks = len >> 2; for _ in 0..num_blocks { - let mut k: u32 = unsafe { *key_ptr }; + let mut k: u32 = unsafe { *key_ptr }; // ok because of num_blocks definition k = k.wrapping_mul(M); k ^= k >> 24; k = k.wrapping_mul(M); diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 0e5d38ccc..f4b90ac8b 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -67,6 +67,11 @@ impl FastFieldReader { /// associated with the `DocId` going from /// `start` to `start + output.len()`. 
/// + /// Regardless of the type of `Item`, this method works + /// - transmuting the output array + /// - extracting the `Item`s as if they were `u64` + /// - possibly converting the `u64` value to the right type. + /// /// # Panics /// /// May panic if `start + output.len()` is greater than @@ -75,7 +80,7 @@ impl FastFieldReader { // TODO change start to `u64`. // For multifastfield, start is an index in a second fastfield, not a `DocId` pub fn get_range(&self, start: u32, output: &mut [Item]) { - let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; + let output_u64: &mut [u64] = unsafe { mem::transmute(output) }; // ok: Item is either `u64` or `i64` self.bit_unpacker.get_range(start, output_u64); for out in output_u64.iter_mut() { *out = Item::from_u64(*out + self.min_value_u64).as_u64(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index bfd3dd72a..3b6309882 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -138,8 +138,7 @@ impl<'a> SegmentWriter<'a> { } }) .collect(); - let mut term = unsafe { Term::with_capacity(100) }; - term.set_field(field); + let mut term = Term::for_field(field); // we set the Term for facet_bytes in facets { let mut unordered_term_id_opt = None; let fake_str = unsafe { str::from_utf8_unchecked(facet_bytes) }; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 7324f3701..5dd880c2c 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -194,8 +194,7 @@ pub trait PostingsWriter { token_stream: &mut TokenStream, heap: &Heap, ) -> u32 { - let mut term = unsafe { Term::with_capacity(100) }; - term.set_field(field); + let mut term = Term::for_field(field); let num_tokens = { let mut sink = |token: &Token| { term.set_text(token.text.as_str()); diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 3f364c4ab..185732451 100644 --- a/src/postings/segment_postings.rs +++ 
b/src/postings/segment_postings.rs @@ -279,19 +279,11 @@ impl Postings for SegmentPostings { fn positions_with_offset(&mut self, offset: u32, output: &mut Vec) { if self.position_computer.is_some() { - let prev_capacity = output.capacity(); - let term_freq = self.term_freq() as usize; - if term_freq > prev_capacity { - let additional_len = term_freq - output.len(); - output.reserve(additional_len); - } - unsafe { - output.set_len(term_freq); - self.position_computer - .as_mut() - .unwrap() - .positions_with_offset(offset, &mut output[..]) - } + output.resize(self.term_freq() as usize, 0u32); + self.position_computer + .as_mut() + .unwrap() + .positions_with_offset(offset, &mut output[..]) } else { output.clear(); } diff --git a/src/query/union.rs b/src/query/union.rs index 0e7baba25..58d5de242 100644 --- a/src/query/union.rs +++ b/src/query/union.rs @@ -10,7 +10,7 @@ const HORIZON_NUM_TINYBITSETS: usize = 64; const HORIZON: u32 = 64u32 * HORIZON_NUM_TINYBITSETS as u32; // `drain_filter` is not stable yet. -// This function is similar except that it does is not unsafe, and +// This function is similar except that it is not unstable, and // it does not keep the original vector ordering. // // Also, it does not "yield" any elements. diff --git a/src/schema/facet.rs b/src/schema/facet.rs index cf67084f9..429766c85 100644 --- a/src/schema/facet.rs +++ b/src/schema/facet.rs @@ -54,7 +54,7 @@ impl Facet { } /// Creates a `Facet` from its binary representation. - pub(crate) fn from_encoded(encoded_bytes: Vec) -> Facet { + pub(crate) unsafe fn from_encoded(encoded_bytes: Vec) -> Facet { Facet(encoded_bytes) } diff --git a/src/schema/term.rs b/src/schema/term.rs index eac870f06..55469a85e 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -74,8 +74,10 @@ impl Term { /// It is declared unsafe, as the term content /// is not initialized, and a call to `.field()` /// would panic. 
- pub(crate) unsafe fn with_capacity(num_bytes: usize) -> Term { - Term(Vec::with_capacity(num_bytes)) + pub(crate) fn for_field(field: Field) -> Term { + let mut term = Term(Vec::with_capacity(100)); + term.set_field(field); + term } /// Returns the field. diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs index b10fa9116..1ecaed641 100644 --- a/src/termdict/mod.rs +++ b/src/termdict/mod.rs @@ -198,7 +198,7 @@ mod tests { let mut term_string = String::new(); while term_it.advance() { //let term = Term::from_bytes(term_it.key()); - term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); + term_string.push_str(unsafe { str::from_utf8_unchecked(term_it.key()) }); // ok test } assert_eq!(&*term_string, "abcdef"); } diff --git a/src/termdict/term_info_store.rs b/src/termdict/term_info_store.rs index 2775997fd..342057e0e 100644 --- a/src/termdict/term_info_store.rs +++ b/src/termdict/term_info_store.rs @@ -92,7 +92,9 @@ fn extract_bits(data: &[u8], addr_bits: usize, num_bits: u8) -> u64 { assert!(num_bits <= 56); let addr_byte = addr_bits / 8; let bit_shift = (addr_bits % 8) as u64; + assert!(data.len() >= addr_byte + 8); let val_unshifted_unmasked: u64 = unsafe { + //< ok : check len above let addr = data.as_ptr().offset(addr_byte as isize) as *const u64; ptr::read_unaligned(addr) }; diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 321831934..60f4f297b 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -95,7 +95,7 @@ mod tests { let mut tokens = vec![]; { let mut add_token = |token: &Token| { - let facet = Facet::from_encoded(token.text.as_bytes().to_owned()); + let facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test tokens.push(format!("{}", facet)); }; FacetTokenizer @@ -115,11 +115,11 @@ mod tests { let mut tokens = vec![]; { let mut add_token = |token: &Token| { - let facet = Facet::from_encoded(token.text.as_bytes().to_owned()); + let 
facet = unsafe { Facet::from_encoded(token.text.as_bytes().to_owned()) }; // ok test tokens.push(format!("{}", facet)); }; FacetTokenizer - .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) + .token_stream(unsafe { str::from_utf8_unchecked(facet.encoded_bytes()) }) // ok test .process(&mut add_token); } assert_eq!(tokens.len(), 1);