From 70bae7ce4c2cd35e07655af5b2b60c9bedd5fd04 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Tue, 8 Sep 2020 23:11:00 +0900 Subject: [PATCH] Removing Term Vec allocation (#881) --- src/indexer/segment_writer.rs | 71 ++++++++++++++++----------------- src/postings/postings_writer.rs | 15 +++++-- src/schema/term.rs | 41 ++++++++++--------- 3 files changed, 69 insertions(+), 58 deletions(-) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d56764b4f..5bb979702 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -17,7 +17,6 @@ use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::Opstamp; use crate::{DocId, SegmentComponent}; use std::io; -use std::str; /// Computes the initial size of the hash table. /// @@ -48,6 +47,7 @@ pub struct SegmentWriter { fieldnorms_writer: FieldNormsWriter, doc_opstamps: Vec, tokenizers: Vec>, + term_buffer: Term, } impl SegmentWriter { @@ -91,6 +91,7 @@ impl SegmentWriter { fast_field_writers: FastFieldsWriter::from_schema(schema), doc_opstamps: Vec::with_capacity(1_000), tokenizers, + term_buffer: Term::new(), }) } @@ -128,24 +129,26 @@ impl SegmentWriter { if !field_options.is_indexed() { continue; } + let (term_buffer, multifield_postings) = + (&mut self.term_buffer, &mut self.multifield_postings); match *field_options.field_type() { FieldType::HierarchicalFacet => { - let facets: Vec<&str> = field_values - .iter() - .flat_map(|field_value| match *field_value.value() { - Value::Facet(ref facet) => Some(facet.encoded_str()), - _ => { - panic!("Expected hierarchical facet"); - } - }) - .collect(); - let mut term = Term::for_field(field); // we set the Term + term_buffer.set_field(field); + let facets = + field_values + .iter() + .flat_map(|field_value| match *field_value.value() { + Value::Facet(ref facet) => Some(facet.encoded_str()), + _ => { + panic!("Expected hierarchical facet"); + } + }); for fake_str in facets { let mut unordered_term_id_opt = None; FacetTokenizer.token_stream(fake_str).process(&mut |token| { - term.set_text(&token.text); + term_buffer.set_text(&token.text); let unordered_term_id = - self.multifield_postings.subscribe(doc_id, &term); + multifield_postings.subscribe(doc_id, &term_buffer); unordered_term_id_opt = Some(unordered_term_id); }); if let Some(unordered_term_id) = unordered_term_id_opt { @@ -168,7 +171,6 @@ impl SegmentWriter { if let Some(last_token) = tok_str.tokens.last() { total_offset += last_token.offset_to; } - token_streams .push(PreTokenizedStream::from(tok_str.clone()).into()); } @@ -178,7 +180,6 @@ impl SegmentWriter { { offsets.push(total_offset); total_offset += text.len(); - token_streams.push(tokenizer.token_stream(text)); } } @@ -190,8 +191,12 @@ impl SegmentWriter { 0 } else { let mut token_stream = TokenStreamChain::new(offsets, token_streams); - self.multifield_postings - .index_text(doc_id, field, &mut token_stream) + multifield_postings.index_text( + doc_id, + field, + &mut token_stream, + term_buffer, + ) }; self.fieldnorms_writer.record(doc_id, field, num_tokens); @@ -199,44 +204,36 @@ impl SegmentWriter { FieldType::U64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_u64( - field_value.field(), - field_value.value().u64_value(), - ); - self.multifield_postings.subscribe(doc_id, &term); + term_buffer.set_field(field_value.field()); + term_buffer.set_u64(field_value.value().u64_value()); + multifield_postings.subscribe(doc_id, &term_buffer); } } } FieldType::Date(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_i64( - field_value.field(), - field_value.value().date_value().timestamp(), - ); - self.multifield_postings.subscribe(doc_id, &term); + term_buffer.set_field(field_value.field()); + term_buffer.set_i64(field_value.value().date_value().timestamp()); + multifield_postings.subscribe(doc_id, &term_buffer); } } } FieldType::I64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_i64( - field_value.field(), - field_value.value().i64_value(), - ); - self.multifield_postings.subscribe(doc_id, &term); + term_buffer.set_field(field_value.field()); + term_buffer.set_i64(field_value.value().i64_value()); + multifield_postings.subscribe(doc_id, &term_buffer); } } } FieldType::F64(ref int_option) => { if int_option.is_indexed() { for field_value in field_values { - let term = Term::from_field_f64( - field_value.field(), - field_value.value().f64_value(), - ); - self.multifield_postings.subscribe(doc_id, &term); + term_buffer.set_field(field_value.field()); + term_buffer.set_f64(field_value.value().f64_value()); + multifield_postings.subscribe(doc_id, &term_buffer); } } } diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 65072eecf..3fc0b8291 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -105,6 +105,7 @@ impl MultiFieldPostingsWriter { doc: DocId, field: Field, token_stream: &mut dyn TokenStream, + term_buffer: &mut Term, ) -> u32 { let postings_writer = self.per_field_postings_writers[field.field_id() as usize].deref_mut(); @@ -114,6 +115,7 @@ impl MultiFieldPostingsWriter { field, token_stream, &mut self.heap, + term_buffer, ) } @@ -220,13 +222,20 @@ pub trait PostingsWriter { field: Field, token_stream: &mut dyn TokenStream, heap: &mut MemoryArena, + term_buffer: &mut Term, ) -> u32 { - let mut term = Term::for_field(field); + term_buffer.set_field(field); let mut sink = |token: &Token| { // We skip all tokens with a len greater than u16. if token.text.len() <= MAX_TOKEN_LEN { - term.set_text(token.text.as_str()); - self.subscribe(term_index, doc_id, token.position as u32, &term, heap); + term_buffer.set_text(token.text.as_str()); + self.subscribe( + term_index, + doc_id, + token.position as u32, + &term_buffer, + heap, + ); } else { info!( "A token exceeding MAX_TOKEN_LEN ({}>{}) was dropped. Search for \ diff --git a/src/schema/term.rs b/src/schema/term.rs index f5425702d..2b696e880 100644 --- a/src/schema/term.rs +++ b/src/schema/term.rs @@ -4,7 +4,6 @@ use super::Field; use crate::common; use crate::schema::Facet; use crate::DateTime; -use byteorder::{BigEndian, ByteOrder}; use std::str; /// Size (in bytes) of the buffer of a int field. @@ -19,6 +18,10 @@ where B: AsRef<[u8]>; impl Term { + pub(crate) fn new() -> Term { + Term(Vec::with_capacity(100)) + } + /// Builds a term given a field, and a i64-value /// /// Assuming the term has a field id of 1, and a i64 value of 3234, @@ -93,6 +96,12 @@ impl Term { term } + pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term { + let mut term = Term::for_field(field); + term.set_bytes(bytes); + term + } + /// Creates a new Term for a given field. pub(crate) fn for_field(field: Field) -> Term { let mut term = Term(Vec::with_capacity(100)); @@ -100,12 +109,10 @@ impl Term { term } - /// Returns the field. - pub fn set_field(&mut self, field: Field) { - if self.0.len() < 4 { - self.0.resize(4, 0u8); - } - BigEndian::write_u32(&mut self.0[0..4], field.field_id()); + pub(crate) fn set_field(&mut self, field: Field) { + self.0.clear(); + self.0 + .extend_from_slice(&field.field_id().to_be_bytes()[..]); } /// Sets a u64 value in the term. @@ -116,7 +123,7 @@ impl Term { /// the natural order of the values. pub fn set_u64(&mut self, val: u64) { self.0.resize(INT_TERM_LEN, 0u8); - BigEndian::write_u64(&mut self.0[4..], val); + self.0[4..12].copy_from_slice(val.to_be_bytes().as_ref()); } /// Sets a `i64` value in the term. @@ -134,12 +141,6 @@ impl Term { self.0.extend(bytes); } - pub(crate) fn from_field_bytes(field: Field, bytes: &[u8]) -> Term { - let mut term = Term::for_field(field); - term.set_bytes(bytes); - term - } - /// Set the texts only, keeping the field untouched. pub fn set_text(&mut self, text: &str) { self.set_bytes(text.as_bytes()); @@ -157,7 +158,9 @@ where /// Returns the field. pub fn field(&self) -> Field { - Field::from_field_id(BigEndian::read_u32(&self.0.as_ref()[..4])) + let mut field_id_bytes = [0u8; 4]; + field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]); + Field::from_field_id(u32::from_be_bytes(field_id_bytes)) } /// Returns the `u64` value stored in a term. @@ -166,7 +169,9 @@ where /// ... or returns an invalid value /// if the term is not a `u64` field. pub fn get_u64(&self) -> u64 { - BigEndian::read_u64(&self.0.as_ref()[4..]) + let mut field_id_bytes = [0u8; 8]; + field_id_bytes.copy_from_slice(self.value_bytes()); + u64::from_be_bytes(field_id_bytes) } /// Returns the `i64` value stored in a term. @@ -175,7 +180,7 @@ where /// ... or returns an invalid value /// if the term is not a `i64` field. pub fn get_i64(&self) -> i64 { - common::u64_to_i64(BigEndian::read_u64(&self.0.as_ref()[4..])) + common::u64_to_i64(self.get_u64()) } /// Returns the `f64` value stored in a term. @@ -184,7 +189,7 @@ where /// ... or returns an invalid value /// if the term is not a `f64` field. pub fn get_f64(&self) -> f64 { - common::u64_to_f64(BigEndian::read_u64(&self.0.as_ref()[4..])) + common::u64_to_f64(self.get_u64()) } /// Returns the text associated with the term.