From f73209a868583f5717bc18d0140aa18c1967732c Mon Sep 17 00:00:00 2001
From: dcraven
Date: Tue, 22 Dec 2020 10:45:22 +0100
Subject: [PATCH] Reduced number of allocations.

---
 src/indexer/segment_writer.rs       | 18 +++++++++---------
 src/tokenizer/token_stream_chain.rs | 27 ++++++++++-----------------
 src/tokenizer/tokenized_string.rs   | 16 +++++++---------
 src/tokenizer/tokenizer.rs          | 10 +++-------
 4 files changed, 29 insertions(+), 42 deletions(-)

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 80da2ce3e..f8e452ac5 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -172,37 +172,37 @@ impl SegmentWriter {
                     }
                 }
                 FieldType::Str(_) => {
-                    let mut token_streams: Vec<BoxTokenStream> = vec![];
-                    let mut offsets = vec![];
+                    let mut streams_with_offsets = vec![];
                     let mut total_offset = 0;
 
                     for field_value in field_values {
                         match field_value.value() {
                             Value::PreTokStr(tok_str) => {
-                                offsets.push(total_offset);
+                                streams_with_offsets.push((
+                                    PreTokenizedStream::from(tok_str.clone()).into(),
+                                    total_offset,
+                                ));
                                 if let Some(last_token) = tok_str.tokens.last() {
                                     total_offset += last_token.offset_to;
                                 }
-                                token_streams
-                                    .push(PreTokenizedStream::from(tok_str.clone()).into());
                             }
                             Value::Str(ref text) => {
                                 if let Some(ref mut tokenizer) =
                                     self.tokenizers[field.field_id() as usize]
                                 {
-                                    offsets.push(total_offset);
+                                    streams_with_offsets
+                                        .push((tokenizer.token_stream(text), total_offset));
                                     total_offset += text.len();
-                                    token_streams.push(tokenizer.token_stream(text));
                                 }
                             }
                             _ => (),
                         }
                     }
 
-                    let num_tokens = if token_streams.is_empty() {
+                    let num_tokens = if streams_with_offsets.is_empty() {
                         0
                     } else {
-                        let mut token_stream = TokenStreamChain::new(offsets, token_streams);
+                        let mut token_stream = TokenStreamChain::new(streams_with_offsets);
                         multifield_postings.index_text(
                             doc_id,
                             field,
diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs
index cf8ef0206..c9fb3a7d0 100644
--- a/src/tokenizer/token_stream_chain.rs
+++ b/src/tokenizer/token_stream_chain.rs
@@ -1,25 +1,19 @@
 use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
-use std::ops::DerefMut;
 
 const POSITION_GAP: usize = 2;
 
 pub(crate) struct TokenStreamChain<'a> {
-    offsets: Vec<usize>,
-    token_streams: Vec<BoxTokenStream<'a>>,
+    streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>,
     position_shift: usize,
     stream_idx: usize,
     token: Token,
 }
 
 impl<'a> TokenStreamChain<'a> {
-    pub fn new(
-        offsets: Vec<usize>,
-        token_streams: Vec<BoxTokenStream<'a>>,
-    ) -> TokenStreamChain<'a> {
+    pub fn new(streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>) -> TokenStreamChain<'a> {
         TokenStreamChain {
-            offsets,
+            streams_with_offsets,
             stream_idx: 0,
-            token_streams,
             position_shift: 0,
             token: Token::default(),
         }
@@ -28,11 +22,10 @@ impl<'a> TokenStreamChain<'a> {
 
 impl<'a> TokenStream for TokenStreamChain<'a> {
     fn advance(&mut self) -> bool {
-        while self.stream_idx < self.token_streams.len() {
-            let token_stream = self.token_streams[self.stream_idx].deref_mut();
+        while self.stream_idx < self.streams_with_offsets.len() {
+            let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
             if token_stream.advance() {
                 let token = token_stream.token();
-                let offset_offset = self.offsets[self.stream_idx];
                 self.token.offset_from = token.offset_from + offset_offset;
                 self.token.offset_to = token.offset_to + offset_offset;
                 self.token.position = token.position + self.position_shift;
@@ -49,7 +42,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {
 
     fn token(&self) -> &Token {
         assert!(
-            self.stream_idx <= self.token_streams.len(),
+            self.stream_idx <= self.streams_with_offsets.len(),
             "You called .token(), after the end of the token stream has been reached"
         );
         &self.token
@@ -57,7 +50,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {
 
     fn token_mut(&mut self) -> &mut Token {
         assert!(
-            self.stream_idx <= self.token_streams.len(),
+            self.stream_idx <= self.streams_with_offsets.len(),
             "You called .token(), after the end of the token stream has been reached"
         );
         &mut self.token
@@ -73,10 +66,10 @@ mod tests {
     #[test]
     fn test_chain_first_emits_no_tokens() {
         let token_streams = vec![
-            SimpleTokenizer.token_stream(""),
-            SimpleTokenizer.token_stream("hello world"),
+            (SimpleTokenizer.token_stream(""), 0),
+            (SimpleTokenizer.token_stream("hello world"), 0),
         ];
-        let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
+        let mut token_chain = TokenStreamChain::new(token_streams);
 
         assert!(token_chain.advance());
         assert_eq!(token_chain.token().text, "hello");
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
index e091c6018..b845a6d89 100644
--- a/src/tokenizer/tokenized_string.rs
+++ b/src/tokenizer/tokenized_string.rs
@@ -44,22 +44,20 @@ impl PreTokenizedStream {
         tok_strings: &'a [&'a PreTokenizedString],
     ) -> BoxTokenStream {
         if tok_strings.len() == 1 {
-            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
+            PreTokenizedStream::from(tok_strings[0].to_owned()).into()
         } else {
-            let mut offsets = vec![];
+            let mut streams_with_offsets = vec![];
             let mut total_offset = 0;
             for &tok_string in tok_strings {
-                offsets.push(total_offset);
+                streams_with_offsets.push((
+                    PreTokenizedStream::from(tok_string.to_owned()).into(),
+                    total_offset,
+                ));
                 if let Some(last_token) = tok_string.tokens.last() {
                     total_offset += last_token.offset_to;
                 }
             }
-            // TODO remove the string cloning.
-            let token_streams: Vec<BoxTokenStream<'a>> = tok_strings
-                .iter()
-                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
-                .collect();
-            TokenStreamChain::new(offsets, token_streams).into()
+            TokenStreamChain::new(streams_with_offsets).into()
         }
     }
 }
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index aba66ccf8..683fbea51 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -91,17 +91,13 @@ impl TextAnalyzer {
         if texts.len() == 1 {
             self.token_stream(texts[0])
         } else {
-            let mut offsets = vec![];
+            let mut streams_with_offsets = vec![];
             let mut total_offset = 0;
             for &text in texts {
-                offsets.push(total_offset);
+                streams_with_offsets.push((self.token_stream(text), total_offset));
                 total_offset += text.len();
             }
-            let token_streams: Vec<BoxTokenStream<'a>> = texts
-                .iter()
-                .map(|text| self.token_stream(text))
-                .collect();
-            From::from(TokenStreamChain::new(offsets, token_streams))
+            From::from(TokenStreamChain::new(streams_with_offsets))
         }
     }
 