Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-04 16:22:55 +00:00
Reduced number of allocations.
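This commit replaces the two parallel vectors that multi-valued text indexing used to build — one Vec of token streams and one Vec of their starting byte offsets — with a single Vec of (stream, offset) tuples. Each chained field now costs one heap allocation instead of two, each offset travels with its stream, and TokenStreamChain::advance no longer needs a second indexed lookup into a separate offsets vector. A minimal sketch of the before/after allocation pattern, using a hypothetical `Stream` stand-in for tantivy's BoxTokenStream so the sketch compiles on its own:

    // `Stream` is a hypothetical stand-in for BoxTokenStream.
    struct Stream;

    fn before(texts: &[&str]) -> (Vec<Stream>, Vec<usize>) {
        // Old shape: two parallel Vecs, i.e. two heap allocations per field.
        let mut token_streams = Vec::new();
        let mut offsets = Vec::new();
        let mut total_offset = 0;
        for text in texts {
            offsets.push(total_offset);
            token_streams.push(Stream);
            total_offset += text.len();
        }
        (token_streams, offsets)
    }

    fn after(texts: &[&str]) -> Vec<(Stream, usize)> {
        // New shape: one Vec of tuples, a single allocation, and each
        // offset is stored next to the stream it belongs to.
        let mut streams_with_offsets = Vec::new();
        let mut total_offset = 0;
        for text in texts {
            streams_with_offsets.push((Stream, total_offset));
            total_offset += text.len();
        }
        streams_with_offsets
    }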
@@ -172,37 +172,37 @@ impl SegmentWriter {
                 }
             }
             FieldType::Str(_) => {
-                let mut token_streams: Vec<BoxTokenStream> = vec![];
-                let mut offsets = vec![];
+                let mut streams_with_offsets = vec![];
                 let mut total_offset = 0;

                 for field_value in field_values {
                     match field_value.value() {
                         Value::PreTokStr(tok_str) => {
-                            offsets.push(total_offset);
+                            streams_with_offsets.push((
+                                PreTokenizedStream::from(tok_str.clone()).into(),
+                                total_offset,
+                            ));
                             if let Some(last_token) = tok_str.tokens.last() {
                                 total_offset += last_token.offset_to;
                             }
-                            token_streams
-                                .push(PreTokenizedStream::from(tok_str.clone()).into());
                         }
                         Value::Str(ref text) => {
                             if let Some(ref mut tokenizer) =
                                 self.tokenizers[field.field_id() as usize]
                             {
-                                offsets.push(total_offset);
+                                streams_with_offsets
+                                    .push((tokenizer.token_stream(text), total_offset));
                                 total_offset += text.len();
-                                token_streams.push(tokenizer.token_stream(text));
                             }
                         }
                         _ => (),
                     }
                 }

-                let num_tokens = if token_streams.is_empty() {
+                let num_tokens = if streams_with_offsets.is_empty() {
                     0
                 } else {
-                    let mut token_stream = TokenStreamChain::new(offsets, token_streams);
+                    let mut token_stream = TokenStreamChain::new(streams_with_offsets);
                     multifield_postings.index_text(
                         doc_id,
                         field,
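The total_offset bookkeeping is unchanged; each push simply carries the offset with its stream. A sketch of the new call shape, modeled on the test further down (import paths assumed; TokenStreamChain is pub(crate), so this only compiles inside the tantivy crate):

    use crate::tokenizer::{SimpleTokenizer, TokenStreamChain, Tokenizer};

    let first = "hello world";
    let second = "goodbye";
    let streams_with_offsets = vec![
        (SimpleTokenizer.token_stream(first), 0),
        // Tokens from `second` are shifted past the end of `first`.
        (SimpleTokenizer.token_stream(second), first.len()),
    ];
    let mut chain = TokenStreamChain::new(streams_with_offsets);
    while chain.advance() {
        // "goodbye" is reported at byte offsets 11..18 instead of 0..7.
        println!("{:?}", chain.token());
    }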
@@ -1,25 +1,19 @@
 use crate::tokenizer::{BoxTokenStream, Token, TokenStream};
-use std::ops::DerefMut;

 const POSITION_GAP: usize = 2;

 pub(crate) struct TokenStreamChain<'a> {
-    offsets: Vec<usize>,
-    token_streams: Vec<BoxTokenStream<'a>>,
+    streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>,
     position_shift: usize,
     stream_idx: usize,
     token: Token,
 }

 impl<'a> TokenStreamChain<'a> {
-    pub fn new(
-        offsets: Vec<usize>,
-        token_streams: Vec<BoxTokenStream<'a>>,
-    ) -> TokenStreamChain<'a> {
+    pub fn new(streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>) -> TokenStreamChain<'a> {
         TokenStreamChain {
-            offsets,
+            streams_with_offsets,
             stream_idx: 0,
-            token_streams,
             position_shift: 0,
             token: Token::default(),
         }
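Besides dropping one Vec header from the struct, folding the two fields into one changes the layout from two parallel arrays to an array of pairs, so each stream and the offset applied to it always sit adjacent in memory.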
@@ -28,11 +22,10 @@ impl<'a> TokenStreamChain<'a> {

 impl<'a> TokenStream for TokenStreamChain<'a> {
     fn advance(&mut self) -> bool {
-        while self.stream_idx < self.token_streams.len() {
-            let token_stream = self.token_streams[self.stream_idx].deref_mut();
+        while self.stream_idx < self.streams_with_offsets.len() {
+            let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
             if token_stream.advance() {
                 let token = token_stream.token();
-                let offset_offset = self.offsets[self.stream_idx];
                 self.token.offset_from = token.offset_from + offset_offset;
                 self.token.offset_to = token.offset_to + offset_offset;
                 self.token.position = token.position + self.position_shift;
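The rewritten loop relies on Rust's mixed binding modes: `ref mut` borrows the stream mutably in place out of the indexed tuple, while the usize offset, being Copy, is copied out, which is why the explicit deref_mut import could go. A standalone illustration of that binding pattern:

    fn main() {
        let mut v: Vec<(String, usize)> = vec![(String::from("hello"), 7)];
        // `ref mut s` mutably borrows the tuple's first field in place;
        // `n` copies the second field out (usize is Copy).
        let (ref mut s, n) = v[0];
        s.push_str(" world");
        assert_eq!(n, 7);
        assert_eq!(v[0].0, "hello world");
    }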
@@ -49,7 +42,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {

     fn token(&self) -> &Token {
         assert!(
-            self.stream_idx <= self.token_streams.len(),
+            self.stream_idx <= self.streams_with_offsets.len(),
             "You called .token(), after the end of the token stream has been reached"
         );
         &self.token
@@ -57,7 +50,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {

     fn token_mut(&mut self) -> &mut Token {
         assert!(
-            self.stream_idx <= self.token_streams.len(),
+            self.stream_idx <= self.streams_with_offsets.len(),
             "You called .token(), after the end of the token stream has been reached"
         );
         &mut self.token
@@ -73,10 +66,10 @@ mod tests {
     #[test]
     fn test_chain_first_emits_no_tokens() {
         let token_streams = vec![
-            SimpleTokenizer.token_stream(""),
-            SimpleTokenizer.token_stream("hello world"),
+            (SimpleTokenizer.token_stream(""), 0),
+            (SimpleTokenizer.token_stream("hello world"), 0),
         ];
-        let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams);
+        let mut token_chain = TokenStreamChain::new(token_streams);

        assert!(token_chain.advance());
        assert_eq!(token_chain.token().text, "hello");
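Both streams in this test are chained at offset 0, so the surviving tokens keep the byte offsets they had inside "hello world". A sketch of assertions that could follow the visible ones, under the same assumptions as the test above:

    assert!(token_chain.advance());
    assert_eq!(token_chain.token().text, "world");
    assert_eq!(token_chain.token().offset_from, 6);
    assert_eq!(token_chain.token().offset_to, 11);
    assert!(!token_chain.advance());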
@@ -44,22 +44,20 @@ impl PreTokenizedStream {
         tok_strings: &'a [&'a PreTokenizedString],
     ) -> BoxTokenStream {
         if tok_strings.len() == 1 {
-            PreTokenizedStream::from((*tok_strings[0]).clone()).into()
+            PreTokenizedStream::from(tok_strings[0].to_owned()).into()
         } else {
-            let mut offsets = vec![];
+            let mut streams_with_offsets = vec![];
             let mut total_offset = 0;
             for &tok_string in tok_strings {
-                offsets.push(total_offset);
+                streams_with_offsets.push((
+                    PreTokenizedStream::from(tok_string.to_owned()).into(),
+                    total_offset,
+                ));
                 if let Some(last_token) = tok_string.tokens.last() {
                     total_offset += last_token.offset_to;
                 }
             }
-            // TODO remove the string cloning.
-            let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
-                .iter()
-                .map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
-                .collect();
-            TokenStreamChain::new(offsets, token_streams).into()
+            TokenStreamChain::new(streams_with_offsets).into()
         }
     }
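The removed block under `// TODO remove the string cloning.` was a second pass over tok_strings that rebuilt every stream after the offset loop; the new version builds each stream exactly once, inside the same loop that computes its offset. Note each PreTokenizedString is still cloned once (to_owned on a reference clones), so the TODO's underlying concern remains, but the duplicate pass and the extra Vec are gone.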
@@ -91,17 +91,13 @@ impl TextAnalyzer {
         if texts.len() == 1 {
             self.token_stream(texts[0])
         } else {
-            let mut offsets = vec![];
+            let mut streams_with_offsets = vec![];
             let mut total_offset = 0;
             for &text in texts {
-                offsets.push(total_offset);
+                streams_with_offsets.push((self.token_stream(text), total_offset));
                 total_offset += text.len();
             }
-            let token_streams: Vec<BoxTokenStream<'a>> = texts
-                .iter()
-                .map(|text| self.token_stream(text))
-                .collect();
-            From::from(TokenStreamChain::new(offsets, token_streams))
+            From::from(TokenStreamChain::new(streams_with_offsets))
         }
     }
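A usage sketch of the multi-text path, assuming this version's TextAnalyzer::token_stream_texts is reachable from the call site and using SimpleTokenizer:

    use tantivy::tokenizer::{SimpleTokenizer, TextAnalyzer};

    let analyzer = TextAnalyzer::from(SimpleTokenizer);
    let texts = ["hello world", "goodbye"];
    let mut stream = analyzer.token_stream_texts(&texts[..]);
    while stream.advance() {
        // Offsets for "goodbye" are shifted by "hello world".len() == 11.
        println!("{:?}", stream.token());
    }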