Compare commits

...

7 Commits

Author SHA1 Message Date
dcraven
1a29e06bd2 Remove forgotten code. 2020-12-23 11:42:47 +01:00
dcraven
8e39265dbe Remove unnecessary lifetime. 2020-12-23 09:43:23 +01:00
dcraven
bf7ac960b3 Simplify control flow. 2020-12-23 09:40:01 +01:00
dcraven
783df1b15c Remove BoxTokenFilter. 2020-12-22 17:44:27 +01:00
dcraven
253d207103 Reduced number of allocations. 2020-12-22 10:45:22 +01:00
dcraven
03148e86c9 Removed unnecessary lifetimes. 2020-12-22 10:44:10 +01:00
dcraven
a6a903d8a1 Removed unnecessary trait impls 2020-12-21 16:36:31 +01:00
20 changed files with 100 additions and 186 deletions

View File

@@ -310,7 +310,7 @@ impl SegmentReader {
} }
/// Returns an iterator that will iterate over the alive document ids /// Returns an iterator that will iterate over the alive document ids
pub fn doc_ids_alive<'a>(&'a self) -> impl Iterator<Item = DocId> + 'a { pub fn doc_ids_alive(&self) -> impl Iterator<Item = DocId> + '_ {
(0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc)) (0u32..self.max_doc).filter(move |doc| !self.is_deleted(*doc))
} }

View File

@@ -11,7 +11,8 @@ use crate::schema::Schema;
use crate::schema::Term; use crate::schema::Term;
use crate::schema::Value; use crate::schema::Value;
use crate::schema::{Field, FieldEntry}; use crate::schema::{Field, FieldEntry};
use crate::tokenizer::{BoxTokenStream, PreTokenizedStream}; use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{FacetTokenizer, TextAnalyzer};
use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::tokenizer::{TokenStreamChain, Tokenizer};
use crate::Opstamp; use crate::Opstamp;
@@ -141,13 +142,13 @@ impl SegmentWriter {
} }
let (term_buffer, multifield_postings) = let (term_buffer, multifield_postings) =
(&mut self.term_buffer, &mut self.multifield_postings); (&mut self.term_buffer, &mut self.multifield_postings);
match *field_entry.field_type() { match field_entry.field_type() {
FieldType::HierarchicalFacet => { FieldType::HierarchicalFacet => {
term_buffer.set_field(field); term_buffer.set_field(field);
let facets = let facets =
field_values field_values
.iter() .iter()
.flat_map(|field_value| match *field_value.value() { .flat_map(|field_value| match field_value.value() {
Value::Facet(ref facet) => Some(facet.encoded_str()), Value::Facet(ref facet) => Some(facet.encoded_str()),
_ => { _ => {
panic!("Expected hierarchical facet"); panic!("Expected hierarchical facet");
@@ -172,37 +173,38 @@ impl SegmentWriter {
} }
} }
FieldType::Str(_) => { FieldType::Str(_) => {
let mut token_streams: Vec<BoxTokenStream> = vec![]; let mut streams_with_offsets = vec![];
let mut offsets = vec![];
let mut total_offset = 0; let mut total_offset = 0;
for field_value in field_values { for field_value in field_values {
match field_value.value() { match field_value.value() {
Value::PreTokStr(tok_str) => { Value::PreTokStr(tok_str) => {
offsets.push(total_offset); streams_with_offsets.push((
Box::new(PreTokenizedStream::from(tok_str.clone()))
as Box<dyn TokenStream>,
total_offset,
));
if let Some(last_token) = tok_str.tokens.last() { if let Some(last_token) = tok_str.tokens.last() {
total_offset += last_token.offset_to; total_offset += last_token.offset_to;
} }
token_streams
.push(PreTokenizedStream::from(tok_str.clone()).into());
} }
Value::Str(ref text) => { Value::Str(ref text) => {
if let Some(ref mut tokenizer) = if let Some(ref mut tokenizer) =
self.tokenizers[field.field_id() as usize] self.tokenizers[field.field_id() as usize]
{ {
offsets.push(total_offset); streams_with_offsets
.push((tokenizer.token_stream(text), total_offset));
total_offset += text.len(); total_offset += text.len();
token_streams.push(tokenizer.token_stream(text));
} }
} }
_ => (), _ => (),
} }
} }
let num_tokens = if token_streams.is_empty() { let num_tokens = if streams_with_offsets.is_empty() {
0 0
} else { } else {
let mut token_stream = TokenStreamChain::new(offsets, token_streams); let mut token_stream = TokenStreamChain::new(streams_with_offsets);
multifield_postings.index_text( multifield_postings.index_text(
doc_id, doc_id,
field, field,

View File

@@ -132,7 +132,7 @@ impl PositionReader {
"offset arguments should be increasing." "offset arguments should be increasing."
); );
let delta_to_block_offset = offset as i64 - self.block_offset as i64; let delta_to_block_offset = offset as i64 - self.block_offset as i64;
if delta_to_block_offset < 0 || delta_to_block_offset >= 128 { if !(0..128).contains(&delta_to_block_offset) {
// The first position is not within the first block. // The first position is not within the first block.
// We need to decompress the first block. // We need to decompress the first block.
let delta_to_anchor_offset = offset - self.anchor_offset; let delta_to_anchor_offset = offset - self.anchor_offset;

View File

@@ -109,9 +109,9 @@ impl BlockSearcher {
/// The results should be equivalent to /// The results should be equivalent to
/// ```compile_fail /// ```compile_fail
/// block[..] /// block[..]
// .iter() /// .iter()
// .take_while(|&&val| val < target) /// .take_while(|&&val| val < target)
// .count() /// .count()
/// ``` /// ```
/// ///
/// The `start` argument is just used to hint that the response is /// The `start` argument is just used to hint that the response is

View File

@@ -35,11 +35,11 @@ struct Layer {
} }
impl Layer { impl Layer {
fn cursor<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { fn cursor(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.cursor_at_offset(0u64) self.cursor_at_offset(0u64)
} }
fn cursor_at_offset<'a>(&'a self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + 'a { fn cursor_at_offset(&self, start_offset: u64) -> impl Iterator<Item = Checkpoint> + '_ {
let data = &self.data.as_slice(); let data = &self.data.as_slice();
LayerCursor { LayerCursor {
remaining: &data[start_offset as usize..], remaining: &data[start_offset as usize..],
@@ -59,7 +59,7 @@ pub struct SkipIndex {
} }
impl SkipIndex { impl SkipIndex {
pub(crate) fn checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { pub(crate) fn checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.layers self.layers
.last() .last()
.into_iter() .into_iter()

View File

@@ -46,7 +46,7 @@ impl StoreReader {
}) })
} }
pub(crate) fn block_checkpoints<'a>(&'a self) -> impl Iterator<Item = Checkpoint> + 'a { pub(crate) fn block_checkpoints(&self) -> impl Iterator<Item = Checkpoint> + '_ {
self.skip_index.checkpoints() self.skip_index.checkpoints()
} }

View File

@@ -19,7 +19,7 @@
//! // the "emoji" is dropped because its not an alphanum //! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
/// `TokenFilter` that removes all tokens that contain non /// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters. /// ascii alphanumeric characters.
@@ -27,7 +27,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream};
pub struct AlphaNumOnlyFilter; pub struct AlphaNumOnlyFilter;
pub struct AlphaNumOnlyFilterStream<'a> { pub struct AlphaNumOnlyFilterStream<'a> {
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
} }
impl<'a> AlphaNumOnlyFilterStream<'a> { impl<'a> AlphaNumOnlyFilterStream<'a> {
@@ -37,8 +37,8 @@ impl<'a> AlphaNumOnlyFilterStream<'a> {
} }
impl TokenFilter for AlphaNumOnlyFilter { impl TokenFilter for AlphaNumOnlyFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) Box::new(AlphaNumOnlyFilterStream { tail: token_stream })
} }
} }

View File

@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use std::mem; use std::mem;
/// This class converts alphabetic, numeric, and symbolic Unicode characters /// This class converts alphabetic, numeric, and symbolic Unicode characters
@@ -8,8 +8,8 @@ use std::mem;
pub struct AsciiFoldingFilter; pub struct AsciiFoldingFilter;
impl TokenFilter for AsciiFoldingFilter { impl TokenFilter for AsciiFoldingFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
From::from(AsciiFoldingFilterTokenStream { Box::new(AsciiFoldingFilterTokenStream {
tail: token_stream, tail: token_stream,
buffer: String::with_capacity(100), buffer: String::with_capacity(100),
}) })
@@ -18,7 +18,7 @@ impl TokenFilter for AsciiFoldingFilter {
pub struct AsciiFoldingFilterTokenStream<'a> { pub struct AsciiFoldingFilterTokenStream<'a> {
buffer: String, buffer: String,
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
} }
impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> { impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> {

View File

@@ -1,4 +1,4 @@
use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE; use crate::schema::FACET_SEP_BYTE;
/// The `FacetTokenizer` process a `Facet` binary representation /// The `FacetTokenizer` process a `Facet` binary representation
@@ -26,13 +26,12 @@ pub struct FacetTokenStream<'a> {
} }
impl Tokenizer for FacetTokenizer { impl Tokenizer for FacetTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
FacetTokenStream { Box::new(FacetTokenStream {
text, text,
state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet.
token: Token::default(), token: Token::default(),
} })
.into()
} }
} }

View File

@@ -1,10 +1,9 @@
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use std::mem; use std::mem;
impl TokenFilter for LowerCaser { impl TokenFilter for LowerCaser {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
BoxTokenStream::from(LowerCaserTokenStream { Box::new(LowerCaserTokenStream {
tail: token_stream, tail: token_stream,
buffer: String::with_capacity(100), buffer: String::with_capacity(100),
}) })
@@ -17,7 +16,7 @@ pub struct LowerCaser;
pub struct LowerCaserTokenStream<'a> { pub struct LowerCaserTokenStream<'a> {
buffer: String, buffer: String,
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
} }
// writes a lowercased version of text into output. // writes a lowercased version of text into output.

View File

@@ -145,9 +145,7 @@ pub use self::stop_word_filter::StopWordFilter;
pub(crate) use self::token_stream_chain::TokenStreamChain; pub(crate) use self::token_stream_chain::TokenStreamChain;
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{ pub use self::tokenizer::{TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer};
BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager; pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer}; use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// Tokenize the text by splitting words into n-grams of the given size(s) /// Tokenize the text by splitting words into n-grams of the given size(s)
/// ///
@@ -131,8 +130,8 @@ pub struct NgramTokenStream<'a> {
} }
impl Tokenizer for NgramTokenizer { impl Tokenizer for NgramTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
From::from(NgramTokenStream { Box::new(NgramTokenStream {
ngram_charidx_iterator: StutteringIterator::new( ngram_charidx_iterator: StutteringIterator::new(
CodepointFrontiers::for_str(text), CodepointFrontiers::for_str(text),
self.min_gram, self.min_gram,
@@ -308,9 +307,9 @@ mod tests {
use super::StutteringIterator; use super::StutteringIterator;
use crate::tokenizer::tests::assert_token; use crate::tokenizer::tests::assert_token;
use crate::tokenizer::tokenizer::Tokenizer; use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{BoxTokenStream, Token}; use crate::tokenizer::{Token, TokenStream};
fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> { fn test_helper(mut tokenizer: Box<dyn TokenStream>) -> Vec<Token> {
let mut tokens: Vec<Token> = vec![]; let mut tokens: Vec<Token> = vec![];
tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); tokenizer.process(&mut |token: &Token| tokens.push(token.clone()));
tokens tokens

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenStream, Tokenizer}; use super::{Token, TokenStream, Tokenizer};
use crate::tokenizer::BoxTokenStream;
/// For each value of the field, emit a single unprocessed token. /// For each value of the field, emit a single unprocessed token.
#[derive(Clone)] #[derive(Clone)]
@@ -11,7 +10,7 @@ pub struct RawTokenStream {
} }
impl Tokenizer for RawTokenizer { impl Tokenizer for RawTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
let token = Token { let token = Token {
offset_from: 0, offset_from: 0,
offset_to: text.len(), offset_to: text.len(),
@@ -19,11 +18,10 @@ impl Tokenizer for RawTokenizer {
text: text.to_string(), text: text.to_string(),
position_length: 1, position_length: 1,
}; };
RawTokenStream { Box::new(RawTokenStream {
token, token,
has_token: true, has_token: true,
} })
.into()
} }
} }

View File

@@ -13,7 +13,6 @@
//! ``` //! ```
//! //!
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
/// `RemoveLongFilter` removes tokens that are longer /// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation). /// than a given number of bytes (in UTF-8 representation).
@@ -39,8 +38,8 @@ impl<'a> RemoveLongFilterStream<'a> {
} }
impl TokenFilter for RemoveLongFilter { impl TokenFilter for RemoveLongFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
BoxTokenStream::from(RemoveLongFilterStream { Box::new(RemoveLongFilterStream {
token_length_limit: self.length_limit, token_length_limit: self.length_limit,
tail: token_stream, tail: token_stream,
}) })
@@ -49,7 +48,7 @@ impl TokenFilter for RemoveLongFilter {
pub struct RemoveLongFilterStream<'a> { pub struct RemoveLongFilterStream<'a> {
token_length_limit: usize, token_length_limit: usize,
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
} }
impl<'a> TokenStream for RemoveLongFilterStream<'a> { impl<'a> TokenStream for RemoveLongFilterStream<'a> {

View File

@@ -1,4 +1,3 @@
use super::BoxTokenStream;
use super::{Token, TokenStream, Tokenizer}; use super::{Token, TokenStream, Tokenizer};
use std::str::CharIndices; use std::str::CharIndices;
@@ -13,8 +12,8 @@ pub struct SimpleTokenStream<'a> {
} }
impl Tokenizer for SimpleTokenizer { impl Tokenizer for SimpleTokenizer {
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
BoxTokenStream::from(SimpleTokenStream { Box::new(SimpleTokenStream {
text, text,
chars: text.char_indices(), chars: text.char_indices(),
token: Token::default(), token: Token::default(),

View File

@@ -1,5 +1,4 @@
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use rust_stemmers::{self, Algorithm}; use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -78,9 +77,9 @@ impl Default for Stemmer {
} }
impl TokenFilter for Stemmer { impl TokenFilter for Stemmer {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm);
BoxTokenStream::from(StemmerTokenStream { Box::new(StemmerTokenStream {
tail: token_stream, tail: token_stream,
stemmer: inner_stemmer, stemmer: inner_stemmer,
}) })
@@ -88,7 +87,7 @@ impl TokenFilter for Stemmer {
} }
pub struct StemmerTokenStream<'a> { pub struct StemmerTokenStream<'a> {
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
stemmer: rust_stemmers::Stemmer, stemmer: rust_stemmers::Stemmer,
} }

View File

@@ -11,7 +11,6 @@
//! assert!(stream.next().is_none()); //! assert!(stream.next().is_none());
//! ``` //! ```
use super::{Token, TokenFilter, TokenStream}; use super::{Token, TokenFilter, TokenStream};
use crate::tokenizer::BoxTokenStream;
use fnv::FnvHasher; use fnv::FnvHasher;
use std::collections::HashSet; use std::collections::HashSet;
use std::hash::BuildHasherDefault; use std::hash::BuildHasherDefault;
@@ -51,12 +50,12 @@ impl StopWordFilter {
pub struct StopWordFilterStream<'a> { pub struct StopWordFilterStream<'a> {
words: StopWordHashSet, words: StopWordHashSet,
tail: BoxTokenStream<'a>, tail: Box<dyn TokenStream + 'a>,
} }
impl TokenFilter for StopWordFilter { impl TokenFilter for StopWordFilter {
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a> {
BoxTokenStream::from(StopWordFilterStream { Box::new(StopWordFilterStream {
words: self.words.clone(), words: self.words.clone(),
tail: token_stream, tail: token_stream,
}) })

View File

@@ -1,11 +1,9 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; use crate::tokenizer::{Token, TokenStream};
use std::ops::DerefMut;
const POSITION_GAP: usize = 2; const POSITION_GAP: usize = 2;
pub(crate) struct TokenStreamChain<'a> { pub(crate) struct TokenStreamChain<'a> {
offsets: Vec<usize>, streams_with_offsets: Vec<(Box<dyn TokenStream + 'a>, usize)>,
token_streams: Vec<BoxTokenStream<'a>>,
position_shift: usize, position_shift: usize,
stream_idx: usize, stream_idx: usize,
token: Token, token: Token,
@@ -13,13 +11,11 @@ pub(crate) struct TokenStreamChain<'a> {
impl<'a> TokenStreamChain<'a> { impl<'a> TokenStreamChain<'a> {
pub fn new( pub fn new(
offsets: Vec<usize>, streams_with_offsets: Vec<(Box<dyn TokenStream + 'a>, usize)>,
token_streams: Vec<BoxTokenStream<'a>>,
) -> TokenStreamChain<'a> { ) -> TokenStreamChain<'a> {
TokenStreamChain { TokenStreamChain {
offsets, streams_with_offsets,
stream_idx: 0, stream_idx: 0,
token_streams,
position_shift: 0, position_shift: 0,
token: Token::default(), token: Token::default(),
} }
@@ -28,11 +24,10 @@ impl<'a> TokenStreamChain<'a> {
impl<'a> TokenStream for TokenStreamChain<'a> { impl<'a> TokenStream for TokenStreamChain<'a> {
fn advance(&mut self) -> bool { fn advance(&mut self) -> bool {
while self.stream_idx < self.token_streams.len() { while self.stream_idx < self.streams_with_offsets.len() {
let token_stream = self.token_streams[self.stream_idx].deref_mut(); let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
if token_stream.advance() { if token_stream.advance() {
let token = token_stream.token(); let token = token_stream.token();
let offset_offset = self.offsets[self.stream_idx];
self.token.offset_from = token.offset_from + offset_offset; self.token.offset_from = token.offset_from + offset_offset;
self.token.offset_to = token.offset_to + offset_offset; self.token.offset_to = token.offset_to + offset_offset;
self.token.position = token.position + self.position_shift; self.token.position = token.position + self.position_shift;
@@ -49,7 +44,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {
fn token(&self) -> &Token { fn token(&self) -> &Token {
assert!( assert!(
self.stream_idx <= self.token_streams.len(), self.stream_idx <= self.streams_with_offsets.len(),
"You called .token(), after the end of the token stream has been reached" "You called .token(), after the end of the token stream has been reached"
); );
&self.token &self.token
@@ -57,7 +52,7 @@ impl<'a> TokenStream for TokenStreamChain<'a> {
fn token_mut(&mut self) -> &mut Token { fn token_mut(&mut self) -> &mut Token {
assert!( assert!(
self.stream_idx <= self.token_streams.len(), self.stream_idx <= self.streams_with_offsets.len(),
"You called .token(), after the end of the token stream has been reached" "You called .token(), after the end of the token stream has been reached"
); );
&mut self.token &mut self.token
@@ -73,10 +68,10 @@ mod tests {
#[test] #[test]
fn test_chain_first_emits_no_tokens() { fn test_chain_first_emits_no_tokens() {
let token_streams = vec![ let token_streams = vec![
SimpleTokenizer.token_stream(""), (SimpleTokenizer.token_stream(""), 0),
SimpleTokenizer.token_stream("hello world"), (SimpleTokenizer.token_stream("hello world"), 0),
]; ];
let mut token_chain = TokenStreamChain::new(vec![0, 0], token_streams); let mut token_chain = TokenStreamChain::new(token_streams);
assert!(token_chain.advance()); assert!(token_chain.advance());
assert_eq!(token_chain.token().text, "hello"); assert_eq!(token_chain.token().text, "hello");

View File

@@ -1,4 +1,4 @@
use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use std::cmp::Ordering; use std::cmp::Ordering;
@@ -42,24 +42,23 @@ impl PreTokenizedStream {
/// Creates a TokenStream from PreTokenizedString array /// Creates a TokenStream from PreTokenizedString array
pub fn chain_tokenized_strings<'a>( pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&'a PreTokenizedString], tok_strings: &'a [&'a PreTokenizedString],
) -> BoxTokenStream { ) -> Box<dyn TokenStream> {
if tok_strings.len() == 1 { if tok_strings.len() == 1 {
PreTokenizedStream::from((*tok_strings[0]).clone()).into() Box::new(PreTokenizedStream::from(tok_strings[0].to_owned()))
} else { } else {
let mut offsets = vec![]; let mut streams_with_offsets = vec![];
let mut total_offset = 0; let mut total_offset = 0;
for &tok_string in tok_strings { for &tok_string in tok_strings {
offsets.push(total_offset); streams_with_offsets.push((
Box::new(PreTokenizedStream::from(tok_string.to_owned()))
as Box<dyn TokenStream>,
total_offset,
));
if let Some(last_token) = tok_string.tokens.last() { if let Some(last_token) = tok_string.tokens.last() {
total_offset += last_token.offset_to; total_offset += last_token.offset_to;
} }
} }
// TODO remove the string cloning. Box::new(TokenStreamChain::new(streams_with_offsets))
let token_streams: Vec<BoxTokenStream<'static>> = tok_strings
.iter()
.map(|&tok_string| PreTokenizedStream::from((*tok_string).clone()).into())
.collect();
TokenStreamChain::new(offsets, token_streams).into()
} }
} }
} }

View File

@@ -2,8 +2,6 @@ use crate::tokenizer::TokenStreamChain;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process /// The tokenizer module contains all of the tools used to process
/// text in `tantivy`. /// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use std::ops::{Deref, DerefMut};
/// Token /// Token
#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
@@ -40,7 +38,7 @@ impl Default for Token {
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer { pub struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>, tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<BoxTokenFilter>, token_filters: Vec<Box<dyn TokenFilter>>,
} }
impl<T: Tokenizer> From<T> for TextAnalyzer { impl<T: Tokenizer> From<T> for TextAnalyzer {
@@ -50,11 +48,14 @@ impl<T: Tokenizer> From<T> for TextAnalyzer {
} }
impl TextAnalyzer { impl TextAnalyzer {
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`. /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
/// ///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`. /// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(tokenizer: T, token_filters: Vec<BoxTokenFilter>) -> TextAnalyzer { pub fn new<T: Tokenizer>(
tokenizer: T,
token_filters: Vec<Box<dyn TokenFilter>>,
) -> TextAnalyzer {
TextAnalyzer { TextAnalyzer {
tokenizer: Box::new(tokenizer), tokenizer: Box::new(tokenizer),
token_filters, token_filters,
@@ -77,8 +78,8 @@ impl TextAnalyzer {
/// .filter(Stemmer::default()); /// .filter(Stemmer::default());
/// ``` /// ```
/// ///
pub fn filter<F: Into<BoxTokenFilter>>(mut self, token_filter: F) -> Self { pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
self.token_filters.push(token_filter.into()); self.token_filters.push(Box::new(token_filter));
self self
} }
@@ -87,28 +88,19 @@ impl TextAnalyzer {
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent accidental `PhraseQuery` to match accross two terms. /// to prevent accidental `PhraseQuery` to match accross two terms.
pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> { pub fn token_stream_texts<'a>(&self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
assert!(!texts.is_empty()); debug_assert!(!texts.is_empty());
if texts.len() == 1 { let mut streams_with_offsets = vec![];
self.token_stream(texts[0]) let mut total_offset = 0;
} else { for &text in texts {
let mut offsets = vec![]; streams_with_offsets.push((self.token_stream(text), total_offset));
let mut total_offset = 0; total_offset += text.len();
for &text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<BoxTokenStream<'a>> = texts
.iter()
.cloned()
.map(|text| self.token_stream(text))
.collect();
From::from(TokenStreamChain::new(offsets, token_streams))
} }
Box::new(TokenStreamChain::new(streams_with_offsets))
} }
/// Creates a token stream for a given `str`. /// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
let mut token_stream = self.tokenizer.token_stream(text); let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters { for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream); token_stream = token_filter.transform(token_stream);
@@ -140,7 +132,7 @@ impl Clone for TextAnalyzer {
/// This API may change to use associated types. /// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone { pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
/// Creates a token stream for a given `str`. /// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
} }
pub trait TokenizerClone { pub trait TokenizerClone {
@@ -153,69 +145,6 @@ impl<T: Tokenizer + Clone> TokenizerClone for T {
} }
} }
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
fn advance(&mut self) -> bool {
let token_stream: &mut dyn TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token<'b>(&'b self) -> &'b Token {
let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
token_stream.token()
}
fn token_mut<'b>(&'b mut self) -> &'b mut Token {
let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
token_stream.token_mut()
}
}
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where
T: TokenStream + 'a,
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
BoxTokenStream(Box::new(token_stream))
}
}
impl<'a> Deref for BoxTokenStream<'a> {
type Target = dyn TokenStream + 'a;
fn deref(&self) -> &Self::Target {
&*self.0
}
}
impl<'a> DerefMut for BoxTokenStream<'a> {
fn deref_mut(&mut self) -> &mut Self::Target {
&mut *self.0
}
}
/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
///
/// See `TokenStream` for more information.
pub struct BoxTokenFilter(Box<dyn TokenFilter>);
impl Deref for BoxTokenFilter {
type Target = dyn TokenFilter;
fn deref(&self) -> &dyn TokenFilter {
&*self.0
}
}
impl<T: TokenFilter> From<T> for BoxTokenFilter {
fn from(tokenizer: T) -> BoxTokenFilter {
BoxTokenFilter(Box::new(tokenizer))
}
}
/// `TokenStream` is the result of the tokenization. /// `TokenStream` is the result of the tokenization.
/// ///
/// It consists consumable stream of `Token`s. /// It consists consumable stream of `Token`s.
@@ -295,18 +224,18 @@ pub trait TokenStream {
} }
pub trait TokenFilterClone { pub trait TokenFilterClone {
fn box_clone(&self) -> BoxTokenFilter; fn box_clone(&self) -> Box<dyn TokenFilter>;
} }
/// Trait for the pluggable components of `Tokenizer`s. /// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone { pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one. /// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>; fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
} }
impl<T: TokenFilter + Clone> TokenFilterClone for T { impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> BoxTokenFilter { fn box_clone(&self) -> Box<dyn TokenFilter> {
BoxTokenFilter::from(self.clone()) Box::new(self.clone())
} }
} }