From 783df1b15c56861472e622358eb3f3cc9016a347 Mon Sep 17 00:00:00 2001 From: dcraven Date: Tue, 22 Dec 2020 17:44:27 +0100 Subject: [PATCH] Remove BoxTokenFilter. --- src/indexer/segment_writer.rs | 4 +- src/tokenizer/alphanum_only.rs | 8 +-- src/tokenizer/ascii_folding_filter.rs | 8 +-- src/tokenizer/facet_tokenizer.rs | 9 ++-- src/tokenizer/lower_caser.rs | 7 ++- src/tokenizer/mod.rs | 4 +- src/tokenizer/ngram_tokenizer.rs | 9 ++-- src/tokenizer/raw_tokenizer.rs | 8 ++- src/tokenizer/remove_long.rs | 7 ++- src/tokenizer/simple_tokenizer.rs | 5 +- src/tokenizer/stemmer.rs | 7 ++- src/tokenizer/stop_word_filter.rs | 7 ++- src/tokenizer/token_stream_chain.rs | 8 +-- src/tokenizer/tokenized_string.rs | 11 ++-- src/tokenizer/tokenizer.rs | 73 +++++++-------------------- 15 files changed, 65 insertions(+), 110 deletions(-) diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index f8e452ac5..6fa4dd191 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -12,6 +12,7 @@ use crate::schema::Term; use crate::schema::Value; use crate::schema::{Field, FieldEntry}; use crate::tokenizer::PreTokenizedStream; +use crate::tokenizer::TokenStream; use crate::tokenizer::{FacetTokenizer, TextAnalyzer}; use crate::tokenizer::{TokenStreamChain, Tokenizer}; use crate::Opstamp; @@ -179,7 +180,8 @@ impl SegmentWriter { match field_value.value() { Value::PreTokStr(tok_str) => { streams_with_offsets.push(( - PreTokenizedStream::from(tok_str.clone()).into(), + Box::new(PreTokenizedStream::from(tok_str.clone())) + as Box, total_offset, )); if let Some(last_token) = tok_str.tokens.last() { diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs index 9d3436535..2cfee26ba 100644 --- a/src/tokenizer/alphanum_only.rs +++ b/src/tokenizer/alphanum_only.rs @@ -19,7 +19,7 @@ //! // the "emoji" is dropped because its not an alphanum //! assert!(stream.next().is_none()); //! ``` -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream}; /// `TokenFilter` that removes all tokens that contain non /// ascii alphanumeric characters. @@ -27,7 +27,7 @@ use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; pub struct AlphaNumOnlyFilter; pub struct AlphaNumOnlyFilterStream<'a> { - tail: BoxTokenStream<'a>, + tail: Box, } impl<'a> AlphaNumOnlyFilterStream<'a> { @@ -37,8 +37,8 @@ impl<'a> AlphaNumOnlyFilterStream<'a> { } impl TokenFilter for AlphaNumOnlyFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(AlphaNumOnlyFilterStream { tail: token_stream }) + fn transform<'a>(&self, token_stream: Box) -> Box { + Box::new(AlphaNumOnlyFilterStream { tail: token_stream }) } } diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs index 1eb6cbb89..5e6d2cc75 100644 --- a/src/tokenizer/ascii_folding_filter.rs +++ b/src/tokenizer/ascii_folding_filter.rs @@ -1,4 +1,4 @@ -use super::{BoxTokenStream, Token, TokenFilter, TokenStream}; +use super::{Token, TokenFilter, TokenStream}; use std::mem; /// This class converts alphabetic, numeric, and symbolic Unicode characters @@ -8,8 +8,8 @@ use std::mem; pub struct AsciiFoldingFilter; impl TokenFilter for AsciiFoldingFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - From::from(AsciiFoldingFilterTokenStream { + fn transform<'a>(&self, token_stream: Box) -> Box { + Box::new(AsciiFoldingFilterTokenStream { tail: token_stream, buffer: String::with_capacity(100), }) @@ -18,7 +18,7 @@ impl TokenFilter for AsciiFoldingFilter { pub struct AsciiFoldingFilterTokenStream<'a> { buffer: String, - tail: BoxTokenStream<'a>, + tail: Box, } impl<'a> TokenStream for AsciiFoldingFilterTokenStream<'a> { diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs index 963e4e30c..21a3c9053 100644 --- a/src/tokenizer/facet_tokenizer.rs +++ b/src/tokenizer/facet_tokenizer.rs @@ -1,4 +1,4 @@ -use super::{BoxTokenStream, Token, TokenStream, Tokenizer}; +use super::{Token, TokenStream, Tokenizer}; use crate::schema::FACET_SEP_BYTE; /// The `FacetTokenizer` process a `Facet` binary representation @@ -26,13 +26,12 @@ pub struct FacetTokenStream<'a> { } impl Tokenizer for FacetTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - FacetTokenStream { + fn token_stream<'a>(&self, text: &'a str) -> Box { + Box::new(FacetTokenStream { text, state: State::RootFacetNotEmitted, //< pos is the first char that has not been processed yet. token: Token::default(), - } - .into() + }) } } diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 830be793d..40af0e520 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,10 +1,9 @@ use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; use std::mem; impl TokenFilter for LowerCaser { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(LowerCaserTokenStream { + fn transform<'a>(&self, token_stream: Box) -> Box { + Box::new(LowerCaserTokenStream { tail: token_stream, buffer: String::with_capacity(100), }) @@ -17,7 +16,7 @@ pub struct LowerCaser; pub struct LowerCaserTokenStream<'a> { buffer: String, - tail: BoxTokenStream<'a>, + tail: Box, } // writes a lowercased version of text into output. diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 7b5772393..76f8e3ccd 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -145,9 +145,7 @@ pub use self::stop_word_filter::StopWordFilter; pub(crate) use self::token_stream_chain::TokenStreamChain; pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString}; -pub use self::tokenizer::{ - BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer, -}; +pub use self::tokenizer::{TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer}; pub use self::tokenizer_manager::TokenizerManager; diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs index 8bd82b79d..b38770abf 100644 --- a/src/tokenizer/ngram_tokenizer.rs +++ b/src/tokenizer/ngram_tokenizer.rs @@ -1,5 +1,4 @@ use super::{Token, TokenStream, Tokenizer}; -use crate::tokenizer::BoxTokenStream; /// Tokenize the text by splitting words into n-grams of the given size(s) /// @@ -131,8 +130,8 @@ pub struct NgramTokenStream<'a> { } impl Tokenizer for NgramTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - From::from(NgramTokenStream { + fn token_stream<'a>(&self, text: &'a str) -> Box { + Box::new(NgramTokenStream { ngram_charidx_iterator: StutteringIterator::new( CodepointFrontiers::for_str(text), self.min_gram, @@ -308,9 +307,9 @@ mod tests { use super::StutteringIterator; use crate::tokenizer::tests::assert_token; use crate::tokenizer::tokenizer::Tokenizer; - use crate::tokenizer::{BoxTokenStream, Token}; + use crate::tokenizer::{Token, TokenStream}; - fn test_helper(mut tokenizer: BoxTokenStream) -> Vec { + fn test_helper(mut tokenizer: Box) -> Vec { let mut tokens: Vec = vec![]; tokenizer.process(&mut |token: &Token| tokens.push(token.clone())); tokens diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 7de93190b..32ad1f958 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,5 +1,4 @@ use super::{Token, TokenStream, Tokenizer}; -use crate::tokenizer::BoxTokenStream; /// For each value of the field, emit a single unprocessed token. #[derive(Clone)] @@ -11,7 +10,7 @@ pub struct RawTokenStream { } impl Tokenizer for RawTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + fn token_stream<'a>(&self, text: &'a str) -> Box { let token = Token { offset_from: 0, offset_to: text.len(), @@ -19,11 +18,10 @@ impl Tokenizer for RawTokenizer { text: text.to_string(), position_length: 1, }; - RawTokenStream { + Box::new(RawTokenStream { token, has_token: true, - } - .into() + }) } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index fb510df7c..1d0e0a0e8 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -13,7 +13,6 @@ //! ``` //! use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; /// `RemoveLongFilter` removes tokens that are longer /// than a given number of bytes (in UTF-8 representation). @@ -39,8 +38,8 @@ impl<'a> RemoveLongFilterStream<'a> { } impl TokenFilter for RemoveLongFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(RemoveLongFilterStream { + fn transform<'a>(&self, token_stream: Box) -> Box { + Box::new(RemoveLongFilterStream { token_length_limit: self.length_limit, tail: token_stream, }) @@ -49,7 +48,7 @@ impl TokenFilter for RemoveLongFilter { pub struct RemoveLongFilterStream<'a> { token_length_limit: usize, - tail: BoxTokenStream<'a>, + tail: Box, } impl<'a> TokenStream for RemoveLongFilterStream<'a> { diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index 7a64071b8..f296623ba 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,4 +1,3 @@ -use super::BoxTokenStream; use super::{Token, TokenStream, Tokenizer}; use std::str::CharIndices; @@ -13,8 +12,8 @@ pub struct SimpleTokenStream<'a> { } impl Tokenizer for SimpleTokenizer { - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { - BoxTokenStream::from(SimpleTokenStream { + fn token_stream<'a>(&self, text: &'a str) -> Box { + Box::new(SimpleTokenStream { text, chars: text.char_indices(), token: Token::default(), diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 8facade1a..06899930d 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -1,5 +1,4 @@ use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; use rust_stemmers::{self, Algorithm}; use serde::{Deserialize, Serialize}; @@ -78,9 +77,9 @@ impl Default for Stemmer { } impl TokenFilter for Stemmer { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { + fn transform<'a>(&self, token_stream: Box) -> Box { let inner_stemmer = rust_stemmers::Stemmer::create(self.stemmer_algorithm); - BoxTokenStream::from(StemmerTokenStream { + Box::new(StemmerTokenStream { tail: token_stream, stemmer: inner_stemmer, }) @@ -88,7 +87,7 @@ impl TokenFilter for Stemmer { } pub struct StemmerTokenStream<'a> { - tail: BoxTokenStream<'a>, + tail: Box, stemmer: rust_stemmers::Stemmer, } diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs index b2551eb48..12c2b12ae 100644 --- a/src/tokenizer/stop_word_filter.rs +++ b/src/tokenizer/stop_word_filter.rs @@ -11,7 +11,6 @@ //! assert!(stream.next().is_none()); //! ``` use super::{Token, TokenFilter, TokenStream}; -use crate::tokenizer::BoxTokenStream; use fnv::FnvHasher; use std::collections::HashSet; use std::hash::BuildHasherDefault; @@ -51,12 +50,12 @@ impl StopWordFilter { pub struct StopWordFilterStream<'a> { words: StopWordHashSet, - tail: BoxTokenStream<'a>, + tail: Box, } impl TokenFilter for StopWordFilter { - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> { - BoxTokenStream::from(StopWordFilterStream { + fn transform<'a>(&self, token_stream: Box) -> Box { + Box::new(StopWordFilterStream { words: self.words.clone(), tail: token_stream, }) diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index c9fb3a7d0..5d7e9ea1f 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -1,16 +1,18 @@ -use crate::tokenizer::{BoxTokenStream, Token, TokenStream}; +use crate::tokenizer::{Token, TokenStream}; const POSITION_GAP: usize = 2; pub(crate) struct TokenStreamChain<'a> { - streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>, + streams_with_offsets: Vec<(Box, usize)>, position_shift: usize, stream_idx: usize, token: Token, } impl<'a> TokenStreamChain<'a> { - pub fn new(streams_with_offsets: Vec<(BoxTokenStream<'a>, usize)>) -> TokenStreamChain<'a> { + pub fn new( + streams_with_offsets: Vec<(Box, usize)>, + ) -> TokenStreamChain<'a> { TokenStreamChain { streams_with_offsets, stream_idx: 0, diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs index b845a6d89..c0fc47282 100644 --- a/src/tokenizer/tokenized_string.rs +++ b/src/tokenizer/tokenized_string.rs @@ -1,4 +1,4 @@ -use crate::tokenizer::{BoxTokenStream, Token, TokenStream, TokenStreamChain}; +use crate::tokenizer::{Token, TokenStream, TokenStreamChain}; use serde::{Deserialize, Serialize}; use std::cmp::Ordering; @@ -42,22 +42,23 @@ impl PreTokenizedStream { /// Creates a TokenStream from PreTokenizedString array pub fn chain_tokenized_strings<'a>( tok_strings: &'a [&'a PreTokenizedString], - ) -> BoxTokenStream { + ) -> Box { if tok_strings.len() == 1 { - PreTokenizedStream::from(tok_strings[0].to_owned()).into() + Box::new(PreTokenizedStream::from(tok_strings[0].to_owned())) } else { let mut streams_with_offsets = vec![]; let mut total_offset = 0; for &tok_string in tok_strings { streams_with_offsets.push(( - PreTokenizedStream::from(tok_string.to_owned()).into(), + Box::new(PreTokenizedStream::from(tok_string.to_owned())) + as Box, total_offset, )); if let Some(last_token) = tok_string.tokens.last() { total_offset += last_token.offset_to; } } - TokenStreamChain::new(streams_with_offsets).into() + Box::new(TokenStreamChain::new(streams_with_offsets)) } } } diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 683fbea51..32ee3279b 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -2,7 +2,6 @@ use crate::tokenizer::TokenStreamChain; use serde::{Deserialize, Serialize}; /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. -use std::ops::{Deref, DerefMut}; /// Token #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] @@ -39,7 +38,7 @@ impl Default for Token { /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially. pub struct TextAnalyzer { tokenizer: Box, - token_filters: Vec, + token_filters: Vec>, } impl From for TextAnalyzer { @@ -49,11 +48,14 @@ impl From for TextAnalyzer { } impl TextAnalyzer { - /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`. + /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box`. /// /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using /// `TextAnalyzer::from(tokenizer)`. - pub fn new(tokenizer: T, token_filters: Vec) -> TextAnalyzer { + pub fn new( + tokenizer: T, + token_filters: Vec>, + ) -> TextAnalyzer { TextAnalyzer { tokenizer: Box::new(tokenizer), token_filters, @@ -76,8 +78,8 @@ impl TextAnalyzer { /// .filter(Stemmer::default()); /// ``` /// - pub fn filter>(mut self, token_filter: F) -> Self { - self.token_filters.push(token_filter.into()); + pub fn filter(mut self, token_filter: F) -> Self { + self.token_filters.push(Box::new(token_filter)); self } @@ -86,7 +88,7 @@ impl TextAnalyzer { /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were /// one concatenated `&str`, with an artificial position gap of `2` between the different fields /// to prevent accidental `PhraseQuery` to match accross two terms. - pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> BoxTokenStream<'a> { + pub fn token_stream_texts<'a>(&self, texts: &'a [&'a str]) -> Box { debug_assert!(!texts.is_empty()); if texts.len() == 1 { self.token_stream(texts[0]) @@ -97,12 +99,12 @@ impl TextAnalyzer { streams_with_offsets.push((self.token_stream(text), total_offset)); total_offset += text.len(); } - From::from(TokenStreamChain::new(streams_with_offsets)) + Box::new(TokenStreamChain::new(streams_with_offsets)) } } /// Creates a token stream for a given `str`. - pub fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> { + pub fn token_stream<'a>(&self, text: &'a str) -> Box { let mut token_stream = self.tokenizer.token_stream(text); for token_filter in &self.token_filters { token_stream = token_filter.transform(token_stream); @@ -134,7 +136,7 @@ impl Clone for TextAnalyzer { /// This API may change to use associated types. pub trait Tokenizer: 'static + Send + Sync + TokenizerClone { /// Creates a token stream for a given `str`. - fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>; + fn token_stream<'a>(&self, text: &'a str) -> Box; } pub trait TokenizerClone { @@ -150,48 +152,7 @@ impl TokenizerClone for T { /// Simple wrapper of `Box`. /// /// See `TokenStream` for more information. -pub struct BoxTokenStream<'a>(Box); - -impl<'a, T> From for BoxTokenStream<'a> -where - T: TokenStream + 'a, -{ - fn from(token_stream: T) -> BoxTokenStream<'a> { - BoxTokenStream(Box::new(token_stream)) - } -} - -impl<'a> Deref for BoxTokenStream<'a> { - type Target = dyn TokenStream + 'a; - - fn deref(&self) -> &Self::Target { - &*self.0 - } -} -impl<'a> DerefMut for BoxTokenStream<'a> { - fn deref_mut(&mut self) -> &mut Self::Target { - &mut *self.0 - } -} - -/// Simple wrapper of `Box`. -/// -/// See `TokenStream` for more information. -pub struct BoxTokenFilter(Box); - -impl Deref for BoxTokenFilter { - type Target = dyn TokenFilter; - - fn deref(&self) -> &dyn TokenFilter { - &*self.0 - } -} - -impl From for BoxTokenFilter { - fn from(tokenizer: T) -> BoxTokenFilter { - BoxTokenFilter(Box::new(tokenizer)) - } -} +// pub struct Box(Box); /// `TokenStream` is the result of the tokenization. /// @@ -272,18 +233,18 @@ pub trait TokenStream { } pub trait TokenFilterClone { - fn box_clone(&self) -> BoxTokenFilter; + fn box_clone(&self) -> Box; } /// Trait for the pluggable components of `Tokenizer`s. pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone { /// Wraps a token stream and returns the modified one. - fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>; + fn transform<'a>(&self, token_stream: Box) -> Box; } impl TokenFilterClone for T { - fn box_clone(&self) -> BoxTokenFilter { - BoxTokenFilter::from(self.clone()) + fn box_clone(&self) -> Box { + Box::new(self.clone()) } }