diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 5b08472dd..24cb6bdad 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -36,28 +36,42 @@ impl Default for Token {
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
-pub struct TextAnalyzer {
-    tokenizer: Box<dyn Tokenizer>,
-    token_filters: Vec<Box<dyn TokenFilter>>,
+#[derive(Clone)]
+pub struct TokenStream<'a, I> {
+    tokens: I,
+    token_filters: Vec<Box<dyn TokenFilter>>,
 }
 
-impl<T: Tokenizer> From<T> for TextAnalyzer {
-    fn from(tokenizer: T) -> Self {
-        TextAnalyzer::new(tokenizer, Vec::new())
+impl<'a, I> Iterator for TokenStream<'a, I>
+where
+    I: Iterator<Item = Token>,
+{
+    type Item = I::Item;
+    fn next(&mut self) -> Option<Self::Item> {
+        while let Some(token) = self.tokens.next() {
+            if self.token_filters.iter().all(|filter| filter(&token)) {
+                return Some(token);
+            }
+        }
+        None
     }
 }
 
-impl TextAnalyzer {
+impl<'a, I> TokenStream<'a, I>
+where
+    I: Iterator<Item = Token>,
+{
     /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
     ///
     /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
     /// `TextAnalyzer::from(tokenizer)`.
-    pub fn new<T: Tokenizer>(
+    pub fn new<T: Tokenizer<'a, Iter = I>>(
         tokenizer: T,
+        text: &'a str,
         token_filters: Vec<Box<dyn TokenFilter>>,
-    ) -> TextAnalyzer {
-        TextAnalyzer {
-            tokenizer: Box::new(tokenizer),
+    ) -> TokenStream<'a, I> {
+        TokenStream {
+            tokens: tokenizer.token_stream(text),
             token_filters,
         }
     }
@@ -83,44 +97,34 @@ impl TextAnalyzer {
         self
     }
 
-    /// Tokenize an array`&str`
-    ///
-    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
-    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
-    /// to prevent accidental `PhraseQuery` to match accross two terms.
-    pub fn token_stream_texts<'a>(&self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
-        debug_assert!(!texts.is_empty());
-        let mut streams_with_offsets = vec![];
-        let mut total_offset = 0;
-        for &text in texts {
-            streams_with_offsets.push((self.token_stream(text), total_offset));
-            total_offset += text.len();
-        }
-        Box::new(TokenStreamChain::new(streams_with_offsets))
-    }
+    // /// Tokenize an array`&str`
+    // ///
+    // /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
+    // /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
+    // /// to prevent accidental `PhraseQuery` to match accross two terms.
 
-    /// Creates a token stream for a given `str`.
-    pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
-        let mut token_stream = self.tokenizer.token_stream(text);
-        for token_filter in &self.token_filters {
-            token_stream = token_filter.transform(token_stream);
-        }
-        token_stream
-    }
+    // /// Creates a token stream for a given `str`.
+    // pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
+    //     let mut token_stream = self.tokenizer.token_stream(text);
+    //     for token_filter in &self.token_filters {
+    //         token_stream = token_filter.transform(token_stream);
+    //     }
+    //     token_stream
+    // }
 }
 
-impl Clone for TextAnalyzer {
-    fn clone(&self) -> Self {
-        TextAnalyzer {
-            tokenizer: self.tokenizer.box_clone(),
-            token_filters: self
-                .token_filters
-                .iter()
-                .map(|token_filter| token_filter.box_clone())
-                .collect(),
-        }
-    }
-}
+// impl<'a,I: Clone> Clone for Tokens<'a,I> {
+//     fn clone(&self) -> Self {
+//         Tokens {
+//             tokenizer: self.tokenizer.box_clone(),
+//             token_filters: self
+//                 .token_filters
+//                 .iter()
+//                 .map(|token_filter| token_filter.box_clone())
+//                 .collect(),
+//         }
+//     }
+// }
 
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
@@ -130,107 +134,27 @@ impl Clone for TextAnalyzer {
 /// # Warning
 ///
 /// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
+    type Iter: Iterator<Item = Token> + 'a;
     /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
-}
-
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
-}
-
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
-        Box::new(self.clone())
-    }
-}
-
-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-///
-pub trait TokenStream {
-    /// Advance to the next token
-    ///
-    /// Returns false if there are no other tokens.
-    fn advance(&mut self) -> bool;
-
-    /// Returns a reference to the current token.
-    fn token(&self) -> &Token;
-
-    /// Returns a mutable reference to the current token.
-    fn token_mut(&mut self) -> &mut Token;
-
-    /// Helper to iterate over tokens. It
-    /// simply combines a call to `.advance()`
-    /// and `.token()`.
-    ///
-    /// ```
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser);
-    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-    /// while let Some(token) = token_stream.next() {
-    ///     println!("Token {:?}", token.text);
-    /// }
-    /// ```
-    fn next(&mut self) -> Option<&Token> {
-        if self.advance() {
-            Some(self.token())
-        } else {
-            None
+    fn token_stream(&self, text: &'a str) -> Self::Iter;
+    fn token_stream_texts(&self, texts: &'a [&str]) -> Self::Iter {
+        debug_assert!(!texts.is_empty());
+        let mut streams_with_offsets = vec![];
+        let mut total_offset = 0;
+        for &text in texts {
+            streams_with_offsets.push((self.token_stream(text), total_offset));
+            total_offset += text.len();
         }
+        TokenStreamChain::new(streams_with_offsets)
     }
-
-    /// Helper function to consume the entire `TokenStream`
-    /// and push the tokens to a sink function.
-    ///
-    /// Remove this.
-    fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
-        let mut num_tokens_pushed = 0u32;
-        while self.advance() {
-            sink(self.token());
-            num_tokens_pushed += 1u32;
-        }
-        num_tokens_pushed
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> Box<dyn TokenFilter>;
-}
+}
 
 /// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
-}
+pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {}
+
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> Box<dyn TokenFilter>;
+}
 
 impl<T: TokenFilter + Clone> TokenFilterClone for T {
@@ -239,24 +163,24 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
     }
 }
 
-#[cfg(test)]
-mod test {
-    use super::Token;
+// #[cfg(test)]
+// mod test {
+//     use super::Token;
 
-    #[test]
-    fn clone() {
-        let t1 = Token {
-            position: 1,
-            offset_from: 2,
-            offset_to: 3,
-            text: "abc".to_string(),
-            position_length: 1,
-        };
-        let t2 = t1.clone();
+//     #[test]
+//     fn clone() {
+//         let t1 = Token {
+//             position: 1,
+//             offset_from: 2,
+//             offset_to: 3,
+//             text: "abc".to_string(),
+//             position_length: 1,
+//         };
+//         let t2 = t1.clone();
 
-        assert_eq!(t1.position, t2.position);
-        assert_eq!(t1.offset_from, t2.offset_from);
-        assert_eq!(t1.offset_to, t2.offset_to);
-        assert_eq!(t1.text, t2.text);
-    }
-}
+//         assert_eq!(t1.position, t2.position);
+//         assert_eq!(t1.offset_from, t2.offset_from);
+//         assert_eq!(t1.offset_to, t2.offset_to);
+//         assert_eq!(t1.text, t2.text);
+//     }
+// }
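
For readers skimming the patch, here is a minimal, self-contained sketch of the design it moves toward: a `Tokenizer` trait parameterized by the input lifetime with an associated `Iter` type, and token filters reduced to plain `Fn(&Token) -> bool` predicates applied lazily by a wrapping iterator. This is an illustration under simplified assumptions, not the patched tantivy API: `Token` is cut down to four fields, and `WhitespaceTokenizer` and `FilteredTokens` are hypothetical stand-ins for `SimpleTokenizer` and the `TokenStream` struct above.

```rust
// Standalone sketch only: simplified stand-ins for the types touched by the
// patch, so the iterator-based flow can be compiled and run in isolation.

#[derive(Debug, Clone)]
pub struct Token {
    pub offset_from: usize,
    pub offset_to: usize,
    pub position: usize,
    pub text: String,
}

/// A tokenizer turns borrowed text into an iterator of owned `Token`s.
pub trait Tokenizer<'a> {
    type Iter: Iterator<Item = Token> + 'a;
    fn token_stream(&self, text: &'a str) -> Self::Iter;
}

/// Toy whitespace tokenizer standing in for something like `SimpleTokenizer`.
#[derive(Clone)]
pub struct WhitespaceTokenizer;

impl<'a> Tokenizer<'a> for WhitespaceTokenizer {
    type Iter = Box<dyn Iterator<Item = Token> + 'a>;

    fn token_stream(&self, text: &'a str) -> Self::Iter {
        Box::new(text.split_whitespace().enumerate().map(move |(position, word)| {
            // `split_whitespace` yields subslices of `text`, so pointer
            // arithmetic recovers the byte offsets of each word.
            let offset_from = word.as_ptr() as usize - text.as_ptr() as usize;
            Token {
                offset_from,
                offset_to: offset_from + word.len(),
                position,
                text: word.to_string(),
            }
        }))
    }
}

/// Token filters become predicates; the wrapper skips tokens that fail any
/// of them, mirroring the `while let` loop in the patched `next()`.
pub struct FilteredTokens<I> {
    tokens: I,
    filters: Vec<Box<dyn Fn(&Token) -> bool>>,
}

impl<I: Iterator<Item = Token>> Iterator for FilteredTokens<I> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        while let Some(token) = self.tokens.next() {
            if self.filters.iter().all(|filter| filter(&token)) {
                return Some(token);
            }
        }
        None
    }
}

fn main() {
    // Keep only short tokens: a predicate-style analogue of RemoveLongFilter.
    let remove_long: Box<dyn Fn(&Token) -> bool> = Box::new(|t: &Token| t.text.len() <= 5);
    let stream = FilteredTokens {
        tokens: WhitespaceTokenizer.token_stream("Hello, happy tax payer"),
        filters: vec![remove_long],
    };
    for token in stream {
        println!("{:?} [{}..{}]", token.text, token.offset_from, token.offset_to);
    }
}
```

The point of the switch is that the whole analysis pipeline becomes ordinary `Iterator` composition, so standard adapters (`map`, `filter`, `take`) and `collect()` apply to token streams directly instead of going through the boxed `TokenStream`/`advance()` protocol.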
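
The doc comment that the patch comments out describes `token_stream_texts` as tokenizing several `&str` fields as if they were one concatenated string, with an artificial position gap of `2` between fields so that a `PhraseQuery` cannot accidentally match across two of them. A rough sketch of that bookkeeping in the new iterator style follows; `POSITION_GAP`, `chain_token_streams`, and `chain_example` are illustrative names only (the patch itself defers to `TokenStreamChain`), and `Token` and `WhitespaceTokenizer` are reused from the sketch above.

```rust
const POSITION_GAP: usize = 2;

/// Illustrative, eager version of the chaining: the real `TokenStreamChain`
/// is lazy, but the offset and position bookkeeping is the same idea.
fn chain_token_streams<I>(streams_with_offsets: Vec<(I, usize)>) -> impl Iterator<Item = Token>
where
    I: Iterator<Item = Token>,
{
    let mut chained = Vec::new();
    let mut position_base = 0;
    for (stream, byte_offset) in streams_with_offsets {
        let mut last_position = position_base;
        for mut token in stream {
            // Shift byte offsets into the virtual concatenated text and
            // positions past the previous field plus the artificial gap.
            token.offset_from += byte_offset;
            token.offset_to += byte_offset;
            token.position += position_base;
            last_position = token.position;
            chained.push(token);
        }
        position_base = last_position + POSITION_GAP;
    }
    chained.into_iter()
}

fn chain_example() {
    let fields = ["red apple", "green pear"];
    let mut streams_with_offsets = Vec::new();
    let mut total_offset = 0;
    for &field in &fields {
        streams_with_offsets.push((WhitespaceTokenizer.token_stream(field), total_offset));
        total_offset += field.len();
    }
    for token in chain_token_streams(streams_with_offsets) {
        // "apple" ends at position 1; "green" starts at position 3 (gap of 2),
        // so no phrase can span the boundary between the two fields.
        println!("{} -> position {}", token.text, token.position);
    }
}
```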