diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 7b5772393..aa385cbc1 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -131,6 +131,7 @@ mod token_stream_chain;
 mod tokenized_string;
 mod tokenizer;
 mod tokenizer_manager;
+mod whitespace_tokenizer;
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -143,6 +144,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
+pub use self::whitespace_tokenizer::WhitespaceTokenizer;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 
 pub use self::tokenizer::{
@@ -277,4 +279,25 @@ pub mod tests {
             assert!(tokens.is_empty());
         }
     }
+
+    #[test]
+    fn test_whitespace_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            ws_tokenizer
+                .token_stream("Hello, happy tax payer!")
+                .process(&mut add_token);
+        }
+
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "Hello,", 0, 6);
+        assert_token(&tokens[1], 1, "happy", 7, 12);
+        assert_token(&tokens[2], 2, "tax", 13, 16);
+        assert_token(&tokens[3], 3, "payer!", 17, 23);
+    }
 }
diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index 89cf2407a..07f2e7ae1 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -5,6 +5,7 @@ use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::RemoveLongFilter;
 use crate::tokenizer::SimpleTokenizer;
 use crate::tokenizer::Stemmer;
+use crate::tokenizer::WhitespaceTokenizer;
 use std::collections::HashMap;
 use std::sync::{Arc, RwLock};
 
@@ -72,6 +73,7 @@ impl Default for TokenizerManager {
                 .filter(LowerCaser)
                 .filter(Stemmer::new(Language::English)),
         );
+        manager.register("whitespace", WhitespaceTokenizer);
         manager
     }
 }
diff --git a/src/tokenizer/whitespace_tokenizer.rs b/src/tokenizer/whitespace_tokenizer.rs
new file mode 100644
index 000000000..08d0003e1
--- /dev/null
+++ b/src/tokenizer/whitespace_tokenizer.rs
@@ -0,0 +1,59 @@
+use super::BoxTokenStream;
+use super::{Token, TokenStream, Tokenizer};
+use std::str::CharIndices;
+
+/// Tokenize the text by splitting on whitespaces.
+#[derive(Clone)]
+pub struct WhitespaceTokenizer;
+
+pub struct WhitespaceTokenStream<'a> {
+    text: &'a str,
+    chars: CharIndices<'a>,
+    token: Token,
+}
+
+impl Tokenizer for WhitespaceTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(WhitespaceTokenStream {
+            text,
+            chars: text.char_indices(),
+            token: Token::default(),
+        })
+    }
+}
+
+impl<'a> WhitespaceTokenStream<'a> {
+    // search for the end of the current token.
+    fn search_token_end(&mut self) -> usize {
+        (&mut self.chars)
+            .filter(|&(_, ref c)| c.is_ascii_whitespace())
+            .map(|(offset, _)| offset)
+            .next()
+            .unwrap_or_else(|| self.text.len())
+    }
+}
+
+impl<'a> TokenStream for WhitespaceTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.token.text.clear();
+        self.token.position = self.token.position.wrapping_add(1);
+        while let Some((offset_from, c)) = self.chars.next() {
+            if !c.is_ascii_whitespace() {
+                let offset_to = self.search_token_end();
+                self.token.offset_from = offset_from;
+                self.token.offset_to = offset_to;
+                self.token.text.push_str(&self.text[offset_from..offset_to]);
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
+}
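For context, a minimal sketch of how the newly registered "whitespace" tokenizer can be exercised from application code, mirroring the test added in this patch. It assumes the public tantivy::tokenizer re-exports shown in the mod.rs hunk; the input string and variable names are illustrative only, not part of the change.

use tantivy::tokenizer::{Token, TokenizerManager};

fn main() {
    // "whitespace" is registered by TokenizerManager::default() in this patch.
    let manager = TokenizerManager::default();
    let tokenizer = manager.get("whitespace").unwrap();

    let mut tokens: Vec<Token> = Vec::new();
    {
        // Collect every token emitted for the input string.
        let mut add_token = |token: &Token| tokens.push(token.clone());
        tokenizer
            .token_stream("Hello, happy tax payer!")
            .process(&mut add_token);
    }

    // The stream splits only on whitespace, so punctuation stays attached.
    let texts: Vec<&str> = tokens.iter().map(|t| t.text.as_str()).collect();
    assert_eq!(texts, vec!["Hello,", "happy", "tax", "payer!"]);
}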