diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs
new file mode 100644
index 000000000..2e009e45d
--- /dev/null
+++ b/src/tokenizer/alphanum_only.rs
@@ -0,0 +1,71 @@
+//! Token filter that keeps only purely ASCII-alphanumeric tokens.
+
+use super::{Token, TokenFilter, TokenStream};
+
+/// `TokenFilter` that drops every token whose text contains a character
+/// that is not an ASCII letter or digit.
+///
+/// A token with empty text contains no offending character and is kept.
+#[derive(Clone)]
+pub struct AlphaNumOnlyFilter;
+
+/// Token stream produced by `AlphaNumOnlyFilter`; wraps the upstream
+/// stream and skips the tokens rejected by the filter.
+pub struct AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    tail: TailTokenStream,
+}
+
+impl<TailTokenStream> AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    /// Returns `true` when every character of `token.text` is ASCII
+    /// alphanumeric (vacuously `true` for empty text).
+    fn predicate(&self, token: &Token) -> bool {
+        token.text.chars().all(|c| c.is_ascii_alphanumeric())
+    }
+
+    /// Wraps `tail` so that only accepted tokens are exposed.
+    fn wrap(tail: TailTokenStream) -> AlphaNumOnlyFilterStream<TailTokenStream> {
+        AlphaNumOnlyFilterStream { tail }
+    }
+}
+
+impl<TailTokenStream> TokenFilter<TailTokenStream> for AlphaNumOnlyFilter
+where
+    TailTokenStream: TokenStream,
+{
+    type ResultTokenStream = AlphaNumOnlyFilterStream<TailTokenStream>;
+
+    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
+        AlphaNumOnlyFilterStream::wrap(token_stream)
+    }
+}
+
+impl<TailTokenStream> TokenStream for AlphaNumOnlyFilterStream<TailTokenStream>
+where
+    TailTokenStream: TokenStream,
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    /// Advances the underlying stream until it yields an accepted token.
+    ///
+    /// Returns `false` once the underlying stream is exhausted.
+    fn advance(&mut self) -> bool {
+        while self.tail.advance() {
+            if self.predicate(self.tail.token()) {
+                return true;
+            }
+        }
+        false
+    }
+}
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index cdcd2346e..7557a0b91 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -137,7 +137,9 @@ mod tokenizer_manager;
 mod japanese_tokenizer;
 mod token_stream_chain;
 mod raw_tokenizer;
+mod alphanum_only;
 
+pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::tokenizer::{Token, TokenFilter, TokenStream, Tokenizer};
 pub use self::tokenizer::BoxedTokenizer;
 pub use self::tokenizer_manager::TokenizerManager;