src/tokenizer/mod.rs
@@ -131,6 +131,7 @@ mod token_stream_chain;
 mod tokenized_string;
 mod tokenizer;
 mod tokenizer_manager;
+mod whitespace_tokenizer;
 
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
@@ -143,6 +144,7 @@ pub use self::simple_tokenizer::SimpleTokenizer;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub(crate) use self::token_stream_chain::TokenStreamChain;
+pub use self::whitespace_tokenizer::WhitespaceTokenizer;
 
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{
@@ -277,4 +279,25 @@ pub mod tests {
             assert!(tokens.is_empty());
         }
     }
+
+    #[test]
+    fn test_whitespace_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        let ws_tokenizer = tokenizer_manager.get("whitespace").unwrap();
+        let mut tokens: Vec<Token> = vec![];
+        {
+            let mut add_token = |token: &Token| {
+                tokens.push(token.clone());
+            };
+            ws_tokenizer
+                .token_stream("Hello, happy tax payer!")
+                .process(&mut add_token);
+        }
+
+        assert_eq!(tokens.len(), 4);
+        assert_token(&tokens[0], 0, "Hello,", 0, 6);
+        assert_token(&tokens[1], 1, "happy", 7, 12);
+        assert_token(&tokens[2], 2, "tax", 13, 16);
+        assert_token(&tokens[3], 3, "payer!", 17, 23);
+    }
 }
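For contrast with the test above: the pre-registered "default" pipeline (SimpleTokenizer plus RemoveLongFilter and LowerCaser in TokenizerManager::default(); only the "en_stem" chain is visible in the hunk below) strips the punctuation that the whitespace tokenizer keeps. A rough sketch, assuming tantivy is consumed as a dependency; the exact "default" filter chain is an assumption not shown in this diff:

use tantivy::tokenizer::{TokenStream, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();
    let analyzer = manager.get("default").unwrap();
    let mut stream = analyzer.token_stream("Hello, happy tax payer!");
    let mut texts: Vec<String> = Vec::new();
    while stream.advance() {
        texts.push(stream.token().text.clone());
    }
    // Punctuation is split away and text lowercased, unlike "whitespace",
    // which yields "Hello," and "payer!" verbatim.
    assert_eq!(texts, vec!["hello", "happy", "tax", "payer"]);
}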
src/tokenizer/tokenizer_manager.rs
@@ -5,6 +5,7 @@ use crate::tokenizer::RawTokenizer;
 use crate::tokenizer::RemoveLongFilter;
 use crate::tokenizer::SimpleTokenizer;
 use crate::tokenizer::Stemmer;
+use crate::tokenizer::WhitespaceTokenizer;
 use std::collections::HashMap;
 use std::sync::{Arc, RwLock};
 
@@ -72,6 +73,7 @@ impl Default for TokenizerManager {
                 .filter(LowerCaser)
                 .filter(Stemmer::new(Language::English)),
        );
+        manager.register("whitespace", WhitespaceTokenizer);
         manager
     }
 }
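Once registered under the name "whitespace", the tokenizer can be referenced from a schema by name. A minimal sketch, assuming tantivy's usual schema API (TextFieldIndexing::set_tokenizer); this wiring is not part of the diff itself:

use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, TextOptions};

fn main() {
    // Text fields indexed with these options are cut by the "whitespace"
    // tokenizer registered in TokenizerManager::default() above.
    let indexing = TextFieldIndexing::default()
        .set_tokenizer("whitespace")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let options = TextOptions::default().set_indexing_options(indexing);

    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", options);
    let _schema = schema_builder.build();
}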
src/tokenizer/whitespace_tokenizer.rs (new file, 59 lines)
@@ -0,0 +1,59 @@
+use super::BoxTokenStream;
+use super::{Token, TokenStream, Tokenizer};
+use std::str::CharIndices;
+
+/// Tokenize the text by splitting on whitespaces.
+#[derive(Clone)]
+pub struct WhitespaceTokenizer;
+
+pub struct WhitespaceTokenStream<'a> {
+    text: &'a str,
+    chars: CharIndices<'a>,
+    token: Token,
+}
+
+impl Tokenizer for WhitespaceTokenizer {
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
+        BoxTokenStream::from(WhitespaceTokenStream {
+            text,
+            chars: text.char_indices(),
+            token: Token::default(),
+        })
+    }
+}
+
+impl<'a> WhitespaceTokenStream<'a> {
+    // search for the end of the current token.
+    fn search_token_end(&mut self) -> usize {
+        (&mut self.chars)
+            .filter(|&(_, ref c)| c.is_ascii_whitespace())
+            .map(|(offset, _)| offset)
+            .next()
+            .unwrap_or_else(|| self.text.len())
+    }
+}
+
+impl<'a> TokenStream for WhitespaceTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.token.text.clear();
+        self.token.position = self.token.position.wrapping_add(1);
+        while let Some((offset_from, c)) = self.chars.next() {
+            if !c.is_ascii_whitespace() {
+                let offset_to = self.search_token_end();
+                self.token.offset_from = offset_from;
+                self.token.offset_to = offset_to;
+                self.token.text.push_str(&self.text[offset_from..offset_to]);
+                return true;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        &mut self.token
+    }
+}
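A note on the implementation above: search_token_end drains the shared CharIndices iterator up to and including the next whitespace character and returns that character's byte offset (or text.len() at end of input), so advance never re-scans a byte. Offsets are byte offsets into the original text, and since the check is is_ascii_whitespace, non-ASCII whitespace such as U+00A0 does not split tokens. Below is a minimal sketch of driving the stream by hand rather than through process; it assumes tantivy is consumed as a dependency (inside the crate the paths would be crate::tokenizer::...):

use tantivy::tokenizer::{Token, TokenStream, Tokenizer, WhitespaceTokenizer};

fn main() {
    let mut stream = WhitespaceTokenizer.token_stream("Hello, happy tax payer!");
    while stream.advance() {
        let token: &Token = stream.token();
        // Byte offsets into the original text; positions count from 0
        // (Token::default() starts at usize::MAX and wrapping_add rolls it over).
        println!("{} {}..{} @{}", token.text, token.offset_from, token.offset_to, token.position);
    }
}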