mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-29 14:40:40 +00:00
added travis ci conf.
This commit is contained in:
95
src/analyzer/mod.rs
Normal file
95
src/analyzer/mod.rs
Normal file
@@ -0,0 +1,95 @@
|
||||
extern crate regex;
|
||||
|
||||
use std::str::Chars;
|
||||
|
||||
pub struct TokenIter<'a> {
|
||||
chars: Chars<'a>,
|
||||
term_buffer: String,
|
||||
}
|
||||
|
||||
fn append_char_lowercase(c: char, term_buffer: &mut String) {
|
||||
for c_lower in c.to_lowercase() {
|
||||
term_buffer.push(c_lower);
|
||||
}
|
||||
}
|
||||
|
||||
pub trait StreamingIterator<'a, T> {
|
||||
fn next(&'a mut self) -> Option<T>;
|
||||
}
|
||||
|
||||
impl<'a, 'b> TokenIter<'b> {
|
||||
fn consume_token(&'a mut self) -> Option<&'a str> {
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
}
|
||||
else {
|
||||
break;
|
||||
}
|
||||
},
|
||||
None => {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
return Some(&self.term_buffer);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
|
||||
|
||||
fn next(&'a mut self,) -> Option<&'a str> {
|
||||
self.term_buffer.clear();
|
||||
// skipping non-letter characters.
|
||||
loop {
|
||||
match self.chars.next() {
|
||||
Some(c) => {
|
||||
if c.is_alphanumeric() {
|
||||
append_char_lowercase(c, &mut self.term_buffer);
|
||||
return self.consume_token();
|
||||
}
|
||||
}
|
||||
None => { return None; }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SimpleTokenizer;
|
||||
|
||||
|
||||
impl SimpleTokenizer {
|
||||
pub fn new() -> SimpleTokenizer {
|
||||
SimpleTokenizer
|
||||
}
|
||||
|
||||
pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
|
||||
TokenIter {
|
||||
term_buffer: String::new(),
|
||||
chars: text.chars(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer() {
|
||||
let simple_tokenizer = SimpleTokenizer::new();
|
||||
let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
|
||||
assert_eq!(term_reader.next().unwrap(), "hello");
|
||||
assert_eq!(term_reader.next().unwrap(), "happy");
|
||||
assert_eq!(term_reader.next().unwrap(), "tax");
|
||||
assert_eq!(term_reader.next().unwrap(), "payer");
|
||||
assert_eq!(term_reader.next(), None);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let simple_tokenizer = SimpleTokenizer::new();
|
||||
let mut term_reader = simple_tokenizer.tokenize("");
|
||||
assert_eq!(term_reader.next(), None);
|
||||
}
|
||||
Reference in New Issue
Block a user