diff --git a/Cargo.toml b/Cargo.toml
index d20c05e75..32038fb53 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -22,7 +22,7 @@
 tempdir = "0.3.4"
 bincode = "0.4.0"
 serde = "0.6.11"
 libc = "0.2.6"
-lz4 = "*"
+lz4 = "1.13.131"
 [build-dependencies]
 gcc = "0.3.24"
diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs
index 75f6460cb..d70bad598 100644
--- a/src/core/analyzer.rs
+++ b/src/core/analyzer.rs
@@ -1,50 +1,30 @@
 extern crate regex;
 
-use self::regex::Regex;
-use std::cell::RefCell;
 use std::str::Chars;
 
-lazy_static! {
-    static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
-}
-
 pub struct TokenIter<'a> {
     chars: Chars<'a>,
+    term_buffer: String,
 }
-
-fn append_char(c: char, term_buffer: &mut String) {
+fn append_char_lowercase(c: char, term_buffer: &mut String) {
     for c_lower in c.to_lowercase() {
         term_buffer.push(c_lower);
     }
 }
 
-impl<'a> TokenIter<'a> {
+pub trait StreamingIterator<'a, T> {
+    fn next(&'a mut self) -> Option<T>;
+}
 
-    pub fn read_one(&mut self, term_buffer: &mut String) -> bool {
-        term_buffer.clear();
+impl<'a, 'b> TokenIter<'b> {
+    fn consume_token(&'a mut self) -> Option<&'a str> {
         loop {
             match self.chars.next() {
                 Some(c) => {
                     if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
-                        break;
-                    }
-                    else {
-                        break;
-                    }
-                },
-                None => {
-                    return false;
-                }
-            }
-        }
-        loop {
-            match self.chars.next() {
-                Some(c) => {
-                    if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
+                        append_char_lowercase(c, &mut self.term_buffer);
                     }
                     else {
                         break;
@@ -55,7 +35,27 @@ impl<'a> TokenIter<'a> {
                 }
             }
         }
-        return true;
+        return Some(&self.term_buffer);
     }
 }
 
+
+impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
+
+    fn next(&'a mut self,) -> Option<&'a str> {
+        self.term_buffer.clear();
+        // skipping non-letter characters.
+        loop {
+            match self.chars.next() {
+                Some(c) => {
+                    if c.is_alphanumeric() {
+                        append_char_lowercase(c, &mut self.term_buffer);
+                        return self.consume_token();
+                    }
+                }
+                None => { return None; }
+            }
+        }
+    }
 }
 
@@ -68,8 +68,22 @@ impl SimpleTokenizer {
     }
 
     pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
-        TokenIter {
+        TokenIter {
+            term_buffer: String::new(),
             chars: text.chars(),
-        }
+        }
     }
 }
+
+
+#[test]
+fn test_tokenizer() {
+    let simple_tokenizer = SimpleTokenizer::new();
+    let mut term_buffer = String::new();
+    let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
+    assert_eq!(term_reader.next().unwrap(), "hello");
+    assert_eq!(term_reader.next().unwrap(), "happy");
+    assert_eq!(term_reader.next().unwrap(), "tax");
+    assert_eq!(term_reader.next().unwrap(), "payer");
+    assert_eq!(term_reader.next(), None);
+}
diff --git a/src/core/serialize.rs b/src/core/serialize.rs
index 8a6108cfc..fc60a1805 100644
--- a/src/core/serialize.rs
+++ b/src/core/serialize.rs
@@ -113,7 +113,7 @@ fn test_serialize_u8() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 2);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u8::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u8::deserialize(&mut cursor).unwrap());
     assert!(u8::deserialize(&mut cursor).is_err());
@@ -133,7 +133,7 @@ fn test_serialize_u32() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 8);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u32::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u32::deserialize(&mut cursor).unwrap());
     assert!(u32::deserialize(&mut cursor).is_err());
@@ -154,7 +154,7 @@ fn test_serialize_string() {
         assert_eq!(x.serialize(&mut buffer).unwrap(), second_length);
         assert_eq!(buffer.len(), first_length + second_length);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap());
     assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap());
     assert!(u32::deserialize(&mut cursor).is_err());
@@ -167,7 +167,7 @@ fn test_serialize_vec() {
     let second_length = 4 + 3 * 8;
     let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。"));
     assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4);
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     {
         let deser: Vec<String> = Vec::deserialize(&mut cursor).unwrap();
         assert_eq!(deser.len(), 2);
diff --git a/src/core/writer.rs b/src/core/writer.rs
index e9767fa4e..bca453226 100644
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -14,6 +14,7 @@ use std::sync::Arc;
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
+use core::analyzer::StreamingIterator;
 use core::serial::*;
 use core::error::*;
 use std::cell::RefCell;
@@ -151,10 +152,15 @@ impl SegmentWriter {
             let field_options = schema.get_field(&field_value.field);
             if field_options.is_tokenized_indexed() {
                 let mut tokens = self.tokenizer.tokenize(&field_value.text);
-                while tokens.read_one(&mut term_buffer) {
-                    let term = Term::from_field_text(&field_value.field, term_buffer.as_ref());
-                    self.suscribe(doc_id, term);
-                    self.num_tokens += 1;
+                loop {
+                    match tokens.next() {
+                        Some(token) => {
+                            let term = Term::from_field_text(&field_value.field, token);
+                            self.suscribe(doc_id, term);
+                            self.num_tokens += 1;
+                        },
+                        None => { break; }
+                    }
                 }
             }
         }
diff --git a/tests/core.rs b/tests/core.rs
index dbd824c24..61f00645e 100644
--- a/tests/core.rs
+++ b/tests/core.rs
@@ -51,20 +51,6 @@ fn test_intersection() {
     }
 }
 
-#[test]
-fn test_tokenizer() {
-    let simple_tokenizer = SimpleTokenizer::new();
-    let mut term_buffer = String::new();
-    let mut term_reader = simple_tokenizer.tokenize("hello happy tax payer!");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "hello");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "happy");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "tax");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "payer");
-}
 
 #[test]
 fn test_indexing() {
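
Note (not part of the patch): below is a minimal, self-contained sketch of the lending-iterator pattern this diff introduces in `src/core/analyzer.rs`. Only the `StreamingIterator` trait shape comes from the patch; the `Tokens` struct and `main` are illustrative assumptions, not the crate's actual code. The point of the pattern is that `next()` hands out a `&str` borrowing the iterator's own `term_buffer`, so a single `String` allocation is reused for every token instead of allocating per token.

```rust
use std::str::Chars;

// A lending ("streaming") iterator: the item returned by `next()` borrows
// from the iterator itself, which `std::iter::Iterator` cannot express.
// Trait shape taken from the diff above; everything else is a sketch.
pub trait StreamingIterator<'a, T> {
    fn next(&'a mut self) -> Option<T>;
}

pub struct Tokens<'t> {
    chars: Chars<'t>,
    term_buffer: String, // reused across all tokens
}

impl<'t> Tokens<'t> {
    pub fn new(text: &'t str) -> Tokens<'t> {
        Tokens {
            chars: text.chars(),
            term_buffer: String::new(),
        }
    }
}

impl<'a, 't> StreamingIterator<'a, &'a str> for Tokens<'t> {
    fn next(&'a mut self) -> Option<&'a str> {
        self.term_buffer.clear();
        // Skip separators; return None once the input is exhausted.
        let first = self.chars.find(|c| c.is_alphanumeric())?;
        for lower in first.to_lowercase() {
            self.term_buffer.push(lower);
        }
        // Accumulate the rest of the token, lowercasing as we go.
        while let Some(c) = self.chars.next() {
            if !c.is_alphanumeric() {
                break;
            }
            for lower in c.to_lowercase() {
                self.term_buffer.push(lower);
            }
        }
        Some(self.term_buffer.as_str())
    }
}

fn main() {
    let mut tokens = Tokens::new("Hello, happy tax payer!");
    // Each token borrows `term_buffer`, so it must be dropped before the
    // next call to `next()`; the borrow checker enforces exactly that.
    while let Some(token) = tokens.next() {
        println!("{}", token); // prints: hello happy tax payer
    }
}
```

The `&'a mut self` receiver ties the returned `&'a str` to the borrow of the iterator itself, which is why this type cannot implement `std::iter::Iterator` directly, and why the patch replaces the old `read_one(&mut String)` API with this trait in `writer.rs`.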