Paul Masurel
2016-02-20 20:13:14 +09:00
parent 1215049adf
commit 107d3c0244
5 changed files with 60 additions and 54 deletions

View File

@@ -22,7 +22,7 @@ tempdir = "0.3.4"
 bincode = "0.4.0"
 serde = "0.6.11"
 libc = "0.2.6"
-lz4 = "*"
+lz4 = "1.13.131"
 
 [build-dependencies]
 gcc = "0.3.24"

View File

@@ -1,50 +1,30 @@
 extern crate regex;
 use self::regex::Regex;
-use std::cell::RefCell;
 use std::str::Chars;
 
 lazy_static! {
     static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
 }
 
 pub struct TokenIter<'a> {
     chars: Chars<'a>,
+    term_buffer: String,
 }
 
-fn append_char(c: char, term_buffer: &mut String) {
+fn append_char_lowercase(c: char, term_buffer: &mut String) {
     for c_lower in c.to_lowercase() {
         term_buffer.push(c_lower);
     }
 }
 
-impl<'a> TokenIter<'a> {
+pub trait StreamingIterator<'a, T> {
+    fn next(&'a mut self) -> Option<T>;
+}
 
-    pub fn read_one(&mut self, term_buffer: &mut String) -> bool {
-        term_buffer.clear();
+impl<'a, 'b> TokenIter<'b> {
+    fn consume_token(&'a mut self) -> Option<&'a str> {
         loop {
             match self.chars.next() {
                 Some(c) => {
                     if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
-                        break;
-                    }
-                    else {
-                        break;
-                    }
-                },
-                None => {
-                    return false;
-                }
-            }
-        }
-        loop {
-            match self.chars.next() {
-                Some(c) => {
-                    if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
+                        append_char_lowercase(c, &mut self.term_buffer);
                     }
                     else {
                         break;
@@ -55,7 +35,27 @@ impl<'a> TokenIter<'a> {
                 }
             }
         }
-        return true;
+        return Some(&self.term_buffer);
     }
 }
+
+impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
+    fn next(&'a mut self,) -> Option<&'a str> {
+        self.term_buffer.clear();
+        // skipping non-letter characters.
+        loop {
+            match self.chars.next() {
+                Some(c) => {
+                    if c.is_alphanumeric() {
+                        append_char_lowercase(c, &mut self.term_buffer);
+                        return self.consume_token();
+                    }
+                }
+                None => { return None; }
+            }
+        }
+    }
+}
@@ -68,8 +68,22 @@ impl SimpleTokenizer {
     }
 
     pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
         TokenIter {
+            term_buffer: String::new(),
             chars: text.chars(),
         }
     }
 }
+
+#[test]
+fn test_tokenizer() {
+    let simple_tokenizer = SimpleTokenizer::new();
+    let mut term_buffer = String::new();
+    let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
+    assert_eq!(term_reader.next().unwrap(), "hello");
+    assert_eq!(term_reader.next().unwrap(), "happy");
+    assert_eq!(term_reader.next().unwrap(), "tax");
+    assert_eq!(term_reader.next().unwrap(), "payer");
+    assert_eq!(term_reader.next(), None);
+}
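
The point of the new StreamingIterator trait is that std::iter::Iterator cannot yield items that borrow from the iterator itself: each call to next() here refills the tokenizer's internal term_buffer, so the returned &str is only valid until the following call. A minimal usage sketch (hypothetical consumer code, not part of the commit; it assumes SimpleTokenizer and StreamingIterator are in scope and a compiler with non-lexical lifetimes):

fn collect_tokens(text: &str) -> Vec<String> {
    let tokenizer = SimpleTokenizer::new();
    let mut tokens = tokenizer.tokenize(text);
    let mut out = Vec::new();
    while let Some(token) = tokens.next() {
        // `token` borrows tokens.term_buffer, so copy it out before
        // the next call to next() overwrites the buffer.
        out.push(token.to_string());
    }
    out
}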

View File

@@ -113,7 +113,7 @@ fn test_serialize_u8() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 2);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u8::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u8::deserialize(&mut cursor).unwrap());
     assert!(u8::deserialize(&mut cursor).is_err());
@@ -133,7 +133,7 @@ fn test_serialize_u32() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 8);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u32::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u32::deserialize(&mut cursor).unwrap());
     assert!(u32::deserialize(&mut cursor).is_err());
@@ -154,7 +154,7 @@ fn test_serialize_string() {
         assert_eq!(x.serialize(&mut buffer).unwrap(), second_length);
         assert_eq!(buffer.len(), first_length + second_length);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap());
     assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap());
     assert!(u32::deserialize(&mut cursor).is_err());
@@ -167,7 +167,7 @@ fn test_serialize_vec() {
     let second_length = 4 + 3 * 8;
     let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。"));
     assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4);
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     {
         let deser: Vec<String> = Vec::deserialize(&mut cursor).unwrap();
         assert_eq!(deser.len(), 2);
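
All four edits in this file are the same fix: buffer is a Vec<u8>, so &buffer is a &Vec<u8>, which the std of the day could not read through a Cursor (Read was then implemented for Cursor<&[u8]>, Cursor<Vec<u8>>, and Cursor<Box<[u8]>>, but not for Cursor<&Vec<u8>>); &buffer[..] coerces the vector to a &[u8] slice first. A standalone sketch of the pattern (illustrative only, runs on current std):

use std::io::{Cursor, Read};

fn main() {
    let buffer: Vec<u8> = vec![3, 5];
    // `&buffer[..]` is a `&[u8]`, and Cursor<&[u8]> implements Read.
    let mut cursor = Cursor::new(&buffer[..]);
    let mut byte = [0u8; 1];
    cursor.read_exact(&mut byte).unwrap();
    assert_eq!(byte[0], 3);
}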

View File

@@ -14,6 +14,7 @@ use std::sync::Arc;
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
+use core::analyzer::StreamingIterator;
 use core::serial::*;
 use core::error::*;
 use std::cell::RefCell;
@@ -151,10 +152,15 @@ impl SegmentWriter {
         let field_options = schema.get_field(&field_value.field);
         if field_options.is_tokenized_indexed() {
             let mut tokens = self.tokenizer.tokenize(&field_value.text);
-            while tokens.read_one(&mut term_buffer) {
-                let term = Term::from_field_text(&field_value.field, term_buffer.as_ref());
-                self.suscribe(doc_id, term);
-                self.num_tokens += 1;
+            loop {
+                match tokens.next() {
+                    Some(token) => {
+                        let term = Term::from_field_text(&field_value.field, token);
+                        self.suscribe(doc_id, term);
+                        self.num_tokens += 1;
+                    },
+                    None => { break; }
+                }
             }
         }
     }
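
With the buffer now owned by TokenIter, the writer no longer threads a term_buffer through every call; each borrowed token is turned into a Term before the next call to next() reuses the buffer. The committed loop/match is a spelled-out while let; an equivalent sketch (a fragment, assuming the surrounding SegmentWriter method as context, not the committed code):

while let Some(token) = tokens.next() {
    let term = Term::from_field_text(&field_value.field, token);
    self.suscribe(doc_id, term); // `suscribe` [sic] is the method's actual name
    self.num_tokens += 1;
}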

View File

@@ -51,20 +51,6 @@ fn test_intersection() {
     }
 }
 
-#[test]
-fn test_tokenizer() {
-    let simple_tokenizer = SimpleTokenizer::new();
-    let mut term_buffer = String::new();
-    let mut term_reader = simple_tokenizer.tokenize("hello happy tax payer!");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "hello");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "happy");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "tax");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "payer");
-}
 
 #[test]
 fn test_indexing() {