mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-06 01:02:55 +00:00)

Commit: werwer
@@ -22,7 +22,7 @@ tempdir = "0.3.4"
 bincode = "0.4.0"
 serde = "0.6.11"
 libc = "0.2.6"
-lz4 = "*"
+lz4 = "1.13.131"
 
 [build-dependencies]
 gcc = "0.3.24"
@@ -1,50 +1,30 @@
 extern crate regex;
 
 use self::regex::Regex;
 use std::cell::RefCell;
 use std::str::Chars;
 
 lazy_static! {
     static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
 }
 
 pub struct TokenIter<'a> {
     chars: Chars<'a>,
     term_buffer: String,
 }
 
 
-fn append_char(c: char, term_buffer: &mut String) {
+fn append_char_lowercase(c: char, term_buffer: &mut String) {
     for c_lower in c.to_lowercase() {
         term_buffer.push(c_lower);
     }
 }
 
-impl<'a> TokenIter<'a> {
+pub trait StreamingIterator<'a, T> {
+    fn next(&'a mut self) -> Option<T>;
+}
 
 
-    pub fn read_one(&mut self, term_buffer: &mut String) -> bool {
-        term_buffer.clear();
-        loop {
-            match self.chars.next() {
-                Some(c) => {
-                    if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
-                        break;
-                    }
-                    else {
-                        break;
-                    }
-                },
-                None => {
-                    return false;
-                }
-            }
-        }
+impl<'a, 'b> TokenIter<'b> {
+
+    fn consume_token(&'a mut self) -> Option<&'a str> {
         loop {
             match self.chars.next() {
                 Some(c) => {
                     if c.is_alphanumeric() {
-                        append_char(c, term_buffer);
+                        append_char_lowercase(c, &mut self.term_buffer);
                     }
                     else {
                         break;
@@ -55,7 +35,27 @@ impl<'a> TokenIter<'a> {
                 }
             }
         }
-        return true;
+        return Some(&self.term_buffer);
     }
 }
 
+
+impl<'a, 'b> StreamingIterator<'a, &'a str> for TokenIter<'b> {
+
+    fn next(&'a mut self,) -> Option<&'a str> {
+        self.term_buffer.clear();
+        // skipping non-letter characters.
+        loop {
+            match self.chars.next() {
+                Some(c) => {
+                    if c.is_alphanumeric() {
+                        append_char_lowercase(c, &mut self.term_buffer);
+                        return self.consume_token();
+                    }
+                }
+                None => { return None; }
+            }
+        }
+    }
+}
@@ -68,8 +68,22 @@ impl SimpleTokenizer {
     }
 
     pub fn tokenize<'a>(&self, text: &'a str) -> TokenIter<'a> {
         TokenIter {
             term_buffer: String::new(),
             chars: text.chars(),
         }
     }
 }
+
+
+#[test]
+fn test_tokenizer() {
+    let simple_tokenizer = SimpleTokenizer::new();
+    let mut term_buffer = String::new();
+    let mut term_reader = simple_tokenizer.tokenize("hello, happy tax payer!");
+    assert_eq!(term_reader.next().unwrap(), "hello");
+    assert_eq!(term_reader.next().unwrap(), "happy");
+    assert_eq!(term_reader.next().unwrap(), "tax");
+    assert_eq!(term_reader.next().unwrap(), "payer");
+    assert_eq!(term_reader.next(), None);
+}
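A note on the new trait: next(&'a mut self) ties the lifetime of the returned &str to the mutable borrow of the iterator itself, which is what lets TokenIter hand out tokens pointing into its reused term_buffer; std::iter::Iterator::next(&mut self) cannot express that borrow. Below is a self-contained sketch of the same pattern; the names Words and buffer are illustrative and not part of this commit.

    pub trait StreamingIterator<'a, T> {
        fn next(&'a mut self) -> Option<T>;
    }

    // A toy lending tokenizer: every word is lowercased into one reused
    // buffer, and the returned &str borrows that buffer until the next
    // call to next().
    struct Words<'s> {
        words: std::str::SplitWhitespace<'s>,
        buffer: String,
    }

    impl<'a, 's> StreamingIterator<'a, &'a str> for Words<'s> {
        fn next(&'a mut self) -> Option<&'a str> {
            let word = self.words.next()?;
            self.buffer.clear();
            for c in word.chars() {
                for lc in c.to_lowercase() {
                    self.buffer.push(lc);
                }
            }
            Some(&self.buffer)
        }
    }

    fn main() {
        let mut it = Words {
            words: "Hello HAPPY tax payer".split_whitespace(),
            buffer: String::new(),
        };
        while let Some(token) = it.next() {
            println!("{}", token); // hello, happy, tax, payer
        }
    }

The trade-off of this design is that the tokens cannot be collected: each call to next() invalidates the previous &str, which is exactly what allows a single allocation to serve the whole stream.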
@@ -113,7 +113,7 @@ fn test_serialize_u8() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 2);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u8::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u8::deserialize(&mut cursor).unwrap());
     assert!(u8::deserialize(&mut cursor).is_err());
@@ -133,7 +133,7 @@ fn test_serialize_u32() {
         x.serialize(&mut buffer);
         assert_eq!(buffer.len(), 8);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!(3, u32::deserialize(&mut cursor).unwrap());
     assert_eq!(5, u32::deserialize(&mut cursor).unwrap());
     assert!(u32::deserialize(&mut cursor).is_err());
@@ -154,7 +154,7 @@ fn test_serialize_string() {
         assert_eq!(x.serialize(&mut buffer).unwrap(), second_length);
         assert_eq!(buffer.len(), first_length + second_length);
     }
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     assert_eq!("ぽよぽよ", String::deserialize(&mut cursor).unwrap());
     assert_eq!("富士さん見える。", String::deserialize(&mut cursor).unwrap());
    assert!(u32::deserialize(&mut cursor).is_err());
@@ -167,7 +167,7 @@ fn test_serialize_vec() {
     let second_length = 4 + 3 * 8;
     let vec = vec!(String::from("ぽよぽよ"), String::from("富士さん見える。"));
     assert_eq!(vec.serialize(&mut buffer).unwrap(), first_length + second_length + 4);
-    let mut cursor = Cursor::new(&buffer);
+    let mut cursor = Cursor::new(&buffer[..]);
     {
         let deser: Vec<String> = Vec::deserialize(&mut cursor).unwrap();
         assert_eq!(deser.len(), 2);
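All four test fixes are the same one-line change. On toolchains of this era, std implemented Read only for Cursor<&[u8]> and a few owned buffer types, so Cursor::new(&buffer) with buffer: Vec<u8> produced a Cursor<&Vec<u8>> that had no Read impl; slicing with &buffer[..] coerces the Vec into a &[u8]. A minimal standalone sketch of the working form (not the test code itself):

    use std::io::{Cursor, Read};

    fn main() {
        let buffer: Vec<u8> = vec![3, 5];
        // &buffer[..] turns the Vec<u8> into a &[u8], giving a Cursor<&[u8]>,
        // which implements Read; Cursor<&Vec<u8>> did not on old toolchains.
        let mut cursor = Cursor::new(&buffer[..]);
        let mut bytes = [0u8; 2];
        cursor.read_exact(&mut bytes).unwrap();
        assert_eq!(bytes, [3, 5]);
    }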
@@ -14,6 +14,7 @@ use std::sync::Arc;
 use std::mem;
 use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
 use std::iter::Peekable;
+use core::analyzer::StreamingIterator;
 use core::serial::*;
 use core::error::*;
 use std::cell::RefCell;
@@ -151,10 +152,15 @@ impl SegmentWriter {
         let field_options = schema.get_field(&field_value.field);
         if field_options.is_tokenized_indexed() {
             let mut tokens = self.tokenizer.tokenize(&field_value.text);
-            while tokens.read_one(&mut term_buffer) {
-                let term = Term::from_field_text(&field_value.field, term_buffer.as_ref());
-                self.suscribe(doc_id, term);
-                self.num_tokens += 1;
+            loop {
+                match tokens.next() {
+                    Some(token) => {
+                        let term = Term::from_field_text(&field_value.field, token);
+                        self.suscribe(doc_id, term);
+                        self.num_tokens += 1;
+                    },
+                    None => { break; }
+                }
             }
         }
     }
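The explicit loop/match added in the indexing path is the manual expansion of a while let: both stop at the first None. The committed TokenIter uses the custom StreamingIterator trait rather than std's Iterator, so the sketch below shows only the control-flow equivalence with a plain Iterator; count_tokens and its input are illustrative, not from this commit.

    // A `while let` loop desugars to loop { match ... { Some(..) => .., None => break } }.
    fn count_tokens<I: Iterator<Item = String>>(mut tokens: I) -> usize {
        let mut num_tokens = 0;
        while let Some(_token) = tokens.next() {
            num_tokens += 1;
        }
        num_tokens
    }

    fn main() {
        let tokens = "hello happy tax payer".split_whitespace().map(String::from);
        assert_eq!(count_tokens(tokens), 4);
    }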
@@ -51,20 +51,6 @@ fn test_intersection() {
     }
 }
 
-#[test]
-fn test_tokenizer() {
-    let simple_tokenizer = SimpleTokenizer::new();
-    let mut term_buffer = String::new();
-    let mut term_reader = simple_tokenizer.tokenize("hello happy tax payer!");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "hello");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "happy");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "tax");
-    assert!(term_reader.read_one(&mut term_buffer));
-    assert_eq!(term_buffer, "payer");
-}
 
 #[test]
 fn test_indexing() {