diff --git a/Cargo.toml b/Cargo.toml
index f760f3f9d..c4d27134c 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -16,6 +16,7 @@ keywords = ["search", "information", "retrieval"]
byteorder = "1.0"
memmap = "0.4"
lazy_static = "0.2.1"
+tinysegmenter = "0.1.0"
regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"
diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs
index c1a916a1d..9b889eb13 100644
--- a/src/analyzer/analyzer.rs
+++ b/src/analyzer/analyzer.rs
@@ -1,6 +1,27 @@
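+/// Object-safe text analysis pipeline.
+///
+/// `analyze` runs the underlying tokenizer/filter chain over `text` and hands
+/// every produced `Token` to the `sink` callback.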
+pub trait TextPipeline {
+    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
+}
+
+
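+// Adapter that erases the concrete `Analyzer` type behind the `TextPipeline` trait.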
+struct TextPipelineImpl<A>
+    where for<'a> A: Analyzer<'a> + 'static
+{
+    underlying: A,
+}
+
+impl<A> TextPipeline for TextPipelineImpl<A>
+    where for<'a> A: Analyzer<'a> + 'static
+{
+    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
+        let mut token_stream = self.underlying.token_stream(text);
+        while token_stream.advance() {
+            sink(token_stream.token());
+        }
+    }
+}
#[derive(Default)]
pub struct Token {
@@ -11,33 +32,39 @@ pub struct Token {
}
pub trait Analyzer<'a>: Sized {
-
    type TokenStreamImpl: TokenStream;
-    fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl;
+    fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
    fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainAnalyzer<NewFilter, Self>
-        where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl> {
+        where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl>
+    {
        ChainAnalyzer {
            head: new_filter,
-            tail: self
+            tail: self,
        }
    }
}
-pub trait TokenStream {
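+/// Boxes a `'static` `Analyzer` behind the object-safe `TextPipeline` trait.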
+pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
+    -> Box<TextPipeline> {
+    let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
+    box text_pipeline_impl
+}
+
+
+pub trait TokenStream {
    fn advance(&mut self) -> bool;
-
+
    fn token(&self) -> &Token;
-
+
    fn token_mut(&mut self) -> &mut Token;
    fn next(&mut self) -> Option<&Token> {
        if self.advance() {
            Some(self.token())
-        }
-        else {
+        } else {
            None
        }
    }
@@ -46,27 +73,26 @@ pub trait TokenStream {
pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
    head: HeadTokenFilterFactory,
-    tail: TailAnalyzer
+    tail: TailAnalyzer,
}
-impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
+impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
+    for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
    where HeadTokenFilterFactory: TokenFilterFactory<TailAnalyzer::TokenStreamImpl>,
-          TailAnalyzer: Analyzer<'a> {
-
+          TailAnalyzer: Analyzer<'a>
+{
    type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
-
-    fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
-        let tail_token_stream = self.tail.analyze(text);
+
+    fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
+        let tail_token_stream = self.tail.token_stream(text);
        self.head.transform(tail_token_stream)
    }
}
pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
-
    type ResultTokenStream: TokenStream;
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
}
-
diff --git a/src/analyzer/jp_tokenizer.rs b/src/analyzer/jp_tokenizer.rs
new file mode 100644
index 000000000..6bf508959
--- /dev/null
+++ b/src/analyzer/jp_tokenizer.rs
@@ -0,0 +1,91 @@
+use super::{Token, Analyzer, TokenStream};
+use tinysegmenter;
+
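+/// Japanese tokenizer backed by the `tinysegmenter` crate.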
+pub struct JpTokenizer;
+
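+/// Position of the stream within the pre-computed token list.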
+#[derive(Eq, PartialEq)]
+enum Cursor {
+    HasNotStarted,
+    Cursor(usize),
+    Terminated,
+}
+
+pub struct JpTokenizerStream {
+    tokens: Vec<Token>,
+    cursor: Cursor,
+}
+
+impl<'a> Analyzer<'a> for JpTokenizer {
+    type TokenStreamImpl = JpTokenizerStream;
+
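+    // Segmentation is eager: the whole text is tokenized up front and byte
+    // offsets are reconstructed by accumulating each segment's length.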
+    fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
+        let mut tokens = vec![];
+        let mut offset_from;
+        let mut offset_to = 0;
+        for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
+            offset_from = offset_to;
+            offset_to = offset_from + term.len();
+            tokens.push(Token {
+                offset_from: offset_from,
+                offset_to: offset_to,
+                position: pos,
+                term: term,
+            });
+        }
+        JpTokenizerStream {
+            tokens: tokens,
+            cursor: Cursor::HasNotStarted,
+        }
+    }
+}
+
+impl TokenStream for JpTokenizerStream {
+    fn advance(&mut self) -> bool {
+        let new_cursor = match self.cursor {
+            Cursor::HasNotStarted => {
+                if self.tokens.len() > 0 {
+                    Cursor::Cursor(0)
+                } else {
+                    Cursor::Terminated
+                }
+            }
+            Cursor::Cursor(pos) => {
+                let new_pos = pos + 1;
+                if new_pos >= self.tokens.len() {
+                    Cursor::Terminated
+                } else {
+                    Cursor::Cursor(new_pos)
+                }
+            }
+            Cursor::Terminated => Cursor::Terminated,
+        };
+        self.cursor = new_cursor;
+        return self.cursor != Cursor::Terminated;
+    }
+
+
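+    // `token()` / `token_mut()` are only valid while the cursor points at a
+    // token, i.e. after `advance()` has returned `true`.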
+    fn token(&self) -> &Token {
+        match self.cursor {
+            Cursor::Terminated => {
+                panic!("You called .token() after the end of the token stream has been reached");
+            }
+            Cursor::Cursor(i) => &self.tokens[i],
+            Cursor::HasNotStarted => {
+                panic!("You called .token() before having called `.advance()`.");
+            }
+        }
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        match self.cursor {
+            Cursor::Terminated => {
+                panic!("You called .token_mut() after the end of the token stream has been reached");
+            }
+            Cursor::Cursor(i) => &mut self.tokens[i],
+            Cursor::HasNotStarted => {
+                panic!("You called .token_mut() before having called `.advance()`.");
+            }
+        }
+    }
+}
diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs
index dda5f597b..e0cb86861 100644
--- a/src/analyzer/lower_caser.rs
+++ b/src/analyzer/lower_caser.rs
@@ -3,9 +3,9 @@ use std::ascii::AsciiExt;
pub struct LowerCaser;
-impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
-    where TailTokenStream: TokenStream {
-
+impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
+    where TailTokenStream: TokenStream
+{
    type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -13,18 +13,19 @@ impl TokenFilterFactory for LowerCaser
}
}
-pub struct LowerCaserTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
+pub struct LowerCaserTokenStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
    tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
+    where TailTokenStream: TokenStream
+{
    fn token(&self) -> &Token {
        self.tail.token()
    }
-
+
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
@@ -33,22 +34,16 @@ impl TokenStream for LowerCaserTokenStream
        if self.tail.advance() {
            self.tail.token_mut().term.make_ascii_lowercase();
            return true;
-        }
-        else {
+        } else {
            return false;
        }
    }
}
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
-
+    where TailTokenStream: TokenStream
+{
    fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
-        LowerCaserTokenStream {
-            tail: tail,
-        }
-    }
+        LowerCaserTokenStream { tail: tail }
+    }
}
-
-
diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs
index 1d2974f29..63d1cf116 100644
--- a/src/analyzer/mod.rs
+++ b/src/analyzer/mod.rs
@@ -4,51 +4,101 @@ mod analyzer;
mod simple_tokenizer;
mod lower_caser;
mod remove_long;
+mod remove_nonalphanum;
mod stemmer;
+mod jp_tokenizer;
-pub use self::analyzer::{Analyzer, Token, TokenFilterFactory, TokenStream};
+pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
+                         TokenStream};
pub use self::simple_tokenizer::SimpleTokenizer;
+pub use self::jp_tokenizer::JpTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
+pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
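+/// Default English pipeline: simple tokenization, removal of overly long
+/// tokens, lowercasing and English stemming.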
+pub fn en_pipeline<'a>() -> Box<TextPipeline> {
+    boxed_pipeline(SimpleTokenizer
+                       .filter(RemoveLongFilter::limit(20))
+                       .filter(LowerCaser)
+                       .filter(Stemmer::new()))
+}
-pub fn en_analyzer<'a>() -> impl Analyzer<'a> {
- SimpleTokenizer
- .filter(RemoveLongFilter::limit(20))
- .filter(LowerCaser)
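+/// Japanese pipeline: TinySegmenter-based tokenization, removal of overly long
+/// tokens and of tokens containing non-alphanumeric characters (punctuation).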
+pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
+    boxed_pipeline(JpTokenizer
+                       .filter(RemoveLongFilter::limit(20))
+                       .filter(RemoveNonAlphaFilter))
}
#[cfg(test)]
mod test {
-    use super::{Analyzer, TokenStream, en_analyzer};
+    use super::{en_pipeline, jp_pipeline, Token};
    #[test]
-    fn test_tokenizer() {
-        let mut analyzer = en_analyzer();
-        let mut terms = analyzer.analyze("hello, happy tax payer!");
-        assert_eq!(terms.next().unwrap().term, "hello");
-        assert_eq!(terms.next().unwrap().term, "happy");
-        assert_eq!(terms.next().unwrap().term, "tax");
-        assert_eq!(terms.next().unwrap().term, "payer");
-        assert!(terms.next().is_none());
+    fn test_en_analyzer() {
+        let mut pipeline = en_pipeline();
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
+            pipeline.analyze("hello, happy tax payer!", &mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_eq!(&tokens[0], "hello");
+        assert_eq!(&tokens[1], "happi");
+        assert_eq!(&tokens[2], "tax");
+        assert_eq!(&tokens[3], "payer");
    }
+
+    #[test]
+    fn test_jp_analyzer() {
+        let mut pipeline = jp_pipeline();
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
+            pipeline.analyze("野菜食べないとやばい!", &mut add_token);
+        }
+        assert_eq!(tokens.len(), 5);
+        assert_eq!(&tokens[0], "野菜");
+        assert_eq!(&tokens[1], "食べ");
+        assert_eq!(&tokens[2], "ない");
+        assert_eq!(&tokens[3], "と");
+        assert_eq!(&tokens[4], "やばい");
+    }
+
+
    #[test]
    fn test_tokenizer_empty() {
-        let mut terms = en_analyzer().analyze("");
-        assert!(terms.next().is_none());
+        let mut pipeline = en_pipeline();
+        {
+            let mut tokens: Vec<String> = vec![];
+            {
+                let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
+                pipeline.analyze(" ", &mut add_token);
+            }
+            assert!(tokens.is_empty());
+        }
+        {
+            let mut tokens: Vec<String> = vec![];
+            {
+                let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
+                pipeline.analyze(" ", &mut add_token);
+            }
+            assert!(tokens.is_empty());
+        }
    }
    #[test]
    fn test_tokenizer_cjkchars() {
-        let mut terms = en_analyzer().analyze("hello,中国人民");
-        assert_eq!(terms.next().unwrap().term, "hello");
-        assert_eq!(terms.next().unwrap().term, "中国人民");
-        assert!(terms.next().is_none());
+        let mut pipeline = en_pipeline();
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
+            pipeline.analyze("hello,中国人民", &mut add_token);
+        }
+        assert_eq!(tokens.len(), 2);
+        assert_eq!(tokens, vec!["hello", "中国人民"]);
    }
-
}
-
diff --git a/src/analyzer/remove_long.rs b/src/analyzer/remove_long.rs
index b4b4b4e0e..98b73b973 100644
--- a/src/analyzer/remove_long.rs
+++ b/src/analyzer/remove_long.rs
@@ -5,34 +5,34 @@ pub struct RemoveLongFilter {
    length_limit: usize,
}
-impl RemoveLongFilter {
+impl RemoveLongFilter {
    // the limit is in bytes of the UTF-8 representation.
    pub fn limit(length_limit: usize) -> RemoveLongFilter {
-        RemoveLongFilter {
-            length_limit: length_limit,
-        }
+        RemoveLongFilter { length_limit: length_limit }
    }
}
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
+    where TailTokenStream: TokenStream
+{
    fn predicate(&self, token: &Token) -> bool {
        token.term.len() < self.token_length_limit
    }
-    fn wrap(token_length_limit: usize, tail: TailTokenStream) -> RemoveLongFilterStream<TailTokenStream> {
+    fn wrap(token_length_limit: usize,
+            tail: TailTokenStream)
+            -> RemoveLongFilterStream<TailTokenStream> {
        RemoveLongFilterStream {
            token_length_limit: token_length_limit,
            tail: tail,
        }
-    }
+    }
}
-impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
-    where TailTokenStream: TokenStream {
-
+impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
+    where TailTokenStream: TokenStream
+{
    type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -40,16 +40,16 @@ impl TokenFilterFactory for RemoveLongFilter
}
}
-pub struct RemoveLongFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
+pub struct RemoveLongFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
    token_length_limit: usize,
    tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
+    where TailTokenStream: TokenStream
+{
    fn token(&self) -> &Token {
        self.tail.token()
    }
@@ -64,11 +64,9 @@ impl TokenStream for RemoveLongFilterStream
                if self.predicate(self.tail.token()) {
                    return true;
                }
-            }
-            else {
+            } else {
                return false;
            }
        }
    }
-
-}
\ No newline at end of file
+}
diff --git a/src/analyzer/remove_nonalphanum.rs b/src/analyzer/remove_nonalphanum.rs
new file mode 100644
index 000000000..ede810680
--- /dev/null
+++ b/src/analyzer/remove_nonalphanum.rs
@@ -0,0 +1,58 @@
+use super::{TokenFilterFactory, TokenStream, Token};
+
+
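+/// Token filter that drops any token containing a non-alphanumeric character
+/// (e.g. punctuation-only tokens emitted by the Japanese tokenizer).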
+pub struct RemoveNonAlphaFilter;
+
+impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn predicate(&self, token: &Token) -> bool {
+        for c in token.term.chars() {
+            if !c.is_alphanumeric() {
+                return false;
+            }
+        }
+        true
+    }
+}
+
+
+impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
+    where TailTokenStream: TokenStream
+{
+    type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;
+
+    fn transform(&self, tail: TailTokenStream) -> Self::ResultTokenStream {
+        RemoveNonAlphaFilterStream { tail: tail }
+    }
+}
+
+pub struct RemoveNonAlphaFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    tail: TailTokenStream,
+}
+
+impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
+    fn token(&self) -> &Token {
+        self.tail.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.tail.token_mut()
+    }
+
+    fn advance(&mut self) -> bool {
+        loop {
+            if self.tail.advance() {
+                if self.predicate(self.tail.token()) {
+                    return true;
+                }
+            } else {
+                return false;
+            }
+        }
+    }
+}
diff --git a/src/analyzer/simple_tokenizer.rs b/src/analyzer/simple_tokenizer.rs
index 2d5b27907..96b71c5dd 100644
--- a/src/analyzer/simple_tokenizer.rs
+++ b/src/analyzer/simple_tokenizer.rs
@@ -7,14 +7,13 @@ pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
    text: &'a str,
    chars: CharIndices<'a>,
-    token: Token,
+    token: Token,
}
impl<'a> Analyzer<'a> for SimpleTokenizer {
-
    type TokenStreamImpl = SimpleTokenStream<'a>;
-    fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
+    fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
        SimpleTokenStream {
            text: text,
            chars: text.char_indices(),
@@ -24,10 +23,9 @@ impl<'a> Analyzer<'a> for SimpleTokenizer {
}
impl<'a> SimpleTokenStream<'a> {
-
    fn token_limit(&mut self) -> usize {
        (&mut self.chars)
-            .filter(|&(_, ref c)| !c.is_alphanumeric())
+            .filter(|&(_, ref c)| !c.is_alphanumeric())
            .map(|(offset, _)| offset)
            .next()
            .unwrap_or(self.text.len())
@@ -35,7 +33,6 @@ impl<'a> SimpleTokenStream<'a> {
}
impl<'a> TokenStream for SimpleTokenStream<'a> {
-
    fn advance(&mut self) -> bool {
        self.token.term.clear();
        self.token.position += 1;
@@ -57,7 +54,7 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
}
}
}
-
+
    fn token(&self) -> &Token {
        &self.token
    }
@@ -65,5 +62,4 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
-
-}
\ No newline at end of file
+}
diff --git a/src/analyzer/stemmer.rs b/src/analyzer/stemmer.rs
index 4988d8325..82e3e5ac3 100644
--- a/src/analyzer/stemmer.rs
+++ b/src/analyzer/stemmer.rs
@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{TokenFilterFactory, TokenStream, Token};
-use rust_stemmers::{Algorithm, self};
+use rust_stemmers::{self, Algorithm};
pub struct Stemmer {
    stemmer: Arc<rust_stemmers::Stemmer>,
@@ -9,15 +9,13 @@ pub struct Stemmer {
impl Stemmer {
    pub fn new() -> Stemmer {
        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
-        Stemmer {
-            stemmer: Arc::new(inner_stemmer),
-        }
+        Stemmer { stemmer: Arc::new(inner_stemmer) }
    }
}
-impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
-    where TailTokenStream: TokenStream {
-
+impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
+    where TailTokenStream: TokenStream
+{
    type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -26,19 +24,20 @@ impl TokenFilterFactory for Stemmer
}
-pub struct StemmerTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
+pub struct StemmerTokenStream<TailTokenStream>
+    where TailTokenStream: TokenStream
+{
    tail: TailTokenStream,
    stemmer: Arc<rust_stemmers::Stemmer>,
}
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
+    where TailTokenStream: TokenStream
+{
    fn token(&self) -> &Token {
        self.tail.token()
    }
-
+
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
@@ -50,20 +49,21 @@ impl TokenStream for StemmerTokenStream
            self.token_mut().term.clear();
            self.token_mut().term.push_str(&stemmed_str);
            true
-        }
-        else {
+        } else {
            false
        }
    }
}
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
-    where TailTokenStream: TokenStream {
-
-    fn wrap(stemmer: Arc<rust_stemmers::Stemmer>, tail: TailTokenStream) -> StemmerTokenStream<TailTokenStream> {
+    where TailTokenStream: TokenStream
+{
+    fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
+            tail: TailTokenStream)
+            -> StemmerTokenStream<TailTokenStream> {
        StemmerTokenStream {
            tail: tail,
            stemmer: stemmer,
        }
-    }
-}
\ No newline at end of file
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 592bc414a..e9d00fa3c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -79,6 +79,7 @@ extern crate test;
#[cfg(test)]
extern crate rand;
+extern crate tinysegmenter;
#[cfg(test)]
mod functional_test;
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 812490fff..9981be093 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -4,10 +4,9 @@ use schema::FieldValue;
use postings::PostingsSerializer;
use std::io;
use postings::Recorder;
-use analyzer::SimpleTokenizer;
use Result;
use schema::{Schema, Field};
-use analyzer::{TokenStream, Analyzer};
+use analyzer::en_pipeline;
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
@@ -154,16 +153,21 @@ pub trait PostingsWriter {
        let mut num_tokens: u32 = 0u32;
        let mut term = unsafe { Term::with_capacity(100) };
        term.set_field(field);
+        let mut pipeline = en_pipeline();
        for field_value in field_values {
-            let mut tokens = SimpleTokenizer.analyze(field_value.value().text());
-            // right now num_tokens and pos are redundant, but it should
-            // change when we get proper analyzers
-            while let Some(token) = tokens.next() {
-                term.set_text(&token.term);
-                self.suscribe(term_index, doc_id, pos, &term, heap);
-                pos += 1u32;
-                num_tokens += 1u32;
- }
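+            // The closure is invoked once per token; `pos` and `num_tokens` are
+            // updated from within it while `term` is reused as a buffer.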
+            pipeline.analyze(field_value.value().text(),
+                             &mut |token| {
+                                 term.set_text(&token.term);
+                                 self.suscribe(term_index, doc_id, pos, &term, heap);
+                                 pos += 1u32;
+                                 num_tokens += 1u32;
+                             });
            pos += 1;
            // This is to avoid phrase queries matching across field repetitions.
            // Span queries might still match though :|
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 8c714ec08..742fc39f0 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -8,11 +8,10 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
-use analyzer::{SimpleTokenizer, TokenStream};
+use analyzer::{en_pipeline, TextPipeline};
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
-use analyzer::Analyzer;
/// Possible error that may happen when parsing a query.
@@ -75,7 +74,7 @@ pub struct QueryParser {
    schema: Schema,
    default_fields: Vec<Field>,
    conjunction_by_default: bool,
-    analyzer: Box<SimpleTokenizer>,
+    analyzer: Box<TextPipeline>,
}
impl QueryParser {
@@ -88,7 +87,7 @@ impl QueryParser {
            schema: schema,
            default_fields: default_fields,
            conjunction_by_default: false,
-            analyzer: box SimpleTokenizer,
+            analyzer: en_pipeline(),
        }
}
@@ -162,11 +161,12 @@ impl QueryParser {
            FieldType::Str(ref str_options) => {
                let mut terms: Vec<Term> = Vec::new();
                if str_options.get_indexing_options().is_tokenized() {
-                    let mut token_iter = self.analyzer.analyze(phrase);
-                    while let Some(token) = token_iter.next() {
-                        let term = Term::from_field_text(field, &token.term);
-                        terms.push(term);
- }
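+                    // Collect one `Term` per token produced by the analysis pipeline.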
+                    self.analyzer
+                        .analyze(phrase,
+                                 &mut |token| {
+                                     let term = Term::from_field_text(field, &token.term);
+                                     terms.push(term);
+                                 });
                } else {
                    terms.push(Term::from_field_text(field, phrase));
                }