diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs
index 9b889eb13..08912c574 100644
--- a/src/analyzer/analyzer.rs
+++ b/src/analyzer/analyzer.rs
@@ -1,36 +1,31 @@
+use std::borrow::{Borrow, BorrowMut};
-
-
-pub trait TextPipeline {
- fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
+/// A token produced by a tokenizer and further processed by token filters.
+pub struct Token {
+ /// Offset (byte index) of the first character of the token.
+ /// Offsets shall not be modified by token filters.
+ pub offset_from: usize,
+ /// Offset (byte index) of the last character of the token + 1.
+ /// The text that generated the token should be obtained by
+ /// `&text[token.offset_from..token.offset_to]`
+ pub offset_to: usize,
+ /// Position, expressed in number of tokens.
+ pub position: usize,
+ /// Actual text content of the token.
+ pub term: String,
}
-
-struct TextPipelineImpl<A>
- where for<'a> A: Analyzer<'a> + 'static
-{
- underlying: A,
-}
-
-impl<A> TextPipeline for TextPipelineImpl<A>
- where for<'a> A: Analyzer<'a> + 'static
-{
- fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
- let mut token_stream = self.underlying.token_stream(text);
- while token_stream.advance() {
- sink(token_stream.token());
+impl Default for Token {
+ fn default() -> Token {
+ Token {
+ offset_from: 0,
+ offset_to: 0,
+ position: usize::max_value(),
+ term: String::new(),
}
}
}
-#[derive(Default)]
-pub struct Token {
- pub offset_from: usize,
- pub offset_to: usize,
- pub position: usize,
- pub term: String,
-}
-
pub trait Analyzer<'a>: Sized {
type TokenStreamImpl: TokenStream;
@@ -46,11 +41,39 @@ pub trait Analyzer<'a>: Sized {
}
}
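+/// Object-safe counterpart of `Analyzer`: the concrete token stream type is
+/// erased and returned as a boxed trait object.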
+pub trait BoxedAnalyzer {
+ fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
+}
-pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
- -> Box<TextPipeline> {
- let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
- box text_pipeline_impl
+struct BoxableAnalyzer<A>(A) where A: for<'a> Analyzer<'a>;
+
+impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for<'a> Analyzer<'a> {
+ fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
+ box self.0.token_stream(text)
+ }
+}
+
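+/// Boxes any `Analyzer` into a `BoxedAnalyzer` trait object.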
+pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
+ where A: 'static + for<'a> Analyzer<'a> {
+ box BoxableAnalyzer(a)
+}
+
+
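+// A boxed token stream is itself a `TokenStream`: every call is forwarded to the inner stream.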
+impl<'b> TokenStream for Box<TokenStream + 'b> {
+ fn advance(&mut self) -> bool {
+ let token_stream: &mut TokenStream = self.borrow_mut();
+ token_stream.advance()
+ }
+
+ fn token(&self) -> &Token {
+ let token_stream: &TokenStream = self.borrow();
+ token_stream.token()
+ }
+
+ fn token_mut(&mut self) -> &mut Token {
+ let token_stream: &mut TokenStream = self.borrow_mut();
+ token_stream.token_mut()
+ }
}
@@ -68,6 +91,15 @@ pub trait TokenStream {
None
}
}
+
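+ /// Helper that pushes every remaining token to `sink` and returns the
+ /// number of tokens processed.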
+ fn process(&mut self, sink: &mut FnMut(&Token)) -> u32 {
+ let mut num_tokens_pushed = 0u32;
+ while self.advance() {
+ sink(self.token());
+ num_tokens_pushed += 1u32;
+ }
+ num_tokens_pushed
+ }
}
diff --git a/src/analyzer/jp_tokenizer.rs b/src/analyzer/jp_tokenizer.rs
index 6bf508959..4bc5d4689 100644
--- a/src/analyzer/jp_tokenizer.rs
+++ b/src/analyzer/jp_tokenizer.rs
@@ -1,7 +1,7 @@
use super::{Token, Analyzer, TokenStream};
use tinysegmenter;
-pub struct JpTokenizer;
+pub struct JPTokenizer;
#[derive(Eq, PartialEq)]
enum Cursor {
@@ -10,13 +10,13 @@ enum Cursor {
Terminated,
}
-pub struct JpTokenizerStream {
+pub struct JPTokenizerStream {
tokens: Vec<Token>,
cursor: Cursor,
}
-impl<'a> Analyzer<'a> for JpTokenizer {
- type TokenStreamImpl = JpTokenizerStream;
+impl<'a> Analyzer<'a> for JPTokenizer {
+ type TokenStreamImpl = JPTokenizerStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let mut tokens = vec![];
@@ -32,14 +32,14 @@ impl<'a> Analyzer<'a> for JpTokenizer {
term: term,
});
}
- JpTokenizerStream {
+ JPTokenizerStream {
tokens: tokens,
cursor: Cursor::HasNotStarted,
}
}
}
-impl<'a> TokenStream for JpTokenizerStream {
+impl<'a> TokenStream for JPTokenizerStream {
fn advance(&mut self) -> bool {
let new_cursor = match self.cursor {
Cursor::HasNotStarted => {
@@ -60,7 +60,7 @@ impl<'a> TokenStream for JpTokenizerStream {
Cursor::Terminated => Cursor::Terminated,
};
self.cursor = new_cursor;
- return self.cursor != Cursor::Terminated;
+ self.cursor != Cursor::Terminated
}
diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs
index e0cb86861..a3f72ddcc 100644
--- a/src/analyzer/lower_caser.rs
+++ b/src/analyzer/lower_caser.rs
@@ -33,9 +33,9 @@ impl TokenStream for LowerCaserTokenStream
fn advance(&mut self) -> bool {
if self.tail.advance() {
self.tail.token_mut().term.make_ascii_lowercase();
- return true;
+ true
} else {
- return false;
+ false
}
}
}
diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs
index 63d1cf116..5cc42b83f 100644
--- a/src/analyzer/mod.rs
+++ b/src/analyzer/mod.rs
@@ -8,27 +8,32 @@ mod remove_nonalphanum;
mod stemmer;
mod jp_tokenizer;
-pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
+pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
TokenStream};
pub use self::simple_tokenizer::SimpleTokenizer;
-pub use self::jp_tokenizer::JpTokenizer;
+pub use self::jp_tokenizer::JPTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
+pub use self::analyzer::BoxedAnalyzer;
-pub fn en_pipeline<'a>() -> Box<TextPipeline + 'a> {
- boxed_pipeline(SimpleTokenizer
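+/// English pipeline: simple tokenization, long-token removal, lowercasing and stemming.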
+pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer + 'a> {
+ box_analyzer(
+ SimpleTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(LowerCaser)
- .filter(Stemmer::new()))
+ .filter(Stemmer::new())
+ )
}
-pub fn jp_pipeline<'a>() -> Box<TextPipeline + 'a> {
- boxed_pipeline(JpTokenizer
- .filter(RemoveLongFilter::limit(20))
- .filter(RemoveNonAlphaFilter))
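+/// Japanese pipeline: JPTokenizer, long-token removal and removal of non-alphanumeric tokens.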
+pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer + 'a> {
+ box_analyzer(
+ JPTokenizer
+ .filter(RemoveLongFilter::limit(20))
+ .filter(RemoveNonAlphaFilter)
+ )
}
#[cfg(test)]
@@ -37,11 +42,11 @@ mod test {
#[test]
fn test_en_analyzer() {
- let mut pipeline = en_pipeline();
+ let mut en_analyzer = en_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
- pipeline.analyze("hello, happy tax payer!", &mut add_token);
+ en_analyzer.token_stream("hello, happy tax payer!").process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_eq!(&tokens[0], "hello");
@@ -50,14 +55,13 @@ mod test {
assert_eq!(&tokens[3], "payer");
}
-
#[test]
fn test_jp_analyzer() {
- let mut pipeline = jp_pipeline();
+ let mut jp_analyzer = jp_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
- pipeline.analyze("野菜食べないとやばい!", &mut add_token);
+ jp_analyzer.token_stream("野菜食べないとやばい!").process(&mut add_token);
}
assert_eq!(tokens.len(), 5);
assert_eq!(&tokens[0], "野菜");
@@ -67,15 +71,14 @@ mod test {
assert_eq!(&tokens[4], "やばい");
}
-
#[test]
fn test_tokenizer_empty() {
- let mut pipeline = en_pipeline();
+ let mut en_analyzer = en_pipeline();
{
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
- pipeline.analyze(" ", &mut add_token);
+ en_analyzer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
@@ -83,22 +86,10 @@ mod test {
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
- pipeline.analyze(" ", &mut add_token);
+ en_analyzer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
}
-
- #[test]
- fn test_tokenizer_cjkchars() {
- let mut pipeline = en_pipeline();
- let mut tokens: Vec<String> = vec![];
- {
- let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
- pipeline.analyze("hello,中国人民", &mut add_token);
- }
- assert_eq!(tokens.len(), 2);
- assert_eq!(tokens, vec!["hello", "中国人民"]);
- }
}
diff --git a/src/analyzer/simple_tokenizer.rs b/src/analyzer/simple_tokenizer.rs
index 96b71c5dd..c79282279 100644
--- a/src/analyzer/simple_tokenizer.rs
+++ b/src/analyzer/simple_tokenizer.rs
@@ -35,7 +35,7 @@ impl<'a> SimpleTokenStream<'a> {
impl<'a> TokenStream for SimpleTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.term.clear();
- self.token.position += 1;
+ self.token.position = self.token.position.wrapping_add(1);
loop {
match self.chars.next() {
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 9981be093..3823b20e5 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -6,7 +6,7 @@ use std::io;
use postings::Recorder;
use Result;
use schema::{Schema, Field};
-use analyzer::en_pipeline;
+use analyzer::{en_pipeline, Token};
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
@@ -149,28 +149,28 @@ pub trait PostingsWriter {
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
- let mut pos = 0u32;
+
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
+
term.set_field(field);
- let mut pipeline = en_pipeline();
- for field_value in field_values {
- pipeline.analyze(field_value.value().text(),
- &mut |token| {
- term.set_text(&token.term);
- self.suscribe(term_index, doc_id, pos, &term, heap);
- pos += 1u32;
- num_tokens += 1u32;
- });
- // let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
- // // right now num_tokens and pos are redundant, but it should
- // // change when we get proper analyzers
- // while let Some(token) = tokens.next() {
+ let mut analyzer = en_pipeline();
- // }
- pos += 1;
- // THIS is to avoid phrase query accross field repetition.
- // span queries might still match though :|
+ let mut overall_position = 0u32;
+
+ for field_value in field_values {
+ // TODO fix position when more than one value.
+ let mut token_stream = analyzer.token_stream(field_value.value().text());
+ let mut local_position = 0;
+ num_tokens += {
+ let mut sink = |token: &Token| {
+ term.set_text(token.term.as_str());
+ local_position = token.position as u32;
+ self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
+ };
+ token_stream.process(&mut sink)
+ };
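+ // Leave a gap between consecutive values of the same field so that
+ // phrase queries cannot match across value boundaries.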
+ overall_position += local_position + 2u32;
}
num_tokens
}
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 742fc39f0..5cb65ea5c 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -8,7 +8,7 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
-use analyzer::{en_pipeline, TextPipeline};
+use analyzer::{en_pipeline, BoxedAnalyzer};
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -74,7 +74,7 @@ pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
- analyzer: Box<TextPipeline>,
+ analyzer: Box<BoxedAnalyzer>,
}
impl QueryParser {
@@ -161,12 +161,11 @@ impl QueryParser {
FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
- self.analyzer
- .analyze(phrase,
- &mut |token| {
+ let mut token_stream = self.analyzer.token_stream(phrase);
+ token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.term);
terms.push(term);
- });
+ });
} else {
terms.push(Term::from_field_text(field, phrase));
}