diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs
index 9b889eb13..08912c574 100644
--- a/src/analyzer/analyzer.rs
+++ b/src/analyzer/analyzer.rs
@@ -1,36 +1,31 @@
+use std::borrow::{Borrow, BorrowMut};
-
-
-pub trait TextPipeline {
-    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
+/// Token
+pub struct Token {
+    /// Offset (byte index) of the first character of the token.
+    /// Offsets shall not be modified by token filters.
+    pub offset_from: usize,
+    /// Offset (byte index) of the last character of the token + 1.
+    /// The text that generated the token should be obtained by
+    /// &text[token.offset_from..token.offset_to]
+    pub offset_to: usize,
+    /// Position, expressed in number of tokens.
+    pub position: usize,
+    /// Actual text content of the token.
+    pub term: String,
 }
-
-struct TextPipelineImpl<A>
-    where for<'a> A: Analyzer<'a> + 'static
-{
-    underlying: A,
-}
-
-impl<A> TextPipeline for TextPipelineImpl<A>
-    where for<'a> A: Analyzer<'a> + 'static
-{
-    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
-        let mut token_stream = self.underlying.token_stream(text);
-        while token_stream.advance() {
-            sink(token_stream.token());
+impl Default for Token {
+    fn default() -> Token {
+        Token {
+            offset_from: 0,
+            offset_to: 0,
+            position: usize::max_value(),
+            term: String::new(),
         }
     }
 }
 
-#[derive(Default)]
-pub struct Token {
-    pub offset_from: usize,
-    pub offset_to: usize,
-    pub position: usize,
-    pub term: String,
-}
-
 pub trait Analyzer<'a>: Sized {
     type TokenStreamImpl: TokenStream;
@@ -46,11 +41,39 @@ pub trait Analyzer<'a>: Sized {
     }
 }
 
+pub trait BoxedAnalyzer {
+    fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
 }
-pub fn boxed_pipeline<A: for<'a> Analyzer<'a>>(analyzer: A)
-    -> Box<TextPipeline> {
-    let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
-    box text_pipeline_impl
+struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;
+
+impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
+    fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
+        box self.0.token_stream(text)
+    }
+}
+
+pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
+    where A: 'static + for <'a> Analyzer<'a> {
+    box BoxableAnalyzer(a)
+}
+
+
+impl<'b> TokenStream for Box<TokenStream + 'b> {
+    fn advance(&mut self) -> bool {
+        let token_stream: &mut TokenStream = self.borrow_mut();
+        token_stream.advance()
+    }
+
+    fn token(&self) -> &Token {
+        let token_stream: &TokenStream = self.borrow();
+        token_stream.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        let token_stream: &mut TokenStream = self.borrow_mut();
+        token_stream.token_mut()
+    }
 }
@@ -68,6 +91,15 @@ pub trait TokenStream {
             None
         }
     }
+
+    fn process(&mut self, sink: &mut FnMut(&Token)) -> u32 {
+        let mut num_tokens_pushed = 0u32;
+        while self.advance() {
+            sink(self.token());
+            num_tokens_pushed += 1u32;
+        }
+        num_tokens_pushed
+    }
 }
 
diff --git a/src/analyzer/jp_tokenizer.rs b/src/analyzer/jp_tokenizer.rs
index 6bf508959..4bc5d4689 100644
--- a/src/analyzer/jp_tokenizer.rs
+++ b/src/analyzer/jp_tokenizer.rs
@@ -1,7 +1,7 @@
 use super::{Token, Analyzer, TokenStream};
 use tinysegmenter;
 
-pub struct JpTokenizer;
+pub struct JPTokenizer;
 
 #[derive(Eq, PartialEq)]
 enum Cursor {
@@ -10,13 +10,13 @@ enum Cursor {
     Terminated,
 }
 
-pub struct JpTokenizerStream {
+pub struct JPTokenizerStream {
     tokens: Vec<Token>,
     cursor: Cursor,
 }
 
-impl<'a> Analyzer<'a> for JpTokenizer {
-    type TokenStreamImpl = JpTokenizerStream;
+impl<'a> Analyzer<'a> for JPTokenizer {
+    type TokenStreamImpl = JPTokenizerStream;
 
     fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
         let mut tokens = vec![];
@@ -32,14 +32,14 @@ impl<'a> Analyzer<'a> for JpTokenizer {
                 term: term,
             });
         }
-        JpTokenizerStream {
+        JPTokenizerStream {
            tokens: tokens,
            cursor: Cursor::HasNotStarted,
        }
    }
 }
 
-impl<'a> TokenStream for JpTokenizerStream {
+impl<'a> TokenStream for JPTokenizerStream {
     fn advance(&mut self) -> bool {
         let new_cursor = match self.cursor {
             Cursor::HasNotStarted => {
@@ -60,7 +60,7 @@ impl<'a> TokenStream for JpTokenizerStream {
             Cursor::Terminated => Cursor::Terminated,
         };
         self.cursor = new_cursor;
-        return self.cursor != Cursor::Terminated;
+        self.cursor != Cursor::Terminated
     }
diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs
index e0cb86861..a3f72ddcc 100644
--- a/src/analyzer/lower_caser.rs
+++ b/src/analyzer/lower_caser.rs
@@ -33,9 +33,9 @@ impl TokenStream for LowerCaserTokenStream
     fn advance(&mut self) -> bool {
         if self.tail.advance() {
             self.tail.token_mut().term.make_ascii_lowercase();
-            return true;
+            true
         } else {
-            return false;
+            false
         }
     }
 }
diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs
index 63d1cf116..5cc42b83f 100644
--- a/src/analyzer/mod.rs
+++ b/src/analyzer/mod.rs
@@ -8,27 +8,32 @@ mod remove_nonalphanum;
 mod stemmer;
 mod jp_tokenizer;
 
-pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
+pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
                          TokenStream};
 pub use self::simple_tokenizer::SimpleTokenizer;
-pub use self::jp_tokenizer::JpTokenizer;
+pub use self::jp_tokenizer::JPTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::lower_caser::LowerCaser;
 pub use self::stemmer::Stemmer;
 pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
+pub use self::analyzer::BoxedAnalyzer;
 
-pub fn en_pipeline<'a>() -> Box<TextPipeline> {
-    boxed_pipeline(SimpleTokenizer
+pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
+    box_analyzer(
+        SimpleTokenizer
         .filter(RemoveLongFilter::limit(20))
         .filter(LowerCaser)
-        .filter(Stemmer::new()))
+        .filter(Stemmer::new())
+    )
 }
 
-pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
-    boxed_pipeline(JpTokenizer
-        .filter(RemoveLongFilter::limit(20))
-        .filter(RemoveNonAlphaFilter))
+pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
+    box_analyzer(
+        JPTokenizer
+        .filter(RemoveLongFilter::limit(20))
+        .filter(RemoveNonAlphaFilter)
+    )
 }
 
 #[cfg(test)]
@@ -37,11 +42,11 @@ mod test {
 
     #[test]
     fn test_en_analyzer() {
-        let mut pipeline = en_pipeline();
+        let mut en_analyzer = en_pipeline();
         let mut tokens: Vec<String> = vec![];
         {
             let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
-            pipeline.analyze("hello, happy tax payer!", &mut add_token);
+            en_analyzer.token_stream("hello, happy tax payer!").process(&mut add_token);
         }
         assert_eq!(tokens.len(), 4);
         assert_eq!(&tokens[0], "hello");
@@ -50,14 +55,13 @@ mod test {
         assert_eq!(&tokens[3], "payer");
     }
 
-
     #[test]
     fn test_jp_analyzer() {
-        let mut pipeline = jp_pipeline();
+        let mut en_analyzer = jp_pipeline();
         let mut tokens: Vec<String> = vec![];
         {
             let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
-            pipeline.analyze("野菜食べないとやばい!", &mut add_token);
+            en_analyzer.token_stream("野菜食べないとやばい!").process(&mut add_token);
         }
         assert_eq!(tokens.len(), 5);
         assert_eq!(&tokens[0], "野菜");
@@ -67,15 +71,14 @@ mod test {
         assert_eq!(&tokens[4], "やばい");
     }
 
-
     #[test]
     fn test_tokenizer_empty() {
-        let mut pipeline = en_pipeline();
+        let mut en_analyzer = en_pipeline();
         {
             let mut tokens: Vec<String> = vec![];
             {
                 let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
-                pipeline.analyze(" ", &mut add_token);
+                en_analyzer.token_stream(" ").process(&mut add_token);
             }
             assert!(tokens.is_empty());
         }
@@ -83,22 +86,10 @@ mod test {
             let mut tokens: Vec<String> = vec![];
             {
                 let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
-                pipeline.analyze(" ", &mut add_token);
+                en_analyzer.token_stream(" ").process(&mut add_token);
             }
             assert!(tokens.is_empty());
         }
     }
 
-
-    #[test]
-    fn test_tokenizer_cjkchars() {
-        let mut pipeline = en_pipeline();
-        let mut tokens: Vec<String> = vec![];
-        {
-            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
-            pipeline.analyze("hello,中国人民", &mut add_token);
-        }
-        assert_eq!(tokens.len(), 2);
-        assert_eq!(tokens, vec!["hello", "中国人民"]);
-    }
 }
diff --git a/src/analyzer/simple_tokenizer.rs b/src/analyzer/simple_tokenizer.rs
index 96b71c5dd..c79282279 100644
--- a/src/analyzer/simple_tokenizer.rs
+++ b/src/analyzer/simple_tokenizer.rs
@@ -35,7 +35,7 @@ impl<'a> SimpleTokenStream<'a> {
 impl<'a> TokenStream for SimpleTokenStream<'a> {
     fn advance(&mut self) -> bool {
         self.token.term.clear();
-        self.token.position += 1;
+        self.token.position = self.token.position.wrapping_add(1);
 
         loop {
             match self.chars.next() {
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 9981be093..3823b20e5 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -6,7 +6,7 @@ use std::io;
 use postings::Recorder;
 use Result;
 use schema::{Schema, Field};
-use analyzer::en_pipeline;
+use analyzer::{en_pipeline, Token};
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use datastruct::stacker::{HashMap, Heap};
@@ -149,28 +149,28 @@ pub trait PostingsWriter {
                   field_values: &[&'a FieldValue],
                   heap: &Heap)
                   -> u32 {
-        let mut pos = 0u32;
+        let mut num_tokens: u32 = 0u32;
         let mut term = unsafe { Term::with_capacity(100) };
+        term.set_field(field);
-        let mut pipeline = en_pipeline();
-        for field_value in field_values {
-            pipeline.analyze(field_value.value().text(),
-                             &mut |token| {
-                                 term.set_text(&token.term);
-                                 self.suscribe(term_index, doc_id, pos, &term, heap);
-                                 pos += 1u32;
-                                 num_tokens += 1u32;
-                             });
-            // let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
-            // // right now num_tokens and pos are redundant, but it should
-            // // change when we get proper analyzers
-            // while let Some(token) = tokens.next() {
+        let mut analyzer = en_pipeline();
-            // }
-            pos += 1;
-            // THIS is to avoid phrase query accross field repetition.
-            // span queries might still match though :|
+        let mut overall_position = 0u32;
+
+        for field_value in field_values {
+            // TODO fix position when more than one value.
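+            // Token positions are shifted by `overall_position` so that tokens
+            // coming from different values of the same field never overlap; the
+            // `+ 2` gap added after each value keeps phrase queries from
+            // matching across value boundaries.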
+            let mut token_stream = analyzer.token_stream(field_value.value().text());
+            let mut local_position = 0;
+            num_tokens += {
+                let mut sink = |token: &Token| {
+                    term.set_text(token.term.as_str());
+                    local_position = token.position as u32;
+                    self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
+                };
+                token_stream.process(&mut sink)
+            };
+            overall_position += local_position + 2u32;
         }
         num_tokens
     }
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 742fc39f0..5cb65ea5c 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -8,7 +8,7 @@ use query::Occur;
 use query::TermQuery;
 use postings::SegmentPostingsOption;
 use query::PhraseQuery;
-use analyzer::{en_pipeline, TextPipeline};
+use analyzer::{en_pipeline, BoxedAnalyzer};
 use schema::{Term, FieldType};
 use std::str::FromStr;
 use std::num::ParseIntError;
@@ -74,7 +74,7 @@ pub struct QueryParser {
     schema: Schema,
     default_fields: Vec<Field>,
     conjunction_by_default: bool,
-    analyzer: Box<TextPipeline>,
+    analyzer: Box<BoxedAnalyzer>,
 }
 
 impl QueryParser {
@@ -161,12 +161,11 @@ impl QueryParser {
             FieldType::Str(ref str_options) => {
                 let mut terms: Vec<Term> = Vec::new();
                 if str_options.get_indexing_options().is_tokenized() {
-                    self.analyzer
-                        .analyze(phrase,
-                                 &mut |token| {
+                    let mut token_stream = self.analyzer.token_stream(phrase);
+                    token_stream.process(&mut |token| {
                         let term = Term::from_field_text(field, &token.term);
                         terms.push(term);
-                        });
+                    });
                 } else {
                     terms.push(Term::from_field_text(field, phrase));
                 }
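
For reference, a minimal sketch of how the boxed analyzer API introduced above is meant to be consumed (it mirrors the updated test_en_analyzer; the variable names here are illustrative only):

    let mut analyzer = en_pipeline();
    let mut terms: Vec<String> = Vec::new();
    {
        // `process` drives the token stream and hands each token to the sink,
        // returning the number of tokens pushed.
        let mut sink = |token: &Token| { terms.push(token.term.clone()); };
        analyzer.token_stream("hello, happy tax payer!").process(&mut sink);
    }
    assert_eq!(terms.len(), 4);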