Remove the concept of pipeline. Made a BoxableAnalyzer

This commit is contained in:
Paul Masurel
2017-06-10 20:06:00 +09:00
parent a7d10b65ae
commit f26874557e
7 changed files with 117 additions and 95 deletions
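In short: the old TextPipeline trait and the boxed_pipeline helper are gone. Type erasure now happens one level down, via a BoxableAnalyzer wrapper that turns any Analyzer into a Box<BoxedAnalyzer>, and callers drive the boxed TokenStream themselves through the new process helper. A minimal before/after usage sketch, assuming the crate-internal paths used elsewhere in this commit (the collect_terms helper is illustrative only):

// Before this commit, callers pushed tokens through a TextPipeline:
//
//     let mut pipeline = en_pipeline();
//     pipeline.analyze("hello world", &mut |token| { /* ... */ });
//
// After it, they obtain a boxed token stream and drive it with process():
use analyzer::{en_pipeline, Token};

fn collect_terms(text: &str) -> Vec<String> {
    // en_pipeline() now returns Box<BoxedAnalyzer> instead of Box<TextPipeline>.
    let mut analyzer = en_pipeline();
    let mut terms = Vec::new();
    {
        let mut sink = |token: &Token| terms.push(token.term.clone());
        // token_stream() erases the concrete TokenStream type behind a Box,
        // and process() pulls every token and feeds it to the sink.
        analyzer.token_stream(text).process(&mut sink);
    }
    terms
}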

View File

@@ -1,36 +1,31 @@
use std::borrow::{Borrow, BorrowMut};
pub trait TextPipeline {
fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
/// Token
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
pub offset_from: usize,
/// Offset (byte index) of the last character of the token + 1.
/// The text that generated the token should be obtained by
/// &text[token.offset_from..token.offset_to]
pub offset_to: usize,
/// Position, expressed in number of tokens.
pub position: usize,
/// Actual text content of the token.
pub term: String,
}
struct TextPipelineImpl<A>
where for<'a> A: Analyzer<'a> + 'static
{
underlying: A,
}
impl<A> TextPipeline for TextPipelineImpl<A>
where for<'a> A: Analyzer<'a> + 'static
{
fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
let mut token_stream = self.underlying.token_stream(text);
while token_stream.advance() {
sink(token_stream.token());
impl Default for Token {
fn default() -> Token {
Token {
offset_from: 0,
offset_to: 0,
position: usize::max_value(),
term: String::new(),
}
}
}
#[derive(Default)]
pub struct Token {
pub offset_from: usize,
pub offset_to: usize,
pub position: usize,
pub term: String,
}
pub trait Analyzer<'a>: Sized {
type TokenStreamImpl: TokenStream;
@@ -46,11 +41,39 @@ pub trait Analyzer<'a>: Sized {
}
}
pub trait BoxedAnalyzer {
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
}
pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
-> Box<TextPipeline + 'static> {
let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
box text_pipeline_impl
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;
impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
box self.0.token_stream(text)
}
}
pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
where A: 'static + for <'a> Analyzer<'a> {
box BoxableAnalyzer(a)
}
impl<'b> TokenStream for Box<TokenStream + 'b> {
fn advance(&mut self) -> bool {
let token_stream: &mut TokenStream = self.borrow_mut();
token_stream.advance()
}
fn token(&self) -> &Token {
let token_stream: &TokenStream = self.borrow();
token_stream.token()
}
fn token_mut(&mut self) -> &mut Token {
let token_stream: &mut TokenStream = self.borrow_mut();
token_stream.token_mut()
}
}
@@ -68,6 +91,15 @@ pub trait TokenStream {
None
}
}
fn process(&mut self, sink: &mut FnMut(&Token)) -> u32 {
let mut num_tokens_pushed = 0u32;
while self.advance() {
sink(self.token());
num_tokens_pushed += 1u32;
}
num_tokens_pushed
}
}
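BoxableAnalyzer exists purely for type erasure: every Analyzer has its own TokenStreamImpl type, and box_analyzer hides it behind Box<TokenStream> / Box<BoxedAnalyzer>. Below is a rough sketch of how a custom analyzer would plug into these traits; the names are hypothetical, and the set of required TokenStream methods is inferred from the blanket Box<TokenStream> impl above.

use super::{Analyzer, Token, TokenStream};

/// Hypothetical analyzer that emits the whole input as a single token.
pub struct WholeStringAnalyzer;

pub struct WholeStringStream {
    token: Token,
    emitted: bool,
}

impl<'a> Analyzer<'a> for WholeStringAnalyzer {
    type TokenStreamImpl = WholeStringStream;

    fn token_stream(&mut self, text: &'a str) -> WholeStringStream {
        WholeStringStream {
            token: Token {
                offset_from: 0,
                offset_to: text.len(),
                position: 0,
                term: String::from(text),
            },
            emitted: false,
        }
    }
}

impl TokenStream for WholeStringStream {
    fn advance(&mut self) -> bool {
        // Emit exactly one token, then report exhaustion.
        let first_call = !self.emitted;
        self.emitted = true;
        first_call
    }

    fn token(&self) -> &Token {
        &self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        &mut self.token
    }
}

// The impl holds for every lifetime 'a, so it satisfies for<'a> Analyzer<'a>
// and can be erased: let boxed = box_analyzer(WholeStringAnalyzer);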

View File

@@ -1,7 +1,7 @@
use super::{Token, Analyzer, TokenStream};
use tinysegmenter;
pub struct JpTokenizer;
pub struct JPTokenizer;
#[derive(Eq, PartialEq)]
enum Cursor {
@@ -10,13 +10,13 @@ enum Cursor {
Terminated,
}
pub struct JpTokenizerStream {
pub struct JPTokenizerStream {
tokens: Vec<Token>,
cursor: Cursor,
}
impl<'a> Analyzer<'a> for JpTokenizer {
type TokenStreamImpl = JpTokenizerStream;
impl<'a> Analyzer<'a> for JPTokenizer {
type TokenStreamImpl = JPTokenizerStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let mut tokens = vec![];
@@ -32,14 +32,14 @@ impl<'a> Analyzer<'a> for JpTokenizer {
term: term,
});
}
JpTokenizerStream {
JPTokenizerStream {
tokens: tokens,
cursor: Cursor::HasNotStarted,
}
}
}
impl<'a> TokenStream for JpTokenizerStream {
impl<'a> TokenStream for JPTokenizerStream {
fn advance(&mut self) -> bool {
let new_cursor = match self.cursor {
Cursor::HasNotStarted => {
@@ -60,7 +60,7 @@ impl<'a> TokenStream for JpTokenizerStream {
Cursor::Terminated => Cursor::Terminated,
};
self.cursor = new_cursor;
return self.cursor != Cursor::Terminated;
self.cursor != Cursor::Terminated
}

View File

@@ -33,9 +33,9 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
fn advance(&mut self) -> bool {
if self.tail.advance() {
self.tail.token_mut().term.make_ascii_lowercase();
return true;
true
} else {
return false;
false
}
}
}
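One thing to keep in mind about the call above: make_ascii_lowercase mutates the term in place but only folds ASCII letters, so non-ASCII characters pass through unchanged. A quick standalone check (plain std, independent of this crate):

fn main() {
    let mut term = String::from("Grüße");
    term.make_ascii_lowercase();
    // Only the ASCII 'G' is folded; 'ü' and 'ß' are left untouched.
    assert_eq!(term, "grüße");
}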

View File

@@ -8,27 +8,32 @@ mod remove_nonalphanum;
mod stemmer;
mod jp_tokenizer;
pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
TokenStream};
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::jp_tokenizer::JpTokenizer;
pub use self::jp_tokenizer::JPTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
pub use self::analyzer::BoxedAnalyzer;
pub fn en_pipeline<'a>() -> Box<TextPipeline> {
boxed_pipeline(SimpleTokenizer
pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
box_analyzer(
SimpleTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(LowerCaser)
.filter(Stemmer::new()))
.filter(Stemmer::new())
)
}
pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
boxed_pipeline(JpTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(RemoveNonAlphaFilter))
pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
box_analyzer(
JPTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(RemoveNonAlphaFilter)
)
}
#[cfg(test)]
@@ -37,11 +42,11 @@ mod test {
#[test]
fn test_en_analyzer() {
let mut pipeline = en_pipeline();
let mut en_analyzer = en_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze("hello, happy tax payer!", &mut add_token);
en_analyzer.token_stream("hello, happy tax payer!").process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_eq!(&tokens[0], "hello");
@@ -50,14 +55,13 @@ mod test {
assert_eq!(&tokens[3], "payer");
}
#[test]
fn test_jp_analyzer() {
let mut pipeline = jp_pipeline();
let mut jp_analyzer = jp_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze("野菜食べないとやばい!", &mut add_token);
en_analyzer.token_stream("野菜食べないとやばい!").process(&mut add_token);
}
assert_eq!(tokens.len(), 5);
assert_eq!(&tokens[0], "野菜");
@@ -67,15 +71,14 @@ mod test {
assert_eq!(&tokens[4], "やばい");
}
#[test]
fn test_tokenizer_empty() {
let mut pipeline = en_pipeline();
let mut en_analyzer = en_pipeline();
{
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze(" ", &mut add_token);
en_analyzer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
@@ -83,22 +86,10 @@ mod test {
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze(" ", &mut add_token);
en_analyzer.token_stream(" ").process(&mut add_token);
}
assert!(tokens.is_empty());
}
}
#[test]
fn test_tokenizer_cjkchars() {
let mut pipeline = en_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze("hello,中国人民", &mut add_token);
}
assert_eq!(tokens.len(), 2);
assert_eq!(tokens, vec!["hello", "中国人民"]);
}
}
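en_pipeline and jp_pipeline now simply compose a tokenizer with filters statically and erase the result through box_analyzer. A different chain can be built the same way; the sketch below is a hypothetical extra pipeline for this module, with an arbitrarily chosen filter set:

pub fn ascii_pipeline() -> Box<BoxedAnalyzer> {
    box_analyzer(
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser)
    )
}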

View File

@@ -35,7 +35,7 @@ impl<'a> SimpleTokenStream<'a> {
impl<'a> TokenStream for SimpleTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.term.clear();
self.token.position += 1;
self.token.position = self.token.position.wrapping_add(1);
loop {
match self.chars.next() {
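The switch to wrapping_add matters because the Default token (see the Default impl in the first file) starts at position usize::max_value(): the first advance is expected to wrap that to 0, and a plain + 1 would overflow and panic in debug builds. A one-line check of the intended wrap-around:

fn main() {
    // usize::max_value() + 1 wraps to 0, so the first token gets position 0.
    assert_eq!(usize::max_value().wrapping_add(1), 0);
}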

View File

@@ -6,7 +6,7 @@ use std::io;
use postings::Recorder;
use Result;
use schema::{Schema, Field};
use analyzer::en_pipeline;
use analyzer::{en_pipeline, Token};
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
@@ -149,28 +149,28 @@ pub trait PostingsWriter {
field_values: &[&'a FieldValue],
heap: &Heap)
-> u32 {
let mut pos = 0u32;
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut pipeline = en_pipeline();
for field_value in field_values {
pipeline.analyze(field_value.value().text(),
&mut |token| {
term.set_text(&token.term);
self.suscribe(term_index, doc_id, pos, &term, heap);
pos += 1u32;
num_tokens += 1u32;
});
// let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
// // right now num_tokens and pos are redundant, but it should
// // change when we get proper analyzers
// while let Some(token) = tokens.next() {
let mut analyzer = en_pipeline();
// }
pos += 1;
// THIS is to avoid phrase query across field repetition.
// span queries might still match though :|
let mut overall_position = 0u32;
for field_value in field_values {
// TODO fix position when more than one value.
let mut token_stream = analyzer.token_stream(field_value.value().text());
let mut local_position = 0;
num_tokens += {
let mut sink = |token: &Token| {
term.set_text(token.term.as_str());
local_position = token.position as u32;
self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
};
token_stream.process(&mut sink)
};
overall_position += local_position + 2u32;
}
num_tokens
}
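The overall_position += local_position + 2 update is what the comment above is about: it leaves at least one unassigned position between consecutive values of the same field, so an exact phrase query cannot match across a value boundary. A small hypothetical walk-through of the arithmetic (token texts and positions invented for illustration):

// Field with two values: ["new", "york"] and ["san", "francisco"].
// Value 1: tokens at overall positions 0 and 1; local_position ends at 1,
//          so overall_position becomes 1 + 2 = 3.
// Value 2: tokens at overall positions 3 + 0 = 3 and 3 + 1 = 4.
// The phrase "york san" would need two adjacent positions (p, p + 1),
// but position 2 is never assigned, so it cannot straddle the two values.
fn next_value_offset(overall_position: u32, last_local_position: u32) -> u32 {
    overall_position + last_local_position + 2
}

fn main() {
    assert_eq!(next_value_offset(0, 1), 3); // offset for the second value
}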

View File

@@ -8,7 +8,7 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::{en_pipeline, TextPipeline};
use analyzer::{en_pipeline, BoxedAnalyzer};
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -74,7 +74,7 @@ pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
analyzer: Box<TextPipeline>,
analyzer: Box<BoxedAnalyzer>,
}
impl QueryParser {
@@ -161,12 +161,11 @@ impl QueryParser {
FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
self.analyzer
.analyze(phrase,
&mut |token| {
let mut token_stream = self.analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.term);
terms.push(term);
});
});
} else {
terms.push(Term::from_field_text(field, phrase));
}