From a7d10b65ae43307c14dde8a540a21a3c6379aa0b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Fri, 9 Jun 2017 10:30:15 +0900 Subject: [PATCH] Added support for Japanese. --- Cargo.toml | 1 + src/analyzer/analyzer.rs | 62 ++++++++++++----- src/analyzer/jp_tokenizer.rs | 91 +++++++++++++++++++++++++ src/analyzer/lower_caser.rs | 33 ++++----- src/analyzer/mod.rs | 94 ++++++++++++++++++++------ src/analyzer/remove_long.rs | 38 +++++------ src/analyzer/remove_nonalphanum.rs | 58 ++++++++++++++++ src/analyzer/simple_tokenizer.rs | 14 ++-- src/analyzer/stemmer.rs | 38 +++++------ src/lib.rs | 1 + src/postings/postings_writer.rs | 26 ++++--- src/query/query_parser/query_parser.rs | 18 ++--- 12 files changed, 347 insertions(+), 127 deletions(-) create mode 100644 src/analyzer/jp_tokenizer.rs create mode 100644 src/analyzer/remove_nonalphanum.rs diff --git a/Cargo.toml b/Cargo.toml index f760f3f9d..c4d27134c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -16,6 +16,7 @@ keywords = ["search", "information", "retrieval"] byteorder = "1.0" memmap = "0.4" lazy_static = "0.2.1" +tinysegmenter = "0.1.0" regex = "0.2" fst = "0.1.37" atomicwrites = "0.1.3" diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs index c1a916a1d..9b889eb13 100644 --- a/src/analyzer/analyzer.rs +++ b/src/analyzer/analyzer.rs @@ -1,6 +1,27 @@ +pub trait TextPipeline { + fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)); +} + + +struct TextPipelineImpl + where for<'a> A: Analyzer<'a> + 'static +{ + underlying: A, +} + +impl TextPipeline for TextPipelineImpl + where for<'a> A: Analyzer<'a> + 'static +{ + fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) { + let mut token_stream = self.underlying.token_stream(text); + while token_stream.advance() { + sink(token_stream.token()); + } + } +} #[derive(Default)] pub struct Token { @@ -11,33 +32,39 @@ pub struct Token { } pub trait Analyzer<'a>: Sized { - type TokenStreamImpl: TokenStream; - fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl; + fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl; fn filter(self, new_filter: NewFilter) -> ChainAnalyzer - where NewFilter: TokenFilterFactory<>::TokenStreamImpl> { + where NewFilter: TokenFilterFactory<>::TokenStreamImpl> + { ChainAnalyzer { head: new_filter, - tail: self + tail: self, } } } -pub trait TokenStream { +pub fn boxed_pipeline Analyzer<'a>>(analyzer: A) + -> Box { + let text_pipeline_impl = TextPipelineImpl { underlying: analyzer }; + box text_pipeline_impl +} + + +pub trait TokenStream { fn advance(&mut self) -> bool; - + fn token(&self) -> &Token; - + fn token_mut(&mut self) -> &mut Token; fn next(&mut self) -> Option<&Token> { if self.advance() { Some(self.token()) - } - else { + } else { None } } @@ -46,27 +73,26 @@ pub trait TokenStream { pub struct ChainAnalyzer { head: HeadTokenFilterFactory, - tail: TailAnalyzer + tail: TailAnalyzer, } -impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> for ChainAnalyzer +impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> + for ChainAnalyzer where HeadTokenFilterFactory: TokenFilterFactory, - TailAnalyzer: Analyzer<'a> { - + TailAnalyzer: Analyzer<'a> +{ type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream; - - fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl { - let tail_token_stream = self.tail.analyze(text); + + fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + let tail_token_stream = self.tail.token_stream(text); self.head.transform(tail_token_stream) } } pub trait 
TokenFilterFactory { - type ResultTokenStream: TokenStream; fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream; } - diff --git a/src/analyzer/jp_tokenizer.rs b/src/analyzer/jp_tokenizer.rs new file mode 100644 index 000000000..6bf508959 --- /dev/null +++ b/src/analyzer/jp_tokenizer.rs @@ -0,0 +1,91 @@ +use super::{Token, Analyzer, TokenStream}; +use tinysegmenter; + +pub struct JpTokenizer; + +#[derive(Eq, PartialEq)] +enum Cursor { + HasNotStarted, + Cursor(usize), + Terminated, +} + +pub struct JpTokenizerStream { + tokens: Vec, + cursor: Cursor, +} + +impl<'a> Analyzer<'a> for JpTokenizer { + type TokenStreamImpl = JpTokenizerStream; + + fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + let mut tokens = vec![]; + let mut offset_from; + let mut offset_to = 0; + for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() { + offset_from = offset_to; + offset_to = offset_from + term.len(); + tokens.push(Token { + offset_from: offset_from, + offset_to: offset_to, + position: pos, + term: term, + }); + } + JpTokenizerStream { + tokens: tokens, + cursor: Cursor::HasNotStarted, + } + } +} + +impl<'a> TokenStream for JpTokenizerStream { + fn advance(&mut self) -> bool { + let new_cursor = match self.cursor { + Cursor::HasNotStarted => { + if self.tokens.len() > 0 { + Cursor::Cursor(0) + } else { + Cursor::Terminated + } + } + Cursor::Cursor(pos) => { + let new_pos = pos + 1; + if new_pos >= self.tokens.len() { + Cursor::Terminated + } else { + Cursor::Cursor(new_pos) + } + } + Cursor::Terminated => Cursor::Terminated, + }; + self.cursor = new_cursor; + return self.cursor != Cursor::Terminated; + } + + + fn token(&self) -> &Token { + match self.cursor { + Cursor::Terminated => { + panic!("You called .token(), after the end of the token stream has been reached"); + } + Cursor::Cursor(i) => &self.tokens[i], + Cursor::HasNotStarted => { + panic!("You called .token(), before having called `.advance()`."); + } + } + + } + + fn token_mut(&mut self) -> &mut Token { + match self.cursor { + Cursor::Terminated => { + panic!("You called .token(), after the end of the token stream has been reached"); + } + Cursor::Cursor(i) => &mut self.tokens[i], + Cursor::HasNotStarted => { + panic!("You called .token(), before having called `.advance()`."); + } + } + } +} diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs index dda5f597b..e0cb86861 100644 --- a/src/analyzer/lower_caser.rs +++ b/src/analyzer/lower_caser.rs @@ -3,9 +3,9 @@ use std::ascii::AsciiExt; pub struct LowerCaser; -impl TokenFilterFactory for LowerCaser - where TailTokenStream: TokenStream { - +impl TokenFilterFactory for LowerCaser + where TailTokenStream: TokenStream +{ type ResultTokenStream = LowerCaserTokenStream; fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { @@ -13,18 +13,19 @@ impl TokenFilterFactory for LowerCaser } } -pub struct LowerCaserTokenStream - where TailTokenStream: TokenStream { +pub struct LowerCaserTokenStream + where TailTokenStream: TokenStream +{ tail: TailTokenStream, } impl TokenStream for LowerCaserTokenStream - where TailTokenStream: TokenStream { - + where TailTokenStream: TokenStream +{ fn token(&self) -> &Token { self.tail.token() } - + fn token_mut(&mut self) -> &mut Token { self.tail.token_mut() } @@ -33,22 +34,16 @@ impl TokenStream for LowerCaserTokenStream if self.tail.advance() { self.tail.token_mut().term.make_ascii_lowercase(); return true; - } - else { + } else { return false; } } } impl 
LowerCaserTokenStream - where TailTokenStream: TokenStream { - - + where TailTokenStream: TokenStream +{ fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { - LowerCaserTokenStream { - tail: tail, - } - } + LowerCaserTokenStream { tail: tail } + } } - - diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs index 1d2974f29..63d1cf116 100644 --- a/src/analyzer/mod.rs +++ b/src/analyzer/mod.rs @@ -4,51 +4,101 @@ mod analyzer; mod simple_tokenizer; mod lower_caser; mod remove_long; +mod remove_nonalphanum; mod stemmer; +mod jp_tokenizer; -pub use self::analyzer::{Analyzer, Token, TokenFilterFactory, TokenStream}; +pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory, + TokenStream}; pub use self::simple_tokenizer::SimpleTokenizer; +pub use self::jp_tokenizer::JpTokenizer; pub use self::remove_long::RemoveLongFilter; pub use self::lower_caser::LowerCaser; pub use self::stemmer::Stemmer; +pub use self::remove_nonalphanum::RemoveNonAlphaFilter; +pub fn en_pipeline<'a>() -> Box { + boxed_pipeline(SimpleTokenizer + .filter(RemoveLongFilter::limit(20)) + .filter(LowerCaser) + .filter(Stemmer::new())) +} -pub fn en_analyzer<'a>() -> impl Analyzer<'a> { - SimpleTokenizer - .filter(RemoveLongFilter::limit(20)) - .filter(LowerCaser) +pub fn jp_pipeline<'a>() -> Box { + boxed_pipeline(JpTokenizer + .filter(RemoveLongFilter::limit(20)) + .filter(RemoveNonAlphaFilter)) } #[cfg(test)] mod test { - use super::{Analyzer, TokenStream, en_analyzer}; + use super::{en_pipeline, jp_pipeline, Token}; #[test] - fn test_tokenizer() { - let mut analyzer = en_analyzer(); - let mut terms = analyzer.analyze("hello, happy tax payer!"); - assert_eq!(terms.next().unwrap().term, "hello"); - assert_eq!(terms.next().unwrap().term, "happy"); - assert_eq!(terms.next().unwrap().term, "tax"); - assert_eq!(terms.next().unwrap().term, "payer"); - assert!(terms.next().is_none()); + fn test_en_analyzer() { + let mut pipeline = en_pipeline(); + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; + pipeline.analyze("hello, happy tax payer!", &mut add_token); + } + assert_eq!(tokens.len(), 4); + assert_eq!(&tokens[0], "hello"); + assert_eq!(&tokens[1], "happi"); + assert_eq!(&tokens[2], "tax"); + assert_eq!(&tokens[3], "payer"); } + + #[test] + fn test_jp_analyzer() { + let mut pipeline = jp_pipeline(); + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; + pipeline.analyze("野菜食べないとやばい!", &mut add_token); + } + assert_eq!(tokens.len(), 5); + assert_eq!(&tokens[0], "野菜"); + assert_eq!(&tokens[1], "食べ"); + assert_eq!(&tokens[2], "ない"); + assert_eq!(&tokens[3], "と"); + assert_eq!(&tokens[4], "やばい"); + } + + #[test] fn test_tokenizer_empty() { - let mut terms = en_analyzer().analyze(""); - assert!(terms.next().is_none()); + let mut pipeline = en_pipeline(); + { + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; + pipeline.analyze(" ", &mut add_token); + } + assert!(tokens.is_empty()); + } + { + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; + pipeline.analyze(" ", &mut add_token); + } + assert!(tokens.is_empty()); + } } #[test] fn test_tokenizer_cjkchars() { - let mut terms = en_analyzer().analyze("hello,中国人民"); - assert_eq!(terms.next().unwrap().term, "hello"); - assert_eq!(terms.next().unwrap().term, "中国人民"); - assert!(terms.next().is_none()); + let mut 
pipeline = en_pipeline(); + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; + pipeline.analyze("hello,中国人民", &mut add_token); + } + assert_eq!(tokens.len(), 2); + assert_eq!(tokens, vec!["hello", "中国人民"]); } - } - diff --git a/src/analyzer/remove_long.rs b/src/analyzer/remove_long.rs index b4b4b4e0e..98b73b973 100644 --- a/src/analyzer/remove_long.rs +++ b/src/analyzer/remove_long.rs @@ -5,34 +5,34 @@ pub struct RemoveLongFilter { length_limit: usize, } -impl RemoveLongFilter { +impl RemoveLongFilter { // the limit is in bytes of the UTF-8 representation. pub fn limit(length_limit: usize) -> RemoveLongFilter { - RemoveLongFilter { - length_limit: length_limit, - } + RemoveLongFilter { length_limit: length_limit } } } impl RemoveLongFilterStream - where TailTokenStream: TokenStream { - + where TailTokenStream: TokenStream +{ fn predicate(&self, token: &Token) -> bool { token.term.len() < self.token_length_limit } - fn wrap(token_length_limit: usize, tail: TailTokenStream) -> RemoveLongFilterStream { + fn wrap(token_length_limit: usize, + tail: TailTokenStream) + -> RemoveLongFilterStream { RemoveLongFilterStream { token_length_limit: token_length_limit, tail: tail, } - } + } } -impl TokenFilterFactory for RemoveLongFilter - where TailTokenStream: TokenStream { - +impl TokenFilterFactory for RemoveLongFilter + where TailTokenStream: TokenStream +{ type ResultTokenStream = RemoveLongFilterStream; fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { @@ -40,16 +40,16 @@ impl TokenFilterFactory for RemoveLongFilter } } -pub struct RemoveLongFilterStream - where TailTokenStream: TokenStream { - +pub struct RemoveLongFilterStream + where TailTokenStream: TokenStream +{ token_length_limit: usize, tail: TailTokenStream, } impl TokenStream for RemoveLongFilterStream - where TailTokenStream: TokenStream { - + where TailTokenStream: TokenStream +{ fn token(&self) -> &Token { self.tail.token() } @@ -64,11 +64,9 @@ impl TokenStream for RemoveLongFilterStream if self.predicate(self.tail.token()) { return true; } - } - else { + } else { return false; } } } - -} \ No newline at end of file +} diff --git a/src/analyzer/remove_nonalphanum.rs b/src/analyzer/remove_nonalphanum.rs new file mode 100644 index 000000000..ede810680 --- /dev/null +++ b/src/analyzer/remove_nonalphanum.rs @@ -0,0 +1,58 @@ +use super::{TokenFilterFactory, TokenStream, Token}; + + +pub struct RemoveNonAlphaFilter; + +impl RemoveNonAlphaFilterStream + where TailTokenStream: TokenStream +{ + fn predicate(&self, token: &Token) -> bool { + for c in token.term.chars() { + if !c.is_alphanumeric() { + return false; + } + } + true + } +} + + +impl TokenFilterFactory for RemoveNonAlphaFilter + where TailTokenStream: TokenStream +{ + type ResultTokenStream = RemoveNonAlphaFilterStream; + + fn transform(&self, tail: TailTokenStream) -> Self::ResultTokenStream { + RemoveNonAlphaFilterStream { tail: tail } + } +} + +pub struct RemoveNonAlphaFilterStream + where TailTokenStream: TokenStream +{ + tail: TailTokenStream, +} + +impl TokenStream for RemoveNonAlphaFilterStream + where TailTokenStream: TokenStream +{ + fn token(&self) -> &Token { + self.tail.token() + } + + fn token_mut(&mut self) -> &mut Token { + self.tail.token_mut() + } + + fn advance(&mut self) -> bool { + loop { + if self.tail.advance() { + if self.predicate(self.tail.token()) { + return true; + } + } else { + return false; + } + } + } +} diff --git a/src/analyzer/simple_tokenizer.rs 
b/src/analyzer/simple_tokenizer.rs index 2d5b27907..96b71c5dd 100644 --- a/src/analyzer/simple_tokenizer.rs +++ b/src/analyzer/simple_tokenizer.rs @@ -7,14 +7,13 @@ pub struct SimpleTokenizer; pub struct SimpleTokenStream<'a> { text: &'a str, chars: CharIndices<'a>, - token: Token, + token: Token, } impl<'a> Analyzer<'a> for SimpleTokenizer { - type TokenStreamImpl = SimpleTokenStream<'a>; - fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl { + fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { SimpleTokenStream { text: text, chars: text.char_indices(), @@ -24,10 +23,9 @@ impl<'a> Analyzer<'a> for SimpleTokenizer { } impl<'a> SimpleTokenStream<'a> { - fn token_limit(&mut self) -> usize { (&mut self.chars) - .filter(|&(_, ref c)| !c.is_alphanumeric()) + .filter(|&(_, ref c)| !c.is_alphanumeric()) .map(|(offset, _)| offset) .next() .unwrap_or(self.text.len()) @@ -35,7 +33,6 @@ impl<'a> SimpleTokenStream<'a> { } impl<'a> TokenStream for SimpleTokenStream<'a> { - fn advance(&mut self) -> bool { self.token.term.clear(); self.token.position += 1; @@ -57,7 +54,7 @@ impl<'a> TokenStream for SimpleTokenStream<'a> { } } } - + fn token(&self) -> &Token { &self.token } @@ -65,5 +62,4 @@ impl<'a> TokenStream for SimpleTokenStream<'a> { fn token_mut(&mut self) -> &mut Token { &mut self.token } - -} \ No newline at end of file +} diff --git a/src/analyzer/stemmer.rs b/src/analyzer/stemmer.rs index 4988d8325..82e3e5ac3 100644 --- a/src/analyzer/stemmer.rs +++ b/src/analyzer/stemmer.rs @@ -1,6 +1,6 @@ use std::sync::Arc; use super::{TokenFilterFactory, TokenStream, Token}; -use rust_stemmers::{Algorithm, self}; +use rust_stemmers::{self, Algorithm}; pub struct Stemmer { stemmer: Arc, @@ -9,15 +9,13 @@ pub struct Stemmer { impl Stemmer { pub fn new() -> Stemmer { let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English); - Stemmer { - stemmer: Arc::new(inner_stemmer), - } + Stemmer { stemmer: Arc::new(inner_stemmer) } } } -impl TokenFilterFactory for Stemmer - where TailTokenStream: TokenStream { - +impl TokenFilterFactory for Stemmer + where TailTokenStream: TokenStream +{ type ResultTokenStream = StemmerTokenStream; fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream { @@ -26,19 +24,20 @@ impl TokenFilterFactory for Stemmer } -pub struct StemmerTokenStream - where TailTokenStream: TokenStream { +pub struct StemmerTokenStream + where TailTokenStream: TokenStream +{ tail: TailTokenStream, stemmer: Arc, } impl TokenStream for StemmerTokenStream - where TailTokenStream: TokenStream { - + where TailTokenStream: TokenStream +{ fn token(&self) -> &Token { self.tail.token() } - + fn token_mut(&mut self) -> &mut Token { self.tail.token_mut() } @@ -50,20 +49,21 @@ impl TokenStream for StemmerTokenStream self.token_mut().term.clear(); self.token_mut().term.push_str(&stemmed_str); true - } - else { + } else { false } } } impl StemmerTokenStream - where TailTokenStream: TokenStream { - - fn wrap(stemmer: Arc, tail: TailTokenStream) -> StemmerTokenStream { + where TailTokenStream: TokenStream +{ + fn wrap(stemmer: Arc, + tail: TailTokenStream) + -> StemmerTokenStream { StemmerTokenStream { tail: tail, stemmer: stemmer, } - } -} \ No newline at end of file + } +} diff --git a/src/lib.rs b/src/lib.rs index 592bc414a..e9d00fa3c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -79,6 +79,7 @@ extern crate test; #[cfg(test)] extern crate rand; +extern crate tinysegmenter; #[cfg(test)] mod functional_test; diff --git a/src/postings/postings_writer.rs 
b/src/postings/postings_writer.rs index 812490fff..9981be093 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -4,10 +4,9 @@ use schema::FieldValue; use postings::PostingsSerializer; use std::io; use postings::Recorder; -use analyzer::SimpleTokenizer; use Result; use schema::{Schema, Field}; -use analyzer::{TokenStream, Analyzer}; +use analyzer::en_pipeline; use std::marker::PhantomData; use std::ops::DerefMut; use datastruct::stacker::{HashMap, Heap}; @@ -154,16 +153,21 @@ pub trait PostingsWriter { let mut num_tokens: u32 = 0u32; let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); + let mut pipeline = en_pipeline(); for field_value in field_values { - let mut tokens = SimpleTokenizer.analyze(field_value.value().text()); - // right now num_tokens and pos are redundant, but it should - // change when we get proper analyzers - while let Some(token) = tokens.next() { - term.set_text(&token.term); - self.suscribe(term_index, doc_id, pos, &term, heap); - pos += 1u32; - num_tokens += 1u32; - } + pipeline.analyze(field_value.value().text(), + &mut |token| { + term.set_text(&token.term); + self.suscribe(term_index, doc_id, pos, &term, heap); + pos += 1u32; + num_tokens += 1u32; + }); + // let mut tokens = SimpleTokenizer.token_stream(field_value.value().text()); + // // right now num_tokens and pos are redundant, but it should + // // change when we get proper analyzers + // while let Some(token) = tokens.next() { + + // } pos += 1; // THIS is to avoid phrase query accross field repetition. // span queries might still match though :| diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 8c714ec08..742fc39f0 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -8,11 +8,10 @@ use query::Occur; use query::TermQuery; use postings::SegmentPostingsOption; use query::PhraseQuery; -use analyzer::{SimpleTokenizer, TokenStream}; +use analyzer::{en_pipeline, TextPipeline}; use schema::{Term, FieldType}; use std::str::FromStr; use std::num::ParseIntError; -use analyzer::Analyzer; /// Possible error that may happen when parsing a query. @@ -75,7 +74,7 @@ pub struct QueryParser { schema: Schema, default_fields: Vec, conjunction_by_default: bool, - analyzer: Box, + analyzer: Box, } impl QueryParser { @@ -88,7 +87,7 @@ impl QueryParser { schema: schema, default_fields: default_fields, conjunction_by_default: false, - analyzer: box SimpleTokenizer, + analyzer: en_pipeline(), } } @@ -162,11 +161,12 @@ impl QueryParser { FieldType::Str(ref str_options) => { let mut terms: Vec = Vec::new(); if str_options.get_indexing_options().is_tokenized() { - let mut token_iter = self.analyzer.analyze(phrase); - while let Some(token) = token_iter.next() { - let term = Term::from_field_text(field, &token.term); - terms.push(term); - } + self.analyzer + .analyze(phrase, + &mut |token| { + let term = Term::from_field_text(field, &token.term); + terms.push(term); + }); } else { terms.push(Term::from_field_text(field, phrase)); }
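
Usage sketch: the new `TextPipeline` trait replaces direct `TokenStream` iteration with a sink callback, which is what lets `QueryParser` and `PostingsWriter` hold a `Box<TextPipeline>` without naming the analyzer's lifetime-parameterized stream type. The example below exercises `jp_pipeline()` the same way the `test_jp_analyzer` test above does; it assumes the `analyzer` module and its re-exports are reachable from the crate root, which the hunks above do not show.

    // Sketch only: `tantivy::analyzer` being publicly reachable is an assumption,
    // not something established by this patch.
    extern crate tantivy;

    use tantivy::analyzer::{jp_pipeline, Token};

    fn main() {
        // JpTokenizer -> RemoveLongFilter(20) -> RemoveNonAlphaFilter, boxed as a TextPipeline.
        let mut pipeline = jp_pipeline();

        // `analyze` feeds every token to the sink closure instead of returning a TokenStream.
        let mut terms: Vec<String> = Vec::new();
        {
            let mut sink = |token: &Token| terms.push(token.term.clone());
            pipeline.analyze("野菜食べないとやばい!", &mut sink);
        }

        // Segmentation comes from tinysegmenter, matching the expectations in test_jp_analyzer.
        assert_eq!(terms, vec!["野菜", "食べ", "ない", "と", "やばい"]);
    }

The sink-based design keeps the trait object-safe: each concrete `Analyzer` still produces its own statically typed `TokenStream`, but `boxed_pipeline` hides that type behind `TextPipeline::analyze`, so callers only ever see `&Token` values pushed into their closure.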