Added support for Japanese.

This commit is contained in:
Paul Masurel
2017-06-09 10:30:15 +09:00
parent e120e3b7aa
commit a7d10b65ae
12 changed files with 347 additions and 127 deletions

View File

@@ -16,6 +16,7 @@ keywords = ["search", "information", "retrieval"]
byteorder = "1.0"
memmap = "0.4"
lazy_static = "0.2.1"
tinysegmenter = "0.1.0"
regex = "0.2"
fst = "0.1.37"
atomicwrites = "0.1.3"

View File

@@ -1,6 +1,27 @@
/// A text-analysis pipeline that reports its tokens through a callback.
///
/// The `sink` parameter is a trait object, and `analyze` has no generic parameters.
pub trait TextPipeline {
/// Analyzes `text`, invoking `sink` with each token the pipeline produces.
fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
}
/// Private wrapper around an analyzer `A`.
///
/// `A` must implement `Analyzer<'a>` for every lifetime `'a` and must itself be `'static`.
struct TextPipelineImpl<A>
where for<'a> A: Analyzer<'a> + 'static
{
/// The wrapped analyzer.
underlying: A,
}
impl<A> TextPipeline for TextPipelineImpl<A>
    where for<'a> A: Analyzer<'a> + 'static
{
    /// Tokenizes `text` with the wrapped analyzer and forwards each token to `sink`,
    /// in the order the token stream yields them.
    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
        let mut stream = self.underlying.token_stream(text);
        loop {
            // `advance` returns false once the stream has no more tokens.
            if !stream.advance() {
                break;
            }
            sink(stream.token());
        }
    }
}
#[derive(Default)]
pub struct Token {
@@ -11,33 +32,39 @@ pub struct Token {
}
pub trait Analyzer<'a>: Sized {
type TokenStreamImpl: TokenStream;
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainAnalyzer<NewFilter, Self>
where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl> {
where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl>
{
ChainAnalyzer {
head: new_filter,
tail: self
tail: self,
}
}
}
pub trait TokenStream {
/// Wraps `analyzer` in a `TextPipelineImpl` and returns it as a boxed `TextPipeline`
/// trait object, hiding the concrete analyzer type from the caller.
pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
                                                         -> Box<TextPipeline + 'static> {
    Box::new(TextPipelineImpl { underlying: analyzer })
}
pub trait TokenStream {
fn advance(&mut self) -> bool;
fn token(&self) -> &Token;
fn token_mut(&mut self) -> &mut Token;
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
}
else {
} else {
None
}
}
@@ -46,27 +73,26 @@ pub trait TokenStream {
pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
head: HeadTokenFilterFactory,
tail: TailAnalyzer
tail: TailAnalyzer,
}
impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
where HeadTokenFilterFactory: TokenFilterFactory<TailAnalyzer::TokenStreamImpl>,
TailAnalyzer: Analyzer<'a> {
TailAnalyzer: Analyzer<'a>
{
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let tail_token_stream = self.tail.analyze(text);
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let tail_token_stream = self.tail.token_stream(text);
self.head.transform(tail_token_stream)
}
}
/// Factory that turns a token stream of type `TailTokenStream` into another token stream.
pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
/// The type of token stream returned by `transform`.
type ResultTokenStream: TokenStream;
/// Takes ownership of `token_stream` and returns the resulting `ResultTokenStream`.
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
}

View File

@@ -0,0 +1,91 @@
use super::{Token, Analyzer, TokenStream};
use tinysegmenter;
/// Tokenizer whose `Analyzer` implementation segments text with the `tinysegmenter` crate.
pub struct JpTokenizer;
/// Iteration state of a `JpTokenizerStream`.
#[derive(Eq, PartialEq)]
enum Cursor {
/// `advance()` has not been called yet; there is no current token.
HasNotStarted,
/// Index of the current token within the stream's token list.
Cursor(usize),
/// The end of the token list has been reached; there is no current token.
Terminated,
}
/// Token stream returned by `JpTokenizer`.
///
/// All tokens are computed up front when the stream is created; iterating only moves
/// a cursor over the stored list.
pub struct JpTokenizerStream {
/// Every token of the analyzed text, in order.
tokens: Vec<Token>,
/// Where the iteration currently stands within `tokens`.
cursor: Cursor,
}
impl<'a> Analyzer<'a> for JpTokenizer {
type TokenStreamImpl = JpTokenizerStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let mut tokens = vec![];
let mut offset_from;
let mut offset_to = 0;
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
offset_from = offset_to;
offset_to = offset_from + term.len();
tokens.push(Token {
offset_from: offset_from,
offset_to: offset_to,
position: pos,
term: term,
});
}
JpTokenizerStream {
tokens: tokens,
cursor: Cursor::HasNotStarted,
}
}
}
impl TokenStream for JpTokenizerStream {
    /// Moves the cursor to the next token.
    ///
    /// Returns `true` if a current token is available afterwards, and `false` once the
    /// token list is exhausted. After it has returned `false`, it keeps returning `false`.
    fn advance(&mut self) -> bool {
        let next_index = match self.cursor {
            Cursor::HasNotStarted => 0,
            Cursor::Cursor(index) => index + 1,
            // Already at the end: the state does not change any more.
            Cursor::Terminated => return false,
        };
        let has_token = next_index < self.tokens.len();
        self.cursor = if has_token {
            Cursor::Cursor(next_index)
        } else {
            Cursor::Terminated
        };
        has_token
    }

    /// Returns the current token.
    ///
    /// # Panics
    ///
    /// Panics if called before the first `advance()` or after `advance()` returned `false`.
    fn token(&self) -> &Token {
        match self.cursor {
            Cursor::Cursor(index) => &self.tokens[index],
            Cursor::HasNotStarted => {
                panic!("You called .token(), before having called `.advance()`.");
            }
            Cursor::Terminated => {
                panic!("You called .token(), after the end of the token stream has been reached");
            }
        }
    }

    /// Returns the current token mutably.
    ///
    /// # Panics
    ///
    /// Panics if called before the first `advance()` or after `advance()` returned `false`.
    fn token_mut(&mut self) -> &mut Token {
        match self.cursor {
            Cursor::Cursor(index) => &mut self.tokens[index],
            // The messages name `.token_mut()` so that a panic points at the real call site.
            Cursor::HasNotStarted => {
                panic!("You called .token_mut(), before having called `.advance()`.");
            }
            Cursor::Terminated => {
                panic!("You called .token_mut(), after the end of the token stream has been \
                        reached");
            }
        }
    }
}

View File

@@ -3,9 +3,9 @@ use std::ascii::AsciiExt;
pub struct LowerCaser;
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
where TailTokenStream: TokenStream {
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
where TailTokenStream: TokenStream
{
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -13,18 +13,19 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
}
}
pub struct LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
pub struct LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
{
tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
where TailTokenStream: TokenStream
{
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
@@ -33,22 +34,16 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
if self.tail.advance() {
self.tail.token_mut().term.make_ascii_lowercase();
return true;
}
else {
} else {
return false;
}
}
}
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
where TailTokenStream: TokenStream
{
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
LowerCaserTokenStream {
tail: tail,
}
}
LowerCaserTokenStream { tail: tail }
}
}

View File

@@ -4,51 +4,101 @@ mod analyzer;
mod simple_tokenizer;
mod lower_caser;
mod remove_long;
mod remove_nonalphanum;
mod stemmer;
mod jp_tokenizer;
pub use self::analyzer::{Analyzer, Token, TokenFilterFactory, TokenStream};
pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
TokenStream};
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::jp_tokenizer::JpTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
/// Builds the English text pipeline.
///
/// Chains `SimpleTokenizer` with `RemoveLongFilter::limit(20)`, `LowerCaser` and `Stemmer`,
/// in that order, and boxes the result.
pub fn en_pipeline<'a>() -> Box<TextPipeline> {
    let english_analyzer = SimpleTokenizer
        .filter(RemoveLongFilter::limit(20))
        .filter(LowerCaser)
        .filter(Stemmer::new());
    boxed_pipeline(english_analyzer)
}
pub fn en_analyzer<'a>() -> impl Analyzer<'a> {
SimpleTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(LowerCaser)
/// Builds the Japanese text pipeline.
///
/// Chains `JpTokenizer` with `RemoveLongFilter::limit(20)` and `RemoveNonAlphaFilter`,
/// in that order, and boxes the result.
pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
    let japanese_analyzer = JpTokenizer
        .filter(RemoveLongFilter::limit(20))
        .filter(RemoveNonAlphaFilter);
    boxed_pipeline(japanese_analyzer)
}
#[cfg(test)]
mod test {
use super::{Analyzer, TokenStream, en_analyzer};
use super::{en_pipeline, jp_pipeline, Token};
#[test]
fn test_tokenizer() {
let mut analyzer = en_analyzer();
let mut terms = analyzer.analyze("hello, happy tax payer!");
assert_eq!(terms.next().unwrap().term, "hello");
assert_eq!(terms.next().unwrap().term, "happy");
assert_eq!(terms.next().unwrap().term, "tax");
assert_eq!(terms.next().unwrap().term, "payer");
assert!(terms.next().is_none());
// Checks that the English pipeline tokenizes, lowercases and stems a short sentence.
fn test_en_analyzer() {
    let mut collected: Vec<String> = Vec::new();
    let mut pipeline = en_pipeline();
    {
        // Scoped so the mutable borrow of `collected` ends before the assertion.
        let mut collect_term = |token: &Token| collected.push(token.term.clone());
        pipeline.analyze("hello, happy tax payer!", &mut collect_term);
    }
    // A single comparison checks both the number of tokens and each term.
    assert_eq!(collected, vec!["hello", "happi", "tax", "payer"]);
}
// Checks that the Japanese pipeline segments a sentence and drops the trailing punctuation.
#[test]
fn test_jp_analyzer() {
    let mut pipeline = jp_pipeline();
    let mut tokens: Vec<String> = vec![];
    {
        let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
        pipeline.analyze("野菜食べないとやばい!", &mut add_token);
    }
    // "!" is removed by `RemoveNonAlphaFilter`, which leaves five tokens.
    assert_eq!(tokens.len(), 5);
    assert_eq!(&tokens[0], "野菜");
    assert_eq!(&tokens[1], "食べ");
    assert_eq!(&tokens[2], "ない");
    // The particle between "ない" and "やばい" in the input text.
    assert_eq!(&tokens[3], "と");
    assert_eq!(&tokens[4], "やばい");
}
#[test]
fn test_tokenizer_empty() {
let mut terms = en_analyzer().analyze("");
assert!(terms.next().is_none());
let mut pipeline = en_pipeline();
{
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze(" ", &mut add_token);
}
assert!(tokens.is_empty());
}
{
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze(" ", &mut add_token);
}
assert!(tokens.is_empty());
}
}
#[test]
fn test_tokenizer_cjkchars() {
let mut terms = en_analyzer().analyze("hello,中国人民");
assert_eq!(terms.next().unwrap().term, "hello");
assert_eq!(terms.next().unwrap().term, "中国人民");
assert!(terms.next().is_none());
let mut pipeline = en_pipeline();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
pipeline.analyze("hello,中国人民", &mut add_token);
}
assert_eq!(tokens.len(), 2);
assert_eq!(tokens, vec!["hello", "中国人民"]);
}
}

View File

@@ -5,34 +5,34 @@ pub struct RemoveLongFilter {
length_limit: usize,
}
impl RemoveLongFilter {
impl RemoveLongFilter {
// the limit is in bytes of the UTF-8 representation.
pub fn limit(length_limit: usize) -> RemoveLongFilter {
RemoveLongFilter {
length_limit: length_limit,
}
RemoveLongFilter { length_limit: length_limit }
}
}
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream {
where TailTokenStream: TokenStream
{
fn predicate(&self, token: &Token) -> bool {
token.term.len() < self.token_length_limit
}
fn wrap(token_length_limit: usize, tail: TailTokenStream) -> RemoveLongFilterStream<TailTokenStream> {
fn wrap(token_length_limit: usize,
tail: TailTokenStream)
-> RemoveLongFilterStream<TailTokenStream> {
RemoveLongFilterStream {
token_length_limit: token_length_limit,
tail: tail,
}
}
}
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
where TailTokenStream: TokenStream {
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
where TailTokenStream: TokenStream
{
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -40,16 +40,16 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
}
}
pub struct RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream {
pub struct RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
{
token_length_limit: usize,
tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
where TailTokenStream: TokenStream {
where TailTokenStream: TokenStream
{
fn token(&self) -> &Token {
self.tail.token()
}
@@ -64,11 +64,9 @@ impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
if self.predicate(self.tail.token()) {
return true;
}
}
else {
} else {
return false;
}
}
}
}
}

View File

@@ -0,0 +1,58 @@
use super::{TokenFilterFactory, TokenStream, Token};
/// Token filter factory whose streams skip every token that contains a
/// non-alphanumeric character.
pub struct RemoveNonAlphaFilter;
impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
    where TailTokenStream: TokenStream
{
    /// Returns `true` when every character of the token's term is alphanumeric
    /// (as defined by `char::is_alphanumeric`). An empty term is accepted.
    fn predicate(&self, token: &Token) -> bool {
        token.term.chars().all(|character| character.is_alphanumeric())
    }
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
    where TailTokenStream: TokenStream
{
    type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;

    /// Wraps `token_stream` in a `RemoveNonAlphaFilterStream`.
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        let filtered = RemoveNonAlphaFilterStream { tail: token_stream };
        filtered
    }
}
/// Token stream that wraps another token stream and skips the tokens rejected by its
/// `predicate` method.
pub struct RemoveNonAlphaFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
{
/// The wrapped upstream token stream.
tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
    where TailTokenStream: TokenStream
{
    /// Returns the current token of the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    /// Returns the current token of the wrapped stream, mutably.
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }

    /// Advances the wrapped stream until it reaches a token accepted by `predicate`.
    ///
    /// Returns `true` when such a token was found, `false` when the wrapped stream
    /// ran out of tokens first.
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.predicate(self.tail.token()) {
                return true;
            }
        }
        false
    }
}

View File

@@ -7,14 +7,13 @@ pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {
text: &'a str,
chars: CharIndices<'a>,
token: Token,
token: Token,
}
impl<'a> Analyzer<'a> for SimpleTokenizer {
type TokenStreamImpl = SimpleTokenStream<'a>;
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
SimpleTokenStream {
text: text,
chars: text.char_indices(),
@@ -24,10 +23,9 @@ impl<'a> Analyzer<'a> for SimpleTokenizer {
}
impl<'a> SimpleTokenStream<'a> {
fn token_limit(&mut self) -> usize {
(&mut self.chars)
.filter(|&(_, ref c)| !c.is_alphanumeric())
.filter(|&(_, ref c)| !c.is_alphanumeric())
.map(|(offset, _)| offset)
.next()
.unwrap_or(self.text.len())
@@ -35,7 +33,6 @@ impl<'a> SimpleTokenStream<'a> {
}
impl<'a> TokenStream for SimpleTokenStream<'a> {
fn advance(&mut self) -> bool {
self.token.term.clear();
self.token.position += 1;
@@ -57,7 +54,7 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
}
}
}
fn token(&self) -> &Token {
&self.token
}
@@ -65,5 +62,4 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
fn token_mut(&mut self) -> &mut Token {
&mut self.token
}
}
}

View File

@@ -1,6 +1,6 @@
use std::sync::Arc;
use super::{TokenFilterFactory, TokenStream, Token};
use rust_stemmers::{Algorithm, self};
use rust_stemmers::{self, Algorithm};
pub struct Stemmer {
stemmer: Arc<rust_stemmers::Stemmer>,
@@ -9,15 +9,13 @@ pub struct Stemmer {
impl Stemmer {
pub fn new() -> Stemmer {
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
Stemmer {
stemmer: Arc::new(inner_stemmer),
}
Stemmer { stemmer: Arc::new(inner_stemmer) }
}
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
where TailTokenStream: TokenStream {
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
where TailTokenStream: TokenStream
{
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
@@ -26,19 +24,20 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
}
pub struct StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
pub struct StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
{
tail: TailTokenStream,
stemmer: Arc<rust_stemmers::Stemmer>,
}
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
where TailTokenStream: TokenStream
{
fn token(&self) -> &Token {
self.tail.token()
}
fn token_mut(&mut self) -> &mut Token {
self.tail.token_mut()
}
@@ -50,20 +49,21 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
self.token_mut().term.clear();
self.token_mut().term.push_str(&stemmed_str);
true
}
else {
} else {
false
}
}
}
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream {
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>, tail: TailTokenStream) -> StemmerTokenStream<TailTokenStream> {
where TailTokenStream: TokenStream
{
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
tail: TailTokenStream)
-> StemmerTokenStream<TailTokenStream> {
StemmerTokenStream {
tail: tail,
stemmer: stemmer,
}
}
}
}
}

View File

@@ -79,6 +79,7 @@ extern crate test;
#[cfg(test)]
extern crate rand;
extern crate tinysegmenter;
#[cfg(test)]
mod functional_test;

View File

@@ -4,10 +4,9 @@ use schema::FieldValue;
use postings::PostingsSerializer;
use std::io;
use postings::Recorder;
use analyzer::SimpleTokenizer;
use Result;
use schema::{Schema, Field};
use analyzer::{TokenStream, Analyzer};
use analyzer::en_pipeline;
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
@@ -154,16 +153,21 @@ pub trait PostingsWriter {
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut pipeline = en_pipeline();
for field_value in field_values {
let mut tokens = SimpleTokenizer.analyze(field_value.value().text());
// right now num_tokens and pos are redundant, but it should
// change when we get proper analyzers
while let Some(token) = tokens.next() {
term.set_text(&token.term);
self.suscribe(term_index, doc_id, pos, &term, heap);
pos += 1u32;
num_tokens += 1u32;
}
pipeline.analyze(field_value.value().text(),
&mut |token| {
term.set_text(&token.term);
self.suscribe(term_index, doc_id, pos, &term, heap);
pos += 1u32;
num_tokens += 1u32;
});
// let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
// // right now num_tokens and pos are redundant, but it should
// // change when we get proper analyzers
// while let Some(token) = tokens.next() {
// }
pos += 1;
// THIS is to avoid phrase query across field repetition.
// span queries might still match though :|

View File

@@ -8,11 +8,10 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::{SimpleTokenizer, TokenStream};
use analyzer::{en_pipeline, TextPipeline};
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
use analyzer::Analyzer;
/// Possible error that may happen when parsing a query.
@@ -75,7 +74,7 @@ pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
analyzer: Box<SimpleTokenizer>,
analyzer: Box<TextPipeline>,
}
impl QueryParser {
@@ -88,7 +87,7 @@ impl QueryParser {
schema: schema,
default_fields: default_fields,
conjunction_by_default: false,
analyzer: box SimpleTokenizer,
analyzer: en_pipeline(),
}
}
@@ -162,11 +161,12 @@ impl QueryParser {
FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
let mut token_iter = self.analyzer.analyze(phrase);
while let Some(token) = token_iter.next() {
let term = Term::from_field_text(field, &token.term);
terms.push(term);
}
self.analyzer
.analyze(phrase,
&mut |token| {
let term = Term::from_field_text(field, &token.term);
terms.push(term);
});
} else {
terms.push(Term::from_field_text(field, phrase));
}