mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-01 16:10:42 +00:00
Added support for Japanese.
This commit is contained in:
@@ -16,6 +16,7 @@ keywords = ["search", "information", "retrieval"]
|
||||
byteorder = "1.0"
|
||||
memmap = "0.4"
|
||||
lazy_static = "0.2.1"
|
||||
tinysegmenter = "0.1.0"
|
||||
regex = "0.2"
|
||||
fst = "0.1.37"
|
||||
atomicwrites = "0.1.3"
|
||||
|
||||
@@ -1,6 +1,27 @@
|
||||
|
||||
|
||||
|
||||
pub trait TextPipeline {
|
||||
fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));
|
||||
}
|
||||
|
||||
|
||||
struct TextPipelineImpl<A>
|
||||
where for<'a> A: Analyzer<'a> + 'static
|
||||
{
|
||||
underlying: A,
|
||||
}
|
||||
|
||||
impl<A> TextPipeline for TextPipelineImpl<A>
|
||||
where for<'a> A: Analyzer<'a> + 'static
|
||||
{
|
||||
fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
|
||||
let mut token_stream = self.underlying.token_stream(text);
|
||||
while token_stream.advance() {
|
||||
sink(token_stream.token());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct Token {
|
||||
@@ -11,33 +32,39 @@ pub struct Token {
|
||||
}
|
||||
|
||||
pub trait Analyzer<'a>: Sized {
|
||||
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl;
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
|
||||
|
||||
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainAnalyzer<NewFilter, Self>
|
||||
where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl> {
|
||||
where NewFilter: TokenFilterFactory<<Self as Analyzer<'a>>::TokenStreamImpl>
|
||||
{
|
||||
ChainAnalyzer {
|
||||
head: new_filter,
|
||||
tail: self
|
||||
tail: self,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub trait TokenStream {
|
||||
|
||||
pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
|
||||
-> Box<TextPipeline + 'static> {
|
||||
let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
|
||||
box text_pipeline_impl
|
||||
}
|
||||
|
||||
|
||||
pub trait TokenStream {
|
||||
fn advance(&mut self) -> bool;
|
||||
|
||||
|
||||
fn token(&self) -> &Token;
|
||||
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token;
|
||||
|
||||
fn next(&mut self) -> Option<&Token> {
|
||||
if self.advance() {
|
||||
Some(self.token())
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
@@ -46,27 +73,26 @@ pub trait TokenStream {
|
||||
|
||||
pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
|
||||
head: HeadTokenFilterFactory,
|
||||
tail: TailAnalyzer
|
||||
tail: TailAnalyzer,
|
||||
}
|
||||
|
||||
|
||||
impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
|
||||
impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
|
||||
for ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer>
|
||||
where HeadTokenFilterFactory: TokenFilterFactory<TailAnalyzer::TokenStreamImpl>,
|
||||
TailAnalyzer: Analyzer<'a> {
|
||||
|
||||
TailAnalyzer: Analyzer<'a>
|
||||
{
|
||||
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
|
||||
|
||||
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let tail_token_stream = self.tail.analyze(text);
|
||||
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let tail_token_stream = self.tail.token_stream(text);
|
||||
self.head.transform(tail_token_stream)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
/// Factory for one filtering stage of a token stream.
///
/// `TailTokenStream` is the type of the stream being wrapped.
pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
    /// Type of the stream returned by `transform`.
    type ResultTokenStream: TokenStream;

    /// Takes ownership of `token_stream` and returns the resulting stream.
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
}
|
||||
|
||||
|
||||
91
src/analyzer/jp_tokenizer.rs
Normal file
91
src/analyzer/jp_tokenizer.rs
Normal file
@@ -0,0 +1,91 @@
|
||||
use super::{Token, Analyzer, TokenStream};
|
||||
use tinysegmenter;
|
||||
|
||||
/// Analyzer for Japanese text; its `Analyzer` implementation splits the
/// input with `tinysegmenter::tokenize`.
pub struct JpTokenizer;
|
||||
|
||||
/// Position of a `JpTokenizerStream` within its token list.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum Cursor {
    /// `advance()` has not been called yet.
    HasNotStarted,
    /// The stream is positioned on the token at this index.
    Cursor(usize),
    /// The stream has moved past its last token.
    Terminated,
}
|
||||
|
||||
pub struct JpTokenizerStream {
|
||||
tokens: Vec<Token>,
|
||||
cursor: Cursor,
|
||||
}
|
||||
|
||||
impl<'a> Analyzer<'a> for JpTokenizer {
|
||||
type TokenStreamImpl = JpTokenizerStream;
|
||||
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
let mut tokens = vec![];
|
||||
let mut offset_from;
|
||||
let mut offset_to = 0;
|
||||
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
|
||||
offset_from = offset_to;
|
||||
offset_to = offset_from + term.len();
|
||||
tokens.push(Token {
|
||||
offset_from: offset_from,
|
||||
offset_to: offset_to,
|
||||
position: pos,
|
||||
term: term,
|
||||
});
|
||||
}
|
||||
JpTokenizerStream {
|
||||
tokens: tokens,
|
||||
cursor: Cursor::HasNotStarted,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for JpTokenizerStream {
|
||||
fn advance(&mut self) -> bool {
|
||||
let new_cursor = match self.cursor {
|
||||
Cursor::HasNotStarted => {
|
||||
if self.tokens.len() > 0 {
|
||||
Cursor::Cursor(0)
|
||||
} else {
|
||||
Cursor::Terminated
|
||||
}
|
||||
}
|
||||
Cursor::Cursor(pos) => {
|
||||
let new_pos = pos + 1;
|
||||
if new_pos >= self.tokens.len() {
|
||||
Cursor::Terminated
|
||||
} else {
|
||||
Cursor::Cursor(new_pos)
|
||||
}
|
||||
}
|
||||
Cursor::Terminated => Cursor::Terminated,
|
||||
};
|
||||
self.cursor = new_cursor;
|
||||
return self.cursor != Cursor::Terminated;
|
||||
}
|
||||
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
match self.cursor {
|
||||
Cursor::Terminated => {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
Cursor::Cursor(i) => &self.tokens[i],
|
||||
Cursor::HasNotStarted => {
|
||||
panic!("You called .token(), before having called `.advance()`.");
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
match self.cursor {
|
||||
Cursor::Terminated => {
|
||||
panic!("You called .token(), after the end of the token stream has been reached");
|
||||
}
|
||||
Cursor::Cursor(i) => &mut self.tokens[i],
|
||||
Cursor::HasNotStarted => {
|
||||
panic!("You called .token(), before having called `.advance()`.");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3,9 +3,9 @@ use std::ascii::AsciiExt;
|
||||
|
||||
pub struct LowerCaser;
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
@@ -13,18 +13,19 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
|
||||
}
|
||||
}
|
||||
|
||||
pub struct LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
pub struct LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
@@ -33,22 +34,16 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
|
||||
if self.tail.advance() {
|
||||
self.tail.token_mut().term.make_ascii_lowercase();
|
||||
return true;
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> LowerCaserTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream<TailTokenStream> {
|
||||
LowerCaserTokenStream {
|
||||
tail: tail,
|
||||
}
|
||||
}
|
||||
LowerCaserTokenStream { tail: tail }
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -4,51 +4,101 @@ mod analyzer;
|
||||
mod simple_tokenizer;
|
||||
mod lower_caser;
|
||||
mod remove_long;
|
||||
mod remove_nonalphanum;
|
||||
mod stemmer;
|
||||
mod jp_tokenizer;
|
||||
|
||||
pub use self::analyzer::{Analyzer, Token, TokenFilterFactory, TokenStream};
|
||||
pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
|
||||
TokenStream};
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
pub use self::jp_tokenizer::JpTokenizer;
|
||||
pub use self::remove_long::RemoveLongFilter;
|
||||
pub use self::lower_caser::LowerCaser;
|
||||
pub use self::stemmer::Stemmer;
|
||||
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
|
||||
|
||||
|
||||
pub fn en_pipeline<'a>() -> Box<TextPipeline> {
|
||||
boxed_pipeline(SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(20))
|
||||
.filter(LowerCaser)
|
||||
.filter(Stemmer::new()))
|
||||
}
|
||||
|
||||
pub fn en_analyzer<'a>() -> impl Analyzer<'a> {
|
||||
SimpleTokenizer
|
||||
.filter(RemoveLongFilter::limit(20))
|
||||
.filter(LowerCaser)
|
||||
pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
|
||||
boxed_pipeline(JpTokenizer
|
||||
.filter(RemoveLongFilter::limit(20))
|
||||
.filter(RemoveNonAlphaFilter))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{Analyzer, TokenStream, en_analyzer};
|
||||
use super::{en_pipeline, jp_pipeline, Token};
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer() {
|
||||
let mut analyzer = en_analyzer();
|
||||
let mut terms = analyzer.analyze("hello, happy tax payer!");
|
||||
assert_eq!(terms.next().unwrap().term, "hello");
|
||||
assert_eq!(terms.next().unwrap().term, "happy");
|
||||
assert_eq!(terms.next().unwrap().term, "tax");
|
||||
assert_eq!(terms.next().unwrap().term, "payer");
|
||||
assert!(terms.next().is_none());
|
||||
fn test_en_analyzer() {
|
||||
let mut pipeline = en_pipeline();
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
pipeline.analyze("hello, happy tax payer!", &mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 4);
|
||||
assert_eq!(&tokens[0], "hello");
|
||||
assert_eq!(&tokens[1], "happi");
|
||||
assert_eq!(&tokens[2], "tax");
|
||||
assert_eq!(&tokens[3], "payer");
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_jp_analyzer() {
|
||||
let mut pipeline = jp_pipeline();
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
pipeline.analyze("野菜食べないとやばい!", &mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 5);
|
||||
assert_eq!(&tokens[0], "野菜");
|
||||
assert_eq!(&tokens[1], "食べ");
|
||||
assert_eq!(&tokens[2], "ない");
|
||||
assert_eq!(&tokens[3], "と");
|
||||
assert_eq!(&tokens[4], "やばい");
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_empty() {
|
||||
let mut terms = en_analyzer().analyze("");
|
||||
assert!(terms.next().is_none());
|
||||
let mut pipeline = en_pipeline();
|
||||
{
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
pipeline.analyze(" ", &mut add_token);
|
||||
}
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
{
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
pipeline.analyze(" ", &mut add_token);
|
||||
}
|
||||
assert!(tokens.is_empty());
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_tokenizer_cjkchars() {
|
||||
let mut terms = en_analyzer().analyze("hello,中国人民");
|
||||
assert_eq!(terms.next().unwrap().term, "hello");
|
||||
assert_eq!(terms.next().unwrap().term, "中国人民");
|
||||
assert!(terms.next().is_none());
|
||||
let mut pipeline = en_pipeline();
|
||||
let mut tokens: Vec<String> = vec![];
|
||||
{
|
||||
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
|
||||
pipeline.analyze("hello,中国人民", &mut add_token);
|
||||
}
|
||||
assert_eq!(tokens.len(), 2);
|
||||
assert_eq!(tokens, vec!["hello", "中国人民"]);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
|
||||
@@ -5,34 +5,34 @@ pub struct RemoveLongFilter {
|
||||
length_limit: usize,
|
||||
}
|
||||
|
||||
impl RemoveLongFilter {
|
||||
impl RemoveLongFilter {
|
||||
// the limit is in bytes of the UTF-8 representation.
|
||||
pub fn limit(length_limit: usize) -> RemoveLongFilter {
|
||||
RemoveLongFilter {
|
||||
length_limit: length_limit,
|
||||
}
|
||||
RemoveLongFilter { length_limit: length_limit }
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
token.term.len() < self.token_length_limit
|
||||
}
|
||||
|
||||
fn wrap(token_length_limit: usize, tail: TailTokenStream) -> RemoveLongFilterStream<TailTokenStream> {
|
||||
fn wrap(token_length_limit: usize,
|
||||
tail: TailTokenStream)
|
||||
-> RemoveLongFilterStream<TailTokenStream> {
|
||||
RemoveLongFilterStream {
|
||||
token_length_limit: token_length_limit,
|
||||
tail: tail,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
@@ -40,16 +40,16 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
pub struct RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
token_length_limit: usize,
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
@@ -64,11 +64,9 @@ impl<TailTokenStream> TokenStream for RemoveLongFilterStream<TailTokenStream>
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
58
src/analyzer/remove_nonalphanum.rs
Normal file
58
src/analyzer/remove_nonalphanum.rs
Normal file
@@ -0,0 +1,58 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
|
||||
|
||||
/// Token filter that keeps only the tokens whose term is made entirely of
/// alphanumeric characters (despite the name, digits are kept as well).
pub struct RemoveNonAlphaFilter;
|
||||
|
||||
impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn predicate(&self, token: &Token) -> bool {
|
||||
for c in token.term.chars() {
|
||||
if !c.is_alphanumeric() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
true
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, tail: TailTokenStream) -> Self::ResultTokenStream {
|
||||
RemoveNonAlphaFilterStream { tail: tail }
|
||||
}
|
||||
}
|
||||
|
||||
pub struct RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
loop {
|
||||
if self.tail.advance() {
|
||||
if self.predicate(self.tail.token()) {
|
||||
return true;
|
||||
}
|
||||
} else {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -7,14 +7,13 @@ pub struct SimpleTokenizer;
|
||||
pub struct SimpleTokenStream<'a> {
|
||||
text: &'a str,
|
||||
chars: CharIndices<'a>,
|
||||
token: Token,
|
||||
token: Token,
|
||||
}
|
||||
|
||||
impl<'a> Analyzer<'a> for SimpleTokenizer {
|
||||
|
||||
type TokenStreamImpl = SimpleTokenStream<'a>;
|
||||
|
||||
fn analyze(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
|
||||
SimpleTokenStream {
|
||||
text: text,
|
||||
chars: text.char_indices(),
|
||||
@@ -24,10 +23,9 @@ impl<'a> Analyzer<'a> for SimpleTokenizer {
|
||||
}
|
||||
|
||||
impl<'a> SimpleTokenStream<'a> {
|
||||
|
||||
fn token_limit(&mut self) -> usize {
|
||||
(&mut self.chars)
|
||||
.filter(|&(_, ref c)| !c.is_alphanumeric())
|
||||
.filter(|&(_, ref c)| !c.is_alphanumeric())
|
||||
.map(|(offset, _)| offset)
|
||||
.next()
|
||||
.unwrap_or(self.text.len())
|
||||
@@ -35,7 +33,6 @@ impl<'a> SimpleTokenStream<'a> {
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
|
||||
fn advance(&mut self) -> bool {
|
||||
self.token.term.clear();
|
||||
self.token.position += 1;
|
||||
@@ -57,7 +54,7 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
fn token(&self) -> &Token {
|
||||
&self.token
|
||||
}
|
||||
@@ -65,5 +62,4 @@ impl<'a> TokenStream for SimpleTokenStream<'a> {
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
&mut self.token
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
use std::sync::Arc;
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use rust_stemmers::{Algorithm, self};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
|
||||
pub struct Stemmer {
|
||||
stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
@@ -9,15 +9,13 @@ pub struct Stemmer {
|
||||
impl Stemmer {
|
||||
pub fn new() -> Stemmer {
|
||||
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
|
||||
Stemmer {
|
||||
stemmer: Arc::new(inner_stemmer),
|
||||
}
|
||||
Stemmer { stemmer: Arc::new(inner_stemmer) }
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
|
||||
@@ -26,19 +24,20 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
|
||||
}
|
||||
|
||||
|
||||
pub struct StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
pub struct StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
tail: TailTokenStream,
|
||||
stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn token(&self) -> &Token {
|
||||
self.tail.token()
|
||||
}
|
||||
|
||||
|
||||
fn token_mut(&mut self) -> &mut Token {
|
||||
self.tail.token_mut()
|
||||
}
|
||||
@@ -50,20 +49,21 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
|
||||
self.token_mut().term.clear();
|
||||
self.token_mut().term.push_str(&stemmed_str);
|
||||
true
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
false
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
|
||||
where TailTokenStream: TokenStream {
|
||||
|
||||
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>, tail: TailTokenStream) -> StemmerTokenStream<TailTokenStream> {
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
|
||||
tail: TailTokenStream)
|
||||
-> StemmerTokenStream<TailTokenStream> {
|
||||
StemmerTokenStream {
|
||||
tail: tail,
|
||||
stemmer: stemmer,
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -79,6 +79,7 @@ extern crate test;
|
||||
#[cfg(test)]
|
||||
extern crate rand;
|
||||
|
||||
extern crate tinysegmenter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod functional_test;
|
||||
|
||||
@@ -4,10 +4,9 @@ use schema::FieldValue;
|
||||
use postings::PostingsSerializer;
|
||||
use std::io;
|
||||
use postings::Recorder;
|
||||
use analyzer::SimpleTokenizer;
|
||||
use Result;
|
||||
use schema::{Schema, Field};
|
||||
use analyzer::{TokenStream, Analyzer};
|
||||
use analyzer::en_pipeline;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::DerefMut;
|
||||
use datastruct::stacker::{HashMap, Heap};
|
||||
@@ -154,16 +153,21 @@ pub trait PostingsWriter {
|
||||
let mut num_tokens: u32 = 0u32;
|
||||
let mut term = unsafe { Term::with_capacity(100) };
|
||||
term.set_field(field);
|
||||
let mut pipeline = en_pipeline();
|
||||
for field_value in field_values {
|
||||
let mut tokens = SimpleTokenizer.analyze(field_value.value().text());
|
||||
// right now num_tokens and pos are redundant, but it should
|
||||
// change when we get proper analyzers
|
||||
while let Some(token) = tokens.next() {
|
||||
term.set_text(&token.term);
|
||||
self.suscribe(term_index, doc_id, pos, &term, heap);
|
||||
pos += 1u32;
|
||||
num_tokens += 1u32;
|
||||
}
|
||||
pipeline.analyze(field_value.value().text(),
|
||||
&mut |token| {
|
||||
term.set_text(&token.term);
|
||||
self.suscribe(term_index, doc_id, pos, &term, heap);
|
||||
pos += 1u32;
|
||||
num_tokens += 1u32;
|
||||
});
|
||||
// let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
|
||||
// // right now num_tokens and pos are redundant, but it should
|
||||
// // change when we get proper analyzers
|
||||
// while let Some(token) = tokens.next() {
|
||||
|
||||
// }
|
||||
pos += 1;
|
||||
// THIS is to avoid phrase query across field repetition.
|
||||
// span queries might still match though :|
|
||||
|
||||
@@ -8,11 +8,10 @@ use query::Occur;
|
||||
use query::TermQuery;
|
||||
use postings::SegmentPostingsOption;
|
||||
use query::PhraseQuery;
|
||||
use analyzer::{SimpleTokenizer, TokenStream};
|
||||
use analyzer::{en_pipeline, TextPipeline};
|
||||
use schema::{Term, FieldType};
|
||||
use std::str::FromStr;
|
||||
use std::num::ParseIntError;
|
||||
use analyzer::Analyzer;
|
||||
|
||||
|
||||
/// Possible error that may happen when parsing a query.
|
||||
@@ -75,7 +74,7 @@ pub struct QueryParser {
|
||||
schema: Schema,
|
||||
default_fields: Vec<Field>,
|
||||
conjunction_by_default: bool,
|
||||
analyzer: Box<SimpleTokenizer>,
|
||||
analyzer: Box<TextPipeline>,
|
||||
}
|
||||
|
||||
impl QueryParser {
|
||||
@@ -88,7 +87,7 @@ impl QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
conjunction_by_default: false,
|
||||
analyzer: box SimpleTokenizer,
|
||||
analyzer: en_pipeline(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -162,11 +161,12 @@ impl QueryParser {
|
||||
FieldType::Str(ref str_options) => {
|
||||
let mut terms: Vec<Term> = Vec::new();
|
||||
if str_options.get_indexing_options().is_tokenized() {
|
||||
let mut token_iter = self.analyzer.analyze(phrase);
|
||||
while let Some(token) = token_iter.next() {
|
||||
let term = Term::from_field_text(field, &token.term);
|
||||
terms.push(term);
|
||||
}
|
||||
self.analyzer
|
||||
.analyze(phrase,
|
||||
&mut |token| {
|
||||
let term = Term::from_field_text(field, &token.term);
|
||||
terms.push(term);
|
||||
});
|
||||
} else {
|
||||
terms.push(Term::from_field_text(field, phrase));
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user