mirror of https://github.com/quickwit-oss/tantivy.git
Remove the concept of pipeline. Add a BoxableAnalyzer
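The diff below swaps the `TextPipeline` trait for an object-safe `BoxedAnalyzer` facade. The lifetime-generic `Analyzer` trait cannot be used as a trait object because of its associated `TokenStreamImpl` type, so the commit wraps any such analyzer in a `BoxableAnalyzer` newtype whose `token_stream` method erases the concrete stream type behind a boxed `TokenStream`. A minimal, self-contained sketch of the same pattern in current Rust syntax (the toy whitespace tokenizer, the simplified `Token`, and `main` are illustrative assumptions, not code from this commit):

    // Sketch of the type-erasure pattern introduced by this commit.
    pub struct Token {
        pub term: String,
    }

    pub trait TokenStream {
        fn advance(&mut self) -> bool;
        fn token(&self) -> &Token;
    }

    // Not object-safe: associated type plus a per-call lifetime.
    pub trait Analyzer<'a>: Sized {
        type TokenStreamImpl: TokenStream;
        fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
    }

    // Object-safe facade: always returns a boxed stream.
    pub trait BoxedAnalyzer {
        fn token_stream<'a>(&mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
    }

    struct BoxableAnalyzer<A>(A)
    where
        A: for<'a> Analyzer<'a>;

    impl<A> BoxedAnalyzer for BoxableAnalyzer<A>
    where
        A: 'static + for<'a> Analyzer<'a>,
    {
        fn token_stream<'a>(&mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
            Box::new(self.0.token_stream(text))
        }
    }

    pub fn box_analyzer<A>(a: A) -> Box<dyn BoxedAnalyzer>
    where
        A: 'static + for<'a> Analyzer<'a>,
    {
        Box::new(BoxableAnalyzer(a))
    }

    // Toy whitespace analyzer, only to exercise the pattern end to end.
    struct WhitespaceAnalyzer;

    struct WhitespaceStream<'a> {
        words: std::str::SplitWhitespace<'a>,
        current: Token,
    }

    impl<'a> Analyzer<'a> for WhitespaceAnalyzer {
        type TokenStreamImpl = WhitespaceStream<'a>;
        fn token_stream(&mut self, text: &'a str) -> WhitespaceStream<'a> {
            WhitespaceStream {
                words: text.split_whitespace(),
                current: Token { term: String::new() },
            }
        }
    }

    impl<'a> TokenStream for WhitespaceStream<'a> {
        fn advance(&mut self) -> bool {
            match self.words.next() {
                Some(word) => {
                    self.current.term = word.to_string();
                    true
                }
                None => false,
            }
        }
        fn token(&self) -> &Token {
            &self.current
        }
    }

    fn main() {
        let mut analyzer = box_analyzer(WhitespaceAnalyzer);
        let mut stream = analyzer.token_stream("hello happy tax payer");
        while stream.advance() {
            println!("{}", stream.token().term);
        }
    }

The `A: 'static` bound is what allows the returned stream, which may borrow from `text`, to be coerced to a boxed `TokenStream + 'a` for any caller-chosen lifetime.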
@@ -1,36 +1,31 @@
use std::borrow::{Borrow, BorrowMut};

pub trait TextPipeline {
    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token));

/// Token
pub struct Token {
    /// Offset (byte index) of the first character of the token.
    /// Offsets shall not be modified by token filters.
    pub offset_from: usize,
    /// Offset (byte index) of the last character of the token + 1.
    /// The text that generated the token should be obtained by
    /// &text[token.offset_from..token.offset_to]
    pub offset_to: usize,
    /// Position, expressed in number of tokens.
    pub position: usize,
    /// Actual text content of the token.
    pub term: String,
}

struct TextPipelineImpl<A>
    where for<'a> A: Analyzer<'a> + 'static
{
    underlying: A,
}

impl<A> TextPipeline for TextPipelineImpl<A>
    where for<'a> A: Analyzer<'a> + 'static
{
    fn analyze(&mut self, text: &str, sink: &mut FnMut(&Token)) {
        let mut token_stream = self.underlying.token_stream(text);
        while token_stream.advance() {
            sink(token_stream.token());
impl Default for Token {
    fn default() -> Token {
        Token {
            offset_from: 0,
            offset_to: 0,
            position: usize::max_value(),
            term: String::new(),
        }
    }
}

#[derive(Default)]
pub struct Token {
    pub offset_from: usize,
    pub offset_to: usize,
    pub position: usize,
    pub term: String,
}

pub trait Analyzer<'a>: Sized {
    type TokenStreamImpl: TokenStream;

@@ -46,11 +41,39 @@ pub trait Analyzer<'a>: Sized {
    }
}

pub trait BoxedAnalyzer {
    fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
}

pub fn boxed_pipeline<A: 'static + for<'a> Analyzer<'a>>(analyzer: A)
    -> Box<TextPipeline + 'static> {
    let text_pipeline_impl = TextPipelineImpl { underlying: analyzer };
    box text_pipeline_impl
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;

impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
    fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
        box self.0.token_stream(text)
    }
}

pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
    where A: 'static + for <'a> Analyzer<'a> {
    box BoxableAnalyzer(a)
}

impl<'b> TokenStream for Box<TokenStream + 'b> {
    fn advance(&mut self) -> bool {
        let token_stream: &mut TokenStream = self.borrow_mut();
        token_stream.advance()
    }

    fn token(&self) -> &Token {
        let token_stream: &TokenStream = self.borrow();
        token_stream.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        let token_stream: &mut TokenStream = self.borrow_mut();
        token_stream.token_mut()
    }
}

@@ -68,6 +91,15 @@ pub trait TokenStream {
            None
        }
    }

    fn process(&mut self, sink: &mut FnMut(&Token)) -> u32 {
        let mut num_tokens_pushed = 0u32;
        while self.advance() {
            sink(self.token());
            num_tokens_pushed += 1u32;
        }
        num_tokens_pushed
    }
}

@@ -1,7 +1,7 @@
use super::{Token, Analyzer, TokenStream};
use tinysegmenter;

pub struct JpTokenizer;
pub struct JPTokenizer;

#[derive(Eq, PartialEq)]
enum Cursor {
@@ -10,13 +10,13 @@ enum Cursor {
    Terminated,
}

pub struct JpTokenizerStream {
pub struct JPTokenizerStream {
    tokens: Vec<Token>,
    cursor: Cursor,
}

impl<'a> Analyzer<'a> for JpTokenizer {
    type TokenStreamImpl = JpTokenizerStream;
impl<'a> Analyzer<'a> for JPTokenizer {
    type TokenStreamImpl = JPTokenizerStream;

    fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
        let mut tokens = vec![];
@@ -32,14 +32,14 @@ impl<'a> Analyzer<'a> for JpTokenizer {
                term: term,
            });
        }
        JpTokenizerStream {
        JPTokenizerStream {
            tokens: tokens,
            cursor: Cursor::HasNotStarted,
        }
    }
}

impl<'a> TokenStream for JpTokenizerStream {
impl<'a> TokenStream for JPTokenizerStream {
    fn advance(&mut self) -> bool {
        let new_cursor = match self.cursor {
            Cursor::HasNotStarted => {
@@ -60,7 +60,7 @@ impl<'a> TokenStream for JpTokenizerStream {
            Cursor::Terminated => Cursor::Terminated,
        };
        self.cursor = new_cursor;
        return self.cursor != Cursor::Terminated;
        self.cursor != Cursor::Terminated
    }

@@ -33,9 +33,9 @@ impl<TailTokenStream> TokenStream for LowerCaserTokenStream<TailTokenStream>
    fn advance(&mut self) -> bool {
        if self.tail.advance() {
            self.tail.token_mut().term.make_ascii_lowercase();
            return true;
            true
        } else {
            return false;
            false
        }
    }
}

@@ -8,27 +8,32 @@ mod remove_nonalphanum;
mod stemmer;
mod jp_tokenizer;

pub use self::analyzer::{boxed_pipeline, TextPipeline, Analyzer, Token, TokenFilterFactory,
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
                         TokenStream};
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::jp_tokenizer::JpTokenizer;
pub use self::jp_tokenizer::JPTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
pub use self::analyzer::BoxedAnalyzer;

pub fn en_pipeline<'a>() -> Box<TextPipeline> {
    boxed_pipeline(SimpleTokenizer
pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
    box_analyzer(
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(20))
            .filter(LowerCaser)
            .filter(Stemmer::new()))
            .filter(Stemmer::new())
    )
}

pub fn jp_pipeline<'a>() -> Box<TextPipeline> {
    boxed_pipeline(JpTokenizer
        .filter(RemoveLongFilter::limit(20))
        .filter(RemoveNonAlphaFilter))
pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
    box_analyzer(
        JPTokenizer
            .filter(RemoveLongFilter::limit(20))
            .filter(RemoveNonAlphaFilter)
    )
}

#[cfg(test)]
@@ -37,11 +42,11 @@ mod test {

    #[test]
    fn test_en_analyzer() {
        let mut pipeline = en_pipeline();
        let mut en_analyzer = en_pipeline();
        let mut tokens: Vec<String> = vec![];
        {
            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
            pipeline.analyze("hello, happy tax payer!", &mut add_token);
            en_analyzer.token_stream("hello, happy tax payer!").process(&mut add_token);
        }
        assert_eq!(tokens.len(), 4);
        assert_eq!(&tokens[0], "hello");
@@ -50,14 +55,13 @@ mod test {
        assert_eq!(&tokens[3], "payer");
    }

    #[test]
    fn test_jp_analyzer() {
        let mut pipeline = jp_pipeline();
        let mut en_analyzer = jp_pipeline();
        let mut tokens: Vec<String> = vec![];
        {
            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
            pipeline.analyze("野菜食べないとやばい!", &mut add_token);
            en_analyzer.token_stream("野菜食べないとやばい!").process(&mut add_token);
        }
        assert_eq!(tokens.len(), 5);
        assert_eq!(&tokens[0], "野菜");
@@ -67,15 +71,14 @@ mod test {
        assert_eq!(&tokens[4], "やばい");
    }

    #[test]
    fn test_tokenizer_empty() {
        let mut pipeline = en_pipeline();
        let mut en_analyzer = en_pipeline();
        {
            let mut tokens: Vec<String> = vec![];
            {
                let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
                pipeline.analyze(" ", &mut add_token);
                en_analyzer.token_stream(" ").process(&mut add_token);
            }
            assert!(tokens.is_empty());
        }
@@ -83,22 +86,10 @@ mod test {
            let mut tokens: Vec<String> = vec![];
            {
                let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
                pipeline.analyze(" ", &mut add_token);
                en_analyzer.token_stream(" ").process(&mut add_token);
            }
            assert!(tokens.is_empty());
        }
    }

    #[test]
    fn test_tokenizer_cjkchars() {
        let mut pipeline = en_pipeline();
        let mut tokens: Vec<String> = vec![];
        {
            let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
            pipeline.analyze("hello,中国人民", &mut add_token);
        }
        assert_eq!(tokens.len(), 2);
        assert_eq!(tokens, vec!["hello", "中国人民"]);
    }
}

@@ -35,7 +35,7 @@ impl<'a> SimpleTokenStream<'a> {
impl<'a> TokenStream for SimpleTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.token.term.clear();
        self.token.position += 1;
        self.token.position = self.token.position.wrapping_add(1);

        loop {
            match self.chars.next() {

@@ -6,7 +6,7 @@ use std::io;
use postings::Recorder;
use Result;
use schema::{Schema, Field};
use analyzer::en_pipeline;
use analyzer::{en_pipeline, Token};
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
@@ -149,28 +149,28 @@ pub trait PostingsWriter {
                      field_values: &[&'a FieldValue],
                      heap: &Heap)
                      -> u32 {
        let mut pos = 0u32;

        let mut num_tokens: u32 = 0u32;
        let mut term = unsafe { Term::with_capacity(100) };

        term.set_field(field);
        let mut pipeline = en_pipeline();
        for field_value in field_values {
            pipeline.analyze(field_value.value().text(),
                             &mut |token| {
                                 term.set_text(&token.term);
                                 self.suscribe(term_index, doc_id, pos, &term, heap);
                                 pos += 1u32;
                                 num_tokens += 1u32;
                             });
            // let mut tokens = SimpleTokenizer.token_stream(field_value.value().text());
            // // right now num_tokens and pos are redundant, but it should
            // // change when we get proper analyzers
            // while let Some(token) = tokens.next() {
        let mut analyzer = en_pipeline();

            // }
            pos += 1;
        // THIS is to avoid phrase query across field repetition.
        // span queries might still match though :|
        let mut overall_position = 0u32;

        for field_value in field_values {
            // TODO fix position when more than one value.
            let mut token_stream = analyzer.token_stream(field_value.value().text());
            let mut local_position = 0;
            num_tokens += {
                let mut sink = |token: &Token| {
                    term.set_text(token.term.as_str());
                    local_position = token.position as u32;
                    self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
                };
                token_stream.process(&mut sink)
            };
            overall_position += local_position + 2u32;
        }
        num_tokens
    }

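In the rewritten `PostingsWriter` loop above, `overall_position` keeps token positions from running together when a field has several values: after each value, the counter jumps ahead by the last local position plus 2, so a phrase query cannot match a phrase that straddles two values (the comment notes that span queries could still cross the gap). A small, hypothetical illustration of that arithmetic (helper name and `main` are assumptions, not code from the commit):

    // Positions restart per field value, offset by the previous value's
    // last local position plus 2.
    fn positions_for_values(values: &[&str]) -> Vec<(String, u32)> {
        let mut out = Vec::new();
        let mut overall_position = 0u32;
        for value in values {
            let mut local_position = 0u32;
            for (i, word) in value.split_whitespace().enumerate() {
                local_position = i as u32;
                out.push((word.to_string(), overall_position + local_position));
            }
            overall_position += local_position + 2;
        }
        out
    }

    fn main() {
        // "a b c" gets positions 0..=2, "d e" gets 4..=5: the phrase "c d"
        // needs consecutive positions, so it cannot match across the values.
        for (term, pos) in positions_for_values(&["a b c", "d e"]) {
            println!("{} {}", term, pos);
        }
    }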
@@ -8,7 +8,7 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::{en_pipeline, TextPipeline};
use analyzer::{en_pipeline, BoxedAnalyzer};
use schema::{Term, FieldType};
use std::str::FromStr;
use std::num::ParseIntError;
@@ -74,7 +74,7 @@ pub struct QueryParser {
    schema: Schema,
    default_fields: Vec<Field>,
    conjunction_by_default: bool,
    analyzer: Box<TextPipeline>,
    analyzer: Box<BoxedAnalyzer>,
}

impl QueryParser {
@@ -161,12 +161,11 @@ impl QueryParser {
            FieldType::Str(ref str_options) => {
                let mut terms: Vec<Term> = Vec::new();
                if str_options.get_indexing_options().is_tokenized() {
                    self.analyzer
                        .analyze(phrase,
                                 &mut |token| {
                    let mut token_stream = self.analyzer.token_stream(phrase);
                    token_stream.process(&mut |token| {
                        let term = Term::from_field_text(field, &token.term);
                        terms.push(term);
                    });
                                 });
                } else {
                    terms.push(Term::from_field_text(field, phrase));
                }