issue/191 Added an analyzer manager.

This commit is contained in:
Paul Masurel
2017-06-20 10:02:26 +09:00
parent f26874557e
commit b05b5f5487
17 changed files with 319 additions and 162 deletions

View File

@@ -179,7 +179,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
// Here, if the user does not specify which
// field they want to search, tantivy will search
// in both title and body.
let mut query_parser = QueryParser::new(index.schema(), vec![title, body]);
let mut query_parser = QueryParser::for_index(index, vec![title, body]);
// QueryParser may fail if the query is not in the right
// format. For user facing applications, this can be a problem.

View File

@@ -1,4 +1,8 @@
/// The analyzer module contains all of the tools used to process
/// text in `tantivy`.
use std::borrow::{Borrow, BorrowMut};
use analyzer::TokenStreamChain;
/// Token
pub struct Token {
@@ -26,7 +30,7 @@ impl Default for Token {
}
}
pub trait Analyzer<'a>: Sized {
pub trait Analyzer<'a>: Sized + Clone {
type TokenStreamImpl: TokenStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
@@ -41,20 +45,49 @@ pub trait Analyzer<'a>: Sized {
}
}
pub trait BoxedAnalyzer {
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
pub trait BoxedAnalyzer: Send + Sync {
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
fn boxed_clone(&self) -> Box<BoxedAnalyzer>;
}
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;
#[derive(Clone)]
struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a> + Send + Sync;
impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a> {
box self.0.token_stream(text)
}
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b> {
assert!(texts.len() > 0);
if texts.len() == 1 {
box self.0.token_stream(texts[0])
}
else {
let mut offsets = vec!();
let mut total_offset = 0;
for text in texts {
offsets.push(total_offset);
total_offset += text.len();
}
let token_streams: Vec<_> = texts
.iter()
.map(|text| {
self.0.token_stream(text)
})
.collect();
box TokenStreamChain::new(offsets, token_streams)
}
}
fn boxed_clone(&self) -> Box<BoxedAnalyzer> {
box self.clone()
}
}
pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
where A: 'static + for <'a> Analyzer<'a> {
where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
box BoxableAnalyzer(a)
}
@@ -102,7 +135,7 @@ pub trait TokenStream {
}
}
#[derive(Clone)]
pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
head: HeadTokenFilterFactory,
tail: TailAnalyzer,
@@ -117,13 +150,13 @@ impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let tail_token_stream = self.tail.token_stream(text);
let tail_token_stream = self.tail.token_stream(text );
self.head.transform(tail_token_stream)
}
}
pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
pub trait TokenFilterFactory<TailTokenStream: TokenStream>: Clone {
type ResultTokenStream: TokenStream;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;

View File

@@ -0,0 +1,66 @@
use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use analyzer::BoxedAnalyzer;
use analyzer::Analyzer;
use analyzer::box_analyzer;
use analyzer::SimpleTokenizer;
use analyzer::JapaneseTokenizer;
use analyzer::RemoveLongFilter;
use analyzer::LowerCaser;
use analyzer::Stemmer;
/// Registry mapping analyzer names to boxed analyzers.
///
/// The map lives behind an `Arc<RwLock<..>>`, so every clone of an
/// `AnalyzerManager` points at the same underlying map.
#[derive(Clone)]
pub struct AnalyzerManager {
    analyzers: Arc<RwLock<HashMap<String, Box<BoxedAnalyzer>>>>,
}
impl AnalyzerManager {
    /// Boxes `analyzer` and stores it under `analyzer_name`.
    ///
    /// An analyzer already stored under the same name is replaced
    /// (`HashMap::insert` semantics).
    ///
    /// # Panics
    ///
    /// Panics if the lock guarding the map is poisoned.
    pub fn register<A>(&self, analyzer_name: &str, analyzer: A)
        where A: 'static + Send + Sync + for<'a> Analyzer<'a>
    {
        let boxed = box_analyzer(analyzer);
        let mut registry = self.analyzers
            .write()
            .expect("Acquiring the lock should never fail");
        registry.insert(analyzer_name.to_string(), boxed);
    }

    /// Looks up `analyzer_name` and returns a boxed clone of the
    /// analyzer stored under it, or `None` if the name is unknown.
    ///
    /// # Panics
    ///
    /// Panics if the lock guarding the map is poisoned.
    pub fn get(&self, analyzer_name: &str) -> Option<Box<BoxedAnalyzer>> {
        let registry = self.analyzers
            .read()
            .expect("Acquiring the lock should never fail");
        let found = match registry.get(analyzer_name) {
            Some(stored) => Some(stored.boxed_clone()),
            None => None,
        };
        found
    }
}
impl Default for AnalyzerManager {
    /// Creates an `AnalyzerManager` prepopulated with
    /// the default analyzers of `tantivy`:
    ///
    /// - `simple`: `SimpleTokenizer`, then `RemoveLongFilter::limit(40)`,
    ///   then `LowerCaser`
    /// - `en_stem`: same chain as `simple`, followed by `Stemmer::new()`
    /// - `ja`: `JapaneseTokenizer`, then `RemoveLongFilter::limit(40)`
    fn default() -> AnalyzerManager {
        let manager = AnalyzerManager {
            analyzers: Arc::new(RwLock::new(HashMap::new())),
        };
        manager.register("simple",
                         SimpleTokenizer
                             .filter(RemoveLongFilter::limit(40))
                             .filter(LowerCaser));
        manager.register("en_stem",
                         SimpleTokenizer
                             .filter(RemoveLongFilter::limit(40))
                             .filter(LowerCaser)
                             .filter(Stemmer::new()));
        // The Japanese pipeline is registered as "ja" (not "jp").
        manager.register("ja",
                         JapaneseTokenizer
                             .filter(RemoveLongFilter::limit(40)));
        manager
    }
}

View File

@@ -1,7 +1,10 @@
use super::{Token, Analyzer, TokenStream};
use tinysegmenter;
pub struct JPTokenizer;
/// Simple japanese tokenizer based on the `tinysegmenter` crate.
#[derive(Clone)]
pub struct JapaneseTokenizer;
#[derive(Eq, PartialEq)]
enum Cursor {
@@ -10,13 +13,13 @@ enum Cursor {
Terminated,
}
pub struct JPTokenizerStream {
pub struct JapaneseTokenizerStream {
tokens: Vec<Token>,
cursor: Cursor,
}
impl<'a> Analyzer<'a> for JPTokenizer {
type TokenStreamImpl = JPTokenizerStream;
impl<'a> Analyzer<'a> for JapaneseTokenizer {
type TokenStreamImpl = JapaneseTokenizerStream;
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
let mut tokens = vec![];
@@ -25,21 +28,23 @@ impl<'a> Analyzer<'a> for JPTokenizer {
for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
offset_from = offset_to;
offset_to = offset_from + term.len();
tokens.push(Token {
offset_from: offset_from,
offset_to: offset_to,
position: pos,
term: term,
});
if term.chars().all(char::is_alphanumeric) {
tokens.push(Token {
offset_from: offset_from,
offset_to: offset_to,
position: pos,
term: term,
});
}
}
JPTokenizerStream {
JapaneseTokenizerStream {
tokens: tokens,
cursor: Cursor::HasNotStarted,
}
}
}
impl<'a> TokenStream for JPTokenizerStream {
impl<'a> TokenStream for JapaneseTokenizerStream {
fn advance(&mut self) -> bool {
let new_cursor = match self.cursor {
Cursor::HasNotStarted => {

View File

@@ -1,6 +1,9 @@
use super::{TokenFilterFactory, TokenStream, Token};
use std::ascii::AsciiExt;
/// Token filter that lowercases terms.
#[derive(Clone)]
pub struct LowerCaser;
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser

View File

@@ -4,45 +4,31 @@ mod analyzer;
mod simple_tokenizer;
mod lower_caser;
mod remove_long;
mod remove_nonalphanum;
mod stemmer;
mod jp_tokenizer;
mod analyzer_manager;
mod japanese_tokenizer;
mod token_stream_chain;
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
TokenStream};
pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory, TokenStream};
pub use self::analyzer::BoxedAnalyzer;
pub use self::analyzer_manager::AnalyzerManager;
pub use self::simple_tokenizer::SimpleTokenizer;
pub use self::jp_tokenizer::JPTokenizer;
pub use self::token_stream_chain::TokenStreamChain;
pub use self::japanese_tokenizer::JapaneseTokenizer;
pub use self::remove_long::RemoveLongFilter;
pub use self::lower_caser::LowerCaser;
pub use self::stemmer::Stemmer;
pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
pub use self::analyzer::BoxedAnalyzer;
pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
box_analyzer(
SimpleTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(LowerCaser)
.filter(Stemmer::new())
)
}
pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
box_analyzer(
JPTokenizer
.filter(RemoveLongFilter::limit(20))
.filter(RemoveNonAlphaFilter)
)
}
#[cfg(test)]
mod test {
use super::{en_pipeline, jp_pipeline, Token};
use super::Token;
use super::AnalyzerManager;
#[test]
fn test_en_analyzer() {
let mut en_analyzer = en_pipeline();
let analyzer_manager = AnalyzerManager::default();
assert!(analyzer_manager.get("en_doesnotexist").is_none());
let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
@@ -57,7 +43,9 @@ mod test {
#[test]
fn test_jp_analyzer() {
let mut en_analyzer = jp_pipeline();
let analyzer_manager = AnalyzerManager::default();
let mut en_analyzer = analyzer_manager.get("ja").unwrap();
let mut tokens: Vec<String> = vec![];
{
let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
@@ -73,7 +61,8 @@ mod test {
#[test]
fn test_tokenizer_empty() {
let mut en_analyzer = en_pipeline();
let analyzer_manager = AnalyzerManager::default();
let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
{
let mut tokens: Vec<String> = vec![];
{

View File

@@ -1,6 +1,12 @@
use super::{TokenFilterFactory, TokenStream, Token};
/// `RemoveLongFilter` removes tokens that are longer
/// than a given number of bytes (in UTF-8 representation).
///
/// It is especially useful when indexing unconstrained content,
/// e.g. mail containing base-64 encoded pictures.
#[derive(Clone)]
pub struct RemoveLongFilter {
// Length threshold; presumably the value given to `RemoveLongFilter::limit` (confirm against the constructor).
length_limit: usize,
}

View File

@@ -1,58 +0,0 @@
use super::{TokenFilterFactory, TokenStream, Token};
/// Token filter factory whose `transform` wraps a token stream
/// in a `RemoveNonAlphaFilterStream`.
pub struct RemoveNonAlphaFilter;
impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
    where TailTokenStream: TokenStream
{
    /// Returns `true` when every character of the token's term
    /// satisfies `char::is_alphanumeric`. An empty term passes.
    fn predicate(&self, token: &Token) -> bool {
        token.term.chars().all(|c| c.is_alphanumeric())
    }
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
    where TailTokenStream: TokenStream
{
    type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;

    /// Wraps `token_stream` in a `RemoveNonAlphaFilterStream`, which
    /// becomes the new head of the token stream pipeline.
    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        RemoveNonAlphaFilterStream { tail: token_stream }
    }
}
/// Token stream wrapping `tail`; its `advance` skips the tokens
/// of `tail` that `predicate` rejects.
pub struct RemoveNonAlphaFilterStream<TailTokenStream>
where TailTokenStream: TokenStream
{
// Underlying token stream being filtered.
tail: TailTokenStream,
}
impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
    where TailTokenStream: TokenStream
{
    /// Returns the current token of the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    /// Returns the current token of the wrapped stream, mutably.
    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }

    /// Advances the wrapped stream until it stops on a token accepted
    /// by `predicate`. Returns `false` once the wrapped stream is
    /// exhausted.
    fn advance(&mut self) -> bool {
        while self.tail.advance() {
            if self.predicate(self.tail.token()) {
                return true;
            }
        }
        false
    }
}

View File

@@ -2,6 +2,7 @@
use std::str::CharIndices;
use super::{Token, Analyzer, TokenStream};
/// Tokenizer used as the base of the `simple` and `en_stem`
/// pipelines registered by `AnalyzerManager::default()`.
#[derive(Clone)]
pub struct SimpleTokenizer;
pub struct SimpleTokenStream<'a> {

View File

@@ -2,14 +2,14 @@ use std::sync::Arc;
use super::{TokenFilterFactory, TokenStream, Token};
use rust_stemmers::{self, Algorithm};
#[derive(Clone)]
pub struct Stemmer {
stemmer: Arc<rust_stemmers::Stemmer>,
stemmer_algorithm: Arc<Algorithm>,
}
impl Stemmer {
pub fn new() -> Stemmer {
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
Stemmer { stemmer: Arc::new(inner_stemmer) }
Stemmer { stemmer_algorithm: Arc::new(Algorithm::English) }
}
}
@@ -19,7 +19,8 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
StemmerTokenStream::wrap(self.stemmer.clone(), token_stream)
let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
StemmerTokenStream::wrap(inner_stemmer, token_stream)
}
}
@@ -28,7 +29,7 @@ pub struct StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
{
tail: TailTokenStream,
stemmer: Arc<rust_stemmers::Stemmer>,
stemmer: rust_stemmers::Stemmer,
}
impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
@@ -58,7 +59,7 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
where TailTokenStream: TokenStream
{
fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
fn wrap(stemmer: rust_stemmers::Stemmer,
tail: TailTokenStream)
-> StemmerTokenStream<TailTokenStream> {
StemmerTokenStream {

View File

@@ -0,0 +1,63 @@
use analyzer::{TokenStream, Token};
/// Presents several token streams, one per text, as a single
/// token stream that yields their tokens one stream after the other.
pub struct TokenStreamChain<TTokenStream>
    where TTokenStream: TokenStream
{
    /// `offsets[i]` is the base byte offset for the tokens
    /// of `token_streams[i]`.
    offsets: Vec<usize>,
    /// The chained token streams, consumed in order.
    token_streams: Vec<TTokenStream>,
    /// Value added to the position of the tokens of the current stream.
    position_shift: usize,
    /// Index of the stream currently being consumed.
    stream_idx: usize,
    /// Buffer holding the token exposed by the chain.
    token: Token,
}
impl<TTokenStream> TokenStreamChain<TTokenStream>
    where TTokenStream: TokenStream
{
    /// Builds a chain over `token_streams`.
    ///
    /// `offsets` is stored alongside the streams, one entry per stream.
    /// The chain starts on the first stream, with no position shift
    /// and a default token.
    pub fn new(offsets: Vec<usize>,
               token_streams: Vec<TTokenStream>)
               -> TokenStreamChain<TTokenStream> {
        let initial_token = Token::default();
        TokenStreamChain {
            offsets: offsets,
            token_streams: token_streams,
            position_shift: 0,
            stream_idx: 0,
            token: initial_token,
        }
    }
}
impl<TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
    where TTokenStream: TokenStream
{
    /// Moves to the next token of the chain.
    ///
    /// Tokens are pulled from the current stream; when it is exhausted
    /// the chain moves on to the next one. The exposed token is a copy
    /// of the underlying token whose offsets are shifted by the base
    /// offset of its stream and whose position is shifted by
    /// `position_shift`.
    ///
    /// Returns `false` once every stream has been exhausted.
    fn advance(&mut self) -> bool {
        while self.stream_idx < self.token_streams.len() {
            let token_stream = &mut self.token_streams[self.stream_idx];
            if token_stream.advance() {
                let token = token_stream.token();
                let offset_offset = self.offsets[self.stream_idx];
                // Both ends of the token must be shifted by the base
                // offset of the stream the token comes from.
                self.token.offset_from = token.offset_from + offset_offset;
                self.token.offset_to = token.offset_to + offset_offset;
                self.token.position = token.position + self.position_shift;
                self.token.term.clear();
                self.token.term.push_str(token.term.as_str());
                return true;
            } else {
                self.stream_idx += 1;
                // Leave a gap of one position between two consecutive
                // streams.
                // NOTE(review): if no token has been emitted yet, this reads
                // the position of the default token; confirm `Token::default()`
                // uses a position for which `+ 2` cannot overflow.
                self.position_shift = self.token.position + 2;
            }
        }
        false
    }

    /// Returns the current token.
    ///
    /// # Panics
    ///
    /// Panics if the end of the chain has been reached.
    fn token(&self) -> &Token {
        // `stream_idx` never exceeds `token_streams.len()`: reaching the
        // length is what signals that the chain is exhausted.
        if self.stream_idx >= self.token_streams.len() {
            panic!("You called .token(), after the end of the token stream has been reached");
        }
        &self.token
    }

    /// Returns the current token, mutably.
    ///
    /// # Panics
    ///
    /// Panics if the end of the chain has been reached.
    fn token_mut(&mut self) -> &mut Token {
        if self.stream_idx >= self.token_streams.len() {
            panic!("You called .token_mut(), after the end of the token stream has been reached");
        }
        &mut self.token
    }
}

View File

@@ -103,7 +103,7 @@ mod tests {
{
// perform the query
let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
let mut query_parser = QueryParser::new(schema, vec![text_field]);
let mut query_parser = QueryParser::for_index(index, vec![text_field]);
let query = query_parser.parse_query("text:text").unwrap();
query.search(&searcher, &mut facet_collectors).unwrap();
}

View File

@@ -23,6 +23,7 @@ use directory::ManagedDirectory;
use core::META_FILEPATH;
use super::segment::create_segment;
use indexer::segment_updater::save_new_metas;
use analyzer::AnalyzerManager;
const NUM_SEARCHERS: usize = 12;
@@ -37,6 +38,7 @@ pub struct Index {
directory: ManagedDirectory,
schema: Schema,
searcher_pool: Arc<Pool<Searcher>>,
analyzers: AnalyzerManager
}
@@ -64,6 +66,10 @@ impl Index {
Index::from_directory(directory, schema)
}
/// Returns a clone of the `AnalyzerManager` attached to this index.
pub fn analyzers(&self) -> AnalyzerManager {
self.analyzers.clone()
}
/// Creates a new index in a temp directory.
///
/// The index will use the `MMapDirectory` in a newly created directory.
@@ -85,6 +91,7 @@ impl Index {
directory: directory,
schema: schema,
searcher_pool: Arc::new(Pool::new()),
analyzers: AnalyzerManager::default(),
};
try!(index.load_searchers());
Ok(index)
@@ -242,6 +249,7 @@ impl Clone for Index {
directory: self.directory.clone(),
schema: self.schema.clone(),
searcher_pool: self.searcher_pool.clone(),
analyzers: self.analyzers.clone()
}
}
}

View File

@@ -36,6 +36,12 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
}
impl Segment {
/// Returns a reference to the `Index` held by this segment.
pub fn index(&self) -> &Index {
&self.index
}
/// Returns our index's schema.
pub fn schema(&self) -> Schema {
self.index.schema()

View File

@@ -14,6 +14,8 @@ use datastruct::stacker::Heap;
use indexer::index_writer::MARGIN_IN_BYTES;
use super::operation::AddOperation;
use postings::MultiFieldPostingsWriter;
use analyzer::BoxedAnalyzer;
use schema::Value;
/// A `SegmentWriter` is in charge of creating segment index from a
@@ -29,6 +31,7 @@ pub struct SegmentWriter<'a> {
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: FastFieldsWriter,
doc_opstamps: Vec<u64>,
analyzers: Vec<Option<Box<BoxedAnalyzer>>>
}
@@ -60,6 +63,18 @@ impl<'a> SegmentWriter<'a> {
-> Result<SegmentWriter<'a>> {
let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
let analyzers = schema.fields()
.iter()
.map(|field_entry| field_entry.field_type())
.map(|field_type| {
match field_type {
&FieldType::Str(ref text_options) => {
segment.index().analyzers().get("simple")
}
_ => None,
}
})
.collect();
Ok(SegmentWriter {
heap: heap,
max_doc: 0,
@@ -68,6 +83,7 @@ impl<'a> SegmentWriter<'a> {
segment_serializer: segment_serializer,
fast_field_writers: FastFieldsWriter::from_schema(schema),
doc_opstamps: Vec::with_capacity(1_000),
analyzers: analyzers,
})
}
@@ -117,17 +133,32 @@ impl<'a> SegmentWriter<'a> {
let field_options = schema.get_field_entry(field);
match *field_options.field_type() {
FieldType::Str(ref text_options) => {
let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
self.multifield_postings
.index_text(doc_id, field, &field_values)
} else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
let term = Term::from_field_text(field, field_value.value().text());
self.multifield_postings.suscribe(doc_id, &term);
}
num_field_values
};
let num_tokens: u32 =
if text_options.get_indexing_options().is_tokenized() {
if let Some(ref mut analyzer) = self.analyzers[field.0 as usize] {
let texts: Vec<&str> = field_values.iter()
.flat_map(|field_value| {
match field_value.value() {
&Value::Str(ref text) => Some(text.as_str()),
_ => None
}
})
.collect();
let mut token_stream = analyzer.token_stream_texts(&texts[..]);
self.multifield_postings.index_text(doc_id, field, &mut token_stream)
}
else {
0u32
}
} else {
let num_field_values = field_values.len() as u32;
for field_value in field_values {
let term = Term::from_field_text(field, field_value.value().text());
self.multifield_postings.suscribe(doc_id, &term);
}
num_field_values
};
self.fieldnorms_writer
.get_field_writer(field)
.map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));

View File

@@ -1,18 +1,18 @@
use DocId;
use schema::Term;
use schema::FieldValue;
use postings::PostingsSerializer;
use std::io;
use postings::Recorder;
use Result;
use schema::{Schema, Field};
use analyzer::{en_pipeline, Token};
use analyzer::Token;
use std::marker::PhantomData;
use std::ops::DerefMut;
use datastruct::stacker::{HashMap, Heap};
use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
use schema::FieldEntry;
use schema::FieldType;
use analyzer::TokenStream;
use schema::TextIndexingOptions;
fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
@@ -62,9 +62,9 @@ impl<'a> MultiFieldPostingsWriter<'a> {
}
}
pub fn index_text(&mut self, doc: DocId, field: Field, field_values: &[&FieldValue]) -> u32 {
pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
}
pub fn suscribe(&mut self, doc: DocId, term: &Term) {
@@ -140,39 +140,24 @@ pub trait PostingsWriter {
serializer: &mut PostingsSerializer,
heap: &Heap)
-> io::Result<()>;
/// Tokenize a text and suscribe all of its token.
fn index_text<'a>(&mut self,
term_index: &mut HashMap,
doc_id: DocId,
field: Field,
field_values: &[&'a FieldValue],
token_stream: &mut TokenStream,
heap: &Heap)
-> u32 {
let mut num_tokens: u32 = 0u32;
let mut term = unsafe { Term::with_capacity(100) };
term.set_field(field);
let mut analyzer = en_pipeline();
let mut overall_position = 0u32;
let mut sink = |token: &Token| {
term.set_text(token.term.as_str());
self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
};
for field_value in field_values {
// TODO fix position when more than one value.
let mut token_stream = analyzer.token_stream(field_value.value().text());
let mut local_position = 0;
num_tokens += {
let mut sink = |token: &Token| {
term.set_text(token.term.as_str());
local_position = token.position as u32;
self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
};
token_stream.process(&mut sink)
};
overall_position += local_position + 2u32;
}
num_tokens
token_stream.process(&mut sink)
}
}
@@ -213,6 +198,7 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
}
impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
fn suscribe(&mut self,
term_index: &mut HashMap,
doc: DocId,

View File

@@ -8,11 +8,11 @@ use query::Occur;
use query::TermQuery;
use postings::SegmentPostingsOption;
use query::PhraseQuery;
use analyzer::{en_pipeline, BoxedAnalyzer};
use schema::{Term, FieldType};
use std::str::FromStr;
use analyzer::AnalyzerManager;
use std::num::ParseIntError;
use core::Index;
/// Possible error that may happen when parsing a query.
#[derive(Debug, PartialEq, Eq)]
@@ -74,7 +74,7 @@ pub struct QueryParser {
schema: Schema,
default_fields: Vec<Field>,
conjunction_by_default: bool,
analyzer: Box<BoxedAnalyzer>,
analyzer_manager: AnalyzerManager,
}
impl QueryParser {
@@ -82,15 +82,25 @@ impl QueryParser {
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined
/// in the query.
pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
pub fn new(schema: Schema,
default_fields: Vec<Field>,
analyzer_manager: AnalyzerManager) -> QueryParser {
QueryParser {
schema: schema,
default_fields: default_fields,
conjunction_by_default: false,
analyzer: en_pipeline(),
analyzer_manager: analyzer_manager,
}
}
/// Creates a `QueryParser` from an index, using the index's schema
/// and its analyzer manager.
///
/// * index - index whose schema and analyzers are used
/// * default_fields - fields used to search if no field is specifically defined
///   in the query.
pub fn for_index(index: Index, default_fields: Vec<Field>) -> QueryParser {
    let schema = index.schema();
    let analyzer_manager = index.analyzers();
    QueryParser::new(schema, default_fields, analyzer_manager)
}
/// Set the default way to compose queries to a conjunction.
///
/// By default a ,
@@ -135,7 +145,7 @@ impl QueryParser {
}
Ok(ast)
}
fn compute_logical_ast_for_leaf(&mut self,
field: Field,
phrase: &str)
@@ -143,6 +153,11 @@ impl QueryParser {
let field_entry = self.schema.get_field_entry(field);
let field_type = field_entry.field_type();
let mut analyzer = self.analyzer_manager
.get("simple")
.ok_or_else(|| {
QueryParserError::FieldNotIndexed(field_entry.name().to_string())
})?;
if !field_type.is_indexed() {
let field_name = field_entry.name().to_string();
return Err(QueryParserError::FieldNotIndexed(field_name));
@@ -161,7 +176,7 @@ impl QueryParser {
FieldType::Str(ref str_options) => {
let mut terms: Vec<Term> = Vec::new();
if str_options.get_indexing_options().is_tokenized() {
let mut token_stream = self.analyzer.token_stream(phrase);
let mut token_stream = analyzer.token_stream(phrase);
token_stream.process(&mut |token| {
let term = Term::from_field_text(field, &token.term);
terms.push(term);
@@ -296,6 +311,7 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
#[cfg(test)]
mod test {
use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
use analyzer::AnalyzerManager;
use query::Query;
use schema::Field;
use super::QueryParser;
@@ -314,7 +330,8 @@ mod test {
schema_builder.add_text_field("nottokenized", STRING);
let schema = schema_builder.build();
let default_fields = vec![title, text];
QueryParser::new(schema, default_fields)
let analyzer_manager = AnalyzerManager::default();
QueryParser::new(schema, default_fields, analyzer_manager)
}