diff --git a/examples/simple_search.rs b/examples/simple_search.rs
index afdeb47c6..7f06f3431 100644
--- a/examples/simple_search.rs
+++ b/examples/simple_search.rs
@@ -179,7 +179,7 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> {
     // Here, if the user does not specify which
     // field they want to search, tantivy will search
     // in both title and body.
-    let mut query_parser = QueryParser::new(index.schema(), vec![title, body]);
+    let mut query_parser = QueryParser::for_index(index, vec![title, body]);
 
     // QueryParser may fail if the query is not in the right
     // format. For user facing applications, this can be a problem.
diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs
index 08912c574..fdd5cbd25 100644
--- a/src/analyzer/analyzer.rs
+++ b/src/analyzer/analyzer.rs
@@ -1,4 +1,8 @@
+/// The analyzer module contains all of the tools used to process
+/// text in `tantivy`.
+
 use std::borrow::{Borrow, BorrowMut};
+use analyzer::TokenStreamChain;
 
 /// Token
 pub struct Token {
@@ -26,7 +30,7 @@ impl Default for Token {
     }
 }
 
-pub trait Analyzer<'a>: Sized {
+pub trait Analyzer<'a>: Sized + Clone {
     type TokenStreamImpl: TokenStream;
 
     fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
@@ -41,20 +45,49 @@ pub trait Analyzer<'a>: Sized {
     }
 }
 
-pub trait BoxedAnalyzer {
-    fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
+pub trait BoxedAnalyzer: Send + Sync {
+    fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
+    fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
+    fn boxed_clone(&self) -> Box<BoxedAnalyzer>;
 }
 
-struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a>;
+#[derive(Clone)]
+struct BoxableAnalyzer<A>(A) where A: for <'a> Analyzer<'a> + Send + Sync;
 
-impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + for <'a> Analyzer<'a> {
-    fn token_stream<'b>(&mut self, text: &'b str) -> Box<TokenStream + 'b> {
+impl<A> BoxedAnalyzer for BoxableAnalyzer<A> where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
+    fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a> {
         box self.0.token_stream(text)
     }
+
+    fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b> {
+        assert!(texts.len() > 0);
+        if texts.len() == 1 {
+            box self.0.token_stream(texts[0])
+        }
+        else {
+            let mut offsets = vec!();
+            let mut total_offset = 0;
+            for text in texts {
+                offsets.push(total_offset);
+                total_offset += text.len();
+            }
+            let token_streams: Vec<_> = texts
+                .iter()
+                .map(|text| {
+                    self.0.token_stream(text)
+                })
+                .collect();
+            box TokenStreamChain::new(offsets, token_streams)
+        }
+    }
+
+    fn boxed_clone(&self) -> Box<BoxedAnalyzer> {
+        box self.clone()
+    }
 }
 
 pub fn box_analyzer<A>(a: A) -> Box<BoxedAnalyzer>
-    where A: 'static + for <'a> Analyzer<'a> {
+    where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
     box BoxableAnalyzer(a)
 }
 
@@ -102,7 +135,7 @@ pub trait TokenStream {
     }
 }
 
-
+#[derive(Clone)]
 pub struct ChainAnalyzer<HeadTokenFilterFactory, TailAnalyzer> {
     head: HeadTokenFilterFactory,
     tail: TailAnalyzer,
@@ -117,13 +150,13 @@ impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a>
     type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
 
     fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
-        let tail_token_stream = self.tail.token_stream(text);
+        let tail_token_stream = self.tail.token_stream(text );
         self.head.transform(tail_token_stream)
     }
 }
 
-pub trait TokenFilterFactory<TailTokenStream: TokenStream> {
+pub trait TokenFilterFactory<TailTokenStream: TokenStream>: Clone {
     type ResultTokenStream: TokenStream;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
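For reference, a minimal sketch (not part of the patch) of how the boxed-analyzer API above might be used. Everything named here comes from this diff — `box_analyzer`, the `filter` combinator used elsewhere in the patch, `BoxedAnalyzer::token_stream_texts`, `TokenStream::process` — only the demo function itself is made up.

    use analyzer::{box_analyzer, BoxedAnalyzer, LowerCaser, RemoveLongFilter, SimpleTokenizer, Token};

    fn boxed_analyzer_demo() {
        // Compose a concrete pipeline, then erase its type so it can be stored
        // and cloned behind a `Box<BoxedAnalyzer>`.
        let mut analyzer: Box<BoxedAnalyzer> = box_analyzer(
            SimpleTokenizer
                .filter(RemoveLongFilter::limit(40))
                .filter(LowerCaser)
        );

        // `token_stream_texts` stitches several field values into a single
        // stream, shifting offsets and positions via `TokenStreamChain`.
        let texts = ["The quick brown fox", "jumps over the lazy dog"];
        let mut token_stream = analyzer.token_stream_texts(&texts[..]);

        let mut terms: Vec<String> = Vec::new();
        {
            let mut sink = |token: &Token| { terms.push(token.term.clone()); };
            token_stream.process(&mut sink);
        }
        // With the filters above, "The" should come out lowercased.
        assert_eq!(terms[0], "the");
    }
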
diff --git a/src/analyzer/analyzer_manager.rs b/src/analyzer/analyzer_manager.rs
new file mode 100644
index 000000000..fb398d2a1
--- /dev/null
+++ b/src/analyzer/analyzer_manager.rs
@@ -0,0 +1,66 @@
+use std::collections::HashMap;
+use std::sync::{Arc, RwLock};
+use analyzer::BoxedAnalyzer;
+use analyzer::Analyzer;
+use analyzer::box_analyzer;
+use analyzer::SimpleTokenizer;
+use analyzer::JapaneseTokenizer;
+use analyzer::RemoveLongFilter;
+use analyzer::LowerCaser;
+use analyzer::Stemmer;
+
+
+#[derive(Clone)]
+pub struct AnalyzerManager {
+    analyzers: Arc<RwLock<HashMap<String, Box<BoxedAnalyzer>>>>
+}
+
+impl AnalyzerManager {
+    pub fn register<A>(&self, analyzer_name: &str, analyzer: A)
+        where A: 'static + Send + Sync + for <'a> Analyzer<'a> {
+        let boxed_analyzer = box_analyzer(analyzer);
+        self.analyzers
+            .write()
+            .expect("Acquiring the lock should never fail")
+            .insert(analyzer_name.to_string(), boxed_analyzer);
+    }
+
+    pub fn get(&self, analyzer_name: &str) -> Option<Box<BoxedAnalyzer>> {
+        self.analyzers
+            .read()
+            .expect("Acquiring the lock should never fail")
+            .get(analyzer_name)
+            .map(|boxed_analyzer| {
+                boxed_analyzer.boxed_clone()
+            })
+    }
+}
+
+impl Default for AnalyzerManager {
+    /// Creates an `AnalyzerManager` prepopulated with
+    /// the default analyzers of `tantivy`.
+    /// - simple
+    /// - en_stem
+    /// - ja
+    fn default() -> AnalyzerManager {
+        let manager = AnalyzerManager {
+            analyzers: Arc::new(RwLock::new(HashMap::new()))
+        };
+        manager.register("simple",
+            SimpleTokenizer
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+        );
+        manager.register("en_stem",
+            SimpleTokenizer
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+                .filter(Stemmer::new())
+        );
+        manager.register("ja",
+            JapaneseTokenizer
+                .filter(RemoveLongFilter::limit(40))
+        );
+        manager
+    }
+}
\ No newline at end of file
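A quick sketch (not part of the patch) of how the new `AnalyzerManager` is meant to be used. The analyzer name "lowercase_only" and the demo function are hypothetical; every type and method comes from the file above.

    use analyzer::{AnalyzerManager, LowerCaser, RemoveLongFilter, SimpleTokenizer, Token};

    fn analyzer_manager_demo() {
        // The default manager is prepopulated with "simple", "en_stem" and "ja".
        let manager = AnalyzerManager::default();
        assert!(manager.get("en_stem").is_some());
        assert!(manager.get("does_not_exist").is_none());

        // `register` adds (or overwrites) an entry. `get` hands back an
        // independent clone of the boxed analyzer, so the manager itself can be
        // shared freely: all state sits behind the `Arc<RwLock<HashMap<..>>>`.
        manager.register("lowercase_only",
            SimpleTokenizer
                .filter(RemoveLongFilter::limit(255))
                .filter(LowerCaser));

        let mut analyzer = manager.get("lowercase_only").expect("just registered");
        let num_tokens = analyzer
            .token_stream("Happy Tax Payer")
            .process(&mut |_token: &Token| {});
        assert_eq!(num_tokens, 3);
    }
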
diff --git a/src/analyzer/jp_tokenizer.rs b/src/analyzer/japanese_tokenizer.rs
similarity index 78%
rename from src/analyzer/jp_tokenizer.rs
rename to src/analyzer/japanese_tokenizer.rs
index 4bc5d4689..909ccbb0c 100644
--- a/src/analyzer/jp_tokenizer.rs
+++ b/src/analyzer/japanese_tokenizer.rs
@@ -1,7 +1,10 @@
 use super::{Token, Analyzer, TokenStream};
 use tinysegmenter;
 
-pub struct JPTokenizer;
+
+/// Simple Japanese tokenizer based on the `tinysegmenter` crate.
+#[derive(Clone)]
+pub struct JapaneseTokenizer;
 
 #[derive(Eq, PartialEq)]
 enum Cursor {
@@ -10,13 +13,13 @@ enum Cursor {
     Terminated,
 }
 
-pub struct JPTokenizerStream {
+pub struct JapaneseTokenizerStream {
     tokens: Vec<Token>,
     cursor: Cursor,
 }
 
-impl<'a> Analyzer<'a> for JPTokenizer {
-    type TokenStreamImpl = JPTokenizerStream;
+impl<'a> Analyzer<'a> for JapaneseTokenizer {
+    type TokenStreamImpl = JapaneseTokenizerStream;
 
     fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl {
         let mut tokens = vec![];
@@ -25,21 +28,23 @@ impl<'a> Analyzer<'a> for JPTokenizer {
         for (pos, term) in tinysegmenter::tokenize(text).into_iter().enumerate() {
             offset_from = offset_to;
             offset_to = offset_from + term.len();
-            tokens.push(Token {
-                offset_from: offset_from,
-                offset_to: offset_to,
-                position: pos,
-                term: term,
-            });
+            if term.chars().all(char::is_alphanumeric) {
+                tokens.push(Token {
+                    offset_from: offset_from,
+                    offset_to: offset_to,
+                    position: pos,
+                    term: term,
+                });
+            }
         }
-        JPTokenizerStream {
+        JapaneseTokenizerStream {
             tokens: tokens,
             cursor: Cursor::HasNotStarted,
         }
     }
 }
 
-impl<'a> TokenStream for JPTokenizerStream {
+impl<'a> TokenStream for JapaneseTokenizerStream {
     fn advance(&mut self) -> bool {
         let new_cursor = match self.cursor {
             Cursor::HasNotStarted => {
diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs
index a3f72ddcc..866508782 100644
--- a/src/analyzer/lower_caser.rs
+++ b/src/analyzer/lower_caser.rs
@@ -1,6 +1,9 @@
 use super::{TokenFilterFactory, TokenStream, Token};
 use std::ascii::AsciiExt;
 
+
+/// Token filter that lowercases terms.
+#[derive(Clone)]
 pub struct LowerCaser;
 
 impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs
index 5cc42b83f..bea83d294 100644
--- a/src/analyzer/mod.rs
+++ b/src/analyzer/mod.rs
@@ -4,45 +4,31 @@ mod analyzer;
 mod simple_tokenizer;
 mod lower_caser;
 mod remove_long;
-mod remove_nonalphanum;
 mod stemmer;
-mod jp_tokenizer;
+mod analyzer_manager;
+mod japanese_tokenizer;
+mod token_stream_chain;
 
-pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory,
-                         TokenStream};
+pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory, TokenStream};
+pub use self::analyzer::BoxedAnalyzer;
+pub use self::analyzer_manager::AnalyzerManager;
 pub use self::simple_tokenizer::SimpleTokenizer;
-pub use self::jp_tokenizer::JPTokenizer;
+pub use self::token_stream_chain::TokenStreamChain;
+pub use self::japanese_tokenizer::JapaneseTokenizer;
 pub use self::remove_long::RemoveLongFilter;
 pub use self::lower_caser::LowerCaser;
 pub use self::stemmer::Stemmer;
-pub use self::remove_nonalphanum::RemoveNonAlphaFilter;
-pub use self::analyzer::BoxedAnalyzer;
-
-
-pub fn en_pipeline<'a>() -> Box<BoxedAnalyzer> {
-    box_analyzer(
-        SimpleTokenizer
-            .filter(RemoveLongFilter::limit(20))
-            .filter(LowerCaser)
-            .filter(Stemmer::new())
-    )
-}
-
-pub fn jp_pipeline<'a>() -> Box<BoxedAnalyzer> {
-    box_analyzer(
-        JPTokenizer
-            .filter(RemoveLongFilter::limit(20))
-            .filter(RemoveNonAlphaFilter)
-    )
-}
 
 #[cfg(test)]
 mod test {
-    use super::{en_pipeline, jp_pipeline, Token};
+    use super::Token;
+    use super::AnalyzerManager;
 
     #[test]
     fn test_en_analyzer() {
-        let mut en_analyzer = en_pipeline();
+        let analyzer_manager = AnalyzerManager::default();
+        assert!(analyzer_manager.get("en_doesnotexist").is_none());
+        let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
         let mut tokens: Vec<String> = vec![];
         {
             let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
@@ -57,7 +43,9 @@ mod test {
 
     #[test]
     fn test_jp_analyzer() {
-        let mut en_analyzer = jp_pipeline();
+        let analyzer_manager = AnalyzerManager::default();
+        let mut en_analyzer = analyzer_manager.get("ja").unwrap();
+
         let mut tokens: Vec<String> = vec![];
         {
             let mut add_token = |token: &Token| { tokens.push(token.term.clone()); };
@@ -73,7 +61,8 @@ mod test {
 
     #[test]
     fn test_tokenizer_empty() {
-        let mut en_analyzer = en_pipeline();
+        let analyzer_manager = AnalyzerManager::default();
+        let mut en_analyzer = analyzer_manager.get("en_stem").unwrap();
         {
             let mut tokens: Vec<String> = vec![];
             {
diff --git a/src/analyzer/remove_long.rs b/src/analyzer/remove_long.rs
index 98b73b973..c446d3521 100644
--- a/src/analyzer/remove_long.rs
+++ b/src/analyzer/remove_long.rs
@@ -1,6 +1,12 @@
 use super::{TokenFilterFactory, TokenStream, Token};
 
 
+/// `RemoveLongFilter` removes tokens that are longer
+/// than a given number of bytes (in UTF-8 representation).
+///
+/// It is especially useful when indexing unconstrained content,
+/// e.g. mails containing base-64 encoded pictures.
+#[derive(Clone)]
 pub struct RemoveLongFilter {
     length_limit: usize,
 }
diff --git a/src/analyzer/remove_nonalphanum.rs b/src/analyzer/remove_nonalphanum.rs
deleted file mode 100644
index ede810680..000000000
--- a/src/analyzer/remove_nonalphanum.rs
+++ /dev/null
@@ -1,58 +0,0 @@
-use super::{TokenFilterFactory, TokenStream, Token};
-
-
-pub struct RemoveNonAlphaFilter;
-
-impl<TailTokenStream> RemoveNonAlphaFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream
-{
-    fn predicate(&self, token: &Token) -> bool {
-        for c in token.term.chars() {
-            if !c.is_alphanumeric() {
-                return false;
-            }
-        }
-        true
-    }
-}
-
-
-impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveNonAlphaFilter
-    where TailTokenStream: TokenStream
-{
-    type ResultTokenStream = RemoveNonAlphaFilterStream<TailTokenStream>;
-
-    fn transform(&self, tail: TailTokenStream) -> Self::ResultTokenStream {
-        RemoveNonAlphaFilterStream { tail: tail }
-    }
-}
-
-pub struct RemoveNonAlphaFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream
-{
-    tail: TailTokenStream,
-}
-
-impl<TailTokenStream> TokenStream for RemoveNonAlphaFilterStream<TailTokenStream>
-    where TailTokenStream: TokenStream
-{
-    fn token(&self) -> &Token {
-        self.tail.token()
-    }
-
-    fn token_mut(&mut self) -> &mut Token {
-        self.tail.token_mut()
-    }
-
-    fn advance(&mut self) -> bool {
-        loop {
-            if self.tail.advance() {
-                if self.predicate(self.tail.token()) {
-                    return true;
-                }
-            } else {
-                return false;
-            }
-        }
-    }
-}
diff --git a/src/analyzer/simple_tokenizer.rs b/src/analyzer/simple_tokenizer.rs
index c79282279..d36b785fe 100644
--- a/src/analyzer/simple_tokenizer.rs
+++ b/src/analyzer/simple_tokenizer.rs
@@ -2,6 +2,7 @@ use std::str::CharIndices;
 use super::{Token, Analyzer, TokenStream};
 
 
+#[derive(Clone)]
 pub struct SimpleTokenizer;
 
 pub struct SimpleTokenStream<'a> {
diff --git a/src/analyzer/stemmer.rs b/src/analyzer/stemmer.rs
index 82e3e5ac3..9d1e45811 100644
--- a/src/analyzer/stemmer.rs
+++ b/src/analyzer/stemmer.rs
@@ -2,14 +2,14 @@ use std::sync::Arc;
 use super::{TokenFilterFactory, TokenStream, Token};
 use rust_stemmers::{self, Algorithm};
 
+#[derive(Clone)]
 pub struct Stemmer {
-    stemmer: Arc<rust_stemmers::Stemmer>,
+    stemmer_algorithm: Arc<Algorithm>,
 }
 
 impl Stemmer {
     pub fn new() -> Stemmer {
-        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
-        Stemmer { stemmer: Arc::new(inner_stemmer) }
+        Stemmer { stemmer_algorithm: Arc::new(Algorithm::English) }
     }
 }
 
@@ -19,7 +19,8 @@ impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
 
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
-        StemmerTokenStream::wrap(self.stemmer.clone(), token_stream)
+        let inner_stemmer = rust_stemmers::Stemmer::create(Algorithm::English);
+        StemmerTokenStream::wrap(inner_stemmer, token_stream)
     }
 }
 
@@ -28,7 +29,7 @@ pub struct StemmerTokenStream<TailTokenStream>
     where TailTokenStream: TokenStream
 {
     tail: TailTokenStream,
-    stemmer: Arc<rust_stemmers::Stemmer>,
+    stemmer: rust_stemmers::Stemmer,
 }
 
 impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
@@ -58,7 +59,7 @@ impl<TailTokenStream> TokenStream for StemmerTokenStream<TailTokenStream>
 impl<TailTokenStream> StemmerTokenStream<TailTokenStream>
     where TailTokenStream: TokenStream
 {
-    fn wrap(stemmer: Arc<rust_stemmers::Stemmer>,
+    fn wrap(stemmer: rust_stemmers::Stemmer,
             tail: TailTokenStream)
             -> StemmerTokenStream<TailTokenStream> {
         StemmerTokenStream {
diff --git a/src/analyzer/token_stream_chain.rs b/src/analyzer/token_stream_chain.rs
new file mode 100644
index 000000000..6f59f9ae2
--- /dev/null
+++ b/src/analyzer/token_stream_chain.rs
@@ -0,0 +1,63 @@
+use analyzer::{TokenStream, Token};
+
+pub struct TokenStreamChain<TTokenStream> {
+    offsets: Vec<usize>,
+    token_streams: Vec<TTokenStream>,
+    position_shift: usize,
+    stream_idx: usize,
+    token: Token,
+}
+
+
+impl<TTokenStream> TokenStreamChain<TTokenStream>
+    where TTokenStream: TokenStream {
+
+    pub fn new(offsets: Vec<usize>,
+               token_streams: Vec<TTokenStream>) -> TokenStreamChain<TTokenStream> {
+        TokenStreamChain {
+            offsets: offsets,
+            stream_idx: 0,
+            token_streams: token_streams,
+            position_shift: 0,
+            token: Token::default(),
+        }
+    }
+}
+
+impl<TTokenStream> TokenStream for TokenStreamChain<TTokenStream>
+    where TTokenStream: TokenStream {
+    fn advance(&mut self) -> bool {
+        while self.stream_idx < self.token_streams.len() {
+            let token_stream = &mut self.token_streams[self.stream_idx];
+            if token_stream.advance() {
+                let token = token_stream.token();
+                let offset_offset = self.offsets[self.stream_idx];
+                self.token.offset_from = token.offset_from + offset_offset;
+                self.token.offset_to = token.offset_to + offset_offset;
+                self.token.position = token.position + self.position_shift;
+                self.token.term.clear();
+                self.token.term.push_str(token.term.as_str());
+                return true;
+            }
+            else {
+                self.stream_idx += 1;
+                self.position_shift = self.token.position + 2;
+            }
+        }
+        false
+    }
+
+    fn token(&self) -> &Token {
+        if self.stream_idx > self.token_streams.len() {
+            panic!("You called .token(), after the end of the token stream has been reached");
+        }
+        &self.token
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        if self.stream_idx > self.token_streams.len() {
+            panic!("You called .token_mut(), after the end of the token stream has been reached");
+        }
+        &mut self.token
+    }
+}
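To make the offset and position bookkeeping above concrete, here is a small illustration (not part of the patch) that chains two `SimpleTokenizer` streams by hand, the same way `token_stream_texts` does for multi-valued fields. The demo function is hypothetical; the types and methods are the ones defined in this diff.

    use analyzer::{Analyzer, SimpleTokenizer, TokenStream, TokenStreamChain};

    fn token_stream_chain_demo() {
        let mut tokenizer = SimpleTokenizer;
        let first = "hello happy";  // positions 0 and 1
        let second = "tax payer";   // positions 0 and 1 within its own stream

        // Each chained stream is given the byte offset of its text within the
        // virtual concatenation; positions of later streams are shifted by the
        // last emitted position plus 2, so phrase queries cannot straddle values.
        let offsets = vec![0, first.len()];
        let streams = vec![tokenizer.token_stream(first), tokenizer.token_stream(second)];
        let mut chained = TokenStreamChain::new(offsets, streams);

        while chained.advance() {
            let token = chained.token();
            // "tax" is reported with offset_from == 11 and position == 1 + 2 == 3.
            println!("{} at byte {}, position {}", token.term, token.offset_from, token.position);
        }
    }
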
diff --git a/src/collector/facet_collector.rs b/src/collector/facet_collector.rs
index 983d1ffd6..0721a4271 100644
--- a/src/collector/facet_collector.rs
+++ b/src/collector/facet_collector.rs
@@ -103,7 +103,7 @@ mod tests {
         {
             // perform the query
             let mut facet_collectors = chain().push(&mut ffvf_i64).push(&mut ffvf_u64);
-            let mut query_parser = QueryParser::new(schema, vec![text_field]);
+            let mut query_parser = QueryParser::for_index(index, vec![text_field]);
             let query = query_parser.parse_query("text:text").unwrap();
             query.search(&searcher, &mut facet_collectors).unwrap();
         }
diff --git a/src/core/index.rs b/src/core/index.rs
index 01a0abe54..3d8aae70b 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -23,6 +23,7 @@ use directory::ManagedDirectory;
 use core::META_FILEPATH;
 use super::segment::create_segment;
 use indexer::segment_updater::save_new_metas;
+use analyzer::AnalyzerManager;
 
 const NUM_SEARCHERS: usize = 12;
 
@@ -37,6 +38,7 @@ pub struct Index {
     directory: ManagedDirectory,
     schema: Schema,
     searcher_pool: Arc<Pool<Searcher>>,
+    analyzers: AnalyzerManager
 }
 
 
@@ -64,6 +66,10 @@ impl Index {
         Index::from_directory(directory, schema)
     }
 
+    pub fn analyzers(&self) -> AnalyzerManager {
+        self.analyzers.clone()
+    }
+
     /// Creates a new index in a temp directory.
     ///
     /// The index will use the `MMapDirectory` in a newly created directory.
@@ -85,6 +91,7 @@ impl Index {
             directory: directory,
             schema: schema,
             searcher_pool: Arc::new(Pool::new()),
+            analyzers: AnalyzerManager::default(),
         };
         try!(index.load_searchers());
         Ok(index)
@@ -242,6 +249,7 @@ impl Clone for Index {
             directory: self.directory.clone(),
             schema: self.schema.clone(),
             searcher_pool: self.searcher_pool.clone(),
+            analyzers: self.analyzers.clone()
         }
     }
 }
diff --git a/src/core/segment.rs b/src/core/segment.rs
index 6d6d07db5..344c71a8f 100644
--- a/src/core/segment.rs
+++ b/src/core/segment.rs
@@ -36,6 +36,12 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment {
 }
 
 impl Segment {
+
+    pub fn index(&self) -> &Index {
+        &self.index
+    }
+
+
     /// Returns our index's schema.
     pub fn schema(&self) -> Schema {
         self.index.schema()
diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index c5d4d6662..d04fce927 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -14,6 +14,8 @@ use datastruct::stacker::Heap;
 use indexer::index_writer::MARGIN_IN_BYTES;
 use super::operation::AddOperation;
 use postings::MultiFieldPostingsWriter;
+use analyzer::BoxedAnalyzer;
+use schema::Value;
 
 
 /// A `SegmentWriter` is in charge of creating segment index from a
@@ -29,6 +31,7 @@ pub struct SegmentWriter<'a> {
     fast_field_writers: FastFieldsWriter,
     fieldnorms_writer: FastFieldsWriter,
     doc_opstamps: Vec<u64>,
+    analyzers: Vec<Option<Box<BoxedAnalyzer>>>
 }
 
 
@@ -60,6 +63,18 @@ impl<'a> SegmentWriter<'a> {
                -> Result<SegmentWriter<'a>> {
         let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment));
         let multifield_postings = MultiFieldPostingsWriter::new(schema, heap);
+        let analyzers = schema.fields()
+            .iter()
+            .map(|field_entry| field_entry.field_type())
+            .map(|field_type| {
+                match field_type {
+                    &FieldType::Str(ref text_options) => {
+                        segment.index().analyzers().get("simple")
+                    }
+                    _ => None,
+                }
+            })
+            .collect();
         Ok(SegmentWriter {
             heap: heap,
             max_doc: 0,
@@ -68,6 +83,7 @@ impl<'a> SegmentWriter<'a> {
             segment_serializer: segment_serializer,
             fast_field_writers: FastFieldsWriter::from_schema(schema),
             doc_opstamps: Vec::with_capacity(1_000),
+            analyzers: analyzers,
         })
     }
 
@@ -117,17 +133,32 @@ impl<'a> SegmentWriter<'a> {
             let field_options = schema.get_field_entry(field);
             match *field_options.field_type() {
                 FieldType::Str(ref text_options) => {
-                    let num_tokens: u32 = if text_options.get_indexing_options().is_tokenized() {
-                        self.multifield_postings
-                            .index_text(doc_id, field, &field_values)
-                    } else {
-                        let num_field_values = field_values.len() as u32;
-                        for field_value in field_values {
-                            let term = Term::from_field_text(field, field_value.value().text());
-                            self.multifield_postings.suscribe(doc_id, &term);
-                        }
-                        num_field_values
-                    };
+                    let num_tokens: u32 =
+                        if text_options.get_indexing_options().is_tokenized() {
+                            if let Some(ref mut analyzer) = self.analyzers[field.0 as usize] {
+                                let texts: Vec<&str> = field_values.iter()
+                                    .flat_map(|field_value| {
+                                        match field_value.value() {
+                                            &Value::Str(ref text) => Some(text.as_str()),
+                                            _ => None
+                                        }
+                                    })
+                                    .collect();
+                                let mut token_stream = analyzer.token_stream_texts(&texts[..]);
+                                self.multifield_postings.index_text(doc_id, field, &mut token_stream)
+                            }
+                            else {
+                                0u32
+                            }
+
+                        } else {
+                            let num_field_values = field_values.len() as u32;
+                            for field_value in field_values {
+                                let term = Term::from_field_text(field, field_value.value().text());
+                                self.multifield_postings.suscribe(doc_id, &term);
+                            }
+                            num_field_values
+                        };
                     self.fieldnorms_writer
                         .get_field_writer(field)
                         .map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64));
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 3823b20e5..03e10da26 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -1,18 +1,18 @@
 use DocId;
 use schema::Term;
-use schema::FieldValue;
 use postings::PostingsSerializer;
 use std::io;
 use postings::Recorder;
 use Result;
 use schema::{Schema, Field};
-use analyzer::{en_pipeline, Token};
+use analyzer::Token;
 use std::marker::PhantomData;
 use std::ops::DerefMut;
 use datastruct::stacker::{HashMap, Heap};
 use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder};
 use schema::FieldEntry;
 use schema::FieldType;
+use analyzer::TokenStream;
 use schema::TextIndexingOptions;
 
 fn posting_from_field_entry<'a>(field_entry: &FieldEntry,
@@ -62,9 +62,9 @@ impl<'a> MultiFieldPostingsWriter<'a> {
         }
     }
 
-    pub fn index_text(&mut self, doc: DocId, field: Field, field_values: &[&FieldValue]) -> u32 {
+    pub fn index_text(&mut self, doc: DocId, field: Field, token_stream: &mut TokenStream) -> u32 {
         let postings_writer = self.per_field_postings_writers[field.0 as usize].deref_mut();
-        postings_writer.index_text(&mut self.term_index, doc, field, field_values, self.heap)
+        postings_writer.index_text(&mut self.term_index, doc, field, token_stream, self.heap)
     }
 
     pub fn suscribe(&mut self, doc: DocId, term: &Term) {
@@ -140,39 +140,24 @@ pub trait PostingsWriter {
                  serializer: &mut PostingsSerializer,
                  heap: &Heap)
                  -> io::Result<()>;
-
+    /// Tokenizes a text and suscribes all of its tokens.
     fn index_text<'a>(&mut self,
                       term_index: &mut HashMap,
                       doc_id: DocId,
                       field: Field,
-                      field_values: &[&'a FieldValue],
+                      token_stream: &mut TokenStream,
                       heap: &Heap)
                       -> u32 {
-        let mut num_tokens: u32 = 0u32;
         let mut term = unsafe { Term::with_capacity(100) };
-
         term.set_field(field);
-        let mut analyzer = en_pipeline();
-
-        let mut overall_position = 0u32;
+        let mut sink = |token: &Token| {
+            term.set_text(token.term.as_str());
+            self.suscribe(term_index, doc_id, token.position as u32, &term, heap);
+        };
-        for field_value in field_values {
-            // TODO fix position when more than one value.
-            let mut token_stream = analyzer.token_stream(field_value.value().text());
-            let mut local_position = 0;
-            num_tokens += {
-                let mut sink = |token: &Token| {
-                    term.set_text(token.term.as_str());
-                    local_position = token.position as u32;
-                    self.suscribe(term_index, doc_id, overall_position + local_position, &term, heap);
-                };
-                token_stream.process(&mut sink)
-            };
-            overall_position += local_position + 2u32;
-        }
-        num_tokens
+        token_stream.process(&mut sink)
     }
 }
@@ -213,6 +198,7 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> {
 }
 
 impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> {
+
     fn suscribe(&mut self,
                 term_index: &mut HashMap,
                 doc: DocId,
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 5cb65ea5c..2478da8fe 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -8,11 +8,11 @@ use query::Occur;
 use query::TermQuery;
 use postings::SegmentPostingsOption;
 use query::PhraseQuery;
-use analyzer::{en_pipeline, BoxedAnalyzer};
 use schema::{Term, FieldType};
 use std::str::FromStr;
+use analyzer::AnalyzerManager;
 use std::num::ParseIntError;
-
+use core::Index;
 
 /// Possible error that may happen when parsing a query.
 #[derive(Debug, PartialEq, Eq)]
@@ -74,7 +74,7 @@ pub struct QueryParser {
     schema: Schema,
     default_fields: Vec<Field>,
     conjunction_by_default: bool,
-    analyzer: Box<BoxedAnalyzer>,
+    analyzer_manager: AnalyzerManager,
 }
 
 impl QueryParser {
@@ -82,15 +82,25 @@ impl QueryParser {
     /// * schema - index Schema
     /// * default_fields - fields used to search if no field is specifically defined
     ///   in the query.
-    pub fn new(schema: Schema, default_fields: Vec<Field>) -> QueryParser {
+    pub fn new(schema: Schema,
+               default_fields: Vec<Field>,
+               analyzer_manager: AnalyzerManager) -> QueryParser {
         QueryParser {
             schema: schema,
             default_fields: default_fields,
             conjunction_by_default: false,
-            analyzer: en_pipeline(),
+            analyzer_manager: analyzer_manager,
         }
     }
 
+    pub fn for_index(index: Index,
+                     default_fields: Vec<Field>) -> QueryParser {
+        QueryParser::new(
+            index.schema(),
+            default_fields,
+            index.analyzers())
+    }
+
     /// Set the default way to compose queries to a conjunction.
     ///
     /// By default a ,
@@ -135,7 +145,7 @@ impl QueryParser {
         }
         Ok(ast)
     }
-
+
     fn compute_logical_ast_for_leaf(&mut self,
                                     field: Field,
                                     phrase: &str)
@@ -143,6 +153,11 @@ impl QueryParser {
         let field_entry = self.schema.get_field_entry(field);
         let field_type = field_entry.field_type();
+        let mut analyzer = self.analyzer_manager
+            .get("simple")
+            .ok_or_else(|| {
+                QueryParserError::FieldNotIndexed(field_entry.name().to_string())
+            })?;
         if !field_type.is_indexed() {
             let field_name = field_entry.name().to_string();
             return Err(QueryParserError::FieldNotIndexed(field_name));
@@ -161,7 +176,7 @@ impl QueryParser {
             FieldType::Str(ref str_options) => {
                 let mut terms: Vec<Term> = Vec::new();
                 if str_options.get_indexing_options().is_tokenized() {
-                    let mut token_stream = self.analyzer.token_stream(phrase);
+                    let mut token_stream = analyzer.token_stream(phrase);
                     token_stream.process(&mut |token| {
                         let term = Term::from_field_text(field, &token.term);
                         terms.push(term);
@@ -296,6 +311,7 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box<Query> {
 #[cfg(test)]
 mod test {
     use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED};
+    use analyzer::AnalyzerManager;
     use query::Query;
     use schema::Field;
     use super::QueryParser;
@@ -314,7 +330,8 @@ mod test {
         schema_builder.add_text_field("nottokenized", STRING);
         let schema = schema_builder.build();
         let default_fields = vec![title, text];
-        QueryParser::new(schema, default_fields)
+        let analyzer_manager = AnalyzerManager::default();
+        QueryParser::new(schema, default_fields, analyzer_manager)
     }
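Finally, a sketch of the user-facing effect of `QueryParser::for_index`. This is not part of the patch: the crate-external paths (`tantivy::query::QueryParser`, `tantivy::Index::create_in_ram`, `tantivy::schema::SchemaBuilder`) are assumed to stay as in the released crate, and the demo function is hypothetical.

    use tantivy::schema::{SchemaBuilder, TEXT};
    use tantivy::query::QueryParser;
    use tantivy::Index;

    fn for_index_demo() {
        let mut schema_builder = SchemaBuilder::default();
        let title = schema_builder.add_text_field("title", TEXT);
        let body = schema_builder.add_text_field("body", TEXT);
        let index = Index::create_in_ram(schema_builder.build());

        // `for_index` pulls both the schema and the `AnalyzerManager` out of the
        // index, so queries are tokenized with the analyzers attached to that
        // index instead of a hard-coded English pipeline.
        let mut query_parser = QueryParser::for_index(index, vec![title, body]);
        let _query = query_parser.parse_query("sea whale").expect("valid query");
    }
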