diff --git a/src/analyzer/analyzer_manager.rs b/src/analyzer/analyzer_manager.rs deleted file mode 100644 index da35ade1a..000000000 --- a/src/analyzer/analyzer_manager.rs +++ /dev/null @@ -1,78 +0,0 @@ -use std::collections::HashMap; -use std::sync::{Arc, RwLock}; -use analyzer::BoxedAnalyzer; -use analyzer::Analyzer; -use analyzer::box_analyzer; -use analyzer::RawTokenizer; -use analyzer::SimpleTokenizer; -use analyzer::JapaneseTokenizer; -use analyzer::RemoveLongFilter; -use analyzer::LowerCaser; -use analyzer::Stemmer; - - - -/// The analyzer manager serves as a store for -/// all of the configured analyzers. -/// -/// By default, it is populated with the following managers. -/// -/// * raw : does not process nor tokenize the text. -/// * default : Tokenizes according to whitespace and punctuation, removes tokens that are too long, lowercases the -#[derive(Clone)] -pub struct AnalyzerManager { - analyzers: Arc< RwLock >> > -} - -impl AnalyzerManager { - pub fn register(&self, analyzer_name: &str, analyzer: A) - where A: 'static + Send + Sync + for <'a> Analyzer<'a> { - let boxed_analyzer = box_analyzer(analyzer); - self.analyzers - .write() - .expect("Acquiring the lock should never fail") - .insert(analyzer_name.to_string(), boxed_analyzer); - } - - pub fn get(&self, analyzer_name: &str) -> Option> { - self.analyzers - .read() - .expect("Acquiring the lock should never fail") - .get(analyzer_name) - .map(|boxed_analyzer| { - boxed_analyzer.boxed_clone() - }) - } -} - -impl Default for AnalyzerManager { - /// Creates an `AnalyzerManager` prepopulated with - /// the default analyzers of `tantivy`. - /// - simple - /// - en_stem - /// - jp - fn default() -> AnalyzerManager { - let manager = AnalyzerManager { - analyzers: Arc::new(RwLock::new(HashMap::new())) - }; - manager.register("raw", - RawTokenizer - ); - manager.register("default", - SimpleTokenizer - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - ); - manager.register("en_stem", - SimpleTokenizer - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(Stemmer::new()) - ); - manager.register("ja", - JapaneseTokenizer - .filter(RemoveLongFilter::limit(40)) - ); - manager - } -} \ No newline at end of file diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs deleted file mode 100644 index 227995b85..000000000 --- a/src/analyzer/mod.rs +++ /dev/null @@ -1,100 +0,0 @@ -mod analyzer; -mod simple_tokenizer; -mod lower_caser; -mod remove_long; -mod stemmer; -mod analyzer_manager; -mod japanese_tokenizer; -mod token_stream_chain; -mod raw_tokenizer; - - -pub use self::analyzer::{box_analyzer, Analyzer, Token, TokenFilterFactory, TokenStream}; -pub use self::analyzer::BoxedAnalyzer; -pub use self::analyzer_manager::AnalyzerManager; -pub use self::simple_tokenizer::SimpleTokenizer; -pub use self::raw_tokenizer::RawTokenizer; -pub use self::token_stream_chain::TokenStreamChain; -pub use self::japanese_tokenizer::JapaneseTokenizer; -pub use self::remove_long::RemoveLongFilter; -pub use self::lower_caser::LowerCaser; -pub use self::stemmer::Stemmer; - -#[cfg(test)] -mod test { - use super::Token; - use super::AnalyzerManager; - - - #[test] - fn test_raw_tokenizer() { - let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("raw").unwrap(); - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; - en_analyzer.token_stream("Hello, happy tax payer!").process(&mut add_token); - } - assert_eq!(tokens.len(), 1); - 
assert_eq!(&tokens[0], "Hello, happy tax payer!"); - } - - - #[test] - fn test_en_analyzer() { - let analyzer_manager = AnalyzerManager::default(); - assert!(analyzer_manager.get("en_doesnotexist").is_none()); - let mut en_analyzer = analyzer_manager.get("en_stem").unwrap(); - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; - en_analyzer.token_stream("Hello, happy tax payer!").process(&mut add_token); - } - assert_eq!(tokens.len(), 4); - assert_eq!(&tokens[0], "hello"); - assert_eq!(&tokens[1], "happi"); - assert_eq!(&tokens[2], "tax"); - assert_eq!(&tokens[3], "payer"); - } - - #[test] - fn test_jp_analyzer() { - let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("ja").unwrap(); - - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; - en_analyzer.token_stream("野菜食べないとやばい!").process(&mut add_token); - } - assert_eq!(tokens.len(), 5); - assert_eq!(&tokens[0], "野菜"); - assert_eq!(&tokens[1], "食べ"); - assert_eq!(&tokens[2], "ない"); - assert_eq!(&tokens[3], "と"); - assert_eq!(&tokens[4], "やばい"); - } - - #[test] - fn test_tokenizer_empty() { - let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("en_stem").unwrap(); - { - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; - en_analyzer.token_stream(" ").process(&mut add_token); - } - assert!(tokens.is_empty()); - } - { - let mut tokens: Vec = vec![]; - { - let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; - en_analyzer.token_stream(" ").process(&mut add_token); - } - assert!(tokens.is_empty()); - } - } - -} diff --git a/src/core/index.rs b/src/core/index.rs index 4cfd6096c..ce1284474 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -23,7 +23,7 @@ use directory::ManagedDirectory; use core::META_FILEPATH; use super::segment::create_segment; use indexer::segment_updater::save_new_metas; -use analyzer::AnalyzerManager; +use tokenizer::TokenizerManager; const NUM_SEARCHERS: usize = 12; @@ -38,7 +38,7 @@ pub struct Index { directory: ManagedDirectory, schema: Schema, searcher_pool: Arc>, - analyzers: AnalyzerManager + tokenizers: TokenizerManager } @@ -68,9 +68,9 @@ impl Index { } - /// Accessor for the analyzer manager. - pub fn analyzers(&self) -> &AnalyzerManager { - &self.analyzers + /// Accessor for the tokenizer manager. + pub fn tokenizers(&self) -> &TokenizerManager { + &self.tokenizers } /// Creates a new index in a temp directory. 
@@ -94,9 +94,9 @@ impl Index { directory: directory, schema: schema, searcher_pool: Arc::new(Pool::new()), - analyzers: AnalyzerManager::default(), + tokenizers: TokenizerManager::default(), }; - try!(index.load_searchers()); + index.load_searchers()?; Ok(index) } @@ -110,7 +110,7 @@ impl Index { pub fn open(directory_path: &Path) -> Result { let mmap_directory = MmapDirectory::open(directory_path)?; let directory = ManagedDirectory::new(mmap_directory)?; - let metas = try!(load_metas(&directory)); + let metas = load_metas(&directory)?; Index::create_from_metas(directory, metas) } @@ -259,7 +259,7 @@ impl Clone for Index { directory: self.directory.clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), - analyzers: self.analyzers.clone() + tokenizers: self.tokenizers.clone() } } } diff --git a/src/core/segment.rs b/src/core/segment.rs index 064d38cad..91d1f382d 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -37,11 +37,11 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { impl Segment { + /// Returns the index the segment belongs to. pub fn index(&self) -> &Index { &self.index } - /// Returns our index's schema. pub fn schema(&self) -> Schema { self.index.schema() diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index b6f0a7c15..85627e19c 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -406,7 +406,7 @@ mod tests { let mut schema_builder = schema::SchemaBuilder::default(); let text_fieldtype = schema::TextOptions::default() .set_indexing_options(TextFieldIndexing::default() - .set_analyzer("default") + .set_tokenizer("default") .set_index_option(IndexRecordOption::WithFreqs)) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index 1b1b85ee1..d2245d300 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -14,7 +14,7 @@ use datastruct::stacker::Heap; use indexer::index_writer::MARGIN_IN_BYTES; use super::operation::AddOperation; use postings::MultiFieldPostingsWriter; -use analyzer::BoxedAnalyzer; +use tokenizer::BoxedTokenizer; use schema::Value; @@ -31,7 +31,7 @@ pub struct SegmentWriter<'a> { fast_field_writers: FastFieldsWriter, fieldnorms_writer: FastFieldsWriter, doc_opstamps: Vec, - analyzers: Vec>> + tokenizers: Vec>> } @@ -62,9 +62,9 @@ impl<'a> SegmentWriter<'a> { mut segment: Segment, schema: &Schema) -> Result> { - let segment_serializer = try!(SegmentSerializer::for_segment(&mut segment)); + let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap); - let analyzers = schema.fields() + let tokenizers = schema.fields() .iter() .map(|field_entry| field_entry.field_type()) .map(|field_type| { @@ -73,8 +73,8 @@ impl<'a> SegmentWriter<'a> { text_options .get_indexing_options() .and_then(|text_index_option| { - let analyzer_name = &text_index_option.analyzer(); - segment.index().analyzers().get(analyzer_name) + let tokenizer_name = &text_index_option.tokenizer(); + segment.index().tokenizers().get(tokenizer_name) }) } _ => None, @@ -89,7 +89,7 @@ impl<'a> SegmentWriter<'a> { segment_serializer: segment_serializer, fast_field_writers: FastFieldsWriter::from_schema(schema), doc_opstamps: Vec::with_capacity(1_000), - analyzers: analyzers, + tokenizers: tokenizers, }) } @@ -146,7 +146,7 @@ impl<'a> SegmentWriter<'a> { match *field_options.field_type() { FieldType::Str(_) 
=> { let num_tokens = - if let Some(ref mut analyzer) = self.analyzers[field.0 as usize] { + if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] { let texts: Vec<&str> = field_values.iter() .flat_map(|field_value| { match field_value.value() { @@ -155,7 +155,7 @@ impl<'a> SegmentWriter<'a> { } }) .collect(); - let mut token_stream = analyzer.token_stream_texts(&texts[..]); + let mut token_stream = tokenizer.token_stream_texts(&texts[..]); self.multifield_postings.index_text(doc_id, field, &mut token_stream) } else { @@ -198,7 +198,7 @@ impl<'a> SegmentWriter<'a> { }) .collect(); let doc_writer = self.segment_serializer.get_store_writer(); - try!(doc_writer.store(&stored_fieldvalues)); + doc_writer.store(&stored_fieldvalues)?; self.max_doc += 1; Ok(()) } @@ -233,16 +233,16 @@ fn write( mut serializer: SegmentSerializer, ) -> Result<()> { - try!(multifield_postings.serialize( + multifield_postings.serialize( serializer.get_postings_serializer(), - )); - try!(fast_field_writers.serialize( + )?; + fast_field_writers.serialize( serializer.get_fast_field_serializer(), - )); - try!(fieldnorms_writer.serialize( + )?; + fieldnorms_writer.serialize( serializer.get_fieldnorms_serializer(), - )); - try!(serializer.close()); + )?; + serializer.close()?; Ok(()) } diff --git a/src/lib.rs b/src/lib.rs index 5adc8e888..faffade18 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -101,7 +101,7 @@ mod common; #[allow(unused_doc_comment)] mod error; -pub mod analyzer; +pub mod tokenizer; mod datastruct; pub mod termdict; diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index 4b4922dac..baaab2c4e 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -5,14 +5,14 @@ use std::io; use postings::Recorder; use Result; use schema::{Schema, Field}; -use analyzer::Token; +use tokenizer::Token; use std::marker::PhantomData; use std::ops::DerefMut; use datastruct::stacker::{HashMap, Heap}; use postings::{NothingRecorder, TermFrequencyRecorder, TFAndPositionRecorder}; use schema::FieldEntry; use schema::FieldType; -use analyzer::TokenStream; +use tokenizer::TokenStream; use schema::IndexRecordOption; fn posting_from_field_entry<'a>( @@ -166,7 +166,7 @@ pub trait PostingsWriter { let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); let mut sink = |token: &Token| { - term.set_text(token.term.as_str()); + term.set_text(token.text.as_str()); self.suscribe(term_index, doc_id, token.position as u32, &term, heap); }; diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 356afaa6e..3c9061500 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -10,7 +10,7 @@ use schema::IndexRecordOption; use query::PhraseQuery; use schema::{Term, FieldType}; use std::str::FromStr; -use analyzer::AnalyzerManager; +use tokenizer::TokenizerManager; use std::num::ParseIntError; use core::Index; @@ -33,9 +33,9 @@ pub enum QueryParserError { /// The field searched for is not declared /// as indexed in the schema. 
FieldNotIndexed(String), - /// The analyzer for the given field is unknown - /// The two argument strings are the name of the field, the name of the analyzer: - UnknownAnalyzer(String, String), + /// The tokenizer for the given field is unknown + /// The two argument strings are the name of the field, the name of the tokenizer + UnknownTokenizer(String, String), } @@ -49,7 +49,7 @@ impl From for QueryParserError { /// /// The language covered by the current parser is extremely simple. /// -/// * simple terms: "e.g.: `Barack Obama` are simply analyzed using +/// * simple terms: "e.g.: `Barack Obama` are simply tokenized using /// tantivy's `StandardTokenizer`, hence becoming `["barack", "obama"]`. /// The terms are then searched within the default terms of the query parser. /// @@ -77,7 +77,7 @@ pub struct QueryParser { schema: Schema, default_fields: Vec, conjunction_by_default: bool, - analyzer_manager: AnalyzerManager, + tokenizer_manager: TokenizerManager, } impl QueryParser { @@ -87,11 +87,11 @@ impl QueryParser { /// in the query. pub fn new(schema: Schema, default_fields: Vec, - analyzer_manager: AnalyzerManager) -> QueryParser { + tokenizer_manager: TokenizerManager) -> QueryParser { QueryParser { schema, default_fields, - analyzer_manager, + tokenizer_manager: tokenizer_manager, conjunction_by_default: false, } } @@ -101,7 +101,7 @@ impl QueryParser { QueryParser::new( index.schema(), default_fields, - index.analyzers().clone()) + index.tokenizers().clone()) } /// Set the default way to compose queries to a conjunction. @@ -176,15 +176,15 @@ impl QueryParser { } FieldType::Str(ref str_options) => { if let Some(option) = str_options.get_indexing_options() { - let mut analyzer = self.analyzer_manager - .get(option.analyzer()) + let mut tokenizer = self.tokenizer_manager + .get(option.tokenizer()) .ok_or_else(|| { - QueryParserError::UnknownAnalyzer(field_entry.name().to_string(), option.analyzer().to_string()) + QueryParserError::UnknownTokenizer(field_entry.name().to_string(), option.tokenizer().to_string()) })?; let mut terms: Vec = Vec::new(); - let mut token_stream = analyzer.token_stream(phrase); + let mut token_stream = tokenizer.token_stream(phrase); token_stream.process(&mut |token| { - let term = Term::from_field_text(field, &token.term); + let term = Term::from_field_text(field, &token.text); terms.push(term); }); if terms.is_empty() { @@ -223,16 +223,12 @@ impl QueryParser { match user_input_ast { UserInputAST::Clause(sub_queries) => { let default_occur = self.default_occur(); - let logical_sub_queries: Vec<(Occur, LogicalAST)> = - try!(sub_queries - .into_iter() - .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query)) - .map(|res| { - res.map(|(occur, sub_ast)| { - (compose_occur(default_occur, occur), sub_ast) - }) - }) - .collect()); + let mut logical_sub_queries: Vec<(Occur, LogicalAST)> = Vec::new(); + for sub_query in sub_queries { + let (occur, sub_ast) = self.compute_logical_ast_with_occur(*sub_query)?; + let new_occur = compose_occur(default_occur, occur); + logical_sub_queries.push((new_occur, sub_ast)); + } Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } UserInputAST::Not(subquery) => { @@ -328,7 +324,7 @@ fn convert_to_query(logical_ast: LogicalAST) -> Box { #[cfg(test)] mod test { use schema::{SchemaBuilder, Term, TEXT, STRING, STORED, INT_INDEXED}; - use analyzer::AnalyzerManager; + use tokenizer::TokenizerManager; use query::Query; use schema::Field; use super::QueryParser; @@ -347,8 +343,8 @@ mod test { 
schema_builder.add_text_field("nottokenized", STRING); let schema = schema_builder.build(); let default_fields = vec![title, text]; - let analyzer_manager = AnalyzerManager::default(); - QueryParser::new(schema, default_fields, analyzer_manager) + let tokenizer_manager = TokenizerManager::default(); + QueryParser::new(schema, default_fields, tokenizer_manager) } diff --git a/src/schema/field_entry.rs b/src/schema/field_entry.rs index 2a045fb77..4bb0204a9 100644 --- a/src/schema/field_entry.rs +++ b/src/schema/field_entry.rs @@ -223,7 +223,7 @@ mod tests { "options": { "indexing": { "record": "position", - "analyzer": "default" + "tokenizer": "default" }, "stored": false } diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index edb57eb3a..bfc97482b 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -13,12 +13,16 @@ /// #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] pub enum IndexRecordOption { + /// records only the `DocId`s #[serde(rename = "basic")] - Basic, //< records only the `DocId`s + Basic, + /// records the document ids as well as the term frequency. #[serde(rename = "freq")] - WithFreqs, //< records the document ids as well as the term frequency. + WithFreqs, + /// records the document id, the term frequency and the positions of + /// the occurences in the document. #[serde(rename = "position")] - WithFreqsAndPositions, //< records the document id, the term frequency and the positions of the occurences in the document. + WithFreqsAndPositions, } impl IndexRecordOption { diff --git a/src/schema/mod.rs b/src/schema/mod.rs index b26d76491..b9cad2b17 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -31,7 +31,7 @@ let mut schema_builder = SchemaBuilder::default(); let title_options = TextOptions::default() .set_stored() .set_indexing_options(TextFieldIndexing::default() - .set_analyzer("default") + .set_tokenizer("default") .set_index_option(IndexRecordOption::WithFreqsAndPositions)); schema_builder.add_text_field("title_options", title_options); let schema = schema_builder.build(); diff --git a/src/schema/schema.rs b/src/schema/schema.rs index d80641485..9e1ba0d12 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -225,16 +225,21 @@ impl Schema { JsonValue::Array(ref json_items) => { for json_item in json_items { let value = - try!(field_type.value_from_json(json_item).map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })); + field_type + .value_from_json(json_item) + .map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })?; doc.add(FieldValue::new(field, value)); } } _ => { - let value = try!(field_type.value_from_json(json_value).map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })); + let value = + field_type + .value_from_json(json_value) + .map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })?; doc.add(FieldValue::new(field, value)); } @@ -360,7 +365,7 @@ mod tests { "options": { "indexing": { "record": "position", - "analyzer": "default" + "tokenizer": "default" }, "stored": false } @@ -371,7 +376,7 @@ mod tests { "options": { "indexing": { "record": "basic", - "analyzer": "raw" + "tokenizer": "raw" }, "stored": false } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 7fb713a0b..46ea89b98 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -47,27 +47,27 @@ impl Default for TextOptions { #[derive(Clone, PartialEq, Eq, 
Debug, Serialize, Deserialize)] pub struct TextFieldIndexing { record: IndexRecordOption, - analyzer: Cow<'static, str>, + tokenizer: Cow<'static, str>, } impl Default for TextFieldIndexing { fn default() -> TextFieldIndexing { TextFieldIndexing { - analyzer: Cow::Borrowed("default"), + tokenizer: Cow::Borrowed("default"), record: IndexRecordOption::Basic, } } } impl TextFieldIndexing { - pub fn set_analyzer(mut self, analyzer_name: &str) -> TextFieldIndexing { - self.analyzer = Cow::Owned(analyzer_name.to_string()); + pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing { + self.tokenizer = Cow::Owned(tokenizer_name.to_string()); self } - pub fn analyzer(&self) -> &str { - &self.analyzer + pub fn tokenizer(&self) -> &str { + &self.tokenizer } pub fn set_index_option(mut self, index_option: IndexRecordOption) -> TextFieldIndexing { @@ -84,7 +84,7 @@ impl TextFieldIndexing { pub const STRING: TextOptions = TextOptions { indexing: Some( TextFieldIndexing { - analyzer: Cow::Borrowed("raw"), + tokenizer: Cow::Borrowed("raw"), record: IndexRecordOption::Basic, }), stored: false, @@ -95,7 +95,7 @@ pub const STRING: TextOptions = TextOptions { pub const TEXT: TextOptions = TextOptions { indexing: Some( TextFieldIndexing { - analyzer: Cow::Borrowed("default"), + tokenizer: Cow::Borrowed("default"), record: IndexRecordOption::WithFreqsAndPositions, }), stored: false, @@ -143,7 +143,7 @@ mod tests { match field_entry.field_type() { &FieldType::Str(ref text_options) => { assert!(text_options.get_indexing_options().is_some()); - assert_eq!(text_options.get_indexing_options().unwrap().analyzer(), "default"); + assert_eq!(text_options.get_indexing_options().unwrap().tokenizer(), "default"); } _ => { panic!(""); diff --git a/src/analyzer/japanese_tokenizer.rs b/src/tokenizer/japanese_tokenizer.rs similarity index 95% rename from src/analyzer/japanese_tokenizer.rs rename to src/tokenizer/japanese_tokenizer.rs index 909ccbb0c..af56eb94f 100644 --- a/src/analyzer/japanese_tokenizer.rs +++ b/src/tokenizer/japanese_tokenizer.rs @@ -1,4 +1,4 @@ -use super::{Token, Analyzer, TokenStream}; +use super::{Token, Tokenizer, TokenStream}; use tinysegmenter; @@ -18,7 +18,7 @@ pub struct JapaneseTokenizerStream { cursor: Cursor, } -impl<'a> Analyzer<'a> for JapaneseTokenizer { +impl<'a> Tokenizer<'a> for JapaneseTokenizer { type TokenStreamImpl = JapaneseTokenizerStream; fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { @@ -33,7 +33,7 @@ impl<'a> Analyzer<'a> for JapaneseTokenizer { offset_from: offset_from, offset_to: offset_to, position: pos, - term: term, + text: term, }); } } diff --git a/src/analyzer/lower_caser.rs b/src/tokenizer/lower_caser.rs similarity index 93% rename from src/analyzer/lower_caser.rs rename to src/tokenizer/lower_caser.rs index 866508782..18da55654 100644 --- a/src/analyzer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,6 +1,4 @@ use super::{TokenFilterFactory, TokenStream, Token}; -use std::ascii::AsciiExt; - /// Token filter that lowercase terms. #[derive(Clone)] @@ -35,7 +33,7 @@ impl TokenStream for LowerCaserTokenStream fn advance(&mut self) -> bool { if self.tail.advance() { - self.tail.token_mut().term.make_ascii_lowercase(); + self.tail.token_mut().text.make_ascii_lowercase(); true } else { false diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs new file mode 100644 index 000000000..b8905909d --- /dev/null +++ b/src/tokenizer/mod.rs @@ -0,0 +1,132 @@ +//! Tokenizer are in charge of processing text for indexing. +//! +//! 
+//! A tokenizer is a configurable pipeline: it starts with a `Tokenizer`,
+//! followed by a sequence of token filters (see
+//! [`TokenFilterFactory`](./trait.TokenFilterFactory.html)) applied to its output.
+//!
+//! The `Tokenizer` is in charge of chopping the text into tokens. Concrete tokenizers
+//! such as [`SimpleTokenizer`](./struct.SimpleTokenizer.html) implement the `Tokenizer`
+//! trait directly, and token filters are chained onto them with `.filter(...)`.
+//!
+//! Configuring text processing therefore boils down to:
+//!
+//! - choosing a tokenizer, which is in charge of chopping your text into tokens;
+//! - adding so-called token filters to modify those tokens (e.g. filtering out stop words, applying stemming, etc.).
+//!
+//! # Example
+//!
+//! ```
+//! extern crate tantivy;
+//! use tantivy::tokenizer::*;
+//!
+//! // ...
+//!
+//! # fn main() {
+//! let mut tokenizer = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! tokenizer
+//!     .token_stream("Hello, happy tax payer")
+//!     .process(&mut |token| {
+//!         println!("token {:?}", token.text);
+//!     });
+//! # }
+//! ```
+
+mod tokenizer;
+mod simple_tokenizer;
+mod lower_caser;
+mod remove_long;
+mod stemmer;
+mod tokenizer_manager;
+mod japanese_tokenizer;
+mod token_stream_chain;
+mod raw_tokenizer;
+
+
+pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilterFactory, TokenStream};
+pub use self::tokenizer::BoxedTokenizer;
+pub use self::tokenizer_manager::TokenizerManager;
+pub use self::simple_tokenizer::SimpleTokenizer;
+pub use self::raw_tokenizer::RawTokenizer;
+pub use self::token_stream_chain::TokenStreamChain;
+pub use self::japanese_tokenizer::JapaneseTokenizer;
+pub use self::remove_long::RemoveLongFilter;
+pub use self::lower_caser::LowerCaser;
+pub use self::stemmer::Stemmer;
+
+#[cfg(test)]
+mod test {
+    use super::Token;
+    use super::TokenizerManager;
+
+
+    #[test]
+    fn test_raw_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        let mut en_tokenizer = tokenizer_manager.get("raw").unwrap();
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
+            en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 1);
+        assert_eq!(&tokens[0], "Hello, happy tax payer!");
+    }
+
+
+    #[test]
+    fn test_en_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        assert!(tokenizer_manager.get("en_doesnotexist").is_none());
+        let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
+            en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 4);
+        assert_eq!(&tokens[0], "hello");
+        assert_eq!(&tokens[1], "happi");
+        assert_eq!(&tokens[2], "tax");
+        assert_eq!(&tokens[3], "payer");
+    }
+
+    #[test]
+    fn test_jp_tokenizer() {
+        let tokenizer_manager = TokenizerManager::default();
+        let mut en_tokenizer = tokenizer_manager.get("ja").unwrap();
+
+        let mut tokens: Vec<String> = vec![];
+        {
+            let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
+            en_tokenizer.token_stream("野菜食べないとやばい!").process(&mut add_token);
+        }
+        assert_eq!(tokens.len(), 5);
+        assert_eq!(&tokens[0], "野菜");
+        assert_eq!(&tokens[1], "食べ");
+        assert_eq!(&tokens[2], "ない");
+        assert_eq!(&tokens[3], "と");
+        assert_eq!(&tokens[4], "やばい");
+    }
+
+    #[test]
+    fn test_tokenizer_empty() {
+        let tokenizer_manager = TokenizerManager::default();
+        let mut en_tokenizer = tokenizer_manager.get("en_stem").unwrap();
+        {
+            let mut tokens: Vec<String> = vec![];
{ + let mut add_token = |token: &Token| { tokens.push(token.text.clone()); }; + en_tokenizer.token_stream(" ").process(&mut add_token); + } + assert!(tokens.is_empty()); + } + { + let mut tokens: Vec = vec![]; + { + let mut add_token = |token: &Token| { tokens.push(token.text.clone()); }; + en_tokenizer.token_stream(" ").process(&mut add_token); + } + assert!(tokens.is_empty()); + } + } + +} diff --git a/src/analyzer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs similarity index 86% rename from src/analyzer/raw_tokenizer.rs rename to src/tokenizer/raw_tokenizer.rs index 488ca5590..65875f301 100644 --- a/src/analyzer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,4 +1,4 @@ -use super::{Token, Analyzer, TokenStream}; +use super::{Token, Tokenizer, TokenStream}; #[derive(Clone)] pub struct RawTokenizer; @@ -8,7 +8,7 @@ pub struct RawTokenStream { has_token: bool, } -impl<'a> Analyzer<'a> for RawTokenizer { +impl<'a> Tokenizer<'a> for RawTokenizer { type TokenStreamImpl = RawTokenStream; fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { @@ -16,7 +16,7 @@ impl<'a> Analyzer<'a> for RawTokenizer { offset_from: 0, offset_to: text.len(), position: 0, - term: text.to_string() + text: text.to_string() }; RawTokenStream { token: token, diff --git a/src/analyzer/remove_long.rs b/src/tokenizer/remove_long.rs similarity index 97% rename from src/analyzer/remove_long.rs rename to src/tokenizer/remove_long.rs index c446d3521..ff7748cdc 100644 --- a/src/analyzer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -22,7 +22,7 @@ impl RemoveLongFilterStream where TailTokenStream: TokenStream { fn predicate(&self, token: &Token) -> bool { - token.term.len() < self.token_length_limit + token.text.len() < self.token_length_limit } fn wrap(token_length_limit: usize, diff --git a/src/analyzer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs similarity index 89% rename from src/analyzer/simple_tokenizer.rs rename to src/tokenizer/simple_tokenizer.rs index 1d4b71c22..cf4bbd487 100644 --- a/src/analyzer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -1,6 +1,6 @@ use std::str::CharIndices; -use super::{Token, Analyzer, TokenStream}; +use super::{Token, Tokenizer, TokenStream}; #[derive(Clone)] pub struct SimpleTokenizer; @@ -11,7 +11,7 @@ pub struct SimpleTokenStream<'a> { token: Token, } -impl<'a> Analyzer<'a> for SimpleTokenizer { +impl<'a> Tokenizer<'a> for SimpleTokenizer { type TokenStreamImpl = SimpleTokenStream<'a>; fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { @@ -36,7 +36,7 @@ impl<'a> SimpleTokenStream<'a> { impl<'a> TokenStream for SimpleTokenStream<'a> { fn advance(&mut self) -> bool { - self.token.term.clear(); + self.token.text.clear(); self.token.position = self.token.position.wrapping_add(1); loop { @@ -46,7 +46,7 @@ impl<'a> TokenStream for SimpleTokenStream<'a> { let offset_to = self.search_token_end(); self.token.offset_from = offset_from; self.token.offset_to = offset_to; - self.token.term.push_str(&self.text[offset_from..offset_to]); + self.token.text.push_str(&self.text[offset_from..offset_to]); return true; } } diff --git a/src/analyzer/stemmer.rs b/src/tokenizer/stemmer.rs similarity index 93% rename from src/analyzer/stemmer.rs rename to src/tokenizer/stemmer.rs index 9d1e45811..29875f018 100644 --- a/src/analyzer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -46,9 +46,9 @@ impl TokenStream for StemmerTokenStream fn advance(&mut self) -> bool { if self.tail.advance() { // TODO remove allocation - let stemmed_str: 
String = self.stemmer.stem(&self.token().term).into_owned(); - self.token_mut().term.clear(); - self.token_mut().term.push_str(&stemmed_str); + let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); + self.token_mut().text.clear(); + self.token_mut().text.push_str(&stemmed_str); true } else { false diff --git a/src/analyzer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs similarity index 93% rename from src/analyzer/token_stream_chain.rs rename to src/tokenizer/token_stream_chain.rs index 6f59f9ae2..401e77bba 100644 --- a/src/analyzer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -1,4 +1,4 @@ -use analyzer::{TokenStream, Token}; +use tokenizer::{TokenStream, Token}; pub struct TokenStreamChain { offsets: Vec, @@ -35,8 +35,8 @@ impl<'a, TTokenStream> TokenStream for TokenStreamChain self.token.offset_from = token.offset_from + offset_offset; self.token.offset_from = token.offset_from + offset_offset; self.token.position = token.position + self.position_shift; - self.token.term.clear(); - self.token.term.push_str(token.term.as_str()); + self.token.text.clear(); + self.token.text.push_str(token.text.as_str()); return true; } else { diff --git a/src/analyzer/analyzer.rs b/src/tokenizer/tokenizer.rs similarity index 64% rename from src/analyzer/analyzer.rs rename to src/tokenizer/tokenizer.rs index f2a485557..eb03e3138 100644 --- a/src/analyzer/analyzer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,10 +1,41 @@ -/// The analyzer module contains all of the tools used to process +/// The tokenizer module contains all of the tools used to process /// text in `tantivy`. use std::borrow::{Borrow, BorrowMut}; -use analyzer::TokenStreamChain; +use tokenizer::TokenStreamChain; /// Token +/// +/// +/// +/// # Example +/// +/// ``` +/// extern crate tantivy; +/// use tantivy::tokenizer::*; +/// +/// # fn main() { +/// let mut tokenizer = SimpleTokenizer +/// .filter(RemoveLongFilter::limit(40)) +/// .filter(LowerCaser); +/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer"); +/// { +/// let token = token_stream.next().unwrap(); +/// assert_eq!(&token.text, "hello"); +/// assert_eq!(token.offset_from, 0); +/// assert_eq!(token.offset_to, 5); +/// assert_eq!(token.position, 0); +/// } +/// { +/// let token = token_stream.next().unwrap(); +/// assert_eq!(&token.text, "happy"); +/// assert_eq!(token.offset_from, 7); +/// assert_eq!(token.offset_to, 12); +/// assert_eq!(token.position, 1); +/// } +/// # } +/// ``` +/// # pub struct Token { /// Offset (byte index) of the first character of the token. /// Offsets shall not be modified by token filters. @@ -16,7 +47,7 @@ pub struct Token { /// Position, expressed in number of tokens. pub position: usize, /// Actual text content of the token. - pub term: String, + pub text: String, } impl Default for Token { @@ -25,7 +56,7 @@ impl Default for Token { offset_from: 0, offset_to: 0, position: usize::max_value(), - term: String::new(), + text: String::new(), } } } @@ -35,31 +66,31 @@ impl Default for Token { // land in nightly. 
-pub trait Analyzer<'a>: Sized + Clone { +pub trait Tokenizer<'a>: Sized + Clone { type TokenStreamImpl: TokenStream; fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl; - fn filter(self, new_filter: NewFilter) -> ChainAnalyzer - where NewFilter: TokenFilterFactory<>::TokenStreamImpl> + fn filter(self, new_filter: NewFilter) -> ChainTokenizer + where NewFilter: TokenFilterFactory<>::TokenStreamImpl> { - ChainAnalyzer { + ChainTokenizer { head: new_filter, tail: self, } } } -pub trait BoxedAnalyzer: Send + Sync { +pub trait BoxedTokenizer: Send + Sync { fn token_stream<'a>(&mut self, text: &'a str) -> Box; fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box; - fn boxed_clone(&self) -> Box; + fn boxed_clone(&self) -> Box; } #[derive(Clone)] -struct BoxableAnalyzer(A) where A: for <'a> Analyzer<'a> + Send + Sync; +struct BoxableTokenizer(A) where A: for <'a> Tokenizer<'a> + Send + Sync; -impl BoxedAnalyzer for BoxableAnalyzer where A: 'static + Send + Sync + for <'a> Analyzer<'a> { +impl BoxedTokenizer for BoxableTokenizer where A: 'static + Send + Sync + for <'a> Tokenizer<'a> { fn token_stream<'a>(&mut self, text: &'a str) -> Box { box self.0.token_stream(text) } @@ -86,14 +117,14 @@ impl BoxedAnalyzer for BoxableAnalyzer where A: 'static + Send + Sync + fo } } - fn boxed_clone(&self) -> Box { + fn boxed_clone(&self) -> Box { box self.clone() } } -pub fn box_analyzer(a: A) -> Box - where A: 'static + Send + Sync + for <'a> Analyzer<'a> { - box BoxableAnalyzer(a) +pub fn box_tokenizer(a: A) -> Box + where A: 'static + Send + Sync + for <'a> Tokenizer<'a> { + box BoxableTokenizer(a) } @@ -141,16 +172,16 @@ pub trait TokenStream { } #[derive(Clone)] -pub struct ChainAnalyzer { +pub struct ChainTokenizer { head: HeadTokenFilterFactory, - tail: TailAnalyzer, + tail: TailTokenizer, } -impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> - for ChainAnalyzer - where HeadTokenFilterFactory: TokenFilterFactory, - TailAnalyzer: Analyzer<'a> +impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a> + for ChainTokenizer + where HeadTokenFilterFactory: TokenFilterFactory, + TailTokenizer: Tokenizer<'a> { type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream; diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs new file mode 100644 index 000000000..e405f7596 --- /dev/null +++ b/src/tokenizer/tokenizer_manager.rs @@ -0,0 +1,78 @@ +use std::collections::HashMap; +use std::sync::{Arc, RwLock}; +use tokenizer::BoxedTokenizer; +use tokenizer::Tokenizer; +use tokenizer::box_tokenizer; +use tokenizer::RawTokenizer; +use tokenizer::SimpleTokenizer; +use tokenizer::JapaneseTokenizer; +use tokenizer::RemoveLongFilter; +use tokenizer::LowerCaser; +use tokenizer::Stemmer; + + + +/// The tokenizer manager serves as a store for +/// all of the pre-configured tokenizer pipelines. +/// +/// By default, it is populated with the following managers. +/// +/// * raw : does not process nor tokenize the text. 
+/// * default : Chops the text on whitespace and punctuation, removes tokens that are too long, and lowercases the remaining tokens.
+#[derive(Clone)]
+pub struct TokenizerManager {
+    tokenizers: Arc<RwLock<HashMap<String, Box<BoxedTokenizer>>>>
+}
+
+impl TokenizerManager {
+    pub fn register<A>(&self, tokenizer_name: &str, tokenizer: A)
+        where A: 'static + Send + Sync + for<'a> Tokenizer<'a> {
+        let boxed_tokenizer = box_tokenizer(tokenizer);
+        self.tokenizers
+            .write()
+            .expect("Acquiring the lock should never fail")
+            .insert(tokenizer_name.to_string(), boxed_tokenizer);
+    }
+
+    pub fn get(&self, tokenizer_name: &str) -> Option<Box<BoxedTokenizer>> {
+        self.tokenizers
+            .read()
+            .expect("Acquiring the lock should never fail")
+            .get(tokenizer_name)
+            .map(|boxed_tokenizer| {
+                boxed_tokenizer.boxed_clone()
+            })
+    }
+}
+
+impl Default for TokenizerManager {
+    /// Creates a `TokenizerManager` prepopulated with
+    /// the default pre-configured tokenizers of `tantivy`:
+    /// - raw
+    /// - default
+    /// - en_stem
+    /// - ja
+    fn default() -> TokenizerManager {
+        let manager = TokenizerManager {
+            tokenizers: Arc::new(RwLock::new(HashMap::new()))
+        };
+        manager.register("raw",
+            RawTokenizer
+        );
+        manager.register("default",
+            SimpleTokenizer
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+        );
+        manager.register("en_stem",
+            SimpleTokenizer
+                .filter(RemoveLongFilter::limit(40))
+                .filter(LowerCaser)
+                .filter(Stemmer::new())
+        );
+        manager.register("ja",
+            JapaneseTokenizer
+                .filter(RemoveLongFilter::limit(40))
+        );
+        manager
+    }
+}
\ No newline at end of file
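Putting the renamed pieces together, here is a minimal sketch of the new API surface: a text field is pointed at a tokenizer by name via `set_tokenizer`, and the pipeline is registered on the index through `tokenizers()`. The field name, the "custom_en" tokenizer name, and the in-RAM index constructor below are illustrative assumptions, not part of this change.

```
extern crate tantivy;

use tantivy::Index;
use tantivy::schema::{SchemaBuilder, TextOptions, TextFieldIndexing, IndexRecordOption};
use tantivy::tokenizer::{SimpleTokenizer, RemoveLongFilter, LowerCaser, Tokenizer};

fn main() {
    // A text field whose indexing options reference a tokenizer by name
    // (the name "custom_en" is illustrative).
    let mut schema_builder = SchemaBuilder::default();
    let text_indexing = TextFieldIndexing::default()
        .set_tokenizer("custom_en")
        .set_index_option(IndexRecordOption::WithFreqsAndPositions);
    let text_options = TextOptions::default()
        .set_indexing_options(text_indexing)
        .set_stored();
    schema_builder.add_text_field("body", text_options);
    let schema = schema_builder.build();

    // Assumed constructor for brevity; any way of obtaining an `Index` works.
    let index = Index::create_in_ram(schema);

    // The index now exposes `tokenizers()` instead of `analyzers()`.
    index.tokenizers().register(
        "custom_en",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser),
    );
}
```

Queries built with `QueryParser::for_index` then pick up the same `TokenizerManager` through `index.tokenizers().clone()`, as shown in the query parser changes above.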