From 974c321153a763755674a02408c7be84fbecd4e8 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 26 Nov 2017 11:02:02 +0900 Subject: [PATCH] cargo fmt --- examples/simple_search.rs | 12 ++-- src/core/index.rs | 4 +- src/core/inverted_index_reader.rs | 6 +- src/core/segment.rs | 1 - src/indexer/merger.rs | 20 +++---- src/indexer/segment_writer.rs | 80 +++++++++++++------------- src/postings/postings_writer.rs | 43 +++++++------- src/postings/segment_postings.rs | 6 +- src/postings/serializer.rs | 8 ++- src/query/query_parser/query_parser.rs | 48 ++++++++-------- src/schema/field_type.rs | 8 ++- src/schema/index_record_option.rs | 9 ++- src/schema/mod.rs | 4 +- src/schema/schema.rs | 18 ++---- src/schema/text_options.rs | 27 +++++---- src/tokenizer/lower_caser.rs | 9 ++- src/tokenizer/mod.rs | 14 +++-- src/tokenizer/raw_tokenizer.rs | 5 +- src/tokenizer/remove_long.rs | 18 +++--- src/tokenizer/stemmer.rs | 18 +++--- src/tokenizer/token_stream_chain.rs | 20 ++++--- src/tokenizer/tokenizer.rs | 39 +++++++------ src/tokenizer/tokenizer_manager.rs | 42 ++++++-------- 23 files changed, 236 insertions(+), 223 deletions(-) diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 301508cd5..e90304131 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -36,12 +36,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { let mut schema_builder = SchemaBuilder::default(); // Our first field is title. - // We want full-text search for it, and we also want + // We want full-text search for it, and we also want // to be able to retrieve the document after the search. - // + // // TEXT | STORED is some syntactic sugar to describe // that. - // + // // `TEXT` means the field should be tokenized and indexed, // along with its term frequency and term positions. // @@ -52,11 +52,11 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { schema_builder.add_text_field("title", TEXT | STORED); // Our second field is body. - // We want full-text search for it, but we do not + // We want full-text search for it, but we do not // need to be able to be able to retrieve it - // for our application. + // for our application. // - // We can make our index lighter and + // We can make our index lighter and // by omitting `STORED` flag. schema_builder.add_text_field("body", TEXT); diff --git a/src/core/index.rs b/src/core/index.rs index e790fecd9..ede44867c 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -38,7 +38,7 @@ pub struct Index { directory: ManagedDirectory, schema: Schema, searcher_pool: Arc>, - tokenizers: TokenizerManager + tokenizers: TokenizerManager, } @@ -259,7 +259,7 @@ impl Clone for Index { directory: self.directory.clone(), schema: self.schema.clone(), searcher_pool: self.searcher_pool.clone(), - tokenizers: self.tokenizers.clone() + tokenizers: self.tokenizers.clone(), } } } diff --git a/src/core/inverted_index_reader.rs b/src/core/inverted_index_reader.rs index dc904550d..d4af06c0e 100644 --- a/src/core/inverted_index_reader.rs +++ b/src/core/inverted_index_reader.rs @@ -138,11 +138,7 @@ impl InvertedIndexReader { /// For instance, requesting `IndexRecordOption::Freq` for a /// `TextIndexingOptions` that does not index position will return a `SegmentPostings` /// with `DocId`s and frequencies. 
- pub fn read_postings( - &self, - term: &Term, - option: IndexRecordOption, - ) -> Option { + pub fn read_postings(&self, term: &Term, option: IndexRecordOption) -> Option { let field = term.field(); let field_entry = self.schema.get_field_entry(field); let term_info = get!(self.get_term_info(term)); diff --git a/src/core/segment.rs b/src/core/segment.rs index 91d1f382d..c4e7e3442 100644 --- a/src/core/segment.rs +++ b/src/core/segment.rs @@ -36,7 +36,6 @@ pub fn create_segment(index: Index, meta: SegmentMeta) -> Segment { } impl Segment { - /// Returns the index the segment belongs to. pub fn index(&self) -> &Index { &self.index diff --git a/src/indexer/merger.rs b/src/indexer/merger.rs index 85627e19c..30be279f3 100644 --- a/src/indexer/merger.rs +++ b/src/indexer/merger.rs @@ -269,10 +269,8 @@ impl IndexMerger { let field_entry = self.schema.get_field_entry(indexed_field); // ... set segment postings option the new field. - let segment_postings_option = field_entry - .field_type() - .get_index_record_option() - .expect( + let segment_postings_option = + field_entry.field_type().get_index_record_option().expect( "Encountered a field that is not supposed to be indexed. Have you modified the schema?", ); @@ -405,9 +403,11 @@ mod tests { fn test_index_merger_no_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); let text_fieldtype = schema::TextOptions::default() - .set_indexing_options(TextFieldIndexing::default() - .set_tokenizer("default") - .set_index_option(IndexRecordOption::WithFreqs)) + .set_indexing_options( + TextFieldIndexing::default() + .set_tokenizer("default") + .set_index_option(IndexRecordOption::WithFreqs), + ) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(); @@ -539,9 +539,9 @@ mod tests { fn test_index_merger_with_deletes() { let mut schema_builder = schema::SchemaBuilder::default(); let text_fieldtype = schema::TextOptions::default() - .set_indexing_options( - TextFieldIndexing::default() - .set_index_option(IndexRecordOption::WithFreqs)) + .set_indexing_options(TextFieldIndexing::default().set_index_option( + IndexRecordOption::WithFreqs, + )) .set_stored(); let text_field = schema_builder.add_text_field("text", text_fieldtype); let score_fieldtype = schema::IntOptions::default().set_fast(); diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs index d2245d300..5cd20140b 100644 --- a/src/indexer/segment_writer.rs +++ b/src/indexer/segment_writer.rs @@ -31,7 +31,7 @@ pub struct SegmentWriter<'a> { fast_field_writers: FastFieldsWriter, fieldnorms_writer: FastFieldsWriter, doc_opstamps: Vec, - tokenizers: Vec>> + tokenizers: Vec>>, } @@ -57,40 +57,40 @@ impl<'a> SegmentWriter<'a> { /// the flushing behavior as a buffer limit /// - segment: The segment being written /// - schema - pub fn for_segment(heap: &'a Heap, - table_bits: usize, - mut segment: Segment, - schema: &Schema) - -> Result> { + pub fn for_segment( + heap: &'a Heap, + table_bits: usize, + mut segment: Segment, + schema: &Schema, + ) -> Result> { let segment_serializer = SegmentSerializer::for_segment(&mut segment)?; let multifield_postings = MultiFieldPostingsWriter::new(schema, table_bits, heap); - let tokenizers = schema.fields() + let tokenizers = schema + .fields() .iter() .map(|field_entry| field_entry.field_type()) - .map(|field_type| { - match field_type { - &FieldType::Str(ref text_options) => { - text_options - .get_indexing_options() - 
.and_then(|text_index_option| { - let tokenizer_name = &text_index_option.tokenizer(); - segment.index().tokenizers().get(tokenizer_name) - }) - } - _ => None, + .map(|field_type| match field_type { + &FieldType::Str(ref text_options) => { + text_options.get_indexing_options().and_then( + |text_index_option| { + let tokenizer_name = &text_index_option.tokenizer(); + segment.index().tokenizers().get(tokenizer_name) + }, + ) } + _ => None, }) .collect(); Ok(SegmentWriter { - heap: heap, - max_doc: 0, - multifield_postings: multifield_postings, - fieldnorms_writer: create_fieldnorms_writer(schema), - segment_serializer: segment_serializer, - fast_field_writers: FastFieldsWriter::from_schema(schema), - doc_opstamps: Vec::with_capacity(1_000), - tokenizers: tokenizers, - }) + heap: heap, + max_doc: 0, + multifield_postings: multifield_postings, + fieldnorms_writer: create_fieldnorms_writer(schema), + segment_serializer: segment_serializer, + fast_field_writers: FastFieldsWriter::from_schema(schema), + doc_opstamps: Vec::with_capacity(1_000), + tokenizers: tokenizers, + }) } /// Lay on disk the current content of the `SegmentWriter` @@ -147,23 +147,25 @@ impl<'a> SegmentWriter<'a> { FieldType::Str(_) => { let num_tokens = if let Some(ref mut tokenizer) = self.tokenizers[field.0 as usize] { - let texts: Vec<&str> = field_values.iter() - .flat_map(|field_value| { - match field_value.value() { - &Value::Str(ref text) => Some(text.as_str()), - _ => None - } + let texts: Vec<&str> = field_values + .iter() + .flat_map(|field_value| match field_value.value() { + &Value::Str(ref text) => Some(text.as_str()), + _ => None, }) .collect(); let mut token_stream = tokenizer.token_stream_texts(&texts[..]); - self.multifield_postings.index_text(doc_id, field, &mut token_stream) - } - else { + self.multifield_postings.index_text( + doc_id, + field, + &mut token_stream, + ) + } else { 0 }; - self.fieldnorms_writer - .get_field_writer(field) - .map(|field_norms_writer| field_norms_writer.add_val(num_tokens as u64)); + self.fieldnorms_writer.get_field_writer(field).map( + |field_norms_writer| field_norms_writer.add_val(num_tokens as u64), + ); } FieldType::U64(ref int_option) => { if int_option.is_indexed() { diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs index baaab2c4e..70d797e1c 100644 --- a/src/postings/postings_writer.rs +++ b/src/postings/postings_writer.rs @@ -22,9 +22,8 @@ fn posting_from_field_entry<'a>( match *field_entry.field_type() { FieldType::Str(ref text_options) => { text_options - .get_indexing_options() - .map(|indexing_options| { - match indexing_options.index_option() { + .get_indexing_options() + .map(|indexing_options| match indexing_options.index_option() { IndexRecordOption::Basic => { SpecializedPostingsWriter::::new_boxed(heap) } @@ -34,11 +33,10 @@ fn posting_from_field_entry<'a>( IndexRecordOption::WithFreqsAndPositions => { SpecializedPostingsWriter::::new_boxed(heap) } - } - }) - .unwrap_or_else(|| { - SpecializedPostingsWriter::::new_boxed(heap) - }) + }) + .unwrap_or_else(|| { + SpecializedPostingsWriter::::new_boxed(heap) + }) } FieldType::U64(_) | FieldType::I64(_) => SpecializedPostingsWriter::::new_boxed(heap), @@ -149,27 +147,29 @@ pub trait PostingsWriter { /// Serializes the postings on disk. /// The actual serialization format is handled by the `PostingsSerializer`. 
- fn serialize(&self, - term_addrs: &[(&[u8], u32)], - serializer: &mut FieldSerializer, - heap: &Heap) - -> io::Result<()>; + fn serialize( + &self, + term_addrs: &[(&[u8], u32)], + serializer: &mut FieldSerializer, + heap: &Heap, + ) -> io::Result<()>; /// Tokenize a text and suscribe all of its token. - fn index_text<'a>(&mut self, - term_index: &mut HashMap, - doc_id: DocId, - field: Field, - token_stream: &mut TokenStream, - heap: &Heap) - -> u32 { + fn index_text<'a>( + &mut self, + term_index: &mut HashMap, + doc_id: DocId, + field: Field, + token_stream: &mut TokenStream, + heap: &Heap, + ) -> u32 { let mut term = unsafe { Term::with_capacity(100) }; term.set_field(field); let mut sink = |token: &Token| { term.set_text(token.text.as_str()); self.suscribe(term_index, doc_id, token.position as u32, &term, heap); }; - + token_stream.process(&mut sink) } } @@ -197,7 +197,6 @@ impl<'a, Rec: Recorder + 'static> SpecializedPostingsWriter<'a, Rec> { } impl<'a, Rec: Recorder + 'static> PostingsWriter for SpecializedPostingsWriter<'a, Rec> { - fn suscribe( &mut self, term_index: &mut HashMap, diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs index 4dff60a0e..cdf2451dd 100644 --- a/src/postings/segment_postings.rs +++ b/src/postings/segment_postings.rs @@ -509,10 +509,8 @@ mod tests { let inverted_index = segment_reader.inverted_index(int_field); let term = Term::from_field_u64(int_field, 0u64); let term_info = inverted_index.get_term_info(&term).unwrap(); - let mut block_segments = inverted_index.read_block_postings_from_terminfo( - &term_info, - IndexRecordOption::Basic, - ); + let mut block_segments = + inverted_index.read_block_postings_from_terminfo(&term_info, IndexRecordOption::Basic); let mut offset: u32 = 0u32; // checking that the block before calling advance is empty assert!(block_segments.docs().is_empty()); diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs index 1a91a339d..a1a257ca3 100644 --- a/src/postings/serializer.rs +++ b/src/postings/serializer.rs @@ -133,9 +133,11 @@ impl<'a> FieldSerializer<'a> { FieldType::Str(ref text_options) => { if let Some(ref text_indexing_options) = text_options.get_indexing_options() { let index_option = text_indexing_options.index_option(); - (index_option.is_termfreq_enabled(), index_option.is_position_enabled()) - } - else { + ( + index_option.is_termfreq_enabled(), + index_option.is_position_enabled(), + ) + } else { (false, false) } } diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs index 22699d0fe..77baa0055 100644 --- a/src/query/query_parser/query_parser.rs +++ b/src/query/query_parser/query_parser.rs @@ -85,9 +85,11 @@ impl QueryParser { /// * schema - index Schema /// * default_fields - fields used to search if no field is specifically defined /// in the query. - pub fn new(schema: Schema, - default_fields: Vec, - tokenizer_manager: TokenizerManager) -> QueryParser { + pub fn new( + schema: Schema, + default_fields: Vec, + tokenizer_manager: TokenizerManager, + ) -> QueryParser { QueryParser { schema, default_fields, @@ -100,12 +102,8 @@ impl QueryParser { /// * an index /// * a set of default - fields used to search if no field is specifically defined /// in the query. 
- pub fn for_index(index: Index, - default_fields: Vec) -> QueryParser { - QueryParser::new( - index.schema(), - default_fields, - index.tokenizers().clone()) + pub fn for_index(index: Index, default_fields: Vec) -> QueryParser { + QueryParser::new(index.schema(), default_fields, index.tokenizers().clone()) } /// Set the default way to compose queries to a conjunction. @@ -181,17 +179,20 @@ impl QueryParser { Ok(Some(LogicalLiteral::Term(term))) } FieldType::Str(ref str_options) => { - if let Some(option) = str_options.get_indexing_options() { - let mut tokenizer = self.tokenizer_manager - .get(option.tokenizer()) - .ok_or_else(|| { - QueryParserError::UnknownTokenizer(field_entry.name().to_string(), option.tokenizer().to_string()) - })?; + if let Some(option) = str_options.get_indexing_options() { + let mut tokenizer = self.tokenizer_manager.get(option.tokenizer()).ok_or_else( + || { + QueryParserError::UnknownTokenizer( + field_entry.name().to_string(), + option.tokenizer().to_string(), + ) + }, + )?; let mut terms: Vec = Vec::new(); let mut token_stream = tokenizer.token_stream(phrase); token_stream.process(&mut |token| { - let term = Term::from_field_text(field, &token.text); - terms.push(term); + let term = Term::from_field_text(field, &token.text); + terms.push(term); }); if terms.is_empty() { Ok(None) @@ -202,10 +203,11 @@ impl QueryParser { } else { Ok(Some(LogicalLiteral::Phrase(terms))) } - } - else { + } else { // This should have been seen earlier really. - Err(QueryParserError::FieldNotIndexed(field_entry.name().to_string())) + Err(QueryParserError::FieldNotIndexed( + field_entry.name().to_string(), + )) } } } @@ -238,13 +240,11 @@ impl QueryParser { Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries))) } UserInputAST::Not(subquery) => { - let (occur, logical_sub_queries) = - self.compute_logical_ast_with_occur(*subquery)?; + let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries)) } UserInputAST::Must(subquery) => { - let (occur, logical_sub_queries) = - self.compute_logical_ast_with_occur(*subquery)?; + let (occur, logical_sub_queries) = self.compute_logical_ast_with_occur(*subquery)?; Ok((compose_occur(Occur::Must, occur), logical_sub_queries)) } UserInputAST::Leaf(literal) => { diff --git a/src/schema/field_type.rs b/src/schema/field_type.rs index 2604c6c2e..5e2602e32 100644 --- a/src/schema/field_type.rs +++ b/src/schema/field_type.rs @@ -46,9 +46,11 @@ impl FieldType { pub fn get_index_record_option(&self) -> Option { match *self { FieldType::Str(ref text_options) => { - text_options - .get_indexing_options() - .map(|indexing_options| indexing_options.index_option()) + text_options.get_indexing_options().map( + |indexing_options| { + indexing_options.index_option() + }, + ) } FieldType::U64(ref int_options) | FieldType::I64(ref int_options) => { diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index 7a1fd9d9d..e43063b7e 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -5,10 +5,13 @@ /// It is both used to: /// /// * describe in the schema the amount of information -/// that should be retained during indexing (See [TextFieldIndexing.html.set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option)) +/// that should be retained during indexing (See +/// [TextFieldIndexing.html.set_index_option]( +/// ../schema/struct.TextFieldIndexing.html#method.set_index_option)) /// * to request 
for a given /// amount of information to be decoded as one goes through a posting list. -/// (See [InvertedIndexReader.read_postings](../struct.InvertedIndexReader.html#method.read_postings)) +/// (See [InvertedIndexReader.read_postings]( +/// ../struct.InvertedIndexReader.html#method.read_postings)) /// #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] pub enum IndexRecordOption { @@ -63,4 +66,4 @@ impl IndexRecordOption { IndexRecordOption::WithFreqsAndPositions => true, } } -} \ No newline at end of file +} diff --git a/src/schema/mod.rs b/src/schema/mod.rs index b9cad2b17..c174cb50c 100644 --- a/src/schema/mod.rs +++ b/src/schema/mod.rs @@ -40,8 +40,8 @@ let schema = schema_builder.build(); We can split the problem of generating a search result page into two phases : * identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`) -* for each of these documents, retrieving the information required to generate the search results page. - (`doc_ids[] -> Document[]`) +* for each of these documents, retrieving the information required to generate + the search results page. (`doc_ids[] -> Document[]`) In the first phase, the ability to search for documents by the given field is determined by the [`TextIndexingOptions`](enum.TextIndexingOptions.html) of our [`TextOptions`] diff --git a/src/schema/schema.rs b/src/schema/schema.rs index 9e1ba0d12..5372ffe16 100644 --- a/src/schema/schema.rs +++ b/src/schema/schema.rs @@ -224,22 +224,16 @@ impl Schema { match *json_value { JsonValue::Array(ref json_items) => { for json_item in json_items { - let value = - field_type - .value_from_json(json_item) - .map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })?; + let value = field_type.value_from_json(json_item).map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })?; doc.add(FieldValue::new(field, value)); } } _ => { - let value = - field_type - .value_from_json(json_value) - .map_err(|e| { - DocParsingError::ValueError(field_name.clone(), e) - })?; + let value = field_type.value_from_json(json_value).map_err(|e| { + DocParsingError::ValueError(field_name.clone(), e) + })?; doc.add(FieldValue::new(field, value)); } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 058e3d256..e7cc2d992 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -44,7 +44,7 @@ impl Default for TextOptions { } -#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)] +#[derive(Clone, PartialEq, Eq, Debug, Serialize, Deserialize)] pub struct TextFieldIndexing { record: IndexRecordOption, tokenizer: Cow<'static, str>, @@ -88,22 +88,20 @@ impl TextFieldIndexing { /// The field will be untokenized and indexed pub const STRING: TextOptions = TextOptions { - indexing: Some( - TextFieldIndexing { - tokenizer: Cow::Borrowed("raw"), - record: IndexRecordOption::Basic, - }), + indexing: Some(TextFieldIndexing { + tokenizer: Cow::Borrowed("raw"), + record: IndexRecordOption::Basic, + }), stored: false, }; /// The field will be tokenized and indexed pub const TEXT: TextOptions = TextOptions { - indexing: Some( - TextFieldIndexing { - tokenizer: Cow::Borrowed("default"), - record: IndexRecordOption::WithFreqsAndPositions, - }), + indexing: Some(TextFieldIndexing { + tokenizer: Cow::Borrowed("default"), + record: IndexRecordOption::WithFreqsAndPositions, + }), stored: false, }; @@ -149,7 +147,10 @@ mod tests { match field_entry.field_type() { &FieldType::Str(ref text_options) => { 
assert!(text_options.get_indexing_options().is_some()); - assert_eq!(text_options.get_indexing_options().unwrap().tokenizer(), "default"); + assert_eq!( + text_options.get_indexing_options().unwrap().tokenizer(), + "default" + ); } _ => { panic!(""); @@ -164,5 +165,3 @@ mod tests { assert!(IndexRecordOption::WithFreqs > IndexRecordOption::Basic); } } - - diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index c603709f1..b7357ee07 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -5,7 +5,8 @@ use super::{TokenFilter, TokenStream, Token}; pub struct LowerCaser; impl TokenFilter for LowerCaser - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { type ResultTokenStream = LowerCaserTokenStream; @@ -15,7 +16,8 @@ impl TokenFilter for LowerCaser } pub struct LowerCaserTokenStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { tail: TailTokenStream, } @@ -42,7 +44,8 @@ impl TokenStream for LowerCaserTokenStream } impl LowerCaserTokenStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { fn wrap(tail: TailTokenStream) -> LowerCaserTokenStream { LowerCaserTokenStream { tail: tail } diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index 634bc4810..d664fd7b9 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -163,7 +163,9 @@ mod test { let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { tokens.push(token.text.clone()); }; - en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token); + en_tokenizer + .token_stream("Hello, happy tax payer!") + .process(&mut add_token); } assert_eq!(tokens.len(), 1); assert_eq!(&tokens[0], "Hello, happy tax payer!"); @@ -178,7 +180,9 @@ mod test { let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { tokens.push(token.text.clone()); }; - en_tokenizer.token_stream("Hello, happy tax payer!").process(&mut add_token); + en_tokenizer + .token_stream("Hello, happy tax payer!") + .process(&mut add_token); } assert_eq!(tokens.len(), 4); assert_eq!(&tokens[0], "hello"); @@ -191,11 +195,13 @@ mod test { fn test_jp_tokenizer() { let tokenizer_manager = TokenizerManager::default(); let en_tokenizer = tokenizer_manager.get("ja").unwrap(); - + let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { tokens.push(token.text.clone()); }; - en_tokenizer.token_stream("野菜食べないとやばい!").process(&mut add_token); + en_tokenizer + .token_stream("野菜食べないとやばい!") + .process(&mut add_token); } assert_eq!(tokens.len(), 5); assert_eq!(&tokens[0], "野菜"); diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 0d97103eb..fe363386a 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -18,7 +18,7 @@ impl<'a> Tokenizer<'a> for RawTokenizer { offset_from: 0, offset_to: text.len(), position: 0, - text: text.to_string() + text: text.to_string(), }; RawTokenStream { token: token, @@ -32,8 +32,7 @@ impl TokenStream for RawTokenStream { if self.has_token { self.has_token = false; true - } - else { + } else { false } } diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index 1ba3ba3b4..5637906f4 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -1,7 +1,7 @@ use super::{TokenFilter, TokenStream, Token}; -/// `RemoveLongFilter` removes tokens that are longer +/// `RemoveLongFilter` removes tokens that are longer /// than a given number of bytes (in UTF-8 representation). 
/// /// It is especially useful when indexing unconstrained content. @@ -19,15 +19,17 @@ impl RemoveLongFilter { } impl RemoveLongFilterStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { fn predicate(&self, token: &Token) -> bool { token.text.len() < self.token_length_limit } - fn wrap(token_length_limit: usize, - tail: TailTokenStream) - -> RemoveLongFilterStream { + fn wrap( + token_length_limit: usize, + tail: TailTokenStream, + ) -> RemoveLongFilterStream { RemoveLongFilterStream { token_length_limit: token_length_limit, tail: tail, @@ -37,7 +39,8 @@ impl RemoveLongFilterStream impl TokenFilter for RemoveLongFilter - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { type ResultTokenStream = RemoveLongFilterStream; @@ -47,7 +50,8 @@ impl TokenFilter for RemoveLongFilter } pub struct RemoveLongFilterStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { token_length_limit: usize, tail: TailTokenStream, diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs index 890c8b551..1c349e049 100644 --- a/src/tokenizer/stemmer.rs +++ b/src/tokenizer/stemmer.rs @@ -14,7 +14,8 @@ impl Stemmer { } impl TokenFilter for Stemmer - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { type ResultTokenStream = StemmerTokenStream; @@ -26,7 +27,8 @@ impl TokenFilter for Stemmer pub struct StemmerTokenStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { tail: TailTokenStream, stemmer: rust_stemmers::Stemmer, @@ -45,7 +47,7 @@ impl TokenStream for StemmerTokenStream fn advance(&mut self) -> bool { if self.tail.advance() { - // TODO remove allocation +// TODO remove allocation let stemmed_str: String = self.stemmer.stem(&self.token().text).into_owned(); self.token_mut().text.clear(); self.token_mut().text.push_str(&stemmed_str); @@ -57,11 +59,13 @@ impl TokenStream for StemmerTokenStream } impl StemmerTokenStream - where TailTokenStream: TokenStream +where + TailTokenStream: TokenStream, { - fn wrap(stemmer: rust_stemmers::Stemmer, - tail: TailTokenStream) - -> StemmerTokenStream { + fn wrap( + stemmer: rust_stemmers::Stemmer, + tail: TailTokenStream, + ) -> StemmerTokenStream { StemmerTokenStream { tail: tail, stemmer: stemmer, diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index 84d17fede..82fe06299 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -9,11 +9,14 @@ pub struct TokenStreamChain { } -impl<'a, TTokenStream> TokenStreamChain - where TTokenStream: TokenStream { - - pub fn new(offsets: Vec, - token_streams: Vec) -> TokenStreamChain { +impl<'a, TTokenStream> TokenStreamChain +where + TTokenStream: TokenStream, +{ + pub fn new( + offsets: Vec, + token_streams: Vec, + ) -> TokenStreamChain { TokenStreamChain { offsets: offsets, stream_idx: 0, @@ -25,7 +28,9 @@ impl<'a, TTokenStream> TokenStreamChain } impl<'a, TTokenStream> TokenStream for TokenStreamChain - where TTokenStream: TokenStream { +where + TTokenStream: TokenStream, +{ fn advance(&mut self) -> bool { while self.stream_idx < self.token_streams.len() { let token_stream = &mut self.token_streams[self.stream_idx]; @@ -38,8 +43,7 @@ impl<'a, TTokenStream> TokenStream for TokenStreamChain self.token.text.clear(); self.token.text.push_str(token.text.as_str()); return true; - } - else { + } else { self.stream_idx += 1; self.position_shift = self.token.position + 2; } diff --git a/src/tokenizer/tokenizer.rs 
b/src/tokenizer/tokenizer.rs index 6297af0ca..d49cf53db 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -11,7 +11,7 @@ pub struct Token { /// Offsets shall not be modified by token filters. pub offset_from: usize, /// Offset (byte index) of the last character of the token + 1. - /// The text that generated the token should be obtained by + /// The text that generated the token should be obtained by /// &text[token.offset_from..token.offset_to] pub offset_to: usize, /// Position, expressed in number of tokens. @@ -43,7 +43,6 @@ impl Default for Token { /// /// This API may change to use associated types. pub trait Tokenizer<'a>: Sized + Clone { - /// Type associated to the resulting tokenstream tokenstream. type TokenStreamImpl: TokenStream; @@ -71,7 +70,8 @@ pub trait Tokenizer<'a>: Sized + Clone { /// ``` /// fn filter(self, new_filter: NewFilter) -> ChainTokenizer - where NewFilter: TokenFilter<>::TokenStreamImpl> + where + NewFilter: TokenFilter<>::TokenStreamImpl>, { ChainTokenizer { head: new_filter, @@ -87,9 +87,14 @@ pub trait BoxedTokenizer: Send + Sync { } #[derive(Clone)] -struct BoxableTokenizer(A) where A: for <'a> Tokenizer<'a> + Send + Sync; +struct BoxableTokenizer(A) +where + A: for<'a> Tokenizer<'a> + Send + Sync; -impl BoxedTokenizer for BoxableTokenizer where A: 'static + Send + Sync + for <'a> Tokenizer<'a> { +impl BoxedTokenizer for BoxableTokenizer +where + A: 'static + Send + Sync + for<'a> Tokenizer<'a>, +{ fn token_stream<'a>(&self, text: &'a str) -> Box { box self.0.token_stream(text) } @@ -98,20 +103,15 @@ impl BoxedTokenizer for BoxableTokenizer where A: 'static + Send + Sync + assert!(texts.len() > 0); if texts.len() == 1 { box self.0.token_stream(texts[0]) - } - else { - let mut offsets = vec!(); + } else { + let mut offsets = vec![]; let mut total_offset = 0; for &text in texts { offsets.push(total_offset); total_offset += text.len(); } - let token_streams: Vec<_> = texts - .iter() - .map(|text| { - self.0.token_stream(text) - }) - .collect(); + let token_streams: Vec<_> = + texts.iter().map(|text| self.0.token_stream(text)).collect(); box TokenStreamChain::new(offsets, token_streams) } } @@ -122,7 +122,9 @@ impl BoxedTokenizer for BoxableTokenizer where A: 'static + Send + Sync + } pub fn box_tokenizer(a: A) -> Box - where A: 'static + Send + Sync + for <'a> Tokenizer<'a> { +where + A: 'static + Send + Sync + for<'a> Tokenizer<'a>, +{ box BoxableTokenizer(a) } @@ -211,13 +213,14 @@ pub struct ChainTokenizer { impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a> for ChainTokenizer - where HeadTokenFilterFactory: TokenFilter, - TailTokenizer: Tokenizer<'a> +where + HeadTokenFilterFactory: TokenFilter, + TailTokenizer: Tokenizer<'a>, { type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream; fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { - let tail_token_stream = self.tail.token_stream(text ); + let tail_token_stream = self.tail.token_stream(text); self.head.transform(tail_token_stream) } } diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index 286d7ae6d..ac2c7a691 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -18,15 +18,18 @@ use tokenizer::Stemmer; /// By default, it is populated with the following managers. /// /// * raw : does not process nor tokenize the text. 
-/// * default : Chops the text on according to whitespace and punctuation, removes tokens that are too long, lowercases +/// * default : Chops the text on according to whitespace and +/// punctuation, removes tokens that are too long, lowercases #[derive(Clone)] pub struct TokenizerManager { - tokenizers: Arc< RwLock >> > + tokenizers: Arc>>>, } impl TokenizerManager { pub fn register(&self, tokenizer_name: &str, tokenizer: A) - where A: 'static + Send + Sync + for <'a> Tokenizer<'a> { + where + A: 'static + Send + Sync + for<'a> Tokenizer<'a>, + { let boxed_tokenizer = box_tokenizer(tokenizer); self.tokenizers .write() @@ -39,9 +42,7 @@ impl TokenizerManager { .read() .expect("Acquiring the lock should never fail") .get(tokenizer_name) - .map(|boxed_tokenizer| { - boxed_tokenizer.boxed_clone() - }) + .map(|boxed_tokenizer| boxed_tokenizer.boxed_clone()) } } @@ -52,27 +53,22 @@ impl Default for TokenizerManager { /// - en_stem /// - ja fn default() -> TokenizerManager { - let manager = TokenizerManager { - tokenizers: Arc::new(RwLock::new(HashMap::new())) - }; - manager.register("raw", - RawTokenizer + let manager = TokenizerManager { tokenizers: Arc::new(RwLock::new(HashMap::new())) }; + manager.register("raw", RawTokenizer); + manager.register( + "default", + SimpleTokenizer.filter(RemoveLongFilter::limit(40)).filter( + LowerCaser, + ), ); - manager.register("default", + manager.register( + "en_stem", SimpleTokenizer .filter(RemoveLongFilter::limit(40)) .filter(LowerCaser) + .filter(Stemmer::new()), ); - manager.register("en_stem", - SimpleTokenizer - .filter(RemoveLongFilter::limit(40)) - .filter(LowerCaser) - .filter(Stemmer::new()) - ); - manager.register("ja", - JapaneseTokenizer - .filter(RemoveLongFilter::limit(40)) - ); + manager.register("ja", JapaneseTokenizer.filter(RemoveLongFilter::limit(40))); manager } -} \ No newline at end of file +}
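
For readers following the tokenizer code touched by this patch, below is a minimal usage sketch assembled only from calls that appear in the diff above (TokenizerManager::default, register, get, Tokenizer::filter, RemoveLongFilter::limit, token_stream, TokenStream::process). The `use tantivy::tokenizer::...` paths and the "short_lowercase" analyzer name are assumptions for illustration and are not part of the patch.

use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, Token, TokenizerManager};

fn main() {
    // The default manager ships with "raw", "default", "en_stem" and "ja",
    // as registered in the Default impl reformatted above.
    let manager = TokenizerManager::default();

    // Register a custom chain: simple tokenization, drop tokens longer than
    // 20 bytes, then lowercase. The name "short_lowercase" is hypothetical.
    manager.register(
        "short_lowercase",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(20))
            .filter(LowerCaser),
    );

    // Fetch the boxed tokenizer and collect the token texts it produces,
    // mirroring the tests in src/tokenizer/mod.rs above.
    let tokenizer = manager.get("short_lowercase").unwrap();
    let mut tokens: Vec<String> = Vec::new();
    {
        let mut add_token = |token: &Token| { tokens.push(token.text.clone()); };
        tokenizer
            .token_stream("Hello, happy tax payer!")
            .process(&mut add_token);
    }
    assert_eq!(tokens, vec!["hello", "happy", "tax", "payer"]);
}

The chain passed to register is the same pattern the patch uses for the built-in "default" and "en_stem" analyzers, just with a different length limit.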