diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index 42684598b..c6e0eadff 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -12,9 +12,8 @@
 use crate::schema::Term;
 use crate::schema::Value;
 use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::PreTokenizedStream;
-use crate::tokenizer::TokenStream;
 use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer};
-use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT};
+use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT, Token};
 use crate::Opstamp;
 use crate::{DocId, SegmentComponent};
@@ -183,7 +182,7 @@ impl SegmentWriter {
                 Value::PreTokStr(tok_str) => {
                     streams_with_offsets.push((
                         Box::new(PreTokenizedStream::from(tok_str.clone()))
-                            as Box<dyn TokenStream>,
+                            as Box<dyn Iterator<Item = Token>>,
                         total_offset,
                     ));
                     if let Some(last_token) = tok_str.tokens.last() {
diff --git a/src/postings/postings_writer.rs b/src/postings/postings_writer.rs
index 21f963a21..e3217b698 100644
--- a/src/postings/postings_writer.rs
+++ b/src/postings/postings_writer.rs
@@ -9,7 +9,6 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
 use crate::schema::IndexRecordOption;
 use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
 use crate::termdict::TermOrdinal;
-use crate::tokenizer::TokenStream;
 use crate::tokenizer::{Token, MAX_TOKEN_LEN};
 use crate::DocId;
 use fnv::FnvHashMap;
@@ -100,7 +99,7 @@ impl MultiFieldPostingsWriter {
         &mut self,
         doc: DocId,
         field: Field,
-        token_stream: &mut dyn TokenStream,
+        token_stream: &mut dyn Iterator<Item = Token>,
         term_buffer: &mut Term,
     ) -> u32 {
         self.per_field_postings_writers[field.field_id() as usize].index_text(
@@ -215,7 +214,7 @@ pub trait PostingsWriter {
         term_index: &mut TermHashMap,
         doc_id: DocId,
         field: Field,
-        token_stream: &mut dyn TokenStream,
+        token_stream: &mut dyn Iterator<Item = Token>,
         heap: &mut MemoryArena,
         term_buffer: &mut Term,
     ) -> u32 {
diff --git a/src/tokenizer/alphanum_only.rs b/src/tokenizer/alphanum_only.rs
index 8034cfaa6..7b784dbfd 100644
--- a/src/tokenizer/alphanum_only.rs
+++ b/src/tokenizer/alphanum_only.rs
@@ -19,7 +19,7 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};

 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
diff --git a/src/tokenizer/ascii_folding_filter.rs b/src/tokenizer/ascii_folding_filter.rs
index bbfb4d4d9..1207879ad 100644
--- a/src/tokenizer/ascii_folding_filter.rs
+++ b/src/tokenizer/ascii_folding_filter.rs
@@ -1,4 +1,4 @@
-use super::{analyzer_builder, Token, TokenFilter, TokenStream};
+use super::{analyzer_builder, Token, TokenFilter};
 use std::mem;

 /// This class converts alphabetic, numeric, and symbolic Unicode characters
diff --git a/src/tokenizer/facet_tokenizer.rs b/src/tokenizer/facet_tokenizer.rs
index 1916d837f..69f263964 100644
--- a/src/tokenizer/facet_tokenizer.rs
+++ b/src/tokenizer/facet_tokenizer.rs
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};
 use crate::schema::FACET_SEP_BYTE;

 /// The `FacetTokenizer` process a `Facet` binary representation
@@ -69,8 +69,6 @@ impl Iterator for FacetTokenStream {
     }
 }

-impl TokenStream for FacetTokenStream {}
-
 #[cfg(test)]
 mod tests {

diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index 4235e6f9d..0e3886e7e 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -146,8 +146,7 @@
 pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain};
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{
-    analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream,
-    Tokenizer,
+    analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
 };
 pub use self::tokenizer_manager::TokenizerManager;

diff --git a/src/tokenizer/ngram_tokenizer.rs b/src/tokenizer/ngram_tokenizer.rs
index 036f8cd0b..ce45bba8b 100644
--- a/src/tokenizer/ngram_tokenizer.rs
+++ b/src/tokenizer/ngram_tokenizer.rs
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};

 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
@@ -145,8 +145,6 @@ impl Tokenizer for NgramTokenizer {
     }
 }

-impl TokenStream for NgramTokenStream {}
-
 impl Iterator for NgramTokenStream {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
@@ -296,18 +294,8 @@ fn utf8_codepoint_width(b: u8) -> usize {

 #[cfg(test)]
 mod tests {
-
-    use super::utf8_codepoint_width;
-    use super::CodepointFrontiers;
-    use super::NgramTokenizer;
-    use super::StutteringIterator;
+    use super::*;
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::Tokenizer;
-    use crate::tokenizer::{Token, TokenStream};
-
-    fn test_helper<T: TokenStream>(tokens: T) -> Vec<Token> {
-        tokens.collect()
-    }

     #[test]
     fn test_utf8_codepoint_width() {
@@ -344,7 +332,9 @@

     #[test]
     fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
+            .token_stream("hello")
+            .collect();
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "he", 0, 2);
@@ -359,7 +349,9 @@

     #[test]
     fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
+            .token_stream("hello")
+            .collect();
         assert_eq!(tokens.len(), 3);
         assert_token(&tokens[0], 0, "hel", 0, 3);
         assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -368,7 +360,9 @@

     #[test]
     fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
+            .token_stream("frankenstein")
+            .collect();
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "fr", 0, 2);
         assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -378,7 +372,9 @@

     #[test]
     fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
+            .token_stream("hεllo")
+            .collect();
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "hε", 0, 3);
@@ -393,7 +389,9 @@

     #[test]
     fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
+            .token_stream("hεllo")
+            .collect();
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "hε", 0, 3);
         assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -403,16 +401,16 @@

     #[test]
     fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
         assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
         assert!(tokens.is_empty());
     }

     #[test]
     #[should_panic(expected = "min_gram must be greater than 0")]
     fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+        NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
     }

     #[test]
diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs
index ea8eaaed1..e223ed05a 100644
--- a/src/tokenizer/raw_tokenizer.rs
+++ b/src/tokenizer/raw_tokenizer.rs
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};

 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone, Debug, Default)]
@@ -29,5 +29,3 @@ impl Iterator for RawTokenStream {
         self.token.take()
     }
 }
-
-impl TokenStream for RawTokenStream {}
diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs
index fc69b0efd..c3aef7a05 100644
--- a/src/tokenizer/simple_tokenizer.rs
+++ b/src/tokenizer/simple_tokenizer.rs
@@ -1,8 +1,6 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};
 use std::str::CharIndices;

-impl TokenStream for SimpleTokenizerStream {}
-
 /// Tokenize the text by splitting on whitespaces and punctuation.
 #[derive(Clone, Debug)]
 pub struct SimpleTokenizer;
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 36caaa347..9d84d5799 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -1,4 +1,4 @@
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};
 use rust_stemmers::{self, Algorithm};
 use serde::{Deserialize, Serialize};

diff --git a/src/tokenizer/stop_word_filter.rs b/src/tokenizer/stop_word_filter.rs
index 2a4818f59..bb1de855b 100644
--- a/src/tokenizer/stop_word_filter.rs
+++ b/src/tokenizer/stop_word_filter.rs
@@ -10,7 +10,7 @@
 //! assert_eq!(stream.next().unwrap().text, "crafty");
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};
 use fnv::FnvHasher;
 use std::collections::HashSet;
 use std::hash::BuildHasherDefault;
diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs
index c724c8546..6c635fbae 100644
--- a/src/tokenizer/token_stream_chain.rs
+++ b/src/tokenizer/token_stream_chain.rs
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::{Token, Tokenizer};

 const POSITION_GAP: usize = 2;

@@ -25,13 +25,6 @@ where
     }
 }

-impl<'a, Inner, Outer: Iterator> TokenStream
-    for TokenStreamChain<Inner, Outer>
-where
-    Inner: Iterator<Item = Token>,
-{
-}
-
 impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
 where
     Inner: Iterator<Item = Token>,
@@ -55,7 +48,9 @@ where
 }

 impl DynTokenStreamChain {
-    pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
+    pub fn from_vec(
+        streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
+    ) -> impl Iterator<Item = Token> {
         DynTokenStreamChain {
             streams_with_offsets,
             idx: 0,
@@ -66,14 +61,12 @@ impl DynTokenStreamChain {
 }

 pub(crate) struct DynTokenStreamChain {
-    streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
+    streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
     idx: usize,
     position: usize,
     position_shift: usize,
 }

-impl<'a> TokenStream for DynTokenStreamChain {}
-
 impl Iterator for DynTokenStreamChain {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
diff --git a/src/tokenizer/tokenized_string.rs b/src/tokenizer/tokenized_string.rs
index 016263160..703f97c3a 100644
--- a/src/tokenizer/tokenized_string.rs
+++ b/src/tokenizer/tokenized_string.rs
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
+use crate::tokenizer::{Token, TokenStreamChain};
 use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;

@@ -42,7 +42,7 @@ impl PreTokenizedStream {
     /// Creates a TokenStream from PreTokenizedString array
     pub fn chain_tokenized_strings<'a>(
         tok_strings: &'a [&PreTokenizedString],
-    ) -> impl TokenStream + 'a {
+    ) -> impl Iterator<Item = Token> + 'a {
         let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
             let next = Some((
                 PreTokenizedStream::from((*tok_string).to_owned()),
@@ -57,8 +57,6 @@ impl PreTokenizedStream {
     }
 }

-impl TokenStream for PreTokenizedStream {}
-
 impl Iterator for PreTokenizedStream {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index 6bba961ec..a9dfcdc08 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -11,7 +11,7 @@ pub trait TextAnalyzerClone {
 pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
     /// 'Top-level' dynamic dispatch function hiding concrete types of the staticly
     /// dispatched `token_stream` from the `Tokenizer` trait.
-    fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
+    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
 }

 impl Clone for Box<dyn TextAnalyzerT> {
@@ -57,51 +57,21 @@ pub trait TokenFilter: 'static + Send + Sync + Clone {
     fn transform(&mut self, token: Token) -> Option<Token>;
 }

-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = analyzer_builder(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser::new()).build();
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-pub trait TokenStream: Iterator<Item = Token> {}
-
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
 ///
 /// See the [module documentation](./index.html) for more detail.
 pub trait Tokenizer: 'static + Send + Sync + Clone {
     /// An iteratable type is returned.
-    type Iter: TokenStream;
+    type Iter: Iterator<Item = Token>;
     /// Creates a token stream for a given `str`.
     fn token_stream(&self, text: &str) -> Self::Iter;
     /// Tokenize an array`&str`
     ///
-    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
+    /// The resulting `Token` stream is equivalent to what would be obtained if the &str were
     /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
     /// to prevent accidental `PhraseQuery` to match accross two terms.
-    fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
+    fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
         let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
             let temp = *total_offset;
             *total_offset += text.len();
@@ -111,7 +81,7 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
     }
 }

-/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
+/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied onto it.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
 #[derive(Clone, Debug, Default)]
@@ -130,7 +100,7 @@ impl TextAnalyzerClone for TextAnalyzer {
 }

 impl TextAnalyzerT for TextAnalyzer {
-    fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
+    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
         Box::new(self.0.token_stream(text))
     }
 }
@@ -145,7 +115,7 @@ impl TokenFilter for Identity {
     }
 }

-/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
+/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
 #[derive(Clone, Default, Debug)]
 pub struct Filter<I, F> {
     iter: I,
@@ -154,7 +124,7 @@ pub struct Filter<I, F> {

 impl<I, F> Iterator for Filter<I, F>
 where
-    I: TokenStream,
+    I: Iterator<Item = Token>,
     F: TokenFilter,
 {
     type Item = Token;
@@ -168,13 +138,6 @@ where
     }
 }

-impl<I, F> TokenStream for Filter<I, F>
-where
-    I: TokenStream,
-    F: TokenFilter,
-{
-}
-
 #[derive(Clone, Debug, Default)]
 pub struct AnalyzerBuilder {
     tokenizer: T,
@@ -196,7 +159,7 @@ where
 {
     /// Appends a token filter to the current tokenizer.
     ///
-    /// The method consumes the current `TokenStream` and returns a
+    /// The method consumes the current `Token` and returns a
     /// new one.
     ///
     /// # Example
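
Usage sketch (not part of the patch): after this change a token stream is just an `Iterator<Item = Token>`, so ordinary iterator adapters apply directly. The snippet below is a minimal illustration assuming the builder API from the doc example removed above (`analyzer_builder`, `SimpleTokenizer`, `RemoveLongFilter`, `LowerCaser::new()`, `.build()`) is otherwise unchanged; it is not taken verbatim from the branch.

    // Sketch only (assumption, not code from this patch): exercises the
    // post-refactor API in which token_stream() yields a plain
    // Iterator<Item = Token>; names follow the doc example removed in
    // src/tokenizer/tokenizer.rs above.
    use tantivy::tokenizer::*;

    fn main() {
        let analyzer = analyzer_builder(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser::new())
            .build();

        // With no dedicated TokenStream trait, collect/map/filter work as-is.
        let tokens: Vec<Token> = analyzer.token_stream("Hello, happy tax payer").collect();
        assert_eq!(&tokens[0].text, "hello");
        assert_eq!(tokens[1].position, 1);
    }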