Remove TokenStream trait.

dcraven
2021-01-04 12:38:52 +01:00
committed by Paul Masurel
parent 4a68c8a712
commit 50812d0081
14 changed files with 48 additions and 105 deletions

View File

@@ -12,9 +12,8 @@ use crate::schema::Term;
use crate::schema::Value;
use crate::schema::{Field, FieldEntry};
use crate::tokenizer::PreTokenizedStream;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT};
use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT, Token};
use crate::Opstamp;
use crate::{DocId, SegmentComponent};
@@ -183,7 +182,7 @@ impl SegmentWriter {
Value::PreTokStr(tok_str) => {
streams_with_offsets.push((
Box::new(PreTokenizedStream::from(tok_str.clone()))
as Box<dyn TokenStream>,
as Box<dyn Iterator<Item = Token>>,
total_offset,
));
if let Some(last_token) = tok_str.tokens.last() {
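With the `TokenStream` trait gone, the segment writer boxes every token source, pre-tokenized or not, directly as a trait object over the standard `Iterator` trait. A minimal, self-contained sketch of that pattern; the trimmed-down `Token` struct is a stand-in for illustration, not tantivy's real type:

```rust
// Sketch only: `Token` here is a simplified stand-in, not tantivy's struct.
#[derive(Debug, Clone, Default)]
struct Token {
    text: String,
    offset_from: usize,
    offset_to: usize,
    position: usize,
}

fn main() {
    let pre_tokenized = vec![
        Token { text: "hello".into(), offset_from: 0, offset_to: 5, position: 0 },
        Token { text: "world".into(), offset_from: 6, offset_to: 11, position: 1 },
    ];

    // Any value implementing `Iterator<Item = Token>` can be boxed directly;
    // no extra marker-trait impl is required any more.
    let boxed: Box<dyn Iterator<Item = Token>> = Box::new(pre_tokenized.into_iter());

    let mut streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)> = Vec::new();
    streams_with_offsets.push((boxed, 0));

    for (stream, total_offset) in streams_with_offsets {
        for token in stream {
            println!(
                "doc offset {}: '{}' {}..{} pos {}",
                total_offset, token.text, token.offset_from, token.offset_to, token.position
            );
        }
    }
}
```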

View File

@@ -9,7 +9,6 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
use crate::schema::IndexRecordOption;
use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
use crate::termdict::TermOrdinal;
use crate::tokenizer::TokenStream;
use crate::tokenizer::{Token, MAX_TOKEN_LEN};
use crate::DocId;
use fnv::FnvHashMap;
@@ -100,7 +99,7 @@ impl MultiFieldPostingsWriter {
&mut self,
doc: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
token_stream: &mut dyn Iterator<Item = Token>,
term_buffer: &mut Term,
) -> u32 {
self.per_field_postings_writers[field.field_id() as usize].index_text(
@@ -215,7 +214,7 @@ pub trait PostingsWriter {
term_index: &mut TermHashMap,
doc_id: DocId,
field: Field,
token_stream: &mut dyn TokenStream,
token_stream: &mut dyn Iterator<Item = Token>,
heap: &mut MemoryArena,
term_buffer: &mut Term,
) -> u32 {
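On the caller's side, the token source now arrives as `&mut dyn Iterator<Item = Token>`. The sketch below keeps only that parameter and the `u32` token count from the signature above; the one-argument `index_text` and the simplified `Token` are assumptions for illustration, not the real method, which also takes the term map, doc id, field, arena, and term buffer:

```rust
// Simplified stand-in for tantivy's `Token`; illustration only.
#[derive(Debug)]
struct Token {
    text: String,
}

// Reduced shape of the post-change entry point: the token source is just a
// mutable trait-object iterator, so any iterator of tokens can be indexed.
fn index_text(token_stream: &mut dyn Iterator<Item = Token>) -> u32 {
    let mut num_tokens = 0u32;
    for token in token_stream {
        // A real postings writer would hash `token.text` into its term map here.
        let _term = token.text;
        num_tokens += 1;
    }
    num_tokens
}

fn main() {
    let mut tokens = vec![
        Token { text: "hello".into() },
        Token { text: "tax".into() },
        Token { text: "payer".into() },
    ]
    .into_iter();

    // `&mut vec::IntoIter<Token>` coerces to `&mut dyn Iterator<Item = Token>`.
    assert_eq!(index_text(&mut tokens), 3);
}
```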

View File

@@ -19,7 +19,7 @@
//! // the "emoji" is dropped because its not an alphanum
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter};
/// `TokenFilter` that removes all tokens that contain non
/// ascii alphanumeric characters.

View File

@@ -1,4 +1,4 @@
use super::{analyzer_builder, Token, TokenFilter, TokenStream};
use super::{analyzer_builder, Token, TokenFilter};
use std::mem;
/// This class converts alphabetic, numeric, and symbolic Unicode characters

View File

@@ -1,4 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use super::{Token, Tokenizer};
use crate::schema::FACET_SEP_BYTE;
/// The `FacetTokenizer` processes a `Facet` binary representation
@@ -69,8 +69,6 @@ impl Iterator for FacetTokenStream {
}
}
impl TokenStream for FacetTokenStream {}
#[cfg(test)]
mod tests {

View File

@@ -146,8 +146,7 @@ pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain}
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
pub use self::tokenizer::{
analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream,
Tokenizer,
analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
};
pub use self::tokenizer_manager::TokenizerManager;

View File

@@ -1,4 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use super::{Token, Tokenizer};
/// Tokenize the text by splitting words into n-grams of the given size(s)
///
@@ -145,8 +145,6 @@ impl Tokenizer for NgramTokenizer {
}
}
impl TokenStream for NgramTokenStream {}
impl Iterator for NgramTokenStream {
type Item = Token;
fn next(&mut self) -> Option<Self::Item> {
@@ -296,18 +294,8 @@ fn utf8_codepoint_width(b: u8) -> usize {
#[cfg(test)]
mod tests {
use super::utf8_codepoint_width;
use super::CodepointFrontiers;
use super::NgramTokenizer;
use super::StutteringIterator;
use super::*;
use crate::tokenizer::tests::assert_token;
use crate::tokenizer::tokenizer::Tokenizer;
use crate::tokenizer::{Token, TokenStream};
fn test_helper<T: TokenStream>(tokens: T) -> Vec<Token> {
tokens.collect()
}
#[test]
fn test_utf8_codepoint_width() {
@@ -344,7 +332,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_1_2_false() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "he", 0, 2);
@@ -359,7 +349,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_min_max_equal() {
let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
.token_stream("hello")
.collect();
assert_eq!(tokens.len(), 3);
assert_token(&tokens[0], 0, "hel", 0, 3);
assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -368,7 +360,9 @@ mod tests {
#[test]
fn test_ngram_tokenizer_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("frankenstein")
.collect();
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "fr", 0, 2);
assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -378,7 +372,9 @@ mod tests {
#[test]
fn test_ngram_non_ascii_1_2() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 9);
assert_token(&tokens[0], 0, "h", 0, 1);
assert_token(&tokens[1], 0, "", 0, 3);
@@ -393,7 +389,9 @@ mod tests {
#[test]
fn test_ngram_non_ascii_2_5_prefix() {
let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
.token_stream("hεllo")
.collect();
assert_eq!(tokens.len(), 4);
assert_token(&tokens[0], 0, "", 0, 3);
assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -403,16 +401,16 @@ mod tests {
#[test]
fn test_ngram_empty() {
let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
assert!(tokens.is_empty());
let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
assert!(tokens.is_empty());
}
#[test]
#[should_panic(expected = "min_gram must be greater than 0")]
fn test_ngram_min_max_interval_empty() {
test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
}
#[test]
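Because `NgramTokenStream` is now just an `Iterator`, the tests above drop the `test_helper` shim and call `collect` directly, and the whole standard iterator toolkit applies. A self-contained sketch of the same idiom with a toy bigram stream (`BigramStream` and the minimal `Token` are illustrations, not tantivy's `NgramTokenizer`):

```rust
// Toy token type; illustration only.
#[derive(Debug, PartialEq)]
struct Token {
    text: String,
}

// A toy bigram stream written as a plain `Iterator` implementation, the same
// shape `NgramTokenStream` has after this change.
struct BigramStream {
    chars: Vec<char>,
    idx: usize,
}

impl Iterator for BigramStream {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        if self.idx + 2 > self.chars.len() {
            return None;
        }
        let text: String = self.chars[self.idx..self.idx + 2].iter().collect();
        self.idx += 1;
        Some(Token { text })
    }
}

fn main() {
    let stream = BigramStream { chars: "hello".chars().collect(), idx: 0 };

    // No wrapper trait in the way: `collect`, `map`, `filter`, `take`, ...
    // all work directly on the stream.
    let tokens: Vec<_> = stream.collect();
    assert_eq!(tokens.len(), 4);
    assert_eq!(tokens[0], Token { text: "he".into() });
    assert_eq!(tokens[3], Token { text: "lo".into() });
}
```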

View File

@@ -1,4 +1,4 @@
use super::{Token, TokenStream, Tokenizer};
use super::{Token, Tokenizer};
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone, Debug, Default)]
@@ -29,5 +29,3 @@ impl Iterator for RawTokenStream {
self.token.take()
}
}
impl TokenStream for RawTokenStream {}

View File

@@ -1,8 +1,6 @@
use super::{Token, TokenStream, Tokenizer};
use super::{Token, Tokenizer};
use std::str::CharIndices;
impl TokenStream for SimpleTokenizerStream {}
/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone, Debug)]
pub struct SimpleTokenizer;

View File

@@ -1,4 +1,4 @@
use super::{Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter};
use rust_stemmers::{self, Algorithm};
use serde::{Deserialize, Serialize};

View File

@@ -10,7 +10,7 @@
//! assert_eq!(stream.next().unwrap().text, "crafty");
//! assert!(stream.next().is_none());
//! ```
use super::{Token, TokenFilter, TokenStream};
use super::{Token, TokenFilter};
use fnv::FnvHasher;
use std::collections::HashSet;
use std::hash::BuildHasherDefault;

View File

@@ -1,4 +1,4 @@
use crate::tokenizer::{Token, TokenStream, Tokenizer};
use crate::tokenizer::{Token, Tokenizer};
const POSITION_GAP: usize = 2;
@@ -25,13 +25,6 @@ where
}
}
impl<'a, Inner, Outer: Iterator<Item = (Inner, usize)>> TokenStream
for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
{
}
impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
where
Inner: Iterator<Item = Token>,
@@ -55,7 +48,9 @@ where
}
impl DynTokenStreamChain {
pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
pub fn from_vec(
streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
) -> impl Iterator<Item = Token> {
DynTokenStreamChain {
streams_with_offsets,
idx: 0,
@@ -66,14 +61,12 @@ impl DynTokenStreamChain {
}
pub(crate) struct DynTokenStreamChain {
streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
idx: usize,
position: usize,
position_shift: usize,
}
impl<'a> TokenStream for DynTokenStreamChain {}
impl Iterator for DynTokenStreamChain {
type Item = Token;
fn next(&mut self) -> Option<Token> {
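`DynTokenStreamChain` now stores plain boxed iterators. The sketch below approximates, eagerly and with a simplified `Token`, what the chain does with them: shift byte offsets by each stream's base offset and leave a `POSITION_GAP` between streams so a phrase query cannot match across field boundaries. It is an assumption-laden illustration, not the actual lazy implementation:

```rust
// Simplified stand-in for tantivy's `Token`; illustration only.
#[derive(Debug)]
struct Token {
    text: String,
    offset_from: usize,
    offset_to: usize,
    position: usize,
}

const POSITION_GAP: usize = 2;

// Rough, eager approximation of the chaining behaviour: concatenate several
// boxed token iterators, adding each stream's base byte offset and an
// artificial position gap between consecutive streams.
fn chain_streams(
    streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
) -> impl Iterator<Item = Token> {
    let mut out = Vec::new();
    let mut position_shift = 0;
    for (stream, offset) in streams_with_offsets {
        let mut last_position = 0;
        for mut token in stream {
            token.offset_from += offset;
            token.offset_to += offset;
            token.position += position_shift;
            last_position = token.position;
            out.push(token);
        }
        position_shift = last_position + POSITION_GAP;
    }
    out.into_iter()
}

fn main() {
    let first: Box<dyn Iterator<Item = Token>> = Box::new(
        vec![Token { text: "hello".into(), offset_from: 0, offset_to: 5, position: 0 }].into_iter(),
    );
    let second: Box<dyn Iterator<Item = Token>> = Box::new(
        vec![Token { text: "world".into(), offset_from: 0, offset_to: 5, position: 0 }].into_iter(),
    );

    for token in chain_streams(vec![(first, 0), (second, 6)]) {
        println!("'{}' {}..{} pos {}", token.text, token.offset_from, token.offset_to, token.position);
    }
}
```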

View File

@@ -1,4 +1,4 @@
use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
use crate::tokenizer::{Token, TokenStreamChain};
use serde::{Deserialize, Serialize};
use std::cmp::Ordering;
@@ -42,7 +42,7 @@ impl PreTokenizedStream {
/// Creates a token stream from a PreTokenizedString array
pub fn chain_tokenized_strings<'a>(
tok_strings: &'a [&PreTokenizedString],
) -> impl TokenStream + 'a {
) -> impl Iterator<Item = Token> + 'a {
let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
let next = Some((
PreTokenizedStream::from((*tok_string).to_owned()),
@@ -57,8 +57,6 @@ impl PreTokenizedStream {
}
}
impl TokenStream for PreTokenizedStream {}
impl Iterator for PreTokenizedStream {
type Item = Token;
fn next(&mut self) -> Option<Token> {
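`chain_tokenized_strings` (like `token_stream_texts` further down) builds its input with `scan`, threading a running byte offset through the fields so each stream is paired with the position its source text would have in the virtual concatenation. A hedged sketch of that pattern with simplified types; `PreTokenized` is a stand-in for illustration, not tantivy's `PreTokenizedString`:

```rust
// Stand-in for a pre-tokenized field value: the original text plus its tokens.
struct PreTokenized {
    text: String,
    tokens: Vec<String>,
}

fn main() {
    let fields = vec![
        PreTokenized {
            text: "hello world".into(),
            tokens: vec!["hello".into(), "world".into()],
        },
        PreTokenized {
            text: "tax payer".into(),
            tokens: vec!["tax".into(), "payer".into()],
        },
    ];

    // `scan` carries the running byte offset: each field's token stream is
    // paired with the offset at which its text would start if all fields
    // were concatenated into one string.
    let streams_with_offsets: Vec<_> = fields
        .iter()
        .scan(0usize, |total_offset, field| {
            let start = *total_offset;
            *total_offset += field.text.len();
            Some((field.tokens.iter().cloned(), start))
        })
        .collect();

    for (tokens, offset) in streams_with_offsets {
        for token in tokens {
            println!("field base offset {offset}: {token}");
        }
    }
}
```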

View File

@@ -11,7 +11,7 @@ pub trait TextAnalyzerClone {
pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
/// 'Top-level' dynamic dispatch function hiding concrete types of the statically
/// dispatched `token_stream` from the `Tokenizer` trait.
fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
}
impl Clone for Box<dyn TextAnalyzerT> {
@@ -57,51 +57,21 @@ pub trait TokenFilter: 'static + Send + Sync + Clone {
fn transform(&mut self, token: Token) -> Option<Token>;
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = analyzer_builder(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser::new()).build();
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
pub trait TokenStream: Iterator<Item = Token> {}
/// A `Tokenizer` is in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
pub trait Tokenizer: 'static + Send + Sync + Clone {
/// An iterable type is returned.
type Iter: TokenStream;
type Iter: Iterator<Item = Token>;
/// Creates a token stream for a given `str`.
fn token_stream(&self, text: &str) -> Self::Iter;
/// Tokenize an array of `&str`
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
/// The resulting `Token` stream is equivalent to what would be obtained if the &str were
/// one concatenated `&str`, with an artificial position gap of `2` between the different fields
/// to prevent an accidental `PhraseQuery` from matching across two terms.
fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
let temp = *total_offset;
*total_offset += text.len();
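With the marker trait removed, `Tokenizer::Iter` is bound by the standard `Iterator` trait alone, so a custom tokenizer can return an off-the-shelf iterator type with nothing extra to implement. A minimal sketch of the post-change trait shape using local stand-ins (`WhitespaceTokenizer` and the trimmed-down `Token` and `Tokenizer` definitions mirror the diff but are not tantivy's items):

```rust
// Local stand-ins mirroring the shape of the trait after this commit.
#[derive(Debug, Clone)]
struct Token {
    text: String,
    position: usize,
}

trait Tokenizer: 'static + Send + Sync + Clone {
    /// The associated stream type is now just any iterator of tokens.
    type Iter: Iterator<Item = Token>;

    /// Creates a token stream for a given `str`.
    fn token_stream(&self, text: &str) -> Self::Iter;
}

// A toy tokenizer whose stream is a plain `std::vec::IntoIter`, which already
// satisfies `Iterator<Item = Token>`.
#[derive(Clone)]
struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    type Iter = std::vec::IntoIter<Token>;

    fn token_stream(&self, text: &str) -> Self::Iter {
        text.split_whitespace()
            .enumerate()
            .map(|(position, word)| Token { text: word.to_string(), position })
            .collect::<Vec<_>>()
            .into_iter()
    }
}

fn main() {
    let tokens: Vec<_> = WhitespaceTokenizer.token_stream("Hello, happy tax payer").collect();
    assert_eq!(tokens.len(), 4);
    assert_eq!(tokens[1].text, "happy");
    assert_eq!(tokens[3].position, 3);
}
```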
@@ -111,7 +81,7 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
}
}
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied to it.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
#[derive(Clone, Debug, Default)]
@@ -130,7 +100,7 @@ impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
}
impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
Box::new(self.0.token_stream(text))
}
}
@@ -145,7 +115,7 @@ impl TokenFilter for Identity {
}
}
/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
#[derive(Clone, Default, Debug)]
pub struct Filter<I, F> {
iter: I,
@@ -154,7 +124,7 @@ pub struct Filter<I, F> {
impl<I, F> Iterator for Filter<I, F>
where
I: TokenStream,
I: Iterator<Item = Token>,
F: TokenFilter,
{
type Item = Token;
@@ -168,13 +138,6 @@ where
}
}
impl<I, F> TokenStream for Filter<I, F>
where
I: TokenStream,
F: TokenFilter,
{
}
#[derive(Clone, Debug, Default)]
pub struct AnalyzerBuilder<T, F> {
tokenizer: T,
@@ -196,7 +159,7 @@ where
{
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `TokenStream` and returns a
/// The method consumes the current `Token` and returns a
/// new one.
///
/// # Example