mirror of https://github.com/quickwit-oss/tantivy.git
Remove TokenStream trait.
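The change in a nutshell: `TokenStream` was only a marker trait over `Iterator<Item = Token>`, so consumers that took `&mut dyn TokenStream` (or returned `Box<dyn TokenStream>`) can take and return plain token iterators instead. Below is a minimal, self-contained sketch of the new shape; the `Token` struct here is a simplified stand-in for `tantivy::tokenizer::Token`, not the real type.

```rust
// Simplified stand-in for tantivy::tokenizer::Token (illustration only).
#[derive(Debug)]
struct Token {
    text: String,
    position: usize,
}

// Consumers used to take `&mut dyn TokenStream`; any iterator of tokens now works.
fn positions_and_texts(token_stream: &mut dyn Iterator<Item = Token>) -> Vec<(usize, String)> {
    let mut out = Vec::new();
    while let Some(token) = token_stream.next() {
        out.push((token.position, token.text));
    }
    out
}

fn main() {
    let mut tokens = vec![
        Token { text: "hello".to_string(), position: 0 },
        Token { text: "world".to_string(), position: 1 },
    ]
    .into_iter();
    assert_eq!(
        positions_and_texts(&mut tokens),
        vec![(0, "hello".to_string()), (1, "world".to_string())]
    );
}
```

Any `Vec<Token>::into_iter()`, chained iterator, or adapter satisfies the same bound without implementing an extra trait.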
@@ -12,9 +12,8 @@ use crate::schema::Term;
 use crate::schema::Value;
 use crate::schema::{Field, FieldEntry};
 use crate::tokenizer::PreTokenizedStream;
-use crate::tokenizer::TokenStream;
 use crate::tokenizer::{DynTokenStreamChain, TokenStreamChain, Tokenizer};
-use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT};
+use crate::tokenizer::{FacetTokenizer, TextAnalyzer, TextAnalyzerT, Token};
 use crate::Opstamp;
 use crate::{DocId, SegmentComponent};

@@ -183,7 +182,7 @@ impl SegmentWriter {
             Value::PreTokStr(tok_str) => {
                 streams_with_offsets.push((
                     Box::new(PreTokenizedStream::from(tok_str.clone()))
-                        as Box<dyn TokenStream>,
+                        as Box<dyn Iterator<Item = Token>>,
                     total_offset,
                 ));
                 if let Some(last_token) = tok_str.tokens.last() {
@@ -9,7 +9,6 @@ use crate::postings::{FieldSerializer, InvertedIndexSerializer};
 use crate::schema::IndexRecordOption;
 use crate::schema::{Field, FieldEntry, FieldType, Schema, Term};
 use crate::termdict::TermOrdinal;
-use crate::tokenizer::TokenStream;
 use crate::tokenizer::{Token, MAX_TOKEN_LEN};
 use crate::DocId;
 use fnv::FnvHashMap;
@@ -100,7 +99,7 @@ impl MultiFieldPostingsWriter {
         &mut self,
         doc: DocId,
         field: Field,
-        token_stream: &mut dyn TokenStream,
+        token_stream: &mut dyn Iterator<Item = Token>,
         term_buffer: &mut Term,
     ) -> u32 {
         self.per_field_postings_writers[field.field_id() as usize].index_text(
@@ -215,7 +214,7 @@ pub trait PostingsWriter {
         term_index: &mut TermHashMap,
         doc_id: DocId,
         field: Field,
-        token_stream: &mut dyn TokenStream,
+        token_stream: &mut dyn Iterator<Item = Token>,
         heap: &mut MemoryArena,
         term_buffer: &mut Term,
     ) -> u32 {
@@ -19,7 +19,7 @@
 //! // the "emoji" is dropped because its not an alphanum
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};

 /// `TokenFilter` that removes all tokens that contain non
 /// ascii alphanumeric characters.
@@ -1,4 +1,4 @@
-use super::{analyzer_builder, Token, TokenFilter, TokenStream};
+use super::{analyzer_builder, Token, TokenFilter};
 use std::mem;

 /// This class converts alphabetic, numeric, and symbolic Unicode characters
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};
 use crate::schema::FACET_SEP_BYTE;

 /// The `FacetTokenizer` process a `Facet` binary representation
@@ -69,8 +69,6 @@ impl Iterator for FacetTokenStream {
     }
 }

-impl TokenStream for FacetTokenStream {}
-
 #[cfg(test)]
 mod tests {
@@ -146,8 +146,7 @@ pub(crate) use self::token_stream_chain::{DynTokenStreamChain, TokenStreamChain}

 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
 pub use self::tokenizer::{
-    analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, TokenStream,
-    Tokenizer,
+    analyzer_builder, Identity, TextAnalyzer, TextAnalyzerT, Token, TokenFilter, Tokenizer,
 };

 pub use self::tokenizer_manager::TokenizerManager;
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};

 /// Tokenize the text by splitting words into n-grams of the given size(s)
 ///
@@ -145,8 +145,6 @@ impl Tokenizer for NgramTokenizer {
     }
 }

-impl TokenStream for NgramTokenStream {}
-
 impl Iterator for NgramTokenStream {
     type Item = Token;
     fn next(&mut self) -> Option<Self::Item> {
@@ -296,18 +294,8 @@ fn utf8_codepoint_width(b: u8) -> usize {

 #[cfg(test)]
 mod tests {

-    use super::utf8_codepoint_width;
-    use super::CodepointFrontiers;
-    use super::NgramTokenizer;
-    use super::StutteringIterator;
+    use super::*;
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::Tokenizer;
-    use crate::tokenizer::{Token, TokenStream};
-
-    fn test_helper<T: TokenStream>(tokens: T) -> Vec<Token> {
-        tokens.collect()
-    }
-
     #[test]
     fn test_utf8_codepoint_width() {
@@ -344,7 +332,9 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_1_2_false() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hello"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
+            .token_stream("hello")
+            .collect();
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "he", 0, 2);
@@ -359,7 +349,9 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_min_max_equal() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(3, 3).token_stream("hello"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(3, 3)
+            .token_stream("hello")
+            .collect();
         assert_eq!(tokens.len(), 3);
         assert_token(&tokens[0], 0, "hel", 0, 3);
         assert_token(&tokens[1], 0, "ell", 1, 4);
@@ -368,7 +360,9 @@ mod tests {

     #[test]
     fn test_ngram_tokenizer_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("frankenstein"));
+        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
+            .token_stream("frankenstein")
+            .collect();
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "fr", 0, 2);
         assert_token(&tokens[1], 0, "fra", 0, 3);
@@ -378,7 +372,9 @@ mod tests {

     #[test]
     fn test_ngram_non_ascii_1_2() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 2).token_stream("hεllo"));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 2)
+            .token_stream("hεllo")
+            .collect();
         assert_eq!(tokens.len(), 9);
         assert_token(&tokens[0], 0, "h", 0, 1);
         assert_token(&tokens[1], 0, "hε", 0, 3);
@@ -393,7 +389,9 @@ mod tests {

     #[test]
     fn test_ngram_non_ascii_2_5_prefix() {
-        let tokens = test_helper(NgramTokenizer::prefix_only(2, 5).token_stream("hεllo"));
+        let tokens: Vec<_> = NgramTokenizer::prefix_only(2, 5)
+            .token_stream("hεllo")
+            .collect();
         assert_eq!(tokens.len(), 4);
         assert_token(&tokens[0], 0, "hε", 0, 3);
         assert_token(&tokens[1], 0, "hεl", 0, 4);
@@ -403,16 +401,16 @@ mod tests {

     #[test]
     fn test_ngram_empty() {
-        let tokens = test_helper(NgramTokenizer::all_ngrams(1, 5).token_stream(""));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(1, 5).token_stream("").collect();
         assert!(tokens.is_empty());
-        let tokens = test_helper(NgramTokenizer::all_ngrams(2, 5).token_stream(""));
+        let tokens: Vec<_> = NgramTokenizer::all_ngrams(2, 5).token_stream("").collect();
         assert!(tokens.is_empty());
     }

     #[test]
     #[should_panic(expected = "min_gram must be greater than 0")]
     fn test_ngram_min_max_interval_empty() {
-        test_helper(NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss"));
+        NgramTokenizer::all_ngrams(0, 2).token_stream("hellossss");
     }

     #[test]
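Because `token_stream()` now yields a plain iterator, the tests above drop `test_helper` and call `collect()` directly. A hedged usage sketch of the same pattern against the public API, assuming a build of the crate with this commit applied (`all_ngrams` and `token_stream` as they appear in the diff):

```rust
use tantivy::tokenizer::{NgramTokenizer, Token, Tokenizer};

fn main() {
    // token_stream() now returns a plain Iterator<Item = Token>,
    // so collect() and the other std adapters apply directly.
    let tokens: Vec<Token> = NgramTokenizer::all_ngrams(1, 2)
        .token_stream("hello")
        .collect();
    assert_eq!(tokens.len(), 9);

    // Any other adapter works too, e.g. keeping only the bigrams.
    let bigrams: Vec<Token> = NgramTokenizer::all_ngrams(1, 2)
        .token_stream("hello")
        .filter(|token| token.text.chars().count() == 2)
        .collect();
    assert_eq!(bigrams.len(), 4);
}
```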
@@ -1,4 +1,4 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};

 /// For each value of the field, emit a single unprocessed token.
 #[derive(Clone, Debug, Default)]
@@ -29,5 +29,3 @@ impl Iterator for RawTokenStream {
         self.token.take()
     }
 }
-
-impl TokenStream for RawTokenStream {}
@@ -1,8 +1,6 @@
-use super::{Token, TokenStream, Tokenizer};
+use super::{Token, Tokenizer};
 use std::str::CharIndices;

-impl TokenStream for SimpleTokenizerStream {}
-
 /// Tokenize the text by splitting on whitespaces and punctuation.
 #[derive(Clone, Debug)]
 pub struct SimpleTokenizer;
@@ -1,4 +1,4 @@
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};
 use rust_stemmers::{self, Algorithm};
 use serde::{Deserialize, Serialize};
@@ -10,7 +10,7 @@
 //! assert_eq!(stream.next().unwrap().text, "crafty");
 //! assert!(stream.next().is_none());
 //! ```
-use super::{Token, TokenFilter, TokenStream};
+use super::{Token, TokenFilter};
 use fnv::FnvHasher;
 use std::collections::HashSet;
 use std::hash::BuildHasherDefault;
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStream, Tokenizer};
+use crate::tokenizer::{Token, Tokenizer};

 const POSITION_GAP: usize = 2;

@@ -25,13 +25,6 @@ where
     }
 }

-impl<'a, Inner, Outer: Iterator<Item = (Inner, usize)>> TokenStream
-    for TokenStreamChain<Inner, Outer>
-where
-    Inner: Iterator<Item = Token>,
-{
-}
-
 impl<'a, Inner, Outer> Iterator for TokenStreamChain<Inner, Outer>
 where
     Inner: Iterator<Item = Token>,
@@ -55,7 +48,9 @@ where
 }

 impl DynTokenStreamChain {
-    pub fn from_vec(streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>) -> impl TokenStream {
+    pub fn from_vec(
+        streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
+    ) -> impl Iterator<Item = Token> {
         DynTokenStreamChain {
             streams_with_offsets,
             idx: 0,
@@ -66,14 +61,12 @@ impl DynTokenStreamChain {
 }

 pub(crate) struct DynTokenStreamChain {
-    streams_with_offsets: Vec<(Box<dyn TokenStream>, usize)>,
+    streams_with_offsets: Vec<(Box<dyn Iterator<Item = Token>>, usize)>,
     idx: usize,
     position: usize,
     position_shift: usize,
 }

-impl<'a> TokenStream for DynTokenStreamChain {}
-
 impl Iterator for DynTokenStreamChain {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
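`DynTokenStreamChain::from_vec` now accepts `Vec<(Box<dyn Iterator<Item = Token>>, usize)>`. Since the type is `pub(crate)`, the sketch below only illustrates the boxed-iterator plumbing with std types; the real chain also rebases byte offsets and positions on the previous stream's last token, which this simplified version does not do. `Token` and `POSITION_GAP` are local stand-ins mirroring the diff, not the crate's definitions.

```rust
#[derive(Debug)]
struct Token {
    text: String,
    position: usize,
}

const POSITION_GAP: usize = 2;

// Chain boxed token iterators, shifting positions so that tokens from later
// streams do not collide with earlier ones (simplified: shift by stream index).
fn chain_token_streams(
    streams: Vec<Box<dyn Iterator<Item = Token>>>,
) -> impl Iterator<Item = Token> {
    streams.into_iter().enumerate().flat_map(|(idx, stream)| {
        stream.map(move |mut token| {
            token.position += idx * POSITION_GAP;
            token
        })
    })
}

fn main() {
    let first: Box<dyn Iterator<Item = Token>> =
        Box::new(vec![Token { text: "hello".into(), position: 0 }].into_iter());
    let second: Box<dyn Iterator<Item = Token>> =
        Box::new(vec![Token { text: "world".into(), position: 0 }].into_iter());
    let chained: Vec<(String, usize)> = chain_token_streams(vec![first, second])
        .map(|token| (token.text, token.position))
        .collect();
    assert_eq!(
        chained,
        vec![("hello".to_string(), 0), ("world".to_string(), 2)]
    );
}
```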
@@ -1,4 +1,4 @@
-use crate::tokenizer::{Token, TokenStream, TokenStreamChain};
+use crate::tokenizer::{Token, TokenStreamChain};
 use serde::{Deserialize, Serialize};
 use std::cmp::Ordering;

@@ -42,7 +42,7 @@ impl PreTokenizedStream {
     /// Creates a TokenStream from PreTokenizedString array
     pub fn chain_tokenized_strings<'a>(
         tok_strings: &'a [&PreTokenizedString],
-    ) -> impl TokenStream + 'a {
+    ) -> impl Iterator<Item = Token> + 'a {
         let streams_with_offsets = tok_strings.iter().scan(0, |total_offset, tok_string| {
             let next = Some((
                 PreTokenizedStream::from((*tok_string).to_owned()),
@@ -57,8 +57,6 @@ impl PreTokenizedStream {
     }
 }

-impl TokenStream for PreTokenizedStream {}
-
 impl Iterator for PreTokenizedStream {
     type Item = Token;
     fn next(&mut self) -> Option<Token> {
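`chain_tokenized_strings` now returns `impl Iterator<Item = Token> + 'a` instead of `impl TokenStream + 'a`. A small sketch of that return-position `impl Trait` pattern over a borrowed slice; `Token` and `PreTokenized` are hypothetical stand-ins, not the crate's types, and the real method additionally offsets positions between the chained strings.

```rust
#[derive(Clone, Debug)]
struct Token {
    text: String,
}

// Hypothetical pre-tokenized value, standing in for PreTokenizedString.
struct PreTokenized {
    tokens: Vec<Token>,
}

// The `+ 'a` bound ties the returned iterator's lifetime to the borrowed slice,
// just like the signature in the diff.
fn chain_pretokenized<'a>(
    tok_strings: &'a [&'a PreTokenized],
) -> impl Iterator<Item = Token> + 'a {
    tok_strings
        .iter()
        .flat_map(|tok_string| tok_string.tokens.iter().cloned())
}

fn main() {
    let a = PreTokenized { tokens: vec![Token { text: "foo".into() }] };
    let b = PreTokenized { tokens: vec![Token { text: "bar".into() }] };
    let inputs = [&a, &b];
    let texts: Vec<String> = chain_pretokenized(&inputs).map(|t| t.text).collect();
    assert_eq!(texts, vec!["foo", "bar"]);
}
```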
@@ -11,7 +11,7 @@ pub trait TextAnalyzerClone {
 pub trait TextAnalyzerT: 'static + Send + Sync + TextAnalyzerClone {
     /// 'Top-level' dynamic dispatch function hiding concrete types of the staticly
     /// dispatched `token_stream` from the `Tokenizer` trait.
-    fn token_stream(&self, text: &str) -> Box<dyn TokenStream>;
+    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>>;
 }

 impl Clone for Box<dyn TextAnalyzerT> {
@@ -57,51 +57,21 @@ pub trait TokenFilter: 'static + Send + Sync + Clone {
     fn transform(&mut self, token: Token) -> Option<Token>;
 }

-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = analyzer_builder(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser::new()).build();
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-pub trait TokenStream: Iterator<Item = Token> {}
-
 /// `Tokenizer` are in charge of splitting text into a stream of token
 /// before indexing.
 ///
 /// See the [module documentation](./index.html) for more detail.
 pub trait Tokenizer: 'static + Send + Sync + Clone {
     /// An iteratable type is returned.
-    type Iter: TokenStream;
+    type Iter: Iterator<Item = Token>;
     /// Creates a token stream for a given `str`.
     fn token_stream(&self, text: &str) -> Self::Iter;
     /// Tokenize an array`&str`
     ///
-    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
+    /// The resulting `Token` stream is equivalent to what would be obtained if the &str were
     /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
     /// to prevent accidental `PhraseQuery` to match accross two terms.
-    fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
+    fn token_stream_texts<'a>(&'a self, texts: &'a [&str]) -> Box<dyn Iterator<Item = Token> + 'a> {
         let streams_with_offsets = texts.iter().scan(0, move |total_offset, &text| {
             let temp = *total_offset;
             *total_offset += text.len();
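With this hunk, `Tokenizer::Iter` only needs to satisfy `Iterator<Item = Token>`. A minimal sketch of what implementing the reshaped trait can look like, using local stand-ins for both `Token` and the trait itself (the real definitions live in this file; the stand-ins are assumptions, not tantivy's code):

```rust
#[derive(Debug)]
struct Token {
    text: String,
    position: usize,
}

// Local stand-in mirroring the reshaped trait from the diff.
trait Tokenizer: 'static + Send + Sync + Clone {
    type Iter: Iterator<Item = Token>;
    fn token_stream(&self, text: &str) -> Self::Iter;
}

// A whitespace tokenizer whose associated `Iter` is just `std::vec::IntoIter`,
// i.e. an ordinary iterator with no extra marker trait.
#[derive(Clone)]
struct WhitespaceTokenizer;

impl Tokenizer for WhitespaceTokenizer {
    type Iter = std::vec::IntoIter<Token>;

    fn token_stream(&self, text: &str) -> Self::Iter {
        text.split_whitespace()
            .enumerate()
            .map(|(position, word)| Token {
                text: word.to_string(),
                position,
            })
            .collect::<Vec<_>>()
            .into_iter()
    }
}

fn main() {
    let tokens: Vec<Token> = WhitespaceTokenizer
        .token_stream("hello happy tax payer")
        .collect();
    assert_eq!(tokens.len(), 4);
    assert_eq!(tokens[1].text, "happy");
    assert_eq!(tokens[1].position, 1);
}
```

Returning `std::vec::IntoIter<Token>` keeps the sketch short; a lazy iterator type satisfies the same bound just as well.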
@@ -111,7 +81,7 @@ pub trait Tokenizer: 'static + Send + Sync + Clone {
     }
 }

-/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
+/// `TextAnalyzer` wraps the tokenization of an input text and its modification by any filters applied onto it.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
 #[derive(Clone, Debug, Default)]
@@ -130,7 +100,7 @@ impl<T: Tokenizer> TextAnalyzerClone for TextAnalyzer<T> {
 }

 impl<T: Tokenizer> TextAnalyzerT for TextAnalyzer<T> {
-    fn token_stream(&self, text: &str) -> Box<dyn TokenStream> {
+    fn token_stream(&self, text: &str) -> Box<dyn Iterator<Item = Token>> {
         Box::new(self.0.token_stream(text))
     }
 }
@@ -145,7 +115,7 @@ impl TokenFilter for Identity {
     }
 }

-/// `Filter` is a wrapper around a `TokenStream` and a `TokenFilter` which modifies the `TokenStream`.
+/// `Filter` is a wrapper around a `Token` stream and a `TokenFilter` which modifies it.
 #[derive(Clone, Default, Debug)]
 pub struct Filter<I, F> {
     iter: I,
@@ -154,7 +124,7 @@ pub struct Filter<I, F> {

 impl<I, F> Iterator for Filter<I, F>
 where
-    I: TokenStream,
+    I: Iterator<Item = Token>,
     F: TokenFilter,
 {
     type Item = Token;
@@ -168,13 +138,6 @@ where
     }
 }

-impl<I, F> TokenStream for Filter<I, F>
-where
-    I: TokenStream,
-    F: TokenFilter,
-{
-}
-
 #[derive(Clone, Debug, Default)]
 pub struct AnalyzerBuilder<T, F> {
     tokenizer: T,
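`Filter<I, F>` now only requires `I: Iterator<Item = Token>`. The sketch below reproduces the shape of that adapter with local stand-ins for `Token` and `TokenFilter` and one plausible `next` implementation (the hunk does not show the real body), to illustrate why nothing beyond the `Iterator` bound is needed:

```rust
#[derive(Debug)]
struct Token {
    text: String,
}

// Stand-in for the `TokenFilter` trait in the diff: may drop or rewrite a token.
trait TokenFilter {
    fn transform(&mut self, token: Token) -> Option<Token>;
}

struct LowerCaser;

impl TokenFilter for LowerCaser {
    fn transform(&mut self, mut token: Token) -> Option<Token> {
        token.text = token.text.to_lowercase();
        Some(token)
    }
}

// Same shape as `Filter<I, F>`: wrap any token iterator and apply the filter lazily.
struct Filter<I, F> {
    iter: I,
    filter: F,
}

impl<I, F> Iterator for Filter<I, F>
where
    I: Iterator<Item = Token>,
    F: TokenFilter,
{
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        // Keep pulling tokens until the filter yields one (filters may drop tokens).
        while let Some(token) = self.iter.next() {
            if let Some(filtered) = self.filter.transform(token) {
                return Some(filtered);
            }
        }
        None
    }
}

fn main() {
    let input = vec![Token { text: "Hello".into() }, Token { text: "WORLD".into() }];
    let filtered: Vec<String> = Filter { iter: input.into_iter(), filter: LowerCaser }
        .map(|token| token.text)
        .collect();
    assert_eq!(filtered, vec!["hello", "world"]);
}
```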
@@ -196,7 +159,7 @@ where
 {
     /// Appends a token filter to the current tokenizer.
     ///
-    /// The method consumes the current `TokenStream` and returns a
+    /// The method consumes the current `Token` and returns a
     /// new one.
     ///
     /// # Example