mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-07 17:42:55 +00:00
Checkpoint converting to Iterators and static dispatch.
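For context on what "converting to Iterators and static dispatch" means here: the diff below replaces the boxed `Box<dyn TokenStream>` plumbing with a `Tokenizer` trait that exposes an associated `Iter: Iterator<Item = Token>` type, and reduces `TokenFilter` to an `Fn(&Token) -> bool` predicate. The following is a minimal stand-alone sketch of that dispatch change; `DynTokenizer`, `StaticTokenizer`, and the locally re-declared `Token`/`SimpleTokenizer` are illustrative stand-ins, not the crate's actual API.

```rust
// Illustrative sketch only: these types are stand-ins, not tantivy's real API.
#[derive(Debug, Clone)]
pub struct Token {
    pub text: String,
}

/// Old shape: dynamic dispatch, every tokenizer returns a boxed trait object.
pub trait DynTokenizer {
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Token> + 'a>;
}

/// New shape: static dispatch through an associated iterator type.
pub trait StaticTokenizer<'a> {
    type Iter: Iterator<Item = Token> + 'a;
    fn token_stream(&self, text: &'a str) -> Self::Iter;
}

pub struct SimpleTokenizer;

impl<'a> StaticTokenizer<'a> for SimpleTokenizer {
    // The concrete iterator type is visible to the compiler, so no boxing is needed.
    type Iter = std::iter::Map<std::str::SplitWhitespace<'a>, fn(&'a str) -> Token>;

    fn token_stream(&self, text: &'a str) -> Self::Iter {
        // A non-capturing closure coerces to the `fn` pointer named in `Iter`.
        let to_token: fn(&'a str) -> Token = |word| Token { text: word.to_string() };
        text.split_whitespace().map(to_token)
    }
}

fn main() {
    let tokens: Vec<Token> = SimpleTokenizer.token_stream("Hello, happy tax payer").collect();
    assert_eq!(tokens.len(), 4);
}
```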
@@ -36,28 +36,42 @@ impl Default for Token {
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
    tokenizer: Box<dyn Tokenizer>,
    token_filters: Vec<Box<dyn TokenFilter>>,
#[derive(Clone)]
pub struct TokenStream<'a, I> {
    tokens: I,
    filters: Vec<Box<dyn TokenFilter>>,
}

impl<T: Tokenizer> From<T> for TextAnalyzer {
    fn from(tokenizer: T) -> Self {
        TextAnalyzer::new(tokenizer, Vec::new())
impl<'a, I> Iterator for TokenStream<'a, I>
where
    I: Iterator<Item = Token>,
{
    type Item = I::Item;
    fn next(&mut self) -> Option<Self::Item> {
        while let Some(token) = self.tokens.next() {
            if self.filters.iter().all(|filter| filter(&token)) {
                return Some(token);
            }
        }
        None
    }
}

impl TextAnalyzer {
impl<'a, I> TokenStream<'a, I>
where
    I: Iterator<Item = Token>,
{
    /// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
    ///
    /// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
    /// `TextAnalyzer::from(tokenizer)`.
    pub fn new<T: Tokenizer>(
    pub fn new<T: Tokenizer<'a, Iter = I>>(
        tokenizer: T,
        text: &str,
        token_filters: Vec<Box<dyn TokenFilter>>,
    ) -> TextAnalyzer {
        TextAnalyzer {
            tokenizer: Box::new(tokenizer),
    ) -> TokenStream<'a, I> {
        TokenStream {
            tokens: tokenizer.token_stream(text),
            token_filters,
        }
    }
@@ -83,44 +97,34 @@ impl TextAnalyzer {
        self
    }

    /// Tokenize an array of `&str`.
    ///
    /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
    /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
    /// to prevent an accidental `PhraseQuery` from matching across two terms.
    pub fn token_stream_texts<'a>(&self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
        debug_assert!(!texts.is_empty());
        let mut streams_with_offsets = vec![];
        let mut total_offset = 0;
        for &text in texts {
            streams_with_offsets.push((self.token_stream(text), total_offset));
            total_offset += text.len();
        }
        Box::new(TokenStreamChain::new(streams_with_offsets))
    }
    // /// Tokenize an array of `&str`.
    // ///
    // /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the &str were
    // /// one concatenated `&str`, with an artificial position gap of `2` between the different fields
    // /// to prevent an accidental `PhraseQuery` from matching across two terms.

    /// Creates a token stream for a given `str`.
    pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
        let mut token_stream = self.tokenizer.token_stream(text);
        for token_filter in &self.token_filters {
            token_stream = token_filter.transform(token_stream);
        }
        token_stream
    }
    // /// Creates a token stream for a given `str`.
    // pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
    //     let mut token_stream = self.tokenizer.token_stream(text);
    //     for token_filter in &self.token_filters {
    //         token_stream = token_filter.transform(token_stream);
    //     }
    //     token_stream
    // }
}

impl Clone for TextAnalyzer {
    fn clone(&self) -> Self {
        TextAnalyzer {
            tokenizer: self.tokenizer.box_clone(),
            token_filters: self
                .token_filters
                .iter()
                .map(|token_filter| token_filter.box_clone())
                .collect(),
        }
    }
}
// impl<'a,I: Clone> Clone for Tokens<'a,I> {
//     fn clone(&self) -> Self {
//         Tokens {
//             tokenizer: self.tokenizer.box_clone(),
//             token_filters: self
//                 .token_filters
//                 .iter()
//                 .map(|token_filter| token_filter.box_clone())
//                 .collect(),
//         }
//     }
// }

/// `Tokenizer`s are in charge of splitting text into a stream of tokens
/// before indexing.
@@ -130,107 +134,27 @@ impl Clone for TextAnalyzer {
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
    type Iter: Iterator<Item = Token> + 'a;
    /// Creates a token stream for a given `str`.
    fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}

pub trait TokenizerClone {
    fn box_clone(&self) -> Box<dyn Tokenizer>;
}

impl<T: Tokenizer + Clone> TokenizerClone for T {
    fn box_clone(&self) -> Box<dyn Tokenizer> {
        Box::new(self.clone())
    }
}

/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
///     .filter(RemoveLongFilter::limit(40))
///     .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
///     let token = token_stream.next().unwrap();
///     assert_eq!(&token.text, "hello");
///     assert_eq!(token.offset_from, 0);
///     assert_eq!(token.offset_to, 5);
///     assert_eq!(token.position, 0);
/// }
/// {
///     let token = token_stream.next().unwrap();
///     assert_eq!(&token.text, "happy");
///     assert_eq!(token.offset_from, 7);
///     assert_eq!(token.offset_to, 12);
///     assert_eq!(token.position, 1);
/// }
/// ```
///
pub trait TokenStream {
    /// Advance to the next token
    ///
    /// Returns false if there are no other tokens.
    fn advance(&mut self) -> bool;

    /// Returns a reference to the current token.
    fn token(&self) -> &Token;

    /// Returns a mutable reference to the current token.
    fn token_mut(&mut self) -> &mut Token;

    /// Helper to iterate over tokens. It
    /// simply combines a call to `.advance()`
    /// and `.token()`.
    ///
    /// ```
    /// use tantivy::tokenizer::*;
    ///
    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
    ///     .filter(RemoveLongFilter::limit(40))
    ///     .filter(LowerCaser);
    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
    /// while let Some(token) = token_stream.next() {
    ///     println!("Token {:?}", token.text);
    /// }
    /// ```
    fn next(&mut self) -> Option<&Token> {
        if self.advance() {
            Some(self.token())
        } else {
            None
    fn token_stream(&self, text: &'a str) -> Self::Iter;
    fn token_stream_texts(&self, texts: &'a [&str]) -> Self::Iter {
        debug_assert!(!texts.is_empty());
        let mut streams_with_offsets = vec![];
        let mut total_offset = 0;
        for &text in texts {
            streams_with_offsets.push((self.token_stream(text), total_offset));
            total_offset += text.len();
        }
        TokenStreamChain::new(streams_with_offsets)
    }

    /// Helper function to consume the entire `TokenStream`
    /// and push the tokens to a sink function.
    ///
    /// Remove this.
    fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
        let mut num_tokens_pushed = 0u32;
        while self.advance() {
            sink(self.token());
            num_tokens_pushed += 1u32;
        }
        num_tokens_pushed
    }
}

pub trait TokenFilterClone {
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
    /// Wraps a token stream and returns the modified one.
    fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {}

pub trait TokenFilterClone {
    fn box_clone(&self) -> Box<dyn TokenFilter>;
}

impl<T: TokenFilter + Clone> TokenFilterClone for T {
@@ -239,24 +163,24 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
    }
}

#[cfg(test)]
mod test {
    use super::Token;
// #[cfg(test)]
// mod test {
//     use super::Token;

    #[test]
    fn clone() {
        let t1 = Token {
            position: 1,
            offset_from: 2,
            offset_to: 3,
            text: "abc".to_string(),
            position_length: 1,
        };
        let t2 = t1.clone();
//     #[test]
//     fn clone() {
//         let t1 = Token {
//             position: 1,
//             offset_from: 2,
//             offset_to: 3,
//             text: "abc".to_string(),
//             position_length: 1,
//         };
//         let t2 = t1.clone();

        assert_eq!(t1.position, t2.position);
        assert_eq!(t1.offset_from, t2.offset_from);
        assert_eq!(t1.offset_to, t2.offset_to);
        assert_eq!(t1.text, t2.text);
    }
}
//         assert_eq!(t1.position, t2.position);
//         assert_eq!(t1.offset_from, t2.offset_from);
//         assert_eq!(t1.offset_to, t2.offset_to);
//         assert_eq!(t1.text, t2.text);
//     }
// }
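The checkpoint above makes `TokenStream` an ordinary `Iterator` and turns `TokenFilter` into an `Fn(&Token) -> bool` predicate applied inside `next()`. Below is a small self-contained sketch of that filtering pattern; `FilteredTokens` and the local `Token` type are illustrative, not tantivy's API, and the `iter().all` call is what the work-in-progress hunk appears to intend.

```rust
// Stand-alone illustration of the predicate-filter pattern sketched in the diff.
#[derive(Debug, Clone)]
pub struct Token {
    pub text: String,
    pub position: usize,
}

/// Filters stored as boxed predicates, mirroring `TokenFilter: Fn(&Token) -> bool`.
pub struct FilteredTokens<I> {
    tokens: I,
    filters: Vec<Box<dyn Fn(&Token) -> bool>>,
}

impl<I: Iterator<Item = Token>> Iterator for FilteredTokens<I> {
    type Item = Token;

    fn next(&mut self) -> Option<Token> {
        // Pull tokens until one passes every filter, as in the hunk's `next` above.
        while let Some(token) = self.tokens.next() {
            if self.filters.iter().all(|filter| filter(&token)) {
                return Some(token);
            }
        }
        None
    }
}

fn main() {
    let tokens = ["Hello", "happy", "tax", "payer"]
        .iter()
        .enumerate()
        .map(|(position, text)| Token { text: text.to_lowercase(), position });

    // Two predicate filters, loosely analogous to RemoveLongFilter and a stop-word filter.
    let remove_long: Box<dyn Fn(&Token) -> bool> = Box::new(|t: &Token| t.text.len() <= 40);
    let remove_stopword: Box<dyn Fn(&Token) -> bool> = Box::new(|t: &Token| t.text != "tax");

    let filtered = FilteredTokens { tokens, filters: vec![remove_long, remove_stopword] };
    let texts: Vec<String> = filtered.map(|t| t.text).collect();
    assert_eq!(texts, vec!["hello", "happy", "payer"]);
}
```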