Checkpoint converting to Iterators and static dispatch.

dcraven
2020-12-23 13:40:18 +01:00
committed by Paul Masurel
parent f1973759ef
commit ccd0f3ccc9

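The diff below replaces the trait-object tokenization pipeline with iterator-based, statically dispatched types. As a rough orientation, here is a minimal sketch of the before/after trait shapes; names and signatures are simplified (the real old trait returns `Box<dyn TokenStream + 'a>`, not a boxed iterator), so this is not the exact code from the diff.

```
// Shared token type, trimmed to a couple of fields for the sketch.
#[derive(Clone, Debug, Default)]
pub struct Token {
    pub text: String,
    pub position: usize,
}

mod dynamic_dispatch {
    use super::Token;

    // Before: every tokenizer hands back a boxed trait object, so each
    // `next()` goes through a vtable and the stream lives on the heap.
    pub trait Tokenizer: 'static + Send + Sync {
        fn token_stream<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Token> + 'a>;
    }
}

mod static_dispatch {
    use super::Token;

    // Direction of this checkpoint: the concrete stream type is an associated
    // type implementing `Iterator<Item = Token>`, so the whole pipeline can be
    // monomorphized and dispatched statically.
    pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
        type Iter: Iterator<Item = Token> + 'a;
        fn token_stream(&self, text: &'a str) -> Self::Iter;
    }
}
```

The hunks below move `TextAnalyzer` and the `TokenStream` trait toward the second shape and turn `TokenFilter` into a plain predicate.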

@@ -36,28 +36,42 @@ impl Default for Token {
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
///
/// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
pub struct TextAnalyzer {
tokenizer: Box<dyn Tokenizer>,
token_filters: Vec<Box<dyn TokenFilter>>,
#[derive(Clone)]
pub struct TokenStream<'a, I> {
tokens: I,
filters: Vec<Box<dyn TokenFilter>>,
}
impl<T: Tokenizer> From<T> for TextAnalyzer {
fn from(tokenizer: T) -> Self {
TextAnalyzer::new(tokenizer, Vec::new())
impl<'a, I> Iterator for TokenStream<'a, I>
where
I: Iterator<Item = Token>,
{
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
while let Some(token) = self.tokens.next() {
if self.filters.iter().all(|filter| filter(&token)) {
return Some(token);
}
}
None
}
}
impl TextAnalyzer {
impl<'a, I> TokenStream<'a, I>
where
I: Iterator<Item = Token>,
{
/// Creates a new `TextAnalyzer` given a tokenizer and a vector of `Box<dyn TokenFilter>`.
///
/// When creating a `TextAnalyzer` from a `Tokenizer` alone, prefer using
/// `TextAnalyzer::from(tokenizer)`.
pub fn new<T: Tokenizer>(
pub fn new<T: Tokenizer<'a, Iter = I>>(
tokenizer: T,
text: &'a str,
token_filters: Vec<Box<dyn TokenFilter>>,
) -> TextAnalyzer {
TextAnalyzer {
tokenizer: Box::new(tokenizer),
) -> TokenStream<'a, I> {
TokenStream {
tokens: tokenizer.token_stream(text),
filters: token_filters,
}
}
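
Because the new `TokenStream<'a, I>` implements `Iterator<Item = Token>`, the old `advance()`/`token()`/`process()` protocol (still visible further down in this diff) collapses into ordinary iterator adapters. A small, self-contained sketch of consuming such a stream, assuming only the `Token` fields shown elsewhere in this file; `long_token_positions` is an illustrative helper, not part of the commit:

```
/// Collects the positions of tokens at least `min_len` bytes long from any
/// token iterator, e.g. the new `TokenStream` or a bare tokenizer iterator.
fn long_token_positions<I>(tokens: I, min_len: usize) -> Vec<usize>
where
    I: Iterator<Item = Token>,
{
    tokens
        .filter(|token| token.text.len() >= min_len)
        .map(|token| token.position)
        .collect()
}
```
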
@@ -83,44 +97,34 @@ impl TextAnalyzer {
self
}
/// Tokenize an array of `&str`.
///
/// The resulting `BoxTokenStream` is equivalent to what would be obtained if the `&str`s were
/// concatenated into one `&str`, with an artificial position gap of `2` between the different fields
/// to prevent an accidental `PhraseQuery` from matching across two terms.
pub fn token_stream_texts<'a>(&self, texts: &'a [&str]) -> Box<dyn TokenStream + 'a> {
debug_assert!(!texts.is_empty());
let mut streams_with_offsets = vec![];
let mut total_offset = 0;
for &text in texts {
streams_with_offsets.push((self.token_stream(text), total_offset));
total_offset += text.len();
}
Box::new(TokenStreamChain::new(streams_with_offsets))
}
// /// Tokenize an array of `&str`.
// ///
// /// The resulting `BoxTokenStream` is equivalent to what would be obtained if the `&str`s were
// /// concatenated into one `&str`, with an artificial position gap of `2` between the different fields
// /// to prevent an accidental `PhraseQuery` from matching across two terms.
/// Creates a token stream for a given `str`.
pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
let mut token_stream = self.tokenizer.token_stream(text);
for token_filter in &self.token_filters {
token_stream = token_filter.transform(token_stream);
}
token_stream
}
// /// Creates a token stream for a given `str`.
// pub fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a> {
// let mut token_stream = self.tokenizer.token_stream(text);
// for token_filter in &self.token_filters {
// token_stream = token_filter.transform(token_stream);
// }
// token_stream
// }
}
impl Clone for TextAnalyzer {
fn clone(&self) -> Self {
TextAnalyzer {
tokenizer: self.tokenizer.box_clone(),
token_filters: self
.token_filters
.iter()
.map(|token_filter| token_filter.box_clone())
.collect(),
}
}
}
// impl<'a,I: Clone> Clone for Tokens<'a,I> {
// fn clone(&self) -> Self {
// Tokens {
// tokenizer: self.tokenizer.box_clone(),
// token_filters: self
// .token_filters
// .iter()
// .map(|token_filter| token_filter.box_clone())
// .collect(),
// }
// }
// }
/// A `Tokenizer` is in charge of splitting text into a stream of tokens
/// before indexing.
@@ -130,107 +134,27 @@ impl Clone for TextAnalyzer {
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
type Iter: Iterator<Item = Token> + 'a;
/// Creates a token stream for a given `str`.
fn token_stream<'a>(&self, text: &'a str) -> Box<dyn TokenStream + 'a>;
}
pub trait TokenizerClone {
fn box_clone(&self) -> Box<dyn Tokenizer>;
}
impl<T: Tokenizer + Clone> TokenizerClone for T {
fn box_clone(&self) -> Box<dyn Tokenizer> {
Box::new(self.clone())
}
}
/// `TokenStream` is the result of the tokenization.
///
/// It is a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// ```
///
pub trait TokenStream {
/// Advance to the next token
///
/// Returns false if there are no other tokens.
fn advance(&mut self) -> bool;
/// Returns a reference to the current token.
fn token(&self) -> &Token;
/// Returns a mutable reference to the current token.
fn token_mut(&mut self) -> &mut Token;
/// Helper to iterate over tokens. It
/// simply combines a call to `.advance()`
/// and `.token()`.
///
/// ```
/// use tantivy::tokenizer::*;
///
/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// while let Some(token) = token_stream.next() {
/// println!("Token {:?}", token.text);
/// }
/// ```
fn next(&mut self) -> Option<&Token> {
if self.advance() {
Some(self.token())
} else {
None
fn token_stream(&self, text: &'a str) -> Self::Iter;
fn token_stream_texts(&self, texts: &'a [&str]) -> Self::Iter {
debug_assert!(!texts.is_empty());
let mut streams_with_offsets = vec![];
let mut total_offset = 0;
for &text in texts {
streams_with_offsets.push((self.token_stream(text), total_offset));
total_offset += text.len();
}
TokenStreamChain::new(streams_with_offsets)
}
/// Helper function to consume the entire `TokenStream`
/// and push the tokens to a sink function.
///
/// Remove this.
fn process(&mut self, sink: &mut dyn FnMut(&Token)) -> u32 {
let mut num_tokens_pushed = 0u32;
while self.advance() {
sink(self.token());
num_tokens_pushed += 1u32;
}
num_tokens_pushed
}
}
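
To make the associated-type design above concrete, here is a toy implementation against the new `Tokenizer<'a>` trait shape. `WhitespaceTokenizer` is a placeholder for illustration, not how tantivy's real tokenizers are implemented, and it assumes the trait compiles as sketched in this checkpoint:

```
#[derive(Clone)]
struct WhitespaceTokenizer;

impl<'a> Tokenizer<'a> for WhitespaceTokenizer {
    // The iterator type is named statically; no `Box<dyn TokenStream>` and no
    // virtual dispatch on every `next()` call.
    type Iter = std::vec::IntoIter<Token>;

    fn token_stream(&self, text: &'a str) -> Self::Iter {
        // Collecting into a Vec keeps the sketch short; a lazy iterator type
        // would avoid the intermediate allocation. Byte offsets are left at
        // their `Default` values for brevity.
        text.split_whitespace()
            .enumerate()
            .map(|(position, word)| Token {
                text: word.to_string(),
                position,
                ..Token::default()
            })
            .collect::<Vec<Token>>()
            .into_iter()
    }
}
```
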
pub trait TokenFilterClone {
fn box_clone(&self) -> Box<dyn TokenFilter>;
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
/// Wraps a token stream and returns the modified one.
fn transform<'a>(&self, token_stream: Box<dyn TokenStream + 'a>) -> Box<dyn TokenStream + 'a>;
pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {}
pub trait TokenFilterClone {
fn box_clone(&self) -> Box<dyn TokenFilter>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
@@ -239,24 +163,24 @@ impl<T: TokenFilter + Clone> TokenFilterClone for T {
}
}
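
`TokenFilter` is now a predicate over tokens (`Fn(&Token) -> bool`) rather than a stream transformer. A minimal sketch of filters under that definition, using plain boxed closures instead of the `TokenFilter` trait itself, since whether closures get a blanket `TokenFilter` impl is not settled by this commit:

```
/// True when `token` passes every predicate-style filter.
fn passes_all(token: &Token, filters: &[Box<dyn Fn(&Token) -> bool>]) -> bool {
    filters.iter().all(|filter| filter(token))
}

fn demo() {
    let filters: Vec<Box<dyn Fn(&Token) -> bool>> = vec![
        // Drop overly long tokens (what RemoveLongFilter does today).
        Box::new(|token: &Token| token.text.len() <= 40),
        // Drop empty tokens.
        Box::new(|token: &Token| !token.text.is_empty()),
    ];
    let token = Token {
        text: "hello".to_string(),
        ..Token::default()
    };
    assert!(passes_all(&token, &filters));
}
```

Note that a boolean predicate can only drop tokens; filters that rewrite text, such as the `LowerCaser` used in the doc example above, do not fit the `Fn(&Token) -> bool` signature and would need a different shape.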
#[cfg(test)]
mod test {
use super::Token;
// #[cfg(test)]
// mod test {
// use super::Token;
#[test]
fn clone() {
let t1 = Token {
position: 1,
offset_from: 2,
offset_to: 3,
text: "abc".to_string(),
position_length: 1,
};
let t2 = t1.clone();
// #[test]
// fn clone() {
// let t1 = Token {
// position: 1,
// offset_from: 2,
// offset_to: 3,
// text: "abc".to_string(),
// position_length: 1,
// };
// let t2 = t1.clone();
assert_eq!(t1.position, t2.position);
assert_eq!(t1.offset_from, t2.offset_from);
assert_eq!(t1.offset_to, t2.offset_to);
assert_eq!(t1.text, t2.text);
}
}
// assert_eq!(t1.position, t2.position);
// assert_eq!(t1.offset_from, t2.offset_from);
// assert_eq!(t1.offset_to, t2.offset_to);
// assert_eq!(t1.text, t2.text);
// }
// }