Reformulate as Iterators, Checkpoint 2. Finished, now bubble up changes.

This commit is contained in:
dcraven
2020-12-23 16:34:51 +01:00
committed by Paul Masurel
parent 801c82a5e1
commit 39e8739ea5
3 changed files with 47 additions and 41 deletions

View File

@@ -42,8 +42,7 @@ fn load_metas(
META_FILEPATH.to_path_buf(),
format!("Meta file cannot be deserialized. {:?}.", e),
)
})
.map_err(From::from)
})?
}
/// Search Index
@@ -119,13 +118,12 @@ impl Index {
return Index::create(dir, schema);
}
let index = Index::open(dir)?;
if index.schema() == schema {
Ok(index)
} else {
Err(TantivyError::SchemaError(
if index.schema() != schema {
return Err(TantivyError::SchemaError(
"An index exists but the schema does not match.".to_string(),
))
));
}
Ok(index)
}
/// Creates a new index in a temp directory.

View File

@@ -3,39 +3,37 @@ use crate::tokenizer::{Token, TokenStream};
const POSITION_GAP: usize = 2;
pub(crate) struct Chain<'a, I> {
streams_with_offsets: Vec<(I, usize)>,
stream_idx: usize,
streams_with_offsets: I,
position_shift: usize,
}
impl<'a, I> Chain<'a, I>
where
I: Iterator<Item = Token>,
{
pub fn new(streams_with_offsets: Vec<(I, usize)>) -> Chain<'a, I> {
impl<'a, Out> Chain<'a, Out> {
pub fn new<In>(streams_with_offsets: Out) -> Chain<'a, Out>
where
In: Iterator<Item = Token>,
Out: Iterator<Item = In>,
{
Chain {
streams_with_offsets,
stream_idx: 0,
position_shift: 0,
}
}
}
impl<'a, I> Iterator for Chain<'a, I>
impl<'a, In, Out> Iterator for Chain<'a, Out>
where
I: Iterator<Item = Token>,
In: Iterator<Item = Token>,
Out: Iterator<Item = In>,
{
type Item = Token;
fn next(&mut self) -> Option<Token> {
while self.stream_idx < self.streams_with_offsets.len() {
let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() {
if let Some(token) = token_stream.next() {
token.offset_from += offset_offset;
token.offset_to += offset_offset;
token.position += self.position_shift;
return Some(token);
} else {
self.stream_idx += 1;
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
}
}

View File

@@ -1,4 +1,4 @@
use crate::tokenizer::TokenStreamChain;
use crate::tokenizer::Chain;
use serde::{Deserialize, Serialize};
/// The tokenizer module contains all of the tools used to process
/// text in `tantivy`.
@@ -39,7 +39,7 @@ impl Default for Token {
#[derive(Clone)]
pub struct TokenStream<'a, I> {
tokens: I,
filters: Vec<Box<dyn TokenFilter>>,
transformers: Vec<Box<dyn Transformer>>,
}
impl<'a, I> Iterator for TokenStream<'a, I>
@@ -48,12 +48,11 @@ where
{
type Item = I::Item;
fn next(&mut self) -> Option<Self::Item> {
while let Some(token) = self.tokens.next() {
if self.filters.all(|filter| filter(&token)) {
return Some(token);
}
let token = self.tokens.next()?;
for transformer in self.tranformers.iter_mut() {
token = transformer.transform(token)?;
}
None
Some(token)
}
}
@@ -68,11 +67,10 @@ where
pub fn new<T: Tokenizer<'a, Iter = I>>(
tokenizer: T,
text: &str,
token_filters: Vec<Box<dyn TokenFilter>>,
) -> TokenStream<'a, I> {
TokenStream {
tokens: tokenizer.token_stream(text),
token_filters,
transformers: vec![],
}
}
@@ -92,7 +90,7 @@ where
/// .filter(Stemmer::default());
/// ```
///
pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
pub fn filter<F: Transformer>(mut self, token_filter: F) -> Self {
self.token_filters.push(Box::new(token_filter));
self
}
@@ -142,23 +140,35 @@ pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
debug_assert!(!texts.is_empty());
let mut streams_with_offsets = vec![];
let mut total_offset = 0;
for &text in texts {
streams_with_offsets.push((self.token_stream(text), total_offset));
total_offset += text.len();
}
TokenStreamChain::new(streams_with_offsets)
// for &text in texts {
// streams_with_offsets.push((self.token_stream(text), total_offset));
// total_offset += text.len();
// }
let streams_with_offsets = texts.iter().scan(0,|total_offset, &text| {
let temp = *total_offset;
*total_offset += text.len();
Some((self.token_stream(text), temp))
});
// {
// streams_with_offsets.push((self.token_stream(text), total_offset));
// total_offset += text.len();
// }
Chain::new(streams_with_offsets)
}
}
/// Trait for the pluggable components of `Tokenizer`s.
pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {}
pub trait TokenFilterClone {
fn box_clone(&self) -> Box<dyn TokenFilter>;
pub trait Transformer: 'static + Send + Sync + TransformerClone {
fn transform(&mut self, token: Token) -> Option<Token>;
}
impl<T: TokenFilter + Clone> TokenFilterClone for T {
fn box_clone(&self) -> Box<dyn TokenFilter> {
pub trait TransformerClone {
fn box_clone(&self) -> Box<dyn Transformer>;
}
impl<T: Transformer + Clone> TransformerClone for T {
fn box_clone(&self) -> Box<dyn Transformer> {
Box::new(self.clone())
}
}