From 39e8739ea51779c55f43d68a8b6588d5d05fa4e3 Mon Sep 17 00:00:00 2001 From: dcraven Date: Wed, 23 Dec 2020 16:34:51 +0100 Subject: [PATCH] Reformulate as Iterators, Checkpoint 2. Finished, now bubble up changes. --- src/core/index.rs | 12 +++---- src/tokenizer/token_stream_chain.rs | 24 ++++++------- src/tokenizer/tokenizer.rs | 52 +++++++++++++++++------------ 3 files changed, 47 insertions(+), 41 deletions(-) diff --git a/src/core/index.rs b/src/core/index.rs index 12ef5e37f..4f224aa2c 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -42,8 +42,7 @@ fn load_metas( META_FILEPATH.to_path_buf(), format!("Meta file cannot be deserialized. {:?}.", e), ) - }) - .map_err(From::from) + })? } /// Search Index @@ -119,13 +118,12 @@ impl Index { return Index::create(dir, schema); } let index = Index::open(dir)?; - if index.schema() == schema { - Ok(index) - } else { - Err(TantivyError::SchemaError( + if index.schema() != schema { + return Err(TantivyError::SchemaError( "An index exists but the schema does not match.".to_string(), - )) + )); } + Ok(index) } /// Creates a new index in a temp directory. diff --git a/src/tokenizer/token_stream_chain.rs b/src/tokenizer/token_stream_chain.rs index 5cfb60c70..297fae07a 100644 --- a/src/tokenizer/token_stream_chain.rs +++ b/src/tokenizer/token_stream_chain.rs @@ -3,39 +3,37 @@ use crate::tokenizer::{Token, TokenStream}; const POSITION_GAP: usize = 2; pub(crate) struct Chain<'a, I> { - streams_with_offsets: Vec<(I, usize)>, - stream_idx: usize, + streams_with_offsets: I, position_shift: usize, } -impl<'a, I> Chain<'a, I> -where - I: Iterator, -{ - pub fn new(streams_with_offsets: Vec<(I, usize)>) -> Chain<'a, I> { +impl<'a, Out> Chain<'a, Out> { + pub fn new(streams_with_offsets: Out) -> Chain<'a, Out> + where + In: Iterator, + Out: Iterator, + { Chain { streams_with_offsets, - stream_idx: 0, position_shift: 0, } } } -impl<'a, I> Iterator for Chain<'a, I> +impl<'a, In, Out> Iterator for Chain<'a, Out> where - I: Iterator, + In: Iterator, + Out: Iterator, { type Item = Token; fn next(&mut self) -> Option { - while self.stream_idx < self.streams_with_offsets.len() { - let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx]; + while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() { if let Some(token) = token_stream.next() { token.offset_from += offset_offset; token.offset_to += offset_offset; token.position += self.position_shift; return Some(token); } else { - self.stream_idx += 1; self.position_shift = self.token.position.wrapping_add(POSITION_GAP); } } diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs index 24cb6bdad..941e25616 100644 --- a/src/tokenizer/tokenizer.rs +++ b/src/tokenizer/tokenizer.rs @@ -1,4 +1,4 @@ -use crate::tokenizer::TokenStreamChain; +use crate::tokenizer::Chain; use serde::{Deserialize, Serialize}; /// The tokenizer module contains all of the tools used to process /// text in `tantivy`. @@ -39,7 +39,7 @@ impl Default for Token { #[derive(Clone)] pub struct TokenStream<'a, I> { tokens: I, - filters: Vec>, + transformers: Vec>, } impl<'a, I> Iterator for TokenStream<'a, I> @@ -48,12 +48,11 @@ where { type Item = I::Item; fn next(&mut self) -> Option { - while let Some(token) = self.tokens.next() { - if self.filters.all(|filter| filter(&token)) { - return Some(token); - } + let token = self.tokens.next()?; + for transformer in self.tranformers.iter_mut() { + token = transformer.transform(token)?; } - None + Some(token) } } @@ -68,11 +67,10 @@ where pub fn new>( tokenizer: T, text: &str, - token_filters: Vec>, ) -> TokenStream<'a, I> { TokenStream { tokens: tokenizer.token_stream(text), - token_filters, + transformers: vec![], } } @@ -92,7 +90,7 @@ where /// .filter(Stemmer::default()); /// ``` /// - pub fn filter(mut self, token_filter: F) -> Self { + pub fn filter(mut self, token_filter: F) -> Self { self.token_filters.push(Box::new(token_filter)); self } @@ -142,23 +140,35 @@ pub trait Tokenizer<'a>: 'static + Send + Sync + Clone { debug_assert!(!texts.is_empty()); let mut streams_with_offsets = vec![]; let mut total_offset = 0; - for &text in texts { - streams_with_offsets.push((self.token_stream(text), total_offset)); - total_offset += text.len(); - } - TokenStreamChain::new(streams_with_offsets) + // for &text in texts { + // streams_with_offsets.push((self.token_stream(text), total_offset)); + // total_offset += text.len(); + // } + let streams_with_offsets = texts.iter().scan(0,|total_offset, &text| { + let temp = *total_offset; + *total_offset += text.len(); + Some((self.token_stream(text), temp)) + }); + + // { + // streams_with_offsets.push((self.token_stream(text), total_offset)); + // total_offset += text.len(); + // } + Chain::new(streams_with_offsets) } } /// Trait for the pluggable components of `Tokenizer`s. -pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {} - -pub trait TokenFilterClone { - fn box_clone(&self) -> Box; +pub trait Transformer: 'static + Send + Sync + TransformerClone { + fn transform(&mut self, token: Token) -> Option; } -impl TokenFilterClone for T { - fn box_clone(&self) -> Box { +pub trait TransformerClone { + fn box_clone(&self) -> Box; +} + +impl TransformerClone for T { + fn box_clone(&self) -> Box { Box::new(self.clone()) } }