mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-30 15:10:40 +00:00
Reformulate as Iterators, Checkpoint 2. Finished, now bubble up changes.
This commit is contained in:
@@ -42,8 +42,7 @@ fn load_metas(
|
||||
META_FILEPATH.to_path_buf(),
|
||||
format!("Meta file cannot be deserialized. {:?}.", e),
|
||||
)
|
||||
})
|
||||
.map_err(From::from)
|
||||
})?
|
||||
}
|
||||
|
||||
/// Search Index
|
||||
@@ -119,13 +118,12 @@ impl Index {
|
||||
return Index::create(dir, schema);
|
||||
}
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == schema {
|
||||
Ok(index)
|
||||
} else {
|
||||
Err(TantivyError::SchemaError(
|
||||
if index.schema() != schema {
|
||||
return Err(TantivyError::SchemaError(
|
||||
"An index exists but the schema does not match.".to_string(),
|
||||
))
|
||||
));
|
||||
}
|
||||
Ok(index)
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
|
||||
@@ -3,39 +3,37 @@ use crate::tokenizer::{Token, TokenStream};
|
||||
const POSITION_GAP: usize = 2;
|
||||
|
||||
pub(crate) struct Chain<'a, I> {
|
||||
streams_with_offsets: Vec<(I, usize)>,
|
||||
stream_idx: usize,
|
||||
streams_with_offsets: I,
|
||||
position_shift: usize,
|
||||
}
|
||||
|
||||
impl<'a, I> Chain<'a, I>
|
||||
where
|
||||
I: Iterator<Item = Token>,
|
||||
{
|
||||
pub fn new(streams_with_offsets: Vec<(I, usize)>) -> Chain<'a, I> {
|
||||
impl<'a, Out> Chain<'a, Out> {
|
||||
pub fn new<In>(streams_with_offsets: Out) -> Chain<'a, Out>
|
||||
where
|
||||
In: Iterator<Item = Token>,
|
||||
Out: Iterator<Item = In>,
|
||||
{
|
||||
Chain {
|
||||
streams_with_offsets,
|
||||
stream_idx: 0,
|
||||
position_shift: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for Chain<'a, I>
|
||||
impl<'a, In, Out> Iterator for Chain<'a, Out>
|
||||
where
|
||||
I: Iterator<Item = Token>,
|
||||
In: Iterator<Item = Token>,
|
||||
Out: Iterator<Item = In>,
|
||||
{
|
||||
type Item = Token;
|
||||
fn next(&mut self) -> Option<Token> {
|
||||
while self.stream_idx < self.streams_with_offsets.len() {
|
||||
let (ref mut token_stream, offset_offset) = self.streams_with_offsets[self.stream_idx];
|
||||
while let Some((ref mut token_stream, offset_offset)) = self.streams_with_offsets.next() {
|
||||
if let Some(token) = token_stream.next() {
|
||||
token.offset_from += offset_offset;
|
||||
token.offset_to += offset_offset;
|
||||
token.position += self.position_shift;
|
||||
return Some(token);
|
||||
} else {
|
||||
self.stream_idx += 1;
|
||||
self.position_shift = self.token.position.wrapping_add(POSITION_GAP);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use crate::tokenizer::TokenStreamChain;
|
||||
use crate::tokenizer::Chain;
|
||||
use serde::{Deserialize, Serialize};
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
@@ -39,7 +39,7 @@ impl Default for Token {
|
||||
#[derive(Clone)]
|
||||
pub struct TokenStream<'a, I> {
|
||||
tokens: I,
|
||||
filters: Vec<Box<dyn TokenFilter>>,
|
||||
transformers: Vec<Box<dyn Transformer>>,
|
||||
}
|
||||
|
||||
impl<'a, I> Iterator for TokenStream<'a, I>
|
||||
@@ -48,12 +48,11 @@ where
|
||||
{
|
||||
type Item = I::Item;
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
while let Some(token) = self.tokens.next() {
|
||||
if self.filters.all(|filter| filter(&token)) {
|
||||
return Some(token);
|
||||
}
|
||||
let token = self.tokens.next()?;
|
||||
for transformer in self.tranformers.iter_mut() {
|
||||
token = transformer.transform(token)?;
|
||||
}
|
||||
None
|
||||
Some(token)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,11 +67,10 @@ where
|
||||
pub fn new<T: Tokenizer<'a, Iter = I>>(
|
||||
tokenizer: T,
|
||||
text: &str,
|
||||
token_filters: Vec<Box<dyn TokenFilter>>,
|
||||
) -> TokenStream<'a, I> {
|
||||
TokenStream {
|
||||
tokens: tokenizer.token_stream(text),
|
||||
token_filters,
|
||||
transformers: vec![],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -92,7 +90,7 @@ where
|
||||
/// .filter(Stemmer::default());
|
||||
/// ```
|
||||
///
|
||||
pub fn filter<F: TokenFilter>(mut self, token_filter: F) -> Self {
|
||||
pub fn filter<F: Transformer>(mut self, token_filter: F) -> Self {
|
||||
self.token_filters.push(Box::new(token_filter));
|
||||
self
|
||||
}
|
||||
@@ -142,23 +140,35 @@ pub trait Tokenizer<'a>: 'static + Send + Sync + Clone {
|
||||
debug_assert!(!texts.is_empty());
|
||||
let mut streams_with_offsets = vec![];
|
||||
let mut total_offset = 0;
|
||||
for &text in texts {
|
||||
streams_with_offsets.push((self.token_stream(text), total_offset));
|
||||
total_offset += text.len();
|
||||
}
|
||||
TokenStreamChain::new(streams_with_offsets)
|
||||
// for &text in texts {
|
||||
// streams_with_offsets.push((self.token_stream(text), total_offset));
|
||||
// total_offset += text.len();
|
||||
// }
|
||||
let streams_with_offsets = texts.iter().scan(0,|total_offset, &text| {
|
||||
let temp = *total_offset;
|
||||
*total_offset += text.len();
|
||||
Some((self.token_stream(text), temp))
|
||||
});
|
||||
|
||||
// {
|
||||
// streams_with_offsets.push((self.token_stream(text), total_offset));
|
||||
// total_offset += text.len();
|
||||
// }
|
||||
Chain::new(streams_with_offsets)
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for the pluggable components of `Tokenizer`s.
|
||||
pub trait TokenFilter: Fn(&Token) -> bool + 'static + Send + Sync + TokenFilterClone {}
|
||||
|
||||
pub trait TokenFilterClone {
|
||||
fn box_clone(&self) -> Box<dyn TokenFilter>;
|
||||
pub trait Transformer: 'static + Send + Sync + TransformerClone {
|
||||
fn transform(&mut self, token: Token) -> Option<Token>;
|
||||
}
|
||||
|
||||
impl<T: TokenFilter + Clone> TokenFilterClone for T {
|
||||
fn box_clone(&self) -> Box<dyn TokenFilter> {
|
||||
pub trait TransformerClone {
|
||||
fn box_clone(&self) -> Box<dyn Transformer>;
|
||||
}
|
||||
|
||||
impl<T: Transformer + Clone> TransformerClone for T {
|
||||
fn box_clone(&self) -> Box<dyn Transformer> {
|
||||
Box::new(self.clone())
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user