Compare commits

...

1 Commit

Author SHA1 Message Date
Paul Masurel
4d3e9bc177 proof of concept for dynamic tokenizer. 2023-06-29 17:23:40 +09:00
4 changed files with 43 additions and 9 deletions

View File

@@ -1,5 +1,6 @@
use columnar::MonotonicallyMappableToU64;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
@@ -209,7 +210,7 @@ impl SegmentWriter {
for value in values {
let mut token_stream = match value {
Value::PreTokStr(tok_str) => {
PreTokenizedStream::from(tok_str.clone()).into()
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
}
Value::Str(ref text) => {
let text_analyzer =

View File

@@ -5,7 +5,7 @@ use crate::query::bm25::idf;
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
use crate::tokenizer::{
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
};
use crate::{DocAddress, Result, Searcher, TantivyError};
@@ -206,8 +206,8 @@ impl MoreLikeThis {
for value in values {
match value {
Value::PreTokStr(tok_str) => {
let mut token_stream: BoxTokenStream =
PreTokenizedStream::from(tok_str.clone()).into();
let mut token_stream =
PreTokenizedStream::from(tok_str.clone());
token_stream.process(&mut |token| {
if !self.is_noise_word(token.text.clone()) {
let term = Term::from_field_text(field, &token.text);

View File

@@ -9,6 +9,26 @@ pub struct TextAnalyzer {
tokenizer: Box<dyn BoxableTokenizer>,
}
/// Forwards `Tokenizer` through the box, so a type-erased tokenizer can be
/// used anywhere a concrete `Tokenizer` is expected (e.g. as the input of
/// `TokenFilter::transform` in `add_filter`).
impl Tokenizer for Box<dyn BoxableTokenizer> {
    // The produced stream is type-erased as well.
    type TokenStream<'a> = BoxTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
        // Delegates to the object-safe `box_token_stream` method of the trait.
        self.box_token_stream(text)
    }
}
/// Manual `Clone` impl: `Clone` is not object-safe (it returns `Self`), so
/// cloning a boxed tokenizer is routed through the trait's `box_clone` method.
impl Clone for Box<dyn BoxableTokenizer> {
    fn clone(&self) -> Self {
        self.box_clone()
    }
}
/// Wraps an already-boxed tokenizer with `filter` and re-boxes the result.
///
/// This works because `Box<dyn BoxableTokenizer>` itself implements
/// `Tokenizer` (see impl above), so it can be fed to `TokenFilter::transform`.
/// NOTE(review): each added filter introduces one extra level of dynamic
/// dispatch per token stream — presumably acceptable for this proof of
/// concept; confirm before shipping.
fn add_filter<F: TokenFilter>(tokenizer: Box<dyn BoxableTokenizer>, filter: F) -> Box<dyn BoxableTokenizer> {
    let filtered_tokenizer = filter.transform(tokenizer);
    Box::new(filtered_tokenizer)
}
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
trait BoxableTokenizer: 'static + Send + Sync {
/// Creates a boxed token stream for a given `str`.
@@ -19,7 +39,7 @@ trait BoxableTokenizer: 'static + Send + Sync {
impl<T: Tokenizer> BoxableTokenizer for T {
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
self.token_stream(text).into()
BoxTokenStream::new(self.token_stream(text))
}
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
Box::new(self.clone())

View File

@@ -63,10 +63,22 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
///
/// Erases the concrete `TokenStream` type; `'a` is the lifetime of the
/// borrows backing the stream (the tokenizer and the input text, per
/// `box_token_stream`'s signature).
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
impl<'a, T> From<T> for BoxTokenStream<'a>
where T: TokenStream + 'a
{
fn from(token_stream: T) -> BoxTokenStream<'a> {
/// `TokenStream` by plain delegation to the boxed inner stream.
impl<'a> TokenStream for BoxTokenStream<'a> {
    fn advance(&mut self) -> bool {
        self.0.advance()
    }

    fn token(&self) -> &Token {
        self.0.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.0.token_mut()
    }
}
impl<'a> BoxTokenStream<'a> {
    /// Boxes a concrete `TokenStream`, erasing its type.
    ///
    /// Replaces the blanket `From<T>` conversion removed by this change with
    /// an explicit, named constructor.
    pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
        BoxTokenStream(Box::new(token_stream))
    }
}
@@ -145,6 +157,7 @@ pub trait TokenFilter: 'static + Send + Sync {
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
}
#[cfg(test)]
mod test {
use super::*;