From ad4c940fa350877cd3f2bda14077123c585264ec Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Thu, 29 Jun 2023 17:23:40 +0900
Subject: [PATCH] proof of concept for dynamic tokenizer.

---
 src/indexer/segment_writer.rs              |  3 ++-
 src/query/more_like_this/more_like_this.rs |  6 +++---
 src/tokenizer/tokenizer.rs                 | 22 +++++++++++++++++++++-
 tokenizer-api/src/lib.rs                   | 21 +++++++++++++++++----
 4 files changed, 43 insertions(+), 9 deletions(-)

diff --git a/src/indexer/segment_writer.rs b/src/indexer/segment_writer.rs
index bad8638e5..dd8ce3b55 100644
--- a/src/indexer/segment_writer.rs
+++ b/src/indexer/segment_writer.rs
@@ -1,5 +1,6 @@
 use columnar::MonotonicallyMappableToU64;
 use itertools::Itertools;
+use tokenizer_api::BoxTokenStream;
 
 use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
 use super::operation::AddOperation;
@@ -209,7 +210,7 @@ impl SegmentWriter {
         for value in values {
             let mut token_stream = match value {
                 Value::PreTokStr(tok_str) => {
-                    PreTokenizedStream::from(tok_str.clone()).into()
+                    BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
                 }
                 Value::Str(ref text) => {
                     let text_analyzer =
diff --git a/src/query/more_like_this/more_like_this.rs b/src/query/more_like_this/more_like_this.rs
index 994dd96c0..c0bfe552f 100644
--- a/src/query/more_like_this/more_like_this.rs
+++ b/src/query/more_like_this/more_like_this.rs
@@ -5,7 +5,7 @@ use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
 use crate::tokenizer::{
-    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
+    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
 };
 
 use crate::{DocAddress, Result, Searcher, TantivyError};
@@ -206,8 +206,8 @@ impl MoreLikeThis {
         for value in values {
             match value {
                 Value::PreTokStr(tok_str) => {
-                    let mut token_stream: BoxTokenStream =
-                        PreTokenizedStream::from(tok_str.clone()).into();
+                    let mut token_stream =
+                        PreTokenizedStream::from(tok_str.clone());
                     token_stream.process(&mut |token| {
                         if !self.is_noise_word(token.text.clone()) {
                             let term = Term::from_field_text(field, &token.text);
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index ccab6cda7..0ce418ae8 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -9,6 +9,26 @@ pub struct TextAnalyzer {
     tokenizer: Box<dyn BoxableTokenizer>,
 }
 
+impl Tokenizer for Box<dyn BoxableTokenizer> {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.box_token_stream(text)
+    }
+}
+
+impl Clone for Box<dyn BoxableTokenizer> {
+    fn clone(&self) -> Self {
+        self.box_clone()
+    }
+}
+
+fn add_filter<F: TokenFilter>(tokenizer: Box<dyn BoxableTokenizer>, filter: F) -> Box<dyn BoxableTokenizer> {
+    let filtered_tokenizer = filter.transform(tokenizer);
+    Box::new(filtered_tokenizer)
+}
+
+
 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
 trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
@@ -19,7 +39,7 @@ trait BoxableTokenizer: 'static + Send + Sync {
 
 impl<T: Tokenizer> BoxableTokenizer for T {
     fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.token_stream(text).into()
+        BoxTokenStream::new(self.token_stream(text))
     }
     fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
         Box::new(self.clone())
diff --git a/tokenizer-api/src/lib.rs b/tokenizer-api/src/lib.rs
index adb37a0b4..70badf165 100644
--- a/tokenizer-api/src/lib.rs
+++ b/tokenizer-api/src/lib.rs
@@ -63,10 +63,22 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
 
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
+impl<'a> TokenStream for BoxTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.0.advance()
+    }
+
+    fn token(&self) -> &Token {
+        self.0.token()
+    }
+
+    fn token_mut(&mut self) -> &mut Token {
+        self.0.token_mut()
+    }
+}
+
+impl<'a> BoxTokenStream<'a> {
+    pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
         BoxTokenStream(Box::new(token_stream))
     }
 }
@@ -145,6 +157,7 @@ pub trait TokenFilter: 'static + Send + Sync {
     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer;
 }
 
+
 #[cfg(test)]
 mod test {
     use super::*;