Compare commits

...

1 commit

Author SHA1 Message Date
Paul Masurel
4d3e9bc177 proof of concept for dynamic tokenizer. 2023-06-29 17:23:40 +09:00
4 changed files with 43 additions and 9 deletions

View File

@@ -1,5 +1,6 @@
 use columnar::MonotonicallyMappableToU64;
 use itertools::Itertools;
+use tokenizer_api::BoxTokenStream;
 use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
 use super::operation::AddOperation;
@@ -209,7 +210,7 @@ impl SegmentWriter {
         for value in values {
             let mut token_stream = match value {
                 Value::PreTokStr(tok_str) => {
-                    PreTokenizedStream::from(tok_str.clone()).into()
+                    BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
                 }
                 Value::Str(ref text) => {
                     let text_analyzer =

View File

@@ -5,7 +5,7 @@ use crate::query::bm25::idf;
 use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
 use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
 use crate::tokenizer::{
-    BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
+    FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
 };
 use crate::{DocAddress, Result, Searcher, TantivyError};
@@ -206,8 +206,8 @@ impl MoreLikeThis {
         for value in values {
             match value {
                 Value::PreTokStr(tok_str) => {
-                    let mut token_stream: BoxTokenStream =
-                        PreTokenizedStream::from(tok_str.clone()).into();
+                    let mut token_stream =
+                        PreTokenizedStream::from(tok_str.clone());
                     token_stream.process(&mut |token| {
                         if !self.is_noise_word(token.text.clone()) {
                             let term = Term::from_field_text(field, &token.text);

View File

@@ -9,6 +9,26 @@ pub struct TextAnalyzer {
     tokenizer: Box<dyn BoxableTokenizer>,
 }
+
+impl Tokenizer for Box<dyn BoxableTokenizer> {
+    type TokenStream<'a> = BoxTokenStream<'a>;
+
+    fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
+        self.box_token_stream(text)
+    }
+}
+
+impl Clone for Box<dyn BoxableTokenizer> {
+    fn clone(&self) -> Self {
+        self.box_clone()
+    }
+}
+
+fn add_filter<F: TokenFilter>(tokenizer: Box<dyn BoxableTokenizer>, filter: F) -> Box<dyn BoxableTokenizer> {
+    let filtered_tokenizer = filter.transform(tokenizer);
+    Box::new(filtered_tokenizer)
+}
 /// A boxable `Tokenizer`, with its `TokenStream` type erased.
 trait BoxableTokenizer: 'static + Send + Sync {
     /// Creates a boxed token stream for a given `str`.
@@ -19,7 +39,7 @@ trait BoxableTokenizer: 'static + Send + Sync {
 impl<T: Tokenizer> BoxableTokenizer for T {
     fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
-        self.token_stream(text).into()
+        BoxTokenStream::new(self.token_stream(text))
     }
     fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
         Box::new(self.clone())

View File

@@ -63,10 +63,22 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
 /// Simple wrapper of `Box<dyn TokenStream + 'a>`.
 pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
+impl<'a> TokenStream for BoxTokenStream<'a> {
+    fn advance(&mut self) -> bool {
+        self.0.advance()
+    }
+    fn token(&self) -> &Token {
+        self.0.token()
+    }
+    fn token_mut(&mut self) -> &mut Token {
+        self.0.token_mut()
+    }
+}
+
+impl<'a> BoxTokenStream<'a> {
+    pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
         BoxTokenStream(Box::new(token_stream))
     }
 }
@@ -145,6 +157,7 @@ pub trait TokenFilter: 'static + Send + Sync {
     fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
 }
 #[cfg(test)]
 mod test {
     use super::*;