mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
Compare commits
1 Commits
raphael_op
...
dynamic-to
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4d3e9bc177 |
@@ -1,5 +1,6 @@
|
|||||||
use columnar::MonotonicallyMappableToU64;
|
use columnar::MonotonicallyMappableToU64;
|
||||||
use itertools::Itertools;
|
use itertools::Itertools;
|
||||||
|
use tokenizer_api::BoxTokenStream;
|
||||||
|
|
||||||
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
|
||||||
use super::operation::AddOperation;
|
use super::operation::AddOperation;
|
||||||
@@ -209,7 +210,7 @@ impl SegmentWriter {
|
|||||||
for value in values {
|
for value in values {
|
||||||
let mut token_stream = match value {
|
let mut token_stream = match value {
|
||||||
Value::PreTokStr(tok_str) => {
|
Value::PreTokStr(tok_str) => {
|
||||||
PreTokenizedStream::from(tok_str.clone()).into()
|
BoxTokenStream::new(PreTokenizedStream::from(tok_str.clone()))
|
||||||
}
|
}
|
||||||
Value::Str(ref text) => {
|
Value::Str(ref text) => {
|
||||||
let text_analyzer =
|
let text_analyzer =
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use crate::query::bm25::idf;
|
|||||||
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
|
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
|
||||||
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
|
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
|
||||||
use crate::tokenizer::{
|
use crate::tokenizer::{
|
||||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
|
FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
|
||||||
};
|
};
|
||||||
use crate::{DocAddress, Result, Searcher, TantivyError};
|
use crate::{DocAddress, Result, Searcher, TantivyError};
|
||||||
|
|
||||||
@@ -206,8 +206,8 @@ impl MoreLikeThis {
|
|||||||
for value in values {
|
for value in values {
|
||||||
match value {
|
match value {
|
||||||
Value::PreTokStr(tok_str) => {
|
Value::PreTokStr(tok_str) => {
|
||||||
let mut token_stream: BoxTokenStream =
|
let mut token_stream =
|
||||||
PreTokenizedStream::from(tok_str.clone()).into();
|
PreTokenizedStream::from(tok_str.clone());
|
||||||
token_stream.process(&mut |token| {
|
token_stream.process(&mut |token| {
|
||||||
if !self.is_noise_word(token.text.clone()) {
|
if !self.is_noise_word(token.text.clone()) {
|
||||||
let term = Term::from_field_text(field, &token.text);
|
let term = Term::from_field_text(field, &token.text);
|
||||||
|
|||||||
@@ -9,6 +9,26 @@ pub struct TextAnalyzer {
|
|||||||
tokenizer: Box<dyn BoxableTokenizer>,
|
tokenizer: Box<dyn BoxableTokenizer>,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl Tokenizer for Box<dyn BoxableTokenizer> {
|
||||||
|
type TokenStream<'a> = BoxTokenStream<'a>;
|
||||||
|
|
||||||
|
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||||
|
self.box_token_stream(text)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Clone for Box<dyn BoxableTokenizer> {
|
||||||
|
fn clone(&self) -> Self {
|
||||||
|
self.box_clone()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn add_filter<F: TokenFilter>(tokenizer: Box<dyn BoxableTokenizer>, filter: F) -> Box<dyn BoxableTokenizer> {
|
||||||
|
let filtered_tokenizer = filter.transform(tokenizer);
|
||||||
|
Box::new(filtered_tokenizer)
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||||
trait BoxableTokenizer: 'static + Send + Sync {
|
trait BoxableTokenizer: 'static + Send + Sync {
|
||||||
/// Creates a boxed token stream for a given `str`.
|
/// Creates a boxed token stream for a given `str`.
|
||||||
@@ -19,7 +39,7 @@ trait BoxableTokenizer: 'static + Send + Sync {
|
|||||||
|
|
||||||
impl<T: Tokenizer> BoxableTokenizer for T {
|
impl<T: Tokenizer> BoxableTokenizer for T {
|
||||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
|
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
|
||||||
self.token_stream(text).into()
|
BoxTokenStream::new(self.token_stream(text))
|
||||||
}
|
}
|
||||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
||||||
Box::new(self.clone())
|
Box::new(self.clone())
|
||||||
|
|||||||
@@ -63,10 +63,22 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
|
|||||||
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
||||||
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
||||||
|
|
||||||
impl<'a, T> From<T> for BoxTokenStream<'a>
|
impl<'a> TokenStream for BoxTokenStream<'a> {
|
||||||
where T: TokenStream + 'a
|
fn advance(&mut self) -> bool {
|
||||||
{
|
self.0.advance()
|
||||||
fn from(token_stream: T) -> BoxTokenStream<'a> {
|
}
|
||||||
|
|
||||||
|
fn token(&self) -> &Token {
|
||||||
|
self.0.token()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn token_mut(&mut self) -> &mut Token {
|
||||||
|
self.0.token_mut()
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'a> BoxTokenStream<'a> {
|
||||||
|
pub fn new<T: TokenStream + 'a>(token_stream: T) -> BoxTokenStream<'a> {
|
||||||
BoxTokenStream(Box::new(token_stream))
|
BoxTokenStream(Box::new(token_stream))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -145,6 +157,7 @@ pub trait TokenFilter: 'static + Send + Sync {
|
|||||||
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
|
fn transform<T: Tokenizer>(self, tokenizer: T) -> Self::Tokenizer<T>;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod test {
|
mod test {
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
Reference in New Issue
Block a user