mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-27 20:42:54 +00:00
Compare commits
5 Commits
update_exa
...
fmassot/ad
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
dc783f8328 | ||
|
|
b82cd08f5d | ||
|
|
54f43135f2 | ||
|
|
6c6b97d4ef | ||
|
|
ad9b825067 |
@@ -19,6 +19,7 @@ oneshot = "0.1.5"
|
||||
base64 = "0.21.0"
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.3.2"
|
||||
dyn-clone = "1.0.11"
|
||||
once_cell = "1.10.0"
|
||||
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
|
||||
aho-corasick = "1.0"
|
||||
|
||||
@@ -209,7 +209,7 @@ impl SegmentWriter {
|
||||
for value in values {
|
||||
let mut token_stream = match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
PreTokenizedStream::from(tok_str.clone()).into()
|
||||
Box::new(PreTokenizedStream::from(tok_str.clone()))
|
||||
}
|
||||
Value::Str(ref text) => {
|
||||
let text_analyzer =
|
||||
|
||||
@@ -4,9 +4,7 @@ use std::collections::{BinaryHeap, HashMap};
|
||||
use crate::query::bm25::idf;
|
||||
use crate::query::{BooleanQuery, BoostQuery, Occur, Query, TermQuery};
|
||||
use crate::schema::{Field, FieldType, IndexRecordOption, Term, Value};
|
||||
use crate::tokenizer::{
|
||||
BoxTokenStream, FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer,
|
||||
};
|
||||
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TokenStream, Tokenizer};
|
||||
use crate::{DocAddress, Result, Searcher, TantivyError};
|
||||
|
||||
#[derive(Debug, PartialEq)]
|
||||
@@ -206,8 +204,7 @@ impl MoreLikeThis {
|
||||
for value in values {
|
||||
match value {
|
||||
Value::PreTokStr(tok_str) => {
|
||||
let mut token_stream: BoxTokenStream =
|
||||
PreTokenizedStream::from(tok_str.clone()).into();
|
||||
let mut token_stream = PreTokenizedStream::from(tok_str.clone());
|
||||
token_stream.process(&mut |token| {
|
||||
if !self.is_noise_word(token.text.clone()) {
|
||||
let term = Term::from_field_text(field, &token.text);
|
||||
|
||||
@@ -139,7 +139,7 @@ mod tokenizer;
|
||||
mod tokenizer_manager;
|
||||
mod whitespace_tokenizer;
|
||||
|
||||
pub use tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer};
|
||||
pub use tokenizer_api::{Token, TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
pub use self::alphanum_only::AlphaNumOnlyFilter;
|
||||
pub use self::ascii_folding_filter::AsciiFoldingFilter;
|
||||
@@ -154,7 +154,7 @@ pub use self::split_compound_words::SplitCompoundWords;
|
||||
pub use self::stemmer::{Language, Stemmer};
|
||||
pub use self::stop_word_filter::StopWordFilter;
|
||||
pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
|
||||
pub use self::tokenizer::{TextAnalyzer, TextAnalyzerBuilder};
|
||||
pub use self::tokenizer::{BoxTokenFilter, TextAnalyzer, TextAnalyzerBuilder};
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
pub use self::whitespace_tokenizer::WhitespaceTokenizer;
|
||||
|
||||
|
||||
@@ -1,36 +1,105 @@
|
||||
use dyn_clone::DynClone;
|
||||
/// The tokenizer module contains all of the tools used to process
|
||||
/// text in `tantivy`.
|
||||
use tokenizer_api::{BoxTokenStream, TokenFilter, Tokenizer};
|
||||
use tokenizer_api::{TokenFilter, TokenStream, Tokenizer};
|
||||
|
||||
use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
|
||||
|
||||
/// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
|
||||
#[derive(Clone)]
|
||||
pub struct TextAnalyzer {
|
||||
tokenizer: Box<dyn BoxableTokenizer>,
|
||||
}
|
||||
|
||||
/// A boxable `Tokenizer`, with its `TokenStream` type erased.
|
||||
trait BoxableTokenizer: 'static + Send + Sync {
|
||||
trait BoxableTokenizer: 'static + Send + Sync + DynClone {
|
||||
/// Creates a boxed token stream for a given `str`.
|
||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a>;
|
||||
/// Clone this tokenizer.
|
||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer>;
|
||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a>;
|
||||
}
|
||||
|
||||
impl<T: Tokenizer> BoxableTokenizer for T {
|
||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
self.token_stream(text).into()
|
||||
}
|
||||
fn box_clone(&self) -> Box<dyn BoxableTokenizer> {
|
||||
Box::new(self.clone())
|
||||
fn box_token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
Box::new(self.token_stream(text))
|
||||
}
|
||||
}
|
||||
|
||||
impl Clone for TextAnalyzer {
|
||||
fn clone(&self) -> Self {
|
||||
TextAnalyzer {
|
||||
tokenizer: self.tokenizer.box_clone(),
|
||||
dyn_clone::clone_trait_object!(BoxableTokenizer);
|
||||
|
||||
/// A boxed `BoxableTokenizer` which is a `Tokenizer` with its `TokenStream` type erased.
|
||||
#[derive(Clone)]
|
||||
struct BoxTokenizer(Box<dyn BoxableTokenizer>);
|
||||
|
||||
impl Tokenizer for BoxTokenizer {
|
||||
type TokenStream<'a> = Box<dyn TokenStream + 'a>;
|
||||
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a> {
|
||||
self.0.box_token_stream(text).into()
|
||||
}
|
||||
}
|
||||
|
||||
/// A boxable `TokenFilter`, with its `Tokenizer` type erased.
|
||||
trait BoxableTokenFilter: 'static + Send + Sync {
|
||||
/// Wraps a `BoxedTokenizer` and returns a new one.
|
||||
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer;
|
||||
}
|
||||
|
||||
impl<T: TokenFilter> BoxableTokenFilter for T {
|
||||
fn box_transform(&self, tokenizer: BoxTokenizer) -> BoxTokenizer {
|
||||
let tokenizer = self.clone().transform(tokenizer);
|
||||
BoxTokenizer(Box::new(tokenizer))
|
||||
}
|
||||
}
|
||||
|
||||
/// A boxed `BoxableTokenFilter` which is a `TokenFilter` with its `Tokenizer` type erased.
|
||||
pub struct BoxTokenFilter(Box<dyn BoxableTokenFilter>);
|
||||
|
||||
impl<T: TokenFilter> From<T> for BoxTokenFilter {
|
||||
fn from(tokenizer: T) -> BoxTokenFilter {
|
||||
BoxTokenFilter(Box::new(tokenizer))
|
||||
}
|
||||
}
|
||||
|
||||
impl TextAnalyzer {
|
||||
/// Builds a new `TextAnalyzer` given a tokenizer and a vector of `BoxTokenFilter`.
|
||||
///
|
||||
/// When creating a `TextAnalyzer` from a `Tokenizer` and a static set of `TokenFilter`,
|
||||
/// prefer using `TextAnalyzer::builder(tokenizer).filter(token_filter).build()` as it
|
||||
/// will be more performant and create less boxes.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// let en_stem = TextAnalyzer::build(
|
||||
/// SimpleTokenizer::default(),
|
||||
/// vec![
|
||||
/// BoxTokenFilter::from(RemoveLongFilter::limit(40)),
|
||||
/// BoxTokenFilter::from(LowerCaser),
|
||||
/// BoxTokenFilter::from(Stemmer::default()),
|
||||
/// ]);
|
||||
/// ```
|
||||
pub fn build<T: Tokenizer>(
|
||||
tokenizer: T,
|
||||
boxed_token_filters: Vec<BoxTokenFilter>,
|
||||
) -> TextAnalyzer {
|
||||
let mut boxed_tokenizer = BoxTokenizer(Box::new(tokenizer));
|
||||
for filter in boxed_token_filters.into_iter() {
|
||||
boxed_tokenizer = filter.0.box_transform(boxed_tokenizer);
|
||||
}
|
||||
TextAnalyzer {
|
||||
tokenizer: boxed_tokenizer.0,
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new TextAnalyzerBuilder
|
||||
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
|
||||
TextAnalyzerBuilder { tokenizer }
|
||||
}
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> Box<dyn TokenStream + 'a> {
|
||||
self.tokenizer.box_token_stream(text)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -46,20 +115,8 @@ impl<T: Tokenizer + Clone> From<T> for TextAnalyzer {
|
||||
}
|
||||
}
|
||||
|
||||
impl TextAnalyzer {
|
||||
/// Create a new TextAnalyzerBuilder
|
||||
pub fn builder<T: Tokenizer>(tokenizer: T) -> TextAnalyzerBuilder<T> {
|
||||
TextAnalyzerBuilder { tokenizer }
|
||||
}
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
pub fn token_stream<'a>(&'a mut self, text: &'a str) -> BoxTokenStream<'a> {
|
||||
self.tokenizer.box_token_stream(text)
|
||||
}
|
||||
}
|
||||
|
||||
/// Builder helper for [`TextAnalyzer`]
|
||||
pub struct TextAnalyzerBuilder<T> {
|
||||
pub struct TextAnalyzerBuilder<T: Tokenizer> {
|
||||
tokenizer: T,
|
||||
}
|
||||
|
||||
@@ -90,3 +147,37 @@ impl<T: Tokenizer> TextAnalyzerBuilder<T> {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use super::*;
|
||||
use crate::tokenizer::{AlphaNumOnlyFilter, LowerCaser, RemoveLongFilter, WhitespaceTokenizer};
|
||||
|
||||
#[test]
|
||||
fn test_text_analyzer_builder() {
|
||||
let mut analyzer = TextAnalyzer::builder(WhitespaceTokenizer::default())
|
||||
.filter(AlphaNumOnlyFilter)
|
||||
.filter(RemoveLongFilter::limit(6))
|
||||
.filter(LowerCaser)
|
||||
.build();
|
||||
let mut stream = analyzer.token_stream("- first bullet point");
|
||||
assert_eq!(stream.next().unwrap().text, "first");
|
||||
assert_eq!(stream.next().unwrap().text, "point");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_analyzer_with_filters_boxed() {
|
||||
let mut analyzer = TextAnalyzer::build(
|
||||
WhitespaceTokenizer::default(),
|
||||
vec![
|
||||
BoxTokenFilter::from(AlphaNumOnlyFilter),
|
||||
BoxTokenFilter::from(LowerCaser),
|
||||
BoxTokenFilter::from(RemoveLongFilter::limit(6)),
|
||||
],
|
||||
);
|
||||
let mut stream = analyzer.token_stream("- first bullet point");
|
||||
assert_eq!(stream.next().unwrap().text, "first");
|
||||
assert_eq!(stream.next().unwrap().text, "point");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -6,7 +6,6 @@
|
||||
//! Checkout the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
|
||||
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use std::ops::{Deref, DerefMut};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -60,30 +59,6 @@ pub trait Tokenizer: 'static + Clone + Send + Sync {
|
||||
fn token_stream<'a>(&'a mut self, text: &'a str) -> Self::TokenStream<'a>;
|
||||
}
|
||||
|
||||
/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
|
||||
pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
|
||||
|
||||
impl<'a, T> From<T> for BoxTokenStream<'a>
|
||||
where T: TokenStream + 'a
|
||||
{
|
||||
fn from(token_stream: T) -> BoxTokenStream<'a> {
|
||||
BoxTokenStream(Box::new(token_stream))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Deref for BoxTokenStream<'a> {
|
||||
type Target = dyn TokenStream + 'a;
|
||||
|
||||
fn deref(&self) -> &Self::Target {
|
||||
&*self.0
|
||||
}
|
||||
}
|
||||
impl<'a> DerefMut for BoxTokenStream<'a> {
|
||||
fn deref_mut(&mut self) -> &mut Self::Target {
|
||||
&mut *self.0
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
|
||||
fn advance(&mut self) -> bool {
|
||||
let token_stream: &mut dyn TokenStream = self.borrow_mut();
|
||||
@@ -137,7 +112,7 @@ pub trait TokenStream {
|
||||
}
|
||||
|
||||
/// Trait for the pluggable components of `Tokenizer`s.
|
||||
pub trait TokenFilter: 'static + Send + Sync {
|
||||
pub trait TokenFilter: 'static + Send + Sync + Clone {
|
||||
/// The Tokenizer type returned by this filter, typically parametrized by the underlying
|
||||
/// Tokenizer.
|
||||
type Tokenizer<T: Tokenizer>: Tokenizer;
|
||||
|
||||
Reference in New Issue
Block a user