move tokenizer API to separate crate (#1767)
Closes #1766. Finding tantivy tokenizers is currently a frustrating experience, since they need to be updated for each tantivy version. That is unnecessary, since the API is rather stable anyway.
@@ -61,6 +61,7 @@ tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
 tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
 common = { version= "0.5", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
+tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -106,7 +107,7 @@ unstable = [] # useful for benches.
 quickwit = ["sstable"]
 
 [workspace]
-members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar"]
+members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar", "tokenizer-api"]
 
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
README.md
@@ -29,7 +29,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
 # Features
 
 - Full-text search
-- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
+- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command-line tools
 - BM25 scoring (the same as Lucene)
@@ -42,12 +42,12 @@ Your mileage WILL vary depending on the nature of queries and their load.
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, and hierarchical facet fields
-- LZ4 compressed document store
+- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
 - JSON Field
-- Aggregation Collector: range buckets, average, and stats metrics
+- Aggregation Collector: histogram, range buckets, average, and stats metrics
 - LogMergePolicy with deletes
 - Searcher Warmer API
 - Cheesy logo with a horse
@@ -81,6 +81,10 @@ There are many ways to support this project.
 
 We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
 
+## Tokenizer
+
+When implementing a tokenizer for tantivy, depend on the `tantivy-tokenizer-api` crate.
+
 ## Minimum supported Rust version
 
 Tantivy currently requires at least Rust 1.62 or later to compile.
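To make the new workflow concrete, here is a minimal sketch of a third-party tokenizer crate written only against `tantivy-tokenizer-api` (Rust crate name `tantivy_tokenizer_api`). The `CommaTokenizer` name and its splitting rule are made up for illustration; the trait surface used is the one introduced by this commit.

    // Hypothetical third-party tokenizer: splits text on commas.
    // Depends only on tantivy-tokenizer-api, not on tantivy itself.
    use tantivy_tokenizer_api::{BoxTokenStream, Token, TokenStream, Tokenizer};

    #[derive(Clone)]
    pub struct CommaTokenizer;

    pub struct CommaTokenStream<'a> {
        text: &'a str,
        offset: usize,
        token: Token,
    }

    impl Tokenizer for CommaTokenizer {
        fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
            BoxTokenStream::from(CommaTokenStream {
                text,
                offset: 0,
                token: Token::default(),
            })
        }
    }

    impl<'a> TokenStream for CommaTokenStream<'a> {
        fn advance(&mut self) -> bool {
            if self.offset >= self.text.len() {
                return false;
            }
            let rest = &self.text[self.offset..];
            let end = rest.find(',').unwrap_or(rest.len());
            self.token = Token {
                offset_from: self.offset,
                offset_to: self.offset + end,
                // Token::default() starts at usize::MAX, so the first token gets position 0.
                position: self.token.position.wrapping_add(1),
                text: rest[..end].to_string(),
                position_length: 1,
            };
            // Skip past the comma (or past the end of the text).
            self.offset += end + 1;
            true
        }

        fn token(&self) -> &Token {
            &self.token
        }

        fn token_mut(&mut self) -> &mut Token {
            &mut self.token
        }
    }

Because `CommaTokenizer` derives `Clone`, the blanket `TokenizerClone` implementation in the API crate applies automatically.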
@@ -52,6 +52,8 @@
 //! remove their inflection. This tokenizer is slower than the default one,
 //! but is recommended to improve recall.
 //!
+//! # Custom tokenizer Library
+//! Avoid using tantivy as a dependency and prefer `tantivy-tokenizer-api` instead.
 //!
 //! # Custom tokenizers
 //!
@@ -134,6 +136,10 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;
 
+pub use tokenizer_api::{
+    BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
+};
+
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
@@ -146,9 +152,7 @@ pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{
-    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
-};
+pub use self::tokenizer::TextAnalyzer;
 pub use self::tokenizer_manager::TokenizerManager;
 pub use self::whitespace_tokenizer::WhitespaceTokenizer;
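Because of this re-export, downstream code can keep importing the tokenizer types from `tantivy::tokenizer` even though they now originate in `tantivy-tokenizer-api`. A small sketch, assuming the `tantivy` crate as a dependency (the `collect_tokens` helper is illustrative, not part of the API):

    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    // Works with any tokenizer, whether it is defined in tantivy or in a
    // third-party crate that only depends on tantivy-tokenizer-api.
    fn collect_tokens<T: Tokenizer>(tokenizer: &T, text: &str) -> Vec<Token> {
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        while let Some(token) = stream.next() {
            tokens.push(token.clone());
        }
        tokens
    }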
@@ -303,8 +303,7 @@ mod tests {
 
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::Tokenizer;
-    use crate::tokenizer::{BoxTokenStream, Token};
+    use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};
 
     fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
        let mut tokens: Vec<Token> = vec![];
@@ -1,42 +1,9 @@
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use std::borrow::{Borrow, BorrowMut};
-use std::ops::{Deref, DerefMut};
-
-use serde::{Deserialize, Serialize};
+use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
-/// Token
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-pub struct Token {
-    /// Offset (byte index) of the first character of the token.
-    /// Offsets shall not be modified by token filters.
-    pub offset_from: usize,
-    /// Offset (byte index) of the last character of the token + 1.
-    /// The text that generated the token should be obtained by
-    /// &text[token.offset_from..token.offset_to]
-    pub offset_to: usize,
-    /// Position, expressed in number of tokens.
-    pub position: usize,
-    /// Actual text content of the token.
-    pub text: String,
-    /// Is the length expressed in term of number of original tokens.
-    pub position_length: usize,
-}
-
-impl Default for Token {
-    fn default() -> Token {
-        Token {
-            offset_from: 0,
-            offset_to: 0,
-            position: usize::MAX,
-            text: String::with_capacity(200),
-            position_length: 1,
-        }
-    }
-}
-
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
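The `TextAnalyzer` described above stays in tantivy itself; only the traits it builds on move to the API crate. As a quick illustration of that wrapping, mirroring the doc example kept further down in this diff and assuming the `tantivy` crate:

    use tantivy::tokenizer::*;

    fn main() {
        // A TextAnalyzer wraps one Tokenizer plus a chain of TokenFilters,
        // applied in the order they are added.
        let analyzer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);
        let mut token_stream = analyzer.token_stream("Hello, happy tax payer");
        while let Some(token) = token_stream.next() {
            println!("{:?} at bytes {}..{}", token.text, token.offset_from, token.offset_to);
        }
    }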
@@ -112,200 +79,3 @@ impl Clone for TextAnalyzer {
         }
     }
 }
-
-/// `Tokenizer` are in charge of splitting text into a stream of token
-/// before indexing.
-///
-/// See the [module documentation](crate::tokenizer) for more detail.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
-    /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
-}
-
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
-}
-
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
-        Box::new(self.clone())
-    }
-}
-
-impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
-    fn advance(&mut self) -> bool {
-        let token_stream: &mut dyn TokenStream = self.borrow_mut();
-        token_stream.advance()
-    }
-
-    fn token<'b>(&'b self) -> &'b Token {
-        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
-        token_stream.token()
-    }
-
-    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
-        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
-        token_stream.token_mut()
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
-///
-/// See [`TokenStream`] for more information.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
-        BoxTokenStream(Box::new(token_stream))
-    }
-}
-
-impl<'a> Deref for BoxTokenStream<'a> {
-    type Target = dyn TokenStream + 'a;
-
-    fn deref(&self) -> &Self::Target {
-        &*self.0
-    }
-}
-impl<'a> DerefMut for BoxTokenStream<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut *self.0
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
-///
-/// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
-
-impl Deref for BoxTokenFilter {
-    type Target = dyn TokenFilter;
-
-    fn deref(&self) -> &dyn TokenFilter {
-        &*self.0
-    }
-}
-
-impl<T: TokenFilter> From<T> for BoxTokenFilter {
-    fn from(tokenizer: T) -> BoxTokenFilter {
-        BoxTokenFilter(Box::new(tokenizer))
-    }
-}
-
-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-pub trait TokenStream {
-    /// Advance to the next token
-    ///
-    /// Returns false if there are no other tokens.
-    fn advance(&mut self) -> bool;
-
-    /// Returns a reference to the current token.
-    fn token(&self) -> &Token;
-
-    /// Returns a mutable reference to the current token.
-    fn token_mut(&mut self) -> &mut Token;
-
-    /// Helper to iterate over tokens. It
-    /// simply combines a call to `.advance()`
-    /// and `.token()`.
-    ///
-    /// ```
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser);
-    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-    /// while let Some(token) = token_stream.next() {
-    ///     println!("Token {:?}", token.text);
-    /// }
-    /// ```
-    fn next(&mut self) -> Option<&Token> {
-        if self.advance() {
-            Some(self.token())
-        } else {
-            None
-        }
-    }
-
-    /// Helper function to consume the entire `TokenStream`
-    /// and push the tokens to a sink function.
-    ///
-    /// Remove this.
-    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
-        while self.advance() {
-            sink(self.token());
-        }
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> BoxTokenFilter;
-}
-
-/// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> BoxTokenFilter {
-        BoxTokenFilter::from(self.clone())
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::Token;
-
-    #[test]
-    fn clone() {
-        let t1 = Token {
-            position: 1,
-            offset_from: 2,
-            offset_to: 3,
-            text: "abc".to_string(),
-            position_length: 1,
-        };
-        let t2 = t1.clone();
-
-        assert_eq!(t1.position, t2.position);
-        assert_eq!(t1.offset_from, t2.offset_from);
-        assert_eq!(t1.offset_to, t2.offset_to);
-        assert_eq!(t1.text, t2.text);
-    }
-}
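The `TokenStream::next` and `process` helpers removed here reappear unchanged in the `tokenizer-api` crate below. A short sketch of the `process` sink helper, assuming the `tantivy` crate and its built-in `SimpleTokenizer` and `LowerCaser`:

    use tantivy::tokenizer::*;

    fn main() {
        let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
        let mut token_stream = analyzer.token_stream("Hello, happy tax payer");

        // `process` advances the stream to the end and hands every token to the sink.
        let mut texts: Vec<String> = Vec::new();
        token_stream.process(&mut |token: &Token| texts.push(token.text.clone()));
        assert_eq!(texts, vec!["hello", "happy", "tax", "payer"]);
    }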
tokenizer-api/Cargo.toml (new file)
@@ -0,0 +1,10 @@
+[package]
+name = "tantivy-tokenizer-api"
+version = "0.1.0"
+edition = "2021"
+description = "Tokenizer API of tantivy"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde = { version = "1.0.152", features = ["derive"] }
tokenizer-api/README.md (new file)
@@ -0,0 +1,6 @@
+
+# Tokenizer-API
+
+An API to interface a tokenizer with tantivy.
+
+The API will be kept stable in order to not break support for existing tokenizers.
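On the tantivy side, a tokenizer built against this API is plugged in through the `TokenizerManager`, in the same way tantivy's own tokenizers are registered today. A minimal sketch, assuming tantivy's existing `TokenizerManager::register`/`get` methods and the built-in `NgramTokenizer`:

    use tantivy::tokenizer::{NgramTokenizer, TokenizerManager};

    fn main() {
        // The manager maps names (referenced from the schema) to tokenizer instances.
        let manager = TokenizerManager::default();
        manager.register("ngram3", NgramTokenizer::new(3, 3, false));
        assert!(manager.get("ngram3").is_some());
    }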
tokenizer-api/src/lib.rs (new file)
@@ -0,0 +1,197 @@
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
+//! for each new tantivy version.
+//!
+//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
+//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
+
+use std::borrow::{Borrow, BorrowMut};
+use std::ops::{Deref, DerefMut};
+
+use serde::{Deserialize, Serialize};
+
+/// Token
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct Token {
+    /// Offset (byte index) of the first character of the token.
+    /// Offsets shall not be modified by token filters.
+    pub offset_from: usize,
+    /// Offset (byte index) of the last character of the token + 1.
+    /// The text that generated the token should be obtained by
+    /// &text[token.offset_from..token.offset_to]
+    pub offset_to: usize,
+    /// Position, expressed in number of tokens.
+    pub position: usize,
+    /// Actual text content of the token.
+    pub text: String,
+    /// Is the length expressed in terms of number of original tokens.
+    pub position_length: usize,
+}
+
+impl Default for Token {
+    fn default() -> Token {
+        Token {
+            offset_from: 0,
+            offset_to: 0,
+            position: usize::MAX,
+            text: String::with_capacity(200),
+            position_length: 1,
+        }
+    }
+}
+
+/// `Tokenizer`s are in charge of splitting text into a stream of tokens
+/// before indexing.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
+pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+    /// Creates a token stream for a given `str`.
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+}
+
+pub trait TokenizerClone {
+    fn box_clone(&self) -> Box<dyn Tokenizer>;
+}
+
+impl<T: Tokenizer + Clone> TokenizerClone for T {
+    fn box_clone(&self) -> Box<dyn Tokenizer> {
+        Box::new(self.clone())
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
+///
+/// See [`TokenStream`] for more information.
+pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+
+impl<'a, T> From<T> for BoxTokenStream<'a>
+where T: TokenStream + 'a
+{
+    fn from(token_stream: T) -> BoxTokenStream<'a> {
+        BoxTokenStream(Box::new(token_stream))
+    }
+}
+
+impl<'a> Deref for BoxTokenStream<'a> {
+    type Target = dyn TokenStream + 'a;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.0
+    }
+}
+impl<'a> DerefMut for BoxTokenStream<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut *self.0
+    }
+}
+
+impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
+    fn advance(&mut self) -> bool {
+        let token_stream: &mut dyn TokenStream = self.borrow_mut();
+        token_stream.advance()
+    }
+
+    fn token<'b>(&'b self) -> &'b Token {
+        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
+        token_stream.token()
+    }
+
+    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
+        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
+        token_stream.token_mut()
+    }
+}
+
+/// `TokenStream` is the result of the tokenization.
+///
+/// It consists of a consumable stream of `Token`s.
+pub trait TokenStream {
+    /// Advance to the next token
+    ///
+    /// Returns false if there are no other tokens.
+    fn advance(&mut self) -> bool;
+
+    /// Returns a reference to the current token.
+    fn token(&self) -> &Token;
+
+    /// Returns a mutable reference to the current token.
+    fn token_mut(&mut self) -> &mut Token;
+
+    /// Helper to iterate over tokens. It
+    /// simply combines a call to `.advance()`
+    /// and `.token()`.
+    fn next(&mut self) -> Option<&Token> {
+        if self.advance() {
+            Some(self.token())
+        } else {
+            None
+        }
+    }
+
+    /// Helper function to consume the entire `TokenStream`
+    /// and push the tokens to a sink function.
+    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
+        while self.advance() {
+            sink(self.token());
+        }
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
+///
+/// See [`TokenFilter`] for more information.
+pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn TokenFilter;
+
+    fn deref(&self) -> &dyn TokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> BoxTokenFilter;
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+    /// Wraps a token stream and returns the modified one.
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
+}
+
+impl<T: TokenFilter + Clone> TokenFilterClone for T {
+    fn box_clone(&self) -> BoxTokenFilter {
+        BoxTokenFilter::from(self.clone())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn clone() {
+        let t1 = Token {
+            position: 1,
+            offset_from: 2,
+            offset_to: 3,
+            text: "abc".to_string(),
+            position_length: 1,
+        };
+        let t2 = t1.clone();
+
+        assert_eq!(t1.position, t2.position);
+        assert_eq!(t1.offset_from, t2.offset_from);
+        assert_eq!(t1.offset_to, t2.offset_to);
+        assert_eq!(t1.text, t2.text);
+    }
+}
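Token filters can also be written purely against this API crate. A minimal sketch of a `TokenFilter` that uppercases token text; the `UpperCaser` and `UpperCaserStream` names are hypothetical and only meant to show how `transform` wraps the underlying stream:

    use tantivy_tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream};

    #[derive(Clone)]
    pub struct UpperCaser;

    impl TokenFilter for UpperCaser {
        fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
            BoxTokenStream::from(UpperCaserStream { tail: token_stream })
        }
    }

    pub struct UpperCaserStream<'a> {
        tail: BoxTokenStream<'a>,
    }

    impl<'a> TokenStream for UpperCaserStream<'a> {
        fn advance(&mut self) -> bool {
            if !self.tail.advance() {
                return false;
            }
            // Rewrite the text of the underlying token in place.
            let text = self.tail.token().text.to_uppercase();
            self.tail.token_mut().text = text;
            true
        }

        fn token(&self) -> &Token {
            self.tail.token()
        }

        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }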