move tokenizer API to separate crate (#1767)
Closes #1766. Finding tantivy tokenizers is currently a frustrating experience, since they need to be updated for each tantivy version. That is unnecessary, since the API is rather stable anyway.
@@ -61,6 +61,7 @@ tantivy-query-grammar = { version= "0.19.0", path="./query-grammar" }
 tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
 common = { version= "0.5", path = "./common/", package = "tantivy-common" }
 fastfield_codecs = { version= "0.3", path="./fastfield_codecs", default-features = false }
+tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
 
 [target.'cfg(windows)'.dependencies]
 winapi = "0.3.9"
@@ -106,7 +107,7 @@ unstable = [] # useful for benches.
 quickwit = ["sstable"]
 
 [workspace]
-members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar"]
+members = ["query-grammar", "bitpacker", "common", "fastfield_codecs", "ownedbytes", "stacker", "sstable", "columnar", "tokenizer-api"]
 
 # Following the "fail" crate best practises, we isolate
 # tests that define specific behavior in fail check points
README.md
@@ -29,7 +29,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
 # Features
 
 - Full-text search
-- Configurable tokenizer (stemming available for 17 Latin languages with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
+- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
 - Fast (check out the :racehorse: :sparkles: [benchmark](https://tantivy-search.github.io/bench/) :sparkles: :racehorse:)
 - Tiny startup time (<10ms), perfect for command-line tools
 - BM25 scoring (the same as Lucene)
@@ -42,12 +42,12 @@ Your mileage WILL vary depending on the nature of queries and their load.
 - Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
 - `&[u8]` fast fields
 - Text, i64, u64, f64, dates, and hierarchical facet fields
-- LZ4 compressed document store
+- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
 - Range queries
 - Faceted search
 - Configurable indexing (optional term frequency and position indexing)
 - JSON Field
-- Aggregation Collector: range buckets, average, and stats metrics
+- Aggregation Collector: histogram, range buckets, average, and stats metrics
 - LogMergePolicy with deletes
 - Searcher Warmer API
 - Cheesy logo with a horse
@@ -81,6 +81,10 @@ There are many ways to support this project.
 
 We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
 
+## Tokenizer
+
+When implementing a tokenizer for tantivy, depend on the `tantivy-tokenizer-api` crate.
+
 ## Minimum supported Rust version
 
 Tantivy currently requires at least Rust 1.62 or later to compile.
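To make the new workflow concrete, here is a minimal sketch of a third-party tokenizer crate written only against `tantivy-tokenizer-api` (Rust crate name `tantivy_tokenizer_api`). The `CommaTokenizer` name and its splitting rule are made up for illustration; the trait surface used is the one introduced by this commit.

    // Hypothetical third-party tokenizer: splits text on commas.
    // Depends only on tantivy-tokenizer-api, not on tantivy itself.
    use tantivy_tokenizer_api::{BoxTokenStream, Token, TokenStream, Tokenizer};

    #[derive(Clone)]
    pub struct CommaTokenizer;

    pub struct CommaTokenStream<'a> {
        text: &'a str,
        offset: usize,
        token: Token,
    }

    impl Tokenizer for CommaTokenizer {
        fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a> {
            BoxTokenStream::from(CommaTokenStream {
                text,
                offset: 0,
                token: Token::default(),
            })
        }
    }

    impl<'a> TokenStream for CommaTokenStream<'a> {
        fn advance(&mut self) -> bool {
            if self.offset >= self.text.len() {
                return false;
            }
            let rest = &self.text[self.offset..];
            let end = rest.find(',').unwrap_or(rest.len());
            self.token = Token {
                offset_from: self.offset,
                offset_to: self.offset + end,
                // Token::default() starts at usize::MAX, so the first token gets position 0.
                position: self.token.position.wrapping_add(1),
                text: rest[..end].to_string(),
                position_length: 1,
            };
            // Skip past the comma (or past the end of the text).
            self.offset += end + 1;
            true
        }

        fn token(&self) -> &Token {
            &self.token
        }

        fn token_mut(&mut self) -> &mut Token {
            &mut self.token
        }
    }

Because `CommaTokenizer` derives `Clone`, the blanket `TokenizerClone` implementation in the API crate applies automatically.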
@@ -52,6 +52,8 @@
 //! remove their inflection. This tokenizer is slower than the default one,
 //! but is recommended to improve recall.
 //!
+//! # Custom tokenizer Library
+//! Avoid using tantivy as a dependency and prefer `tantivy-tokenizer-api` instead.
 //!
 //! # Custom tokenizers
 //!
@@ -134,6 +136,10 @@ mod tokenizer;
 mod tokenizer_manager;
 mod whitespace_tokenizer;
 
+pub use tokenizer_api::{
+    BoxTokenFilter, BoxTokenStream, Token, TokenFilter, TokenStream, Tokenizer,
+};
+
 pub use self::alphanum_only::AlphaNumOnlyFilter;
 pub use self::ascii_folding_filter::AsciiFoldingFilter;
 pub use self::facet_tokenizer::FacetTokenizer;
@@ -146,9 +152,7 @@ pub use self::split_compound_words::SplitCompoundWords;
 pub use self::stemmer::{Language, Stemmer};
 pub use self::stop_word_filter::StopWordFilter;
 pub use self::tokenized_string::{PreTokenizedStream, PreTokenizedString};
-pub use self::tokenizer::{
-    BoxTokenFilter, BoxTokenStream, TextAnalyzer, Token, TokenFilter, TokenStream, Tokenizer,
-};
+pub use self::tokenizer::TextAnalyzer;
 pub use self::tokenizer_manager::TokenizerManager;
 pub use self::whitespace_tokenizer::WhitespaceTokenizer;
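Because of this re-export, downstream code can keep importing the tokenizer types from `tantivy::tokenizer` even though they now originate in `tantivy-tokenizer-api`. A small sketch, assuming the `tantivy` crate as a dependency (the `collect_tokens` helper is illustrative, not part of the API):

    use tantivy::tokenizer::{Token, TokenStream, Tokenizer};

    // Works with any tokenizer, whether it is defined in tantivy or in a
    // third-party crate that only depends on tantivy-tokenizer-api.
    fn collect_tokens<T: Tokenizer>(tokenizer: &T, text: &str) -> Vec<Token> {
        let mut stream = tokenizer.token_stream(text);
        let mut tokens = Vec::new();
        while let Some(token) = stream.next() {
            tokens.push(token.clone());
        }
        tokens
    }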
@@ -303,8 +303,7 @@ mod tests {
 
     use super::{utf8_codepoint_width, CodepointFrontiers, NgramTokenizer, StutteringIterator};
     use crate::tokenizer::tests::assert_token;
-    use crate::tokenizer::tokenizer::Tokenizer;
-    use crate::tokenizer::{BoxTokenStream, Token};
+    use crate::tokenizer::{BoxTokenStream, Token, Tokenizer};
 
     fn test_helper(mut tokenizer: BoxTokenStream) -> Vec<Token> {
        let mut tokens: Vec<Token> = vec![];
@@ -1,42 +1,9 @@
 /// The tokenizer module contains all of the tools used to process
 /// text in `tantivy`.
-use std::borrow::{Borrow, BorrowMut};
-use std::ops::{Deref, DerefMut};
-
-use serde::{Deserialize, Serialize};
+use tokenizer_api::{BoxTokenFilter, BoxTokenStream, Tokenizer};
 
 use crate::tokenizer::empty_tokenizer::EmptyTokenizer;
 
-/// Token
-#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
-pub struct Token {
-    /// Offset (byte index) of the first character of the token.
-    /// Offsets shall not be modified by token filters.
-    pub offset_from: usize,
-    /// Offset (byte index) of the last character of the token + 1.
-    /// The text that generated the token should be obtained by
-    /// &text[token.offset_from..token.offset_to]
-    pub offset_to: usize,
-    /// Position, expressed in number of tokens.
-    pub position: usize,
-    /// Actual text content of the token.
-    pub text: String,
-    /// Is the length expressed in term of number of original tokens.
-    pub position_length: usize,
-}
-
-impl Default for Token {
-    fn default() -> Token {
-        Token {
-            offset_from: 0,
-            offset_to: 0,
-            position: usize::MAX,
-            text: String::with_capacity(200),
-            position_length: 1,
-        }
-    }
-}
-
 /// `TextAnalyzer` tokenizes an input text into tokens and modifies the resulting `TokenStream`.
 ///
 /// It simply wraps a `Tokenizer` and a list of `TokenFilter` that are applied sequentially.
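The `TextAnalyzer` described above stays in tantivy itself; only the traits it builds on move to the API crate. As a quick illustration of that wrapping, mirroring the doc example kept further down in this diff and assuming the `tantivy` crate:

    use tantivy::tokenizer::*;

    fn main() {
        // A TextAnalyzer wraps one Tokenizer plus a chain of TokenFilters,
        // applied in the order they are added.
        let analyzer = TextAnalyzer::from(SimpleTokenizer)
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser);
        let mut token_stream = analyzer.token_stream("Hello, happy tax payer");
        while let Some(token) = token_stream.next() {
            println!("{:?} at bytes {}..{}", token.text, token.offset_from, token.offset_to);
        }
    }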
@@ -112,200 +79,3 @@ impl Clone for TextAnalyzer {
         }
     }
 }
-
-/// `Tokenizer` are in charge of splitting text into a stream of token
-/// before indexing.
-///
-/// See the [module documentation](crate::tokenizer) for more detail.
-///
-/// # Warning
-///
-/// This API may change to use associated types.
-pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
-    /// Creates a token stream for a given `str`.
-    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
-}
-
-pub trait TokenizerClone {
-    fn box_clone(&self) -> Box<dyn Tokenizer>;
-}
-
-impl<T: Tokenizer + Clone> TokenizerClone for T {
-    fn box_clone(&self) -> Box<dyn Tokenizer> {
-        Box::new(self.clone())
-    }
-}
-
-impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
-    fn advance(&mut self) -> bool {
-        let token_stream: &mut dyn TokenStream = self.borrow_mut();
-        token_stream.advance()
-    }
-
-    fn token<'b>(&'b self) -> &'b Token {
-        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
-        token_stream.token()
-    }
-
-    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
-        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
-        token_stream.token_mut()
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
-///
-/// See [`TokenStream`] for more information.
-pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
-
-impl<'a, T> From<T> for BoxTokenStream<'a>
-where T: TokenStream + 'a
-{
-    fn from(token_stream: T) -> BoxTokenStream<'a> {
-        BoxTokenStream(Box::new(token_stream))
-    }
-}
-
-impl<'a> Deref for BoxTokenStream<'a> {
-    type Target = dyn TokenStream + 'a;
-
-    fn deref(&self) -> &Self::Target {
-        &*self.0
-    }
-}
-impl<'a> DerefMut for BoxTokenStream<'a> {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        &mut *self.0
-    }
-}
-
-/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
-///
-/// See [`TokenFilter`] for more information.
-pub struct BoxTokenFilter(Box<dyn TokenFilter>);
-
-impl Deref for BoxTokenFilter {
-    type Target = dyn TokenFilter;
-
-    fn deref(&self) -> &dyn TokenFilter {
-        &*self.0
-    }
-}
-
-impl<T: TokenFilter> From<T> for BoxTokenFilter {
-    fn from(tokenizer: T) -> BoxTokenFilter {
-        BoxTokenFilter(Box::new(tokenizer))
-    }
-}
-
-/// `TokenStream` is the result of the tokenization.
-///
-/// It consists consumable stream of `Token`s.
-///
-/// # Example
-///
-/// ```
-/// use tantivy::tokenizer::*;
-///
-/// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// ```
-pub trait TokenStream {
-    /// Advance to the next token
-    ///
-    /// Returns false if there are no other tokens.
-    fn advance(&mut self) -> bool;
-
-    /// Returns a reference to the current token.
-    fn token(&self) -> &Token;
-
-    /// Returns a mutable reference to the current token.
-    fn token_mut(&mut self) -> &mut Token;
-
-    /// Helper to iterate over tokens. It
-    /// simply combines a call to `.advance()`
-    /// and `.token()`.
-    ///
-    /// ```
-    /// use tantivy::tokenizer::*;
-    ///
-    /// let tokenizer = TextAnalyzer::from(SimpleTokenizer)
-    ///     .filter(RemoveLongFilter::limit(40))
-    ///     .filter(LowerCaser);
-    /// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-    /// while let Some(token) = token_stream.next() {
-    ///     println!("Token {:?}", token.text);
-    /// }
-    /// ```
-    fn next(&mut self) -> Option<&Token> {
-        if self.advance() {
-            Some(self.token())
-        } else {
-            None
-        }
-    }
-
-    /// Helper function to consume the entire `TokenStream`
-    /// and push the tokens to a sink function.
-    ///
-    /// Remove this.
-    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
-        while self.advance() {
-            sink(self.token());
-        }
-    }
-}
-
-pub trait TokenFilterClone {
-    fn box_clone(&self) -> BoxTokenFilter;
-}
-
-/// Trait for the pluggable components of `Tokenizer`s.
-pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
-    /// Wraps a token stream and returns the modified one.
-    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
-}
-
-impl<T: TokenFilter + Clone> TokenFilterClone for T {
-    fn box_clone(&self) -> BoxTokenFilter {
-        BoxTokenFilter::from(self.clone())
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use super::Token;
-
-    #[test]
-    fn clone() {
-        let t1 = Token {
-            position: 1,
-            offset_from: 2,
-            offset_to: 3,
-            text: "abc".to_string(),
-            position_length: 1,
-        };
-        let t2 = t1.clone();
-
-        assert_eq!(t1.position, t2.position);
-        assert_eq!(t1.offset_from, t2.offset_from);
-        assert_eq!(t1.offset_to, t2.offset_to);
-        assert_eq!(t1.text, t2.text);
-    }
-}
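The `TokenStream::next` and `process` helpers removed here reappear unchanged in the `tokenizer-api` crate below. A short sketch of the `process` sink helper, assuming the `tantivy` crate and its built-in `SimpleTokenizer` and `LowerCaser`:

    use tantivy::tokenizer::*;

    fn main() {
        let analyzer = TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser);
        let mut token_stream = analyzer.token_stream("Hello, happy tax payer");

        // `process` advances the stream to the end and hands every token to the sink.
        let mut texts: Vec<String> = Vec::new();
        token_stream.process(&mut |token: &Token| texts.push(token.text.clone()));
        assert_eq!(texts, vec!["hello", "happy", "tax", "payer"]);
    }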
tokenizer-api/Cargo.toml (new file)
@@ -0,0 +1,10 @@
+[package]
+name = "tantivy-tokenizer-api"
+version = "0.1.0"
+edition = "2021"
+description = "Tokenizer API of tantivy"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+serde = { version = "1.0.152", features = ["derive"] }
tokenizer-api/README.md (new file)
@@ -0,0 +1,6 @@
+
+# Tokenizer-API
+
+An API to interface a tokenizer with tantivy.
+
+The API will be kept stable in order to not break support for existing tokenizers.
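On the tantivy side, a tokenizer built against this API is plugged in through the `TokenizerManager`, in the same way tantivy's own tokenizers are registered today. A minimal sketch, assuming tantivy's existing `TokenizerManager::register`/`get` methods and the built-in `NgramTokenizer`:

    use tantivy::tokenizer::{NgramTokenizer, TokenizerManager};

    fn main() {
        // The manager maps names (referenced from the schema) to tokenizer instances.
        let manager = TokenizerManager::default();
        manager.register("ngram3", NgramTokenizer::new(3, 3, false));
        assert!(manager.get("ngram3").is_some());
    }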
tokenizer-api/src/lib.rs (new file)
@@ -0,0 +1,197 @@
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing. This is a separate crate from tantivy, so implementors don't need to update
+//! for each new tantivy version.
+//!
+//! To add support for a tokenizer, implement the [`Tokenizer`](crate::Tokenizer) trait.
+//! Check out the [tantivy repo](https://github.com/quickwit-oss/tantivy/tree/main/src/tokenizer) for some examples.
+
+use std::borrow::{Borrow, BorrowMut};
+use std::ops::{Deref, DerefMut};
+
+use serde::{Deserialize, Serialize};
+
+/// Token
+#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)]
+pub struct Token {
+    /// Offset (byte index) of the first character of the token.
+    /// Offsets shall not be modified by token filters.
+    pub offset_from: usize,
+    /// Offset (byte index) of the last character of the token + 1.
+    /// The text that generated the token should be obtained by
+    /// &text[token.offset_from..token.offset_to]
+    pub offset_to: usize,
+    /// Position, expressed in number of tokens.
+    pub position: usize,
+    /// Actual text content of the token.
+    pub text: String,
+    /// Is the length expressed in terms of number of original tokens.
+    pub position_length: usize,
+}
+
+impl Default for Token {
+    fn default() -> Token {
+        Token {
+            offset_from: 0,
+            offset_to: 0,
+            position: usize::MAX,
+            text: String::with_capacity(200),
+            position_length: 1,
+        }
+    }
+}
+
+/// `Tokenizer`s are in charge of splitting text into a stream of tokens
+/// before indexing.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
+pub trait Tokenizer: 'static + Send + Sync + TokenizerClone {
+    /// Creates a token stream for a given `str`.
+    fn token_stream<'a>(&self, text: &'a str) -> BoxTokenStream<'a>;
+}
+
+pub trait TokenizerClone {
+    fn box_clone(&self) -> Box<dyn Tokenizer>;
+}
+
+impl<T: Tokenizer + Clone> TokenizerClone for T {
+    fn box_clone(&self) -> Box<dyn Tokenizer> {
+        Box::new(self.clone())
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenStream + 'a>`.
+///
+/// See [`TokenStream`] for more information.
+pub struct BoxTokenStream<'a>(Box<dyn TokenStream + 'a>);
+
+impl<'a, T> From<T> for BoxTokenStream<'a>
+where T: TokenStream + 'a
+{
+    fn from(token_stream: T) -> BoxTokenStream<'a> {
+        BoxTokenStream(Box::new(token_stream))
+    }
+}
+
+impl<'a> Deref for BoxTokenStream<'a> {
+    type Target = dyn TokenStream + 'a;
+
+    fn deref(&self) -> &Self::Target {
+        &*self.0
+    }
+}
+impl<'a> DerefMut for BoxTokenStream<'a> {
+    fn deref_mut(&mut self) -> &mut Self::Target {
+        &mut *self.0
+    }
+}
+
+impl<'a> TokenStream for Box<dyn TokenStream + 'a> {
+    fn advance(&mut self) -> bool {
+        let token_stream: &mut dyn TokenStream = self.borrow_mut();
+        token_stream.advance()
+    }
+
+    fn token<'b>(&'b self) -> &'b Token {
+        let token_stream: &'b (dyn TokenStream + 'a) = self.borrow();
+        token_stream.token()
+    }
+
+    fn token_mut<'b>(&'b mut self) -> &'b mut Token {
+        let token_stream: &'b mut (dyn TokenStream + 'a) = self.borrow_mut();
+        token_stream.token_mut()
+    }
+}
+
+/// `TokenStream` is the result of the tokenization.
+///
+/// It consists of a consumable stream of `Token`s.
+pub trait TokenStream {
+    /// Advance to the next token
+    ///
+    /// Returns false if there are no other tokens.
+    fn advance(&mut self) -> bool;
+
+    /// Returns a reference to the current token.
+    fn token(&self) -> &Token;
+
+    /// Returns a mutable reference to the current token.
+    fn token_mut(&mut self) -> &mut Token;
+
+    /// Helper to iterate over tokens. It
+    /// simply combines a call to `.advance()`
+    /// and `.token()`.
+    fn next(&mut self) -> Option<&Token> {
+        if self.advance() {
+            Some(self.token())
+        } else {
+            None
+        }
+    }
+
+    /// Helper function to consume the entire `TokenStream`
+    /// and push the tokens to a sink function.
+    fn process(&mut self, sink: &mut dyn FnMut(&Token)) {
+        while self.advance() {
+            sink(self.token());
+        }
+    }
+}
+
+/// Simple wrapper of `Box<dyn TokenFilter + 'a>`.
+///
+/// See [`TokenFilter`] for more information.
+pub struct BoxTokenFilter(Box<dyn TokenFilter>);
+
+impl Deref for BoxTokenFilter {
+    type Target = dyn TokenFilter;
+
+    fn deref(&self) -> &dyn TokenFilter {
+        &*self.0
+    }
+}
+
+impl<T: TokenFilter> From<T> for BoxTokenFilter {
+    fn from(tokenizer: T) -> BoxTokenFilter {
+        BoxTokenFilter(Box::new(tokenizer))
+    }
+}
+
+pub trait TokenFilterClone {
+    fn box_clone(&self) -> BoxTokenFilter;
+}
+
+/// Trait for the pluggable components of `Tokenizer`s.
+pub trait TokenFilter: 'static + Send + Sync + TokenFilterClone {
+    /// Wraps a token stream and returns the modified one.
+    fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a>;
+}
+
+impl<T: TokenFilter + Clone> TokenFilterClone for T {
+    fn box_clone(&self) -> BoxTokenFilter {
+        BoxTokenFilter::from(self.clone())
+    }
+}
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn clone() {
+        let t1 = Token {
+            position: 1,
+            offset_from: 2,
+            offset_to: 3,
+            text: "abc".to_string(),
+            position_length: 1,
+        };
+        let t2 = t1.clone();
+
+        assert_eq!(t1.position, t2.position);
+        assert_eq!(t1.offset_from, t2.offset_from);
+        assert_eq!(t1.offset_to, t2.offset_to);
+        assert_eq!(t1.text, t2.text);
+    }
+}
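Token filters can also be written purely against this API crate. A minimal sketch of a `TokenFilter` that uppercases token text; the `UpperCaser` and `UpperCaserStream` names are hypothetical and only meant to show how `transform` wraps the underlying stream:

    use tantivy_tokenizer_api::{BoxTokenStream, Token, TokenFilter, TokenStream};

    #[derive(Clone)]
    pub struct UpperCaser;

    impl TokenFilter for UpperCaser {
        fn transform<'a>(&self, token_stream: BoxTokenStream<'a>) -> BoxTokenStream<'a> {
            BoxTokenStream::from(UpperCaserStream { tail: token_stream })
        }
    }

    pub struct UpperCaserStream<'a> {
        tail: BoxTokenStream<'a>,
    }

    impl<'a> TokenStream for UpperCaserStream<'a> {
        fn advance(&mut self) -> bool {
            if !self.tail.advance() {
                return false;
            }
            // Rewrite the text of the underlying token in place.
            let text = self.tail.token().text.to_uppercase();
            self.tail.token_mut().text = text;
            true
        }

        fn token(&self) -> &Token {
            self.tail.token()
        }

        fn token_mut(&mut self) -> &mut Token {
            self.tail.token_mut()
        }
    }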