clarify tokenizer docs

2026-02-17 05:10:36 +00:00 · 2023-04-03 22:59:38 +08:00
3 changed files with 9 additions and 7 deletions
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -14,8 +14,8 @@
 //! Fields have to be declared as `FAST` in the schema.
 //! Currently supported fields are: u64, i64, f64, bytes, ip and text.
 //!
-//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
-//! automatically, when serializing.
+//! Fast fields are stored with [different codecs](columnar::column_values). The best codec is
+//! detected automatically, when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.

--- a/src/tokenizer/simple_tokenizer.rs
+++ b/src/tokenizer/simple_tokenizer.rs
@@ -2,7 +2,8 @@ use std::str::CharIndices;

 use super::{Token, TokenStream, Tokenizer};

-/// Tokenize the text by splitting on whitespaces and punctuation.
+/// Tokenize the text by returning only tokens of consecutive
+/// [`alphanumeric`](char::is_alphanumeric) characters.
 #[derive(Clone)]
 pub struct SimpleTokenizer;

--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -13,9 +13,8 @@ use crate::tokenizer::{
 /// By default, it is populated with the following managers.
 ///
 ///  * `raw` : does not process nor tokenize the text.
-///  * `default` : Chops the text on according to whitespace and
-///  punctuation, removes tokens that are too long, and lowercases
-///  tokens
+///  * `default` : Chops the text according to [`SimpleTokenizer`],
+///  removes tokens that are longer than 40 bytes, and lowercases tokens
 ///  * `en_stem` : Like `default`, but also applies stemming on the
 ///  resulting tokens. Stemming can improve the recall of your
 ///  search engine.
@@ -35,7 +34,9 @@ impl TokenizerManager {

    /// Registers a new tokenizer associated with a given name.
    pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
-    where TextAnalyzer: From<T> {
+    where
+        TextAnalyzer: From<T>,
+    {
        let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
        self.tokenizers
            .write()