From 806a1e1b1ecd24d65c19d72e57fa1fdd5738562a Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Mon, 3 Apr 2023 22:59:38 +0800
Subject: [PATCH] clarify tokenizer docs

---
 src/fastfield/mod.rs               | 4 ++--
 src/tokenizer/simple_tokenizer.rs  | 3 ++-
 src/tokenizer/tokenizer_manager.rs | 9 +++++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
index 238a89df1..1aca1ba5e 100644
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -14,8 +14,8 @@
 //! Fields have to be declared as `FAST` in the schema.
 //! Currently supported fields are: u64, i64, f64, bytes, ip and text.
 //!
-//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
-//! automatically, when serializing.
+//! Fast fields are stored with [different codecs](columnar::column_values). The best codec is
+//! detected automatically when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.
 

diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs
index 2b9163b23..46ecc31b2 100644
--- a/src/tokenizer/simple_tokenizer.rs
+++ b/src/tokenizer/simple_tokenizer.rs
@@ -2,7 +2,8 @@ use std::str::CharIndices;
 
 use super::{Token, TokenStream, Tokenizer};
 
-/// Tokenize the text by splitting on whitespaces and punctuation.
+/// Tokenize the text by emitting only tokens made of consecutive
+/// [`alphanumeric`](char::is_alphanumeric) characters.
 #[derive(Clone)]
 pub struct SimpleTokenizer;
 

diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index e849471bc..80fedcc2e 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -13,9 +13,8 @@ use crate::tokenizer::{
 /// By default, it is populated with the following managers.
 ///
 /// * `raw` : does not process nor tokenize the text.
-/// * `default` : Chops the text on according to whitespace and
-///   punctuation, removes tokens that are too long, and lowercases
-///   tokens
+/// * `default` : Chops the text according to [`SimpleTokenizer`],
+///   removes tokens longer than 40 bytes, and lowercases the tokens.
 /// * `en_stem` : Like `default`, but also applies stemming on the
 ///   resulting tokens. Stemming can improve the recall of your
 ///   search engine.
@@ -35,7 +34,9 @@ impl TokenizerManager {
     /// Registers a new tokenizer associated with a given name.
     pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
-    where TextAnalyzer: From<T> {
+    where
+        TextAnalyzer: From<T>,
+    {
         let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
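
Usage note: the `default` chain documented above can also be rebuilt and
registered by hand. The sketch below targets the tokenizer API as of this
commit (tantivy ~0.19); the name "my_default", the `main` wrapper, and the
sample text are illustrative, not part of the patch.

use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

fn main() {
    // Rebuild the `default` chain documented above:
    // SimpleTokenizer, then RemoveLongFilter (40-byte limit), then LowerCaser.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser);

    // `register` accepts any `T` with `TextAnalyzer: From<T>`, per the
    // `where` clause in the hunk above. "my_default" is an arbitrary name.
    let manager = TokenizerManager::default();
    manager.register("my_default", analyzer);

    // Fetch the analyzer back and tokenize: only consecutive alphanumeric
    // runs survive, lowercased ("hello", "world", "42").
    let analyzer = manager.get("my_default").unwrap();
    let mut stream = analyzer.token_stream("Hello, World-42!");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}

Because `register` only asks for `TextAnalyzer: From<T>`, a bare
`SimpleTokenizer` (or any custom `Tokenizer`) can also be passed directly,
without assembling a filter chain first.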