From 806a1e1b1ecd24d65c19d72e57fa1fdd5738562a Mon Sep 17 00:00:00 2001
From: Pascal Seitz
Date: Mon, 3 Apr 2023 22:59:38 +0800
Subject: [PATCH] clarify tokenizer docs

---
 src/fastfield/mod.rs               | 4 ++--
 src/tokenizer/simple_tokenizer.rs  | 3 ++-
 src/tokenizer/tokenizer_manager.rs | 9 +++++----
 3 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/src/fastfield/mod.rs b/src/fastfield/mod.rs
index 238a89df1..1aca1ba5e 100644
--- a/src/fastfield/mod.rs
+++ b/src/fastfield/mod.rs
@@ -14,8 +14,8 @@
 //! Fields have to be declared as `FAST` in the schema.
 //! Currently supported fields are: u64, i64, f64, bytes, ip and text.
 //!
-//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
-//! automatically, when serializing.
+//! Fast fields are stored with [different codecs](columnar::column_values). The best codec is
+//! detected automatically when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.
 

diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs
index 2b9163b23..46ecc31b2 100644
--- a/src/tokenizer/simple_tokenizer.rs
+++ b/src/tokenizer/simple_tokenizer.rs
@@ -2,7 +2,8 @@ use std::str::CharIndices;
 
 use super::{Token, TokenStream, Tokenizer};
 
-/// Tokenize the text by splitting on whitespaces and punctuation.
+/// Tokenize the text by emitting only tokens made of consecutive
+/// [`alphanumeric`](char::is_alphanumeric) characters.
 #[derive(Clone)]
 pub struct SimpleTokenizer;
 

diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs
index e849471bc..80fedcc2e 100644
--- a/src/tokenizer/tokenizer_manager.rs
+++ b/src/tokenizer/tokenizer_manager.rs
@@ -13,9 +13,8 @@ use crate::tokenizer::{
 /// By default, it is populated with the following managers.
 ///
 /// * `raw` : does not process nor tokenize the text.
-/// * `default` : Chops the text on according to whitespace and
-///   punctuation, removes tokens that are too long, and lowercases
-///   tokens
+/// * `default` : Chops the text according to [`SimpleTokenizer`],
+///   removes tokens longer than 40 bytes, and lowercases the tokens.
 /// * `en_stem` : Like `default`, but also applies stemming on the
 ///   resulting tokens. Stemming can improve the recall of your
 ///   search engine.
@@ -35,7 +34,9 @@ impl TokenizerManager {
     /// Registers a new tokenizer associated with a given name.
     pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
-    where TextAnalyzer: From<T> {
+    where
+        TextAnalyzer: From<T>,
+    {
         let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
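
Usage note: the `default` chain documented above can also be rebuilt and
registered by hand. The sketch below targets the tokenizer API as of this
commit (tantivy ~0.19); the name "my_default", the `main` wrapper, and the
sample text are illustrative, not part of the patch.

use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

fn main() {
    // Rebuild the `default` chain documented above:
    // SimpleTokenizer, then RemoveLongFilter (40-byte limit), then LowerCaser.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser);

    // `register` accepts any `T` with `TextAnalyzer: From<T>`, per the
    // `where` clause in the hunk above. "my_default" is an arbitrary name.
    let manager = TokenizerManager::default();
    manager.register("my_default", analyzer);

    // Fetch the analyzer back and tokenize: only consecutive alphanumeric
    // runs survive, lowercased ("hello", "world", "42").
    let analyzer = manager.get("my_default").unwrap();
    let mut stream = analyzer.token_stream("Hello, World-42!");
    while stream.advance() {
        println!("{}", stream.token().text);
    }
}

Because `register` only asks for `TextAnalyzer: From<T>`, a bare
`SimpleTokenizer` (or any custom `Tokenizer`) can also be passed directly,
without assembling a filter chain first.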