Compare commits

...

1 Commits

Author SHA1 Message Date
Pascal Seitz
806a1e1b1e clarify tokenizer docs 2023-04-03 22:59:38 +08:00
3 changed files with 9 additions and 7 deletions

View File

@@ -14,8 +14,8 @@
//! Fields have to be declared as `FAST` in the schema.
//! Currently supported fields are: u64, i64, f64, bytes, ip and text.
//!
//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
//! automatically, when serializing.
//! Fast fields are stored with [different codecs](columnar::column_values). The best codec is
//! detected automatically when serializing.
//!
//! Read access performance is comparable to that of an array lookup.

View File

@@ -2,7 +2,8 @@ use std::str::CharIndices;
use super::{Token, TokenStream, Tokenizer};
/// Tokenize the text by splitting on whitespaces and punctuation.
/// Tokenize the text by returning only tokens of consecutive
/// [`alphanumeric`](char::is_alphanumeric) characters.
#[derive(Clone)]
pub struct SimpleTokenizer;

View File

@@ -13,9 +13,8 @@ use crate::tokenizer::{
/// By default, it is populated with the following tokenizers.
///
/// * `raw` : does not process nor tokenize the text.
/// * `default` : Chops the text on according to whitespace and
/// punctuation, removes tokens that are too long, and lowercases
/// tokens
/// * `default` : Chops the text according to [`SimpleTokenizer`],
/// removes tokens that are longer than 40 bytes, and lowercases tokens
/// * `en_stem` : Like `default`, but also applies stemming on the
/// resulting tokens. Stemming can improve the recall of your
/// search engine.
@@ -35,7 +34,9 @@ impl TokenizerManager {
/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
where
TextAnalyzer: From<T>,
{
let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
self.tokenizers
.write()