mirror of https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
1 Commit
numeric_wi…...tokenizer_…
| Author | SHA1 | Date |
|---|---|---|
|  | 806a1e1b1e |  |
```diff
@@ -14,8 +14,8 @@
 //! Fields have to be declared as `FAST` in the schema.
 //! Currently supported fields are: u64, i64, f64, bytes, ip and text.
 //!
-//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
-//! automatically, when serializing.
+//! Fast fields are stored in with [different codecs](columnar::column_values). The best codec is
+//! detected automatically, when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.
```
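As a sketch of what the doc comment describes — declaring fields as `FAST` so the column store kicks in — the following uses tantivy's schema builder. The field names are invented for illustration, and flag conversions differ slightly between tantivy versions:

```rust
use tantivy::schema::{Schema, FAST, INDEXED};

fn main() {
    let mut schema_builder = Schema::builder();
    // FAST lays the values out column-oriented per segment; the best codec
    // is picked automatically when the segment is serialized.
    let popularity = schema_builder.add_u64_field("popularity", INDEXED | FAST);
    // IP addresses are among the supported fast field types listed above.
    schema_builder.add_ip_addr_field("client_ip", FAST);
    let schema = schema_builder.build();
    assert!(schema.get_field_entry(popularity).is_fast());
}
```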
```diff
@@ -2,7 +2,8 @@ use std::str::CharIndices;
 
 use super::{Token, TokenStream, Tokenizer};
 
-/// Tokenize the text by splitting on whitespaces and punctuation.
+/// Tokenize the text by returning only tokens of consecutive
+/// [`alphanumeric`](char::is_alphanumeric).
 #[derive(Clone)]
 pub struct SimpleTokenizer;
 
```
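A minimal sketch of the documented behavior, written against the unit-struct `SimpleTokenizer` shown in this hunk (later tantivy releases reshaped the tokenizer API, so treat this as illustrative):

```rust
use tantivy::tokenizer::{SimpleTokenizer, TokenStream, Tokenizer};

fn main() {
    // Runs of consecutive alphanumeric characters become tokens;
    // whitespace and punctuation merely separate them.
    let mut stream = SimpleTokenizer.token_stream("Hello, world! 42");
    while stream.advance() {
        let token = stream.token();
        println!("{:?} at {}..{}", token.text, token.offset_from, token.offset_to);
    }
    // Emits: "Hello", "world", "42" (no lowercasing at this stage).
}
```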
```diff
@@ -13,9 +13,8 @@ use crate::tokenizer::{
 /// By default, it is populated with the following managers.
 ///
 /// * `raw` : does not process nor tokenize the text.
-/// * `default` : Chops the text on according to whitespace and
-/// punctuation, removes tokens that are too long, and lowercases
-/// tokens
+/// * `default` : Chops the text according to [`SimpleTokenizer`],
+/// removes tokens that are longer than 40, and lowercases tokens
 /// * `en_stem` : Like `default`, but also applies stemming on the
 /// resulting tokens. Stemming can improve the recall of your
 /// search engine.
```
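A short sketch of resolving those built-in pipelines by name; `TokenizerManager::default()` and `get` are the relevant entry points. This follows the API of the tantivy version this diff targets (in newer releases `token_stream` takes `&mut self`):

```rust
use tantivy::tokenizer::{TokenStream, TokenizerManager};

fn main() {
    let manager = TokenizerManager::default();
    // The three names documented above come pre-registered.
    for name in ["raw", "default", "en_stem"] {
        assert!(manager.get(name).is_some());
    }
    // `default` runs SimpleTokenizer, drops over-long tokens, and lowercases.
    let analyzer = manager.get("default").unwrap();
    let mut stream = analyzer.token_stream("Hello, WORLD!");
    while stream.advance() {
        println!("{:?}", stream.token().text); // "hello", then "world"
    }
}
```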
```diff
@@ -35,7 +34,9 @@ impl TokenizerManager {
 
     /// Registers a new tokenizer associated with a given name.
     pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
-    where TextAnalyzer: From<T> {
+    where
+        TextAnalyzer: From<T>,
+    {
         let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
```
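To illustrate the `where TextAnalyzer: From<T>` bound, here is a hedged sketch that registers a custom pipeline under a made-up name (`my_default`); the `filter` chaining matches the era of this diff, while later releases moved to a builder-style API:

```rust
use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

fn main() {
    let manager = TokenizerManager::default();
    // Anything convertible into a TextAnalyzer satisfies the bound, so a
    // bare tokenizer works just as well as a full pipeline.
    let analyzer = TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser);
    manager.register("my_default", analyzer);
    assert!(manager.get("my_default").is_some());
}
```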