Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-03 07:42:54 +00:00)
Compare commits (1 commit): block-cach... → tokenizer_...

| Author | SHA1 | Date |
|---|---|---|
| | 806a1e1b1e | |
@@ -14,8 +14,8 @@
 //! Fields have to be declared as `FAST` in the schema.
 //! Currently supported fields are: u64, i64, f64, bytes, ip and text.
 //!
-//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
-//! automatically, when serializing.
+//! Fast fields are stored in with [different codecs](columnar::column_values). The best codec is
+//! detected automatically, when serializing.
 //!
 //! Read access performance is comparable to that of an array lookup.
 
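For context, the hunk above touches the fast-field module docs. A minimal sketch of declaring `FAST` fields with tantivy's schema builder, roughly matching the API of this era; the field names (`popularity`, `client_ip`) are illustrative, not from the commit:

```rust
use tantivy::schema::{Schema, FAST, INDEXED};

fn build_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    // `FAST` requests column-wise fast-field storage; flags can be
    // combined, e.g. with `INDEXED`, to also make the field searchable.
    schema_builder.add_u64_field("popularity", FAST | INDEXED);
    // ip fields are among the supported fast-field types listed above.
    schema_builder.add_ip_addr_field("client_ip", FAST);
    schema_builder.build()
}
```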
@@ -2,7 +2,8 @@ use std::str::CharIndices;
 
 use super::{Token, TokenStream, Tokenizer};
 
-/// Tokenize the text by splitting on whitespaces and punctuation.
+/// Tokenize the text by returning only tokens of consecutive
+/// [`alphanumeric`](char::is_alphanumeric).
 #[derive(Clone)]
 pub struct SimpleTokenizer;
 
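The rewritten doc comment defines tokens as maximal runs of alphanumeric characters. A standalone sketch of that rule follows; it is not tantivy's actual implementation, which walks `CharIndices` to track byte offsets for each token:

```rust
// Illustrative only: emit maximal runs of alphanumeric characters,
// treating every other character (whitespace, punctuation) as a separator.
fn simple_tokens(text: &str) -> Vec<&str> {
    text.split(|c: char| !c.is_alphanumeric())
        .filter(|tok| !tok.is_empty())
        .collect()
}

fn main() {
    assert_eq!(simple_tokens("Hello, world! 42"), ["Hello", "world", "42"]);
}
```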
@@ -13,9 +13,8 @@ use crate::tokenizer::{
 /// By default, it is populated with the following managers.
 ///
 /// * `raw` : does not process nor tokenize the text.
-/// * `default` : Chops the text on according to whitespace and
-///   punctuation, removes tokens that are too long, and lowercases
-///   tokens
+/// * `default` : Chops the text according to [`SimpleTokenizer`],
+///   removes tokens that are longer than 40, and lowercases tokens
 /// * `en_stem` : Like `default`, but also applies stemming on the
 ///   resulting tokens. Stemming can improve the recall of your
 ///   search engine.
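The corrected list maps directly onto an analyzer pipeline. A sketch of a `default`-like pipeline, assuming the chaining `TextAnalyzer` API of this era (the same `From` conversion used by `register` in the next hunk):

```rust
use tantivy::tokenizer::{LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer};

// A `default`-like analyzer: SimpleTokenizer, then drop tokens longer
// than 40 characters, then lowercase the rest.
fn default_like() -> TextAnalyzer {
    TextAnalyzer::from(SimpleTokenizer)
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
}
```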
@@ -35,7 +34,9 @@ impl TokenizerManager {
 
     /// Registers a new tokenizer associated with a given name.
     pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
-    where TextAnalyzer: From<T> {
+    where
+        TextAnalyzer: From<T>,
+    {
         let boxed_tokenizer: TextAnalyzer = TextAnalyzer::from(tokenizer);
         self.tokenizers
             .write()
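Per the signature above, `register` accepts any `T` with `TextAnalyzer: From<T>`, so both bare tokenizers and assembled pipelines can be registered. A usage sketch under the same assumptions as the previous example; the name `lowercase_only` is made up for illustration:

```rust
use tantivy::tokenizer::{LowerCaser, SimpleTokenizer, TextAnalyzer, TokenizerManager};

fn register_custom(manager: &TokenizerManager) {
    // Any `T` with `TextAnalyzer: From<T>` is accepted, so a bare
    // tokenizer or a filtered pipeline both work here.
    manager.register(
        "lowercase_only", // hypothetical name, for illustration
        TextAnalyzer::from(SimpleTokenizer).filter(LowerCaser),
    );
}
```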