mirror of https://github.com/quickwit-oss/tantivy.git

Editing rustdoc
@@ -61,7 +61,7 @@ impl Index {
     /// The index will use the `MMapDirectory`.
     ///
     /// If a previous index was in this directory, then its meta file will be destroyed.
-    pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
+    pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         Index::from_directory(directory, schema)
@@ -107,7 +107,7 @@ impl Index {
     }

     /// Opens a new directory from an index path.
-    pub fn open(directory_path: &Path) -> Result<Index> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         let metas = load_metas(&directory)?;
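The practical effect of the new `P: AsRef<Path>` bound is on the caller side: any path-like value can be passed directly. A minimal caller-side sketch (the index path is made up, and error handling is reduced to `expect`):

```rust
extern crate tantivy;

use std::path::PathBuf;

use tantivy::schema::SchemaBuilder;
use tantivy::Index;

fn main() {
    let schema = SchemaBuilder::new().build();
    // `&str`, `String`, `&Path` and `PathBuf` all satisfy `AsRef<Path>`,
    // so no `Path::new(..)` conversion is needed at the call site.
    let _index = Index::create("/tmp/example_index", schema).expect("failed to create index");
    let _reopened = Index::open(PathBuf::from("/tmp/example_index")).expect("failed to open index");
}
```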
@@ -23,11 +23,11 @@ use std::sync::RwLock;
 use std::sync::Weak;
 use tempdir::TempDir;

-fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
+fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
     let file = File::open(&full_path).map_err(|e| if e.kind() ==
         io::ErrorKind::NotFound
     {
-        OpenReadError::FileDoesNotExist(full_path.clone())
+        OpenReadError::FileDoesNotExist(full_path.to_owned())
     } else {
         OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
     })?;
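Switching the parameter from `&PathBuf` to `&Path` is the usual generalization: a `&PathBuf` coerces to `&Path` through `Deref`, but not the other way around, so the `&Path` version accepts strictly more call sites. A standalone sketch of the idiom (the function and paths are hypothetical, not part of tantivy):

```rust
use std::path::{Path, PathBuf};

// Takes `&Path`, the most general borrowed path type.
fn file_stem_len(path: &Path) -> usize {
    path.file_stem().map(|stem| stem.len()).unwrap_or(0)
}

fn main() {
    let owned: PathBuf = PathBuf::from("segment.idx");
    // A `&PathBuf` deref-coerces to `&Path`...
    println!("{}", file_stem_len(&owned));
    // ...and a plain `&Path` works too; a `&PathBuf` parameter could not offer both.
    println!("{}", file_stem_len(Path::new("meta.json")));
}
```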
@@ -180,7 +180,8 @@ impl MmapDirectory {
     ///
     /// Returns an error if the `directory_path` does not
     /// exist or if it is not a directory.
-    pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
+        let directory_path: &Path = directory_path.as_ref();
         if !directory_path.exists() {
             Err(OpenDirectoryError::DoesNotExist(
                 PathBuf::from(directory_path),
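The added line shows the implementation-side half of the pattern: convert the generic argument to a concrete `&Path` once, at the top of the function, so the rest of the body works with a single concrete type. A hypothetical function with the same shape:

```rust
use std::path::Path;

// `P: AsRef<Path>` at the API boundary, a concrete `&Path` everywhere else.
fn directory_exists<P: AsRef<Path>>(directory_path: P) -> bool {
    let directory_path: &Path = directory_path.as_ref();
    directory_path.exists() && directory_path.is_dir()
}

fn main() {
    println!("{}", directory_exists("/tmp"));
}
```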
@@ -5,14 +5,12 @@ that serves as an address in their respective posting list.

 The term dictionary API makes it possible to iterate through
 a range of keys in a sorted manner.
-```
-

 # Implementations

 There are currently two implementations of the term dictionary.

-## Default implementation : `fstdict`
+## Default implementation : *fstdict*

 The default one relies heavily on the `fst` crate.
 It associates each term's `&[u8]` representation to a `u64`
@@ -20,7 +18,7 @@ that is in fact an address in a buffer. The value is then accessible
 via deserializing the value at this address.


-## Stream implementation : `streamdict`
+## Stream implementation : *streamdict*

 The `fstdict` is a tiny bit slow when streaming all of
 the terms.
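To make the `fstdict` description concrete, the sketch below uses the `fst` crate directly: each sorted `&[u8]` key maps to a `u64` that stands in for an address in a separate value buffer. The keys and values are made up; this illustrates the idea, not tantivy's actual termdict code:

```rust
extern crate fst;

use fst::Map;

fn main() {
    // Keys must be inserted in sorted byte order. Each value is a `u64`;
    // the term dictionary uses it as an offset into a serialized buffer
    // where the real per-term metadata lives.
    let map = Map::from_iter(vec![("apple", 0u64), ("banana", 42u64)])
        .expect("keys are sorted, so building the fst cannot fail here");
    assert_eq!(map.get("banana"), Some(42));
    assert_eq!(map.get("cherry"), None);
}
```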
@@ -1,35 +1,133 @@
-//! Tokenizer are in charge of processing text for indexing.
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing.
 //!
-//! An tokenizer is a configurable pipeline that starts by a `Tokenizer`,
-//! followed by a sequence of [`TokenFilter`s](./trait.TokenFilter.html) to it.
+//! You must define in your schema which tokenizer should be used for
+//! each of your fields:
 //!
-//! The `Tokenizer` is in charge of chopping the text into tokens. There is no
-//! trait called `Tokenizer`. Instead `Tokenizer` like [`SimpleTokenizer`](./struct.SimpleTokenizer.html)
-//! are just directly implementing the tokenizer trait.
-//!
-//! - choosing a tokenizer. A tokenizer is in charge of chopping your text into token.
-//! - adding so called filter to modify your tokens (e.g. filter out stop words, apply stemming etc.)
+//! ```
+//! extern crate tantivy;
+//! use tantivy::schema::*;
+//!
+//! # fn main() {
+//! let mut schema_builder = SchemaBuilder::new();
+//!
+//! let text_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("en_stem")
+//!             .set_index_option(IndexRecordOption::Basic)
+//!     )
+//!     .set_stored();
+//!
+//! let id_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("raw_ids")
+//!             .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+//!     )
+//!     .set_stored();
+//!
+//! schema_builder.add_text_field("title", text_options.clone());
+//! schema_builder.add_text_field("text", text_options);
+//! schema_builder.add_text_field("uuid", id_options);
+//!
+//! let schema = schema_builder.build();
+//! # }
+//! ```
+//!
+//! By default, `tantivy` offers the following tokenizers:
+//!
+//! ## `default`
+//!
+//! `default` is the tokenizer that will be used if you do not
+//! assign a specific tokenizer to your text field.
+//! It will chop your text on punctuation and whitespaces,
+//! remove tokens that are longer than 40 chars, and lowercase your text.
+//!
+//! ## `raw`
+//!
+//! Does not actually tokenize your text. It keeps it entirely unprocessed.
+//! It can be useful to index uuids or urls, for instance.
+//!
+//! ## `en_stem`
+//!
+//! In addition to what `default` does, the `en_stem` tokenizer also
+//! applies stemming to your tokens. Stemming consists in trimming words to
+//! remove their inflection. This tokenizer is slower than the default one,
+//! but is recommended to improve recall.
+//!
+//!
+//! # Custom tokenizers
+//!
+//! You can write your own tokenizer by implementing the [`Tokenizer`](./trait.Tokenizer.html)
+//! trait, or you can extend an existing [`Tokenizer`](./trait.Tokenizer.html) by chaining
+//! several [`TokenFilter`s](./trait.TokenFilter.html) to it.
+//!
+//! For instance, the `en_stem` tokenizer is defined as follows.
+//!
+//! ```rust
+//! # extern crate tantivy;
+//!
+//! use tantivy::tokenizer::*;
+//!
+//! # fn main() {
+//! let en_stem = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser)
+//!     .filter(Stemmer::new());
+//! # }
+//! ```
+//!
+//! Once your tokenizer is defined, you need to
+//! register it with a name in your index's [`TokenizerManager`](./struct.TokenizerManager.html).
+//!
+//! ```
+//! # extern crate tantivy;
+//! # use tantivy::schema::SchemaBuilder;
+//! # use tantivy::tokenizer::*;
+//! # use tantivy::Index;
+//! # fn main() {
+//! # let custom_en_tokenizer = SimpleTokenizer;
+//! # let schema = SchemaBuilder::new().build();
+//! let index = Index::create_in_ram(schema);
+//! index.tokenizers()
+//!      .register("custom_en", custom_en_tokenizer);
+//! # }
+//! ```
+//!
+//! If you build your schema programmatically, a complete example
+//! could look like this.
+//!
+//! # Example
+//!
 //! ```
 //! extern crate tantivy;
+//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
 //! use tantivy::tokenizer::*;
 //!
+//! // ...
+//! use tantivy::Index;
+//!
 //! # fn main() {
-//! let mut tokenizer = SimpleTokenizer
-//!     .filter(RemoveLongFilter::limit(40))
-//!     .filter(LowerCaser);
-//! tokenizer
-//!     .token_stream("Hello, happy tax payer")
-//!     .process(&mut |token| {
-//!         println!("token {:?}", token.text);
-//!     });
+//! let mut schema_builder = SchemaBuilder::new();
+//! let text_field_indexing = TextFieldIndexing::default()
+//!     .set_tokenizer("custom_en")
+//!     .set_index_option(IndexRecordOption::WithFreqsAndPositions);
+//! let text_options = TextOptions::default()
+//!     .set_indexing_options(text_field_indexing)
+//!     .set_stored();
+//! schema_builder.add_text_field("title", text_options);
+//! let schema = schema_builder.build();
+//! let index = Index::create_in_ram(schema);
+//!
+//! // We need to register our tokenizer:
+//! let custom_en_tokenizer = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! index
+//!     .tokenizers()
+//!     .register("custom_en", custom_en_tokenizer);
+//! // ...
 //! # }
 //! ```
 //!
 mod tokenizer;
 mod simple_tokenizer;
 mod lower_caser;
@@ -50,7 +50,7 @@ impl Default for TokenizerManager {
     /// the default pre-configured tokenizers of `tantivy`.
     /// - simple
     /// - en_stem
-    /// - jp
+    /// - ja
     fn default() -> TokenizerManager {
        let manager = TokenizerManager {
            tokenizers: Arc::new(RwLock::new(HashMap::new()))
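A minimal sketch using only calls that appear in this diff: the `Default` impl pre-registers tantivy's stock tokenizers, and `register` adds a custom pipeline under a name of your choosing (the name `custom_en` is illustrative):

```rust
extern crate tantivy;

use tantivy::tokenizer::*;

fn main() {
    // Starts out with the pre-configured tokenizers listed in the rustdoc above.
    let manager = TokenizerManager::default();
    // Add a chained Tokenizer + TokenFilter pipeline under a new name.
    manager.register(
        "custom_en",
        SimpleTokenizer
            .filter(RemoveLongFilter::limit(40))
            .filter(LowerCaser),
    );
}
```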