From aaeeda2bc5f942cdc6ea80332f40ac45456560ad Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sat, 25 Nov 2017 13:23:32 +0900
Subject: [PATCH] Editing rustdoc

---
 src/core/index.rs                  |   4 +-
 src/directory/mmap_directory.rs    |   7 +-
 src/termdict/mod.rs                |   6 +-
 src/tokenizer/mod.rs               | 136 +++++++++++++++++++++++++----
 src/tokenizer/tokenizer_manager.rs |   2 +-
 5 files changed, 126 insertions(+), 29 deletions(-)

diff --git a/src/core/index.rs b/src/core/index.rs
index ce1284474..e790fecd9 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -61,7 +61,7 @@ impl Index {
     /// The index will use the `MMapDirectory`.
     ///
     /// If a previous index was in this directory, then its meta file will be destroyed.
-    pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
+    pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         Index::from_directory(directory, schema)
@@ -107,7 +107,7 @@ impl Index {
     }
 
     /// Opens a new directory from an index path.
-    pub fn open(directory_path: &Path) -> Result<Index> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         let metas = load_metas(&directory)?;
diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
index 970b987cb..74edfc51a 100644
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -23,11 +23,11 @@ use std::sync::RwLock;
 use std::sync::Weak;
 use tempdir::TempDir;
 
-fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
+fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
     let file = File::open(&full_path).map_err(|e| {
         if e.kind() == io::ErrorKind::NotFound {
-            OpenReadError::FileDoesNotExist(full_path.clone())
+            OpenReadError::FileDoesNotExist(full_path.to_owned())
         } else {
             OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
         }
     })?;
@@ -180,7 +180,8 @@ impl MmapDirectory {
     ///
     /// Returns an error if the `directory_path` does not
     /// exist or if it is not a directory.
-    pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
+        let directory_path: &Path = directory_path.as_ref();
         if !directory_path.exists() {
             Err(OpenDirectoryError::DoesNotExist(
                 PathBuf::from(directory_path),
diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs
index 9150b8f85..7f004c87e 100644
--- a/src/termdict/mod.rs
+++ b/src/termdict/mod.rs
@@ -5,14 +5,12 @@ that serves as an address in their respective posting list.
 The term dictionary API makes it possible to iterate through
 a range of keys in a sorted manner.
 
-```
-
 # Implementations
 
 There are currently two implementations of the term dictionary.
 
-## Default implementation : `fstdict`
+## Default implementation: *fstdict*
 
 The default one relies heavily on the `fst` crate.
 It associates each term's `&[u8]` representation to a `u64`
@@ -20,7 +18,7 @@ that is in fact an address in a buffer.
 The value is then accessible
 by deserializing the value at this address.
 
-## Stream implementation : `streamdict`
+## Stream implementation: *streamdict*
 
 The `fstdict` is a tiny bit slow when streaming all of
 the terms.
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index b8905909d..bb423c888 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,35 +1,133 @@
-//! Tokenizer are in charge of processing text for indexing.
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing.
 //!
-//! An tokenizer is a configurable pipeline that starts by a `Tokenizer`,
-//! followed by a sequence of [`TokenFilter`s](./trait.TokenFilter.html) to it.
+//! You must define in your schema which tokenizer should be used for
+//! each of your fields:
 //!
-//! The `Tokenizer` is in charge of chopping the text into tokens. There is no
-//! trait called `Tokenizer`. Instead `Tokenizer` like [`SimpleTokenizer`](./struct.SimpleTokenizer.html)
-//! are just directly implementing the tokenizer trait.
+//! ```
+//! extern crate tantivy;
+//! use tantivy::schema::*;
 //!
-//! - choosing a tokenizer. A tokenizer is in charge of chopping your text into token.
-//! - adding so called filter to modify your tokens (e.g. filter out stop words, apply stemming etc.)
+//! # fn main() {
+//! let mut schema_builder = SchemaBuilder::new();
+//!
+//! let text_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("en_stem")
+//!             .set_index_option(IndexRecordOption::Basic)
+//!     )
+//!     .set_stored();
+//!
+//! let id_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("raw_ids")
+//!             .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+//!     )
+//!     .set_stored();
+//!
+//! schema_builder.add_text_field("title", text_options.clone());
+//! schema_builder.add_text_field("text", text_options);
+//! schema_builder.add_text_field("uuid", id_options);
+//!
+//! let schema = schema_builder.build();
+//! # }
+//! ```
+//!
+//! By default, `tantivy` offers the following tokenizers:
+//!
+//! ## `default`
+//!
+//! `default` is the tokenizer that will be used if you do not
+//! assign a specific tokenizer to your text field.
+//! It chops your text on punctuation and whitespace,
+//! removes tokens that are longer than 40 chars, and lowercases your text.
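+//!
+//! Going by this description, an equivalent pipeline can presumably be
+//! assembled by hand from the building blocks of this module (a sketch
+//! only: the `default_like` binding below is not part of tantivy, and
+//! the actual definition of `default` may differ):
+//!
+//! ```rust
+//! # extern crate tantivy;
+//! use tantivy::tokenizer::*;
+//!
+//! # fn main() {
+//! // chop on punctuation/whitespace, drop tokens over 40 chars, lowercase
+//! let default_like = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! # }
+//! ```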
+//!
+//! ## `raw`
+//!
+//! Does not actually tokenize your text. It keeps it entirely unprocessed.
+//! It can be useful for indexing uuids or urls, for instance.
+//!
+//! ## `en_stem`
+//!
+//! In addition to what `default` does, the `en_stem` tokenizer also
+//! applies stemming to your tokens. Stemming consists of trimming words
+//! to remove their inflection. This tokenizer is slower than the default
+//! one, but is recommended to improve recall.
+//!
+//! # Custom tokenizers
+//!
+//! You can write your own tokenizer by implementing the
+//! [`Tokenizer`](./trait.Tokenizer.html) trait, or you can extend an existing
+//! [`Tokenizer`](./trait.Tokenizer.html) by chaining it with several
+//! [`TokenFilter`s](./trait.TokenFilter.html).
+//!
+//! For instance, the `en_stem` tokenizer is defined as follows.
+//!
+//! ```rust
+//! # extern crate tantivy;
+//!
+//! use tantivy::tokenizer::*;
+//!
+//! # fn main() {
+//! let en_stem = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser)
+//!     .filter(Stemmer::new());
+//! # }
+//! ```
+//!
+//! Once your tokenizer is defined, you need to register it with a name
+//! in your index's [`TokenizerManager`](./struct.TokenizerManager.html).
+//!
+//! ```
+//! # extern crate tantivy;
+//! # use tantivy::schema::SchemaBuilder;
+//! # use tantivy::tokenizer::*;
+//! # use tantivy::Index;
+//! # fn main() {
+//! # let custom_en_tokenizer = SimpleTokenizer;
+//! # let schema = SchemaBuilder::new().build();
+//! let index = Index::create_in_ram(schema);
+//! index.tokenizers()
+//!     .register("custom_en", custom_en_tokenizer);
+//! # }
+//! ```
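+//!
+//! A tokenizer can also be run directly, which is handy to check which
+//! tokens a pipeline actually emits (this reuses the `token_stream` /
+//! `process` API shown in the example this documentation previously used):
+//!
+//! ```rust
+//! # extern crate tantivy;
+//! # use tantivy::tokenizer::*;
+//! # fn main() {
+//! let mut tokenizer = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! tokenizer
+//!     .token_stream("Hello, happy tax payer")
+//!     .process(&mut |token| {
+//!         // print every token produced by the pipeline
+//!         println!("token {:?}", token.text);
+//!     });
+//! # }
+//! ```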
.register("custom_en", custom_en_tokenizer); +//! # } +//! ``` +//! +//! If you built your schema programmatically, a complete example +//! could like this for instance. //! //! # Example //! //! ``` //! extern crate tantivy; +//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; //! use tantivy::tokenizer::*; -//! -//! // ... +//! use tantivy::Index; //! //! # fn main() { -//! let mut tokenizer = SimpleTokenizer -//! .filter(RemoveLongFilter::limit(40)) -//! .filter(LowerCaser); -//! tokenizer -//! .token_stream("Hello, happy tax payer") -//! .process(&mut |token| { -//! println!("token {:?}", token.text); -//! }); +//! let mut schema_builder = SchemaBuilder::new(); +//! let text_field_indexing = TextFieldIndexing::default() +//! .set_tokenizer("custom_en") +//! .set_index_option(IndexRecordOption::WithFreqsAndPositions); +//! let text_options = TextOptions::default() +//! .set_indexing_options(text_field_indexing) +//! .set_stored(); +//! schema_builder.add_text_field("title", text_options); +//! let schema = schema_builder.build(); +//! let index = Index::create_in_ram(schema); +//! +//! // We need to register our tokenizer : +//! let custom_en_tokenizer = SimpleTokenizer +//! .filter(RemoveLongFilter::limit(40)) +//! .filter(LowerCaser); +//! index +//! .tokenizers() +//! .register("custom_en", custom_en_tokenizer); +//! // ... //! # } //! ``` - +//! mod tokenizer; mod simple_tokenizer; mod lower_caser; diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index e405f7596..286d7ae6d 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -50,7 +50,7 @@ impl Default for TokenizerManager { /// the default pre-configured tokenizers of `tantivy`. /// - simple /// - en_stem - /// - jp + /// - ja fn default() -> TokenizerManager { let manager = TokenizerManager { tokenizers: Arc::new(RwLock::new(HashMap::new()))