From aaeeda2bc5f942cdc6ea80332f40ac45456560ad Mon Sep 17 00:00:00 2001
From: Paul Masurel
Date: Sat, 25 Nov 2017 13:23:32 +0900
Subject: [PATCH] Editing rustdoc

---
 src/core/index.rs                  |   4 +-
 src/directory/mmap_directory.rs    |   7 +-
 src/termdict/mod.rs                |   6 +-
 src/tokenizer/mod.rs               | 136 +++++++++++++++++++++++++----
 src/tokenizer/tokenizer_manager.rs |   2 +-
 5 files changed, 126 insertions(+), 29 deletions(-)

diff --git a/src/core/index.rs b/src/core/index.rs
index ce1284474..e790fecd9 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -61,7 +61,7 @@ impl Index {
     /// The index will use the `MMapDirectory`.
     ///
     /// If a previous index was in this directory, then its meta file will be destroyed.
-    pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
+    pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         Index::from_directory(directory, schema)
@@ -107,7 +107,7 @@ impl Index {
     }
 
     /// Opens a new directory from an index path.
-    pub fn open(directory_path: &Path) -> Result<Index> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
         let mmap_directory = MmapDirectory::open(directory_path)?;
         let directory = ManagedDirectory::new(mmap_directory)?;
         let metas = load_metas(&directory)?;
diff --git a/src/directory/mmap_directory.rs b/src/directory/mmap_directory.rs
index 970b987cb..74edfc51a 100644
--- a/src/directory/mmap_directory.rs
+++ b/src/directory/mmap_directory.rs
@@ -23,11 +23,11 @@ use std::sync::RwLock;
 use std::sync::Weak;
 use tempdir::TempDir;
 
-fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
+fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
     let file = File::open(&full_path).map_err(|e| {
         if e.kind() == io::ErrorKind::NotFound {
-            OpenReadError::FileDoesNotExist(full_path.clone())
+            OpenReadError::FileDoesNotExist(full_path.to_owned())
         } else {
             OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
         }
     })?;
@@ -180,7 +180,8 @@ impl MmapDirectory {
     ///
     /// Returns an error if the `directory_path` does not
     /// exist or if it is not a directory.
-    pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
+    pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
+        let directory_path: &Path = directory_path.as_ref();
         if !directory_path.exists() {
             Err(OpenDirectoryError::DoesNotExist(
                 PathBuf::from(directory_path),
diff --git a/src/termdict/mod.rs b/src/termdict/mod.rs
index 9150b8f85..7f004c87e 100644
--- a/src/termdict/mod.rs
+++ b/src/termdict/mod.rs
@@ -5,14 +5,12 @@ that serves as an address in their respective posting list.
 The term dictionary API makes it possible to iterate through
 a range of keys in a sorted manner.
 
-```
-
 # Implementations
 
 There are currently two implementations of the term dictionary.
 
-## Default implementation : `fstdict`
+## Default implementation: *fstdict*
 
 The default one relies heavily on the `fst` crate.
 It associates each term's `&[u8]` representation to a `u64`
@@ -20,7 +18,7 @@ that is in fact an address in a buffer.
 The value is then accessible
 by deserializing the value at this address.
 
-## Stream implementation : `streamdict`
+## Stream implementation: *streamdict*
 
 The `fstdict` is a tiny bit slow when streaming all of
 the terms.
diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs
index b8905909d..bb423c888 100644
--- a/src/tokenizer/mod.rs
+++ b/src/tokenizer/mod.rs
@@ -1,35 +1,133 @@
-//! Tokenizer are in charge of processing text for indexing.
+//! Tokenizers are in charge of chopping text into a stream of tokens
+//! ready for indexing.
 //!
-//! An tokenizer is a configurable pipeline that starts by a `Tokenizer`,
-//! followed by a sequence of [`TokenFilter`s](./trait.TokenFilter.html) to it.
+//! You must define in your schema which tokenizer should be used for
+//! each of your fields:
 //!
-//! The `Tokenizer` is in charge of chopping the text into tokens. There is no
-//! trait called `Tokenizer`. Instead `Tokenizer` like [`SimpleTokenizer`](./struct.SimpleTokenizer.html)
-//! are just directly implementing the tokenizer trait.
+//! ```
+//! extern crate tantivy;
+//! use tantivy::schema::*;
 //!
-//! - choosing a tokenizer. A tokenizer is in charge of chopping your text into token.
-//! - adding so called filter to modify your tokens (e.g. filter out stop words, apply stemming etc.)
+//! # fn main() {
+//! let mut schema_builder = SchemaBuilder::new();
+//!
+//! let text_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("en_stem")
+//!             .set_index_option(IndexRecordOption::Basic)
+//!     )
+//!     .set_stored();
+//!
+//! let id_options = TextOptions::default()
+//!     .set_indexing_options(
+//!         TextFieldIndexing::default()
+//!             .set_tokenizer("raw_ids")
+//!             .set_index_option(IndexRecordOption::WithFreqsAndPositions)
+//!     )
+//!     .set_stored();
+//!
+//! schema_builder.add_text_field("title", text_options.clone());
+//! schema_builder.add_text_field("text", text_options);
+//! schema_builder.add_text_field("uuid", id_options);
+//!
+//! let schema = schema_builder.build();
+//! # }
+//! ```
+//!
+//! By default, `tantivy` offers the following tokenizers:
+//!
+//! ## `default`
+//!
+//! `default` is the tokenizer that will be used if you do not
+//! assign a specific tokenizer to your text field.
+//! It chops your text on punctuation and whitespace,
+//! removes tokens that are longer than 40 chars, and lowercases your text.
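+//!
+//! Going by this description, an equivalent pipeline can presumably be
+//! assembled by hand from the building blocks of this module (a sketch
+//! only: the `default_like` binding below is not part of tantivy, and
+//! the actual definition of `default` may differ):
+//!
+//! ```rust
+//! # extern crate tantivy;
+//! use tantivy::tokenizer::*;
+//!
+//! # fn main() {
+//! // chop on punctuation/whitespace, drop tokens over 40 chars, lowercase
+//! let default_like = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! # }
+//! ```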
+//!
+//! ## `raw`
+//!
+//! Does not actually tokenize your text. It keeps it entirely unprocessed.
+//! It can be useful for indexing uuids or urls, for instance.
+//!
+//! ## `en_stem`
+//!
+//! In addition to what `default` does, the `en_stem` tokenizer also
+//! applies stemming to your tokens. Stemming consists of trimming words
+//! to remove their inflection. This tokenizer is slower than the default
+//! one, but is recommended to improve recall.
+//!
+//! # Custom tokenizers
+//!
+//! You can write your own tokenizer by implementing the
+//! [`Tokenizer`](./trait.Tokenizer.html) trait, or you can extend an existing
+//! [`Tokenizer`](./trait.Tokenizer.html) by chaining it with several
+//! [`TokenFilter`s](./trait.TokenFilter.html).
+//!
+//! For instance, the `en_stem` tokenizer is defined as follows.
+//!
+//! ```rust
+//! # extern crate tantivy;
+//!
+//! use tantivy::tokenizer::*;
+//!
+//! # fn main() {
+//! let en_stem = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser)
+//!     .filter(Stemmer::new());
+//! # }
+//! ```
+//!
+//! Once your tokenizer is defined, you need to register it with a name
+//! in your index's [`TokenizerManager`](./struct.TokenizerManager.html).
+//!
+//! ```
+//! # extern crate tantivy;
+//! # use tantivy::schema::SchemaBuilder;
+//! # use tantivy::tokenizer::*;
+//! # use tantivy::Index;
+//! # fn main() {
+//! # let custom_en_tokenizer = SimpleTokenizer;
+//! # let schema = SchemaBuilder::new().build();
+//! let index = Index::create_in_ram(schema);
+//! index.tokenizers()
+//!     .register("custom_en", custom_en_tokenizer);
+//! # }
+//! ```
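+//!
+//! A tokenizer can also be run directly, which is handy to check which
+//! tokens a pipeline actually emits (this reuses the `token_stream` /
+//! `process` API shown in the example this documentation previously used):
+//!
+//! ```rust
+//! # extern crate tantivy;
+//! # use tantivy::tokenizer::*;
+//! # fn main() {
+//! let mut tokenizer = SimpleTokenizer
+//!     .filter(RemoveLongFilter::limit(40))
+//!     .filter(LowerCaser);
+//! tokenizer
+//!     .token_stream("Hello, happy tax payer")
+//!     .process(&mut |token| {
+//!         // print every token produced by the pipeline
+//!         println!("token {:?}", token.text);
+//!     });
+//! # }
+//! ```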
.register("custom_en", custom_en_tokenizer); +//! # } +//! ``` +//! +//! If you built your schema programmatically, a complete example +//! could like this for instance. //! //! # Example //! //! ``` //! extern crate tantivy; +//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing}; //! use tantivy::tokenizer::*; -//! -//! // ... +//! use tantivy::Index; //! //! # fn main() { -//! let mut tokenizer = SimpleTokenizer -//! .filter(RemoveLongFilter::limit(40)) -//! .filter(LowerCaser); -//! tokenizer -//! .token_stream("Hello, happy tax payer") -//! .process(&mut |token| { -//! println!("token {:?}", token.text); -//! }); +//! let mut schema_builder = SchemaBuilder::new(); +//! let text_field_indexing = TextFieldIndexing::default() +//! .set_tokenizer("custom_en") +//! .set_index_option(IndexRecordOption::WithFreqsAndPositions); +//! let text_options = TextOptions::default() +//! .set_indexing_options(text_field_indexing) +//! .set_stored(); +//! schema_builder.add_text_field("title", text_options); +//! let schema = schema_builder.build(); +//! let index = Index::create_in_ram(schema); +//! +//! // We need to register our tokenizer : +//! let custom_en_tokenizer = SimpleTokenizer +//! .filter(RemoveLongFilter::limit(40)) +//! .filter(LowerCaser); +//! index +//! .tokenizers() +//! .register("custom_en", custom_en_tokenizer); +//! // ... //! # } //! ``` - +//! mod tokenizer; mod simple_tokenizer; mod lower_caser; diff --git a/src/tokenizer/tokenizer_manager.rs b/src/tokenizer/tokenizer_manager.rs index e405f7596..286d7ae6d 100644 --- a/src/tokenizer/tokenizer_manager.rs +++ b/src/tokenizer/tokenizer_manager.rs @@ -50,7 +50,7 @@ impl Default for TokenizerManager { /// the default pre-configured tokenizers of `tantivy`. /// - simple /// - en_stem - /// - jp + /// - ja fn default() -> TokenizerManager { let manager = TokenizerManager { tokenizers: Arc::new(RwLock::new(HashMap::new()))