Editing rustdoc

Paul Masurel
2017-11-25 13:23:32 +09:00
parent ac4d433fad
commit aaeeda2bc5
5 changed files with 126 additions and 29 deletions

View File

@@ -61,7 +61,7 @@ impl Index {
/// The index will use the `MMapDirectory`.
///
/// If a previous index was in this directory, then its meta file will be destroyed.
pub fn create(directory_path: &Path, schema: Schema) -> Result<Index> {
pub fn create<P: AsRef<Path>>(directory_path: P, schema: Schema) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
Index::from_directory(directory, schema)
@@ -107,7 +107,7 @@ impl Index {
}
/// Opens a new directory from an index path.
pub fn open(directory_path: &Path) -> Result<Index> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<Index> {
let mmap_directory = MmapDirectory::open(directory_path)?;
let directory = ManagedDirectory::new(mmap_directory)?;
let metas = load_metas(&directory)?;
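Taken together, the two relaxed signatures mean callers can pass a `&str`, `String`, or `PathBuf` directly instead of building a `&Path` by hand. A minimal caller-side sketch (the directory path and the single `TEXT` field below are illustrative, not part of this commit):

```rust
extern crate tantivy;

use std::fs;
use tantivy::schema::{SchemaBuilder, TEXT};
use tantivy::Index;

fn main() {
    let mut schema_builder = SchemaBuilder::new();
    schema_builder.add_text_field("title", TEXT);
    let schema = schema_builder.build();

    // The directory must already exist; the path here is just an example.
    fs::create_dir_all("/tmp/tantivy-example-index").unwrap();

    // `&str`, `String` and `PathBuf` all satisfy `AsRef<Path>`,
    // so callers no longer have to write `Path::new("...")` themselves.
    let index = Index::create("/tmp/tantivy-example-index", schema)
        .expect("failed to create index");
    drop(index);
    let _reopened = Index::open("/tmp/tantivy-example-index")
        .expect("failed to open index");
}
```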

View File

@@ -23,11 +23,11 @@ use std::sync::RwLock;
use std::sync::Weak;
use tempdir::TempDir;
fn open_mmap(full_path: &PathBuf) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
fn open_mmap(full_path: &Path) -> result::Result<Option<Arc<Mmap>>, OpenReadError> {
let file = File::open(&full_path).map_err(|e| if e.kind() ==
io::ErrorKind::NotFound
{
OpenReadError::FileDoesNotExist(full_path.clone())
OpenReadError::FileDoesNotExist(full_path.to_owned())
} else {
OpenReadError::IOError(IOError::with_path(full_path.to_owned(), e))
})?;
@@ -180,7 +180,8 @@ impl MmapDirectory {
///
/// Returns an error if the `directory_path` does not
/// exist or if it is not a directory.
pub fn open(directory_path: &Path) -> Result<MmapDirectory, OpenDirectoryError> {
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
if !directory_path.exists() {
Err(OpenDirectoryError::DoesNotExist(
PathBuf::from(directory_path),
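The callee-side idiom shown above generalizes: accept `P: AsRef<Path>` at the public boundary, convert once to `&Path`, and keep internal helpers (such as `open_mmap`) on `&Path` rather than `&PathBuf`. A small standalone sketch of that pattern, not tantivy code:

```rust
use std::path::{Path, PathBuf};

// Internal helper: `&Path` accepts `&PathBuf`, `Path::new("...")`, etc. for free.
fn describe(path: &Path) -> String {
    format!("{} ({} components)", path.display(), path.components().count())
}

// Public entry point: convert the generic argument to `&Path` once, up front.
fn open<P: AsRef<Path>>(directory_path: P) -> Result<String, String> {
    let directory_path: &Path = directory_path.as_ref();
    if !directory_path.exists() {
        return Err(format!("{} does not exist", directory_path.display()));
    }
    Ok(describe(directory_path))
}

fn main() {
    // All of these call sites compile against the same signature.
    let _ = open("/tmp");
    let _ = open(String::from("/tmp"));
    let _ = open(PathBuf::from("/tmp"));
}
```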

View File

@@ -5,14 +5,12 @@ that serves as an address in their respective posting list.
The term dictionary API makes it possible to iterate through
a range of keys in a sorted manner.
```
# Implementations
There are currently two implementations of the term dictionary.
## Default implementation : `fstdict`
## Default implementation : *fstdict*
The default one relies heavily on the `fst` crate.
It associates each term's `&[u8]` representation with a `u64`
@@ -20,7 +18,7 @@ that is in fact an address in a buffer. The value is then accessible
by deserializing the value at this address.
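To make the `fstdict` layout concrete, here is a small standalone sketch using the `fst` crate directly; the keys and values are illustrative, and the `u64` payload stands in for the address in the value buffer:

```rust
extern crate fst;

use fst::Map;

fn main() {
    // Keys must be inserted in lexicographic order.
    let map = Map::from_iter(vec![
        ("apple", 0u64),
        ("banana", 17u64),
        ("cherry", 42u64),
    ]).expect("keys are sorted, so building the map cannot fail");

    // Looking up a term yields the associated u64 payload.
    assert_eq!(map.get("banana"), Some(17));
    assert_eq!(map.get("durian"), None);
}
```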
## Stream implementation : `streamdict`
## Stream implementation : *streamdict*
The `fstdict` is a tiny bit slow when streaming all of
the terms.

View File

@@ -1,35 +1,133 @@
//! Tokenizers are in charge of processing text for indexing.
//! Tokenizers are in charge of chopping text into a stream of tokens
//! ready for indexing.
//!
//! A tokenizer is a configurable pipeline that starts with a `Tokenizer`,
//! followed by a sequence of [`TokenFilter`s](./trait.TokenFilter.html).
//! You must define in your schema which tokenizer should be used for
//! each of your fields:
//!
//! The `Tokenizer` is in charge of chopping the text into tokens. There is no
//! concrete `Tokenizer` type. Instead, tokenizers like [`SimpleTokenizer`](./struct.SimpleTokenizer.html)
//! directly implement the tokenizer trait.
//! ```
//! extern crate tantivy;
//! use tantivy::schema::*;
//!
//! - choosing a tokenizer. A tokenizer is in charge of chopping your text into tokens.
//! - adding so-called filters to modify your tokens (e.g. filtering out stop words, applying stemming, etc.)
//! # fn main() {
//! let mut schema_builder = SchemaBuilder::new();
//!
//! let text_options = TextOptions::default()
//! .set_indexing_options(
//! TextFieldIndexing::default()
//! .set_tokenizer("en_stem")
//! .set_index_option(IndexRecordOption::Basic)
//! )
//! .set_stored();
//!
//! let id_options = TextOptions::default()
//! .set_indexing_options(
//! TextFieldIndexing::default()
//! .set_tokenizer("raw_ids")
//! .set_index_option(IndexRecordOption::WithFreqsAndPositions)
//! )
//! .set_stored();
//!
//! schema_builder.add_text_field("title", text_options.clone());
//! schema_builder.add_text_field("text", text_options);
//! schema_builder.add_text_field("uuid", id_options);
//!
//! let schema = schema_builder.build();
//! # }
//! ```
//!
//! By default, `tantivy` offers the following tokenizers:
//!
//! ## `default`
//!
//! `default` is the tokenizer that will be used if you do not
//! assign a specific tokenizer to your text field.
//! It chops your text on punctuation and whitespace,
//! removes tokens that are longer than 40 chars, and lowercases your text.
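//!
//! For instance, a pipeline equivalent to `default` (sketched below from the
//! same building blocks) should turn `"Hello, happy TAX payer!"` into the
//! tokens `hello`, `happy`, `tax` and `payer`:
//!
//! ```
//! # extern crate tantivy;
//! # use tantivy::tokenizer::*;
//! # fn main() {
//! let mut tokenizer = SimpleTokenizer
//!     .filter(RemoveLongFilter::limit(40))
//!     .filter(LowerCaser);
//! let mut tokens = Vec::new();
//! tokenizer
//!     .token_stream("Hello, happy TAX payer!")
//!     .process(&mut |token| {
//!         tokens.push(token.text.clone());
//!     });
//! assert_eq!(tokens, vec!["hello", "happy", "tax", "payer"]);
//! # }
//! ```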
//!
//! ## `raw`
//! Does not actually tokenize your text. It keeps it entirely unprocessed.
//! It can be useful for indexing uuids or urls, for instance.
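//!
//! A small sketch, assuming the `raw` pipeline is backed by the
//! `RawTokenizer` building block (an assumption made for illustration):
//!
//! ```
//! # extern crate tantivy;
//! # use tantivy::tokenizer::*;
//! # fn main() {
//! let mut raw = RawTokenizer;
//! raw.token_stream("550e8400-e29b-41d4-a716-446655440000")
//!     .process(&mut |token| {
//!         // The whole input should come back as one single, untouched token.
//!         assert_eq!(token.text, "550e8400-e29b-41d4-a716-446655440000");
//!     });
//! # }
//! ```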
//!
//! ## `en_stem`
//!
//! In addition to what `default` does, the `en_stem` tokenizer also
//! applies stemming to your tokens. Stemming consists of trimming words to
//! remove their inflection. This tokenizer is slower than the default one,
//! but is recommended to improve recall.
//!
//!
//! # Custom tokenizers
//!
//! You can write your own tokenizer by implementing the [`Tokenizer`](./trait.Tokenizer.html) trait,
//! or you can extend an existing [`Tokenizer`](./trait.Tokenizer.html) by chaining it with several
//! [`TokenFilter`s](./trait.TokenFilter.html).
//!
//! For instance, the `en_stem` tokenizer is defined as follows.
//!
//! ```rust
//! # extern crate tantivy;
//!
//! use tantivy::tokenizer::*;
//!
//! # fn main() {
//! let en_stem = SimpleTokenizer
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser)
//! .filter(Stemmer::new());
//! # }
//! ```
//!
//! Once your tokenizer is defined, you need to
//! register it with a name in your index's [TokenizerManager](./struct.TokenizerManager.html).
//!
//! ```
//! # extern crate tantivy;
//! # use tantivy::schema::SchemaBuilder;
//! # use tantivy::tokenizer::*;
//! # use tantivy::Index;
//! # fn main() {
//! # let custom_en_tokenizer = SimpleTokenizer;
//! # let schema = SchemaBuilder::new().build();
//! let index = Index::create_in_ram(schema);
//! index.tokenizers()
//! .register("custom_en", custom_en_tokenizer);
//! # }
//! ```
//!
//! If you build your schema programmatically, a complete example
//! could look like this.
//!
//! # Example
//!
//! ```
//! extern crate tantivy;
//! use tantivy::schema::{SchemaBuilder, IndexRecordOption, TextOptions, TextFieldIndexing};
//! use tantivy::tokenizer::*;
//!
//! // ...
//! use tantivy::Index;
//!
//! # fn main() {
//! let mut tokenizer = SimpleTokenizer
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! tokenizer
//! .token_stream("Hello, happy tax payer")
//! .process(&mut |token| {
//! println!("token {:?}", token.text);
//! });
//! let mut schema_builder = SchemaBuilder::new();
//! let text_field_indexing = TextFieldIndexing::default()
//! .set_tokenizer("custom_en")
//! .set_index_option(IndexRecordOption::WithFreqsAndPositions);
//! let text_options = TextOptions::default()
//! .set_indexing_options(text_field_indexing)
//! .set_stored();
//! schema_builder.add_text_field("title", text_options);
//! let schema = schema_builder.build();
//! let index = Index::create_in_ram(schema);
//!
//! // We need to register our tokenizer:
//! let custom_en_tokenizer = SimpleTokenizer
//! .filter(RemoveLongFilter::limit(40))
//! .filter(LowerCaser);
//! index
//! .tokenizers()
//! .register("custom_en", custom_en_tokenizer);
//! // ...
//! # }
//! ```
//!
mod tokenizer;
mod simple_tokenizer;
mod lower_caser;

View File

@@ -50,7 +50,7 @@ impl Default for TokenizerManager {
/// the default pre-configured tokenizers of `tantivy`.
/// - simple
/// - en_stem
/// - jp
/// - ja
fn default() -> TokenizerManager {
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new()))
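For reference, a small sketch of how a user-facing manager gets an extra pipeline registered next to the pre-configured ones; the name "lowercase_only" and the pipeline itself are purely illustrative:

```rust
extern crate tantivy;

use tantivy::tokenizer::*;

fn main() {
    // Starts out with the pre-configured tokenizers listed above.
    let manager = TokenizerManager::default();

    // Additional pipelines can be registered under a name of your choice.
    manager.register("lowercase_only", SimpleTokenizer.filter(LowerCaser));
}
```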