diff --git a/src/core/mod.rs b/src/core/mod.rs
index 3a6c9568a..ac2d122e8 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -9,6 +9,7 @@ mod pool;
 mod segment_meta;
 mod inverted_index_reader;
 
+pub use self::inverted_index_reader::InvertedIndexReader;
 pub use self::searcher::Searcher;
 pub use self::segment_component::SegmentComponent;
diff --git a/src/query/query_parser/query_parser.rs b/src/query/query_parser/query_parser.rs
index 3c9061500..22699d0fe 100644
--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -81,7 +81,7 @@ pub struct QueryParser {
 }
 
 impl QueryParser {
-    /// Creates a `QueryParser`
+    /// Creates a `QueryParser`, given
     /// * schema - index Schema
     /// * default_fields - fields used to search if no field is specifically defined
     ///   in the query.
@@ -96,6 +96,10 @@ impl QueryParser {
         }
     }
 
+    /// Creates a `QueryParser`, given
+    /// * an index
+    /// * a set of default fields used to search if no field is specifically defined
+    ///   in the query.
     pub fn for_index(index: Index, default_fields: Vec<Field>) -> QueryParser {
         QueryParser::new(
@@ -106,7 +110,9 @@ impl QueryParser {
 
     /// Set the default way to compose queries to a conjunction.
     ///
-    /// By default a ,
+    /// By default, the query `happy tax payer` is equivalent to the query
+    /// `happy OR tax OR payer`. After calling `.set_conjunction_by_default()`,
+    /// `happy tax payer` will be interpreted by the parser as `happy AND tax AND payer`.
     pub fn set_conjunction_by_default(&mut self) {
         self.conjunction_by_default = true;
     }
diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs
index bfc97482b..7a1fd9d9d 100644
--- a/src/schema/index_record_option.rs
+++ b/src/schema/index_record_option.rs
@@ -1,15 +1,14 @@
-/// `IndexRecordOption` describes an amount of information associated
-/// for a given field.
+/// `IndexRecordOption` describes the amount of information associated
+/// with a given indexed field.
 ///
-/// It is used in the schema to configure how much data should be
-/// indexed for a given field.
+/// It is used both to:
 ///
-/// It is also used to describe the amount of information that
-/// you want to be decoded as you go through a posting list.
-///
-/// For instance, positions are useful when running phrase queries
-/// but useless for most queries.
+/// * describe in the schema the amount of information
+///   that should be retained during indexing (see [TextFieldIndexing::set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option))
+/// * request a given
+///   amount of information to be decoded as one goes through a posting list
+///   (see [InvertedIndexReader::read_postings](../struct.InvertedIndexReader.html#method.read_postings))
 ///
 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
 pub enum IndexRecordOption {
@@ -17,10 +16,12 @@ pub enum IndexRecordOption {
     #[serde(rename = "basic")]
     Basic,
     /// records the document ids as well as the term frequency.
+    /// The term frequency can help produce better scores for the documents.
     #[serde(rename = "freq")]
     WithFreqs,
     /// records the document id, the term frequency and the positions of
     /// the occurences in the document.
+    /// Positions are required to run [PhraseQueries](../query/struct.PhraseQuery.html).
     #[serde(rename = "position")]
     WithFreqsAndPositions,
 }
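The `set_conjunction_by_default` documentation added above can be exercised with a small program. This is a minimal sketch, not part of the change: it assumes a schema with a single text field named "body", and that `SchemaBuilder`, `Index::create_in_ram` and `parse_query` behave as in the tantivy version this diff targets.

```rust
extern crate tantivy;

use tantivy::Index;
use tantivy::query::QueryParser;
use tantivy::schema::{SchemaBuilder, TEXT};

fn main() {
    // Hypothetical one-field schema, used only for illustration.
    let mut schema_builder = SchemaBuilder::default();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut query_parser = QueryParser::for_index(index, vec![body]);
    // Default behaviour: `happy tax payer` parses as `happy OR tax OR payer`.
    let _or_query = query_parser.parse_query("happy tax payer").unwrap();

    // After the switch: `happy tax payer` parses as `happy AND tax AND payer`.
    query_parser.set_conjunction_by_default();
    let _and_query = query_parser.parse_query("happy tax payer").unwrap();
}
```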
#[serde(rename = "position")] WithFreqsAndPositions, } diff --git a/src/schema/text_options.rs b/src/schema/text_options.rs index 46ea89b98..058e3d256 100644 --- a/src/schema/text_options.rs +++ b/src/schema/text_options.rs @@ -61,15 +61,21 @@ impl Default for TextFieldIndexing { } impl TextFieldIndexing { + /// Sets the tokenizer to be used for a given field. pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing { self.tokenizer = Cow::Owned(tokenizer_name.to_string()); self } + /// Returns the tokenizer that will be used for this field. pub fn tokenizer(&self) -> &str { &self.tokenizer } + + /// Sets which information should be indexed with the tokens. + /// + /// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail. pub fn set_index_option(mut self, index_option: IndexRecordOption) -> TextFieldIndexing { self.record = index_option; self diff --git a/src/tokenizer/lower_caser.rs b/src/tokenizer/lower_caser.rs index 18da55654..c603709f1 100644 --- a/src/tokenizer/lower_caser.rs +++ b/src/tokenizer/lower_caser.rs @@ -1,10 +1,10 @@ -use super::{TokenFilterFactory, TokenStream, Token}; +use super::{TokenFilter, TokenStream, Token}; /// Token filter that lowercase terms. #[derive(Clone)] pub struct LowerCaser; -impl TokenFilterFactory for LowerCaser +impl TokenFilter for LowerCaser where TailTokenStream: TokenStream { type ResultTokenStream = LowerCaserTokenStream; diff --git a/src/tokenizer/mod.rs b/src/tokenizer/mod.rs index bb423c888..94431797f 100644 --- a/src/tokenizer/mod.rs +++ b/src/tokenizer/mod.rs @@ -139,7 +139,7 @@ mod token_stream_chain; mod raw_tokenizer; -pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilterFactory, TokenStream}; +pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilter, TokenStream}; pub use self::tokenizer::BoxedTokenizer; pub use self::tokenizer_manager::TokenizerManager; pub use self::simple_tokenizer::SimpleTokenizer; diff --git a/src/tokenizer/raw_tokenizer.rs b/src/tokenizer/raw_tokenizer.rs index 65875f301..23bd6efe4 100644 --- a/src/tokenizer/raw_tokenizer.rs +++ b/src/tokenizer/raw_tokenizer.rs @@ -1,5 +1,7 @@ use super::{Token, Tokenizer, TokenStream}; + +/// For each value of the field, emit a single unprocessed token. #[derive(Clone)] pub struct RawTokenizer; diff --git a/src/tokenizer/remove_long.rs b/src/tokenizer/remove_long.rs index ff7748cdc..1ba3ba3b4 100644 --- a/src/tokenizer/remove_long.rs +++ b/src/tokenizer/remove_long.rs @@ -1,4 +1,4 @@ -use super::{TokenFilterFactory, TokenStream, Token}; +use super::{TokenFilter, TokenStream, Token}; /// `RemoveLongFilter` removes tokens that are longer @@ -36,7 +36,7 @@ impl RemoveLongFilterStream } -impl TokenFilterFactory for RemoveLongFilter +impl TokenFilter for RemoveLongFilter where TailTokenStream: TokenStream { type ResultTokenStream = RemoveLongFilterStream; diff --git a/src/tokenizer/simple_tokenizer.rs b/src/tokenizer/simple_tokenizer.rs index cf4bbd487..f6e223fc2 100644 --- a/src/tokenizer/simple_tokenizer.rs +++ b/src/tokenizer/simple_tokenizer.rs @@ -2,6 +2,8 @@ use std::str::CharIndices; use super::{Token, Tokenizer, TokenStream}; + +/// Tokenize the text by splitting on whitespaces and punctuation. 
diff --git a/src/tokenizer/stemmer.rs b/src/tokenizer/stemmer.rs
index 29875f018..890c8b551 100644
--- a/src/tokenizer/stemmer.rs
+++ b/src/tokenizer/stemmer.rs
@@ -1,5 +1,5 @@
 use std::sync::Arc;
-use super::{TokenFilterFactory, TokenStream, Token};
+use super::{TokenFilter, TokenStream, Token};
 use rust_stemmers::{self, Algorithm};
 
 #[derive(Clone)]
@@ -13,7 +13,7 @@ impl Stemmer {
     }
 }
 
-impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
+impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
     where TailTokenStream: TokenStream
 {
     type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
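Since `Stemmer`, `LowerCaser` and `RemoveLongFilter` now all implement the renamed `TokenFilter` trait, they still chain through `Tokenizer::filter` exactly as before. The sketch below consumes the resulting stream; it assumes `TokenStream` exposes a `token()` accessor next to the `advance()` method visible in this diff.

```rust
extern crate tantivy;

use tantivy::tokenizer::*;

fn main() {
    // The same chain used as an example in the `filter` documentation.
    let mut en_stem = SimpleTokenizer
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .filter(Stemmer::new());

    let mut token_stream = en_stem.token_stream("Happy tax payers");
    while token_stream.advance() {
        // token() accessor assumed; only advance() is shown in this diff.
        let token = token_stream.token();
        println!("{:?} at position {}", token.text, token.position);
    }
}
```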
diff --git a/src/tokenizer/tokenizer.rs b/src/tokenizer/tokenizer.rs
index eb03e3138..3b862f7ed 100644
--- a/src/tokenizer/tokenizer.rs
+++ b/src/tokenizer/tokenizer.rs
@@ -4,38 +4,8 @@ use std::borrow::{Borrow, BorrowMut};
 
 use tokenizer::TokenStreamChain;
 
-/// Token
-///
-///
-///
-/// # Example
-///
-/// ```
-/// extern crate tantivy;
-/// use tantivy::tokenizer::*;
-///
-/// # fn main() {
-/// let mut tokenizer = SimpleTokenizer
-///     .filter(RemoveLongFilter::limit(40))
-///     .filter(LowerCaser);
-/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "hello");
-///     assert_eq!(token.offset_from, 0);
-///     assert_eq!(token.offset_to, 5);
-///     assert_eq!(token.position, 0);
-/// }
-/// {
-///     let token = token_stream.next().unwrap();
-///     assert_eq!(&token.text, "happy");
-///     assert_eq!(token.offset_from, 7);
-///     assert_eq!(token.offset_to, 12);
-///     assert_eq!(token.position, 1);
-/// }
-/// # }
-/// ```
-/// #
+
+/// Token
 pub struct Token {
     /// Offset (byte index) of the first character of the token.
     /// Offsets shall not be modified by token filters.
@@ -62,17 +32,46 @@ impl Default for Token {
 }
 
-// Warning! TODO may change once associated type constructor
-// land in nightly.
+/// A `Tokenizer` is in charge of splitting text into a stream of tokens
+/// before indexing.
+///
+/// See the [module documentation](./index.html) for more detail.
+///
+/// # Warning
+///
+/// This API may change to use associated types.
 pub trait Tokenizer<'a>: Sized + Clone {
+
+    /// Type of the resulting token stream.
     type TokenStreamImpl: TokenStream;
 
+    /// Creates a token stream for a given `str`.
     fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
 
+    /// Appends a token filter to the current tokenizer.
+    ///
+    /// The method consumes the current `Tokenizer` and returns a
+    /// new one.
+    ///
+    /// # Example
+    ///
+    /// ```rust
+    /// # extern crate tantivy;
+    ///
+    /// use tantivy::tokenizer::*;
+    ///
+    /// # fn main() {
+    /// let en_stem = SimpleTokenizer
+    ///     .filter(RemoveLongFilter::limit(40))
+    ///     .filter(LowerCaser)
+    ///     .filter(Stemmer::new());
+    /// # }
+    /// ```
+    ///
     fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
-        where NewFilter: TokenFilterFactory<<Self as Tokenizer<'a>>::TokenStreamImpl>
+        where NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>
     {
         ChainTokenizer {
             head: new_filter,
@@ -81,6 +80,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
     }
 }
 
+
 pub trait BoxedTokenizer: Send + Sync {
     fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
     fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
@@ -146,6 +146,38 @@ impl<'b> TokenStream for Box<TokenStream + 'b> {
 }
 
+/// `TokenStream` is the result of the tokenization.
+///
+/// It consists of a consumable stream of `Token`s.
+///
+/// # Example
+///
+/// ```
+/// extern crate tantivy;
+/// use tantivy::tokenizer::*;
+///
+/// # fn main() {
+/// let mut tokenizer = SimpleTokenizer
+///     .filter(RemoveLongFilter::limit(40))
+///     .filter(LowerCaser);
+/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "hello");
+///     assert_eq!(token.offset_from, 0);
+///     assert_eq!(token.offset_to, 5);
+///     assert_eq!(token.position, 0);
+/// }
+/// {
+///     let token = token_stream.next().unwrap();
+///     assert_eq!(&token.text, "happy");
+///     assert_eq!(token.offset_from, 7);
+///     assert_eq!(token.offset_to, 12);
+///     assert_eq!(token.position, 1);
+/// }
+/// # }
+/// ```
+///
 pub trait TokenStream {
     fn advance(&mut self) -> bool;
@@ -180,7 +212,7 @@ pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
 
 impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
     for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
-    where HeadTokenFilterFactory: TokenFilterFactory<TailTokenizer::TokenStreamImpl>,
+    where HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
           TailTokenizer: Tokenizer<'a>
 {
     type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
@@ -192,8 +224,9 @@ impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
     }
 }
 
-pub trait TokenFilterFactory<TailTokenStream>: Clone {
+pub trait TokenFilter<TailTokenStream>: Clone {
     type ResultTokenStream: TokenStream;
 
+    /// Wraps a token stream and returns the modified one.
     fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
 }
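For downstream code, the practical impact of the `TokenFilterFactory` to `TokenFilter` rename shows up in custom filters. Below is a sketch of what an implementation against the renamed trait could look like. `RemoveShortFilter` is hypothetical, and the `token()`/`token_mut()` pass-through methods are an assumption about the `TokenStream` trait (only `advance()` is visible in this diff).

```rust
extern crate tantivy;

use tantivy::tokenizer::{Token, TokenFilter, TokenStream};

/// Hypothetical filter that drops tokens shorter than 3 bytes.
#[derive(Clone)]
pub struct RemoveShortFilter;

pub struct RemoveShortFilterStream<TailTokenStream> {
    tail: TailTokenStream,
}

impl<TailTokenStream> TokenStream for RemoveShortFilterStream<TailTokenStream>
    where TailTokenStream: TokenStream
{
    fn advance(&mut self) -> bool {
        // Skip over the tokens we want to filter out.
        while self.tail.advance() {
            if self.tail.token().text.len() >= 3 {
                return true;
            }
        }
        false
    }

    // Assumed accessors, forwarded to the wrapped stream.
    fn token(&self) -> &Token {
        self.tail.token()
    }

    fn token_mut(&mut self) -> &mut Token {
        self.tail.token_mut()
    }
}

impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveShortFilter
    where TailTokenStream: TokenStream
{
    type ResultTokenStream = RemoveShortFilterStream<TailTokenStream>;

    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        RemoveShortFilterStream { tail: token_stream }
    }
}
```

A filter like this would then be appended with `SimpleTokenizer.filter(RemoveShortFilter)`, just like the built-in `LowerCaser` or `Stemmer`.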