Added comments

This commit is contained in:
Paul Masurel
2017-11-26 10:44:49 +09:00
parent aaeeda2bc5
commit acd7c1ea2d
11 changed files with 106 additions and 55 deletions

View File

@@ -9,6 +9,7 @@ mod pool;
mod segment_meta;
mod inverted_index_reader;
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::searcher::Searcher;
pub use self::segment_component::SegmentComponent;

View File

@@ -81,7 +81,7 @@ pub struct QueryParser {
}
impl QueryParser {
/// Creates a `QueryParser`
/// Creates a `QueryParser`, given
/// * schema - index Schema
/// * default_fields - fields used to search if no field is specifically defined
/// in the query.
@@ -96,6 +96,10 @@ impl QueryParser {
}
}
/// Creates a `QueryParser`, given
/// * an index
/// * a set of default fields used to search if no field is specifically defined
/// in the query.
pub fn for_index(index: Index,
default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(
@@ -106,7 +110,9 @@ impl QueryParser {
/// Set the default way to compose queries to a conjunction.
///
/// By default a ,
/// By default, the query `happy tax payer` is equivalent to the query
/// `happy OR tax OR payer`. After calling `.set_conjunction_by_default()`
/// `happy tax payer` will be interpreted by the parser as `happy AND tax AND payer`.
pub fn set_conjunction_by_default(&mut self) {
self.conjunction_by_default = true;
}
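
A minimal usage sketch for the two additions above (not part of the commit): it assumes an existing `index` whose schema has a text field handle `body_field`, and uses `parse_query` as the follow-up call.

```rust
// Hedged sketch: build a parser with the new `for_index` constructor,
// then switch it to AND semantics with `set_conjunction_by_default`.
let mut query_parser = QueryParser::for_index(index, vec![body_field]);
query_parser.set_conjunction_by_default();
// "happy tax payer" now parses as `happy AND tax AND payer`.
let query = query_parser.parse_query("happy tax payer")?;
```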

View File

@@ -1,15 +1,14 @@
/// `IndexRecordOption` describes an amount of information associated
/// for a given field.
/// `IndexRecordOption` describes the amount of information associated
/// with a given indexed field.
///
/// It is used in the schema to configure how much data should be
/// indexed for a given field.
/// It is used both to:
///
/// It is also used to describe the amount of information that
/// you want to be decoded as you go through a posting list.
///
/// For instance, positions are useful when running phrase queries
/// but useless for most queries.
/// * describe in the schema the amount of information
/// that should be retained during indexing (see [TextFieldIndexing::set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option))
/// * request a given amount of information
/// to be decoded as one goes through a posting list
/// (see [InvertedIndexReader::read_postings](../struct.InvertedIndexReader.html#method.read_postings))
///
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
pub enum IndexRecordOption {
@@ -17,10 +16,12 @@ pub enum IndexRecordOption {
#[serde(rename = "basic")]
Basic,
/// records the document ids as well as the term frequency.
/// The term frequency can help give better scores to the documents.
#[serde(rename = "freq")]
WithFreqs,
/// records the document id, the term frequency and the positions of
/// the occurrences in the document.
/// Positions are required to run [PhraseQueries](../query/struct.PhraseQuery.html).
#[serde(rename = "position")]
WithFreqsAndPositions,
}
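
To make the three variants concrete, here is a hedged sketch of a schema opting into positions; `TextOptions`, `SchemaBuilder`, and the exact builder signatures follow tantivy's schema API of this period and should be treated as assumptions.

```rust
// Sketch: request positions at indexing time so that phrase queries
// can later run against the `body` field.
let indexing = TextFieldIndexing::default()
    .set_tokenizer("default")
    .set_index_option(IndexRecordOption::WithFreqsAndPositions);
let options = TextOptions::default().set_indexing_options(indexing);
let mut schema_builder = SchemaBuilder::default();
schema_builder.add_text_field("body", options);
let schema = schema_builder.build();
```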

View File

@@ -61,15 +61,21 @@ impl Default for TextFieldIndexing {
}
impl TextFieldIndexing {
/// Sets the tokenizer to be used for a given field.
pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
self.tokenizer = Cow::Owned(tokenizer_name.to_string());
self
}
/// Returns the tokenizer that will be used for this field.
pub fn tokenizer(&self) -> &str {
&self.tokenizer
}
/// Sets which information should be indexed with the tokens.
///
/// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
pub fn set_index_option(mut self, index_option: IndexRecordOption) -> TextFieldIndexing {
self.record = index_option;
self
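
Since the setters above consume and return `self`, they chain naturally; a small sketch (the tokenizer name `en_stem` is assumed to be registered):

```rust
// Sketch: builder-style configuration, then reading the value back
// with the `tokenizer()` getter documented above.
let indexing = TextFieldIndexing::default()
    .set_tokenizer("en_stem")
    .set_index_option(IndexRecordOption::WithFreqs);
assert_eq!(indexing.tokenizer(), "en_stem");
```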

View File

@@ -1,10 +1,10 @@
use super::{TokenFilterFactory, TokenStream, Token};
use super::{TokenFilter, TokenStream, Token};
/// Token filter that lowercases terms.
#[derive(Clone)]
pub struct LowerCaser;
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
where TailTokenStream: TokenStream
{
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;

View File

@@ -139,7 +139,7 @@ mod token_stream_chain;
mod raw_tokenizer;
pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilterFactory, TokenStream};
pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilter, TokenStream};
pub use self::tokenizer::BoxedTokenizer;
pub use self::tokenizer_manager::TokenizerManager;
pub use self::simple_tokenizer::SimpleTokenizer;

View File

@@ -1,5 +1,7 @@
use super::{Token, Tokenizer, TokenStream};
/// For each value of the field, emit a single unprocessed token.
#[derive(Clone)]
pub struct RawTokenizer;
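
A quick sketch of what "a single unprocessed token" means in practice, using the same `next()` helper that the `TokenStream` examples in this commit rely on:

```rust
// Sketch: the raw tokenizer emits the whole value, untouched.
let mut tokenizer = RawTokenizer;
let mut token_stream = tokenizer.token_stream("Hello, World!");
let token = token_stream.next().unwrap();
assert_eq!(&token.text, "Hello, World!");
```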

View File

@@ -1,4 +1,4 @@
use super::{TokenFilterFactory, TokenStream, Token};
use super::{TokenFilter, TokenStream, Token};
/// `RemoveLongFilter` removes tokens that are longer
@@ -36,7 +36,7 @@ impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
where TailTokenStream: TokenStream
{
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;

View File

@@ -2,6 +2,8 @@
use std::str::CharIndices;
use super::{Token, Tokenizer, TokenStream};
/// Tokenize the text by splitting on whitespaces and punctuation.
#[derive(Clone)]
pub struct SimpleTokenizer;
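
And the contrasting sketch for `SimpleTokenizer`, which splits on whitespace and punctuation:

```rust
// Sketch: "hello, world!" is split into two tokens; punctuation is dropped.
let mut tokenizer = SimpleTokenizer;
let mut token_stream = tokenizer.token_stream("hello, world!");
assert_eq!(&token_stream.next().unwrap().text, "hello");
assert_eq!(&token_stream.next().unwrap().text, "world");
```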

View File

@@ -1,5 +1,5 @@
use std::sync::Arc;
use super::{TokenFilterFactory, TokenStream, Token};
use super::{TokenFilter, TokenStream, Token};
use rust_stemmers::{self, Algorithm};
#[derive(Clone)]
@@ -13,7 +13,7 @@ impl Stemmer {
}
}
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
where TailTokenStream: TokenStream
{
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;

View File

@@ -4,38 +4,8 @@
use std::borrow::{Borrow, BorrowMut};
use tokenizer::TokenStreamChain;
/// Token
///
///
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
///
/// # fn main() {
/// let mut tokenizer = SimpleTokenizer
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// # }
/// ```
/// #
/// Token
pub struct Token {
/// Offset (byte index) of the first character of the token.
/// Offsets shall not be modified by token filters.
@@ -62,17 +32,46 @@ impl Default for Token {
}
// Warning! TODO: may change once associated type constructors
// land in nightly.
/// A `Tokenizer` is in charge of splitting text into a stream of tokens
/// before indexing.
///
/// See the [module documentation](./index.html) for more detail.
///
/// # Warning
///
/// This API may change to use associated types.
pub trait Tokenizer<'a>: Sized + Clone {
/// Type of the resulting token stream.
type TokenStreamImpl: TokenStream;
/// Creates a token stream for a given `str`.
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
/// Appends a token filter to the current tokenizer.
///
/// The method consumes the current `Tokenizer` and returns a
/// new one.
///
/// # Example
///
/// ```rust
/// # extern crate tantivy;
///
/// use tantivy::tokenizer::*;
///
/// # fn main() {
/// let en_stem = SimpleTokenizer
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser)
/// .filter(Stemmer::new());
/// # }
/// ```
///
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
where NewFilter: TokenFilterFactory<<Self as Tokenizer<'a>>::TokenStreamImpl>
where NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>
{
ChainTokenizer {
head: new_filter,
@@ -81,6 +80,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
}
}
pub trait BoxedTokenizer: Send + Sync {
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
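
`BoxedTokenizer` erases the concrete tokenizer type behind a trait object. A hedged sketch of how `box_tokenizer` (re-exported in the `mod.rs` hunk above) might be used; the exact trait bounds it requires are an assumption:

```rust
// Sketch: erase a statically typed filter chain into a boxed,
// dynamically dispatched tokenizer.
let mut boxed: Box<BoxedTokenizer> = box_tokenizer(
    SimpleTokenizer
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser));
let mut token_stream = boxed.token_stream("Hello, happy tax payer");
```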
@@ -146,6 +146,38 @@ impl<'b> TokenStream for Box<TokenStream + 'b> {
}
/// `TokenStream` is the result of the tokenization.
///
/// It consists of a consumable stream of `Token`s.
///
/// # Example
///
/// ```
/// extern crate tantivy;
/// use tantivy::tokenizer::*;
///
/// # fn main() {
/// let mut tokenizer = SimpleTokenizer
/// .filter(RemoveLongFilter::limit(40))
/// .filter(LowerCaser);
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "hello");
/// assert_eq!(token.offset_from, 0);
/// assert_eq!(token.offset_to, 5);
/// assert_eq!(token.position, 0);
/// }
/// {
/// let token = token_stream.next().unwrap();
/// assert_eq!(&token.text, "happy");
/// assert_eq!(token.offset_from, 7);
/// assert_eq!(token.offset_to, 12);
/// assert_eq!(token.position, 1);
/// }
/// # }
/// ```
///
pub trait TokenStream {
fn advance(&mut self) -> bool;
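
Under the hood, `next()` is built on `advance()`; the low-level loop looks like the sketch below (the `token()` accessor for the current token is assumed from tantivy's `TokenStream` of this period):

```rust
// Sketch: manual iteration. `advance()` returns false once the
// stream is exhausted; `token()` yields the current token.
while token_stream.advance() {
    let token = token_stream.token();
    println!("{:?}", token.text);
}
```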
@@ -180,7 +212,7 @@ pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
where HeadTokenFilterFactory: TokenFilterFactory<TailTokenizer::TokenStreamImpl>,
where HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
TailTokenizer: Tokenizer<'a>
{
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
@@ -192,8 +224,9 @@ impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
}
pub trait TokenFilterFactory<TailTokenStream: TokenStream>: Clone {
pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
type ResultTokenStream: TokenStream;
/// Wraps a token stream and returns the modified one.
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
}
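
To close the loop on the `TokenFilterFactory` → `TokenFilter` rename, here is a minimal sketch of implementing the renamed trait; `NoopFilter` is a hypothetical filter that passes the tail stream through unchanged:

```rust
// Sketch: the smallest possible TokenFilter — it wraps nothing and
// simply returns the tail stream it was given.
#[derive(Clone)]
struct NoopFilter;

impl<TailTokenStream> TokenFilter<TailTokenStream> for NoopFilter
    where TailTokenStream: TokenStream
{
    type ResultTokenStream = TailTokenStream;

    fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream {
        token_stream
    }
}
```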