mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-17 08:40:41 +00:00
Added comments
This commit is contained in:
@@ -9,6 +9,7 @@ mod pool;
|
||||
mod segment_meta;
|
||||
mod inverted_index_reader;
|
||||
|
||||
|
||||
pub use self::inverted_index_reader::InvertedIndexReader;
|
||||
pub use self::searcher::Searcher;
|
||||
pub use self::segment_component::SegmentComponent;
|
||||
|
||||
@@ -81,7 +81,7 @@ pub struct QueryParser {
|
||||
}
|
||||
|
||||
impl QueryParser {
|
||||
/// Creates a `QueryParser`
|
||||
/// Creates a `QueryParser`, given
|
||||
/// * schema - index Schema
|
||||
/// * default_fields - fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
@@ -96,6 +96,10 @@ impl QueryParser {
|
||||
}
|
||||
}
|
||||
|
||||
/// Creates a `QueryParser`, given
|
||||
/// * an index
|
||||
/// * a set of default fields used to search if no field is specifically defined
|
||||
/// in the query.
|
||||
pub fn for_index(index: Index,
|
||||
default_fields: Vec<Field>) -> QueryParser {
|
||||
QueryParser::new(
|
||||
@@ -106,7 +110,9 @@ impl QueryParser {
|
||||
|
||||
/// Set the default way to compose queries to a conjunction.
|
||||
///
|
||||
/// By default a ,
|
||||
/// By default, the query `happy tax payer` is equivalent to the query
|
||||
/// `happy OR tax OR payer`. After calling `.set_conjunction_by_default()`
|
||||
/// `happy tax payer` will be interpreted by the parser as `happy AND tax AND payer`.
|
||||
pub fn set_conjunction_by_default(&mut self) {
|
||||
self.conjunction_by_default = true;
|
||||
}
|
||||
|
||||
@@ -1,15 +1,14 @@
|
||||
|
||||
/// `IndexRecordOption` describes an amount of information associated
|
||||
/// for a given field.
|
||||
/// `IndexRecordOption` describes an amount of information associated
|
||||
/// to a given indexed field.
|
||||
///
|
||||
/// It is used in the schema to configure how much data should be
|
||||
/// indexed for a given field.
|
||||
/// It is both used to:
|
||||
///
|
||||
/// It is also used to describe the amount of information that
|
||||
/// you want to be decoded as you go through a posting list.
|
||||
///
|
||||
/// For instance, positions are useful when running phrase queries
|
||||
/// but useless for most queries.
|
||||
/// * describe in the schema the amount of information
|
||||
/// that should be retained during indexing (See [TextFieldIndexing.html.set_index_option](../schema/struct.TextFieldIndexing.html#method.set_index_option))
|
||||
/// * request a given
|
||||
/// amount of information to be decoded as one goes through a posting list.
|
||||
/// (See [InvertedIndexReader.read_postings](../struct.InvertedIndexReader.html#method.read_postings))
|
||||
///
|
||||
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum IndexRecordOption {
|
||||
@@ -17,10 +16,12 @@ pub enum IndexRecordOption {
|
||||
#[serde(rename = "basic")]
|
||||
Basic,
|
||||
/// records the document ids as well as the term frequency.
|
||||
/// The term frequency can help giving better scoring of the documents.
|
||||
#[serde(rename = "freq")]
|
||||
WithFreqs,
|
||||
/// records the document id, the term frequency and the positions of
|
||||
/// the occurrences in the document.
|
||||
/// Positions are required to run [PhraseQueries](../query/struct.PhraseQuery.html).
|
||||
#[serde(rename = "position")]
|
||||
WithFreqsAndPositions,
|
||||
}
|
||||
|
||||
@@ -61,15 +61,21 @@ impl Default for TextFieldIndexing {
|
||||
}
|
||||
|
||||
impl TextFieldIndexing {
|
||||
/// Sets the tokenizer to be used for a given field.
|
||||
pub fn set_tokenizer(mut self, tokenizer_name: &str) -> TextFieldIndexing {
|
||||
self.tokenizer = Cow::Owned(tokenizer_name.to_string());
|
||||
self
|
||||
}
|
||||
|
||||
/// Returns the tokenizer that will be used for this field.
|
||||
pub fn tokenizer(&self) -> &str {
|
||||
&self.tokenizer
|
||||
}
|
||||
|
||||
|
||||
/// Sets which information should be indexed with the tokens.
|
||||
///
|
||||
/// See [IndexRecordOption](./enum.IndexRecordOption.html) for more detail.
|
||||
pub fn set_index_option(mut self, index_option: IndexRecordOption) -> TextFieldIndexing {
|
||||
self.record = index_option;
|
||||
self
|
||||
|
||||
@@ -1,10 +1,10 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use super::{TokenFilter, TokenStream, Token};
|
||||
|
||||
/// Token filter that lowercases terms.
|
||||
#[derive(Clone)]
|
||||
pub struct LowerCaser;
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for LowerCaser
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for LowerCaser
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = LowerCaserTokenStream<TailTokenStream>;
|
||||
|
||||
@@ -139,7 +139,7 @@ mod token_stream_chain;
|
||||
mod raw_tokenizer;
|
||||
|
||||
|
||||
pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilterFactory, TokenStream};
|
||||
pub use self::tokenizer::{box_tokenizer, Tokenizer, Token, TokenFilter, TokenStream};
|
||||
pub use self::tokenizer::BoxedTokenizer;
|
||||
pub use self::tokenizer_manager::TokenizerManager;
|
||||
pub use self::simple_tokenizer::SimpleTokenizer;
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
use super::{Token, Tokenizer, TokenStream};
|
||||
|
||||
|
||||
/// For each value of the field, emit a single unprocessed token.
|
||||
#[derive(Clone)]
|
||||
pub struct RawTokenizer;
|
||||
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use super::{TokenFilter, TokenStream, Token};
|
||||
|
||||
|
||||
/// `RemoveLongFilter` removes tokens that are longer
|
||||
@@ -36,7 +36,7 @@ impl<TailTokenStream> RemoveLongFilterStream<TailTokenStream>
|
||||
}
|
||||
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for RemoveLongFilter
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for RemoveLongFilter
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = RemoveLongFilterStream<TailTokenStream>;
|
||||
|
||||
@@ -2,6 +2,8 @@
|
||||
use std::str::CharIndices;
|
||||
use super::{Token, Tokenizer, TokenStream};
|
||||
|
||||
|
||||
/// Tokenize the text by splitting on whitespaces and punctuation.
|
||||
#[derive(Clone)]
|
||||
pub struct SimpleTokenizer;
|
||||
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
use std::sync::Arc;
|
||||
use super::{TokenFilterFactory, TokenStream, Token};
|
||||
use super::{TokenFilter, TokenStream, Token};
|
||||
use rust_stemmers::{self, Algorithm};
|
||||
|
||||
#[derive(Clone)]
|
||||
@@ -13,7 +13,7 @@ impl Stemmer {
|
||||
}
|
||||
}
|
||||
|
||||
impl<TailTokenStream> TokenFilterFactory<TailTokenStream> for Stemmer
|
||||
impl<TailTokenStream> TokenFilter<TailTokenStream> for Stemmer
|
||||
where TailTokenStream: TokenStream
|
||||
{
|
||||
type ResultTokenStream = StemmerTokenStream<TailTokenStream>;
|
||||
|
||||
@@ -4,38 +4,8 @@
|
||||
use std::borrow::{Borrow, BorrowMut};
|
||||
use tokenizer::TokenStreamChain;
|
||||
|
||||
/// Token
|
||||
///
|
||||
///
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// let mut tokenizer = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser);
|
||||
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "hello");
|
||||
/// assert_eq!(token.offset_from, 0);
|
||||
/// assert_eq!(token.offset_to, 5);
|
||||
/// assert_eq!(token.position, 0);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "happy");
|
||||
/// assert_eq!(token.offset_from, 7);
|
||||
/// assert_eq!(token.offset_to, 12);
|
||||
/// assert_eq!(token.position, 1);
|
||||
/// }
|
||||
/// # }
|
||||
/// ```
|
||||
/// #
|
||||
|
||||
/// Token
|
||||
pub struct Token {
|
||||
/// Offset (byte index) of the first character of the token.
|
||||
/// Offsets shall not be modified by token filters.
|
||||
@@ -62,17 +32,46 @@ impl Default for Token {
|
||||
}
|
||||
|
||||
|
||||
// Warning! TODO may change once associated type constructor
|
||||
// land in nightly.
|
||||
|
||||
|
||||
/// A `Tokenizer` is in charge of splitting text into a stream of tokens
|
||||
/// before indexing.
|
||||
///
|
||||
/// See the [module documentation](./index.html) for more detail.
|
||||
///
|
||||
/// # Warning
|
||||
///
|
||||
/// This API may change to use associated types.
|
||||
pub trait Tokenizer<'a>: Sized + Clone {
|
||||
|
||||
/// Type associated to the resulting token stream.
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
/// Creates a token stream for a given `str`.
|
||||
fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl;
|
||||
|
||||
/// Appends a token filter to the current tokenizer.
|
||||
///
|
||||
/// The method consumes the current `TokenStream` and returns a
|
||||
/// new one.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```rust
|
||||
/// # extern crate tantivy;
|
||||
///
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// let en_stem = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser)
|
||||
/// .filter(Stemmer::new());
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
fn filter<NewFilter>(self, new_filter: NewFilter) -> ChainTokenizer<NewFilter, Self>
|
||||
where NewFilter: TokenFilterFactory<<Self as Tokenizer<'a>>::TokenStreamImpl>
|
||||
where NewFilter: TokenFilter<<Self as Tokenizer<'a>>::TokenStreamImpl>
|
||||
{
|
||||
ChainTokenizer {
|
||||
head: new_filter,
|
||||
@@ -81,6 +80,7 @@ pub trait Tokenizer<'a>: Sized + Clone {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
pub trait BoxedTokenizer: Send + Sync {
|
||||
fn token_stream<'a>(&mut self, text: &'a str) -> Box<TokenStream + 'a>;
|
||||
fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box<TokenStream + 'b>;
|
||||
@@ -146,6 +146,38 @@ impl<'b> TokenStream for Box<TokenStream + 'b> {
|
||||
}
|
||||
|
||||
|
||||
/// `TokenStream` is the result of the tokenization.
|
||||
///
|
||||
/// It consists of a consumable stream of `Token`s.
|
||||
///
|
||||
/// # Example
|
||||
///
|
||||
/// ```
|
||||
/// extern crate tantivy;
|
||||
/// use tantivy::tokenizer::*;
|
||||
///
|
||||
/// # fn main() {
|
||||
/// let mut tokenizer = SimpleTokenizer
|
||||
/// .filter(RemoveLongFilter::limit(40))
|
||||
/// .filter(LowerCaser);
|
||||
/// let mut token_stream = tokenizer.token_stream("Hello, happy tax payer");
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "hello");
|
||||
/// assert_eq!(token.offset_from, 0);
|
||||
/// assert_eq!(token.offset_to, 5);
|
||||
/// assert_eq!(token.position, 0);
|
||||
/// }
|
||||
/// {
|
||||
/// let token = token_stream.next().unwrap();
|
||||
/// assert_eq!(&token.text, "happy");
|
||||
/// assert_eq!(token.offset_from, 7);
|
||||
/// assert_eq!(token.offset_to, 12);
|
||||
/// assert_eq!(token.position, 1);
|
||||
/// }
|
||||
/// # }
|
||||
/// ```
|
||||
///
|
||||
pub trait TokenStream {
|
||||
fn advance(&mut self) -> bool;
|
||||
|
||||
@@ -180,7 +212,7 @@ pub struct ChainTokenizer<HeadTokenFilterFactory, TailTokenizer> {
|
||||
|
||||
impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
|
||||
for ChainTokenizer<HeadTokenFilterFactory, TailTokenizer>
|
||||
where HeadTokenFilterFactory: TokenFilterFactory<TailTokenizer::TokenStreamImpl>,
|
||||
where HeadTokenFilterFactory: TokenFilter<TailTokenizer::TokenStreamImpl>,
|
||||
TailTokenizer: Tokenizer<'a>
|
||||
{
|
||||
type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream;
|
||||
@@ -192,8 +224,9 @@ impl<'a, HeadTokenFilterFactory, TailTokenizer> Tokenizer<'a>
|
||||
}
|
||||
|
||||
|
||||
pub trait TokenFilterFactory<TailTokenStream: TokenStream>: Clone {
|
||||
pub trait TokenFilter<TailTokenStream: TokenStream>: Clone {
|
||||
type ResultTokenStream: TokenStream;
|
||||
|
||||
/// Wraps a token stream and returns the modified one.
|
||||
fn transform(&self, token_stream: TailTokenStream) -> Self::ResultTokenStream;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user