mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-02 16:40:43 +00:00
#191 Analyzer
This commit is contained in:
@@ -30,6 +30,11 @@ impl Default for Token {
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Warning! TODO may change once associated type constructors
|
||||
// land in nightly.
|
||||
|
||||
|
||||
pub trait Analyzer<'a>: Sized + Clone {
|
||||
type TokenStreamImpl: TokenStream;
|
||||
|
||||
|
||||
@@ -11,6 +11,14 @@ use analyzer::LowerCaser;
|
||||
use analyzer::Stemmer;
|
||||
|
||||
|
||||
|
||||
/// The analyzer manager serves as a store for
|
||||
/// all of the configured analyzers.
|
||||
///
|
||||
/// By default, it is populated with the following analyzers.
|
||||
///
|
||||
/// * raw : does not process nor tokenize the text.
|
||||
/// * default : Tokenizes according to whitespace and punctuation, removes tokens that are too long, lowercases the
|
||||
#[derive(Clone)]
|
||||
pub struct AnalyzerManager {
|
||||
analyzers: Arc< RwLock<HashMap<String, Box<BoxedAnalyzer> >> >
|
||||
|
||||
@@ -67,8 +67,10 @@ impl Index {
|
||||
Index::from_directory(directory, schema)
|
||||
}
|
||||
|
||||
pub fn analyzers(&self) -> AnalyzerManager {
|
||||
self.analyzers.clone()
|
||||
|
||||
/// Accessor for the analyzer manager.
|
||||
pub fn analyzers(&self) -> &AnalyzerManager {
|
||||
&self.analyzers
|
||||
}
|
||||
|
||||
/// Creates a new index in a temp directory.
|
||||
|
||||
@@ -11,7 +11,6 @@ use store::StoreReader;
|
||||
use directory::ReadOnlySource;
|
||||
use schema::Document;
|
||||
use DocId;
|
||||
use std::str;
|
||||
use std::sync::Arc;
|
||||
use std::collections::HashMap;
|
||||
use common::CompositeFile;
|
||||
|
||||
@@ -90,8 +90,8 @@ impl SegmentPostings {
|
||||
SegmentPostings {
|
||||
block_cursor: segment_block_postings,
|
||||
cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
|
||||
delete_bitset: delete_bitset,
|
||||
position_computer: position_computer,
|
||||
delete_bitset,
|
||||
position_computer,
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -89,10 +89,10 @@ impl QueryParser {
|
||||
default_fields: Vec<Field>,
|
||||
analyzer_manager: AnalyzerManager) -> QueryParser {
|
||||
QueryParser {
|
||||
schema: schema,
|
||||
default_fields: default_fields,
|
||||
schema,
|
||||
default_fields,
|
||||
analyzer_manager,
|
||||
conjunction_by_default: false,
|
||||
analyzer_manager: analyzer_manager,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -101,7 +101,7 @@ impl QueryParser {
|
||||
QueryParser::new(
|
||||
index.schema(),
|
||||
default_fields,
|
||||
index.analyzers())
|
||||
index.analyzers().clone())
|
||||
}
|
||||
|
||||
/// Set the default way to compose queries to a conjunction.
|
||||
@@ -223,8 +223,8 @@ impl QueryParser {
|
||||
match user_input_ast {
|
||||
UserInputAST::Clause(sub_queries) => {
|
||||
let default_occur = self.default_occur();
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
|
||||
sub_queries
|
||||
let logical_sub_queries: Vec<(Occur, LogicalAST)> =
|
||||
try!(sub_queries
|
||||
.into_iter()
|
||||
.map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
|
||||
.map(|res| {
|
||||
@@ -232,24 +232,23 @@ impl QueryParser {
|
||||
(compose_occur(default_occur, occur), sub_ast)
|
||||
})
|
||||
})
|
||||
.collect()
|
||||
);
|
||||
.collect());
|
||||
Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
|
||||
}
|
||||
UserInputAST::Not(subquery) => {
|
||||
let (occur, logical_sub_queries) =
|
||||
try!(self.compute_logical_ast_with_occur(*subquery));
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Must(subquery) => {
|
||||
let (occur, logical_sub_queries) =
|
||||
try!(self.compute_logical_ast_with_occur(*subquery));
|
||||
self.compute_logical_ast_with_occur(*subquery)?;
|
||||
Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
|
||||
}
|
||||
UserInputAST::Leaf(literal) => {
|
||||
let term_phrases: Vec<(Field, String)> = match literal.field_name {
|
||||
Some(ref field_name) => {
|
||||
let field = try!(self.resolve_field_name(field_name));
|
||||
let field = self.resolve_field_name(field_name)?;
|
||||
vec![(field, literal.phrase.clone())]
|
||||
}
|
||||
None => {
|
||||
|
||||
@@ -1,18 +1,24 @@
|
||||
|
||||
/// Describing the amount of information indexed.
|
||||
/// `IndexRecordOption` describes an amount of information associated
|
||||
/// with a given field.
|
||||
///
|
||||
/// It is used in the schema to configure how much data should be
|
||||
/// indexed for a given field.
|
||||
///
|
||||
/// It is also used to describe the amount of information that
|
||||
/// you want to be decoded as you go through a posting list.
|
||||
///
|
||||
/// Since decoding information is not free, this makes it possible to
|
||||
/// avoid this extra cost when the information is not required.
|
||||
/// For instance, positions are useful when running phrase queries
|
||||
/// but useless in other queries.
|
||||
/// but useless for most queries.
|
||||
///
|
||||
#[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
|
||||
pub enum IndexRecordOption {
|
||||
#[serde(rename = "basic")]
|
||||
Basic,
|
||||
Basic, //< records only the `DocId`s
|
||||
#[serde(rename = "freq")]
|
||||
WithFreqs,
|
||||
WithFreqs, //< records the document ids as well as the term frequency.
|
||||
#[serde(rename = "position")]
|
||||
WithFreqsAndPositions,
|
||||
WithFreqsAndPositions, //< records the document ids, the term frequency and the positions of the occurrences in the document.
|
||||
}
|
||||
|
||||
impl IndexRecordOption {
|
||||
|
||||
@@ -40,7 +40,7 @@ let schema = schema_builder.build();
|
||||
We can split the problem of generating a search result page into two phases:
|
||||
|
||||
* identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`)
|
||||
* for each of these documents, retrieving the information required to generate the serp page.
|
||||
* for each of these documents, retrieving the information required to generate the search results page.
|
||||
(`doc_ids[] -> Document[]`)
|
||||
|
||||
In the first phase, the ability to search for documents by the given field is determined by the
|
||||
|
||||
Reference in New Issue
Block a user