#191 Analyzer

2026-06-02 16:40:43 +00:00 · 2017-09-20 22:56:55 +09:00
parent 426cc436da
commit 2c9302290f
8 changed files with 43 additions and 24 deletions
--- a/src/analyzer/analyzer.rs
+++ b/src/analyzer/analyzer.rs
@@ -30,6 +30,11 @@ impl Default for Token {
    }
 }

+
+// Warning! TODO: this may change once associated type constructors
+// land in nightly.
+
+
 pub trait Analyzer<'a>: Sized + Clone {
    type TokenStreamImpl: TokenStream;

--- a/src/analyzer/analyzer_manager.rs
+++ b/src/analyzer/analyzer_manager.rs
@@ -11,6 +11,14 @@ use analyzer::LowerCaser;
 use analyzer::Stemmer;


+
+/// The analyzer manager serves as a store for
+/// all of the configured analyzers.
+///
+/// By default, it is populated with the following analyzers.
+///
+///  * raw : does not process nor tokenize the text.
+///  * default : Tokenizes according to whitespace and punctuation, removes tokens that are too long, and lowercases the tokens.
 #[derive(Clone)]
 pub struct AnalyzerManager {
    analyzers: Arc< RwLock<HashMap<String, Box<BoxedAnalyzer> >> >
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -67,8 +67,10 @@ impl Index {
        Index::from_directory(directory, schema)
    }

-    pub fn analyzers(&self) -> AnalyzerManager {
-        self.analyzers.clone()
+
+    /// Accessor for the analyzer manager.
+    pub fn analyzers(&self) -> &AnalyzerManager {
+        &self.analyzers
    }

    /// Creates a new index in a temp directory.
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -11,7 +11,6 @@ use store::StoreReader;
 use directory::ReadOnlySource;
 use schema::Document;
 use DocId;
-use std::str;
 use std::sync::Arc;
 use std::collections::HashMap;
 use common::CompositeFile;
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -90,8 +90,8 @@ impl SegmentPostings {
        SegmentPostings {
            block_cursor: segment_block_postings,
            cur: COMPRESSION_BLOCK_SIZE, // cursor within the block
-            delete_bitset: delete_bitset,
-            position_computer: position_computer,
+            delete_bitset,
+            position_computer,
        }
    }

--- a/src/query/query_parser/query_parser.rs
+++ b/src/query/query_parser/query_parser.rs
@@ -89,10 +89,10 @@ impl QueryParser {
               default_fields: Vec<Field>,
               analyzer_manager: AnalyzerManager) -> QueryParser {
        QueryParser {
-            schema: schema,
-            default_fields: default_fields,
+            schema,
+            default_fields,
+            analyzer_manager,
            conjunction_by_default: false,
-            analyzer_manager: analyzer_manager,
        }
    }

@@ -101,7 +101,7 @@ impl QueryParser {
        QueryParser::new(
            index.schema(),
            default_fields,
-            index.analyzers())
+            index.analyzers().clone())
    }

    /// Set the default way to compose queries to a conjunction.
@@ -223,8 +223,8 @@ impl QueryParser {
        match user_input_ast {
            UserInputAST::Clause(sub_queries) => {
                let default_occur = self.default_occur();
-                let logical_sub_queries: Vec<(Occur, LogicalAST)> = try!(
-                    sub_queries
+                let logical_sub_queries: Vec<(Occur, LogicalAST)> =
+                    try!(sub_queries
                        .into_iter()
                        .map(|sub_query| self.compute_logical_ast_with_occur(*sub_query))
                        .map(|res| {
@@ -232,24 +232,23 @@ impl QueryParser {
                                (compose_occur(default_occur, occur), sub_ast)
                            })
                        })
-                        .collect()
-                );
+                        .collect());
                Ok((Occur::Should, LogicalAST::Clause(logical_sub_queries)))
            }
            UserInputAST::Not(subquery) => {
                let (occur, logical_sub_queries) =
-                    try!(self.compute_logical_ast_with_occur(*subquery));
+                    self.compute_logical_ast_with_occur(*subquery)?;
                Ok((compose_occur(Occur::MustNot, occur), logical_sub_queries))
            }
            UserInputAST::Must(subquery) => {
                let (occur, logical_sub_queries) =
-                    try!(self.compute_logical_ast_with_occur(*subquery));
+                    self.compute_logical_ast_with_occur(*subquery)?;
                Ok((compose_occur(Occur::Must, occur), logical_sub_queries))
            }
            UserInputAST::Leaf(literal) => {
                let term_phrases: Vec<(Field, String)> = match literal.field_name {
                    Some(ref field_name) => {
-                        let field = try!(self.resolve_field_name(field_name));
+                        let field = self.resolve_field_name(field_name)?;
                        vec![(field, literal.phrase.clone())]
                    }
                    None => {
--- a/src/schema/index_record_option.rs
+++ b/src/schema/index_record_option.rs
@@ -1,18 +1,24 @@

-/// Describing the amount of information indexed.
+/// `IndexRecordOption` describes the amount of information associated
+/// with a given field.
+///
+/// It is used in the schema to configure how much data should be
+/// indexed for a given field.
+///
+/// It is also used to describe the amount of information that
+/// you want to be decoded as you go through a posting list.
 ///
-/// Since decoding information is not free, this makes it possible to
-/// avoid this extra cost when the information is not required.
 /// For instance, positions are useful when running phrase queries
-/// but useless in other queries.
+/// but useless for most queries.
+///
 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)]
 pub enum IndexRecordOption {
    #[serde(rename = "basic")]
-    Basic,
+    Basic,  //< records only the `DocId`s
    #[serde(rename = "freq")]
-    WithFreqs,
+    WithFreqs, //< records the document ids as well as the term frequency.
    #[serde(rename = "position")]
-    WithFreqsAndPositions,
+    WithFreqsAndPositions, //< records the document id, the term frequency and the positions of the occurrences in the document.
 }

 impl IndexRecordOption {
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -40,7 +40,7 @@ let schema = schema_builder.build();
 We can split the problem of generating a search result page into two phases :

 * identifying the list of 10 or so documents to be displayed (Conceptually `query -> doc_ids[]`)
-* for each of these documents, retrieving the information required to generate the serp page.
+* for each of these documents, retrieving the information required to generate the search results page.
  (`doc_ids[] -> Document[]`)

 In the first phase, the ability to search for documents by the given field is determined by the