From 185a72b3419a8d2ee695be7552fd6b40e3245a74 Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 16 Nov 2017 08:22:54 +0900 Subject: [PATCH 1/2] Closes #224. Fixes documentation about STORED in the example. (#225) --- examples/html/simple_search.html | 99 ++++++++++++++++++++++---------- examples/simple_search.rs | 18 +++--- 2 files changed, 80 insertions(+), 37 deletions(-) diff --git a/examples/html/simple_search.html b/examples/html/simple_search.html index 1aa6b63ab..178313d74 100644 --- a/examples/html/simple_search.html +++ b/examples/html/simple_search.html @@ -30,10 +30,12 @@ -
extern crate rustc_serialize;
-extern crate tantivy;
+            
extern crate tantivy;
 extern crate tempdir;
 
+#[macro_use]
+extern crate serde_json;
+
 use std::path::Path;
 use tempdir::TempDir;
 use tantivy::Index;
@@ -108,8 +110,8 @@ be indexed”.

Our first field is title. -We want full-text search for it, and we want to be able -to retrieve the document after the search.

+We want full-text search for it, and we also want +to be able to retrieve the document after the search.

TEXT | STORED is some syntactic sugar to describe that.

TEXT means the field should be tokenized and indexed, @@ -132,9 +134,12 @@ documents that were selected during the search phase.

-

Our first field is body. -We want full-text search for it, and we want to be able -to retrieve the body after the search.

+

    Our second field is body. +We want full-text search for it, but we do not +need to be able to retrieve it +for our application.
    

+

    We can make our index lighter +by omitting the STORED flag.
    

@@ -158,7 +163,7 @@ with our schema in the directory.

-
    let index = try!(Index::create(index_path, schema.clone()));
+
    let index = Index::create(index_path, schema.clone())?;
@@ -178,7 +183,7 @@ heap for the indexer can increase its throughput.

-
    let mut index_writer = try!(index.writer(50_000_000));
+
    let mut index_writer = index.writer(50_000_000)?;
@@ -214,9 +219,11 @@ one by one in a Document object.

let mut old_man_doc = Document::default(); old_man_doc.add_text(title, "The Old Man and the Sea"); - old_man_doc.add_text(body, - "He was an old man who fished alone in a skiff in the Gulf Stream and \ - he had gone eighty-four days now without taking a fish."); + old_man_doc.add_text( + body, + "He was an old man who fished alone in a skiff in the Gulf Stream and \ + he had gone eighty-four days now without taking a fish.", + ); @@ -243,16 +250,25 @@ one by one in a Document object.

Create a document directly from json.

-

Alternatively, we can use our schema to parse -a document object directly from json.

+

    Alternatively, we can use our schema to parse a +document object directly from json. +parse_document takes the document as a json string; here we build it +with the json! macro from serde_json, which makes multi-line literals convenient.
    

-
-    let mice_and_men_doc = try!(schema.parse_document(r#"{
-       "title": "Of Mice and Men",
-       "body": "few miles south of Soledad, the Salinas River drops in close to the hillside bank and runs deep and green. The water is warm too, for it has slipped twinkling over the yellow sands in the sunlight before reaching the narrow pool. On one side of the river the golden foothill slopes curve up to the strong and rocky Gabilan Mountains, but on the valley side the water is lined with trees—willows fresh and green with every spring, carrying in their lower leaf junctures the debris of the winter’s flooding; and sycamores with mottled, white,recumbent limbs and branches that arch over the pool"  
-    }"#));
+            
    let json = json!({
+       "title": "Of Mice and Men",
+       "body": "A few miles south of Soledad, the Salinas River drops in close to the hillside \
+                bank and runs deep and green. The water is warm too, for it has slipped twinkling \
+                over the yellow sands in the sunlight before reaching the narrow pool. On one \
+                side of the river the golden foothill slopes curve up to the strong and rocky \
+                Gabilan Mountains, but on the valley side the water is lined with trees—willows \
+                fresh and green with every spring, carrying in their lower leaf junctures the \
+                debris of the winter’s flooding; and sycamores with mottled, white, recumbent \
+                limbs and branches that arch over the pool"
+    });
+    let mice_and_men_doc = schema.parse_document(&json.to_string())?;
 
     index_writer.add_document(mice_and_men_doc);
@@ -271,10 +287,15 @@ The following document has two titles.

-
    let frankenstein_doc = try!(schema.parse_document(r#"{
-       "title": ["Frankenstein", "The Modern Promotheus"],
-       "body": "You will rejoice to hear that no disaster has accompanied the commencement of an enterprise which you have regarded with such evil forebodings.  I arrived here yesterday, and my first task is to assure my dear sister of my welfare and increasing confidence in the success of my undertaking."  
-    }"#));
+            
    let json = json!({
+       "title": ["Frankenstein", "The Modern Prometheus"],
+       "body": "You will rejoice to hear that no disaster has accompanied the commencement of an \
+                enterprise which you have regarded with such evil forebodings.  I arrived here \
+                yesterday, and my first task is to assure my dear sister of my welfare and \
+                increasing confidence in the success of my undertaking."
+    });
+    let frankenstein_doc = schema.parse_document(&json.to_string())?;
+
     index_writer.add_document(frankenstein_doc);
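    The array value above is just the json spelling of a multi-valued field: building the document manually and calling add_text twice on the same field has the same effect. Here is a small sketch of that alternative, reusing the add_text and add_document calls shown earlier (this variant is illustrative and not part of the patch itself):
    
    
        let mut frankenstein_doc = Document::default();
        // Adding "title" twice is what makes the field multi-valued,
        // mirroring the two-element json array above.
        frankenstein_doc.add_text(title, "Frankenstein");
        frankenstein_doc.add_text(title, "The Modern Prometheus");
        index_writer.add_document(frankenstein_doc);
    
    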
@@ -313,7 +334,7 @@ the existence of new documents.

-
    try!(index_writer.commit());
+
    index_writer.commit()?;
@@ -349,7 +370,7 @@ after every commit().

-
    try!(index.load_searchers());
+
    index.load_searchers()?;
@@ -384,7 +405,7 @@ in both title and body.

-
    let query_parser = QueryParser::new(index.schema(), vec![title, body]);
+
    let mut query_parser = QueryParser::for_index(index, vec![title, body]);
@@ -401,7 +422,7 @@ A ticket has been opened regarding this problem.

-
    let query = try!(query_parser.parse_query("sea whale"));
+
    let query = query_parser.parse_query("sea whale")?;
@@ -451,7 +472,7 @@ is the role of the TopCollector.

-
    try!(searcher.search(&*query, &mut top_collector));
+
    searcher.search(&*query, &mut top_collector)?;
@@ -488,9 +509,27 @@ a title.

     for doc_address in doc_addresses {
-        let retrieved_doc = try!(searcher.doc(&doc_address));
+        let retrieved_doc = searcher.doc(&doc_address)?;
         println!("{}", schema.to_json(&retrieved_doc));
-    }
+    }
+ + + + +
  • +
    + +
    + +
    +

    Wait for indexing and merging threads to shut down. +Usually this isn’t needed, but in main we try to +delete the temporary directory and that fails on +Windows if the files are still open.

    + +
    + +
        index_writer.wait_merging_threads()?;
     
         Ok(())
     }
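    Putting the quoted pieces together, the example boils down to roughly the following flow. This is a condensed sketch using the API calls as they appear in this patch (late-2017 tantivy); the few glue lines not quoted in the hunks above (index.searcher(), TopCollector::with_limit, top_collector.docs()) are assumptions about the surrounding example file rather than quotes from it.
    
        use std::path::Path;
    
        use tantivy::collector::TopCollector;
        use tantivy::query::QueryParser;
        use tantivy::schema::{Document, SchemaBuilder, STORED, TEXT};
        use tantivy::Index;
    
        fn run_example(index_path: &Path) -> tantivy::Result<()> {
            // Schema: "title" is stored so it can be returned with hits,
            // "body" is indexed only, which keeps the index lighter.
            let mut schema_builder = SchemaBuilder::default();
            let title = schema_builder.add_text_field("title", TEXT | STORED);
            let body = schema_builder.add_text_field("body", TEXT);
            let schema = schema_builder.build();
    
            // Create the index and an indexing writer with a 50 MB heap.
            let index = Index::create(index_path, schema.clone())?;
            let mut index_writer = index.writer(50_000_000)?;
    
            // Add a document field by field, then publish it with a commit.
            let mut old_man_doc = Document::default();
            old_man_doc.add_text(title, "The Old Man and the Sea");
            old_man_doc.add_text(body, "He was an old man who fished alone in a skiff.");
            index_writer.add_document(old_man_doc);
            index_writer.commit()?;
    
            // Reload searchers so the committed document becomes visible, then search.
            index.load_searchers()?;
            let searcher = index.searcher();
            let mut query_parser = QueryParser::for_index(index, vec![title, body]);
            let query = query_parser.parse_query("sea whale")?;
            let mut top_collector = TopCollector::with_limit(10);
            searcher.search(&*query, &mut top_collector)?;
    
            // Print the stored fields (here only "title") of every hit.
            for doc_address in top_collector.docs() {
                let retrieved_doc = searcher.doc(&doc_address)?;
                println!("{}", schema.to_json(&retrieved_doc));
            }
    
            // Let the merging threads finish so the caller can delete the directory.
            index_writer.wait_merging_threads()?;
            Ok(())
        }
    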
    diff --git a/examples/simple_search.rs b/examples/simple_search.rs index 20e3812c0..301508cd5 100644 --- a/examples/simple_search.rs +++ b/examples/simple_search.rs @@ -36,12 +36,12 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { let mut schema_builder = SchemaBuilder::default(); // Our first field is title. - // We want full-text search for it, and we want to be able - // to retrieve the document after the search. - // + // We want full-text search for it, and we also want + // to be able to retrieve the document after the search. + // // TEXT | STORED is some syntactic sugar to describe // that. - // + // // `TEXT` means the field should be tokenized and indexed, // along with its term frequency and term positions. // @@ -51,9 +51,13 @@ fn run_example(index_path: &Path) -> tantivy::Result<()> { // documents that were selected during the search phase. schema_builder.add_text_field("title", TEXT | STORED); - // Our first field is body. - // We want full-text search for it, and we want to be able - // to retrieve the body after the search. + // Our second field is body. + // We want full-text search for it, but we do not + // need to be able to be able to retrieve it + // for our application. + // + // We can make our index lighter and + // by omitting `STORED` flag. schema_builder.add_text_field("body", TEXT); let schema = schema_builder.build(); From a298c084e66ea7b610f1625f05e11c8337f83ccc Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Wed, 22 Nov 2017 20:37:34 +0900 Subject: [PATCH 2/2] Analyzer's Analyzer::token_stream does not need to me `&mut self` --- src/analyzer/analyzer.rs | 14 +++++++------- src/analyzer/japanese_tokenizer.rs | 2 +- src/analyzer/lower_caser.rs | 1 - src/analyzer/mod.rs | 8 ++++---- src/analyzer/raw_tokenizer.rs | 2 +- src/analyzer/simple_tokenizer.rs | 2 +- src/analyzer/token_stream_chain.rs | 14 ++++++++------ src/schema/index_record_option.rs | 10 +++++++--- 8 files changed, 29 insertions(+), 24 deletions(-) diff --git a/src/analyzer/analyzer.rs b/src/analyzer/analyzer.rs index f2a485557..08cb0afcd 100644 --- a/src/analyzer/analyzer.rs +++ b/src/analyzer/analyzer.rs @@ -38,7 +38,7 @@ impl Default for Token { pub trait Analyzer<'a>: Sized + Clone { type TokenStreamImpl: TokenStream; - fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl; + fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl; fn filter(self, new_filter: NewFilter) -> ChainAnalyzer where NewFilter: TokenFilterFactory<>::TokenStreamImpl> @@ -51,8 +51,8 @@ pub trait Analyzer<'a>: Sized + Clone { } pub trait BoxedAnalyzer: Send + Sync { - fn token_stream<'a>(&mut self, text: &'a str) -> Box; - fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box; + fn token_stream<'a>(&self, text: &'a str) -> Box; + fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box; fn boxed_clone(&self) -> Box; } @@ -60,11 +60,11 @@ pub trait BoxedAnalyzer: Send + Sync { struct BoxableAnalyzer(A) where A: for <'a> Analyzer<'a> + Send + Sync; impl BoxedAnalyzer for BoxableAnalyzer where A: 'static + Send + Sync + for <'a> Analyzer<'a> { - fn token_stream<'a>(&mut self, text: &'a str) -> Box { + fn token_stream<'a>(&self, text: &'a str) -> Box { box self.0.token_stream(text) } - fn token_stream_texts<'b>(&mut self, texts: &'b [&'b str]) -> Box { + fn token_stream_texts<'b>(&self, texts: &'b [&'b str]) -> Box { assert!(texts.len() > 0); if texts.len() == 1 { box self.0.token_stream(texts[0]) @@ -72,7 +72,7 @@ impl BoxedAnalyzer for BoxableAnalyzer where A: 'static 
+ Send + Sync + fo else { let mut offsets = vec!(); let mut total_offset = 0; - for text in texts { + for &text in texts { offsets.push(total_offset); total_offset += text.len(); } @@ -154,7 +154,7 @@ impl<'a, HeadTokenFilterFactory, TailAnalyzer> Analyzer<'a> { type TokenStreamImpl = HeadTokenFilterFactory::ResultTokenStream; - fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { let tail_token_stream = self.tail.token_stream(text ); self.head.transform(tail_token_stream) } diff --git a/src/analyzer/japanese_tokenizer.rs b/src/analyzer/japanese_tokenizer.rs index 909ccbb0c..e80ae9f5d 100644 --- a/src/analyzer/japanese_tokenizer.rs +++ b/src/analyzer/japanese_tokenizer.rs @@ -21,7 +21,7 @@ pub struct JapaneseTokenizerStream { impl<'a> Analyzer<'a> for JapaneseTokenizer { type TokenStreamImpl = JapaneseTokenizerStream; - fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { let mut tokens = vec![]; let mut offset_from; let mut offset_to = 0; diff --git a/src/analyzer/lower_caser.rs b/src/analyzer/lower_caser.rs index 866508782..c23e71ec3 100644 --- a/src/analyzer/lower_caser.rs +++ b/src/analyzer/lower_caser.rs @@ -1,5 +1,4 @@ use super::{TokenFilterFactory, TokenStream, Token}; -use std::ascii::AsciiExt; /// Token filter that lowercase terms. diff --git a/src/analyzer/mod.rs b/src/analyzer/mod.rs index 227995b85..a312bc787 100644 --- a/src/analyzer/mod.rs +++ b/src/analyzer/mod.rs @@ -29,7 +29,7 @@ mod test { #[test] fn test_raw_tokenizer() { let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("raw").unwrap(); + let en_analyzer = analyzer_manager.get("raw").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; @@ -44,7 +44,7 @@ mod test { fn test_en_analyzer() { let analyzer_manager = AnalyzerManager::default(); assert!(analyzer_manager.get("en_doesnotexist").is_none()); - let mut en_analyzer = analyzer_manager.get("en_stem").unwrap(); + let en_analyzer = analyzer_manager.get("en_stem").unwrap(); let mut tokens: Vec = vec![]; { let mut add_token = |token: &Token| { tokens.push(token.term.clone()); }; @@ -60,7 +60,7 @@ mod test { #[test] fn test_jp_analyzer() { let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("ja").unwrap(); + let en_analyzer = analyzer_manager.get("ja").unwrap(); let mut tokens: Vec = vec![]; { @@ -78,7 +78,7 @@ mod test { #[test] fn test_tokenizer_empty() { let analyzer_manager = AnalyzerManager::default(); - let mut en_analyzer = analyzer_manager.get("en_stem").unwrap(); + let en_analyzer = analyzer_manager.get("en_stem").unwrap(); { let mut tokens: Vec = vec![]; { diff --git a/src/analyzer/raw_tokenizer.rs b/src/analyzer/raw_tokenizer.rs index 488ca5590..a5b2d3f6b 100644 --- a/src/analyzer/raw_tokenizer.rs +++ b/src/analyzer/raw_tokenizer.rs @@ -11,7 +11,7 @@ pub struct RawTokenStream { impl<'a> Analyzer<'a> for RawTokenizer { type TokenStreamImpl = RawTokenStream; - fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { let token = Token { offset_from: 0, offset_to: text.len(), diff --git a/src/analyzer/simple_tokenizer.rs b/src/analyzer/simple_tokenizer.rs index 1d4b71c22..e6cf30fb6 100644 --- a/src/analyzer/simple_tokenizer.rs +++ b/src/analyzer/simple_tokenizer.rs @@ -14,7 
+14,7 @@ pub struct SimpleTokenStream<'a> { impl<'a> Analyzer<'a> for SimpleTokenizer { type TokenStreamImpl = SimpleTokenStream<'a>; - fn token_stream(&mut self, text: &'a str) -> Self::TokenStreamImpl { + fn token_stream(&self, text: &'a str) -> Self::TokenStreamImpl { SimpleTokenStream { text: text, chars: text.char_indices(), diff --git a/src/analyzer/token_stream_chain.rs b/src/analyzer/token_stream_chain.rs index 6f59f9ae2..89087fb02 100644 --- a/src/analyzer/token_stream_chain.rs +++ b/src/analyzer/token_stream_chain.rs @@ -48,16 +48,18 @@ impl<'a, TTokenStream> TokenStream for TokenStreamChain } fn token(&self) -> &Token { - if self.stream_idx > self.token_streams.len() { - panic!("You called .token(), after the end of the token stream has been reached"); - } + assert!( + self.stream_idx <= self.token_streams.len(), + "You called .token(), after the end of the token stream has been reached" + ); &self.token } fn token_mut(&mut self) -> &mut Token { - if self.stream_idx > self.token_streams.len() { - panic!("You called .token(), after the end of the token stream has been reached"); - } + assert!( + self.stream_idx <= self.token_streams.len(), + "You called .token(), after the end of the token stream has been reached" + ); &mut self.token } } diff --git a/src/schema/index_record_option.rs b/src/schema/index_record_option.rs index edb57eb3a..e74f70c3a 100644 --- a/src/schema/index_record_option.rs +++ b/src/schema/index_record_option.rs @@ -13,12 +13,16 @@ /// #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Ord, Eq, Hash, Serialize, Deserialize)] pub enum IndexRecordOption { + /// Records only the `DocId`s #[serde(rename = "basic")] - Basic, //< records only the `DocId`s + Basic, + /// Records the document ids as well as the term frequency. #[serde(rename = "freq")] - WithFreqs, //< records the document ids as well as the term frequency. + WithFreqs, + /// Records the document id, the term frequency and the positions of + /// the occurences in the document. #[serde(rename = "position")] - WithFreqsAndPositions, //< records the document id, the term frequency and the positions of the occurences in the document. + WithFreqsAndPositions, } impl IndexRecordOption {
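    Call-site effect of the second patch (token_stream taking &self instead of &mut self): analyzers obtained from the AnalyzerManager no longer need a mut binding. A minimal sketch mirroring the updated tests in src/analyzer/mod.rs; the "sea whale" text is just a placeholder:
    
        let analyzer_manager = AnalyzerManager::default();
        // The binding no longer has to be `mut`: token_stream now
        // borrows the analyzer immutably.
        let en_analyzer = analyzer_manager.get("en_stem").unwrap();
        let _token_stream = en_analyzer.token_stream("sea whale");
    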