From cdcc72a0c90b89d1a3f85225bebf75568a8ce70b Mon Sep 17 00:00:00 2001
From: Paul Masurel <paul.masurel@gmail.com>
Date: Thu, 19 May 2016 11:08:50 +0900
Subject: [PATCH] test passing

---
 TODO.md                          |  1 +
 src/core/index.rs                |  4 ++
 src/core/merger.rs               |  7 +--
 src/core/writer.rs               |  2 +-
 src/lib.rs                       |  7 +--
 src/postings/freq_handler.rs     |  6 ++-
 src/postings/segment_postings.rs |  2 +-
 src/postings/serializer.rs       | 38 ++++++++++----
 src/schema/mod.rs                |  3 +-
 src/schema/schema.rs             | 17 +++---
 src/schema/term.rs               | 21 ++++++++
 src/schema/text_field.rs         | 90 +++++++++++++++++++++++++++-----
 12 files changed, 152 insertions(+), 46 deletions(-)

diff --git a/TODO.md b/TODO.md
index 72b741276..41859995f 100644
--- a/TODO.md
+++ b/TODO.md
@@ -11,3 +11,4 @@ use skip list for each blocks
 find a clear way to put the tokenized/untokenized thing upstream
 index frequent bigrams
 clean up compression
+reconsider the first byte == field in the [u8] repr of a term.
diff --git a/src/core/index.rs b/src/core/index.rs
index d5a88e420..51ab3d9c3 100644
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -258,6 +258,10 @@ impl fmt::Debug for Segment {
 
 impl Segment {
 
+    pub fn schema(&self,) -> Schema {
+        self.index.schema() // the schema is obtained from the index this segment refers to
+    }
+
     pub fn id(&self,) -> SegmentId {
         self.segment_id.clone()
     }
diff --git a/src/core/merger.rs b/src/core/merger.rs
index a98ad7505..130df2ea8 100644
--- a/src/core/merger.rs
+++ b/src/core/merger.rs
@@ -74,7 +74,7 @@ impl<'a> PostingsMerger<'a> {
         }
         postings_merger
     }
-    
+
     // pushes the term_reader associated with the given segment ordinal
     // into the heap.
     fn push_next_segment_el(&mut self, segment_ord: usize) {
@@ -96,7 +96,7 @@ impl<'a> PostingsMerger<'a> {
             let offset = self.doc_offsets[heap_item.segment_ord];
             let reader = &self.readers[heap_item.segment_ord];
             let segment_postings = reader.read_postings(&heap_item.term_info);
-            let offset_postings = OffsetPostings::new(segment_postings, offset); 
+            let offset_postings = OffsetPostings::new(segment_postings, offset);
             segment_postings_list.push(offset_postings);
         }
         self.push_next_segment_el(heap_item.segment_ord);
@@ -223,11 +223,12 @@ mod tests {
     use core::searcher::DocAddress;
     use collector::FastFieldTestCollector;
     use collector::TestCollector;
+    use schema::TextIndexingOptions;
 
     #[test]
     fn test_index_merger() {
         let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed().set_stored();
+        let text_fieldtype = schema::TextOptions::new().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored();
         let text_field = schema.add_text_field("text", &text_fieldtype);
         let score_fieldtype = schema::U32Options::new().set_fast();
         let score_field = schema.add_u32_field("score", &score_fieldtype);
diff --git a/src/core/writer.rs b/src/core/writer.rs
index d8eba23d5..8ce0a0034 100644
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -161,7 +161,7 @@ impl SegmentWriter {
         let doc_id = self.max_doc;
         for field_value in doc.text_fields() {
 			let field_options = schema.text_field_options(&field_value.field);
-			if field_options.is_tokenized_indexed() {
+			if field_options.indexing_options().is_tokenized() {
 				let mut tokens = self.tokenizer.tokenize(&field_value.text);
 				let mut pos = 0u32;
 				loop {
diff --git a/src/lib.rs b/src/lib.rs
index e363fb8aa..bf1d67119 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,11 +63,9 @@ mod tests {
     #[test]
     fn test_indexing() {
         let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
-        let text_field = schema.add_text_field("text", &text_fieldtype);
+        let text_field = schema.add_text_field("text", &schema::TEXT);
 
         let index = Index::create_from_tempdir(schema).unwrap();
-
         {
             // writing the segment
             let mut index_writer = index.writer_with_num_threads(1).unwrap();
@@ -99,8 +97,7 @@ mod tests {
     #[test]
     fn test_searcher() {
         let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
-        let text_field = schema.add_text_field("text", &text_fieldtype);
+        let text_field = schema.add_text_field("text", &schema::TEXT);
         let index = Index::create_in_ram(schema);
 
         {
diff --git a/src/postings/freq_handler.rs b/src/postings/freq_handler.rs
index 70bf54a1c..8cec21154 100644
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -1,5 +1,4 @@
 use compression::SIMDBlockDecoder;
-use DocId;
 
 pub enum FreqHandler {
     FreqReader(SIMDBlockDecoder),
@@ -8,6 +7,11 @@ pub enum FreqHandler {
 }
 
 impl FreqHandler {
+
+    pub fn new_freq_reader() -> FreqHandler {
+        FreqHandler::FreqReader(SIMDBlockDecoder::new()) // `FreqReader` variant wrapping a fresh block decoder
+    }
+
     pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
         match *self {
             FreqHandler::FreqReader(ref mut block_decoder) => {
diff --git a/src/postings/segment_postings.rs b/src/postings/segment_postings.rs
index f8f44332b..3dce489d8 100644
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -50,7 +50,7 @@ impl<'a> SegmentPostings<'a> {
             doc_freq: doc_freq as usize,
             doc_offset: 0,
             block_decoder: SIMDBlockDecoder::new(),
-            freq_reader: FreqHandler::NoFreq,
+            freq_reader: FreqHandler::new_freq_reader(),
             remaining_data: data,
             cur: Wrapping(usize::max_value()),
         }
diff --git a/src/postings/serializer.rs b/src/postings/serializer.rs
index 7542b2efc..d257acb76 100644
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -1,6 +1,8 @@
 use datastruct::FstMapBuilder;
 use super::TermInfo;
 use schema::Term;
+use schema::Schema;
+use schema::TextIndexingOptions;
 use directory::WritePtr;
 use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockEncoder, CompositeEncoder};
 use DocId;
@@ -23,8 +25,8 @@ pub struct PostingsSerializer {
     doc_ids: Vec<DocId>,
     term_freqs: Vec<u32>,
     position_deltas: Vec<u32>,
-    is_termfreq_enabled: bool,
-    is_positions_enabled: bool,
+    schema: Schema,
+    text_indexing_options: TextIndexingOptions,
 }
 
 impl PostingsSerializer {
@@ -34,6 +36,7 @@ impl PostingsSerializer {
         let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
         let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
         let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
+        let schema = segment.schema();
         Ok(PostingsSerializer {
             terms_fst_builder: terms_fst_builder,
             postings_write: postings_write,
@@ -46,13 +49,26 @@ impl PostingsSerializer {
             doc_ids: Vec::new(),
             term_freqs: Vec::new(),
             position_deltas: Vec::new(),
-            is_positions_enabled: false,
-            is_termfreq_enabled: false,
+            schema: schema,
+            text_indexing_options: TextIndexingOptions::Unindexed,
         })
     }
 
+    /// Sets `self.text_indexing_options` from the text field `term` belongs to.
+    /// A term for which `get_text_field()` is `None` is treated as `Unindexed`.
+    pub fn load_indexing_options(&mut self, term: &Term) {
+        let indexing_options = match term.get_text_field() {
+            None => TextIndexingOptions::Unindexed,
+            Some(field) => {
+                self.schema.text_field_options(&field).indexing_options()
+            }
+        };
+        self.text_indexing_options = indexing_options;
+    }
+
     pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
         try!(self.close_term());
+        self.load_indexing_options(term);
         self.doc_ids.clear();
         self.last_doc_id_encoded = 0;
         self.term_freqs.clear();
@@ -72,7 +88,7 @@ impl PostingsSerializer {
                 self.written_bytes_postings += block_encoded.len();
                 try!(self.postings_write.write_all(block_encoded));
             }
-            if self.is_termfreq_enabled {
+            if self.text_indexing_options.is_termfreq_enabled() {
                 {
                     let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
                     self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
@@ -81,7 +97,7 @@ impl PostingsSerializer {
                     }
                     self.term_freqs.clear();
                 }
-                if self.is_positions_enabled {
+                if self.text_indexing_options.is_position_enabled() {
                     let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
                     try!(self.positions_write.write_all(positions_encoded));
                     self.written_bytes_positions += positions_encoded.len();
@@ -95,13 +111,13 @@ impl PostingsSerializer {
 
     pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
         self.doc_ids.push(doc_id);
-        if self.is_termfreq_enabled {
+        if self.text_indexing_options.is_termfreq_enabled() {
             self.term_freqs.push(term_freq as u32);
         }
-        if self.is_positions_enabled {
+        if self.text_indexing_options.is_position_enabled() {
             self.position_deltas.extend_from_slice(position_deltas);
         }
-        if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { 
+        if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
             {
                 // encode the positions
                 let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
@@ -109,7 +125,7 @@ impl PostingsSerializer {
                 try!(self.postings_write.write_all(block_encoded));
                 self.written_bytes_postings += block_encoded.len();
             }
-            if self.is_termfreq_enabled {
+            if self.text_indexing_options.is_termfreq_enabled() {
                 // encode the term_freqs
                 let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
                 try!(self.postings_write.write_all(block_encoded));
@@ -120,7 +136,7 @@ impl PostingsSerializer {
         }
         Ok(())
     }
-    
+
     pub fn close(mut self,) -> io::Result<()> {
         try!(self.close_term());
         try!(self.terms_fst_builder.finish());
diff --git a/src/schema/mod.rs b/src/schema/mod.rs
index 62a22dc0f..c6d913ae4 100644
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -12,8 +12,9 @@ pub use self::text_field::TextFieldValue;
 pub use self::text_field::TextOptions;
 pub use self::text_field::FAST;
 pub use self::text_field::TEXT;
+pub use self::text_field::STRING;
 pub use self::text_field::STORED;
-
+pub use self::text_field::TextIndexingOptions;
 
 pub use self::u32_field::U32Field;
 pub use self::u32_field::U32FieldValue;
diff --git a/src/schema/schema.rs b/src/schema/schema.rs
index 1a72c7f03..25293e799 100644
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -33,19 +33,18 @@ pub struct U32FieldEntry {
 /// # Examples
 ///
 /// ```
-/// use tantivy::schema::{Schema, TextOptions};
+/// use tantivy::schema::*;
 ///
 /// fn create_schema() -> Schema {
 ///   let mut schema = Schema::new();
 ///   let str_fieldtype = TextOptions::new();
-///   let text_fieldtype = TextOptions::new().set_tokenized_indexed();
-///   let id_field = schema.add_text_field("id", &str_fieldtype);
-///   let url_field = schema.add_text_field("url", &str_fieldtype);
-///   let body_field = schema.add_text_field("body", &text_fieldtype);
-///   let id_field = schema.add_text_field("id", &str_fieldtype);
-///   let url_field = schema.add_text_field("url", &str_fieldtype);
-///   let title_field = schema.add_text_field("title", &text_fieldtype);
-///   let body_field = schema.add_text_field("body", &text_fieldtype);
+///   let id_field = schema.add_text_field("id", &STRING);
+///   let url_field = schema.add_text_field("url", &STRING);
+///   let body_field = schema.add_text_field("body", &TEXT);
+///   let id_field = schema.add_text_field("id", &STRING);
+///   let url_field = schema.add_text_field("url", &STRING);
+///   let title_field = schema.add_text_field("title", &TEXT);
+///   let body_field = schema.add_text_field("body", &TEXT);
 ///   schema
 /// }
 ///
diff --git a/src/schema/term.rs b/src/schema/term.rs
index 5c07978fa..7535e787b 100644
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -24,6 +24,27 @@ impl Term {
         }
     }
 
+    fn type_num(&self,) -> u8 {
+        self.data[0] // first byte of the term's byte representation
+    }
+
+    pub fn is_u32(&self,) -> bool {
+        !self.is_text() // any term that is not a text term is reported as u32
+    }
+
+    pub fn is_text(&self,) -> bool {
+        self.type_num() & 128 == 0 // text terms have the high bit of the first byte cleared
+    }
+
+    /// Returns the `TextField` built from the term's first byte,
+    /// or `None` when `is_text()` is false.
+    pub fn get_text_field(&self,) -> Option<TextField> {
+        if !self.is_text() {
+            return None;
+        }
+        Some(TextField(self.type_num()))
+    }
+
     pub fn from_field_text(field: &TextField, text: &str) -> Term {
         let mut buffer = Vec::with_capacity(1 + text.len());
         let TextField(field_idx) = *field;
diff --git a/src/schema/text_field.rs b/src/schema/text_field.rs
index ecd8340a0..e8fd12524 100644
--- a/src/schema/text_field.rs
+++ b/src/schema/text_field.rs
@@ -10,17 +10,74 @@ use std::ops::BitOr;
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct TextField(pub u8);
 
+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)]
+pub enum TextIndexingOptions { // indexing mode carried by `TextOptions`
+    Unindexed, // default of `TextOptions::new()`; also used by `STORED` and `FAST`
+    Untokenized, // used by `STRING`; not tokenized, no term freqs, no positions
+    TokenizedNoFreq, // tokenized; term freqs and positions disabled
+    TokenizedWithFreq, // tokenized; term freqs enabled, positions disabled
+    TokenizedWithFreqAndPosition, // used by `TEXT`; tokenized, term freqs and positions enabled
+}
+
+impl TextIndexingOptions {
+    /// Returns true for the two `...WithFreq...` modes, false otherwise.
+    pub fn is_termfreq_enabled(&self) -> bool {
+        match *self {
+            TextIndexingOptions::TokenizedWithFreq |
+            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
+            _ => false,
+        }
+    }
+
+    /// Returns true for every `Tokenized...` mode, false otherwise.
+    pub fn is_tokenized(&self,) -> bool {
+        match *self {
+            TextIndexingOptions::TokenizedNoFreq |
+            TextIndexingOptions::TokenizedWithFreq |
+            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
+            _ => false,
+        }
+    }
+
+    /// Returns true only for `TokenizedWithFreqAndPosition`.
+    pub fn is_position_enabled(&self,) -> bool {
+        *self == TextIndexingOptions::TokenizedWithFreqAndPosition
+    }
+}
+
+
+/// `a | b` merges two indexing options; `Unindexed` acts as the neutral
+/// element of the operation.
+impl BitOr for TextIndexingOptions {
+    type Output = TextIndexingOptions;
+
+    /// Returns the non-`Unindexed` operand, or `self` when both are equal.
+    ///
+    /// Panics if the operands are two different non-`Unindexed` options.
+    fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions {
+        use super::TextIndexingOptions::*;
+        if self == Unindexed {
+            other
+        } else if other == Unindexed || self == other {
+            self
+        } else {
+            // TODO: make it possible to combine two distinct indexed modes.
+            panic!("Combining {:?} and {:?} is ambiguous", self, other);
+        }
+    }
+}
 
 #[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
 pub struct TextOptions {
-    tokenized_indexed: bool,
+    indexing_options: TextIndexingOptions,
     stored: bool,
     fast: bool,
 }
 
 impl TextOptions {
-    pub fn is_tokenized_indexed(&self,) -> bool {
-        self.tokenized_indexed
+    
+    pub fn indexing_options(&self,) -> TextIndexingOptions {
+        self.indexing_options.clone()
     }
 
     pub fn is_stored(&self,) -> bool {
@@ -41,15 +98,15 @@ impl TextOptions {
         self
     }
 
-    pub fn set_tokenized_indexed(mut self,) -> TextOptions {
-        self.tokenized_indexed = true;
+    pub fn set_indexing_options(mut self, indexing_options: TextIndexingOptions) -> TextOptions {
+        self.indexing_options = indexing_options;
         self
     }
 
     pub fn new() -> TextOptions {
         TextOptions {
             fast: false,
-            tokenized_indexed: false,
+            indexing_options: TextIndexingOptions::Unindexed,
             stored: false,
         }
     }
@@ -94,12 +151,17 @@ pub struct TextFieldValue {
 }
 
 
-
+/// The field will be indexed but not tokenized (`TextIndexingOptions::Untokenized`)
+pub const STRING: TextOptions = TextOptions {
+    indexing_options: TextIndexingOptions::Untokenized,
+    stored: false,
+    fast: false,
+};
 
 
 /// The field will be tokenized and indexed
 pub const TEXT: TextOptions = TextOptions {
-    tokenized_indexed: true,
+    indexing_options: TextIndexingOptions::TokenizedWithFreqAndPosition,
     stored: false,
     fast: false,
 };
@@ -109,7 +171,7 @@ pub const TEXT: TextOptions = TextOptions {
 /// Reading the stored fields of a document is relatively slow.
 /// (100 microsecs)
 pub const STORED: TextOptions = TextOptions {
-    tokenized_indexed: false,
+    indexing_options: TextIndexingOptions::Unindexed,
     stored: true,
     fast: false,
 };
@@ -117,7 +179,7 @@ pub const STORED: TextOptions = TextOptions {
 /// Fast field are used for field you need to access many times during
 /// collection. (e.g: for sort, aggregates).
 pub const FAST: TextOptions = TextOptions {
-    tokenized_indexed: false,
+    indexing_options: TextIndexingOptions::Unindexed,
     stored: false,
     fast: true
 };
@@ -129,7 +191,7 @@ impl BitOr for TextOptions {
 
     fn bitor(self, other: TextOptions) -> TextOptions {
         let mut res = TextOptions::new();
-        res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
+        res.indexing_options = self.indexing_options | other.indexing_options;
         res.stored = self.stored || other.stored;
         res.fast = self.fast || other.fast;
         res
@@ -148,19 +210,19 @@ mod tests {
             let field_options = STORED | FAST;
             assert!(field_options.is_stored());
             assert!(field_options.is_fast());
-            assert!(!field_options.is_tokenized_indexed());
+            assert!(!field_options.indexing_options().is_tokenized());
         }
         {
             let field_options = STORED | TEXT;
             assert!(field_options.is_stored());
             assert!(!field_options.is_fast());
-            assert!(field_options.is_tokenized_indexed());
+            assert!(field_options.indexing_options().is_tokenized());
         }
         {
             let mut schema = Schema::new();
             let _body_field: TextField = schema.add_text_field("body", &TEXT);
             let field = schema.text_field("body");
-            assert!(schema.text_field_options(&field).is_tokenized_indexed());
+            assert!(schema.text_field_options(&field).indexing_options().is_tokenized());
         }
     }
 }