test passing

2026-05-30 15:10:40 +00:00 · 2016-05-19 11:08:50 +09:00
parent 9226338bf0
commit cdcc72a0c9
12 changed files with 152 additions and 46 deletions
--- a/TODO.md
+++ b/TODO.md
@@ -11,3 +11,4 @@ use skip list for each blocks
 find a clear way to put the tokenized/untokenized thing upstream
 index frequent bigrams
 clean up compression
+reconsider using the first byte of a term's [u8] repr as the field id.
--- a/src/core/index.rs
+++ b/src/core/index.rs
@@ -258,6 +258,10 @@ impl fmt::Debug for Segment {

 impl Segment {

+    pub fn schema(&self,) -> Schema { // returns the schema reported by this segment's `index`
+        self.index.schema()
+    }
+
    pub fn id(&self,) -> SegmentId {
        self.segment_id.clone()
    }
--- a/src/core/merger.rs
+++ b/src/core/merger.rs
@@ -74,7 +74,7 @@ impl<'a> PostingsMerger<'a> {
        }
        postings_merger
    }
-    
+
    // pushes the term_reader associated with the given segment ordinal
    // into the heap.
    fn push_next_segment_el(&mut self, segment_ord: usize) {
@@ -96,7 +96,7 @@ impl<'a> PostingsMerger<'a> {
            let offset = self.doc_offsets[heap_item.segment_ord];
            let reader = &self.readers[heap_item.segment_ord];
            let segment_postings = reader.read_postings(&heap_item.term_info);
-            let offset_postings = OffsetPostings::new(segment_postings, offset); 
+            let offset_postings = OffsetPostings::new(segment_postings, offset);
            segment_postings_list.push(offset_postings);
        }
        self.push_next_segment_el(heap_item.segment_ord);
@@ -223,11 +223,12 @@ mod tests {
    use core::searcher::DocAddress;
    use collector::FastFieldTestCollector;
    use collector::TestCollector;
+    use schema::TextIndexingOptions;

    #[test]
    fn test_index_merger() {
        let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed().set_stored();
+        let text_fieldtype = schema::TextOptions::new().set_indexing_options(TextIndexingOptions::TokenizedWithFreq).set_stored();
        let text_field = schema.add_text_field("text", &text_fieldtype);
        let score_fieldtype = schema::U32Options::new().set_fast();
        let score_field = schema.add_u32_field("score", &score_fieldtype);
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -161,7 +161,7 @@ impl SegmentWriter {
        let doc_id = self.max_doc;
        for field_value in doc.text_fields() {
 			let field_options = schema.text_field_options(&field_value.field);
-			if field_options.is_tokenized_indexed() {
+			if field_options.indexing_options().is_tokenized() {
 				let mut tokens = self.tokenizer.tokenize(&field_value.text);
 				let mut pos = 0u32;
 				loop {
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -63,11 +63,9 @@ mod tests {
    #[test]
    fn test_indexing() {
        let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
-        let text_field = schema.add_text_field("text", &text_fieldtype);
+        let text_field = schema.add_text_field("text", &schema::TEXT);

        let index = Index::create_from_tempdir(schema).unwrap();
-
        {
            // writing the segment
            let mut index_writer = index.writer_with_num_threads(1).unwrap();
@@ -99,8 +97,7 @@ mod tests {
    #[test]
    fn test_searcher() {
        let mut schema = schema::Schema::new();
-        let text_fieldtype = schema::TextOptions::new().set_tokenized_indexed();
-        let text_field = schema.add_text_field("text", &text_fieldtype);
+        let text_field = schema.add_text_field("text", &schema::TEXT);
        let index = Index::create_in_ram(schema);

        {
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -1,5 +1,4 @@
 use compression::SIMDBlockDecoder;
-use DocId;

 pub enum FreqHandler {
    FreqReader(SIMDBlockDecoder),
@@ -8,6 +7,11 @@ pub enum FreqHandler {
 }

 impl FreqHandler {
+
+    pub fn new_freq_reader() -> FreqHandler { // builds the `FreqReader` variant around a fresh `SIMDBlockDecoder`
+        FreqHandler::FreqReader(SIMDBlockDecoder::new())
+    }
+
    pub fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
        match *self {
            FreqHandler::FreqReader(ref mut block_decoder) => {
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -50,7 +50,7 @@ impl<'a> SegmentPostings<'a> {
            doc_freq: doc_freq as usize,
            doc_offset: 0,
            block_decoder: SIMDBlockDecoder::new(),
-            freq_reader: FreqHandler::NoFreq,
+            freq_reader: FreqHandler::new_freq_reader(),
            remaining_data: data,
            cur: Wrapping(usize::max_value()),
        }
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -1,6 +1,8 @@
 use datastruct::FstMapBuilder;
 use super::TermInfo;
 use schema::Term;
+use schema::Schema;
+use schema::TextIndexingOptions;
 use directory::WritePtr;
 use compression::{NUM_DOCS_PER_BLOCK, SIMDBlockEncoder, CompositeEncoder};
 use DocId;
@@ -23,8 +25,8 @@ pub struct PostingsSerializer {
    doc_ids: Vec<DocId>,
    term_freqs: Vec<u32>,
    position_deltas: Vec<u32>,
-    is_termfreq_enabled: bool,
-    is_positions_enabled: bool,
+    schema: Schema,
+    text_indexing_options: TextIndexingOptions,
 }

 impl PostingsSerializer {
@@ -34,6 +36,7 @@ impl PostingsSerializer {
        let terms_fst_builder = try!(FstMapBuilder::new(terms_write));
        let postings_write = try!(segment.open_write(SegmentComponent::POSTINGS));
        let positions_write = try!(segment.open_write(SegmentComponent::POSITIONS));
+        let schema = segment.schema();
        Ok(PostingsSerializer {
            terms_fst_builder: terms_fst_builder,
            postings_write: postings_write,
@@ -46,13 +49,26 @@ impl PostingsSerializer {
            doc_ids: Vec::new(),
            term_freqs: Vec::new(),
            position_deltas: Vec::new(),
-            is_positions_enabled: false,
-            is_termfreq_enabled: false,
+            schema: schema,
+            text_indexing_options: TextIndexingOptions::Unindexed,
        })
    }

+    pub fn load_indexing_options(&mut self, term: &Term) { // stores in `self.text_indexing_options` the options that apply to `term`
+        self.text_indexing_options = match term.get_text_field() {
+            Some(text_field) => { // text term: look up its field's options in the schema
+                let text_options = self.schema.text_field_options(&text_field);
+                text_options.indexing_options() 
+            }
+            None => { // non-text term: falls back to `Unindexed`, for which freq and position predicates are false
+                TextIndexingOptions::Unindexed
+            }
+        };
+    }
+
    pub fn new_term(&mut self, term: &Term, doc_freq: DocId) -> io::Result<()> {
        try!(self.close_term());
+        self.load_indexing_options(term);
        self.doc_ids.clear();
        self.last_doc_id_encoded = 0;
        self.term_freqs.clear();
@@ -72,7 +88,7 @@ impl PostingsSerializer {
                self.written_bytes_postings += block_encoded.len();
                try!(self.postings_write.write_all(block_encoded));
            }
-            if self.is_termfreq_enabled {
+            if self.text_indexing_options.is_termfreq_enabled() {
                {
                    let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
                    self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
@@ -81,7 +97,7 @@ impl PostingsSerializer {
                    }
                    self.term_freqs.clear();
                }
-                if self.is_positions_enabled {
+                if self.text_indexing_options.is_position_enabled() {
                    let positions_encoded: &[u8] = self.positions_encoder.compress_unsorted(&self.position_deltas[..]);
                    try!(self.positions_write.write_all(positions_encoded));
                    self.written_bytes_positions += positions_encoded.len();
@@ -95,13 +111,13 @@ impl PostingsSerializer {

    pub fn write_doc(&mut self, doc_id: DocId, term_freq: u32, position_deltas: &[u32]) -> io::Result<()> {
        self.doc_ids.push(doc_id);
-        if self.is_termfreq_enabled {
+        if self.text_indexing_options.is_termfreq_enabled() {
            self.term_freqs.push(term_freq as u32);
        }
-        if self.is_positions_enabled {
+        if self.text_indexing_options.is_position_enabled() {
            self.position_deltas.extend_from_slice(position_deltas);
        }
-        if self.doc_ids.len() == NUM_DOCS_PER_BLOCK { 
+        if self.doc_ids.len() == NUM_DOCS_PER_BLOCK {
            {
                // encode the positions
                let block_encoded: &[u8] = self.block_encoder.compress_block_sorted(&self.doc_ids, self.last_doc_id_encoded);
@@ -109,7 +125,7 @@ impl PostingsSerializer {
                try!(self.postings_write.write_all(block_encoded));
                self.written_bytes_postings += block_encoded.len();
            }
-            if self.is_termfreq_enabled {
+            if self.text_indexing_options.is_termfreq_enabled() {
                // encode the term_freqs
                let block_encoded: &[u8] = self.block_encoder.compress_block_unsorted(&self.term_freqs);
                try!(self.postings_write.write_all(block_encoded));
@@ -120,7 +136,7 @@ impl PostingsSerializer {
        }
        Ok(())
    }
-    
+
    pub fn close(mut self,) -> io::Result<()> {
        try!(self.close_term());
        try!(self.terms_fst_builder.finish());
--- a/src/schema/mod.rs
+++ b/src/schema/mod.rs
@@ -12,8 +12,9 @@ pub use self::text_field::TextFieldValue;
 pub use self::text_field::TextOptions;
 pub use self::text_field::FAST;
 pub use self::text_field::TEXT;
+pub use self::text_field::STRING;
 pub use self::text_field::STORED;
-
+pub use self::text_field::TextIndexingOptions;

 pub use self::u32_field::U32Field;
 pub use self::u32_field::U32FieldValue;
--- a/src/schema/schema.rs
+++ b/src/schema/schema.rs
@@ -33,19 +33,18 @@ pub struct U32FieldEntry {
 /// # Examples
 ///
 /// ```
-/// use tantivy::schema::{Schema, TextOptions};
+/// use tantivy::schema::*;
 ///
 /// fn create_schema() -> Schema {
 ///   let mut schema = Schema::new();
 ///   let str_fieldtype = TextOptions::new();
-///   let text_fieldtype = TextOptions::new().set_tokenized_indexed();
-///   let id_field = schema.add_text_field("id", &str_fieldtype);
-///   let url_field = schema.add_text_field("url", &str_fieldtype);
-///   let body_field = schema.add_text_field("body", &text_fieldtype);
-///   let id_field = schema.add_text_field("id", &str_fieldtype);
-///   let url_field = schema.add_text_field("url", &str_fieldtype);
-///   let title_field = schema.add_text_field("title", &text_fieldtype);
-///   let body_field = schema.add_text_field("body", &text_fieldtype);
+///   let id_field = schema.add_text_field("id", &STRING);
+///   let url_field = schema.add_text_field("url", &STRING);
+///   let body_field = schema.add_text_field("body", &TEXT);
+///   let id_field = schema.add_text_field("id", &STRING);
+///   let url_field = schema.add_text_field("url", &STRING);
+///   let title_field = schema.add_text_field("title", &TEXT);
+///   let body_field = schema.add_text_field("body", &TEXT);
 ///   schema
 /// }
 ///
--- a/src/schema/term.rs
+++ b/src/schema/term.rs
@@ -24,6 +24,27 @@ impl Term {
        }
    }

+    fn type_num(&self,) -> u8 { // first byte of the term's `data` buffer
+        self.data[0]
+    }
+
+    pub fn is_u32(&self,) -> bool { // defined as the exact negation of `is_text()`
+        !self.is_text()
+    }
+
+    pub fn is_text(&self,) -> bool { // true when the high bit (128) of the first byte is clear
+        self.type_num() & 128 == 0
+    }
+
+    pub fn get_text_field(&self,) -> Option<TextField> { // `Some` for text terms, `None` otherwise
+        if self.is_text() {
+            Some(TextField(self.type_num())) // the first byte itself is used as the `TextField` id
+        }
+        else {
+            None
+        }
+    }
+
    pub fn from_field_text(field: &TextField, text: &str) -> Term {
        let mut buffer = Vec::with_capacity(1 + text.len());
        let TextField(field_idx) = *field;
--- a/src/schema/text_field.rs
+++ b/src/schema/text_field.rs
@@ -10,17 +10,74 @@ use std::ops::BitOr;
 #[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash)]
 pub struct TextField(pub u8);

+#[derive(Clone,Debug,PartialEq,PartialOrd,Eq,Hash, RustcDecodable, RustcEncodable)]
+pub enum TextIndexingOptions {
+    Unindexed, // all `is_*` predicates return false; value set by `TextOptions::new()`
+    Untokenized, // all `is_*` predicates return false; value used by the `STRING` preset
+    TokenizedNoFreq, // only `is_tokenized()` returns true
+    TokenizedWithFreq, // `is_tokenized()` and `is_termfreq_enabled()` return true
+    TokenizedWithFreqAndPosition, // all three `is_*` predicates return true; value used by the `TEXT` preset
+}
+
+impl TextIndexingOptions {
+    pub fn is_termfreq_enabled(&self) -> bool { // true only for the two `TokenizedWithFreq*` variants
+        match *self {
+            TextIndexingOptions::TokenizedWithFreq => true,
+            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
+            _ => false,
+        }
+    }
+    
+    pub fn is_tokenized(&self,) -> bool { // true for the three `Tokenized*` variants
+        match *self {
+            TextIndexingOptions::TokenizedNoFreq => true,
+            TextIndexingOptions::TokenizedWithFreq => true,
+            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
+            _ => false,
+        }
+    } 
+    
+    pub fn is_position_enabled(&self,) -> bool { // true only for `TokenizedWithFreqAndPosition`
+        match *self {
+            TextIndexingOptions::TokenizedWithFreqAndPosition => true,
+            _ => false,
+        }
+    }
+}
+
+
+impl BitOr for TextIndexingOptions { // `a | b` merges two indexing modes; `Unindexed` acts as the identity
+     type Output = TextIndexingOptions;
+
+    fn bitor(self, other: TextIndexingOptions) -> TextIndexingOptions { // panics if both sides are indexed but differ
+        use super::TextIndexingOptions::*;
+        if self == Unindexed {
+            other
+        }
+        else if other == Unindexed {
+            self
+        }
+        else if self == other {
+            self
+        }
+        else {
+            // No merge rule exists yet for two distinct indexed modes; name both in the message.
+            panic!("Combining {:?} and {:?} is ambiguous", self, other);
+        }
+    }
+}

 #[derive(Clone,Debug,PartialEq,Eq, RustcDecodable, RustcEncodable)]
 pub struct TextOptions {
-    tokenized_indexed: bool,
+    indexing_options: TextIndexingOptions,
    stored: bool,
    fast: bool,
 }

 impl TextOptions {
-    pub fn is_tokenized_indexed(&self,) -> bool {
-        self.tokenized_indexed
+    
+    pub fn indexing_options(&self,) -> TextIndexingOptions {
+        self.indexing_options.clone()
    }

    pub fn is_stored(&self,) -> bool {
@@ -41,15 +98,15 @@ impl TextOptions {
        self
    }

-    pub fn set_tokenized_indexed(mut self,) -> TextOptions {
-        self.tokenized_indexed = true;
+    pub fn set_indexing_options(mut self, indexing_options: TextIndexingOptions) -> TextOptions {
+        self.indexing_options = indexing_options;
        self
    }

    pub fn new() -> TextOptions {
        TextOptions {
            fast: false,
-            tokenized_indexed: false,
+            indexing_options: TextIndexingOptions::Unindexed,
            stored: false,
        }
    }
@@ -94,12 +151,17 @@ pub struct TextFieldValue {
 }


-
+/// The field will be untokenized and indexed
+pub const STRING: TextOptions = TextOptions {
+    indexing_options: TextIndexingOptions::Untokenized,
+    stored: false,
+    fast: false,
+};


 /// The field will be tokenized and indexed
 pub const TEXT: TextOptions = TextOptions {
-    tokenized_indexed: true,
+    indexing_options: TextIndexingOptions::TokenizedWithFreqAndPosition,
    stored: false,
    fast: false,
 };
@@ -109,7 +171,7 @@ pub const TEXT: TextOptions = TextOptions {
 /// Reading the stored fields of a document is relatively slow.
 /// (100 microsecs)
 pub const STORED: TextOptions = TextOptions {
-    tokenized_indexed: false,
+    indexing_options: TextIndexingOptions::Unindexed,
    stored: true,
    fast: false,
 };
@@ -117,7 +179,7 @@ pub const STORED: TextOptions = TextOptions {
 /// Fast field are used for field you need to access many times during
 /// collection. (e.g: for sort, aggregates).
 pub const FAST: TextOptions = TextOptions {
-    tokenized_indexed: false,
+    indexing_options: TextIndexingOptions::Unindexed,
    stored: false,
    fast: true
 };
@@ -129,7 +191,7 @@ impl BitOr for TextOptions {

    fn bitor(self, other: TextOptions) -> TextOptions {
        let mut res = TextOptions::new();
-        res.tokenized_indexed = self.tokenized_indexed || other.tokenized_indexed;
+        res.indexing_options = self.indexing_options | other.indexing_options;
        res.stored = self.stored || other.stored;
        res.fast = self.fast || other.fast;
        res
@@ -148,19 +210,19 @@ mod tests {
            let field_options = STORED | FAST;
            assert!(field_options.is_stored());
            assert!(field_options.is_fast());
-            assert!(!field_options.is_tokenized_indexed());
+            assert!(!field_options.indexing_options().is_tokenized());
        }
        {
            let field_options = STORED | TEXT;
            assert!(field_options.is_stored());
            assert!(!field_options.is_fast());
-            assert!(field_options.is_tokenized_indexed());
+            assert!(field_options.indexing_options().is_tokenized());
        }
        {
            let mut schema = Schema::new();
            let _body_field: TextField = schema.add_text_field("body", &TEXT);
            let field = schema.text_field("body");
-            assert!(schema.text_field_options(&field).is_tokenized_indexed());
+            assert!(schema.text_field_options(&field).indexing_options().is_tokenized());
        }
    }
 }