fixed reading docfreq

2026-05-30 23:20:40 +00:00 · 2016-06-21 10:07:12 +09:00
parent 36684d76c5
commit 0c230c9ebf
8 changed files with 66 additions and 62 deletions
--- a/src/core/segment_reader.rs
+++ b/src/core/segment_reader.rs
@@ -12,16 +12,13 @@ use datastruct::FstMap;
 use std::fmt;
 use rustc_serialize::json;
 use core::index::SegmentInfo;
-use common::OpenTimer;
 use schema::Field;
 use core::convert_to_ioerror;
 use postings::SegmentPostings;
 use postings::Postings;
 use fastfield::{U32FastFieldsReader, U32FastFieldReader};
-use postings::intersection;
 use schema::FieldEntry;
 use schema::Schema;
-use schema::FieldValue;
 use postings::FreqHandler;

 pub struct SegmentReader {
--- a/src/core/writer.rs
+++ b/src/core/writer.rs
@@ -172,7 +172,7 @@ impl SegmentWriter {
 							match tokens.next() {
 								Some(token) => {
 									let term = Term::from_field_text(field, token);
-									self.postings_writer.suscribe(doc_id, pos.clone(), term);
+									self.postings_writer.suscribe(doc_id, pos, term);
 									pos += 1;
 								},
 								None => { break; }
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -71,6 +71,7 @@ mod tests {
    use super::*;
    use collector::TestCollector;
    use query::MultiTermQuery;
+    use postings::Postings;

    #[test]
    fn test_indexing() {
@@ -106,6 +107,32 @@ mod tests {
    }


+    #[test]
+    fn test_docfreq() {
+        let mut schema = schema::Schema::new();
+        let text_field = schema.add_text_field("text", schema::TEXT);
+        let index = Index::create_in_ram(schema);
+        {
+            // writing the segment
+            let mut index_writer = index.writer_with_num_threads(1).unwrap();
+            {
+                let mut doc = Document::new();
+                doc.add_text(text_field, "af af af bc bc");
+                index_writer.add_document(doc).unwrap();
+            }
+            index_writer.wait().unwrap();
+        }
+        {
+            let searcher = index.searcher().unwrap();
+            let reader = &searcher.segments()[0];
+            let mut postings = reader.read_postings(&Term::from_field_text(text_field, "af")).unwrap();
+            assert!(postings.next());
+            assert_eq!(postings.doc(), 0);
+            assert_eq!(postings.freq(), 3);
+            assert!(!postings.next());
+        }
+    }
+    
    #[test]
    fn test_searcher() {
        let mut schema = schema::Schema::new();
@@ -117,7 +144,7 @@ mod tests {
            let mut index_writer = index.writer_with_num_threads(1).unwrap();
            {
                let mut doc = Document::new();
-                doc.add_text(text_field, "af b");
+                doc.add_text(text_field, "af af af b");
                index_writer.add_document(doc).unwrap();
            }
            {
--- a/src/postings/freq_handler.rs
+++ b/src/postings/freq_handler.rs
@@ -6,6 +6,8 @@ pub enum FreqHandler {
    NoFreq,
 }

+const EMPTY: [u32; 0] = [];
+
 impl FreqHandler {

    pub fn new_freq_reader() -> FreqHandler {
@@ -34,39 +36,15 @@ impl FreqHandler {

    }

-}
-//
-//
-// pub struct FreqReader {
-//     block_decoder: SIMDBlockDecoder,
-// }
-//
-// impl FreqReader {
-//     fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
-//         self.block_decoder.uncompress_block_unsorted(data)
-//     }
-//
-//     fn term_freq_block(&self, doc: DocId) -> u32 {
-//         self.block_decoder.output()[doc as usize]
-//     }
-//
-//     fn read_freq_vint(&mut self, data: &[u8], num_els: usize) {
-//         self.block_decoder.uncompress_vint_unsorted(data, num_els)
-//     }
-// }
-//
-//
-// pub struct NoFreqReader;
-//
-// impl FreqHandler for NoFreqReader {
-//     fn read_freq_block<'a>(&mut self, data: &'a [u8]) -> &'a [u8] {
-//         data
-//     }
-//
-//     fn term_freq_block(&self, _doc: DocId) -> u32 {
-//         0
-//     }
-//
-//     fn read_freq_vint(&mut self, _data: &[u8], _num_els: usize) {
-//     }
-// }
+    pub fn output(&self,)-> &[u32] {
+        match *self {
+            FreqHandler::FreqReader(ref block_decoder) => {
+                block_decoder.output()
+            }
+            FreqHandler::NoFreq => {
+                &EMPTY
+            }
+        }
+    }
+
+}
--- a/src/postings/segment_postings.rs
+++ b/src/postings/segment_postings.rs
@@ -55,6 +55,10 @@ impl<'a> SegmentPostings<'a> {
            cur: Wrapping(usize::max_value()),
        }
    }
+
+    pub fn freq(&self,) -> u32 {
+        self.freq_handler.output()[self.cur.0]
+    }
 }

 impl<'a> Postings for SegmentPostings<'a> {
@@ -99,4 +103,6 @@ impl<'a> Postings for SegmentPostings<'a> {
    fn doc_freq(&self,) -> usize {
        self.doc_freq
    }
+
+
 }
--- a/src/postings/serializer.rs
+++ b/src/postings/serializer.rs
@@ -12,7 +12,6 @@ use core::index::Segment;
 use std::io;
 use core::index::SegmentComponent;
 use common::BinarySerializable;
-use common::VInt;


 pub struct PostingsSerializer {
@@ -93,7 +92,6 @@ impl PostingsSerializer {
            if self.text_indexing_options.is_termfreq_enabled() {
                {
                    let block_encoded = self.block_encoder.compress_vint_unsorted(&self.term_freqs[..]);
-                    self.written_bytes_postings += try!(VInt(block_encoded.len() as u64).serialize(&mut self.postings_write));
                    for num in block_encoded {
                        self.written_bytes_postings += try!(num.serialize(&mut self.postings_write));
                    }
--- a/src/postings/writer.rs
+++ b/src/postings/writer.rs
@@ -3,8 +3,8 @@ use std::collections::BTreeMap;
 use schema::Term;
 use postings::PostingsSerializer;
 use std::io;
-pub use postings::Recorder;
-pub use postings::NothingRecorder;
+use postings::Recorder;
+use postings::TermFrequencyRecorder;



@@ -25,24 +25,22 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
        self.recorder.close_doc();
    }
    
-    fn is_new_doc(&self, doc: &DocId) -> bool {
-        match self.doc_ids.last() {
-            Some(&last_doc) => last_doc != *doc,
-            None => true,
-        }
-    }
-
    pub fn doc_freq(&self) -> u32 {
        self.doc_ids.len() as u32
    }

    pub fn suscribe(&mut self, doc: DocId, pos: u32) {
-        if self.is_new_doc(&doc) {
-            // this is the first time we meet this term for this document
-            // first close the previous document, and write its doc_freq.
-            self.close_doc();
-            self.doc_ids.push(doc);
-		}
+         match self.doc_ids.last() {
+            Some(&last_doc) => {
+                if last_doc != doc {
+                    self.close_doc();
+                    self.doc_ids.push(doc);
+                }
+            },
+            None => {
+                self.doc_ids.push(doc)
+            },
+        }
        self.recorder.record_position(pos);
    }
    
@@ -55,9 +53,10 @@ impl<Rec: Recorder> TermPostingsWriter<Rec> {
    }       
 }

+// TODO use something faster than the TermFrequencyRecorder when possible.

 pub struct PostingsWriter {
-    postings: Vec<TermPostingsWriter<NothingRecorder>>,
+    postings: Vec<TermPostingsWriter<TermFrequencyRecorder>>,
    term_index: BTreeMap<Term, usize>,
 }

@@ -77,11 +76,11 @@ impl PostingsWriter {
    }

    pub fn suscribe(&mut self, doc: DocId, pos: u32, term: Term) {
-        let doc_ids: &mut TermPostingsWriter<NothingRecorder> = self.get_term_postings(term);
+        let doc_ids: &mut TermPostingsWriter<TermFrequencyRecorder> = self.get_term_postings(term);
        doc_ids.suscribe(doc, pos);
    }

-    fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<NothingRecorder> {
+    fn get_term_postings(&mut self, term: Term) -> &mut TermPostingsWriter<TermFrequencyRecorder> {
        match self.term_index.get(&term) {
            Some(unord_id) => {
                return &mut self.postings[*unord_id];
--- a/src/schema/field_entry.rs
+++ b/src/schema/field_entry.rs
@@ -1,4 +1,3 @@
-use rustc_serialize::Encodable;
 use rustc_serialize::Decoder;
 use rustc_serialize::Encoder;
 use schema::TextOptions;