Reader working with compressed data.

This commit is contained in:
Paul Masurel
2016-02-18 20:31:19 +09:00
parent c0c0a2c579
commit b78e5320c3
3 changed files with 43 additions and 25 deletions

View File

@@ -43,23 +43,26 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<()> {
// TODO write_all transmuted [u8]
for num in self.encoder.encode(doc_ids) {
let docs_data = self.encoder.encode(doc_ids);
match self.postings_write.write_u32::<BigEndian>(docs_data.len() as u32) {
Ok(_) => {}
Err(_) =>{
let msg = String::from("Failed while writing posting list");
return Err(Error::WriteError(msg));
}
}
self.written_bytes_postings += 4;
for num in docs_data {
match self.postings_write.write_u32::<BigEndian>(num.clone() as u32) {
Ok(_) => {},
Ok(_) => {
self.written_bytes_postings += 4;
},
Err(_) => {
let msg = String::from("Failed while writing posting list");
return Err(Error::WriteError(msg));
},
}
}
// match self.postings_write.write_u32::<BigEndian>(doc_id as u32) {
// Ok(_) => {},
// Err(_) => {
// let msg = String::from("Failed while writing posting list");
// return Err(Error::WriteError(msg));
// },
// }
//self.written_bytes_postings += 4;
Ok(())
}

View File

@@ -17,6 +17,7 @@ use core::directory::SegmentComponent;
use fst::raw::MmapReadOnly;
use core::error::{Result, Error};
use core::postings::Postings;
use core::simdcompression::Decoder;
// TODO file structure should be in codec
@@ -27,24 +28,37 @@ pub struct SegmentReader {
}
pub struct SegmentPostings<'a> {
cursor: Cursor<&'a [u8]>,
num_docs_remaining: usize,
pub struct SegmentPostings {
doc_id: usize,
doc_ids: Vec<u32>,
}
impl<'a> SegmentPostings<'a> {
impl SegmentPostings {
pub fn from_data(data: &[u8]) -> SegmentPostings {
let mut cursor = Cursor::new(data);
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
println!("doc_freq {}", doc_freq);
let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
// TODO remove allocs
let mut data = Vec::with_capacity(data_size);
for _ in 0..data_size {
data.push(cursor.read_u32::<BigEndian>().unwrap());
}
let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect();
let decoder = Decoder::new();
decoder.decode(&data, &mut doc_ids);
for a in doc_ids.iter() {
println!("uncompressed {}", a);
}
SegmentPostings {
cursor: cursor,
num_docs_remaining: doc_freq,
doc_ids: doc_ids,
doc_id: 0,
}
}
}
impl<'a> Postings for SegmentPostings<'a> {
impl Postings for SegmentPostings {
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
@@ -61,17 +75,18 @@ impl<'a> Postings for SegmentPostings<'a> {
}
impl<'a> Iterator for SegmentPostings<'a> {
impl Iterator for SegmentPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.num_docs_remaining <= 0 {
None
if self.doc_id < self.doc_ids.len() {
let res = Some(self.doc_ids[self.doc_id]);
self.doc_id += 1;
return res;
}
else {
self.num_docs_remaining -= 1;
Some(self.cursor.read_u32::<BigEndian>().unwrap() as DocId)
None
}
}
}
@@ -109,7 +124,7 @@ impl SegmentReader {
SegmentPostings::from_data(&postings_data)
}
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings<'a>> {
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings> {
println!("Term {:?}", term);
match self.term_offsets.get(term.as_slice()) {
Some(offset) => {
@@ -120,7 +135,7 @@ impl SegmentReader {
}
}
pub fn search<'a>(&'a self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings<'a>> {
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
let segment_postings: Vec<SegmentPostings> = terms
.iter()
.map(|term| self.get_term(term).unwrap())

View File

@@ -138,7 +138,7 @@ fn test_searcher() {
}
{
let searcher = Searcher::for_directory(directory);
let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), );
let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector);
let vals: Vec<DocId> = collector.docs().iter()