From b78e5320c36efe68957bc51b879cd59ac546599c Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Thu, 18 Feb 2016 20:31:19 +0900 Subject: [PATCH] reader wokring with compressed data. --- src/core/codec.rs | 23 +++++++++++++---------- src/core/reader.rs | 43 +++++++++++++++++++++++++++++-------------- tests/core.rs | 2 +- 3 files changed, 43 insertions(+), 25 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index db0f186b4..9e750495f 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -43,23 +43,26 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<()> { // TODO write_all transmuted [u8] - for num in self.encoder.encode(doc_ids) { + let docs_data = self.encoder.encode(doc_ids); + match self.postings_write.write_u32::(docs_data.len() as u32) { + Ok(_) => {} + Err(_) =>{ + let msg = String::from("Failed while writing posting list"); + return Err(Error::WriteError(msg)); + } + } + self.written_bytes_postings += 4; + for num in docs_data { match self.postings_write.write_u32::(num.clone() as u32) { - Ok(_) => {}, + Ok(_) => { + self.written_bytes_postings += 4; + }, Err(_) => { let msg = String::from("Failed while writing posting list"); return Err(Error::WriteError(msg)); }, } } - // match self.postings_write.write_u32::(doc_id as u32) { - // Ok(_) => {}, - // Err(_) => { - // let msg = String::from("Failed while writing posting list"); - // return Err(Error::WriteError(msg)); - // }, - // } - //self.written_bytes_postings += 4; Ok(()) } diff --git a/src/core/reader.rs b/src/core/reader.rs index d51cb1c42..db0c11f78 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -17,6 +17,7 @@ use core::directory::SegmentComponent; use fst::raw::MmapReadOnly; use core::error::{Result, Error}; use core::postings::Postings; +use core::simdcompression::Decoder; // TODO file structure should be in codec @@ -27,24 +28,37 @@ pub struct SegmentReader { } -pub struct SegmentPostings<'a> { - cursor: Cursor<&'a [u8]>, - num_docs_remaining: usize, +pub struct SegmentPostings { + doc_id: usize, + doc_ids: Vec, } -impl<'a> SegmentPostings<'a> { +impl SegmentPostings { pub fn from_data(data: &[u8]) -> SegmentPostings { let mut cursor = Cursor::new(data); let doc_freq = cursor.read_u32::().unwrap() as usize; + println!("doc_freq {}", doc_freq); + let data_size = cursor.read_u32::().unwrap() as usize; + // TODO remove allocs + let mut data = Vec::with_capacity(data_size); + for _ in 0..data_size { + data.push(cursor.read_u32::().unwrap()); + } + let mut doc_ids: Vec = (0..doc_freq as u32 ).collect(); + let decoder = Decoder::new(); + decoder.decode(&data, &mut doc_ids); + for a in doc_ids.iter() { + println!("uncompressed {}", a); + } SegmentPostings { - cursor: cursor, - num_docs_remaining: doc_freq, + doc_ids: doc_ids, + doc_id: 0, } } } -impl<'a> Postings for SegmentPostings<'a> { +impl Postings for SegmentPostings { fn skip_next(&mut self, target: DocId) -> Option { loop { match Iterator::next(self) { @@ -61,17 +75,18 @@ impl<'a> Postings for SegmentPostings<'a> { } -impl<'a> Iterator for SegmentPostings<'a> { +impl Iterator for SegmentPostings { type Item = DocId; fn next(&mut self,) -> Option { - if self.num_docs_remaining <= 0 { - None + if self.doc_id < self.doc_ids.len() { + let res = Some(self.doc_ids[self.doc_id]); + self.doc_id += 1; + return res; } else { - self.num_docs_remaining -= 1; - Some(self.cursor.read_u32::().unwrap() as DocId) + None } } } @@ -109,7 +124,7 @@ impl SegmentReader { SegmentPostings::from_data(&postings_data) } - pub fn get_term<'a>(&'a self, term: &Term) -> Option> { + pub fn get_term<'a>(&'a self, term: &Term) -> Option { println!("Term {:?}", term); match self.term_offsets.get(term.as_slice()) { Some(offset) => { @@ -120,7 +135,7 @@ impl SegmentReader { } } - pub fn search<'a>(&'a self, terms: &Vec) -> IntersectionPostings> { + pub fn search(&self, terms: &Vec) -> IntersectionPostings { let segment_postings: Vec = terms .iter() .map(|term| self.get_term(term).unwrap()) diff --git a/tests/core.rs b/tests/core.rs index 507f03249..3d3aa1d58 100644 --- a/tests/core.rs +++ b/tests/core.rs @@ -138,7 +138,7 @@ fn test_searcher() { } { let searcher = Searcher::for_directory(directory); - let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), ); + let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), ); let mut collector = TestCollector::new(); searcher.search(&terms, &mut collector); let vals: Vec = collector.docs().iter()