Reader working with compressed data.

This commit is contained in:
Paul Masurel
2016-02-18 20:31:19 +09:00
parent c0c0a2c579
commit b78e5320c3
3 changed files with 43 additions and 25 deletions

View File

@@ -43,23 +43,26 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer {
fn write_docs(&mut self, doc_ids: &[DocId]) -> Result<()> {
// TODO write_all transmuted [u8]
for num in self.encoder.encode(doc_ids) {
let docs_data = self.encoder.encode(doc_ids);
match self.postings_write.write_u32::<BigEndian>(docs_data.len() as u32) {
Ok(_) => {}
Err(_) =>{
let msg = String::from("Failed while writing posting list");
return Err(Error::WriteError(msg));
}
}
self.written_bytes_postings += 4;
for num in docs_data {
match self.postings_write.write_u32::<BigEndian>(num.clone() as u32) {
Ok(_) => {},
Ok(_) => {
self.written_bytes_postings += 4;
},
Err(_) => {
let msg = String::from("Failed while writing posting list");
return Err(Error::WriteError(msg));
},
}
}
// match self.postings_write.write_u32::<BigEndian>(doc_id as u32) {
// Ok(_) => {},
// Err(_) => {
// let msg = String::from("Failed while writing posting list");
// return Err(Error::WriteError(msg));
// },
// }
//self.written_bytes_postings += 4;
Ok(())
}

View File

@@ -17,6 +17,7 @@ use core::directory::SegmentComponent;
use fst::raw::MmapReadOnly;
use core::error::{Result, Error};
use core::postings::Postings;
use core::simdcompression::Decoder;
// TODO file structure should be in codec
@@ -27,24 +28,37 @@ pub struct SegmentReader {
}
pub struct SegmentPostings<'a> {
cursor: Cursor<&'a [u8]>,
num_docs_remaining: usize,
pub struct SegmentPostings {
doc_id: usize,
doc_ids: Vec<u32>,
}
impl<'a> SegmentPostings<'a> {
impl SegmentPostings {
pub fn from_data(data: &[u8]) -> SegmentPostings {
let mut cursor = Cursor::new(data);
let doc_freq = cursor.read_u32::<BigEndian>().unwrap() as usize;
println!("doc_freq {}", doc_freq);
let data_size = cursor.read_u32::<BigEndian>().unwrap() as usize;
// TODO remove allocs
let mut data = Vec::with_capacity(data_size);
for _ in 0..data_size {
data.push(cursor.read_u32::<BigEndian>().unwrap());
}
let mut doc_ids: Vec<u32> = (0..doc_freq as u32 ).collect();
let decoder = Decoder::new();
decoder.decode(&data, &mut doc_ids);
for a in doc_ids.iter() {
println!("uncompressed {}", a);
}
SegmentPostings {
cursor: cursor,
num_docs_remaining: doc_freq,
doc_ids: doc_ids,
doc_id: 0,
}
}
}
impl<'a> Postings for SegmentPostings<'a> {
impl Postings for SegmentPostings {
fn skip_next(&mut self, target: DocId) -> Option<DocId> {
loop {
match Iterator::next(self) {
@@ -61,17 +75,18 @@ impl<'a> Postings for SegmentPostings<'a> {
}
impl<'a> Iterator for SegmentPostings<'a> {
impl Iterator for SegmentPostings {
type Item = DocId;
fn next(&mut self,) -> Option<DocId> {
if self.num_docs_remaining <= 0 {
None
if self.doc_id < self.doc_ids.len() {
let res = Some(self.doc_ids[self.doc_id]);
self.doc_id += 1;
return res;
}
else {
self.num_docs_remaining -= 1;
Some(self.cursor.read_u32::<BigEndian>().unwrap() as DocId)
None
}
}
}
@@ -109,7 +124,7 @@ impl SegmentReader {
SegmentPostings::from_data(&postings_data)
}
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings<'a>> {
pub fn get_term<'a>(&'a self, term: &Term) -> Option<SegmentPostings> {
println!("Term {:?}", term);
match self.term_offsets.get(term.as_slice()) {
Some(offset) => {
@@ -120,7 +135,7 @@ impl SegmentReader {
}
}
pub fn search<'a>(&'a self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings<'a>> {
pub fn search(&self, terms: &Vec<Term>) -> IntersectionPostings<SegmentPostings> {
let segment_postings: Vec<SegmentPostings> = terms
.iter()
.map(|term| self.get_term(term).unwrap())

View File

@@ -138,7 +138,7 @@ fn test_searcher() {
}
{
let searcher = Searcher::for_directory(directory);
let terms = vec!(Term::from_field_text(&text_field, "a"), Term::from_field_text(&text_field, "b"), );
let terms = vec!(Term::from_field_text(&text_field, "b"), Term::from_field_text(&text_field, "a"), );
let mut collector = TestCollector::new();
searcher.search(&terms, &mut collector);
let vals: Vec<DocId> = collector.docs().iter()