From fbceebcce368d88c89eeb5d108bbf31665c0b03b Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Mon, 7 Mar 2016 10:14:41 +0900 Subject: [PATCH] blop --- src/core/codec.rs | 9 ++-- src/core/directory.rs | 2 +- src/core/index.rs | 6 --- src/core/mod.rs | 12 ++++- src/core/postings.rs | 99 +++++++++++++++++----------------- src/core/reader.rs | 11 +--- src/core/simdcompression.rs | 102 ++++++++++++++++++------------------ src/core/store.rs | 10 ++-- src/lib.rs | 4 ++ 9 files changed, 127 insertions(+), 128 deletions(-) diff --git a/src/core/codec.rs b/src/core/codec.rs index 4c878afac..51c66e727 100644 --- a/src/core/codec.rs +++ b/src/core/codec.rs @@ -14,7 +14,7 @@ use core::store::StoreWriter; use core::serialize::BinarySerializable; use core::simdcompression; use core::schema::FieldValue; - +use core::convert_to_ioerror; #[derive(Debug)] pub struct TermInfo { @@ -22,6 +22,7 @@ pub struct TermInfo { pub postings_offset: u32, } + impl BinarySerializable for TermInfo { const SIZE: Size = Size::Constant(8); @@ -89,9 +90,9 @@ impl SegmentSerializer<()> for SimpleSegmentSerializer { fn write_segment_info(&self, segment_info: &SegmentInfo) -> io::Result<()> { let mut write = try!(self.segment.open_write(SegmentComponent::INFO)); - let json_data = json::encode(segment_info).unwrap(); - write.write_all(json_data.as_bytes()); - write.flush(); + let json_data = try!(json::encode(segment_info).map_err(convert_to_ioerror)); + try!(write.write_all(json_data.as_bytes())); + try!(write.flush()); Ok(()) } diff --git a/src/core/directory.rs b/src/core/directory.rs index 5c9ddf570..a8538f39d 100644 --- a/src/core/directory.rs +++ b/src/core/directory.rs @@ -215,7 +215,7 @@ impl Directory for RAMDirectory { } fn open_write(&mut self, path: &Path) -> io::Result { let full_path = PathBuf::from(&path); - let mut data = SharedVec::new(); + let data = SharedVec::new(); self.fs.insert(full_path, data.clone()); Ok(Box::new(data)) } diff --git a/src/core/index.rs b/src/core/index.rs index 1ac118649..271d61c0e 100644 --- a/src/core/index.rs +++ b/src/core/index.rs @@ -32,12 +32,6 @@ pub struct IndexMeta { } impl IndexMeta { - fn new() -> IndexMeta { - IndexMeta { - segments: Vec::new(), - schema: Schema::new(), - } - } fn with_schema(schema: Schema) -> IndexMeta { IndexMeta { segments: Vec::new(), diff --git a/src/core/mod.rs b/src/core/mod.rs index e33809c7a..77d2c9dfe 100644 --- a/src/core/mod.rs +++ b/src/core/mod.rs @@ -8,9 +8,19 @@ pub mod reader; pub mod codec; pub mod searcher; pub mod collector; -pub mod skip; pub mod serialize; pub mod store; pub mod simdcompression; pub mod fstmap; pub mod index; + + +use std::error; +use std::io; + +pub fn convert_to_ioerror(err: E) -> io::Error { + io::Error::new( + io::ErrorKind::InvalidData, + err + ) +} diff --git a/src/core/postings.rs b/src/core/postings.rs index fa3adca34..ca8716630 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -17,56 +17,6 @@ pub trait Postings: Iterator { // impl> Postings for T {} -#[derive(Debug)] -pub struct VecPostings { - doc_ids: Vec, - cursor: usize, -} - -impl VecPostings { - pub fn new(vals: Vec) -> VecPostings { - VecPostings { - doc_ids: vals, - cursor: 0, - } - } -} - -impl Postings for VecPostings { - // after skipping position - // the iterator in such a way that the - // next call to next() will return a - // value greater or equal to target. - fn skip_next(&mut self, target: DocId) -> Option { - loop { - match Iterator::next(self) { - Some(val) if val >= target => { - return Some(val); - }, - None => { - return None; - }, - _ => {} - } - } - } -} - -impl Iterator for VecPostings { - type Item = DocId; - fn next(&mut self,) -> Option { - if self.cursor >= self.doc_ids.len() { - None - } - else { - self.cursor += 1; - Some(self.doc_ids[self.cursor - 1]) - } - } -} - - - pub struct IntersectionPostings { @@ -127,6 +77,55 @@ mod tests { use test::Bencher; use core::schema::DocId; + + #[derive(Debug)] + pub struct VecPostings { + doc_ids: Vec, + cursor: usize, + } + + impl VecPostings { + pub fn new(vals: Vec) -> VecPostings { + VecPostings { + doc_ids: vals, + cursor: 0, + } + } + } + + impl Postings for VecPostings { + // after skipping position + // the iterator in such a way that the + // next call to next() will return a + // value greater or equal to target. + fn skip_next(&mut self, target: DocId) -> Option { + loop { + match Iterator::next(self) { + Some(val) if val >= target => { + return Some(val); + }, + None => { + return None; + }, + _ => {} + } + } + } + } + + impl Iterator for VecPostings { + type Item = DocId; + fn next(&mut self,) -> Option { + if self.cursor >= self.doc_ids.len() { + None + } + else { + self.cursor += 1; + Some(self.doc_ids[self.cursor - 1]) + } + } + } + #[test] fn test_intersection() { { diff --git a/src/core/reader.rs b/src/core/reader.rs index 4fcce0024..7aea65936 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -14,12 +14,11 @@ use std::io; use std::str; use core::codec::TermInfo; use core::fstmap::FstMap; -use std::error; use rustc_serialize::json; use core::serial::SegmentSerializer; use core::serial::SerializableSegment; -use std::str::Utf8Error; use core::index::SegmentInfo; +use core::convert_to_ioerror; // TODO file structure should be in codec @@ -99,14 +98,6 @@ impl Iterator for SegmentPostings { } -fn convert_to_ioerror(err: E) -> io::Error { - io::Error::new( - io::ErrorKind::InvalidData, - err - ) -} - - impl SegmentReader { diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index f41a37c12..4606c1eba 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -4,12 +4,12 @@ use std::ptr; // use std::iter; extern { - fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - + // fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + // fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + // fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; fn encode_sorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; + } pub struct Encoder { @@ -48,26 +48,26 @@ impl Encoder { } - pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] { - self.input_buffer.clear(); - let input_len = input.len(); - if input_len + 10000 >= self.input_buffer.len() { - let target_length = input_len + 1024; - self.input_buffer.resize(target_length, 0); - self.output_buffer.resize(target_length, 0); - } - // TODO use clone_from when available - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - let written_size = encode_unsorted_native( - self.input_buffer.as_mut_ptr(), - input_len as size_t, - self.output_buffer.as_mut_ptr(), - self.output_buffer.len() as size_t, - ); - return &self.output_buffer[0..written_size]; - } - } + // pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] { + // self.input_buffer.clear(); + // let input_len = input.len(); + // if input_len + 10000 >= self.input_buffer.len() { + // let target_length = input_len + 1024; + // self.input_buffer.resize(target_length, 0); + // self.output_buffer.resize(target_length, 0); + // } + // // TODO use clone_from when available + // unsafe { + // ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + // let written_size = encode_unsorted_native( + // self.input_buffer.as_mut_ptr(), + // input_len as size_t, + // self.output_buffer.as_mut_ptr(), + // self.output_buffer.len() as size_t, + // ); + // return &self.output_buffer[0..written_size]; + // } + // } } @@ -92,17 +92,17 @@ impl Decoder { } } - pub fn decode_unsorted(&self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_unsorted_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } + // pub fn decode_unsorted(&self, + // compressed_data: &[u32], + // uncompressed_values: &mut [u32]) -> size_t { + // unsafe { + // return decode_unsorted_native( + // compressed_data.as_ptr(), + // compressed_data.len() as size_t, + // uncompressed_values.as_mut_ptr(), + // uncompressed_values.len() as size_t); + // } + // } } @@ -176,22 +176,22 @@ mod tests { assert_eq!(decoded_data, input); } - #[test] - fn test_encode_unsorted() { - let mut encoder = Encoder::new(); - let num_ints = 10_000 as usize; - let expected_length = 4361; - let input: Vec = (0..num_ints as u32) - .map(|i| i * 213_127 % 501) - .into_iter().collect(); - assert_eq!(input.len(), 10_000); - let encoded_data = encoder.encode_unsorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = Decoder::new(); - let mut decoded_data: Vec = (0..num_ints as u32).collect(); - assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); - } + // #[test] + // fn test_encode_unsorted() { + // let mut encoder = Encoder::new(); + // let num_ints = 10_000 as usize; + // let expected_length = 4361; + // let input: Vec = (0..num_ints as u32) + // .map(|i| i * 213_127 % 501) + // .into_iter().collect(); + // assert_eq!(input.len(), 10_000); + // let encoded_data = encoder.encode_unsorted(&input); + // assert_eq!(encoded_data.len(), expected_length); + // let decoder = Decoder::new(); + // let mut decoded_data: Vec = (0..num_ints as u32).collect(); + // assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data)); + // assert_eq!(decoded_data, input); + // } // // #[test] // fn test_simd_intersection() { diff --git a/src/core/store.rs b/src/core/store.rs index 4432a5f50..d82e2e06b 100644 --- a/src/core/store.rs +++ b/src/core/store.rs @@ -139,7 +139,7 @@ impl StoreReader { return offset; } - fn read_block(&self, block_offset: usize) { + fn read_block(&self, block_offset: usize) -> io::Result<()> { let mut current_block_mut = self.current_block.borrow_mut(); current_block_mut.clear(); let total_buffer = self.data.as_slice(); @@ -147,21 +147,21 @@ impl StoreReader { let block_length = u32::deserialize(&mut cursor).unwrap(); let block_array: &[u8] = &total_buffer[(block_offset + 4 as usize)..(block_offset + 4 + block_length as usize)]; let mut lz4_decoder = lz4::Decoder::new(Cursor::new(block_array)).unwrap(); - lz4_decoder.read_to_end(&mut current_block_mut); + lz4_decoder.read_to_end(&mut current_block_mut).map(|_| ()) } pub fn get(&self, doc_id: &DocId) -> io::Result { let OffsetIndex(first_doc_id, block_offset) = self.block_offset(doc_id); - self.read_block(block_offset as usize); + try!(self.read_block(block_offset as usize)); let mut current_block_mut = self.current_block.borrow_mut(); let mut cursor = Cursor::new(&mut current_block_mut[..]); for _ in first_doc_id..*doc_id { let block_length = try!(u32::deserialize(&mut cursor)); - cursor.seek(SeekFrom::Current(block_length as i64)); + try!(cursor.seek(SeekFrom::Current(block_length as i64))); } try!(u32::deserialize(&mut cursor)); let mut field_values = Vec::new(); - let num_fields = u32::deserialize(&mut cursor).unwrap(); + let num_fields = try!(u32::deserialize(&mut cursor)); for _ in 0..num_fields { let field_value = try!(FieldValue::deserialize(&mut cursor)); field_values.push(field_value); diff --git a/src/lib.rs b/src/lib.rs index 4f38e5981..2a86201c1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -35,6 +35,10 @@ pub use core::schema::Document; pub use core::collector; pub use core::reader::SegmentReader; + + + + #[cfg(test)] mod tests {