From 056e4e6cf3dbc76bea4b1f03fc503351384dd47d Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 1 May 2016 14:58:46 +0900 Subject: [PATCH] Moved collector out. --- src/collector/count_collector.rs | 32 +++ src/collector/first_n_collector.rs | 40 +++ src/collector/mod.rs | 109 +++++++++ src/collector/multi_collector.rs | 33 +++ src/compression/block128.rs | 87 +++++++ src/compression/intersection.rs | 14 ++ src/compression/mod.rs | 376 +---------------------------- src/compression/s4bp128.rs | 122 ++++++++++ src/compression/vints.rs | 97 ++++++++ src/core/reader.rs | 2 +- 10 files changed, 545 insertions(+), 367 deletions(-) create mode 100644 src/collector/count_collector.rs create mode 100644 src/collector/first_n_collector.rs create mode 100644 src/collector/mod.rs create mode 100644 src/collector/multi_collector.rs create mode 100644 src/compression/block128.rs create mode 100644 src/compression/intersection.rs create mode 100644 src/compression/s4bp128.rs create mode 100644 src/compression/vints.rs diff --git a/src/collector/count_collector.rs b/src/collector/count_collector.rs new file mode 100644 index 000000000..74794550c --- /dev/null +++ b/src/collector/count_collector.rs @@ -0,0 +1,32 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; + +pub struct CountCollector { + count: usize, +} + +impl CountCollector { + pub fn new() -> CountCollector { + CountCollector { + count: 0, + } + } + + pub fn count(&self,) -> usize { + self.count + } +} + +impl Collector for CountCollector { + + fn set_segment(&mut self, _: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + Ok(()) + } + + fn collect(&mut self, _: DocId) { + self.count += 1; + } +} diff --git a/src/collector/first_n_collector.rs b/src/collector/first_n_collector.rs new file mode 100644 index 000000000..9e570247b --- /dev/null +++ b/src/collector/first_n_collector.rs @@ -0,0 +1,40 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; +use core::searcher::DocAddress; + +pub struct FirstNCollector { + docs: Vec, + current_segment: u32, + limit: usize, +} + +impl FirstNCollector { + pub fn with_limit(limit: usize) -> FirstNCollector { + FirstNCollector { + docs: Vec::new(), + limit: limit, + current_segment: 0, + } + } + + pub fn docs(self,) -> Vec { + self.docs + } +} + +impl Collector for FirstNCollector { + + fn set_segment(&mut self, segment_local_id: SegmentLocalId, _: &SegmentReader) -> io::Result<()> { + self.current_segment = segment_local_id; + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + if self.docs.len() < self.limit { + self.docs.push(DocAddress(self.current_segment.clone(), doc_id)); + } + } +} diff --git a/src/collector/mod.rs b/src/collector/mod.rs new file mode 100644 index 000000000..048d7db32 --- /dev/null +++ b/src/collector/mod.rs @@ -0,0 +1,109 @@ +use DocId; +use SegmentReader; +use SegmentLocalId; +use fastfield::U32FastFieldReader; +use schema::U32Field; +use std::io; + + +mod count_collector; +pub use self::count_collector::CountCollector; + +mod first_n_collector; +pub use self::first_n_collector::FirstNCollector; + +mod multi_collector; +pub use self::multi_collector::MultiCollector; + +pub trait Collector { + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()>; + fn collect(&mut self, doc_id: DocId); +} + +pub struct TestCollector { + offset: DocId, + segment_max_doc: DocId, + docs: Vec, +} + +impl TestCollector { + pub fn new() -> TestCollector { + TestCollector { + docs: Vec::new(), + offset: 0, + segment_max_doc: 0, + } + } + + pub fn docs(self,) -> Vec { + self.docs + } +} + +impl Collector for TestCollector { + + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + self.offset += self.segment_max_doc; + self.segment_max_doc = reader.max_doc(); + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + self.docs.push(doc_id + self.offset); + } +} + + +pub struct FastFieldTestCollector { + vals: Vec, + u32_field: U32Field, + ff_reader: Option, +} + +impl FastFieldTestCollector { + pub fn for_field(u32_field: U32Field) -> FastFieldTestCollector { + FastFieldTestCollector { + vals: Vec::new(), + u32_field: u32_field, + ff_reader: None, + } + } + + pub fn vals(&self,) -> &Vec { + &self.vals + } +} + +impl Collector for FastFieldTestCollector { + + fn set_segment(&mut self, _: SegmentLocalId, reader: &SegmentReader) -> io::Result<()> { + self.ff_reader = Some(try!(reader.get_fast_field_reader(&self.u32_field))); + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + let val = self.ff_reader.as_ref().unwrap().get(doc_id); + self.vals.push(val); + } +} + + + +#[cfg(test)] +mod tests { + + use super::*; + use test::Bencher; + + #[bench] + fn build_collector(b: &mut Bencher) { + b.iter(|| { + let mut count_collector = CountCollector::new(); + let docs: Vec = (0..1_000_000).collect(); + for doc in docs { + count_collector.collect(doc); + } + count_collector.count() + }); + } +} diff --git a/src/collector/multi_collector.rs b/src/collector/multi_collector.rs new file mode 100644 index 000000000..8fafb458a --- /dev/null +++ b/src/collector/multi_collector.rs @@ -0,0 +1,33 @@ +use std::io; +use super::Collector; +use DocId; +use SegmentReader; +use SegmentLocalId; + +pub struct MultiCollector<'a> { + collectors: Vec<&'a mut Collector>, +} + +impl<'a> MultiCollector<'a> { + pub fn from(collectors: Vec<&'a mut Collector>) -> MultiCollector { + MultiCollector { + collectors: collectors, + } + } +} + +impl<'a> Collector for MultiCollector<'a> { + + fn set_segment(&mut self, segment_local_id: SegmentLocalId, segment: &SegmentReader) -> io::Result<()> { + for collector in self.collectors.iter_mut() { + try!(collector.set_segment(segment_local_id, segment)); + } + Ok(()) + } + + fn collect(&mut self, doc_id: DocId) { + for collector in self.collectors.iter_mut() { + collector.collect(doc_id); + } + } +} diff --git a/src/compression/block128.rs b/src/compression/block128.rs new file mode 100644 index 000000000..f48bd5011 --- /dev/null +++ b/src/compression/block128.rs @@ -0,0 +1,87 @@ +use libc::size_t; +use std::ptr; +use std::iter; + +extern { + fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +//------------------------- +// Block128 + +pub struct Block128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl Block128Encoder { + + pub fn new() -> Block128Encoder { + Block128Encoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert_eq!(input.len(), 128); + // TODO use clone_from when available + let written_size: usize; + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); + written_size = encode_sorted_block128_native( + self.input_buffer.as_mut_ptr(), + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + +pub struct Block128Decoder; + +impl Block128Decoder { + + pub fn new() -> Block128Decoder { + Block128Decoder + } + + pub fn decode_sorted( + &self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_block128_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + + +#[cfg(test)] +mod tests { + + use super::*; + use std::iter; + + #[test] + fn test_encode_block() { + let mut encoder = Block128Encoder::new(); + let expected_length = 21; + let input: Vec = (0u32..128u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Block128Decoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } +} diff --git a/src/compression/intersection.rs b/src/compression/intersection.rs new file mode 100644 index 000000000..ddc24df42 --- /dev/null +++ b/src/compression/intersection.rs @@ -0,0 +1,14 @@ +use libc::size_t; + +extern { + fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; +} + +pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { + unsafe { + intersection_native( + left.as_ptr(), left.len(), + right.as_ptr(), right.len(), + output.as_mut_ptr()) + } +} diff --git a/src/compression/mod.rs b/src/compression/mod.rs index ca97f6b5b..e4ed74ab7 100644 --- a/src/compression/mod.rs +++ b/src/compression/mod.rs @@ -1,267 +1,20 @@ -use libc::size_t; -use std::ptr; -use std::iter; +mod intersection; +pub use self::intersection::intersection; -extern { - // fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - // fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +mod s4bp128; +pub use self::s4bp128::{S4BP128Encoder, S4BP128Decoder}; - fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; +mod block128; +pub use self::block128::{Block128Encoder, Block128Decoder}; - // complete s4-bp128-dm - fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +mod vints; +pub use self::vints::{SortedVIntsEncoder, SortedVIntsDecoder}; - // bp128, only encodes group of 128 u32 at a time - fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - - // vints, used as the left over codec for the <128 remaining values - fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; - fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; - -} - -pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { - unsafe { - intersection_native( - left.as_ptr(), left.len(), - right.as_ptr(), right.len(), - output.as_mut_ptr()) - } -} - - - -//------------------------- -// Vint - - -pub struct VIntEncoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl VIntEncoder { - - pub fn new() -> VIntEncoder { - VIntEncoder { - input_buffer: Vec::with_capacity(128), - output_buffer: iter::repeat(0u32).take(256).collect(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - assert!(input.len() < 128); - let input_len = input.len(); - let written_size: usize; - // TODO use clone_from when available - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - written_size = encode_sorted_vint_native( - self.input_buffer.as_mut_ptr(), - input_len as size_t, - self.output_buffer.as_mut_ptr(), - 256, - ); - } - return &self.output_buffer[0..written_size]; - } -} - - - -pub struct VIntDecoder; - -impl VIntDecoder { - - pub fn new() -> VIntDecoder { - VIntDecoder - } - - pub fn decode_sorted(&self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_sorted_vint_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } -} - -//------------------------- -// Block128 - -pub struct Block128Encoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl Block128Encoder { - - pub fn new() -> Block128Encoder { - Block128Encoder { - input_buffer: Vec::with_capacity(128), - output_buffer: iter::repeat(0u32).take(256).collect(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - assert_eq!(input.len(), 128); - // TODO use clone_from when available - let written_size: usize; - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); - written_size = encode_s4_bp128_dm_native( - self.input_buffer.as_mut_ptr(), - 128, - self.output_buffer.as_mut_ptr(), - 256, - ); - } - return &self.output_buffer[0..written_size]; - } -} - -pub struct Block128Decoder; - -impl Block128Decoder { - - pub fn new() -> Block128Decoder { - Block128Decoder - } - - pub fn decode_sorted( - &self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_s4_bp128_dm_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } -} - -//------------------------- -// s4-bp128-dm - - -pub struct S4BP128Encoder { - input_buffer: Vec, - output_buffer: Vec, -} - -impl S4BP128Encoder { - - pub fn new() -> S4BP128Encoder { - S4BP128Encoder { - input_buffer: Vec::new(), - output_buffer: Vec::new(), - } - } - - pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { - self.input_buffer.clear(); - let input_len = input.len(); - if input_len + 10000 >= self.input_buffer.len() { - let target_length = input_len + 1024; - self.input_buffer.resize(target_length, 0); - self.output_buffer.resize(target_length, 0); - } - // TODO use clone_from when available - unsafe { - ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - let written_size = encode_s4_bp128_dm_native( - self.input_buffer.as_mut_ptr(), - input_len as size_t, - self.output_buffer.as_mut_ptr(), - self.output_buffer.len() as size_t, - ); - return &self.output_buffer[0..written_size]; - } - } -} - - - -pub struct S4BP128Decoder; - -impl S4BP128Decoder { - - pub fn new() -> S4BP128Decoder { - S4BP128Decoder - } - - pub fn decode_sorted(&self, - compressed_data: &[u32], - uncompressed_values: &mut [u32]) -> size_t { - unsafe { - return decode_s4_bp128_dm_native( - compressed_data.as_ptr(), - compressed_data.len() as size_t, - uncompressed_values.as_mut_ptr(), - uncompressed_values.len() as size_t); - } - } - - // pub fn decode_unsorted(&self, - // compressed_data: &[u32], - // uncompressed_values: &mut [u32]) -> size_t { - // unsafe { - // return decode_unsorted_native( - // compressed_data.as_ptr(), - // compressed_data.len() as size_t, - // uncompressed_values.as_mut_ptr(), - // uncompressed_values.len() as size_t); - // } - // } -} - - - - -// -// pub struct Intersector { -// output_buffer: Vec, -// } -// -// impl Intersector { -// fn new() -> Intersector { -// Intersector::with_capacity(1_000_000) -// } -// fn with_capacity(capacity: usize) -> Intersector { -// Intersector { -// output_buffer: iter::repeat(0u32).take(capacity).collect() -// } -// } -// fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] { -// let max_intersection_length = min(left.len(), right.len()); -// if self.output_buffer.len() < max_intersection_length { -// self.output_buffer.resize(max_intersection_length, 0); -// } -// unsafe { -// let intersection_len = intersection_native( -// left.as_ptr(), left.len() as size_t, -// right.as_ptr(), right.len() as size_t, -// self.output_buffer.as_mut_ptr()); -// return &self.output_buffer[0..intersection_len]; -// } -// } -// } #[cfg(test)] -mod tests { +pub mod tests { - use super::*; - use test::Bencher; - use std::iter; use rand::Rng; use rand::SeedableRng; use rand::XorShiftRng; @@ -275,116 +28,7 @@ mod tests { .collect() } - fn generate_array(n: usize, ratio: f32) -> Vec { + pub fn generate_array(n: usize, ratio: f32) -> Vec { generate_array_with_seed(n, ratio, 4) } - - #[test] - fn test_encode_big() { - let mut encoder = S4BP128Encoder::new(); - let num_ints = 10000 as usize; - let expected_length = 1274; - let input: Vec = (0..num_ints as u32) - .map(|i| i * 7 / 2) - .into_iter().collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = S4BP128Decoder::new(); - let mut decoded_data: Vec = (0..num_ints as u32).collect(); - assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); - } - - #[test] - fn test_encode_block() { - let mut encoder = Block128Encoder::new(); - let expected_length = 21; - let input: Vec = (0u32..128u32) - .map(|i| i * 7 / 2) - .into_iter() - .collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = Block128Decoder::new(); - let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); - assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); - } - - - - #[test] - fn test_encode_vint() { - { - let mut encoder = VIntEncoder::new(); - let expected_length = 31; - let input: Vec = (0u32..123u32) - .map(|i| i * 7 / 2) - .into_iter() - .collect(); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = VIntDecoder::new(); - let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); - assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); - assert_eq!(&decoded_data[0..123], &input[..]); - } - { - let mut encoder = VIntEncoder::new(); - let input = vec!(3, 17u32, 187); - let encoded_data = encoder.encode_sorted(&input); - assert_eq!(encoded_data.len(), 1); - assert_eq!(encoded_data[0], 2167049859u32); - } - } - - // #[test] - // fn test_encode_unsorted() { - // let mut encoder = Encoder::new(); - // let num_ints = 10_000 as usize; - // let expected_length = 4361; - // let input: Vec = (0..num_ints as u32) - // .map(|i| i * 213_127 % 501) - // .into_iter().collect(); - // assert_eq!(input.len(), 10_000); - // let encoded_data = encoder.encode_unsorted(&input); - // assert_eq!(encoded_data.len(), expected_length); - // let decoder = Decoder::new(); - // let mut decoded_data: Vec = (0..num_ints as u32).collect(); - // assert_eq!(num_ints, decoder.decode_unsorted(&encoded_data[..], &mut decoded_data)); - // assert_eq!(decoded_data, input); - // } - // - // #[test] - // fn test_simd_intersection() { - // let mut intersector = Intersector::new(); - // let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); - // let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); - // let intersection = intersector.intersection(&arr1[..], &arr2[..]) ; - // assert_eq!(intersection.len(), 500_233); - // } - - #[bench] - fn bench_decode(b: &mut Bencher) { - const TEST_SIZE: usize = 1_000_000; - let arr = generate_array(TEST_SIZE, 0.1); - let mut encoder = S4BP128Encoder::new(); - let encoded = encoder.encode_sorted(&arr); - let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); - let decoder = S4BP128Decoder; - b.iter(|| { - decoder.decode_sorted(&encoded, &mut uncompressed); - }); - } - - - // #[bench] - // fn bench_simd_intersection(b: &mut Bencher) { - // let mut intersector = Intersector::new(); - // let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); - // let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); - // b.iter(|| { - // intersector.intersection(&arr1[..], &arr2[..]).len() - // }); - // } } diff --git a/src/compression/s4bp128.rs b/src/compression/s4bp128.rs new file mode 100644 index 000000000..2069e53bb --- /dev/null +++ b/src/compression/s4bp128.rs @@ -0,0 +1,122 @@ + +use libc::size_t; +use std::ptr; + +extern { + // complete s4-bp128-dm + fn encode_s4_bp128_dm_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_s4_bp128_dm_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +//------------------------- +// s4-bp128-dm + + +pub struct S4BP128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl S4BP128Encoder { + + pub fn new() -> S4BP128Encoder { + S4BP128Encoder { + input_buffer: Vec::new(), + output_buffer: Vec::new(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + self.input_buffer.clear(); + let input_len = input.len(); + if input_len + 10000 >= self.input_buffer.len() { + let target_length = input_len + 1024; + self.input_buffer.resize(target_length, 0); + self.output_buffer.resize(target_length, 0); + } + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + let written_size = encode_s4_bp128_dm_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + self.output_buffer.len() as size_t, + ); + return &self.output_buffer[0..written_size]; + } + } +} + + +pub struct S4BP128Decoder; + +impl S4BP128Decoder { + + pub fn new() -> S4BP128Decoder { + S4BP128Decoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_s4_bp128_dm_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } + + // pub fn decode_unsorted(&self, + // compressed_data: &[u32], + // uncompressed_values: &mut [u32]) -> size_t { + // unsafe { + // return decode_unsorted_native( + // compressed_data.as_ptr(), + // compressed_data.len() as size_t, + // uncompressed_values.as_mut_ptr(), + // uncompressed_values.len() as size_t); + // } + // } +} + + + +#[cfg(test)] +mod tests { + + use super::*; + use test::Bencher; + use compression::tests::generate_array; + + #[test] + fn test_encode_big() { + let mut encoder = S4BP128Encoder::new(); + let num_ints = 10000 as usize; + let expected_length = 1274; + let input: Vec = (0..num_ints as u32) + .map(|i| i * 7 / 2) + .into_iter().collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = S4BP128Decoder::new(); + let mut decoded_data: Vec = (0..num_ints as u32).collect(); + assert_eq!(num_ints, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } + + #[bench] + fn bench_decode(b: &mut Bencher) { + const TEST_SIZE: usize = 1_000_000; + let arr = generate_array(TEST_SIZE, 0.1); + let mut encoder = S4BP128Encoder::new(); + let encoded = encoder.encode_sorted(&arr); + let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); + let decoder = S4BP128Decoder; + b.iter(|| { + decoder.decode_sorted(&encoded, &mut uncompressed); + }); + } +} diff --git a/src/compression/vints.rs b/src/compression/vints.rs new file mode 100644 index 000000000..ae0cdbf8c --- /dev/null +++ b/src/compression/vints.rs @@ -0,0 +1,97 @@ +use libc::size_t; +use std::ptr; +use std::iter; + +extern { + fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; +} + +pub struct SortedVIntsEncoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl SortedVIntsEncoder { + + pub fn new() -> SortedVIntsEncoder { + SortedVIntsEncoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert!(input.len() < 128); + let input_len = input.len(); + let written_size: usize; + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + written_size = encode_sorted_vint_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + + + +pub struct SortedVIntsDecoder; + +impl SortedVIntsDecoder { + + pub fn new() -> SortedVIntsDecoder { + SortedVIntsDecoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_vint_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + + +#[cfg(test)] +mod tests { + + use std::iter; + use super::*; + + #[test] + fn test_encode_vint() { + { + let mut encoder = SortedVIntsEncoder::new(); + let expected_length = 31; + let input: Vec = (0u32..123u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = SortedVIntsDecoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(&decoded_data[0..123], &input[..]); + } + { + let mut encoder = SortedVIntsEncoder::new(); + let input = vec!(3, 17u32, 187); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), 1); + assert_eq!(encoded_data[0], 2167049859u32); + } + } + +} diff --git a/src/core/reader.rs b/src/core/reader.rs index c271efa04..77d8f0094 100644 --- a/src/core/reader.rs +++ b/src/core/reader.rs @@ -75,7 +75,7 @@ impl SegmentPostings { let mut doc_ids: Vec = Vec::with_capacity(doc_freq as usize); unsafe { doc_ids.set_len(doc_freq as usize); } { - let decoder = compression::S4BP128Decoder::new(); + let decoder = S4BP128Decoder::new(); decoder.decode_sorted(&data_u32[1..(num_u32s+1) as usize], &mut doc_ids); SegmentPostings(doc_ids) }