From acf7312af956e3849255287588569a15223280ee Mon Sep 17 00:00:00 2001 From: Paul Masurel Date: Sun, 24 Apr 2016 19:09:35 +0900 Subject: [PATCH] Added independent method for block compression, vint compression --- Cargo.toml | 1 - cpp/encode.cpp | 55 +++++++++- src/core/simdcompression.rs | 200 ++++++++++++++++++++++++++++++++---- src/lib.rs | 1 - 4 files changed, 232 insertions(+), 25 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 837e63965..5748dbd3e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -18,7 +18,6 @@ log = "0.3.5" combine = "1.2.0" tempdir = "0.3.4" bincode = "0.4.0" -serde = "0.6.11" libc = "0.2.6" argparse = "*" num_cpus = "0.2" diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 47da680cd..fa538b0f1 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -4,19 +4,72 @@ #include "codecfactory.h" #include "intersection.h" #include "variablebyte.h" +#include "util.h" using namespace SIMDCompressionLib; + // sorted static shared_ptr codec_sorted = CODECFactory::getFromName("s4-bp128-dm"); + // variable byte static VariableByte codec_unsorted = VariableByte(); -static SIMDBinaryPacking> codec_packed_sorted = SIMDBinaryPacking>(); +static SIMDBinaryPacking> simd_pack_sorted = SIMDBinaryPacking>(); + +static VariableByte vint_codec = VariableByte(); +// SIMDBinaryPacking extern "C" { + // encode 128 u32 at a time. 
+ size_t encode_sorted_block128_native( + uint32_t* begin, + uint32_t* output, + const size_t output_capacity) { + size_t output_length = output_capacity; + simd_pack_sorted.encodeArray(begin, + 128, + output, + output_length); + return output_length; + } + + size_t decode_sorted_block128_native( + const uint32_t* compressed_data, + const size_t compressed_size, + uint32_t* uncompressed, + const size_t uncompressed_capacity) { + size_t num_ints = uncompressed_capacity; + simd_pack_sorted.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return num_ints; + } + + size_t encode_sorted_vint_native( + uint32_t* begin, + const size_t num_els, + uint32_t* output, + const size_t output_capacity) { + size_t output_length = output_capacity; + vint_codec.encodeArray(begin, + num_els, + output, + output_length); + return output_length; + } + + size_t decode_sorted_vint_native( + const uint32_t* compressed_data, + const size_t compressed_size, + uint32_t* uncompressed, + const size_t uncompressed_capacity) { + size_t num_ints = uncompressed_capacity; + vint_codec.decodeArray(compressed_data, compressed_size, uncompressed, num_ints); + return num_ints; + } + + size_t encode_sorted_native( uint32_t* begin, const size_t num_els, diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index 4434c071c..9e816ca64 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -1,12 +1,25 @@ use libc::size_t; use std::ptr; +use std::iter; extern { // fn encode_unsorted_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; // fn decode_unsorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; + + // complete s4-bp128-dm fn encode_sorted_native(data: *mut u32, num_els: size_t, output: 
*mut u32, output_capacity: size_t) -> size_t; fn decode_sorted_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + + // bp128, only encodes group of 128 u32 at a time + fn encode_sorted_block128_native(data: *mut u32, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_block128_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + + // vints, used as the left over codec for the <128 remaining values + fn encode_sorted_vint_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; + fn decode_sorted_vint_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + } pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { @@ -18,6 +31,127 @@ pub fn intersection(left: &[u32], right: &[u32], output: &mut [u32]) -> usize { } } + + +//------------------------- +// Vint + + +pub struct VIntEncoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl VIntEncoder { + + pub fn new() -> VIntEncoder { + VIntEncoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert!(input.len() < 128); + let input_len = input.len(); + let written_size: usize; + // TODO use clone_from when available + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); + written_size = encode_sorted_vint_native( + self.input_buffer.as_mut_ptr(), + input_len as size_t, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + + + +pub struct VIntDecoder; + +impl VIntDecoder { + + pub fn new() -> VIntDecoder { + VIntDecoder + } + + pub fn decode_sorted(&self, + compressed_data: &[u32], + uncompressed_values: &mut 
[u32]) -> size_t { + unsafe { + return decode_sorted_vint_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + +//------------------------- +// Block128 + +pub struct Block128Encoder { + input_buffer: Vec, + output_buffer: Vec, +} + +impl Block128Encoder { + + pub fn new() -> Block128Encoder { + Block128Encoder { + input_buffer: Vec::with_capacity(128), + output_buffer: iter::repeat(0u32).take(256).collect(), + } + } + + pub fn encode_sorted(&mut self, input: &[u32]) -> &[u32] { + assert_eq!(input.len(), 128); + // TODO use clone_from when available + let written_size: usize; + unsafe { + ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), 128); + written_size = encode_sorted_native( + self.input_buffer.as_mut_ptr(), + 128, + self.output_buffer.as_mut_ptr(), + 256, + ); + } + return &self.output_buffer[0..written_size]; + } +} + +pub struct Block128Decoder; + +impl Block128Decoder { + + pub fn new() -> Block128Decoder { + Block128Decoder + } + + pub fn decode_sorted( + &self, + compressed_data: &[u32], + uncompressed_values: &mut [u32]) -> size_t { + unsafe { + return decode_sorted_native( + compressed_data.as_ptr(), + compressed_data.len() as size_t, + uncompressed_values.as_mut_ptr(), + uncompressed_values.len() as size_t); + } + } +} + +//------------------------- +// s4-bp128-dm + + pub struct Encoder { input_buffer: Vec, output_buffer: Vec, @@ -52,28 +186,6 @@ impl Encoder { return &self.output_buffer[0..written_size]; } } - - - // pub fn encode_unsorted(&mut self, input: &[u32]) -> &[u32] { - // self.input_buffer.clear(); - // let input_len = input.len(); - // if input_len + 10000 >= self.input_buffer.len() { - // let target_length = input_len + 1024; - // self.input_buffer.resize(target_length, 0); - // self.output_buffer.resize(target_length, 0); - // } - // // TODO use clone_from when available - // unsafe { - // 
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len); - // let written_size = encode_unsorted_native( - // self.input_buffer.as_mut_ptr(), - // input_len as size_t, - // self.output_buffer.as_mut_ptr(), - // self.output_buffer.len() as size_t, - // ); - // return &self.output_buffer[0..written_size]; - // } - // } } @@ -149,6 +261,7 @@ mod tests { use super::*; use test::Bencher; + use std::iter; use rand::Rng; use rand::SeedableRng; use rand::XorShiftRng; @@ -182,6 +295,49 @@ mod tests { assert_eq!(decoded_data, input); } + #[test] + fn test_encode_block() { + let mut encoder = Block128Encoder::new(); + let expected_length = 21; + let input: Vec = (0u32..128u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Block128Decoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(128, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } + + + + #[test] + fn test_encode_vint() { + { + let mut encoder = VIntEncoder::new(); + let expected_length = 31; + let input: Vec = (0u32..123u32) + .map(|i| i * 7 / 2) + .into_iter() + .collect(); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = VIntDecoder::new(); + let mut decoded_data: Vec = iter::repeat(0u32).take(128).collect(); + assert_eq!(123, decoder.decode_sorted(&encoded_data[..], &mut decoded_data)); + assert_eq!(&decoded_data[0..123], &input[..]); + } + { + let mut encoder = VIntEncoder::new(); + let input = vec!(3, 17u32, 187); + let encoded_data = encoder.encode_sorted(&input); + assert_eq!(encoded_data.len(), 1); + assert_eq!(encoded_data[0], 2167049859u32); + } + } + // #[test] // fn test_encode_unsorted() { // let mut encoder = Encoder::new(); diff --git a/src/lib.rs b/src/lib.rs index c22fbce4f..93735e573 
100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -19,7 +19,6 @@ extern crate atomicwrites; extern crate tempdir; extern crate bincode; extern crate time; -extern crate serde; extern crate libc; extern crate lz4; extern crate uuid;