diff --git a/build.rs b/build.rs index 573767810..1ebaaa1cf 100644 --- a/build.rs +++ b/build.rs @@ -12,7 +12,7 @@ fn main() { .object("cpp/SIMDCompressionAndIntersection/simdbitpacking.o") .object("cpp/SIMDCompressionAndIntersection/usimdbitpacking.o") .object("cpp/SIMDCompressionAndIntersection/simdintegratedbitpacking.o") - // .object("cpp/SIMDCompressionAndIntersection/intersection.o") + .object("cpp/SIMDCompressionAndIntersection/intersection.o") .object("cpp/SIMDCompressionAndIntersection/varintdecode.o") .object("cpp/SIMDCompressionAndIntersection/streamvbyte.o") .object("cpp/SIMDCompressionAndIntersection/simdpackedsearch.o") diff --git a/cpp/encode.cpp b/cpp/encode.cpp index 9bf0ad7e8..c2a4201b7 100644 --- a/cpp/encode.cpp +++ b/cpp/encode.cpp @@ -8,8 +8,10 @@ using namespace SIMDCompressionLib; static shared_ptr codec = CODECFactory::getFromName("s4-bp128-dm"); - extern "C" { + + + size_t encode_native( uint32_t* begin, const size_t num_els, @@ -32,4 +34,14 @@ extern "C" { codec -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints); return num_ints; } + + size_t intersection_native( + const uint32_t* left, + const size_t left_size, + const uint32_t* right, + const size_t right_size, + uint32_t* output) { + return IntersectionFactory::getFromName("simd")(left, left_size, right, right_size, output); + } + } diff --git a/src/core/fstmap.rs b/src/core/fstmap.rs index a571fa079..cfe3db5f6 100644 --- a/src/core/fstmap.rs +++ b/src/core/fstmap.rs @@ -3,23 +3,17 @@ use std::io::Seek; use std::io::Write; use std::io::Cursor; use std::fs::File; -use fst::Map; -use fst::MapBuilder; -use std::rc::Rc; -use fst::raw::Fst; -use core::serialize::BinarySerializable; -use std::marker::PhantomData; use fst; use fst::raw::MmapReadOnly; -use std::ops::Deref; - +use core::serialize::BinarySerializable; +use std::marker::PhantomData; fn convert_fst_error(e: fst::Error) -> io::Error { io::Error::new(io::ErrorKind::Other, e) } pub struct FstMapBuilder { - fst_builder: MapBuilder, + fst_builder: fst::MapBuilder, data: Vec, _phantom_: PhantomData, } @@ -27,7 +21,7 @@ pub struct FstMapBuilder { impl FstMapBuilder { fn new(w: W) -> io::Result> { - let fst_builder = try!(MapBuilder::new(w).map_err(convert_fst_error)); + let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error)); Ok(FstMapBuilder { fst_builder: fst_builder, data: Vec::new(), @@ -44,12 +38,14 @@ impl FstMapBuilder { } fn close(self,) -> io::Result { - let mut file = try!(self.fst_builder + let mut file = try!( + self.fst_builder .into_inner() .map_err(convert_fst_error)); - let footer_size = self.data.len(); + let footer_size = self.data.len() as u32; file.write_all(&self.data); (footer_size as u32).serialize(&mut file); + file.flush(); Ok(file) } } @@ -63,14 +59,14 @@ pub struct FstMap { impl FstMap { - pub fn open(file: &File) -> io::Result> { + pub fn open(file: File) -> io::Result> { let mmap = try!(MmapReadOnly::open(&file)); let mut cursor = Cursor::new(unsafe {mmap.as_slice()}); try!(cursor.seek(io::SeekFrom::End(-4))); let footer_size = try!(u32::deserialize(&mut cursor)) as usize; let split_len = mmap.len() - 4 - footer_size; let fst_mmap = mmap.range(0, split_len); - let values_mmap = mmap.range(split_len, mmap.len() - 4); + let values_mmap = mmap.range(split_len, footer_size); let fst = try!(fst::raw::Fst::from_mmap(fst_mmap).map_err(convert_fst_error)); Ok(FstMap { fst_index: fst::Map::from(fst), @@ -92,13 +88,21 @@ impl FstMap { mod tests { - use super::{FstMapBuilder, FstMap}; - + use super::*; + use tempfile; + #[test] fn test_fstmap() { - let mut fst_map_builder: FstMapBuilder, u32> = FstMapBuilder::new(Vec::new()).unwrap(); - fst_map_builder.insert("abc".as_bytes(), &34).unwrap(); - fst_map_builder.insert("abcd".as_bytes(), &343).unwrap(); - let data = fst_map_builder.close().unwrap(); + let fstmap_file; + { + let tempfile = tempfile::tempfile().unwrap(); // 41 + let mut fstmap_builder = FstMapBuilder::new(tempfile).unwrap(); + fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap(); + fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap(); + fstmap_file = fstmap_builder.close().unwrap(); + } + let fstmap = FstMap::open(fstmap_file).unwrap(); + assert_eq!(fstmap.get("abc"), Some(34u32)); + assert_eq!(fstmap.get("abcd"), Some(346u32)); } } diff --git a/src/core/postings.rs b/src/core/postings.rs index 3a01e592e..fa3adca34 100644 --- a/src/core/postings.rs +++ b/src/core/postings.rs @@ -120,9 +120,6 @@ impl Iterator for IntersectionPostings { } - - - #[cfg(test)] mod tests { @@ -162,38 +159,3 @@ mod tests { }); } } - - - -#[cfg(test)] -mod tests { - - - use super::*; - use test::Bencher; - use rand::Rng; - use rand::SeedableRng; - use rand::StdRng; - - fn generate_array(n: usize, ratio: f32) -> Vec { - let seed: &[_] = &[1, 2, 3, 4]; - let mut rng: StdRng = SeedableRng::from_seed(seed); - (0..u32::max_value()) - .filter(|_| rng.next_f32()< ratio) - .take(n) - .collect() - } - - #[bench] - fn bench_intersection(b: &mut Bencher) { - const TEST_SIZE: usize = 100_000; - let arr = generate_array(TEST_SIZE, 0.1); - let mut encoder = Encoder::new(); - let encoded = encoder.encode(&arr); - let mut uncompressed: Vec = (0..TEST_SIZE as u32).collect(); - let decoder = Decoder; - b.iter(|| { - decoder.decode(&encoded, &mut uncompressed); - }); - } -} diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs index 34acc4505..e4c15d07e 100644 --- a/src/core/simdcompression.rs +++ b/src/core/simdcompression.rs @@ -1,12 +1,14 @@ - use libc::size_t; use std::ptr; +use std::cmp::min; +use std::iter; extern { fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t; fn decode_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t; + fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t; } pub struct Encoder { @@ -27,9 +29,9 @@ impl Encoder { self.input_buffer.clear(); let input_len = input.len(); if input_len + 10000 >= self.input_buffer.len() { - self.input_buffer = (0..(input_len as u32) + 1024).collect(); - self.output_buffer = (0..(input_len as u32) + 1024).collect(); - // TODO use resize when available + let target_length = input_len + 1024; + self.input_buffer.resize(target_length, 0); + self.output_buffer.resize(target_length, 0); } // TODO use clone_from when available unsafe { @@ -68,21 +70,32 @@ impl Decoder { } } +pub struct Intersector { + output_buffer: Vec, +} -#[test] -fn test_encode_big() { - let mut encoder = Encoder::new(); - let num_ints = 10000 as usize; - let expected_length = 1274; - let input: Vec = (0..num_ints as u32) - .map(|i| i * 7 / 2) - .into_iter().collect(); - let encoded_data = encoder.encode(&input); - assert_eq!(encoded_data.len(), expected_length); - let decoder = Decoder::new(); - let mut decoded_data: Vec = (0..num_ints as u32).collect(); - assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data)); - assert_eq!(decoded_data, input); +impl Intersector { + fn new() -> Intersector { + Intersector::with_capacity(1_000_000) + } + fn with_capacity(capacity: usize) -> Intersector { + Intersector { + output_buffer: iter::repeat(0u32).take(capacity).collect() + } + } + fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] { + let max_intersection_length = min(left.len(), right.len()); + if self.output_buffer.len() < max_intersection_length { + self.output_buffer.resize(max_intersection_length, 0); + } + unsafe { + let intersection_len = intersection_native( + left.as_ptr(), left.len() as size_t, + right.as_ptr(), right.len() as size_t, + self.output_buffer.as_mut_ptr()); + return &self.output_buffer[0..intersection_len]; + } + } } @@ -94,17 +107,50 @@ mod tests { use test::Bencher; use rand::Rng; use rand::SeedableRng; - use rand::StdRng; + use rand::XorShiftRng; - fn generate_array(n: usize, ratio: f32) -> Vec { - let seed: &[_] = &[1, 2, 3, 4]; - let mut rng: StdRng = SeedableRng::from_seed(seed); + + + fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec { + let seed: &[u32; 4] = &[1, 2, 3, seed_val]; + let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed); (0..u32::max_value()) .filter(|_| rng.next_f32()< ratio) .take(n) .collect() } + fn generate_array(n: usize, ratio: f32) -> Vec { + generate_array_with_seed(n, ratio, 4) + } + + #[test] + fn test_encode_big() { + let mut encoder = Encoder::new(); + let num_ints = 10000 as usize; + let expected_length = 1274; + let input: Vec = (0..num_ints as u32) + .map(|i| i * 7 / 2) + .into_iter().collect(); + let encoded_data = encoder.encode(&input); + assert_eq!(encoded_data.len(), expected_length); + let decoder = Decoder::new(); + let mut decoded_data: Vec = (0..num_ints as u32).collect(); + assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data)); + assert_eq!(decoded_data, input); + } + + + #[test] + fn test_simd_intersection() { + let mut intersector = Intersector::new(); + let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); + let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); + let intersection = intersector.intersection(&arr1[..], &arr2[..]) ; + assert_eq!(intersection.len(), 500_233); + } + + #[bench] fn bench_decode(b: &mut Bencher) { const TEST_SIZE: usize = 1_000_000; @@ -117,4 +163,15 @@ mod tests { decoder.decode(&encoded, &mut uncompressed); }); } + + + #[bench] + fn bench_simd_intersection(b: &mut Bencher) { + let mut intersector = Intersector::new(); + let arr1 = generate_array_with_seed(1_000_000, 0.1, 2); + let arr2 = generate_array_with_seed(5_000_000, 0.5, 3); + b.iter(|| { + intersector.intersection(&arr1[..], &arr2[..]).len() + }); + } }