This commit is contained in:
Paul Masurel
2016-02-28 12:47:28 +09:00
parent 2876afd8f8
commit 7af27bd927
5 changed files with 117 additions and 82 deletions

View File

@@ -12,7 +12,7 @@ fn main() {
.object("cpp/SIMDCompressionAndIntersection/simdbitpacking.o")
.object("cpp/SIMDCompressionAndIntersection/usimdbitpacking.o")
.object("cpp/SIMDCompressionAndIntersection/simdintegratedbitpacking.o")
// .object("cpp/SIMDCompressionAndIntersection/intersection.o")
.object("cpp/SIMDCompressionAndIntersection/intersection.o")
.object("cpp/SIMDCompressionAndIntersection/varintdecode.o")
.object("cpp/SIMDCompressionAndIntersection/streamvbyte.o")
.object("cpp/SIMDCompressionAndIntersection/simdpackedsearch.o")

View File

@@ -8,8 +8,10 @@ using namespace SIMDCompressionLib;
static shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm");
extern "C" {
size_t encode_native(
uint32_t* begin,
const size_t num_els,
@@ -32,4 +34,14 @@ extern "C" {
codec -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
return num_ints;
}
size_t intersection_native(
const uint32_t* left,
const size_t left_size,
const uint32_t* right,
const size_t right_size,
uint32_t* output) {
return IntersectionFactory::getFromName("simd")(left, left_size, right, right_size, output);
}
}

View File

@@ -3,23 +3,17 @@ use std::io::Seek;
use std::io::Write;
use std::io::Cursor;
use std::fs::File;
use fst::Map;
use fst::MapBuilder;
use std::rc::Rc;
use fst::raw::Fst;
use core::serialize::BinarySerializable;
use std::marker::PhantomData;
use fst;
use fst::raw::MmapReadOnly;
use std::ops::Deref;
use core::serialize::BinarySerializable;
use std::marker::PhantomData;
fn convert_fst_error(e: fst::Error) -> io::Error {
io::Error::new(io::ErrorKind::Other, e)
}
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
fst_builder: MapBuilder<W>,
fst_builder: fst::MapBuilder<W>,
data: Vec<u8>,
_phantom_: PhantomData<V>,
}
@@ -27,7 +21,7 @@ pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
let fst_builder = try!(MapBuilder::new(w).map_err(convert_fst_error));
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
Ok(FstMapBuilder {
fst_builder: fst_builder,
data: Vec::new(),
@@ -44,12 +38,14 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
}
fn close(self,) -> io::Result<W> {
let mut file = try!(self.fst_builder
let mut file = try!(
self.fst_builder
.into_inner()
.map_err(convert_fst_error));
let footer_size = self.data.len();
let footer_size = self.data.len() as u32;
file.write_all(&self.data);
(footer_size as u32).serialize(&mut file);
file.flush();
Ok(file)
}
}
@@ -63,14 +59,14 @@ pub struct FstMap<V: BinarySerializable> {
impl<V: BinarySerializable> FstMap<V> {
pub fn open(file: &File) -> io::Result<FstMap<V>> {
pub fn open(file: File) -> io::Result<FstMap<V>> {
let mmap = try!(MmapReadOnly::open(&file));
let mut cursor = Cursor::new(unsafe {mmap.as_slice()});
try!(cursor.seek(io::SeekFrom::End(-4)));
let footer_size = try!(u32::deserialize(&mut cursor)) as usize;
let split_len = mmap.len() - 4 - footer_size;
let fst_mmap = mmap.range(0, split_len);
let values_mmap = mmap.range(split_len, mmap.len() - 4);
let values_mmap = mmap.range(split_len, footer_size);
let fst = try!(fst::raw::Fst::from_mmap(fst_mmap).map_err(convert_fst_error));
Ok(FstMap {
fst_index: fst::Map::from(fst),
@@ -92,13 +88,21 @@ impl<V: BinarySerializable> FstMap<V> {
mod tests {
use super::{FstMapBuilder, FstMap};
use super::*;
use tempfile;
#[test]
fn test_fstmap() {
let mut fst_map_builder: FstMapBuilder<Vec<u8>, u32> = FstMapBuilder::new(Vec::new()).unwrap();
fst_map_builder.insert("abc".as_bytes(), &34).unwrap();
fst_map_builder.insert("abcd".as_bytes(), &343).unwrap();
let data = fst_map_builder.close().unwrap();
let fstmap_file;
{
let tempfile = tempfile::tempfile().unwrap(); // 41
let mut fstmap_builder = FstMapBuilder::new(tempfile).unwrap();
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
fstmap_file = fstmap_builder.close().unwrap();
}
let fstmap = FstMap::open(fstmap_file).unwrap();
assert_eq!(fstmap.get("abc"), Some(34u32));
assert_eq!(fstmap.get("abcd"), Some(346u32));
}
}

View File

@@ -120,9 +120,6 @@ impl<T: Postings> Iterator for IntersectionPostings<T> {
}
#[cfg(test)]
mod tests {
@@ -162,38 +159,3 @@ mod tests {
});
}
}
#[cfg(test)]
mod tests {
use super::*;
use test::Bencher;
use rand::Rng;
use rand::SeedableRng;
use rand::StdRng;
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
let seed: &[_] = &[1, 2, 3, 4];
let mut rng: StdRng = SeedableRng::from_seed(seed);
(0..u32::max_value())
.filter(|_| rng.next_f32()< ratio)
.take(n)
.collect()
}
#[bench]
fn bench_intersection(b: &mut Bencher) {
const TEST_SIZE: usize = 100_000;
let arr = generate_array(TEST_SIZE, 0.1);
let mut encoder = Encoder::new();
let encoded = encoder.encode(&arr);
let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
let decoder = Decoder;
b.iter(|| {
decoder.decode(&encoded, &mut uncompressed);
});
}
}

View File

@@ -1,12 +1,14 @@
use libc::size_t;
use std::ptr;
use std::cmp::min;
use std::iter;
extern {
fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
fn decode_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
}
pub struct Encoder {
@@ -27,9 +29,9 @@ impl Encoder {
self.input_buffer.clear();
let input_len = input.len();
if input_len + 10000 >= self.input_buffer.len() {
self.input_buffer = (0..(input_len as u32) + 1024).collect();
self.output_buffer = (0..(input_len as u32) + 1024).collect();
// TODO use resize when available
let target_length = input_len + 1024;
self.input_buffer.resize(target_length, 0);
self.output_buffer.resize(target_length, 0);
}
// TODO use clone_from when available
unsafe {
@@ -68,21 +70,32 @@ impl Decoder {
}
}
pub struct Intersector {
output_buffer: Vec<u32>,
}
#[test]
fn test_encode_big() {
let mut encoder = Encoder::new();
let num_ints = 10000 as usize;
let expected_length = 1274;
let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 / 2)
.into_iter().collect();
let encoded_data = encoder.encode(&input);
assert_eq!(encoded_data.len(), expected_length);
let decoder = Decoder::new();
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
assert_eq!(decoded_data, input);
impl Intersector {
fn new() -> Intersector {
Intersector::with_capacity(1_000_000)
}
fn with_capacity(capacity: usize) -> Intersector {
Intersector {
output_buffer: iter::repeat(0u32).take(capacity).collect()
}
}
fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] {
let max_intersection_length = min(left.len(), right.len());
if self.output_buffer.len() < max_intersection_length {
self.output_buffer.resize(max_intersection_length, 0);
}
unsafe {
let intersection_len = intersection_native(
left.as_ptr(), left.len() as size_t,
right.as_ptr(), right.len() as size_t,
self.output_buffer.as_mut_ptr());
return &self.output_buffer[0..intersection_len];
}
}
}
@@ -94,17 +107,50 @@ mod tests {
use test::Bencher;
use rand::Rng;
use rand::SeedableRng;
use rand::StdRng;
use rand::XorShiftRng;
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
let seed: &[_] = &[1, 2, 3, 4];
let mut rng: StdRng = SeedableRng::from_seed(seed);
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
(0..u32::max_value())
.filter(|_| rng.next_f32()< ratio)
.take(n)
.collect()
}
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
generate_array_with_seed(n, ratio, 4)
}
#[test]
fn test_encode_big() {
let mut encoder = Encoder::new();
let num_ints = 10000 as usize;
let expected_length = 1274;
let input: Vec<u32> = (0..num_ints as u32)
.map(|i| i * 7 / 2)
.into_iter().collect();
let encoded_data = encoder.encode(&input);
assert_eq!(encoded_data.len(), expected_length);
let decoder = Decoder::new();
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
assert_eq!(decoded_data, input);
}
#[test]
fn test_simd_intersection() {
let mut intersector = Intersector::new();
let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
let intersection = intersector.intersection(&arr1[..], &arr2[..]) ;
assert_eq!(intersection.len(), 500_233);
}
#[bench]
fn bench_decode(b: &mut Bencher) {
const TEST_SIZE: usize = 1_000_000;
@@ -117,4 +163,15 @@ mod tests {
decoder.decode(&encoded, &mut uncompressed);
});
}
#[bench]
fn bench_simd_intersection(b: &mut Bencher) {
let mut intersector = Intersector::new();
let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
b.iter(|| {
intersector.intersection(&arr1[..], &arr2[..]).len()
});
}
}