added simd compression

This commit is contained in:
Paul Masurel
2016-02-18 15:46:54 +09:00
parent b16deba433
commit 0824e3a3cf
11 changed files with 161 additions and 4 deletions

3
.gitmodules vendored Normal file
View File

@@ -0,0 +1,3 @@
[submodule "cpp/SIMDCompressionAndIntersection"]
path = cpp/SIMDCompressionAndIntersection
url = git@github.com:lemire/SIMDCompressionAndIntersection.git

View File

@@ -2,6 +2,8 @@
name = "tantivy"
version = "0.1.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
build = "build.rs"
[dependencies]
byteorder = "0.4.2"
@@ -19,6 +21,7 @@ combine = "1.2.0"
tempdir = "0.3.4"
bincode = "0.4.0"
serde = "0.6.11"
libc = "0.2.6"
#![feature(step_by)]
[build-dependencies]
gcc = "0.3.24"

23
build.rs Normal file
View File

@@ -0,0 +1,23 @@
extern crate gcc;
/// Build script entry point.
///
/// Compiles `cpp/encode.cpp` as C++11 and bundles it, together with the
/// object files listed below, into the static archive `libsimdcompression.a`.
/// Finally asks Cargo to link the C++ standard library dynamically.
fn main() {
    // Base names of the object files added to the archive. They are looked up
    // under the SIMDCompressionAndIntersection submodule directory and are
    // added in exactly this order.
    let object_names = [
        "bitpacking",
        "integratedbitpacking",
        "simdbitpacking",
        "usimdbitpacking",
        "simdintegratedbitpacking",
        "intersection",
        "varintdecode",
        "streamvbyte",
        "simdpackedsearch",
        "simdpackedselect",
        "frameofreference",
        "for",
    ];

    let mut config = gcc::Config::new();
    config
        .cpp(true)
        .flag("-std=c++11")
        .include("./cpp/SIMDCompressionAndIntersection/include");

    for name in object_names.iter() {
        config.object(format!("cpp/SIMDCompressionAndIntersection/{}.o", name));
    }

    config
        .file("cpp/encode.cpp")
        .compile("libsimdcompression.a");

    // The wrapper is C++, so the final binary needs libstdc++ at link time.
    println!("cargo:rustc-flags=-l dylib=stdc++");
}

70
cpp/encode.cpp Normal file
View File

@@ -0,0 +1,70 @@
// /usr/bin/c++ -Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -I/Users/pmasurel/github/FastPFor/headers -o CMakeFiles/example.dir/example.cpp.o -c /Users/pmasurel/github/FastPFor/example.cpp
#include <iostream>
#include <stdint.h>
#include "codecfactory.h"
#include "intersection.h"
using namespace SIMDCompressionLib;
// Codec instance shared by every call to encode_native, looked up once by
// name from the library's codec factory.
static shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm");
// C linkage so the symbol can be called through a plain C ABI (the Rust side
// declares it in an `extern` block).
extern "C" {
// Encodes `num_els` 32-bit integers starting at `begin` into `output` using
// the shared codec, and returns the value of `output_length` after the call.
//
// Parameters:
//   begin   - first input integer. NOTE(review): passed as non-const, so the
//             codec may be allowed to modify the input in place - confirm
//             against the library before handing it memory that must stay
//             intact.
//   num_els - number of input integers.
//   output  - destination buffer for the encoded data.
//
// Returns: presumably the number of 32-bit words written to `output`
// (encodeArray receives `output_length` and is expected to update it) -
// TODO confirm against the IntegerCODEC interface.
//
// NOTE(review): the capacity handed to the codec is hard-coded to 10000 and
// is unrelated to the real size of `output`. Callers must provide at least
// that many words; a capacity parameter would make this contract explicit.
size_t encode_native(
uint32_t* begin,
const size_t num_els,
uint32_t* output) {
// Capacity advertised to the codec; see the NOTE(review) above.
size_t output_length = 10000;
codec -> encodeArray(begin,
num_els,
output,
output_length);
return output_length;
// The library's example code for decoding (codec.decodeArray) and for SIMD
// intersection (IntersectionFactory::getFromName("simd")) used to be kept
// here as commented-out text; neither is wired up by this wrapper yet.
}
}

View File

@@ -6,7 +6,6 @@ lazy_static! {
static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
}
pub struct TokenIter<'a> {
text: &'a str,
token_it: Box<Iterator<Item=(usize, usize)> + 'a>,

View File

@@ -1,3 +1,4 @@
pub mod query;
pub mod postings;
pub mod global;
@@ -14,3 +15,4 @@ pub mod collector;
pub mod skip;
pub use core::global::DocId;
pub mod serialize;
pub mod simdcompression;

View File

@@ -35,7 +35,7 @@ impl<T: BinarySerializable> BinarySerializable for Vec<T> {
// TODO error
let num_items = reader.read_u32::<BigEndian>().unwrap();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for i in 0..num_items {
for _ in 0..num_items {
let item = try!(T::deserialize(reader));
items.push(item);
}

View File

@@ -0,0 +1,43 @@
use libc::size_t;
use std::ptr;
// Foreign functions provided by the static `simdcompression` archive, which
// the build script produces from `cpp/encode.cpp`.
#[link(name = "simdcompression", kind = "static")]
extern {
/// Encodes `num_els` integers starting at `data` into `output` and returns a
/// length (presumably the number of `u32` words written - confirm against
/// `cpp/encode.cpp`).
///
/// NOTE(review): the native side advertises a fixed output capacity of 10000
/// words to the codec regardless of the real size of `output`, so callers
/// should provide at least that much room.
fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32) -> size_t;
}
/// Number of `u32` words the native `encode_native` routine tells the codec
/// are available in the output buffer (the hard-coded `output_length = 10000`
/// in `cpp/encode.cpp`). The two values must be kept in sync.
const NATIVE_OUTPUT_CAPACITY: usize = 10_000;

/// Extra output words reserved on top of the input length for inputs larger
/// than `NATIVE_OUTPUT_CAPACITY`.
const OUTPUT_HEADROOM: usize = 10;

/// Reusable wrapper around the native SIMD integer encoder.
///
/// Both scratch buffers are kept between calls, so repeated encoding does not
/// reallocate once they have grown large enough.
pub struct Encoder {
    /// Private, mutable copy of the caller's input that is handed to the
    /// native code (its signature takes a mutable pointer).
    input_buffer: Vec<u32>,
    /// Destination the native code writes the encoded words into.
    output_buffer: Vec<u32>,
}

impl Encoder {
    /// Creates an encoder with empty scratch buffers.
    pub fn new() -> Encoder {
        Encoder {
            input_buffer: Vec::new(),
            output_buffer: Vec::new(),
        }
    }

    /// Encodes `input` with the native codec and returns the encoded words.
    ///
    /// The returned slice borrows the encoder's internal output buffer and is
    /// overwritten by the next call to `encode`.
    ///
    /// # Panics
    ///
    /// Panics if the native routine reports more words than the output buffer
    /// holds.
    pub fn encode(&mut self, input: &[u32]) -> &[u32] {
        let input_len = input.len();

        // Copy the input into our own buffer: the native function takes a
        // mutable pointer, so it must not be given the caller's shared slice.
        self.input_buffer.clear();
        self.input_buffer.extend(input.iter().cloned());

        // The native side always advertises NATIVE_OUTPUT_CAPACITY words to
        // the codec, so the buffer must never be smaller than that, even for
        // tiny or empty inputs. Larger inputs keep the original
        // `input_len + 10` sizing.
        // NOTE(review): the codec's true worst-case output size is not visible
        // from here; revisit this bound for large, poorly compressible inputs.
        let wanted = input_len + OUTPUT_HEADROOM;
        let required = if wanted > NATIVE_OUTPUT_CAPACITY {
            wanted
        } else {
            NATIVE_OUTPUT_CAPACITY
        };
        if self.output_buffer.len() < required {
            self.output_buffer.resize(required, 0);
        }

        // SAFETY: `input_buffer` holds exactly `input_len` initialized
        // elements, and `output_buffer` holds at least NATIVE_OUTPUT_CAPACITY
        // initialized elements, which is the capacity the native code passes
        // to the codec. Both pointers stay valid for the duration of the call
        // because neither vector is touched until it returns.
        let written_size = unsafe {
            encode_native(
                self.input_buffer.as_mut_ptr(),
                input_len as size_t,
                self.output_buffer.as_mut_ptr(),
            )
        };

        // Bounds-checked on purpose: a bogus length from the native side
        // becomes a panic instead of an out-of-bounds read.
        &self.output_buffer[..written_size]
    }
}

View File

@@ -1,5 +1,7 @@
#[allow(unused_imports)]
#[macro_use]
extern crate lazy_static;
@@ -17,5 +19,6 @@ extern crate atomicwrites;
extern crate tempdir;
extern crate bincode;
extern crate serde;
extern crate libc;
pub mod core;

10
src/main.rs Normal file
View File

@@ -0,0 +1,10 @@
extern crate tantivy;
use tantivy::core::simdcompression::Encoder;
/// Smoke test for the SIMD encoder: encodes a small list of integers and
/// prints the length of the encoded output.
fn main() {
    let values: Vec<u32> = vec![2, 3, 3, 4, 12, 32, 34];
    let mut simd_encoder = Encoder::new();
    let encoded_len = simd_encoder.encode(&values).len();
    println!("{}", encoded_len);
}