diff --git a/.gitmodules b/.gitmodules
new file mode 100644
index 000000000..3d56a2bca
--- /dev/null
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "cpp/SIMDCompressionAndIntersection"]
+	path = cpp/SIMDCompressionAndIntersection
+	url = https://github.com/lemire/SIMDCompressionAndIntersection.git
diff --git a/Cargo.toml b/Cargo.toml
index 66ab8509e..acb45ce1d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,6 +2,8 @@
 name = "tantivy"
 version = "0.1.0"
 authors = ["Paul Masurel "]
+build = "build.rs"
+
 
 [dependencies]
 byteorder = "0.4.2"
@@ -19,6 +21,7 @@ combine = "1.2.0"
 tempdir = "0.3.4"
 bincode = "0.4.0"
 serde = "0.6.11"
+libc = "0.2.6"
 
-
-#![feature(step_by)]
+[build-dependencies]
+gcc = "0.3.24"
diff --git a/build.rs b/build.rs
new file mode 100644
index 000000000..a9998e997
--- /dev/null
+++ b/build.rs
@@ -0,0 +1,28 @@
+extern crate gcc;
+
+// Build script: compiles `cpp/encode.cpp` as C++11 and bundles it with the
+// SIMDCompressionAndIntersection object files into `libsimdcompression.a`.
+fn main() {
+    gcc::Config::new()
+        .cpp(true)
+        .flag("-std=c++11")
+        .include("./cpp/SIMDCompressionAndIntersection/include")
+        // These object files are not built here; they must already exist in
+        // the submodule checkout (presumably via its own build -- TODO confirm).
+        .object("cpp/SIMDCompressionAndIntersection/bitpacking.o")
+        .object("cpp/SIMDCompressionAndIntersection/integratedbitpacking.o")
+        .object("cpp/SIMDCompressionAndIntersection/simdbitpacking.o")
+        .object("cpp/SIMDCompressionAndIntersection/usimdbitpacking.o")
+        .object("cpp/SIMDCompressionAndIntersection/simdintegratedbitpacking.o")
+        .object("cpp/SIMDCompressionAndIntersection/intersection.o")
+        .object("cpp/SIMDCompressionAndIntersection/varintdecode.o")
+        .object("cpp/SIMDCompressionAndIntersection/streamvbyte.o")
+        .object("cpp/SIMDCompressionAndIntersection/simdpackedsearch.o")
+        .object("cpp/SIMDCompressionAndIntersection/simdpackedselect.o")
+        .object("cpp/SIMDCompressionAndIntersection/frameofreference.o")
+        .object("cpp/SIMDCompressionAndIntersection/for.o")
+        .file("cpp/encode.cpp")
+        .compile("libsimdcompression.a");
+    // Ask cargo to link the C++ standard library dynamically.
+    println!("cargo:rustc-flags=-l dylib=stdc++");
+}
diff --git a/cpp/SIMDCompressionAndIntersection b/cpp/SIMDCompressionAndIntersection
new file mode 160000
index 
000000000..1f8e12aeb
--- /dev/null
+++ b/cpp/SIMDCompressionAndIntersection
@@ -0,0 +1 @@
+Subproject commit 1f8e12aebd0a845b49fc8b1765d69c6fde118d4d
diff --git a/cpp/encode.cpp b/cpp/encode.cpp
new file mode 100644
index 000000000..c4a40beea
--- /dev/null
+++ b/cpp/encode.cpp
@@ -0,0 +1,38 @@
+
+
+// /usr/bin/c++ -Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -I/Users/pmasurel/github/FastPFor/headers -o CMakeFiles/example.dir/example.cpp.o -c /Users/pmasurel/github/FastPFor/example.cpp
+
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+
+
+#include "codecfactory.h"
+#include "intersection.h"
+
+using namespace SIMDCompressionLib;
+
+// Codec instance shared by every call to encode_native.
+static std::shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm");
+
+// Capacity, in 32-bit words, reported to the codec for the `output` buffer.
+// Callers of encode_native must provide at least this much room.
+static const size_t OUTPUT_CAPACITY = 10000;
+
+
+extern "C" {
+    // Compresses the `num_els` integers starting at `begin` into `output`
+    // and returns the length reported back by the codec, which the Rust
+    // caller treats as the number of 32-bit words written.
+    //
+    // `noexcept`: an exception escaping the codec terminates the process
+    // instead of unwinding into the non-C++ caller.
+    size_t encode_native(
+            uint32_t* begin,
+            const size_t num_els,
+            uint32_t* output) noexcept {
+        size_t output_length = OUTPUT_CAPACITY;
+        codec->encodeArray(begin, num_els, output, output_length);
+        return output_length;
+    }
+}
diff --git a/src/core/analyzer.rs b/src/core/analyzer.rs
index 50e6db624..360b121f4 100644
--- a/src/core/analyzer.rs
+++ b/src/core/analyzer.rs
@@ -6,7 +6,6 @@ lazy_static! {
     static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
 }
 
-
 pub struct TokenIter<'a> {
     text: &'a str,
     token_it: Box + 'a>,
diff --git a/src/core/mod.rs b/src/core/mod.rs
index 72e347cff..ea8bd08e5 100644
--- a/src/core/mod.rs
+++ b/src/core/mod.rs
@@ -1,3 +1,4 @@
+
 pub mod query;
 pub mod postings;
 pub mod global;
@@ -14,3 +15,4 @@ pub mod collector;
 pub mod skip;
 pub use core::global::DocId;
 pub mod serialize;
+pub mod simdcompression;
diff --git a/src/core/serialize.rs b/src/core/serialize.rs
index d16e98289..f66e761cb 100644
--- a/src/core/serialize.rs
+++ b/src/core/serialize.rs
@@ -35,7 +35,7 @@ impl BinarySerializable for Vec {
         // TODO error
         let num_items = reader.read_u32::().unwrap();
         let mut items: Vec = Vec::with_capacity(num_items as usize);
-        for i in 0..num_items {
+        for _ in 0..num_items {
             let item = try!(T::deserialize(reader));
             items.push(item);
         }
diff --git a/src/core/simdcompression.rs b/src/core/simdcompression.rs
new file mode 100644
index 000000000..68f9439b4
--- /dev/null
+++ b/src/core/simdcompression.rs
@@ -0,0 +1,72 @@
+
+use libc::size_t;
+
+/// Capacity, in `u32` words, that `encode_native` (see `cpp/encode.cpp`)
+/// reports to the codec for the output buffer it is handed.
+const NATIVE_OUTPUT_CAPACITY: usize = 10_000;
+
+#[link(name = "simdcompression", kind = "static")]
+extern {
+    fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32) -> size_t;
+}
+
+/// Compresses `u32` sequences through the native `encode_native` routine,
+/// keeping its scratch buffers alive between calls.
+pub struct Encoder {
+    input_buffer: Vec<u32>,
+    output_buffer: Vec<u32>,
+}
+
+impl Encoder {
+
+    /// Creates an encoder whose scratch buffers start out empty.
+    pub fn new() -> Encoder {
+        Encoder {
+            input_buffer: Vec::new(),
+            output_buffer: Vec::new(),
+        }
+    }
+
+    /// Encodes `input` and returns the words produced by the native encoder.
+    ///
+    /// The returned slice borrows the encoder's output buffer, so it is only
+    /// valid until the next call to `encode`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if the native encoder reports more words than the output
+    /// buffer can hold.
+    pub fn encode(&mut self, input: &[u32]) -> &[u32] {
+        let input_len = input.len();
+
+        // `encode_native` takes a mutable pointer, so it gets a private copy
+        // of the input rather than a pointer into the caller's slice.
+        self.input_buffer.clear();
+        self.input_buffer.extend(input.iter().cloned());
+
+        // Grow (never shrink) the output buffer so that it is at least as
+        // large as the capacity the native side announces to the codec.
+        // NOTE(review): `input_len + 10` is the sizing heuristic this code
+        // already used; the codec's worst-case expansion is not visible
+        // here -- confirm it is sufficient for large inputs.
+        let wanted = ::std::cmp::max(NATIVE_OUTPUT_CAPACITY, input_len.saturating_add(10));
+        let current = self.output_buffer.len();
+        if current < wanted {
+            self.output_buffer.extend(::std::iter::repeat(0u32).take(wanted - current));
+        }
+
+        // SAFETY: `input_buffer` holds exactly `input_len` initialized words
+        // and `output_buffer` holds at least `NATIVE_OUTPUT_CAPACITY` words,
+        // which is the capacity `encode_native` assumes for `output`.
+        let written_size = unsafe {
+            encode_native(
+                self.input_buffer.as_mut_ptr(),
+                input_len as size_t,
+                self.output_buffer.as_mut_ptr()
+            )
+        } as usize;
+        assert!(written_size <= self.output_buffer.len(),
+                "encode_native reported more words than the output buffer holds");
+        &self.output_buffer[..written_size]
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 32462008c..7b929cda8 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -1,5 +1,7 @@
 #[allow(unused_imports)]
 
+
+
 #[macro_use]
 extern crate lazy_static;
 
@@ -17,5 +19,6 @@ extern crate atomicwrites;
 extern crate tempdir;
 extern crate bincode;
 extern crate serde;
+extern crate libc;
 
 pub mod core;
diff --git a/src/main.rs b/src/main.rs
new file mode 100644
index 000000000..f5f10a1e8
--- /dev/null
+++ b/src/main.rs
@@ -0,0 +1,10 @@
+extern crate tantivy;
+use tantivy::core::simdcompression::Encoder;
+
+
+fn main() {
+    let data: Vec<u32> = vec!(2,3,3,4,12,32,34);
+    let mut encoder = Encoder::new();
+    let output = encoder.encode(&data);
+    println!("{}", output.len());
+}