mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
added simd compression
This commit is contained in:
3
.gitmodules
vendored
Normal file
3
.gitmodules
vendored
Normal file
@@ -0,0 +1,3 @@
|
||||
[submodule "cpp/SIMDCompressionAndIntersection"]
|
||||
path = cpp/SIMDCompressionAndIntersection
|
||||
url = git@github.com:lemire/SIMDCompressionAndIntersection.git
|
||||
@@ -2,6 +2,8 @@
|
||||
name = "tantivy"
|
||||
version = "0.1.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
build = "build.rs"
|
||||
|
||||
|
||||
[dependencies]
|
||||
byteorder = "0.4.2"
|
||||
@@ -19,6 +21,7 @@ combine = "1.2.0"
|
||||
tempdir = "0.3.4"
|
||||
bincode = "0.4.0"
|
||||
serde = "0.6.11"
|
||||
libc = "0.2.6"
|
||||
|
||||
|
||||
#![feature(step_by)]
|
||||
[build-dependencies]
|
||||
gcc = "0.3.24"
|
||||
|
||||
23
build.rs
Normal file
23
build.rs
Normal file
@@ -0,0 +1,23 @@
|
||||
extern crate gcc;
|
||||
|
||||
fn main() {
|
||||
gcc::Config::new()
|
||||
.cpp(true)
|
||||
.flag("-std=c++11")
|
||||
.include("./cpp/SIMDCompressionAndIntersection/include")
|
||||
.object("cpp/SIMDCompressionAndIntersection/bitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/integratedbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/usimdbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdintegratedbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/intersection.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/varintdecode.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/streamvbyte.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdpackedsearch.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdpackedselect.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/frameofreference.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/for.o")
|
||||
.file("cpp/encode.cpp")
|
||||
.compile("libsimdcompression.a");
|
||||
println!("cargo:rustc-flags=-l dylib=stdc++");
|
||||
}
|
||||
1
cpp/SIMDCompressionAndIntersection
Submodule
1
cpp/SIMDCompressionAndIntersection
Submodule
Submodule cpp/SIMDCompressionAndIntersection added at 1f8e12aebd
70
cpp/encode.cpp
Normal file
70
cpp/encode.cpp
Normal file
@@ -0,0 +1,70 @@
|
||||
|
||||
|
||||
// /usr/bin/c++ -Wall -Wcast-align -O3 -DNDEBUG -std=c++11 -DHAVE_CXX0X -msse4.1 -march=native -isysroot /Applications/Xcode.app/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX10.9.sdk -I/Users/pmasurel/github/FastPFor/headers -o CMakeFiles/example.dir/example.cpp.o -c /Users/pmasurel/github/FastPFor/example.cpp
|
||||
|
||||
#include <iostream>
|
||||
#include <stdint.h>
|
||||
|
||||
|
||||
#include "codecfactory.h"
|
||||
#include "intersection.h"
|
||||
|
||||
using namespace SIMDCompressionLib;
|
||||
|
||||
// Codec shared by every call to encode_native(), looked up by name from the
// library's CODECFactory.
// NOTE(review): this lookup runs as part of this file's static initialization;
// if CODECFactory depends on statics defined in another translation unit, the
// relative initialization order is not guaranteed -- confirm.
static shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm");
|
||||
|
||||
|
||||
extern "C" {
|
||||
size_t encode_native(
|
||||
uint32_t* begin,
|
||||
const size_t num_els,
|
||||
uint32_t* output) {
|
||||
size_t output_length = 10000;
|
||||
codec -> encodeArray(begin,
|
||||
num_els,
|
||||
output,
|
||||
output_length);
|
||||
|
||||
return output_length;
|
||||
//
|
||||
// if desired, shrink back the array:
|
||||
//compressed_output.resize(compressedsize);
|
||||
// compressed_output.shrink_to_fit();
|
||||
// display compression rate:
|
||||
// cout << setprecision(3);
|
||||
// cout << "You are using " << 32.0 * static_cast<double>(compressed_output.size()) /
|
||||
// static_cast<double>(mydata.size()) << " bits per integer. " << endl;
|
||||
// //
|
||||
// You are done!... with the compression...
|
||||
//
|
||||
///
|
||||
// // decompressing is also easy:
|
||||
// //
|
||||
// vector<uint32_t> mydataback(N);
|
||||
// size_t recoveredsize = mydataback.size();
|
||||
// //
|
||||
// codec.decodeArray(compressed_output.data(),
|
||||
// compressed_output.size(), mydataback.data(), recoveredsize);
|
||||
// mydataback.resize(recoveredsize);
|
||||
// //
|
||||
// // That's it for compression!
|
||||
// //
|
||||
// if (mydataback != mydata) throw runtime_error("bug!");
|
||||
//
|
||||
// //
|
||||
// // Next we are going to test out intersection...
|
||||
// //
|
||||
// vector<uint32_t> mydata2(N);
|
||||
// for (uint32_t i = 0; i < N; ++i) mydata2[i] = 6 * i;
|
||||
// intersectionfunction inter = IntersectionFactory::getFromName("simd");// using SIMD intersection
|
||||
// //
|
||||
// // we are going to intersect mydata and mydata2 and write back
|
||||
// the result to mydata2
|
||||
//
|
||||
// size_t intersize = inter(mydata2.data(), mydata2.size(), mydata.data(), mydata.size(), mydata2.data());
|
||||
// mydata2.resize(intersize);
|
||||
// mydata2.shrink_to_fit();
|
||||
// cout << "Intersection size: " << mydata2.size() << " integers. " << endl;
|
||||
// return mydata2.size();
|
||||
}
|
||||
}
|
||||
@@ -6,7 +6,6 @@ lazy_static! {
|
||||
static ref WORD_PTN: Regex = Regex::new(r"[a-zA-Z0-9]+").unwrap();
|
||||
}
|
||||
|
||||
|
||||
pub struct TokenIter<'a> {
|
||||
text: &'a str,
|
||||
token_it: Box<Iterator<Item=(usize, usize)> + 'a>,
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
|
||||
pub mod query;
|
||||
pub mod postings;
|
||||
pub mod global;
|
||||
@@ -14,3 +15,4 @@ pub mod collector;
|
||||
pub mod skip;
|
||||
pub use core::global::DocId;
|
||||
pub mod serialize;
|
||||
pub mod simdcompression;
|
||||
|
||||
@@ -35,7 +35,7 @@ impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
||||
// TODO error
|
||||
let num_items = reader.read_u32::<BigEndian>().unwrap();
|
||||
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
|
||||
for i in 0..num_items {
|
||||
for _ in 0..num_items {
|
||||
let item = try!(T::deserialize(reader));
|
||||
items.push(item);
|
||||
}
|
||||
|
||||
43
src/core/simdcompression.rs
Normal file
43
src/core/simdcompression.rs
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
#[link(name = "simdcompression", kind = "static")]
|
||||
extern {
|
||||
fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32) -> size_t;
|
||||
}
|
||||
|
||||
/// Wrapper around the native integer encoder that owns the scratch buffers
/// passed across the FFI boundary.
pub struct Encoder {
    /// Staging area the caller's input is copied into before encoding.
    input_buffer: Vec<u32>,
    /// Destination the native encoder writes its output words into.
    output_buffer: Vec<u32>,
}
|
||||
|
||||
impl Encoder {
|
||||
|
||||
pub fn new() -> Encoder {
|
||||
Encoder {
|
||||
input_buffer: Vec::new(),
|
||||
output_buffer: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn encode(&mut self, input: &[u32]) -> &[u32] {
|
||||
unsafe {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len > self.input_buffer.len() {
|
||||
// let delta_size = self.input_buffer.len() - input_len;
|
||||
self.input_buffer = (0..input_len as u32 + 10 ).collect();
|
||||
self.output_buffer = (0..input_len as u32 + 10).collect();
|
||||
// TODO use resize when available
|
||||
}
|
||||
ptr::copy_nonoverlapping(input.as_ptr(), self.input_buffer.as_mut_ptr(), input_len);
|
||||
// TODO use clone_from when available
|
||||
let written_size = encode_native(
|
||||
self.input_buffer.as_mut_ptr(),
|
||||
input_len as size_t,
|
||||
self.output_buffer.as_mut_ptr()
|
||||
);
|
||||
return &self.output_buffer[0..written_size];
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +1,7 @@
|
||||
#[allow(unused_imports)]
|
||||
|
||||
|
||||
|
||||
#[macro_use]
|
||||
extern crate lazy_static;
|
||||
|
||||
@@ -17,5 +19,6 @@ extern crate atomicwrites;
|
||||
extern crate tempdir;
|
||||
extern crate bincode;
|
||||
extern crate serde;
|
||||
extern crate libc;
|
||||
|
||||
pub mod core;
|
||||
|
||||
10
src/main.rs
Normal file
10
src/main.rs
Normal file
@@ -0,0 +1,10 @@
|
||||
extern crate tantivy;
|
||||
use tantivy::core::simdcompression::Encoder;
|
||||
|
||||
|
||||
fn main() {
|
||||
let data: Vec<u32> = vec!(2,3,3,4,12,32,34);
|
||||
let mut encoder = Encoder::new();
|
||||
let output = encoder.encode(&data);
|
||||
println!("{}", output.len());
|
||||
}
|
||||
Reference in New Issue
Block a user