mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-07 01:32:53 +00:00
blop
This commit is contained in:
2
build.rs
2
build.rs
@@ -12,7 +12,7 @@ fn main() {
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/usimdbitpacking.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdintegratedbitpacking.o")
|
||||
// .object("cpp/SIMDCompressionAndIntersection/intersection.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/intersection.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/varintdecode.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/streamvbyte.o")
|
||||
.object("cpp/SIMDCompressionAndIntersection/simdpackedsearch.o")
|
||||
|
||||
@@ -8,8 +8,10 @@ using namespace SIMDCompressionLib;
|
||||
|
||||
static shared_ptr<IntegerCODEC> codec = CODECFactory::getFromName("s4-bp128-dm");
|
||||
|
||||
|
||||
extern "C" {
|
||||
|
||||
|
||||
|
||||
size_t encode_native(
|
||||
uint32_t* begin,
|
||||
const size_t num_els,
|
||||
@@ -32,4 +34,14 @@ extern "C" {
|
||||
codec -> decodeArray(compressed_data, compressed_size, uncompressed, num_ints);
|
||||
return num_ints;
|
||||
}
|
||||
|
||||
size_t intersection_native(
|
||||
const uint32_t* left,
|
||||
const size_t left_size,
|
||||
const uint32_t* right,
|
||||
const size_t right_size,
|
||||
uint32_t* output) {
|
||||
return IntersectionFactory::getFromName("simd")(left, left_size, right, right_size, output);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -3,23 +3,17 @@ use std::io::Seek;
|
||||
use std::io::Write;
|
||||
use std::io::Cursor;
|
||||
use std::fs::File;
|
||||
use fst::Map;
|
||||
use fst::MapBuilder;
|
||||
use std::rc::Rc;
|
||||
use fst::raw::Fst;
|
||||
use core::serialize::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
use fst;
|
||||
use fst::raw::MmapReadOnly;
|
||||
use std::ops::Deref;
|
||||
|
||||
use core::serialize::BinarySerializable;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
fn convert_fst_error(e: fst::Error) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::Other, e)
|
||||
}
|
||||
|
||||
pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
fst_builder: MapBuilder<W>,
|
||||
fst_builder: fst::MapBuilder<W>,
|
||||
data: Vec<u8>,
|
||||
_phantom_: PhantomData<V>,
|
||||
}
|
||||
@@ -27,7 +21,7 @@ pub struct FstMapBuilder<W: Write, V: BinarySerializable> {
|
||||
impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
|
||||
fn new(w: W) -> io::Result<FstMapBuilder<W, V>> {
|
||||
let fst_builder = try!(MapBuilder::new(w).map_err(convert_fst_error));
|
||||
let fst_builder = try!(fst::MapBuilder::new(w).map_err(convert_fst_error));
|
||||
Ok(FstMapBuilder {
|
||||
fst_builder: fst_builder,
|
||||
data: Vec::new(),
|
||||
@@ -44,12 +38,14 @@ impl<W: Write, V: BinarySerializable> FstMapBuilder<W, V> {
|
||||
}
|
||||
|
||||
fn close(self,) -> io::Result<W> {
|
||||
let mut file = try!(self.fst_builder
|
||||
let mut file = try!(
|
||||
self.fst_builder
|
||||
.into_inner()
|
||||
.map_err(convert_fst_error));
|
||||
let footer_size = self.data.len();
|
||||
let footer_size = self.data.len() as u32;
|
||||
file.write_all(&self.data);
|
||||
(footer_size as u32).serialize(&mut file);
|
||||
file.flush();
|
||||
Ok(file)
|
||||
}
|
||||
}
|
||||
@@ -63,14 +59,14 @@ pub struct FstMap<V: BinarySerializable> {
|
||||
|
||||
|
||||
impl<V: BinarySerializable> FstMap<V> {
|
||||
pub fn open(file: &File) -> io::Result<FstMap<V>> {
|
||||
pub fn open(file: File) -> io::Result<FstMap<V>> {
|
||||
let mmap = try!(MmapReadOnly::open(&file));
|
||||
let mut cursor = Cursor::new(unsafe {mmap.as_slice()});
|
||||
try!(cursor.seek(io::SeekFrom::End(-4)));
|
||||
let footer_size = try!(u32::deserialize(&mut cursor)) as usize;
|
||||
let split_len = mmap.len() - 4 - footer_size;
|
||||
let fst_mmap = mmap.range(0, split_len);
|
||||
let values_mmap = mmap.range(split_len, mmap.len() - 4);
|
||||
let values_mmap = mmap.range(split_len, footer_size);
|
||||
let fst = try!(fst::raw::Fst::from_mmap(fst_mmap).map_err(convert_fst_error));
|
||||
Ok(FstMap {
|
||||
fst_index: fst::Map::from(fst),
|
||||
@@ -92,13 +88,21 @@ impl<V: BinarySerializable> FstMap<V> {
|
||||
|
||||
|
||||
mod tests {
|
||||
use super::{FstMapBuilder, FstMap};
|
||||
|
||||
use super::*;
|
||||
use tempfile;
|
||||
|
||||
#[test]
|
||||
fn test_fstmap() {
|
||||
let mut fst_map_builder: FstMapBuilder<Vec<u8>, u32> = FstMapBuilder::new(Vec::new()).unwrap();
|
||||
fst_map_builder.insert("abc".as_bytes(), &34).unwrap();
|
||||
fst_map_builder.insert("abcd".as_bytes(), &343).unwrap();
|
||||
let data = fst_map_builder.close().unwrap();
|
||||
let fstmap_file;
|
||||
{
|
||||
let tempfile = tempfile::tempfile().unwrap(); // 41
|
||||
let mut fstmap_builder = FstMapBuilder::new(tempfile).unwrap();
|
||||
fstmap_builder.insert("abc".as_bytes(), &34u32).unwrap();
|
||||
fstmap_builder.insert("abcd".as_bytes(), &346u32).unwrap();
|
||||
fstmap_file = fstmap_builder.close().unwrap();
|
||||
}
|
||||
let fstmap = FstMap::open(fstmap_file).unwrap();
|
||||
assert_eq!(fstmap.get("abc"), Some(34u32));
|
||||
assert_eq!(fstmap.get("abcd"), Some(346u32));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -120,9 +120,6 @@ impl<T: Postings> Iterator for IntersectionPostings<T> {
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
@@ -162,38 +159,3 @@ mod tests {
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
use super::*;
|
||||
use test::Bencher;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::StdRng;
|
||||
|
||||
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
let seed: &[_] = &[1, 2, 3, 4];
|
||||
let mut rng: StdRng = SeedableRng::from_seed(seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32()< ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intersection(b: &mut Bencher) {
|
||||
const TEST_SIZE: usize = 100_000;
|
||||
let arr = generate_array(TEST_SIZE, 0.1);
|
||||
let mut encoder = Encoder::new();
|
||||
let encoded = encoder.encode(&arr);
|
||||
let mut uncompressed: Vec<u32> = (0..TEST_SIZE as u32).collect();
|
||||
let decoder = Decoder;
|
||||
b.iter(|| {
|
||||
decoder.decode(&encoded, &mut uncompressed);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
|
||||
use libc::size_t;
|
||||
use std::ptr;
|
||||
use std::cmp::min;
|
||||
use std::iter;
|
||||
|
||||
|
||||
|
||||
extern {
|
||||
fn encode_native(data: *mut u32, num_els: size_t, output: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn decode_native(compressed_data: *const u32, compressed_size: size_t, uncompressed: *mut u32, output_capacity: size_t) -> size_t;
|
||||
fn intersection_native(left_data: *const u32, left_size: size_t, right_data: *const u32, right_size: size_t, output: *mut u32) -> size_t;
|
||||
}
|
||||
|
||||
pub struct Encoder {
|
||||
@@ -27,9 +29,9 @@ impl Encoder {
|
||||
self.input_buffer.clear();
|
||||
let input_len = input.len();
|
||||
if input_len + 10000 >= self.input_buffer.len() {
|
||||
self.input_buffer = (0..(input_len as u32) + 1024).collect();
|
||||
self.output_buffer = (0..(input_len as u32) + 1024).collect();
|
||||
// TODO use resize when available
|
||||
let target_length = input_len + 1024;
|
||||
self.input_buffer.resize(target_length, 0);
|
||||
self.output_buffer.resize(target_length, 0);
|
||||
}
|
||||
// TODO use clone_from when available
|
||||
unsafe {
|
||||
@@ -68,21 +70,32 @@ impl Decoder {
|
||||
}
|
||||
}
|
||||
|
||||
pub struct Intersector {
|
||||
output_buffer: Vec<u32>,
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_big() {
|
||||
let mut encoder = Encoder::new();
|
||||
let num_ints = 10000 as usize;
|
||||
let expected_length = 1274;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter().collect();
|
||||
let encoded_data = encoder.encode(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
impl Intersector {
|
||||
fn new() -> Intersector {
|
||||
Intersector::with_capacity(1_000_000)
|
||||
}
|
||||
fn with_capacity(capacity: usize) -> Intersector {
|
||||
Intersector {
|
||||
output_buffer: iter::repeat(0u32).take(capacity).collect()
|
||||
}
|
||||
}
|
||||
fn intersection(&mut self, left: &[u32], right: &[u32]) -> &[u32] {
|
||||
let max_intersection_length = min(left.len(), right.len());
|
||||
if self.output_buffer.len() < max_intersection_length {
|
||||
self.output_buffer.resize(max_intersection_length, 0);
|
||||
}
|
||||
unsafe {
|
||||
let intersection_len = intersection_native(
|
||||
left.as_ptr(), left.len() as size_t,
|
||||
right.as_ptr(), right.len() as size_t,
|
||||
self.output_buffer.as_mut_ptr());
|
||||
return &self.output_buffer[0..intersection_len];
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -94,17 +107,50 @@ mod tests {
|
||||
use test::Bencher;
|
||||
use rand::Rng;
|
||||
use rand::SeedableRng;
|
||||
use rand::StdRng;
|
||||
use rand::XorShiftRng;
|
||||
|
||||
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
let seed: &[_] = &[1, 2, 3, 4];
|
||||
let mut rng: StdRng = SeedableRng::from_seed(seed);
|
||||
|
||||
|
||||
fn generate_array_with_seed(n: usize, ratio: f32, seed_val: u32) -> Vec<u32> {
|
||||
let seed: &[u32; 4] = &[1, 2, 3, seed_val];
|
||||
let mut rng: XorShiftRng = XorShiftRng::from_seed(*seed);
|
||||
(0..u32::max_value())
|
||||
.filter(|_| rng.next_f32()< ratio)
|
||||
.take(n)
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn generate_array(n: usize, ratio: f32) -> Vec<u32> {
|
||||
generate_array_with_seed(n, ratio, 4)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_encode_big() {
|
||||
let mut encoder = Encoder::new();
|
||||
let num_ints = 10000 as usize;
|
||||
let expected_length = 1274;
|
||||
let input: Vec<u32> = (0..num_ints as u32)
|
||||
.map(|i| i * 7 / 2)
|
||||
.into_iter().collect();
|
||||
let encoded_data = encoder.encode(&input);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
let decoder = Decoder::new();
|
||||
let mut decoded_data: Vec<u32> = (0..num_ints as u32).collect();
|
||||
assert_eq!(num_ints, decoder.decode(&encoded_data[..], &mut decoded_data));
|
||||
assert_eq!(decoded_data, input);
|
||||
}
|
||||
|
||||
|
||||
#[test]
|
||||
fn test_simd_intersection() {
|
||||
let mut intersector = Intersector::new();
|
||||
let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
|
||||
let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
|
||||
let intersection = intersector.intersection(&arr1[..], &arr2[..]) ;
|
||||
assert_eq!(intersection.len(), 500_233);
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn bench_decode(b: &mut Bencher) {
|
||||
const TEST_SIZE: usize = 1_000_000;
|
||||
@@ -117,4 +163,15 @@ mod tests {
|
||||
decoder.decode(&encoded, &mut uncompressed);
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
#[bench]
|
||||
fn bench_simd_intersection(b: &mut Bencher) {
|
||||
let mut intersector = Intersector::new();
|
||||
let arr1 = generate_array_with_seed(1_000_000, 0.1, 2);
|
||||
let arr2 = generate_array_with_seed(5_000_000, 0.5, 3);
|
||||
b.iter(|| {
|
||||
intersector.intersection(&arr1[..], &arr2[..]).len()
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user