mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 00:02:55 +00:00
issue 120. Using streamvbyte codec for the vbyte part of the encoding
This commit is contained in:
22
build.rs
22
build.rs
@@ -4,7 +4,8 @@ mod build {
|
||||
|
||||
pub fn build() {
|
||||
let mut config = gcc::Config::new();
|
||||
config.include("./cpp/simdcomp/include")
|
||||
config
|
||||
.include("./cpp/simdcomp/include")
|
||||
.file("cpp/simdcomp/src/avxbitpacking.c")
|
||||
.file("cpp/simdcomp/src/simdintegratedbitpacking.c")
|
||||
.file("cpp/simdcomp/src/simdbitpacking.c")
|
||||
@@ -12,24 +13,33 @@ mod build {
|
||||
.file("cpp/simdcomp/src/simdcomputil.c")
|
||||
.file("cpp/simdcomp/src/simdpackedselect.c")
|
||||
.file("cpp/simdcomp/src/simdfor.c")
|
||||
.file("cpp/simdcomp_wrapper.c");
|
||||
.file("cpp/simdcomp_wrapper.c")
|
||||
.include("./cpp/streamvbyte/include")
|
||||
.file("cpp/streamvbyte/src/streamvbyte.c")
|
||||
.file("cpp/streamvbyte/src/streamvbytedelta.c")
|
||||
;
|
||||
|
||||
if !cfg!(debug_assertions) {
|
||||
config.opt_level(3);
|
||||
|
||||
if cfg!(target_env = "msvc") {
|
||||
config.define("NDEBUG", None)
|
||||
config
|
||||
.define("NDEBUG", None)
|
||||
.flag("/Gm-")
|
||||
.flag("/GS-")
|
||||
.flag("/Gy")
|
||||
.flag("/Oi")
|
||||
.flag("/GL");
|
||||
} else {
|
||||
config.flag("-msse4.1")
|
||||
.flag("-march=native");
|
||||
}
|
||||
}
|
||||
|
||||
if !cfg!(target_env = "msvc") {
|
||||
config
|
||||
.flag("-msse4.1")
|
||||
.flag("-march=native")
|
||||
.flag("-std=c99");
|
||||
}
|
||||
|
||||
config.compile("libsimdcomp.a");
|
||||
|
||||
// Workaround for linking static libraries built with /GL
|
||||
|
||||
@@ -9,7 +9,7 @@
|
||||
|
||||
// Encode an array of a given length read from in to bout in varint format.
|
||||
// Returns the number of bytes written.
|
||||
size_t streamvbyte_encode(uint32_t *in, uint32_t length, uint8_t *out);
|
||||
size_t streamvbyte_encode(const uint32_t *in, uint32_t length, uint8_t *out);
|
||||
|
||||
// Read "length" 32-bit integers in varint format from in, storing the result in out.
|
||||
// Returns the number of bytes read.
|
||||
|
||||
@@ -12,7 +12,7 @@
|
||||
// Encode an array of a given length read from in to bout in StreamVByte format.
|
||||
// Returns the number of bytes written.
|
||||
// this version uses differential coding (coding differences between values) starting at prev (you can often set prev to zero)
|
||||
size_t streamvbyte_delta_encode(uint32_t *in, uint32_t length, uint8_t *out, uint32_t prev);
|
||||
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t length, uint8_t *out, uint32_t prev);
|
||||
|
||||
// Read "length" 32-bit integers in StreamVByte format from in, storing the result in out.
|
||||
// Returns the number of bytes read.
|
||||
|
||||
@@ -345,7 +345,7 @@ static uint8_t *svb_encode_scalar(const uint32_t *in,
|
||||
|
||||
// Encode an array of a given length read from in to bout in streamvbyte format.
|
||||
// Returns the number of bytes written.
|
||||
size_t streamvbyte_encode(uint32_t *in, uint32_t count, uint8_t *out) {
|
||||
size_t streamvbyte_encode(const uint32_t *in, uint32_t count, uint8_t *out) {
|
||||
uint8_t *keyPtr = out;
|
||||
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
|
||||
uint8_t *dataPtr = keyPtr + keyLen; // variable byte data after all keys
|
||||
|
||||
@@ -344,7 +344,7 @@ static uint8_t *svb_encode_scalar_d1_init(const uint32_t *in,
|
||||
return dataPtr; // pointer to first unused data byte
|
||||
}
|
||||
|
||||
size_t streamvbyte_delta_encode(uint32_t *in, uint32_t count, uint8_t *out,
|
||||
size_t streamvbyte_delta_encode(const uint32_t *in, uint32_t count, uint8_t *out,
|
||||
uint32_t prev) {
|
||||
uint8_t *keyPtr = out; // keys come immediately after 32-bit count
|
||||
uint32_t keyLen = (count + 3) / 4; // 2-bits rounded to full byte
|
||||
|
||||
@@ -110,7 +110,7 @@ pub mod tests {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_unsorted(&data);
|
||||
assert_eq!(compressed.len(), 19_790);
|
||||
assert!(compressed.len() <= 19_794);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_unsorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
@@ -123,7 +123,7 @@ pub mod tests {
|
||||
let data = generate_array(10_000, 0.1);
|
||||
let mut encoder = CompositeEncoder::new();
|
||||
let compressed = encoder.compress_sorted(&data);
|
||||
assert_eq!(compressed.len(), 7_822);
|
||||
assert!(compressed.len() <= 7_826);
|
||||
let mut decoder = CompositeDecoder::new();
|
||||
let result = decoder.uncompress_sorted(&compressed, data.len());
|
||||
for i in 0..data.len() {
|
||||
|
||||
@@ -4,16 +4,32 @@
|
||||
mod composite;
|
||||
pub use self::composite::{CompositeEncoder, CompositeDecoder};
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod compression_simd;
|
||||
#[cfg(feature="simdcompression")]
|
||||
pub use self::compression_simd::{BlockEncoder, BlockDecoder};
|
||||
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
mod compression_nosimd;
|
||||
mod pack {
|
||||
mod compression_pack_nosimd;
|
||||
pub use self::compression_pack_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod pack {
|
||||
mod compression_pack_simd;
|
||||
pub use self::compression_pack_simd::*;
|
||||
}
|
||||
|
||||
pub use self::pack::{BlockEncoder, BlockDecoder};
|
||||
|
||||
#[cfg(not(feature="simdcompression"))]
|
||||
pub use self::compression_nosimd::{BlockEncoder, BlockDecoder};
|
||||
mod vint {
|
||||
mod compression_vint_nosimd;
|
||||
pub use self::compression_vint_nosimd::*;
|
||||
}
|
||||
|
||||
#[cfg(feature="simdcompression")]
|
||||
mod vint {
|
||||
mod compression_vint_simd;
|
||||
pub use self::compression_vint_simd::*;
|
||||
}
|
||||
|
||||
|
||||
pub trait VIntEncoder {
|
||||
@@ -26,51 +42,16 @@ pub trait VIntDecoder {
|
||||
fn uncompress_vint_unsorted<'a>(&mut self, compressed_data: &'a [u8], num_els: usize) -> &'a [u8];
|
||||
}
|
||||
|
||||
impl VIntEncoder for BlockEncoder{
|
||||
impl VIntEncoder for BlockEncoder {
|
||||
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], mut offset: u32) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v - offset;
|
||||
offset = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
fn compress_vint_sorted(&mut self, input: &[u32], offset: u32) -> &[u8] {
|
||||
vint::compress_sorted(input, &mut self.output, offset)
|
||||
}
|
||||
|
||||
fn compress_vint_unsorted(&mut self, input: &[u32]) -> &[u8] {
|
||||
let mut byte_written = 0;
|
||||
for &v in input {
|
||||
let mut to_encode: u32 = v;
|
||||
loop {
|
||||
let next_byte: u8 = (to_encode % 128u32) as u8;
|
||||
to_encode /= 128u32;
|
||||
if to_encode == 0u32 {
|
||||
self.output[byte_written] = next_byte | 128u8;
|
||||
byte_written += 1;
|
||||
break;
|
||||
}
|
||||
else {
|
||||
self.output[byte_written] = next_byte;
|
||||
byte_written += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
&self.output[..byte_written]
|
||||
vint::compress_unsorted(input, &mut self.output)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl VIntDecoder for BlockDecoder {
|
||||
|
||||
@@ -79,52 +60,19 @@ impl VIntDecoder for BlockDecoder {
|
||||
compressed_data: &'a [u8],
|
||||
offset: u32,
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
let mut result = offset;
|
||||
for i in 0..num_els {
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
vint::uncompress_sorted(compressed_data, &mut self.output[..num_els], offset)
|
||||
}
|
||||
|
||||
fn uncompress_vint_unsorted<'a>(
|
||||
&mut self,
|
||||
compressed_data: &'a [u8],
|
||||
num_els: usize) -> &'a [u8] {
|
||||
let mut read_byte = 0;
|
||||
for i in 0..num_els {
|
||||
let mut result = 0u32;
|
||||
let mut shift = 0u32;
|
||||
loop {
|
||||
let cur_byte = compressed_data[read_byte];
|
||||
read_byte += 1;
|
||||
result += ((cur_byte % 128u8) as u32) << shift;
|
||||
if cur_byte & 128u8 != 0u8 {
|
||||
break;
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
self.output[i] = result;
|
||||
}
|
||||
self.output_len = num_els;
|
||||
&compressed_data[read_byte..]
|
||||
}
|
||||
|
||||
vint::uncompress_unsorted(compressed_data, &mut self.output[..num_els])
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
|
||||
|
||||
pub const NUM_DOCS_PER_BLOCK: usize = 128; //< should be a power of 2 to let the compiler optimize.
|
||||
|
||||
@@ -224,7 +172,7 @@ pub mod tests {
|
||||
#[test]
|
||||
fn test_encode_vint() {
|
||||
{
|
||||
let expected_length = 123;
|
||||
let expected_length = 154;
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input: Vec<u32> = (0u32..123u32)
|
||||
.map(|i| 4 + i * 7 / 2)
|
||||
@@ -232,23 +180,13 @@ pub mod tests {
|
||||
.collect();
|
||||
for offset in &[0u32, 1u32, 2u32] {
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, *offset);
|
||||
assert_eq!(encoded_data.len(), expected_length);
|
||||
assert!(encoded_data.len() <= expected_length);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
let remaining_data = decoder.uncompress_vint_sorted(&encoded_data, *offset, input.len());
|
||||
assert_eq!(0, remaining_data.len());
|
||||
assert_eq!(input, decoder.output_array());
|
||||
}
|
||||
}
|
||||
{
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let input = vec!(3u32, 17u32, 187u32);
|
||||
let encoded_data = encoder.compress_vint_sorted(&input, 0);
|
||||
assert_eq!(encoded_data.len(), 4);
|
||||
assert_eq!(encoded_data[0], 3u8 + 128u8);
|
||||
assert_eq!(encoded_data[1], (17u8 - 3u8) + 128u8);
|
||||
assert_eq!(encoded_data[2], (187u8 - 17u8 - 128u8));
|
||||
assert_eq!(encoded_data[3], (1u8 + 128u8));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -272,4 +210,27 @@ pub mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
|
||||
const NUM_INTS_BENCH_VINT: usize = 10;
|
||||
|
||||
#[bench]
|
||||
fn bench_compress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
b.iter(|| {
|
||||
encoder.compress_vint_sorted(&data, 0u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_uncompress_vint(b: &mut Bencher) {
|
||||
let mut encoder = BlockEncoder::new();
|
||||
let data = generate_array(NUM_INTS_BENCH_VINT, 0.001);
|
||||
let compressed = encoder.compress_vint_sorted(&data, 0u32);
|
||||
let mut decoder = BlockDecoder::new();
|
||||
b.iter(|| {
|
||||
decoder.uncompress_vint_sorted(compressed, 0u32, NUM_INTS_BENCH_VINT);
|
||||
});
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use common::bitpacker::compute_num_bits;
|
||||
use common::bitpacker::{BitPacker, BitUnpacker};
|
||||
use std::cmp;
|
||||
use std::io::Write;
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use super::NUM_DOCS_PER_BLOCK;
|
||||
use super::super::NUM_DOCS_PER_BLOCK;
|
||||
|
||||
const COMPRESSED_BLOCK_MAX_SIZE: usize = NUM_DOCS_PER_BLOCK * 4 + 1;
|
||||
|
||||
92
src/compression/vint/compression_vint_nosimd.rs
Normal file
92
src/compression/vint/compression_vint_nosimd.rs
Normal file
@@ -0,0 +1,92 @@
|
||||
|
||||
/// Writes `value` into `output` starting at index `cursor` as a
/// variable-length integer and returns the index just past the last byte
/// written.
///
/// The value is emitted 7 bits at a time, least-significant group first.
/// The high bit (128) is set on the final byte of the value and clear on
/// every byte before it.
///
/// # Panics
///
/// Panics if `output` is too short to hold the encoded bytes.
#[inline(always)]
fn write_vint(mut value: u32, output: &mut [u8], mut cursor: usize) -> usize {
    loop {
        let low_bits: u8 = (value % 128u32) as u8;
        value /= 128u32;
        if value == 0u32 {
            // Last group of this value: flag it with the high bit.
            output[cursor] = low_bits | 128u8;
            return cursor + 1;
        }
        output[cursor] = low_bits;
        cursor += 1;
    }
}

/// Reads one variable-length integer from `compressed_data` starting at
/// `*cursor`, advances `*cursor` past it and returns the decoded value.
///
/// Bytes are consumed until one with the high bit (128) set is found; the
/// low 7 bits of each byte are accumulated, least-significant group first.
///
/// # Panics
///
/// Panics if the data ends before a byte with the high bit set is found.
#[inline(always)]
fn read_vint(compressed_data: &[u8], cursor: &mut usize) -> u32 {
    let mut value = 0u32;
    let mut shift = 0u32;
    loop {
        let cur_byte = compressed_data[*cursor];
        *cursor += 1;
        value += ((cur_byte % 128u8) as u32) << shift;
        if cur_byte & 128u8 != 0u8 {
            return value;
        }
        shift += 7;
    }
}

/// Delta-encodes `input` into `output` and returns the written prefix of
/// `output`.
///
/// Each value is stored as the difference with the previous value (the first
/// one as the difference with `offset`), written as a variable-length integer.
/// The subtraction is a plain `v - offset`, so every value is expected to be
/// greater than or equal to its predecessor.
///
/// # Panics
///
/// Panics if `output` is too short to hold the encoded bytes.
#[inline(always)]
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], mut offset: u32) -> &'a [u8] {
    let mut byte_written = 0;
    for &v in input {
        byte_written = write_vint(v - offset, output, byte_written);
        offset = v;
    }
    &output[..byte_written]
}

/// Encodes every value of `input` as a variable-length integer into `output`
/// and returns the written prefix of `output`.
///
/// # Panics
///
/// Panics if `output` is too short to hold the encoded bytes.
#[inline(always)]
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
    let mut byte_written = 0;
    for &v in input {
        byte_written = write_vint(v, output, byte_written);
    }
    &output[..byte_written]
}

/// Decodes `output.len()` delta-encoded values from `compressed_data` into
/// `output` and returns the bytes of `compressed_data` that were not consumed.
///
/// `offset` is the value the first delta is relative to; it must match the
/// `offset` that was given to `compress_sorted`.
///
/// # Panics
///
/// Panics if `compressed_data` ends before `output.len()` values were read.
#[inline(always)]
pub fn uncompress_sorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32],
    offset: u32) -> &'a [u8] {
    let mut read_byte = 0;
    // Running sum of the deltas decoded so far.
    let mut result = offset;
    for slot in output.iter_mut() {
        result += read_vint(compressed_data, &mut read_byte);
        *slot = result;
    }
    &compressed_data[read_byte..]
}

/// Decodes `output.len()` variable-length integers from `compressed_data`
/// into `output` and returns the bytes of `compressed_data` that were not
/// consumed.
///
/// # Panics
///
/// Panics if `compressed_data` ends before `output.len()` values were read.
#[inline(always)]
pub fn uncompress_unsorted<'a>(
    compressed_data: &'a [u8],
    output: &mut [u32]) -> &'a [u8] {
    let mut read_byte = 0;
    for slot in output.iter_mut() {
        *slot = read_vint(compressed_data, &mut read_byte);
    }
    &compressed_data[read_byte..]
}
|
||||
82
src/compression/vint/compression_vint_simd.rs
Normal file
82
src/compression/vint/compression_vint_simd.rs
Normal file
@@ -0,0 +1,82 @@
|
||||
|
||||
mod streamvbyte {
|
||||
|
||||
use libc::size_t;
|
||||
|
||||
extern {
|
||||
pub fn streamvbyte_delta_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn streamvbyte_delta_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: u32,
|
||||
offset: u32) -> size_t;
|
||||
|
||||
pub fn streamvbyte_encode(
|
||||
data: *const u32,
|
||||
num_els: u32,
|
||||
output: *mut u8) -> size_t;
|
||||
|
||||
pub fn streamvbyte_decode(
|
||||
compressed_data: *const u8,
|
||||
output: *mut u32,
|
||||
num_els: usize) -> size_t;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_sorted<'a>(input: &[u32], output: &'a mut [u8], offset: u32) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_delta_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr(),
|
||||
offset)
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn compress_unsorted<'a>(input: &[u32], output: &'a mut [u8]) -> &'a [u8] {
|
||||
let compress_length = unsafe {
|
||||
streamvbyte::streamvbyte_encode(
|
||||
input.as_ptr(),
|
||||
input.len() as u32,
|
||||
output.as_mut_ptr())
|
||||
};
|
||||
&output[..compress_length]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_sorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32],
|
||||
offset: u32) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_delta_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len() as u32,
|
||||
offset)
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn uncompress_unsorted<'a>(
|
||||
compressed_data: &'a [u8],
|
||||
output: &mut [u32]) -> &'a [u8] {
|
||||
let consumed_bytes = unsafe {
|
||||
streamvbyte::streamvbyte_decode(
|
||||
compressed_data.as_ptr(),
|
||||
output.as_mut_ptr(),
|
||||
output.len())
|
||||
};
|
||||
&compressed_data[consumed_bytes..]
|
||||
}
|
||||
|
||||
@@ -67,7 +67,7 @@ mod tests {
|
||||
posting_serializer.close_term().unwrap();
|
||||
posting_serializer.close().unwrap();
|
||||
let read = segment.open_read(SegmentComponent::POSITIONS).unwrap();
|
||||
assert_eq!(read.len(), 13);
|
||||
assert!(read.len() <= 16);
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user