mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-05-21 02:30:43 +00:00
issue/55 externalizing the bitpacker
This commit is contained in:
157
src/common/bitpacker.rs
Normal file
157
src/common/bitpacker.rs
Normal file
@@ -0,0 +1,157 @@
|
||||
use std::io::Write;
|
||||
use std::io;
|
||||
use common::serialize::BinarySerializable;
|
||||
use std::mem;
|
||||
|
||||
|
||||
/// Returns the number of bits required to represent `amplitude`.
///
/// This is the position of the highest set bit, counted from one,
/// so `0` needs zero bits and `u32` values with the top bit set need 32.
pub fn compute_num_bits(amplitude: u32) -> u8 {
    let unused_high_bits = amplitude.leading_zeros();
    let significant_bits = 32u32 - unused_high_bits;
    significant_bits as u8
}
|
||||
|
||||
/// Packs a sequence of fixed-width unsigned integers into a byte stream.
///
/// Values are accumulated least-significant-bit first into a 64-bit
/// buffer. Each time the buffer fills up it is emitted as 8 little-endian
/// bytes; `close` emits whatever bits remain, rounded up to whole bytes.
///
/// Callers are expected to pass values that fit in `num_bits` bits:
/// `write` does not mask its input, so higher bits would bleed into the
/// following values.
pub struct BitPacker<TWrite: Write> {
    /// Destination of the packed bytes.
    output: TWrite,
    /// Accumulator holding the bits that have not been emitted yet.
    mini_buffer: u64,
    /// Number of bits of `mini_buffer` currently in use (always < 64
    /// between calls).
    mini_buffer_written: usize,
    /// Width, in bits, of every value written.
    num_bits: usize,
    /// Total number of bytes emitted to `output` so far.
    written_size: usize,
}

impl<TWrite: Write> BitPacker<TWrite> {
    /// Creates a packer writing `num_bits`-wide values to `output`.
    pub fn new(output: TWrite, num_bits: usize) -> BitPacker<TWrite> {
        BitPacker {
            output,
            mini_buffer: 0u64,
            mini_buffer_written: 0,
            num_bits,
            written_size: 0,
        }
    }

    /// Appends the low `num_bits` bits of `val` to the stream.
    ///
    /// # Errors
    ///
    /// Returns any error raised by the underlying writer when a full
    /// 64-bit word has to be emitted.
    pub fn write(&mut self, val: u32) -> io::Result<()> {
        let val_u64 = val as u64;
        if self.mini_buffer_written + self.num_bits > 64 {
            // The value straddles two words: its low bits complete the
            // current word, its high bits start the next one.
            self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
            self.emit_word()?;
            self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
            self.mini_buffer_written = self.mini_buffer_written + self.num_bits - 64;
        } else {
            self.mini_buffer |= val_u64 << self.mini_buffer_written;
            self.mini_buffer_written += self.num_bits;
            if self.mini_buffer_written == 64 {
                // The value ended exactly on the word boundary.
                self.emit_word()?;
                self.mini_buffer_written = 0;
                self.mini_buffer = 0u64;
            }
        }
        Ok(())
    }

    /// Writes the whole accumulator as 8 little-endian bytes.
    ///
    /// The byte order is fixed (rather than native) so that the output
    /// is identical on every platform and agrees with `flush`.
    fn emit_word(&mut self) -> io::Result<()> {
        self.output.write_all(&self.mini_buffer.to_le_bytes())?;
        self.written_size += 8;
        Ok(())
    }

    /// Emits the bits still pending in the accumulator, rounded up to a
    /// whole number of bytes.
    fn flush(&mut self) -> io::Result<()> {
        if self.mini_buffer_written > 0 {
            let num_bytes = (self.mini_buffer_written + 7) / 8;
            // Little-endian puts the low-order (already filled) bytes
            // first, which are exactly the ones that must be kept.
            // A native-endian view would pick the wrong bytes on
            // big-endian targets.
            let bytes = self.mini_buffer.to_le_bytes();
            self.output.write_all(&bytes[..num_bytes])?;
            self.written_size += num_bytes;
            self.mini_buffer_written = 0;
            self.mini_buffer = 0u64;
        }
        Ok(())
    }

    /// Flushes the pending bits and returns the underlying writer along
    /// with the total number of bytes written.
    ///
    /// # Errors
    ///
    /// Returns any error raised by the underlying writer while flushing.
    pub fn close(mut self) -> io::Result<(TWrite, usize)> {
        self.flush()?;
        Ok((self.output, self.written_size))
    }
}
|
||||
|
||||
|
||||
|
||||
/// Random-access reader over a byte slice of bit-packed values.
///
/// The layout is the one produced by packing fixed-width values
/// least-significant-bit first into little-endian bytes: value `idx`
/// starts at bit `idx * num_bits` of the slice.
pub struct BitUnpacker<'a> {
    /// Packed bytes.
    data: &'a [u8],
    /// Width, in bits, of every packed value.
    num_bits: usize,
    /// Mask keeping the low `num_bits` bits of a decoded word.
    mask: u32,
}

impl<'a> BitUnpacker<'a> {
    /// Creates an unpacker reading `num_bits`-wide values from `data`.
    ///
    /// Widths of 32 bits or more select the full `u32` range.
    pub fn new(data: &'a [u8], num_bits: usize) -> BitUnpacker<'a> {
        // `1u32 << 32` overflows, so the full-width mask is special-cased.
        let mask = if num_bits >= 32 {
            !0u32
        } else {
            (1u32 << num_bits) - 1u32
        };
        BitUnpacker {
            data,
            num_bits,
            mask,
        }
    }

    /// Returns the value stored at position `idx`.
    ///
    /// Always returns `0` when the width is zero. Bits lying past the
    /// end of the slice are read as zero.
    ///
    /// # Panics
    ///
    /// Panics if the value starts beyond the end of the slice.
    pub fn get(&self, idx: usize) -> u32 {
        if self.num_bits == 0 {
            return 0;
        }
        let bit_offset = idx * self.num_bits;
        let addr = bit_offset / 8;
        let bit_shift = bit_offset % 8;

        // Load up to 8 bytes starting at `addr` into a zero-padded word.
        // Copying into an array avoids an unaligned pointer dereference.
        let mut word = [0u8; 8];
        match self.data.get(addr..addr + 8) {
            Some(chunk) => word.copy_from_slice(chunk),
            None => {
                // Fewer than 8 bytes remain: copy what is there.
                let tail = &self.data[addr..];
                word[..tail.len()].copy_from_slice(tail);
            }
        }

        // Decode as little-endian regardless of the host byte order.
        let val_shifted = (u64::from_le_bytes(word) >> bit_shift) as u32;
        val_shifted & self.mask
    }
}
|
||||
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::{BitPacker, BitUnpacker, compute_num_bits};

    /// Checks the bit width computed for a handful of amplitudes.
    #[test]
    fn test_compute_num_bits() {
        let expectations: [(u32, u8); 7] = [
            (1, 1u8),
            (0, 0u8),
            (2, 2u8),
            (3, 2u8),
            (4, 3u8),
            (255, 8u8),
            (256, 9u8),
        ];
        for &(amplitude, expected) in expectations.iter() {
            assert_eq!(compute_num_bits(amplitude), expected);
        }
    }

    /// Packs `len` values of `num_bits` bits each, then checks the
    /// reported size and that every value can be read back.
    fn test_bitpacker_util(len: usize, num_bits: usize) {
        let max_val: u32 = (1 << num_bits) - 1;
        // A zero width can only hold zeros; otherwise cycle below `max_val`.
        let vals: Vec<u32> = (0u32..len as u32)
            .map(|i| if max_val == 0 { 0 } else { i % max_val })
            .collect();

        let mut bitpacker = BitPacker::new(Vec::new(), num_bits);
        for val in vals.iter() {
            bitpacker.write(*val).unwrap();
        }
        let (data, num_bytes) = bitpacker.close().unwrap();

        let expected_num_bytes = (num_bits * len + 7) / 8;
        assert_eq!(num_bytes, expected_num_bytes);
        assert_eq!(data.len(), num_bytes);

        let bitunpacker = BitUnpacker::new(&data, num_bits);
        for (i, &val) in vals.iter().enumerate() {
            assert_eq!(bitunpacker.get(i), val);
        }
    }

    /// Round-trips several (length, width) combinations.
    #[test]
    fn test_bitpacker() {
        let cases: [(usize, usize); 5] = [(10, 3), (10, 0), (10, 1), (6, 14), (1000, 14)];
        for &(len, num_bits) in cases.iter() {
            test_bitpacker_util(len, num_bits);
        }
    }
}
|
||||
@@ -1,12 +1,17 @@
|
||||
mod serialize;
|
||||
mod timer;
|
||||
mod vint;
|
||||
mod bitpacker;
|
||||
|
||||
|
||||
pub use self::serialize::BinarySerializable;
|
||||
pub use self::timer::Timing;
|
||||
pub use self::timer::TimerTree;
|
||||
pub use self::timer::OpenTimer;
|
||||
pub use self::vint::VInt;
|
||||
pub use self::bitpacker::compute_num_bits;
|
||||
|
||||
|
||||
use std::io;
|
||||
|
||||
pub fn make_io_err(msg: String) -> io::Error {
|
||||
@@ -26,36 +31,3 @@ pub trait HasLen {
|
||||
}
|
||||
|
||||
|
||||
/// Returns the number of zero bits preceding the highest set bit of
/// `val`, or 32 when `val` is zero.
fn count_leading_zeros(val: u32) -> u8 {
    // The standard library intrinsic replaces the manual shift loop and
    // already yields 32 for a zero input. The result is at most 32, so
    // the narrowing cast is lossless.
    val.leading_zeros() as u8
}
|
||||
|
||||
|
||||
pub fn compute_num_bits(amplitude: u32) -> u8 {
|
||||
32u8 - count_leading_zeros(amplitude)
|
||||
}
|
||||
|
||||
|
||||
|
||||
#[cfg(test)]
mod test {
    use super::compute_num_bits;

    /// Checks the bit width computed for a handful of amplitudes.
    // The `#[test]` attribute is required for the harness to run this
    // function; without it the assertions are never executed.
    #[test]
    fn test_compute_num_bits() {
        assert_eq!(compute_num_bits(1), 1u8);
        assert_eq!(compute_num_bits(0), 0u8);
        assert_eq!(compute_num_bits(2), 2u8);
        assert_eq!(compute_num_bits(3), 2u8);
        assert_eq!(compute_num_bits(4), 3u8);
        assert_eq!(compute_num_bits(255), 8u8);
        assert_eq!(compute_num_bits(256), 9u8);
    }
}
|
||||
@@ -1,5 +1,6 @@
|
||||
|
||||
use byteorder::{NativeEndian, ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::{ReadBytesExt, WriteBytesExt};
|
||||
use byteorder::LittleEndian as Endianness;
|
||||
use std::fmt;
|
||||
use std::io::Write;
|
||||
use std::io::Read;
|
||||
@@ -59,13 +60,13 @@ impl<Left: BinarySerializable, Right: BinarySerializable> BinarySerializable for
|
||||
|
||||
impl BinarySerializable for u32 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_u32::<NativeEndian>(*self)
|
||||
writer.write_u32::<Endianness>(*self)
|
||||
.map(|_| 4)
|
||||
.map_err(convert_byte_order_error)
|
||||
}
|
||||
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u32> {
|
||||
reader.read_u32::<NativeEndian>()
|
||||
reader.read_u32::<Endianness>()
|
||||
.map_err(convert_byte_order_error)
|
||||
}
|
||||
}
|
||||
@@ -73,12 +74,12 @@ impl BinarySerializable for u32 {
|
||||
|
||||
impl BinarySerializable for u64 {
|
||||
fn serialize(&self, writer: &mut Write) -> io::Result<usize> {
|
||||
writer.write_u64::<NativeEndian>(*self)
|
||||
writer.write_u64::<Endianness>(*self)
|
||||
.map(|_| 8)
|
||||
.map_err(convert_byte_order_error)
|
||||
}
|
||||
fn deserialize(reader: &mut Read) -> io::Result<u64> {
|
||||
reader.read_u64::<NativeEndian>()
|
||||
reader.read_u64::<Endianness>()
|
||||
.map_err(convert_byte_order_error)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,8 +22,6 @@ pub use self::serializer::FastFieldSerializer;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use common::compute_num_bits;
|
||||
use super::*;
|
||||
use schema::Field;
|
||||
use std::path::Path;
|
||||
|
||||
@@ -2,6 +2,8 @@ use common::BinarySerializable;
|
||||
use directory::WritePtr;
|
||||
use schema::Field;
|
||||
use common::compute_num_bits;
|
||||
use std::io;
|
||||
use std::io::{Write, Seek, SeekFrom};
|
||||
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
|
||||
Reference in New Issue
Block a user