move tantivy bitpacker to crate, refator bitpacker

remove byteorder dependency
This commit is contained in:
Pascal Seitz
2021-04-29 16:40:02 +02:00
parent 88a1a90c3c
commit daa53522b5
9 changed files with 104 additions and 30 deletions

View File

@@ -35,6 +35,7 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
crossbeam = "0.8"
futures = {version = "0.3", features=["thread-pool"] }
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
stable_deref_trait = "1"
rust-stemmers = "1"
downcast-rs = "1"
@@ -86,7 +87,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar"]
members = ["query-grammar", "bitpacker"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

8
bitpacker/Cargo.toml Normal file
View File

@@ -0,0 +1,8 @@
[package]
name = "tantivy-bitpacker"
version = "0.1.0"
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]

View File

@@ -1,9 +1,6 @@
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
use std::io;
use std::{convert::TryInto, io};
use crate::directory::OwnedBytes;
pub(crate) struct BitPacker {
pub struct BitPacker {
mini_buffer: u64,
mini_buffer_written: usize,
}
@@ -26,14 +23,14 @@ impl BitPacker {
let num_bits = num_bits as usize;
if self.mini_buffer_written + num_bits > 64 {
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
output.write_u64::<LittleEndian>(self.mini_buffer)?;
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
} else {
self.mini_buffer |= val_u64 << self.mini_buffer_written;
self.mini_buffer_written += num_bits;
if self.mini_buffer_written == 64 {
output.write_u64::<LittleEndian>(self.mini_buffer)?;
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
self.mini_buffer_written = 0;
self.mini_buffer = 0u64;
}
@@ -44,9 +41,8 @@ impl BitPacker {
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
if self.mini_buffer_written > 0 {
let num_bytes = (self.mini_buffer_written + 7) / 8;
let mut arr: [u8; 8] = [0u8; 8];
LittleEndian::write_u64(&mut arr, self.mini_buffer);
output.write_all(&arr[..num_bytes])?;
let bytes = self.mini_buffer.to_le_bytes();
output.write_all(&bytes[..num_bytes])?;
self.mini_buffer_written = 0;
}
Ok(())
@@ -64,11 +60,10 @@ impl BitPacker {
pub struct BitUnpacker {
num_bits: u64,
mask: u64,
data: OwnedBytes,
}
impl BitUnpacker {
pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker {
pub fn new(num_bits: u8) -> BitUnpacker {
let mask: u64 = if num_bits == 64 {
!0u64
} else {
@@ -77,15 +72,13 @@ impl BitUnpacker {
BitUnpacker {
num_bits: u64::from(num_bits),
mask,
data,
}
}
pub fn get(&self, idx: u64) -> u64 {
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
if self.num_bits == 0 {
return 0u64;
}
let data: &[u8] = self.data.as_slice();
let num_bits = self.num_bits;
let mask = self.mask;
let addr_in_bits = idx * num_bits;
@@ -95,7 +88,10 @@ impl BitUnpacker {
addr + 8 <= data.len() as u64,
"The fast field field should have been padded with 7 bytes."
);
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
.try_into()
.unwrap();
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
val_shifted & mask
}
@@ -104,9 +100,8 @@ impl BitUnpacker {
#[cfg(test)]
mod test {
use super::{BitPacker, BitUnpacker};
use crate::directory::OwnedBytes;
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>) {
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
let mut data = Vec::new();
let mut bitpacker = BitPacker::new();
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
@@ -118,14 +113,14 @@ mod test {
}
bitpacker.close(&mut data).unwrap();
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits);
(bitunpacker, vals)
let bitunpacker = BitUnpacker::new(num_bits);
(bitunpacker, vals, data)
}
fn test_bitpacker_util(len: usize, num_bits: u8) {
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
for (i, val) in vals.iter().enumerate() {
assert_eq!(bitunpacker.get(i as u64), *val);
assert_eq!(bitunpacker.get(i as u64, &data), *val);
}
}

View File

@@ -0,0 +1,55 @@
use super::{bitpacker::BitPacker, compute_num_bits};
#[derive(Debug, Clone)]
struct BlockedBitpacker {
compressed_blocks: Vec<u8>,
cache: Vec<u64>,
offset_and_bits: Vec<(u32, u8)>,
blocksize: u32,
}
impl BlockedBitpacker {
fn new(blocksize: u32) -> Self {
Self {
compressed_blocks: vec![],
cache: vec![],
offset_and_bits: vec![],
blocksize,
}
}
fn add(&mut self, el: u64) {
self.cache.push(el);
if self.cache.len() > self.blocksize as usize {
self.flush();
}
}
fn flush(&mut self) {
if self.cache.is_empty() {
return;
}
let mut bit_packer = BitPacker::new();
let num_bits_block = self
.cache
.iter()
.map(|el| compute_num_bits(*el))
.max()
.unwrap();
for val in self.cache.iter() {
bit_packer
.write(*val, num_bits_block, &mut self.compressed_blocks)
.unwrap(); // write to im can't fail
}
self.offset_and_bits
.push((self.compressed_blocks.len() as u32, num_bits_block));
}
}
#[cfg(test)]
mod tests {
use super::*;
test
}

12
bitpacker/src/lib.rs Normal file
View File

@@ -0,0 +1,12 @@
mod bitpacker;
pub use crate::bitpacker::BitPacker;
pub use crate::bitpacker::BitUnpacker;
#[cfg(test)]
mod tests {
#[test]
fn it_works() {
assert_eq!(2 + 2, 4);
}
}

View File

@@ -1,4 +1,3 @@
pub mod bitpacker;
mod bitset;
mod composite_file;
mod counting_writer;

View File

@@ -1,9 +1,9 @@
use super::FastValue;
use crate::common::bitpacker::BitUnpacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeFile;
use crate::directory::FileSlice;
use crate::directory::OwnedBytes;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
@@ -12,6 +12,7 @@ use crate::DocId;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use tantivy_bitpacker::BitUnpacker;
/// Trait for accessing a fastfield.
///
@@ -19,6 +20,7 @@ use std::path::Path;
/// fast field is required.
#[derive(Clone)]
pub struct FastFieldReader<Item: FastValue> {
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
@@ -33,8 +35,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(bytes, num_bits);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(FastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
@@ -55,7 +58,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
}
/// Internally `multivalued` also use SingleValue Fast fields.

View File

@@ -1,4 +1,3 @@
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
@@ -6,6 +5,7 @@ use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use std::io::{self, Write};
use tantivy_bitpacker::BitPacker;
/// `FastFieldSerializer` is in charge of serializing
/// fastfields on disk.

View File

@@ -1,11 +1,12 @@
use crate::common::compute_num_bits;
use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize};
use crate::common::{BinarySerializable, FixedSize};
use crate::directory::{FileSlice, OwnedBytes};
use crate::postings::TermInfo;
use crate::termdict::TermOrdinal;
use byteorder::{ByteOrder, LittleEndian};
use std::cmp;
use std::io::{self, Read, Write};
use tantivy_bitpacker::BitPacker;
const BLOCK_LEN: usize = 256;
@@ -290,11 +291,11 @@ mod tests {
use super::TermInfoBlockMeta;
use super::{TermInfoStore, TermInfoStoreWriter};
use crate::common;
use crate::common::bitpacker::BitPacker;
use crate::common::compute_num_bits;
use crate::common::BinarySerializable;
use crate::directory::FileSlice;
use crate::postings::TermInfo;
use tantivy_bitpacker::BitPacker;
#[test]
fn test_term_info_block() {