diff --git a/Cargo.toml b/Cargo.toml index 25d81a8c4..12d3a08c3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -35,6 +35,7 @@ uuid = { version = "0.8", features = ["v4", "serde"] } crossbeam = "0.8" futures = {version = "0.3", features=["thread-pool"] } tantivy-query-grammar = { version="0.14.0", path="./query-grammar" } +tantivy-bitpacker = { version="0.1", path="./bitpacker" } stable_deref_trait = "1" rust-stemmers = "1" downcast-rs = "1" @@ -86,7 +87,7 @@ unstable = [] # useful for benches. wasm-bindgen = ["uuid/wasm-bindgen"] [workspace] -members = ["query-grammar"] +members = ["query-grammar", "bitpacker"] [badges] travis-ci = { repository = "tantivy-search/tantivy" } diff --git a/bitpacker/Cargo.toml b/bitpacker/Cargo.toml new file mode 100644 index 000000000..05a4c81d8 --- /dev/null +++ b/bitpacker/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "tantivy-bitpacker" +version = "0.1.0" +edition = "2018" + +# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html + +[dependencies] diff --git a/src/common/bitpacker.rs b/bitpacker/src/bitpacker.rs similarity index 76% rename from src/common/bitpacker.rs rename to bitpacker/src/bitpacker.rs index 640d8adcf..7f936760b 100644 --- a/src/common/bitpacker.rs +++ b/bitpacker/src/bitpacker.rs @@ -1,9 +1,6 @@ -use byteorder::{ByteOrder, LittleEndian, WriteBytesExt}; -use std::io; +use std::{convert::TryInto, io}; -use crate::directory::OwnedBytes; - -pub(crate) struct BitPacker { +pub struct BitPacker { mini_buffer: u64, mini_buffer_written: usize, } @@ -26,14 +23,14 @@ impl BitPacker { let num_bits = num_bits as usize; if self.mini_buffer_written + num_bits > 64 { self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32); - output.write_u64::(self.mini_buffer)?; + output.write_all(self.mini_buffer.to_le_bytes().as_ref())?; self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32); self.mini_buffer_written = self.mini_buffer_written + num_bits - 64; } else { self.mini_buffer |= val_u64 << self.mini_buffer_written; self.mini_buffer_written += num_bits; if self.mini_buffer_written == 64 { - output.write_u64::(self.mini_buffer)?; + output.write_all(self.mini_buffer.to_le_bytes().as_ref())?; self.mini_buffer_written = 0; self.mini_buffer = 0u64; } @@ -44,9 +41,8 @@ impl BitPacker { pub fn flush(&mut self, output: &mut TWrite) -> io::Result<()> { if self.mini_buffer_written > 0 { let num_bytes = (self.mini_buffer_written + 7) / 8; - let mut arr: [u8; 8] = [0u8; 8]; - LittleEndian::write_u64(&mut arr, self.mini_buffer); - output.write_all(&arr[..num_bytes])?; + let bytes = self.mini_buffer.to_le_bytes(); + output.write_all(&bytes[..num_bytes])?; self.mini_buffer_written = 0; } Ok(()) @@ -64,11 +60,10 @@ impl BitPacker { pub struct BitUnpacker { num_bits: u64, mask: u64, - data: OwnedBytes, } impl BitUnpacker { - pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker { + pub fn new(num_bits: u8) -> BitUnpacker { let mask: u64 = if num_bits == 64 { !0u64 } else { @@ -77,15 +72,13 @@ impl BitUnpacker { BitUnpacker { num_bits: u64::from(num_bits), mask, - data, } } - pub fn get(&self, idx: u64) -> u64 { + pub fn get(&self, idx: u64, data: &[u8]) -> u64 { if self.num_bits == 0 { return 0u64; } - let data: &[u8] = self.data.as_slice(); let num_bits = self.num_bits; let mask = self.mask; let addr_in_bits = idx * num_bits; @@ -95,7 +88,10 @@ impl BitUnpacker { addr + 8 <= data.len() as u64, "The fast field field should have been padded with 7 bytes." ); - let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]); + let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8]) + .try_into() + .unwrap(); + let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes); let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64; val_shifted & mask } @@ -104,9 +100,8 @@ impl BitUnpacker { #[cfg(test)] mod test { use super::{BitPacker, BitUnpacker}; - use crate::directory::OwnedBytes; - fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec) { + fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec, Vec) { let mut data = Vec::new(); let mut bitpacker = BitPacker::new(); let max_val: u64 = (1u64 << num_bits as u64) - 1u64; @@ -118,14 +113,14 @@ mod test { } bitpacker.close(&mut data).unwrap(); assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7); - let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits); - (bitunpacker, vals) + let bitunpacker = BitUnpacker::new(num_bits); + (bitunpacker, vals, data) } fn test_bitpacker_util(len: usize, num_bits: u8) { - let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits); + let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits); for (i, val) in vals.iter().enumerate() { - assert_eq!(bitunpacker.get(i as u64), *val); + assert_eq!(bitunpacker.get(i as u64, &data), *val); } } diff --git a/bitpacker/src/blocked_bitpacker.rs b/bitpacker/src/blocked_bitpacker.rs new file mode 100644 index 000000000..a635e329e --- /dev/null +++ b/bitpacker/src/blocked_bitpacker.rs @@ -0,0 +1,55 @@ +use super::{bitpacker::BitPacker, compute_num_bits}; + +#[derive(Debug, Clone)] +struct BlockedBitpacker { + compressed_blocks: Vec, + cache: Vec, + offset_and_bits: Vec<(u32, u8)>, + blocksize: u32, +} + +impl BlockedBitpacker { + fn new(blocksize: u32) -> Self { + Self { + compressed_blocks: vec![], + cache: vec![], + offset_and_bits: vec![], + blocksize, + } + } + + fn add(&mut self, el: u64) { + self.cache.push(el); + if self.cache.len() > self.blocksize as usize { + self.flush(); + } + } + + fn flush(&mut self) { + if self.cache.is_empty() { + return; + } + let mut bit_packer = BitPacker::new(); + let num_bits_block = self + .cache + .iter() + .map(|el| compute_num_bits(*el)) + .max() + .unwrap(); + for val in self.cache.iter() { + bit_packer + .write(*val, num_bits_block, &mut self.compressed_blocks) + .unwrap(); // write to im can't fail + } + self.offset_and_bits + .push((self.compressed_blocks.len() as u32, num_bits_block)); + } +} + + +#[cfg(test)] +mod tests { + use super::*; + + test +} diff --git a/bitpacker/src/lib.rs b/bitpacker/src/lib.rs new file mode 100644 index 000000000..be227ea0b --- /dev/null +++ b/bitpacker/src/lib.rs @@ -0,0 +1,12 @@ +mod bitpacker; + +pub use crate::bitpacker::BitPacker; +pub use crate::bitpacker::BitUnpacker; + +#[cfg(test)] +mod tests { + #[test] + fn it_works() { + assert_eq!(2 + 2, 4); + } +} diff --git a/src/common/mod.rs b/src/common/mod.rs index 9405067ba..5abc2fb6c 100644 --- a/src/common/mod.rs +++ b/src/common/mod.rs @@ -1,4 +1,3 @@ -pub mod bitpacker; mod bitset; mod composite_file; mod counting_writer; diff --git a/src/fastfield/reader.rs b/src/fastfield/reader.rs index 0e4cc2f64..5f49bcc53 100644 --- a/src/fastfield/reader.rs +++ b/src/fastfield/reader.rs @@ -1,9 +1,9 @@ use super::FastValue; -use crate::common::bitpacker::BitUnpacker; use crate::common::compute_num_bits; use crate::common::BinarySerializable; use crate::common::CompositeFile; use crate::directory::FileSlice; +use crate::directory::OwnedBytes; use crate::directory::{Directory, RamDirectory, WritePtr}; use crate::fastfield::{FastFieldSerializer, FastFieldsWriter}; use crate::schema::Schema; @@ -12,6 +12,7 @@ use crate::DocId; use std::collections::HashMap; use std::marker::PhantomData; use std::path::Path; +use tantivy_bitpacker::BitUnpacker; /// Trait for accessing a fastfield. /// @@ -19,6 +20,7 @@ use std::path::Path; /// fast field is required. #[derive(Clone)] pub struct FastFieldReader { + bytes: OwnedBytes, bit_unpacker: BitUnpacker, min_value_u64: u64, max_value_u64: u64, @@ -33,8 +35,9 @@ impl FastFieldReader { let amplitude = u64::deserialize(&mut bytes)?; let max_value = min_value + amplitude; let num_bits = compute_num_bits(amplitude); - let bit_unpacker = BitUnpacker::new(bytes, num_bits); + let bit_unpacker = BitUnpacker::new(num_bits); Ok(FastFieldReader { + bytes, min_value_u64: min_value, max_value_u64: max_value, bit_unpacker, @@ -55,7 +58,7 @@ impl FastFieldReader { } pub(crate) fn get_u64(&self, doc: u64) -> Item { - Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc)) + Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes)) } /// Internally `multivalued` also use SingleValue Fast fields. diff --git a/src/fastfield/serializer.rs b/src/fastfield/serializer.rs index d2871d651..a06bc02e8 100644 --- a/src/fastfield/serializer.rs +++ b/src/fastfield/serializer.rs @@ -1,4 +1,3 @@ -use crate::common::bitpacker::BitPacker; use crate::common::compute_num_bits; use crate::common::BinarySerializable; use crate::common::CompositeWrite; @@ -6,6 +5,7 @@ use crate::common::CountingWriter; use crate::directory::WritePtr; use crate::schema::Field; use std::io::{self, Write}; +use tantivy_bitpacker::BitPacker; /// `FastFieldSerializer` is in charge of serializing /// fastfields on disk. diff --git a/src/termdict/fst_termdict/term_info_store.rs b/src/termdict/fst_termdict/term_info_store.rs index 905c56a96..382661e2d 100644 --- a/src/termdict/fst_termdict/term_info_store.rs +++ b/src/termdict/fst_termdict/term_info_store.rs @@ -1,11 +1,12 @@ use crate::common::compute_num_bits; -use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize}; +use crate::common::{BinarySerializable, FixedSize}; use crate::directory::{FileSlice, OwnedBytes}; use crate::postings::TermInfo; use crate::termdict::TermOrdinal; use byteorder::{ByteOrder, LittleEndian}; use std::cmp; use std::io::{self, Read, Write}; +use tantivy_bitpacker::BitPacker; const BLOCK_LEN: usize = 256; @@ -290,11 +291,11 @@ mod tests { use super::TermInfoBlockMeta; use super::{TermInfoStore, TermInfoStoreWriter}; use crate::common; - use crate::common::bitpacker::BitPacker; use crate::common::compute_num_bits; use crate::common::BinarySerializable; use crate::directory::FileSlice; use crate::postings::TermInfo; + use tantivy_bitpacker::BitPacker; #[test] fn test_term_info_block() {