mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-03 15:52:55 +00:00
move tantivy bitpacker to crate, refator bitpacker
remove byteorder dependency
This commit is contained in:
@@ -35,6 +35,7 @@ uuid = { version = "0.8", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8"
|
||||
futures = {version = "0.3", features=["thread-pool"] }
|
||||
tantivy-query-grammar = { version="0.14.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
stable_deref_trait = "1"
|
||||
rust-stemmers = "1"
|
||||
downcast-rs = "1"
|
||||
@@ -86,7 +87,7 @@ unstable = [] # useful for benches.
|
||||
wasm-bindgen = ["uuid/wasm-bindgen"]
|
||||
|
||||
[workspace]
|
||||
members = ["query-grammar"]
|
||||
members = ["query-grammar", "bitpacker"]
|
||||
|
||||
[badges]
|
||||
travis-ci = { repository = "tantivy-search/tantivy" }
|
||||
|
||||
8
bitpacker/Cargo.toml
Normal file
8
bitpacker/Cargo.toml
Normal file
@@ -0,0 +1,8 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.1.0"
|
||||
edition = "2018"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
@@ -1,9 +1,6 @@
|
||||
use byteorder::{ByteOrder, LittleEndian, WriteBytesExt};
|
||||
use std::io;
|
||||
use std::{convert::TryInto, io};
|
||||
|
||||
use crate::directory::OwnedBytes;
|
||||
|
||||
pub(crate) struct BitPacker {
|
||||
pub struct BitPacker {
|
||||
mini_buffer: u64,
|
||||
mini_buffer_written: usize,
|
||||
}
|
||||
@@ -26,14 +23,14 @@ impl BitPacker {
|
||||
let num_bits = num_bits as usize;
|
||||
if self.mini_buffer_written + num_bits > 64 {
|
||||
self.mini_buffer |= val_u64.wrapping_shl(self.mini_buffer_written as u32);
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
|
||||
self.mini_buffer = val_u64.wrapping_shr((64 - self.mini_buffer_written) as u32);
|
||||
self.mini_buffer_written = self.mini_buffer_written + num_bits - 64;
|
||||
} else {
|
||||
self.mini_buffer |= val_u64 << self.mini_buffer_written;
|
||||
self.mini_buffer_written += num_bits;
|
||||
if self.mini_buffer_written == 64 {
|
||||
output.write_u64::<LittleEndian>(self.mini_buffer)?;
|
||||
output.write_all(self.mini_buffer.to_le_bytes().as_ref())?;
|
||||
self.mini_buffer_written = 0;
|
||||
self.mini_buffer = 0u64;
|
||||
}
|
||||
@@ -44,9 +41,8 @@ impl BitPacker {
|
||||
pub fn flush<TWrite: io::Write>(&mut self, output: &mut TWrite) -> io::Result<()> {
|
||||
if self.mini_buffer_written > 0 {
|
||||
let num_bytes = (self.mini_buffer_written + 7) / 8;
|
||||
let mut arr: [u8; 8] = [0u8; 8];
|
||||
LittleEndian::write_u64(&mut arr, self.mini_buffer);
|
||||
output.write_all(&arr[..num_bytes])?;
|
||||
let bytes = self.mini_buffer.to_le_bytes();
|
||||
output.write_all(&bytes[..num_bytes])?;
|
||||
self.mini_buffer_written = 0;
|
||||
}
|
||||
Ok(())
|
||||
@@ -64,11 +60,10 @@ impl BitPacker {
|
||||
pub struct BitUnpacker {
|
||||
num_bits: u64,
|
||||
mask: u64,
|
||||
data: OwnedBytes,
|
||||
}
|
||||
|
||||
impl BitUnpacker {
|
||||
pub fn new(data: OwnedBytes, num_bits: u8) -> BitUnpacker {
|
||||
pub fn new(num_bits: u8) -> BitUnpacker {
|
||||
let mask: u64 = if num_bits == 64 {
|
||||
!0u64
|
||||
} else {
|
||||
@@ -77,15 +72,13 @@ impl BitUnpacker {
|
||||
BitUnpacker {
|
||||
num_bits: u64::from(num_bits),
|
||||
mask,
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: u64) -> u64 {
|
||||
pub fn get(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
if self.num_bits == 0 {
|
||||
return 0u64;
|
||||
}
|
||||
let data: &[u8] = self.data.as_slice();
|
||||
let num_bits = self.num_bits;
|
||||
let mask = self.mask;
|
||||
let addr_in_bits = idx * num_bits;
|
||||
@@ -95,7 +88,10 @@ impl BitUnpacker {
|
||||
addr + 8 <= data.len() as u64,
|
||||
"The fast field field should have been padded with 7 bytes."
|
||||
);
|
||||
let val_unshifted_unmasked: u64 = LittleEndian::read_u64(&data[(addr as usize)..]);
|
||||
let bytes: [u8; 8] = (&data[(addr as usize)..(addr as usize) + 8])
|
||||
.try_into()
|
||||
.unwrap();
|
||||
let val_unshifted_unmasked: u64 = u64::from_le_bytes(bytes);
|
||||
let val_shifted = (val_unshifted_unmasked >> bit_shift) as u64;
|
||||
val_shifted & mask
|
||||
}
|
||||
@@ -104,9 +100,8 @@ impl BitUnpacker {
|
||||
#[cfg(test)]
|
||||
mod test {
|
||||
use super::{BitPacker, BitUnpacker};
|
||||
use crate::directory::OwnedBytes;
|
||||
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>) {
|
||||
fn create_fastfield_bitpacker(len: usize, num_bits: u8) -> (BitUnpacker, Vec<u64>, Vec<u8>) {
|
||||
let mut data = Vec::new();
|
||||
let mut bitpacker = BitPacker::new();
|
||||
let max_val: u64 = (1u64 << num_bits as u64) - 1u64;
|
||||
@@ -118,14 +113,14 @@ mod test {
|
||||
}
|
||||
bitpacker.close(&mut data).unwrap();
|
||||
assert_eq!(data.len(), ((num_bits as usize) * len + 7) / 8 + 7);
|
||||
let bitunpacker = BitUnpacker::new(OwnedBytes::new(data), num_bits);
|
||||
(bitunpacker, vals)
|
||||
let bitunpacker = BitUnpacker::new(num_bits);
|
||||
(bitunpacker, vals, data)
|
||||
}
|
||||
|
||||
fn test_bitpacker_util(len: usize, num_bits: u8) {
|
||||
let (bitunpacker, vals) = create_fastfield_bitpacker(len, num_bits);
|
||||
let (bitunpacker, vals, data) = create_fastfield_bitpacker(len, num_bits);
|
||||
for (i, val) in vals.iter().enumerate() {
|
||||
assert_eq!(bitunpacker.get(i as u64), *val);
|
||||
assert_eq!(bitunpacker.get(i as u64, &data), *val);
|
||||
}
|
||||
}
|
||||
|
||||
55
bitpacker/src/blocked_bitpacker.rs
Normal file
55
bitpacker/src/blocked_bitpacker.rs
Normal file
@@ -0,0 +1,55 @@
|
||||
use super::{bitpacker::BitPacker, compute_num_bits};
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct BlockedBitpacker {
|
||||
compressed_blocks: Vec<u8>,
|
||||
cache: Vec<u64>,
|
||||
offset_and_bits: Vec<(u32, u8)>,
|
||||
blocksize: u32,
|
||||
}
|
||||
|
||||
impl BlockedBitpacker {
|
||||
fn new(blocksize: u32) -> Self {
|
||||
Self {
|
||||
compressed_blocks: vec![],
|
||||
cache: vec![],
|
||||
offset_and_bits: vec![],
|
||||
blocksize,
|
||||
}
|
||||
}
|
||||
|
||||
fn add(&mut self, el: u64) {
|
||||
self.cache.push(el);
|
||||
if self.cache.len() > self.blocksize as usize {
|
||||
self.flush();
|
||||
}
|
||||
}
|
||||
|
||||
fn flush(&mut self) {
|
||||
if self.cache.is_empty() {
|
||||
return;
|
||||
}
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let num_bits_block = self
|
||||
.cache
|
||||
.iter()
|
||||
.map(|el| compute_num_bits(*el))
|
||||
.max()
|
||||
.unwrap();
|
||||
for val in self.cache.iter() {
|
||||
bit_packer
|
||||
.write(*val, num_bits_block, &mut self.compressed_blocks)
|
||||
.unwrap(); // write to im can't fail
|
||||
}
|
||||
self.offset_and_bits
|
||||
.push((self.compressed_blocks.len() as u32, num_bits_block));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
test
|
||||
}
|
||||
12
bitpacker/src/lib.rs
Normal file
12
bitpacker/src/lib.rs
Normal file
@@ -0,0 +1,12 @@
|
||||
mod bitpacker;
|
||||
|
||||
pub use crate::bitpacker::BitPacker;
|
||||
pub use crate::bitpacker::BitUnpacker;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
#[test]
|
||||
fn it_works() {
|
||||
assert_eq!(2 + 2, 4);
|
||||
}
|
||||
}
|
||||
@@ -1,4 +1,3 @@
|
||||
pub mod bitpacker;
|
||||
mod bitset;
|
||||
mod composite_file;
|
||||
mod counting_writer;
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
use super::FastValue;
|
||||
use crate::common::bitpacker::BitUnpacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeFile;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::directory::OwnedBytes;
|
||||
use crate::directory::{Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{FastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::Schema;
|
||||
@@ -12,6 +12,7 @@ use crate::DocId;
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::path::Path;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Trait for accessing a fastfield.
|
||||
///
|
||||
@@ -19,6 +20,7 @@ use std::path::Path;
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReader<Item: FastValue> {
|
||||
bytes: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
min_value_u64: u64,
|
||||
max_value_u64: u64,
|
||||
@@ -33,8 +35,9 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(bytes, num_bits);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(FastFieldReader {
|
||||
bytes,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
@@ -55,7 +58,7 @@ impl<Item: FastValue> FastFieldReader<Item> {
|
||||
}
|
||||
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc))
|
||||
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
use crate::common::bitpacker::BitPacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
@@ -6,6 +5,7 @@ use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
/// `FastFieldSerializer` is in charge of serializing
|
||||
/// fastfields on disk.
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::{bitpacker::BitPacker, BinarySerializable, FixedSize};
|
||||
use crate::common::{BinarySerializable, FixedSize};
|
||||
use crate::directory::{FileSlice, OwnedBytes};
|
||||
use crate::postings::TermInfo;
|
||||
use crate::termdict::TermOrdinal;
|
||||
use byteorder::{ByteOrder, LittleEndian};
|
||||
use std::cmp;
|
||||
use std::io::{self, Read, Write};
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
const BLOCK_LEN: usize = 256;
|
||||
|
||||
@@ -290,11 +291,11 @@ mod tests {
|
||||
use super::TermInfoBlockMeta;
|
||||
use super::{TermInfoStore, TermInfoStoreWriter};
|
||||
use crate::common;
|
||||
use crate::common::bitpacker::BitPacker;
|
||||
use crate::common::compute_num_bits;
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::directory::FileSlice;
|
||||
use crate::postings::TermInfo;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
#[test]
|
||||
fn test_term_info_block() {
|
||||
|
||||
Reference in New Issue
Block a user