Embeds OwnedBytes into the FastFieldCodecReader. (#1458)

This commit is contained in:
Paul Masurel
2022-08-22 17:02:31 +02:00
committed by GitHub
parent 7f9ba0ee50
commit abbd934ac9
7 changed files with 44 additions and 34 deletions

View File

@@ -11,6 +11,7 @@ description = "Fast field codecs used by tantivy"
[dependencies]
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
prettytable-rs = {version="0.9.0", optional= true}
rand = {version="0.8.3", optional= true}

View File

@@ -1,6 +1,7 @@
use std::io::{self, Write};
use common::BinarySerializable;
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
@@ -9,6 +10,7 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
@@ -16,22 +18,24 @@ pub struct BitpackedFastFieldReader {
impl FastFieldCodecReader for BitpackedFastFieldReader {
/// Opens a fast field reader from its serialized bytes.
///
/// The final 16 bytes are read as a footer of two `u64` values: the minimum
/// value, then the amplitude. The maximum is computed as `min_value + amplitude`,
/// and the bit width given to the `BitUnpacker` is derived from the amplitude.
///
/// # Errors
/// Propagates any error raised while deserializing the two footer values.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
// NOTE(review): this subtraction underflows when fewer than 16 bytes are passed in;
// confirm that callers always supply a complete footer.
let footer_offset = bytes.len() - 16;
// Everything before the footer is kept in the reader as `data`.
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
// NOTE(review): unchecked addition — presumably the serializer guarantees that
// `min_value + amplitude` fits in a u64; confirm.
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedFastFieldReader {
data,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
/// Returns the value for `doc`: the number the `BitUnpacker` reads at position
/// `doc`, added to `min_value_u64`.
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, data)
fn get_u64(&self, doc: u64) -> u64 {
// The packed values are read from the bytes stored in the reader itself.
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
fn min_value(&self) -> u64 {

View File

@@ -5,16 +5,16 @@ extern crate more_asserts;
use std::io;
use std::io::Write;
use ownedbytes::OwnedBytes;
pub mod bitpacked;
pub mod linearinterpol;
pub mod multilinearinterpol;
/// Read side of a fast field codec: a reader is built from serialized bytes and
/// then queried for one `u64` per document.
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
/// Reads the metadata from `bytes` and returns the codec reader.
/// `bytes` is passed by value, so the reader may keep it.
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
/// Returns the `u64` value associated with `doc`.
fn get_u64(&self, doc: u64) -> u64;
/// NOTE(review): presumably the smallest value the reader can return — confirm
/// against the implementors.
fn min_value(&self) -> u64;
/// NOTE(review): presumably the largest value the reader can return — confirm
/// against the implementors.
fn max_value(&self) -> u64;
}
@@ -98,7 +98,7 @@ mod tests {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let mut out = vec![];
let mut out: Vec<u8> = Vec::new();
S::serialize(
&mut out,
&data,
@@ -108,9 +108,11 @@ mod tests {
)
.unwrap();
let reader = R::open_from_bytes(&out).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64, &out);
let val = reader.get_u64(doc as u64);
if val != *orig_val {
panic!(
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
@@ -118,7 +120,6 @@ mod tests {
);
}
}
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
(estimation, actual_compression)
}
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {

View File

@@ -2,6 +2,7 @@ use std::io::{self, Read, Write};
use std::ops::Sub;
use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
@@ -10,6 +11,7 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
/// fast field is required.
#[derive(Clone)]
pub struct LinearInterpolFastFieldReader {
data: OwnedBytes,
bit_unpacker: BitUnpacker,
pub footer: LinearInterpolFooter,
pub slope: f32,
@@ -57,23 +59,24 @@ impl FixedSize for LinearInterpolFooter {
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Opens a fast field reader from its serialized bytes.
///
/// The last `LinearInterpolFooter::SIZE_IN_BYTES` bytes are deserialized as a
/// `LinearInterpolFooter`. The slope is computed from the footer's `first_val`,
/// `last_val` and `num_vals`, and the bit width given to the `BitUnpacker` is
/// derived from the footer's `relative_max_value`.
///
/// # Errors
/// Propagates any error raised while deserializing the footer.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
// NOTE(review): this subtraction underflows when the input is shorter than the
// footer; confirm that callers always supply a complete footer.
let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES;
// Everything before the footer is kept in the reader as `data`.
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearInterpolFastFieldReader {
data,
bit_unpacker,
footer,
slope,
})
}
/// Returns the value for `doc`.
///
/// The result is the value computed by `get_calculated_value` from the footer's
/// `first_val`, `doc` and the slope, plus the number the `BitUnpacker` reads at
/// position `doc`, minus the footer's `offset`.
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
// NOTE(review): `offset` is subtracted after the addition — presumably it was added
// at write time so the stored numbers stay non-negative; confirm against the serializer.
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}
#[inline]

View File

@@ -14,6 +14,7 @@ use std::io::{self, Read, Write};
use std::ops::Sub;
use common::{BinarySerializable, CountingWriter, DeserializeFrom};
use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
@@ -24,6 +25,7 @@ const CHUNK_SIZE: u64 = 512;
/// fast field is required.
#[derive(Clone)]
pub struct MultiLinearInterpolFastFieldReader {
// Bytes that precede the footer; `get_u64` slices into them starting at an
// interpolation's `data_start_offset`.
data: OwnedBytes,
// Deserialized footer; its `interpolations` are consulted on every lookup.
pub footer: MultiLinearInterpolFooter,
}
@@ -145,24 +147,23 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
/// Opens a fast field reader from its serialized bytes.
///
/// The last 4 bytes are read as a `u32` footer length. The footer occupies the
/// `footer_len` bytes immediately before those 4 bytes, and everything before
/// the footer is the data section.
///
/// # Errors
/// Propagates any error raised while deserializing the footer length or the footer.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
// NOTE(review): `bytes.len() - 4` underflows when fewer than 4 bytes are passed in;
// confirm that callers always supply a complete footer.
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
// NOTE(review): underflows if `footer_len` is larger than the remaining input — confirm.
let footer_offset = bytes.len() - 4 - footer_len as usize;
// The right-hand part starts at the footer and still includes the trailing 4 length bytes.
let (data, mut footer) = bytes.split(footer_offset);
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
Ok(MultiLinearInterpolFastFieldReader { footer })
Ok(MultiLinearInterpolFastFieldReader { data, footer })
}
/// Returns the value for `doc`.
///
/// Selects an interpolation function for `doc` with `get_interpolation_function`,
/// computes a value from that function's `value_start_pos` and `slope`, adds the
/// number read by that function's `bit_unpacker`, and subtracts its
/// `positive_val_offset`.
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
// Rebase `doc` so it is relative to the selected interpolation's `start_pos`.
let doc = doc - interpolation.start_pos;
let calculated_value =
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
// The unpacker reads from the selected interpolation's `data_start_offset` onward,
// using the rebased `doc` as the position.
let diff = interpolation
.bit_unpacker
.get(doc, &data[interpolation.data_start_offset as usize..]);
.get(doc, &self.data[interpolation.data_start_offset as usize..]);
(calculated_value + diff) - interpolation.positive_val_offset
}

View File

@@ -4,6 +4,7 @@ use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader;
use gcd::Gcd;
use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1;
pub const GCD_CODEC_ID: u8 = 4;
@@ -19,12 +20,12 @@ pub struct GCDFastFieldCodec<CodecReader> {
}
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self> {
let (header, mut footer) = bytes.split_at(bytes.len() - 16);
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(header)?;
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
@@ -33,8 +34,8 @@ impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let mut data = self.reader.get_u64(doc, data);
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
data *= self.gcd;
data += self.min_value;
data

View File

@@ -216,7 +216,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
// Codec reader that produces the raw `u64` for each document.
reader: CodecReader,
bytes: OwnedBytes,
// Marker for the `Item` type parameter; `get_u64` converts raw values with `Item::from_u64`.
_phantom: PhantomData<Item>,
}
@@ -235,16 +234,16 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
}
/// Opens a fast field given the bytes.
///
/// Construction is delegated to the codec's own `C::open_from_bytes`; an error
/// from it is returned through `?` as a `crate::Result` error.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes.as_slice())?;
// `bytes` is moved into the codec reader here.
let reader = C::open_from_bytes(bytes)?;
Ok(FastFieldReaderCodecWrapper {
reader,
bytes,
_phantom: PhantomData,
})
}
/// Returns the value for `doc` as an `Item`: the raw `u64` comes from the codec
/// reader and is converted with `Item::from_u64`.
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc, self.bytes.as_slice());
let data = self.reader.get_u64(doc);
Item::from_u64(data)
}