mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-06-05 01:50:42 +00:00
Embeds OwnedBytes into the FastFieldCodecReader. (#1458)
This commit is contained in:
@@ -11,6 +11,7 @@ description = "Fast field codecs used by tantivy"
|
||||
[dependencies]
|
||||
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
|
||||
ownedbytes = { version = "0.3.0", path = "../ownedbytes" }
|
||||
prettytable-rs = {version="0.9.0", optional= true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use common::BinarySerializable;
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
@@ -9,6 +10,7 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader {
|
||||
data: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
@@ -16,22 +18,24 @@ pub struct BitpackedFastFieldReader {
|
||||
|
||||
impl FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
|
||||
let footer_offset = bytes.len() - 16;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
data,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, data)
|
||||
fn get_u64(&self, doc: u64) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
|
||||
}
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
|
||||
@@ -5,16 +5,16 @@ extern crate more_asserts;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
||||
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
|
||||
fn get_u64(&self, doc: u64) -> u64;
|
||||
fn min_value(&self) -> u64;
|
||||
fn max_value(&self) -> u64;
|
||||
}
|
||||
@@ -98,7 +98,7 @@ mod tests {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
let mut out: Vec<u8> = Vec::new();
|
||||
S::serialize(
|
||||
&mut out,
|
||||
&data,
|
||||
@@ -108,9 +108,11 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
|
||||
|
||||
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
|
||||
for (doc, orig_val) in data.iter().enumerate() {
|
||||
let val = reader.get_u64(doc as u64, &out);
|
||||
let val = reader.get_u64(doc as u64);
|
||||
if val != *orig_val {
|
||||
panic!(
|
||||
"val {:?} does not match orig_val {:?}, in data set {}, data {:?}",
|
||||
@@ -118,7 +120,6 @@ mod tests {
|
||||
);
|
||||
}
|
||||
}
|
||||
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
|
||||
(estimation, actual_compression)
|
||||
}
|
||||
pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
|
||||
@@ -2,6 +2,7 @@ use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
|
||||
use common::{BinarySerializable, FixedSize};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
@@ -10,6 +11,7 @@ use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess,
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearInterpolFastFieldReader {
|
||||
data: OwnedBytes,
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub slope: f32,
|
||||
@@ -57,23 +59,24 @@ impl FixedSize for LinearInterpolFooter {
|
||||
|
||||
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
|
||||
let footer_offset = bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
|
||||
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearInterpolFastFieldReader {
|
||||
data,
|
||||
bit_unpacker,
|
||||
footer,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
fn get_u64(&self, doc: u64) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
|
||||
@@ -14,6 +14,7 @@ use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
|
||||
use common::{BinarySerializable, CountingWriter, DeserializeFrom};
|
||||
use ownedbytes::OwnedBytes;
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
@@ -24,6 +25,7 @@ const CHUNK_SIZE: u64 = 512;
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiLinearInterpolFastFieldReader {
|
||||
data: OwnedBytes,
|
||||
pub footer: MultiLinearInterpolFooter,
|
||||
}
|
||||
|
||||
@@ -145,24 +147,23 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
|
||||
|
||||
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer_offset = bytes.len() - 4 - footer_len as usize;
|
||||
let (data, mut footer) = bytes.split(footer_offset);
|
||||
let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
|
||||
|
||||
Ok(MultiLinearInterpolFastFieldReader { footer })
|
||||
Ok(MultiLinearInterpolFastFieldReader { data, footer })
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
fn get_u64(&self, doc: u64) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
.get(doc, &self.data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ use common::BinarySerializable;
|
||||
use fastdivide::DividerU64;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use gcd::Gcd;
|
||||
use ownedbytes::OwnedBytes;
|
||||
|
||||
pub const GCD_DEFAULT: u64 = 1;
|
||||
pub const GCD_CODEC_ID: u8 = 4;
|
||||
@@ -19,12 +20,12 @@ pub struct GCDFastFieldCodec<CodecReader> {
|
||||
}
|
||||
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
|
||||
/// Opens a fast field given the bytes.
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self> {
|
||||
let (header, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
|
||||
let footer_offset = bytes.len() - 16;
|
||||
let (body, mut footer) = bytes.split(footer_offset);
|
||||
let gcd = u64::deserialize(&mut footer)?;
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let reader = C::open_from_bytes(header)?;
|
||||
|
||||
let reader = C::open_from_bytes(body)?;
|
||||
Ok(GCDFastFieldCodec {
|
||||
gcd,
|
||||
min_value,
|
||||
@@ -33,8 +34,8 @@ impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let mut data = self.reader.get_u64(doc, data);
|
||||
fn get_u64(&self, doc: u64) -> u64 {
|
||||
let mut data = self.reader.get_u64(doc);
|
||||
data *= self.gcd;
|
||||
data += self.min_value;
|
||||
data
|
||||
|
||||
@@ -216,7 +216,6 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
|
||||
reader: CodecReader,
|
||||
bytes: OwnedBytes,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
@@ -235,16 +234,16 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
}
|
||||
/// Opens a fast field given the bytes.
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = C::open_from_bytes(bytes.as_slice())?;
|
||||
let reader = C::open_from_bytes(bytes)?;
|
||||
Ok(FastFieldReaderCodecWrapper {
|
||||
reader,
|
||||
bytes,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
let data = self.reader.get_u64(doc, self.bytes.as_slice());
|
||||
let data = self.reader.get_u64(doc);
|
||||
Item::from_u64(data)
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user