Compare commits

...

5 Commits

Author SHA1 Message Date
Paul Masurel
e765706487 Removing Deserializer trait
And renaming the `Serializer` trait `FastFieldCodec`.
2022-08-27 21:02:02 +02:00
Pascal Seitz
fdd0f63787 merge traits 2022-08-27 17:01:41 +02:00
Pascal Seitz
fd60e6fe08 rename get_u64 to ge_val 2022-08-27 17:01:41 +02:00
Pascal Seitz
02c3252d1e split open_from_bytes to own trait 2022-08-27 17:01:39 +02:00
Pascal Seitz
4a6f36937c num_vals to FastFieldCodecReader 2022-08-27 17:00:55 +02:00
11 changed files with 296 additions and 292 deletions

View File

@@ -4,9 +4,9 @@ extern crate test;
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use fastfield_codecs::bitpacked::{BitpackedReader, BitpackedSerializer}; use fastfield_codecs::bitpacked::BitpackedCodec;
use fastfield_codecs::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::{LinearReader, LinearSerializer}; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::*; use fastfield_codecs::*;
fn get_data() -> Vec<u64> { fn get_data() -> Vec<u64> {
@@ -25,27 +25,25 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> { fn value_iter() -> impl Iterator<Item = u64> {
0..20_000 0..20_000
} }
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>( fn bench_get<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
b: &mut Bencher,
data: &[u64],
) {
let mut bytes = vec![]; let mut bytes = vec![];
S::serialize(&mut bytes, &data).unwrap(); Codec::serialize(&mut bytes, &data).unwrap();
let reader = R::open_from_bytes(OwnedBytes::new(bytes)).unwrap(); let reader = Codec::open_from_bytes(OwnedBytes::new(bytes)).unwrap();
b.iter(|| { b.iter(|| {
let mut sum = 0u64; let mut sum = 0u64;
for pos in value_iter() { for pos in value_iter() {
let val = reader.get_u64(pos as u64); let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val); debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val); sum = sum.wrapping_add(val);
} }
sum sum
}); });
} }
fn bench_create<S: FastFieldCodecSerializer>(b: &mut Bencher, data: &[u64]) { fn bench_create<Codec: FastFieldCodec>(b: &mut Bencher, data: &[u64]) {
let mut bytes = vec![]; let mut bytes = Vec::new();
b.iter(|| { b.iter(|| {
S::serialize(&mut bytes, &data).unwrap(); bytes.clear();
Codec::serialize(&mut bytes, &data).unwrap();
}); });
} }
@@ -54,32 +52,32 @@ mod tests {
#[bench] #[bench]
fn bench_fastfield_bitpack_create(b: &mut Bencher) { fn bench_fastfield_bitpack_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<BitpackedSerializer>(b, &data); bench_create::<BitpackedCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) { fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<LinearSerializer>(b, &data); bench_create::<LinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) { fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_create::<BlockwiseLinearSerializer>(b, &data); bench_create::<BlockwiseLinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) { fn bench_fastfield_bitpack_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<BitpackedSerializer, BitpackedReader>(b, &data); bench_get::<BitpackedCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) { fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<LinearSerializer, LinearReader>(b, &data); bench_get::<LinearCodec>(b, &data);
} }
#[bench] #[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) { fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data(); let data: Vec<_> = get_data();
bench_get::<BlockwiseLinearSerializer, BlockwiseLinearReader>(b, &data); bench_get::<BlockwiseLinearCodec>(b, &data);
} }
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0); let min_value = data.iter().cloned().min().unwrap_or(0);

View File

@@ -4,9 +4,7 @@ use common::BinarySerializable;
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{ use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different /// Depending on the field type, a different
/// fast field is required. /// fast field is required.
@@ -14,29 +12,14 @@ use crate::{
pub struct BitpackedReader { pub struct BitpackedReader {
data: OwnedBytes, data: OwnedBytes,
bit_unpacker: BitUnpacker, bit_unpacker: BitUnpacker,
pub min_value_u64: u64, min_value_u64: u64,
pub max_value_u64: u64, max_value_u64: u64,
num_vals: u64,
} }
impl FastFieldCodecReader for BitpackedReader { impl FastFieldDataAccess for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
#[inline] #[inline]
fn get_u64(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data) self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
@@ -47,11 +30,16 @@ impl FastFieldCodecReader for BitpackedReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.max_value_u64 self.max_value_u64
} }
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
} }
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> { pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker, bit_packer: BitPacker,
write: &'a mut W, write: &'a mut W,
min_value: u64, min_value: u64,
num_vals: u64,
amplitude: u64, amplitude: u64,
num_bits: u8, num_bits: u8,
} }
@@ -78,6 +66,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
bit_packer, bit_packer,
write, write,
min_value, min_value,
num_vals: 0,
amplitude, amplitude,
num_bits, num_bits,
}) })
@@ -88,22 +77,45 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value; let val_to_write: u64 = val - self.min_value;
self.bit_packer self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?; .write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(()) Ok(())
} }
pub fn close_field(mut self) -> io::Result<()> { pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?; self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?; self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?; self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(()) Ok(())
} }
} }
pub struct BitpackedSerializer {} pub struct BitpackedCodec;
impl FastFieldCodecSerializer for BitpackedSerializer { impl FastFieldCodec for BitpackedCodec {
/// The CODEC_TYPE is an enum value used for serialization. /// The CODEC_TYPE is an enum value used for serialization.
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Bitpacked;
type Reader = BitpackedReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
num_vals,
})
}
/// Serializes data with the BitpackedFastFieldSerializer. /// Serializes data with the BitpackedFastFieldSerializer.
/// ///
/// The serializer in fact encode the values by bitpacking /// The serializer in fact encode the values by bitpacking
@@ -146,7 +158,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) { fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedSerializer, BitpackedReader>(data, name); crate::tests::create_and_validate::<BitpackedCodec>(data, name);
} }
#[test] #[test]

View File

@@ -18,9 +18,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::linear::{get_calculated_value, get_slope}; use crate::linear::{get_calculated_value, get_slope};
use crate::{ use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
const CHUNK_SIZE: u64 = 512; const CHUNK_SIZE: u64 = 512;
@@ -148,18 +146,9 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)] &interpolations[get_interpolation_position(doc)]
} }
impl FastFieldCodecReader for BlockwiseLinearReader { impl FastFieldDataAccess for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
#[inline] #[inline]
fn get_u64(&self, idx: u64) -> u64 { fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations); let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos; let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value( let calculated_value = get_calculated_value(
@@ -182,13 +171,29 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.footer.max_value self.footer.max_value
} }
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
} }
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements. /// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.
pub struct BlockwiseLinearSerializer {} pub struct BlockwiseLinearCodec;
impl FastFieldCodecSerializer for BlockwiseLinearSerializer { impl FastFieldCodec for BlockwiseLinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::BlockwiseLinear;
type Reader = BlockwiseLinearReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let footer_offset = bytes.len() - 4 - footer_len as usize;
let (data, mut footer) = bytes.split(footer_offset);
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
/// Creates a new fast field serializer. /// Creates a new fast field serializer.
fn serialize( fn serialize(
write: &mut impl Write, write: &mut impl Write,
@@ -363,9 +368,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>( crate::tests::create_and_validate::<BlockwiseLinearCodec, BlockwiseLinearReader>(data, name)
data, name,
)
} }
const HIGHEST_BIT: u64 = 1 << 63; const HIGHEST_BIT: u64 = 1 << 63;

View File

@@ -12,12 +12,15 @@ pub mod bitpacked;
pub mod blockwise_linear; pub mod blockwise_linear;
pub mod linear; pub mod linear;
pub trait FastFieldCodecReader: Sized { pub trait FastFieldDataAccess {
/// reads the metadata and returns the CodecReader fn get_val(&self, doc: u64) -> u64;
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
fn min_value(&self) -> u64; fn min_value(&self) -> u64;
fn max_value(&self) -> u64; fn max_value(&self) -> u64;
fn num_vals(&self) -> u64;
/// Returns a iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
} }
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)] #[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
@@ -60,11 +63,25 @@ impl FastFieldCodecType {
/// The FastFieldSerializerEstimate trait is required on all variants /// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose. /// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer { pub trait FastFieldCodec {
/// A codex needs to provide a unique name and id, which is /// A codex needs to provide a unique name and id, which is
/// used for debugging and de/serialization. /// used for debugging and de/serialization.
const CODEC_TYPE: FastFieldCodecType; const CODEC_TYPE: FastFieldCodecType;
type Reader: FastFieldDataAccess;
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
/// Serializes the data using the serializer into write.
///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;
/// Check if the Codec is able to compress the data /// Check if the Codec is able to compress the data
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool; fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess) -> bool;
@@ -74,40 +91,6 @@ pub trait FastFieldCodecSerializer {
/// It could make sense to also return a value representing /// It could make sense to also return a value representing
/// computational complexity. /// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32; fn estimate(fastfield_accessor: &impl FastFieldDataAccess) -> f32;
/// Serializes the data using the serializer into write.
///
/// The fastfield_accessor iterator should be preferred over using fastfield_accessor for
/// performance reasons.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &dyn FastFieldDataAccess,
) -> io::Result<()>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
/// Returns a iterator over the data
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_>;
/// min value of the data
fn min_value(&self) -> u64;
/// max value of the data
fn max_value(&self) -> u64;
/// num vals
fn num_vals(&self) -> u64;
} }
#[derive(Debug, Clone)] #[derive(Debug, Clone)]
@@ -165,26 +148,24 @@ mod tests {
use proptest::arbitrary::any; use proptest::arbitrary::any;
use proptest::proptest; use proptest::proptest;
use crate::bitpacked::{BitpackedReader, BitpackedSerializer}; use crate::bitpacked::BitpackedCodec;
use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer}; use crate::blockwise_linear::BlockwiseLinearCodec;
use crate::linear::{LinearReader, LinearSerializer}; use crate::linear::LinearCodec;
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>( pub fn create_and_validate<Codec: FastFieldCodec>(data: &[u64], name: &str) -> (f32, f32) {
data: &[u64], if !Codec::is_applicable(&data) {
name: &str,
) -> (f32, f32) {
if !S::is_applicable(&data) {
return (f32::MAX, 0.0); return (f32::MAX, 0.0);
} }
let estimation = S::estimate(&data); let estimation = Codec::estimate(&data);
let mut out: Vec<u8> = Vec::new(); let mut out: Vec<u8> = Vec::new();
S::serialize(&mut out, &data).unwrap(); Codec::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0); let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap(); let reader = Codec::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().enumerate() { for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64); let val = reader.get_val(doc as u64);
if val != *orig_val { if val != *orig_val {
panic!( panic!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \ "val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -198,16 +179,16 @@ mod tests {
proptest! { proptest! {
#[test] #[test]
fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) { fn test_proptest_small(data in proptest::collection::vec(any::<u64>(), 1..10)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol"); create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol"); create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked"); create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
} }
#[test] #[test]
fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) { fn test_proptest_large(data in proptest::collection::vec(any::<u64>(), 1..6000)) {
create_and_validate::<LinearSerializer, LinearReader>(&data, "proptest linearinterpol"); create_and_validate::<LinearCodec>(&data, "proptest linearinterpol");
create_and_validate::<BlockwiseLinearSerializer, BlockwiseLinearReader>(&data, "proptest multilinearinterpol"); create_and_validate::<BlockwiseLinearCodec>(&data, "proptest multilinearinterpol");
create_and_validate::<BitpackedSerializer, BitpackedReader>(&data, "proptest bitpacked"); create_and_validate::<BitpackedCodec>(&data, "proptest bitpacked");
} }
} }
@@ -228,10 +209,10 @@ mod tests {
data_and_names data_and_names
} }
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() { fn test_codec<C: FastFieldCodec>() {
let codec_name = format!("{:?}", S::CODEC_TYPE); let codec_name = format!("{:?}", C::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() { for (data, dataset_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate::<S, R>(&data, dataset_name); let (estimate, actual) = crate::tests::create_and_validate::<C>(&data, dataset_name);
let result = if estimate == f32::MAX { let result = if estimate == f32::MAX {
"Disabled".to_string() "Disabled".to_string()
} else { } else {
@@ -242,15 +223,15 @@ mod tests {
} }
#[test] #[test]
fn test_codec_bitpacking() { fn test_codec_bitpacking() {
test_codec::<BitpackedSerializer, BitpackedReader>(); test_codec::<BitpackedCodec>();
} }
#[test] #[test]
fn test_codec_interpolation() { fn test_codec_interpolation() {
test_codec::<LinearSerializer, LinearReader>(); test_codec::<LinearCodec>();
} }
#[test] #[test]
fn test_codec_multi_interpolation() { fn test_codec_multi_interpolation() {
test_codec::<BlockwiseLinearSerializer, BlockwiseLinearReader>(); test_codec::<BlockwiseLinearCodec>();
} }
use super::*; use super::*;
@@ -259,24 +240,24 @@ mod tests {
fn estimation_good_interpolation_case() { fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>(); let data = (10..=20000_u64).collect::<Vec<_>>();
let linear_interpol_estimation = LinearSerializer::estimate(&data); let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.01); assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation = BlockwiseLinearSerializer::estimate(&data); let multi_linear_interpol_estimation = BlockwiseLinearCodec::estimate(&data);
assert_le!(multi_linear_interpol_estimation, 0.2); assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation); assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(linear_interpol_estimation, bitpacked_estimation); assert_le!(linear_interpol_estimation, bitpacked_estimation);
} }
#[test] #[test]
fn estimation_test_bad_interpolation_case() { fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20]; let data = vec![200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation = LinearSerializer::estimate(&data); let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.32); assert_le!(linear_interpol_estimation, 0.32);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(bitpacked_estimation, linear_interpol_estimation); assert_le!(bitpacked_estimation, linear_interpol_estimation);
} }
#[test] #[test]
@@ -286,10 +267,10 @@ mod tests {
// in this case the linear interpolation can't in fact not be worse than bitpacking, // in this case the linear interpolation can't in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to estimated worse behavior // but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation = LinearSerializer::estimate(&data); let linear_interpol_estimation = LinearCodec::estimate(&data);
assert_le!(linear_interpol_estimation, 0.35); assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation = BitpackedSerializer::estimate(&data); let bitpacked_estimation = BitpackedCodec::estimate(&data);
assert_le!(bitpacked_estimation, 0.32); assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation); assert_le!(bitpacked_estimation, linear_interpol_estimation);
} }

View File

@@ -5,9 +5,7 @@ use common::{BinarySerializable, FixedSize};
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker}; use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{ use crate::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different /// Depending on the field type, a different
/// fast field is required. /// fast field is required.
@@ -59,24 +57,9 @@ impl FixedSize for LinearFooter {
const SIZE_IN_BYTES: usize = 56; const SIZE_IN_BYTES: usize = 56;
} }
impl FastFieldCodecReader for LinearReader { impl FastFieldDataAccess for LinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
data,
bit_unpacker,
footer,
slope,
})
}
#[inline] #[inline]
fn get_u64(&self, doc: u64) -> u64 { fn get_val(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope); let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset (calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
} }
@@ -89,11 +72,15 @@ impl FastFieldCodecReader for LinearReader {
fn max_value(&self) -> u64 { fn max_value(&self) -> u64 {
self.footer.max_value self.footer.max_value
} }
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
} }
/// Fastfield serializer, which tries to guess values by linear interpolation /// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked. /// and stores the difference bitpacked.
pub struct LinearSerializer {} pub struct LinearCodec;
#[inline] #[inline]
pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 { pub(crate) fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f32 {
@@ -134,9 +121,27 @@ pub fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
} }
} }
impl FastFieldCodecSerializer for LinearSerializer { impl FastFieldCodec for LinearCodec {
const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear; const CODEC_TYPE: FastFieldCodecType = FastFieldCodecType::Linear;
type Reader = LinearReader;
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
let (data, mut footer) = bytes.split(footer_offset);
let footer = LinearFooter::deserialize(&mut footer)?;
let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
let num_bits = compute_num_bits(footer.relative_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearReader {
data,
bit_unpacker,
footer,
slope,
})
}
/// Creates a new fast field serializer. /// Creates a new fast field serializer.
fn serialize( fn serialize(
write: &mut impl Write, write: &mut impl Write,
@@ -260,7 +265,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets; use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) { fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<LinearSerializer, LinearReader>(data, name) crate::tests::create_and_validate::<LinearCodec, LinearReader>(data, name)
} }
#[test] #[test]

View File

@@ -1,8 +1,8 @@
#[macro_use] #[macro_use]
extern crate prettytable; extern crate prettytable;
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearSerializer; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldCodecType, FastFieldStats}; use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldStats};
use prettytable::{Cell, Row, Table}; use prettytable::{Cell, Row, Table};
fn main() { fn main() {
@@ -13,11 +13,11 @@ fn main() {
for (data, data_set_name) in get_codec_test_data_sets() { for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![]; let mut results = vec![];
let res = serialize_with_codec::<LinearSerializer>(&data); let res = serialize_with_codec::<LinearCodec>(&data);
results.push(res); results.push(res);
let res = serialize_with_codec::<BlockwiseLinearSerializer>(&data); let res = serialize_with_codec::<BlockwiseLinearCodec>(&data);
results.push(res); results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedSerializer>(&data); let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedCodec>(&data);
results.push(res); results.push(res);
// let best_estimation_codec = results // let best_estimation_codec = results
@@ -89,19 +89,19 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
data_and_names data_and_names
} }
pub fn serialize_with_codec<S: FastFieldCodecSerializer>( pub fn serialize_with_codec<C: FastFieldCodec>(
data: &[u64], data: &[u64],
) -> (bool, f32, f32, FastFieldCodecType) { ) -> (bool, f32, f32, FastFieldCodecType) {
let is_applicable = S::is_applicable(&data); let is_applicable = C::is_applicable(&data);
if !is_applicable { if !is_applicable {
return (false, 0.0, 0.0, S::CODEC_TYPE); return (false, 0.0, 0.0, C::CODEC_TYPE);
} }
let estimation = S::estimate(&data); let estimation = C::estimate(&data);
let mut out = vec![]; let mut out = vec![];
S::serialize(&mut out, &data).unwrap(); C::serialize(&mut out, &data).unwrap();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32; let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::CODEC_TYPE) (true, estimation, actual_compression, C::CODEC_TYPE)
} }
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats { pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {

View File

@@ -3,7 +3,7 @@ use std::num::NonZeroU64;
use common::BinarySerializable; use common::BinarySerializable;
use fastdivide::DividerU64; use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader; use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess};
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1; pub const GCD_DEFAULT: u64 = 1;
@@ -12,47 +12,82 @@ pub const GCD_DEFAULT: u64 = 1;
/// ///
/// Holds the data and the codec to the read the data. /// Holds the data and the codec to the read the data.
#[derive(Clone)] #[derive(Clone)]
pub struct GCDFastFieldCodec<CodecReader> { pub struct GCDReader<CodecReader: FastFieldDataAccess> {
gcd: u64, gcd_params: GCDParams,
min_value: u64,
reader: CodecReader, reader: CodecReader,
} }
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> { #[derive(Debug, Clone, Copy)]
/// Opens a fast field given the bytes. struct GCDParams {
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> { gcd: u64,
let footer_offset = bytes.len() - 16; min_value: u64,
let (body, mut footer) = bytes.split(footer_offset); num_vals: u64,
let gcd = u64::deserialize(&mut footer)?; }
let min_value = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
reader,
})
}
#[inline] impl GCDParams {
fn get_u64(&self, doc: u64) -> u64 { pub fn eval(&self, val: u64) -> u64 {
let mut data = self.reader.get_u64(doc); self.min_value + self.gcd * val
data *= self.gcd;
data += self.min_value;
data
}
fn min_value(&self) -> u64 {
self.min_value + self.reader.min_value() * self.gcd
}
fn max_value(&self) -> u64 {
self.min_value + self.reader.max_value() * self.gcd
} }
} }
pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> { impl BinarySerializable for GCDParams {
fn serialize<W: Write>(&self, writer: &mut W) -> io::Result<()> {
self.gcd.serialize(writer)?;
self.min_value.serialize(writer)?;
self.num_vals.serialize(writer)?;
Ok(())
}
fn deserialize<R: io::Read>(reader: &mut R) -> io::Result<Self> {
let gcd: u64 = u64::deserialize(reader)?;
let min_value: u64 = u64::deserialize(reader)?;
let num_vals: u64 = u64::deserialize(reader)?;
Ok(Self {
gcd,
min_value,
num_vals,
})
}
}
/// Opens a GCD-wrapped fast field from its serialized bytes.
///
/// The trailing 24 bytes of `bytes` are the GCD footer, which is deserialized
/// as a `GCDParams` (three `u64`s: `gcd`, `min_value`, `num_vals`). Everything
/// before the footer is handed to `WrappedCodec::open_from_bytes`.
///
/// # Errors
///
/// Returns an `InvalidData` error when `bytes` is too short to contain the
/// footer, instead of panicking on the length subtraction. Errors from
/// deserializing the footer or from opening the wrapped codec are propagated.
pub fn open_gcd_from_bytes<WrappedCodec: FastFieldCodec>(
    bytes: OwnedBytes,
) -> io::Result<GCDReader<WrappedCodec::Reader>> {
    /// Serialized size of the GCD footer, in bytes.
    const GCD_FOOTER_LEN: usize = 24;

    // Truncated or corrupted data must surface as an error, not as an
    // arithmetic underflow.
    let footer_offset = bytes.len().checked_sub(GCD_FOOTER_LEN).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "fast field data is too short to contain the GCD footer",
        )
    })?;
    let (body, mut footer) = bytes.split(footer_offset);
    let gcd_params = GCDParams::deserialize(&mut footer)?;
    let reader: WrappedCodec::Reader = WrappedCodec::open_from_bytes(body)?;
    Ok(GCDReader { gcd_params, reader })
}
impl<C: FastFieldDataAccess + Clone> FastFieldDataAccess for GCDReader<C> {
#[inline]
fn get_val(&self, doc: u64) -> u64 {
let val = self.reader.get_val(doc);
self.gcd_params.eval(val)
}
fn min_value(&self) -> u64 {
self.gcd_params.eval(self.reader.min_value())
}
fn max_value(&self) -> u64 {
self.gcd_params.eval(self.reader.max_value())
}
fn num_vals(&self) -> u64 {
self.gcd_params.num_vals
}
}
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?; gcd.serialize(field_write)?;
min_value.serialize(field_write)?; min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(()) Ok(())
} }

View File

@@ -26,7 +26,7 @@ pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveB
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter}; pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
pub use self::error::{FastFieldNotAvailableError, Result}; pub use self::error::{FastFieldNotAvailableError, Result};
pub use self::facet_reader::FacetReader; pub use self::facet_reader::FacetReader;
pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_DEFAULT}; pub(crate) use self::gcd::{find_gcd, GCDReader, GCD_DEFAULT};
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter}; pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
pub use self::reader::{DynamicFastFieldReader, FastFieldReader}; pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
pub use self::readers::FastFieldReaders; pub use self::readers::FastFieldReaders;
@@ -326,7 +326,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37); assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap(); let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?; let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
@@ -357,7 +357,7 @@ mod tests {
serializer.close()?; serializer.close()?;
} }
let file = directory.open_read(path)?; let file = directory.open_read(path)?;
assert_eq!(file.len(), 62); assert_eq!(file.len(), 70);
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -393,7 +393,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35); assert_eq!(file.len(), 43);
{ {
let fast_fields_composite = CompositeFile::open(&file).unwrap(); let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -425,7 +425,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043); assert_eq!(file.len(), 80051);
{ {
let fast_fields_composite = CompositeFile::open(&file)?; let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap(); let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -896,7 +896,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 36); assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -932,7 +932,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 48); assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -966,7 +966,7 @@ mod tests {
serializer.close().unwrap(); serializer.close().unwrap();
} }
let file = directory.open_read(path).unwrap(); let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35); assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?; let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap(); let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?; let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;

View File

@@ -3,15 +3,16 @@ use std::marker::PhantomData;
use std::path::Path; use std::path::Path;
use common::BinarySerializable; use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedReader; use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedReader};
use fastfield_codecs::blockwise_linear::BlockwiseLinearReader; use fastfield_codecs::blockwise_linear::{BlockwiseLinearCodec, BlockwiseLinearReader};
use fastfield_codecs::linear::LinearReader; use fastfield_codecs::linear::{LinearCodec, LinearReader};
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecType}; use fastfield_codecs::{FastFieldCodec, FastFieldCodecType, FastFieldDataAccess};
use super::{FastValue, GCDFastFieldCodec}; use super::gcd::open_gcd_from_bytes;
use super::FastValue;
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr}; use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter}; use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter, GCDReader};
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::DocId; use crate::DocId;
@@ -68,11 +69,11 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>), BlockwiseLinear(FastFieldReaderCodecWrapper<Item, BlockwiseLinearReader>),
/// GCD and Bitpacked compressed fastfield data. /// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BitpackedReader>>), BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked /// GCD and Linear interpolated values + bitpacked
LinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<LinearReader>>), LinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<LinearReader>>),
/// GCD and Blockwise linear interpolated values + bitpacked /// GCD and Blockwise linear interpolated values + bitpacked
BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BlockwiseLinearReader>>), BlockwiseLinearGCD(FastFieldReaderCodecWrapper<Item, GCDReader<BlockwiseLinearReader>>),
} }
impl<Item: FastValue> DynamicFastFieldReader<Item> { impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -83,46 +84,27 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
) -> crate::Result<DynamicFastFieldReader<Item>> { ) -> crate::Result<DynamicFastFieldReader<Item>> {
let reader = match codec_type { let reader = match codec_type {
FastFieldCodecType::Bitpacked => { FastFieldCodecType::Bitpacked => {
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::< DynamicFastFieldReader::Bitpacked(BitpackedCodec::open_from_bytes(bytes)?.into())
Item,
BitpackedReader,
>::open_from_bytes(bytes)?)
} }
FastFieldCodecType::Linear => DynamicFastFieldReader::Linear( FastFieldCodecType::Linear => {
FastFieldReaderCodecWrapper::<Item, LinearReader>::open_from_bytes(bytes)?, DynamicFastFieldReader::Linear(LinearCodec::open_from_bytes(bytes)?.into())
}
FastFieldCodecType::BlockwiseLinear => DynamicFastFieldReader::BlockwiseLinear(
BlockwiseLinearCodec::open_from_bytes(bytes)?.into(),
), ),
FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinear(FastFieldReaderCodecWrapper::<
Item,
BlockwiseLinearReader,
>::open_from_bytes(bytes)?)
}
FastFieldCodecType::Gcd => { FastFieldCodecType::Gcd => {
let codec_type = FastFieldCodecType::deserialize(&mut bytes)?; let codec_type = FastFieldCodecType::deserialize(&mut bytes)?;
match codec_type { match codec_type {
FastFieldCodecType::Bitpacked => { FastFieldCodecType::Bitpacked => DynamicFastFieldReader::BitpackedGCD(
DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::< open_gcd_from_bytes::<BitpackedCodec>(bytes)?.into(),
Item, ),
GCDFastFieldCodec<BitpackedReader>, FastFieldCodecType::Linear => DynamicFastFieldReader::LinearGCD(
>::open_from_bytes( open_gcd_from_bytes::<LinearCodec>(bytes)?.into(),
bytes ),
)?)
}
FastFieldCodecType::Linear => {
DynamicFastFieldReader::LinearGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<LinearReader>,
>::open_from_bytes(
bytes
)?)
}
FastFieldCodecType::BlockwiseLinear => { FastFieldCodecType::BlockwiseLinear => {
DynamicFastFieldReader::BlockwiseLinearGCD(FastFieldReaderCodecWrapper::< DynamicFastFieldReader::BlockwiseLinearGCD(
Item, open_gcd_from_bytes::<BlockwiseLinearCodec>(bytes)?.into(),
GCDFastFieldCodec<BlockwiseLinearReader>, )
>::open_from_bytes(
bytes
)?)
} }
FastFieldCodecType::Gcd => { FastFieldCodecType::Gcd => {
return Err(DataCorruption::comment_only( return Err(DataCorruption::comment_only(
@@ -199,34 +181,21 @@ pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
_phantom: PhantomData<Item>, _phantom: PhantomData<Item>,
} }
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> { impl<Item: FastValue, CodecReader> From<CodecReader>
/// Opens a fast field given a file. for FastFieldReaderCodecWrapper<Item, CodecReader>
pub fn open(file: FileSlice) -> crate::Result<Self> { {
let mut bytes = file.read_bytes()?; fn from(reader: CodecReader) -> Self {
let codec_code = bytes.read_u8(); FastFieldReaderCodecWrapper {
let codec_type = FastFieldCodecType::from_code(codec_code).ok_or_else(|| {
DataCorruption::comment_only("Unknown codec code does not exist `{codec_code}`")
})?;
assert_eq!(
FastFieldCodecType::Bitpacked,
codec_type,
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
different id"
);
Self::open_from_bytes(bytes)
}
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes)?;
Ok(FastFieldReaderCodecWrapper {
reader, reader,
_phantom: PhantomData, _phantom: PhantomData,
}) }
} }
}
impl<Item: FastValue, D: FastFieldDataAccess> FastFieldReaderCodecWrapper<Item, D> {
#[inline] #[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item { pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc); let data = self.reader.get_val(doc);
Item::from_u64(data) Item::from_u64(data)
} }
@@ -249,7 +218,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
} }
} }
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item> impl<Item: FastValue, C: FastFieldDataAccess + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C> for FastFieldReaderCodecWrapper<Item, C>
{ {
/// Return the value associated to the given document. /// Return the value associated to the given document.

View File

@@ -3,11 +3,11 @@ use std::num::NonZeroU64;
use common::{BinarySerializable, CountingWriter}; use common::{BinarySerializable, CountingWriter};
use fastdivide::DividerU64; use fastdivide::DividerU64;
pub use fastfield_codecs::bitpacked::{BitpackedSerializer, BitpackedSerializerLegacy}; pub use fastfield_codecs::bitpacked::{BitpackedCodec, BitpackedSerializerLegacy};
use fastfield_codecs::blockwise_linear::BlockwiseLinearSerializer; use fastfield_codecs::blockwise_linear::BlockwiseLinearCodec;
use fastfield_codecs::linear::LinearSerializer; use fastfield_codecs::linear::LinearCodec;
use fastfield_codecs::FastFieldCodecType; use fastfield_codecs::FastFieldCodecType;
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats}; pub use fastfield_codecs::{FastFieldCodec, FastFieldDataAccess, FastFieldStats};
use super::{find_gcd, ALL_CODECS, GCD_DEFAULT}; use super::{find_gcd, ALL_CODECS, GCD_DEFAULT};
use crate::directory::{CompositeWrite, WritePtr}; use crate::directory::{CompositeWrite, WritePtr};
@@ -64,15 +64,15 @@ impl From<FastFieldCodecType> for FastFieldCodecEnableCheck {
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait // use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
// https://github.com/rust-lang/rust/pull/86176 // https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>( fn codec_estimation<C: FastFieldCodec, A: FastFieldDataAccess>(
fastfield_accessor: &A, fastfield_accessor: &A,
estimations: &mut Vec<(f32, FastFieldCodecType)>, estimations: &mut Vec<(f32, FastFieldCodecType)>,
) { ) {
if !T::is_applicable(fastfield_accessor) { if !C::is_applicable(fastfield_accessor) {
return; return;
} }
let ratio = T::estimate(fastfield_accessor); let ratio = C::estimate(fastfield_accessor);
estimations.push((ratio, T::CODEC_TYPE)); estimations.push((ratio, C::CODEC_TYPE));
} }
impl CompositeFastFieldSerializer { impl CompositeFastFieldSerializer {
@@ -189,7 +189,7 @@ impl CompositeFastFieldSerializer {
field_write, field_write,
fastfield_accessor, fastfield_accessor,
)?; )?;
write_gcd_header(field_write, base_value, gcd)?; write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(()) Ok(())
} }
@@ -204,13 +204,13 @@ impl CompositeFastFieldSerializer {
let mut estimations = vec![]; let mut estimations = vec![];
if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) { if codec_enable_checker.is_enabled(FastFieldCodecType::Bitpacked) {
codec_estimation::<BitpackedSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BitpackedCodec, _>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) { if codec_enable_checker.is_enabled(FastFieldCodecType::Linear) {
codec_estimation::<LinearSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<LinearCodec, _>(&fastfield_accessor, &mut estimations);
} }
if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) { if codec_enable_checker.is_enabled(FastFieldCodecType::BlockwiseLinear) {
codec_estimation::<BlockwiseLinearSerializer, _>(&fastfield_accessor, &mut estimations); codec_estimation::<BlockwiseLinearCodec, _>(&fastfield_accessor, &mut estimations);
} }
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan()) if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{ {
@@ -229,13 +229,13 @@ impl CompositeFastFieldSerializer {
Self::write_header(field_write, codec_type)?; Self::write_header(field_write, codec_type)?;
match codec_type { match codec_type {
FastFieldCodecType::Bitpacked => { FastFieldCodecType::Bitpacked => {
BitpackedSerializer::serialize(field_write, &fastfield_accessor)?; BitpackedCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::Linear => { FastFieldCodecType::Linear => {
LinearSerializer::serialize(field_write, &fastfield_accessor)?; LinearCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::BlockwiseLinear => { FastFieldCodecType::BlockwiseLinear => {
BlockwiseLinearSerializer::serialize(field_write, &fastfield_accessor)?; BlockwiseLinearCodec::serialize(field_write, &fastfield_accessor)?;
} }
FastFieldCodecType::Gcd => { FastFieldCodecType::Gcd => {
return Err(io::Error::new( return Err(io::Error::new(

View File

@@ -2,12 +2,13 @@ use std::collections::HashMap;
use std::io; use std::io;
use common; use common;
use fastfield_codecs::FastFieldDataAccess;
use fnv::FnvHashMap; use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker; use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter; use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats; use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType, FastValue}; use super::{FastFieldType, FastValue};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer}; use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping; use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId; use crate::postings::UnorderedTermId;