refactor, add fastfield metadata to footer

change api to fastfield reader in codec crate
add fastfield metadata to footer
remove old code
merge codec files
This commit is contained in:
Pascal Seitz
2021-06-03 11:45:07 +02:00
parent 3298d6cb71
commit aefd0fc907
13 changed files with 452 additions and 536 deletions

View File

@@ -56,12 +56,6 @@ impl OwnedBytes {
self.data
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn into_slice(self) -> &'static [u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {

View File

@@ -107,7 +107,8 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader<Item: FastValue> {
reader: BitpackedReader<'static>,
reader: BitpackedReader,
bytes: OwnedBytes,
_phantom: PhantomData<Item>,
}
@@ -118,16 +119,17 @@ impl<Item: FastValue> BitpackedFastFieldReader<Item> {
let _id = u8::deserialize(&mut bytes)?;
Self::open_from_bytes(bytes)
}
/// Opens a fast field given a file.
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = BitpackedReader::open_from_bytes(bytes.into_slice())?;
let reader = BitpackedReader::open_from_bytes(bytes.as_slice())?;
Ok(BitpackedFastFieldReader {
reader,
bytes,
_phantom: PhantomData,
})
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.reader.get_u64(doc))
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
}
/// Internally `multivalued` also uses SingleValue Fast fields.

View File

@@ -1,97 +0,0 @@
use super::FastFieldDataAccess;
use super::FastFieldSerializer;
use super::FastFieldSerializerEstimate;
use super::FastFieldStats;
use crate::common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// Streaming serializer that bitpacks `u64` fast field values.
///
/// Each value is stored as `val - min_value` using `num_bits` bits, and the
/// packed output is pushed through `bit_packer` into the borrowed writer.
pub struct BitpackedFastFieldSerializer<'a, W>
where
    W: 'a + Write,
{
    /// Bit packer the shifted values are written through.
    bit_packer: BitPacker,
    /// Borrowed output the header and packed values are written to.
    write: &'a mut W,
    /// Value subtracted from every input before it is packed.
    min_value: u64,
    /// Number of bits used per packed value, derived from `max_value - min_value`.
    num_bits: u8,
}
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
    /// Starts a bitpacked fast field on `write` and returns the serializer.
    ///
    /// The header consists of `min_value` followed by the amplitude
    /// (`max_value - min_value`). Subsequent values are expected to be pushed
    /// with `add_val`, which stores `val - min_value` using the minimum
    /// number of bits able to represent the amplitude.
    ///
    /// # Panics
    ///
    /// Panics if `min_value > max_value`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised while writing the header.
    pub(crate) fn open(
        write: &'a mut W,
        min_value: u64,
        max_value: u64,
    ) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
        assert!(min_value <= max_value);
        let amplitude = max_value - min_value;

        // Header: minimum first, then the amplitude the bit width is derived from.
        min_value.serialize(write)?;
        amplitude.serialize(write)?;

        Ok(BitpackedFastFieldSerializer {
            bit_packer: BitPacker::new(),
            write,
            min_value,
            num_bits: compute_num_bits(amplitude),
        })
    }

    /// Serializes a whole fast field in one call.
    ///
    /// Opens a serializer from `stats.min_value` / `stats.max_value`, pushes
    /// every value yielded by `data_iter`, then closes the field.
    /// `fastfield_accessor` is accepted but not read by this codec.
    ///
    /// # Errors
    ///
    /// Returns the first I/O error raised while writing the header, a value,
    /// or the closing bytes.
    pub(crate) fn create(
        write: &'a mut W,
        fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        mut data_iter: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
        data_iter.try_for_each(|val| serializer.add_val(val))?;
        serializer.close_field()
    }
}
impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
    /// Appends `val` to the currently open u64 fast field.
    ///
    /// The value is stored relative to `min_value`, using `num_bits` bits.
    fn add_val(&mut self, val: u64) -> io::Result<()> {
        self.bit_packer
            .write(val - self.min_value, self.num_bits, &mut self.write)
    }

    /// Finishes the field by closing the bit packer on the underlying writer.
    fn close_field(self) -> io::Result<()> {
        let Self {
            mut bit_packer,
            mut write,
            ..
        } = self;
        bit_packer.close(&mut write)
    }
}
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
    /// Estimates the compression ratio of this codec for a column.
    ///
    /// The ratio is the number of bits needed for `max_value - min_value`
    /// divided by 64 (the width of an unpacked value). It is returned
    /// together with the codec name. The accessor is not consulted.
    fn estimate(
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> (f32, &'static str) {
        const UNCOMPRESSED_BITS: f32 = 64.0;
        let packed_bits = compute_num_bits(stats.max_value - stats.min_value);
        let (codec_name, _codec_code) = Self::codec_id();
        (packed_bits as f32 / UNCOMPRESSED_BITS, codec_name)
    }

    /// Returns the codec name and its numeric identifier.
    fn codec_id() -> (&'static str, u8) {
        ("Bitpacked", 1)
    }
}

View File

@@ -1,78 +0,0 @@
use super::FastFieldDataAccess;
use super::FastFieldSerializerEstimate;
use super::FastFieldStats;
use crate::common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// Fast field serializer that predicts each value by linear interpolation
/// and stores the difference between the actual and the predicted value.
///
/// The type carries no state: serialization is exposed through associated
/// functions only.
pub struct LinearInterpolFastFieldSerializer {}
impl LinearInterpolFastFieldSerializer {
    /// Serializes a fast field column as residuals of a linear interpolation.
    ///
    /// The value at position `pos` is predicted as
    /// `min_value + pos * step`, with
    /// `step = (max_value - min_value) / (num_vals - 1)`. What gets bitpacked
    /// is `(val + offset) - predicted`, where `offset` is chosen so that no
    /// residual is negative.
    ///
    /// Bytes written, in order: `min_value`, the residual amplitude, `offset`,
    /// `min_value` again, then the bitpacked residuals.
    ///
    /// `data_iter`, `data_iter1` and `data_iter2` must each yield the same
    /// column values in the same order; three passes are needed (offset,
    /// amplitude, payload). `_fastfield_accessor` is not read.
    ///
    /// # Panics
    ///
    /// Panics if `stats.min_value > stats.max_value`.
    ///
    /// # Errors
    ///
    /// Returns any I/O error raised by the underlying writer.
    pub(crate) fn create(
        write: &mut impl Write,
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
        data_iter: impl Iterator<Item = u64>,
        data_iter1: impl Iterator<Item = u64>,
        data_iter2: impl Iterator<Item = u64>,
    ) -> io::Result<()> {
        assert!(stats.min_value <= stats.max_value);

        // With zero or one value there is no slope to interpolate. Guarding here
        // avoids the integer underflow of `num_vals - 1` on an empty column and
        // the division by zero on a single-value column.
        let num_intervals = (stats.num_vals as u64).saturating_sub(1);
        let step = if num_intervals == 0 {
            0.0
        } else {
            (stats.max_value - stats.min_value) as f64 / num_intervals as f64
        };
        let min_value = stats.min_value;
        // Predicted value at a given position on the interpolation line.
        let interpolate = move |pos: usize| min_value + (pos as f64 * step) as u64;

        // Offset that lifts the most negative residual up to zero.
        // NOTE(review): when every residual is already positive, `abs()` still
        // yields a non-zero offset. That is harmless for decoding but widens
        // the amplitude — confirm whether clamping to 0 is wanted.
        let offset = data_iter1
            .enumerate()
            .map(|(pos, val)| val as i64 - interpolate(pos) as i64)
            .min()
            .unwrap_or(0)
            .abs() as u64;

        // Largest shifted residual; it determines the bit width.
        let rel_max = data_iter2
            .enumerate()
            .map(|(pos, val)| (val + offset) - interpolate(pos))
            .max()
            .unwrap_or(0);

        stats.min_value.serialize(write)?;
        let amplitude = rel_max;
        amplitude.serialize(write)?;
        offset.serialize(write)?;
        // NOTE(review): `min_value` is written a second time here. Kept so the
        // header layout is unchanged — confirm against the reader before removing.
        stats.min_value.serialize(write)?;

        let num_bits = compute_num_bits(amplitude);
        let mut bit_packer = BitPacker::new();
        for (pos, val) in data_iter.enumerate() {
            // Pack the shifted residual, not the raw value: `num_bits` was sized
            // for the residual amplitude and raw values generally do not fit.
            let residual = (val + offset) - interpolate(pos);
            bit_packer.write(residual, num_bits, write)?;
        }
        bit_packer.close(write)?;
        Ok(())
    }
}
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
    /// Estimates the compression ratio of this codec for a column.
    ///
    /// The ratio is the number of bits needed for `max_value - min_value`
    /// divided by 64, returned together with the codec name. Neither the
    /// accessor nor the interpolation residuals are taken into account.
    fn estimate(
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> (f32, &'static str) {
        const UNCOMPRESSED_BITS: f32 = 64.0;
        let packed_bits = compute_num_bits(stats.max_value - stats.min_value);
        let (codec_name, _codec_code) = Self::codec_id();
        (packed_bits as f32 / UNCOMPRESSED_BITS, codec_name)
    }

    /// Returns the codec name and its numeric identifier.
    fn codec_id() -> (&'static str, u8) {
        ("LinearInterpol", 2)
    }
}

View File

@@ -1,12 +1,9 @@
mod bitpacked;
mod linearinterpol;
use crate::common::BinarySerializable;
use crate::common::CompositeWrite;
use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use crate::DocId;
use fastfield_codecs::CodecId;
//pub use bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
@@ -57,13 +54,14 @@ impl CompositeFastFieldSerializer {
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
let (_ratio, (name, id)) = (
let (_ratio, name, id) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&fastfield_accessor, stats.clone()),
BitpackedFastFieldSerializer::<Vec<u8>>::codec_id(),
BitpackedFastFieldSerializer::<Vec<u8>>::NAME,
BitpackedFastFieldSerializer::<Vec<u8>>::ID,
);
id.serialize(field_write)?;
if name == BitpackedFastFieldSerializer::<Vec<u8>>::codec_id().0 {
if name == BitpackedFastFieldSerializer::<Vec<u8>>::NAME {
BitpackedFastFieldSerializer::create(
field_write,
&fastfield_accessor,
@@ -97,7 +95,7 @@ impl CompositeFastFieldSerializer {
) -> io::Result<BitpackedFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
let (_name, id) = BitpackedFastFieldSerializer::<Vec<u8>>::codec_id();
let id = BitpackedFastFieldSerializer::<Vec<u8>>::ID;
id.serialize(field_write)?;
BitpackedFastFieldSerializer::open(field_write, min_value, max_value)
}