mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 16:22:55 +00:00
refactor, add fastfield metadata to footer
change api to fastfield reader in codec crate add fastfield metadata to footer remove old code merge codec files
This commit is contained in:
187
fastfield_codecs/src/bitpacked.rs
Normal file
187
fastfield_codecs/src/bitpacked.rs
Normal file
@@ -0,0 +1,187 @@
|
||||
use crate::CodecId;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let amplitude = u64::deserialize(&mut footer)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
|
||||
}
|
||||
}
|
||||
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
amplitude: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
let amplitude = max_value - min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializer {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
amplitude,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn create(
|
||||
write: &'a mut W,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
#[inline]
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)?;
|
||||
self.min_value.serialize(&mut self.write)?;
|
||||
self.amplitude.serialize(&mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::NAME;
|
||||
(ratio, name)
|
||||
}
|
||||
}
|
||||
impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> {
|
||||
const NAME: &'static str = "Bitpacked";
|
||||
const ID: u8 = 1;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn create_and_validate(data: &[u64]) {
|
||||
let mut out = vec![];
|
||||
BitpackedFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = BitpackedFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, val) in data.iter().enumerate() {
|
||||
assert_eq!(reader.get_u64(doc as u64, &out), *val);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bitpacked_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
create_and_validate(&data);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bitpacked_fast_field_test_with_offset() {
|
||||
//let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
|
||||
let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn bitpacked_fast_field_test_no_structure() {
|
||||
let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn bitpacked_fast_field_rand() {
|
||||
for _ in 0..500 {
|
||||
let mut data = (0..1 + rand::random::<u8>() as usize)
|
||||
.map(|_| rand::random::<i64>() as u64 / 2 as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,5 +0,0 @@
|
||||
mod reader;
|
||||
mod serialize;
|
||||
|
||||
pub use reader::BitpackedFastFieldReader;
|
||||
pub use serialize::BitpackedFastFieldSerializer;
|
||||
@@ -1,33 +0,0 @@
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader<'data> {
|
||||
bytes: &'data [u8],
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub min_value_u64: u64,
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl<'data> BitpackedFastFieldReader<'data> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
|
||||
let min_value = u64::deserialize(&mut bytes)?;
|
||||
let amplitude = u64::deserialize(&mut bytes)?;
|
||||
let max_value = min_value + amplitude;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(BitpackedFastFieldReader {
|
||||
bytes,
|
||||
min_value_u64: min_value,
|
||||
max_value_u64: max_value,
|
||||
bit_unpacker,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64) -> u64 {
|
||||
self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes)
|
||||
}
|
||||
}
|
||||
@@ -1,93 +0,0 @@
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializer {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub fn create(
|
||||
write: &'a mut W,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
pub fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("Bitpacked", 1)
|
||||
}
|
||||
}
|
||||
@@ -35,8 +35,15 @@ pub trait FastFieldSerializerEstimate {
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str);
|
||||
/// the unique (name, id) of the compressor. Used to distinguish when de/serializing.
|
||||
fn codec_id() -> (&'static str, u8);
|
||||
}
|
||||
|
||||
/// `CodecId` is required by each Codec.
|
||||
///
|
||||
/// It needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
pub trait CodecId {
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
@@ -45,3 +52,15 @@ pub struct FastFieldStats {
|
||||
pub max_value: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
232
fastfield_codecs/src/linearinterpol.rs
Normal file
232
fastfield_codecs/src/linearinterpol.rs
Normal file
@@ -0,0 +1,232 @@
|
||||
use crate::CodecId;
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use std::io::{self, Read, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use common::FixedSize;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearinterpolFastFieldReader {
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub footer: LinearInterpolFooter,
|
||||
pub slope: f64,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct LinearInterpolFooter {
|
||||
pub relative_max_value: u64,
|
||||
pub offset: u64,
|
||||
pub first_val: u64,
|
||||
pub last_val: u64,
|
||||
pub num_vals: u64,
|
||||
}
|
||||
|
||||
impl BinarySerializable for LinearInterpolFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.relative_max_value.serialize(write)?;
|
||||
self.offset.serialize(write)?;
|
||||
self.first_val.serialize(write)?;
|
||||
self.last_val.serialize(write)?;
|
||||
self.num_vals.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<LinearInterpolFooter> {
|
||||
Ok(LinearInterpolFooter {
|
||||
relative_max_value: u64::deserialize(reader)?,
|
||||
offset: u64::deserialize(reader)?,
|
||||
first_val: u64::deserialize(reader)?,
|
||||
last_val: u64::deserialize(reader)?,
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl FixedSize for LinearInterpolFooter {
|
||||
const SIZE_IN_BYTES: usize = 40;
|
||||
}
|
||||
|
||||
impl LinearinterpolFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
|
||||
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
|
||||
//let rel_max_value = u64::deserialize(&mut footer)?;
|
||||
//let offset = u64::deserialize(&mut footer)?;
|
||||
//let first_value = u64::deserialize(&mut footer)?;
|
||||
//let last_value = u64::deserialize(&mut footer)?;
|
||||
//let num_vals = u64::deserialize(&mut footer)?;
|
||||
let slope = (footer.last_val as f64 - footer.first_val as f64)
|
||||
/ (footer.num_vals as u64 - 1) as f64;
|
||||
|
||||
let num_bits = compute_num_bits(footer.relative_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearinterpolFastFieldReader {
|
||||
footer,
|
||||
bit_unpacker,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = self.footer.first_val + (doc as f64 * self.slope) as u64;
|
||||
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
|
||||
}
|
||||
}
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
// TODO not suitable if max is larger than i64::MAX / 2
|
||||
|
||||
impl LinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
data_iter2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
// todo walk over data just once and calulate offset on the fly
|
||||
// offset to ensure all values are positive
|
||||
let offset = data_iter1
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
val as i64 - calculated_value as i64
|
||||
})
|
||||
.min()
|
||||
.unwrap()
|
||||
.abs() as u64;
|
||||
|
||||
//calc new max
|
||||
let rel_max = data_iter2
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
(val + offset) - calculated_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
let amplitude = rel_max;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = LinearInterpolFooter {
|
||||
relative_max_value: amplitude,
|
||||
offset,
|
||||
first_val,
|
||||
last_val,
|
||||
num_vals: stats.num_vals,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
fn get_slope(first_val: u64, last_val: u64, num_vals: u64) -> f64 {
|
||||
(last_val as f64 - first_val as f64) / (num_vals as u64 - 1) as f64
|
||||
}
|
||||
|
||||
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::NAME;
|
||||
(ratio, name)
|
||||
}
|
||||
}
|
||||
|
||||
impl CodecId for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn create_and_validate(data: &[u64]) -> (u64, u64) {
|
||||
let mut out = vec![];
|
||||
LinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, val) in data.iter().enumerate() {
|
||||
assert_eq!(reader.get_u64(doc as u64, &out), *val);
|
||||
}
|
||||
(reader.footer.relative_max_value, reader.footer.offset)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
let (rel_max_value, offset) = create_and_validate(&data);
|
||||
|
||||
assert_eq!(offset, 0);
|
||||
assert_eq!(rel_max_value, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_with_offset() {
|
||||
//let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
|
||||
let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_no_structure() {
|
||||
let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..500 {
|
||||
let mut data = (0..1 + rand::random::<u8>() as usize)
|
||||
.map(|_| rand::random::<i64>() as u64 / 2 as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,123 +0,0 @@
|
||||
mod serialize;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use std::io;
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitUnpacker;
|
||||
|
||||
use crate::FastFieldDataAccess;
|
||||
/// Depending on the field type, a different
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct LinearinterpolFastFieldReader<'data> {
|
||||
bytes: &'data [u8],
|
||||
bit_unpacker: BitUnpacker,
|
||||
pub first_value: u64,
|
||||
pub rel_max_value: u64,
|
||||
pub offset: u64,
|
||||
pub slope: f64,
|
||||
}
|
||||
|
||||
impl<'data> LinearinterpolFastFieldReader<'data> {
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
|
||||
let rel_max_value = u64::deserialize(&mut bytes)?;
|
||||
let offset = u64::deserialize(&mut bytes)?;
|
||||
let first_value = u64::deserialize(&mut bytes)?;
|
||||
let last_value = u64::deserialize(&mut bytes)?;
|
||||
let num_vals = u64::deserialize(&mut bytes)?;
|
||||
let slope = (last_value as f64 - first_value as f64) / (num_vals as u64 - 1) as f64;
|
||||
|
||||
let num_bits = compute_num_bits(rel_max_value);
|
||||
let bit_unpacker = BitUnpacker::new(num_bits);
|
||||
Ok(LinearinterpolFastFieldReader {
|
||||
bytes,
|
||||
first_value,
|
||||
rel_max_value,
|
||||
offset,
|
||||
bit_unpacker,
|
||||
slope,
|
||||
})
|
||||
}
|
||||
pub fn get_u64(&self, doc: u64) -> u64 {
|
||||
let calculated_value = self.first_value + (doc as f64 * self.slope) as u64;
|
||||
calculated_value + self.bit_unpacker.get(doc, &self.bytes) - self.offset
|
||||
|
||||
//self.offset + self.min_value + self.bit_unpacker.get(doc, &self.bytes)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> FastFieldDataAccess for &'a [u64] {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldDataAccess for Vec<u64> {
|
||||
fn get(&self, doc: u32) -> u64 {
|
||||
self[doc as usize]
|
||||
}
|
||||
}
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn create_and_validate(data: &[u64]) -> (u64, u64) {
|
||||
let mut out = vec![];
|
||||
serialize::LinearInterpolFastFieldSerializer::create(
|
||||
&mut out,
|
||||
&data,
|
||||
crate::tests::stats_from_vec(&data),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
|
||||
for (doc, val) in data.iter().enumerate() {
|
||||
assert_eq!(reader.get_u64(doc as u64), *val);
|
||||
}
|
||||
(reader.rel_max_value, reader.offset)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
|
||||
let (rel_max_value, offset) = create_and_validate(&data);
|
||||
|
||||
assert_eq!(offset, 0);
|
||||
assert_eq!(rel_max_value, 0);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_with_offset() {
|
||||
//let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
|
||||
let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_test_no_structure() {
|
||||
let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for i in 0..50000 {
|
||||
let mut data = (0..1 + rand::random::<u8>() as usize)
|
||||
.map(|num| rand::random::<i64>() as u64 / 2 as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,87 +0,0 @@
|
||||
use crate::FastFieldDataAccess;
|
||||
use crate::FastFieldSerializerEstimate;
|
||||
use crate::FastFieldStats;
|
||||
use common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
// TODO not suitable if max is larger than i64::MAX / 2
|
||||
|
||||
impl LinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub fn create(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
data_iter2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
//let first_val = stats.min_value;
|
||||
//let last_val = stats.max_value;
|
||||
let first_val = fastfield_accessor.get(0);
|
||||
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
|
||||
let slope = (last_val as f64 - first_val as f64) / (stats.num_vals as u64 - 1) as f64;
|
||||
// offset to ensure all values are positive
|
||||
let offset = data_iter1
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
val as i64 - calculated_value as i64
|
||||
})
|
||||
.min()
|
||||
.unwrap()
|
||||
.abs() as u64;
|
||||
|
||||
//calc new max
|
||||
let rel_max = data_iter2
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
(val + offset) - calculated_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
let amplitude = rel_max;
|
||||
amplitude.serialize(write)?;
|
||||
offset.serialize(write)?;
|
||||
first_val.serialize(write)?;
|
||||
last_val.serialize(write)?;
|
||||
stats.num_vals.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for (pos, val) in data_iter.enumerate() {
|
||||
let calculated_value = first_val + (pos as f64 * slope) as u64;
|
||||
let diff = (val + offset) - calculated_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("LinearInterpol", 2)
|
||||
}
|
||||
}
|
||||
@@ -56,12 +56,6 @@ impl OwnedBytes {
|
||||
self.data
|
||||
}
|
||||
|
||||
/// Returns the underlying slice of data.
|
||||
/// `Deref` and `AsRef` are also available.
|
||||
#[inline]
|
||||
pub fn into_slice(self) -> &'static [u8] {
|
||||
self.data
|
||||
}
|
||||
/// Returns the len of the slice.
|
||||
#[inline]
|
||||
pub fn len(&self) -> usize {
|
||||
|
||||
@@ -107,7 +107,8 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
/// fast field is required.
|
||||
#[derive(Clone)]
|
||||
pub struct BitpackedFastFieldReader<Item: FastValue> {
|
||||
reader: BitpackedReader<'static>,
|
||||
reader: BitpackedReader,
|
||||
bytes: OwnedBytes,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
@@ -118,16 +119,17 @@ impl<Item: FastValue> BitpackedFastFieldReader<Item> {
|
||||
let _id = u8::deserialize(&mut bytes)?;
|
||||
Self::open_from_bytes(bytes)
|
||||
}
|
||||
/// Opens a fast field given a file.
|
||||
/// Opens a fast field given the bytes.
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = BitpackedReader::open_from_bytes(bytes.into_slice())?;
|
||||
let reader = BitpackedReader::open_from_bytes(bytes.as_slice())?;
|
||||
Ok(BitpackedFastFieldReader {
|
||||
reader,
|
||||
bytes,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(doc))
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
|
||||
@@ -1,97 +0,0 @@
|
||||
use super::FastFieldDataAccess;
|
||||
use super::FastFieldSerializer;
|
||||
use super::FastFieldSerializerEstimate;
|
||||
use super::FastFieldStats;
|
||||
use crate::common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
|
||||
bit_packer: BitPacker,
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub(crate) fn open(
|
||||
write: &'a mut W,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
|
||||
assert!(min_value <= max_value);
|
||||
min_value.serialize(write)?;
|
||||
let amplitude = max_value - min_value;
|
||||
amplitude.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let bit_packer = BitPacker::new();
|
||||
Ok(BitpackedFastFieldSerializer {
|
||||
bit_packer,
|
||||
write,
|
||||
min_value,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
/// Creates a new fast field serializer.
|
||||
///
|
||||
/// The serializer in fact encode the values by bitpacking
|
||||
/// `(val - min_value)`.
|
||||
///
|
||||
/// It requires a `min_value` and a `max_value` to compute
|
||||
/// compute the minimum number of bits required to encode
|
||||
/// values.
|
||||
pub(crate) fn create(
|
||||
write: &'a mut W,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
|
||||
|
||||
for val in data_iter {
|
||||
serializer.add_val(val)?;
|
||||
}
|
||||
serializer.close_field()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializer for BitpackedFastFieldSerializer<'a, W> {
|
||||
/// Pushes a new value to the currently open u64 fast field.
|
||||
fn add_val(&mut self, val: u64) -> io::Result<()> {
|
||||
let val_to_write: u64 = val - self.min_value;
|
||||
self.bit_packer
|
||||
.write(val_to_write, self.num_bits, &mut self.write)?;
|
||||
Ok(())
|
||||
}
|
||||
fn close_field(mut self) -> io::Result<()> {
|
||||
self.bit_packer.close(&mut self.write)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("Bitpacked", 1)
|
||||
}
|
||||
}
|
||||
@@ -1,78 +0,0 @@
|
||||
use super::FastFieldDataAccess;
|
||||
use super::FastFieldSerializerEstimate;
|
||||
use super::FastFieldStats;
|
||||
use crate::common::BinarySerializable;
|
||||
use std::io::{self, Write};
|
||||
use tantivy_bitpacker::compute_num_bits;
|
||||
use tantivy_bitpacker::BitPacker;
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference.
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
impl LinearInterpolFastFieldSerializer {
|
||||
/// Creates a new fast field serializer.
|
||||
pub(crate) fn create(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
data_iter2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
assert!(stats.min_value <= stats.max_value);
|
||||
|
||||
let step = (stats.max_value - stats.min_value) as f64 / (stats.num_vals as u64 - 1) as f64;
|
||||
// offset to ensure all values are positive
|
||||
let offset = data_iter1
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = stats.min_value + (pos as f64 * step) as u64;
|
||||
val as i64 - calculated_value as i64
|
||||
})
|
||||
.min()
|
||||
.unwrap()
|
||||
.abs() as u64;
|
||||
|
||||
//calc new max
|
||||
let rel_max = data_iter2
|
||||
.enumerate()
|
||||
.map(|(pos, val)| {
|
||||
let calculated_value = stats.min_value + (pos as f64 * step) as u64;
|
||||
(val + offset) - calculated_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
stats.min_value.serialize(write)?;
|
||||
let amplitude = rel_max;
|
||||
amplitude.serialize(write)?;
|
||||
offset.serialize(write)?;
|
||||
stats.min_value.serialize(write)?;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let mut bit_packer = BitPacker::new();
|
||||
for val in data_iter {
|
||||
bit_packer.write(val, num_bits, write)?;
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
|
||||
fn estimate(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> (f32, &'static str) {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
|
||||
let name = Self::codec_id().0;
|
||||
(ratio, name)
|
||||
}
|
||||
fn codec_id() -> (&'static str, u8) {
|
||||
("LinearInterpol", 2)
|
||||
}
|
||||
}
|
||||
@@ -1,12 +1,9 @@
|
||||
mod bitpacked;
|
||||
mod linearinterpol;
|
||||
|
||||
use crate::common::BinarySerializable;
|
||||
use crate::common::CompositeWrite;
|
||||
use crate::common::CountingWriter;
|
||||
use crate::directory::WritePtr;
|
||||
use crate::schema::Field;
|
||||
use crate::DocId;
|
||||
use fastfield_codecs::CodecId;
|
||||
//pub use bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
|
||||
pub use fastfield_codecs::FastFieldDataAccess;
|
||||
@@ -57,13 +54,14 @@ impl CompositeFastFieldSerializer {
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, 0);
|
||||
|
||||
let (_ratio, (name, id)) = (
|
||||
let (_ratio, name, id) = (
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&fastfield_accessor, stats.clone()),
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::codec_id(),
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::NAME,
|
||||
BitpackedFastFieldSerializer::<Vec<u8>>::ID,
|
||||
);
|
||||
|
||||
id.serialize(field_write)?;
|
||||
if name == BitpackedFastFieldSerializer::<Vec<u8>>::codec_id().0 {
|
||||
if name == BitpackedFastFieldSerializer::<Vec<u8>>::NAME {
|
||||
BitpackedFastFieldSerializer::create(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
@@ -97,7 +95,7 @@ impl CompositeFastFieldSerializer {
|
||||
) -> io::Result<BitpackedFastFieldSerializer<'_, CountingWriter<WritePtr>>> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
// Prepend codec id to field data for compatibility with DynamicFastFieldReader.
|
||||
let (_name, id) = BitpackedFastFieldSerializer::<Vec<u8>>::codec_id();
|
||||
let id = BitpackedFastFieldSerializer::<Vec<u8>>::ID;
|
||||
id.serialize(field_write)?;
|
||||
BitpackedFastFieldSerializer::open(field_write, min_value, max_value)
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user