move common to common crate, create fastfield_codecs crate

move common to common crate
create fastfield_codecs crate
add bitpacker to fast field codecs
add linear interpolation to fast field codecs
add tests
This commit is contained in:
Pascal Seitz
2021-06-02 21:22:46 +02:00
parent c02c78ea73
commit 3298d6cb71
17 changed files with 460 additions and 62 deletions

View File

@@ -35,6 +35,8 @@ crossbeam = "0.8"
futures = { version = "0.3.15", features = ["thread-pool"] }
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
common = { version="0.1", path="./common" }
fastfield_codecs = { version="0.1", path="./fastfield_codecs" }
stable_deref_trait = "1.2"
rust-stemmers = "1.2"
downcast-rs = "1.2"
@@ -88,7 +90,7 @@ unstable = [] # useful for benches.
wasm-bindgen = ["uuid/wasm-bindgen"]
[workspace]
members = ["query-grammar", "bitpacker"]
members = ["query-grammar", "bitpacker", "common", "fastfield_codecs"]
[badges]
travis-ci = { repository = "tantivy-search/tantivy" }

10
common/Cargo.toml Normal file
View File

@@ -0,0 +1,10 @@
[package]
name = "common"
version = "0.1.0"
authors = ["Pascal Seitz <pascal.seitz@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
byteorder = "1.4.3"

7
common/src/lib.rs Normal file
View File

@@ -0,0 +1,7 @@
pub use byteorder::LittleEndian as Endianness;
mod serialize;
mod vint;
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};

View File

@@ -1,5 +1,5 @@
use crate::common::Endianness;
use crate::common::VInt;
use crate::Endianness;
use crate::VInt;
use byteorder::{ReadBytesExt, WriteBytesExt};
use std::fmt;
use std::io;

View File

@@ -0,0 +1,14 @@
[package]
name = "fastfield_codecs"
version = "0.1.0"
authors = ["Pascal Seitz <pascal.seitz@gmail.com>"]
edition = "2018"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { path = "../common/" }
tantivy-bitpacker = { path = "../bitpacker/" }
[dev-dependencies]
rand = "0.8.3"

View File

@@ -0,0 +1,5 @@
mod reader;
mod serialize;
pub use reader::BitpackedFastFieldReader;
pub use serialize::BitpackedFastFieldSerializer;

View File

@@ -0,0 +1,33 @@
use common::BinarySerializable;
use std::io;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader<'data> {
bytes: &'data [u8],
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
}
impl<'data> BitpackedFastFieldReader<'data> {
/// Opens a fast field given a file.
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
let min_value = u64::deserialize(&mut bytes)?;
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedFastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
})
}
pub fn get_u64(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes)
}
}

View File

@@ -0,0 +1,93 @@
use crate::FastFieldDataAccess;
use crate::FastFieldSerializerEstimate;
use crate::FastFieldStats;
use common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_bits: u8,
}
impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn open(
write: &'a mut W,
min_value: u64,
max_value: u64,
) -> io::Result<BitpackedFastFieldSerializer<'a, W>> {
assert!(min_value <= max_value);
min_value.serialize(write)?;
let amplitude = max_value - min_value;
amplitude.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let bit_packer = BitPacker::new();
Ok(BitpackedFastFieldSerializer {
bit_packer,
write,
min_value,
num_bits,
})
}
/// Creates a new fast field serializer.
///
/// The serializer in fact encode the values by bitpacking
/// `(val - min_value)`.
///
/// It requires a `min_value` and a `max_value` to compute
/// compute the minimum number of bits required to encode
/// values.
pub fn create(
write: &'a mut W,
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
) -> io::Result<()> {
let mut serializer = Self::open(write, stats.min_value, stats.max_value)?;
for val in data_iter {
serializer.add_val(val)?;
}
serializer.close_field()?;
Ok(())
}
/// Pushes a new value to the currently open u64 fast field.
pub fn add_val(&mut self, val: u64) -> io::Result<()> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)
}
}
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
fn estimate(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str) {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::codec_id().0;
(ratio, name)
}
fn codec_id() -> (&'static str, u8) {
("Bitpacked", 1)
}
}

View File

@@ -0,0 +1,47 @@
pub mod bitpacked;
pub mod linearinterpol;
#[cfg(test)]
mod tests {
use super::*;
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);
let max_value = data.iter().cloned().max().unwrap_or(0);
FastFieldStats {
min_value,
max_value,
num_vals: data.len() as u64,
}
}
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess: Clone {
/// Return the value associated to the given document.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: u32) -> u64;
}
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
/// returns an estimate of the compression ratio.
fn estimate(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str);
/// the unique (name, id) of the compressor. Used to distinguish when de/serializing.
fn codec_id() -> (&'static str, u8);
}
#[derive(Debug, Clone)]
pub struct FastFieldStats {
pub min_value: u64,
pub max_value: u64,
pub num_vals: u64,
}

View File

@@ -0,0 +1,123 @@
mod serialize;
use common::BinarySerializable;
use std::io;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
use crate::FastFieldDataAccess;
/// Depending on the field type, a different
/// fast field is required.
#[derive(Clone)]
pub struct LinearinterpolFastFieldReader<'data> {
bytes: &'data [u8],
bit_unpacker: BitUnpacker,
pub first_value: u64,
pub rel_max_value: u64,
pub offset: u64,
pub slope: f64,
}
impl<'data> LinearinterpolFastFieldReader<'data> {
/// Opens a fast field given a file.
pub fn open_from_bytes(mut bytes: &'data [u8]) -> io::Result<Self> {
let rel_max_value = u64::deserialize(&mut bytes)?;
let offset = u64::deserialize(&mut bytes)?;
let first_value = u64::deserialize(&mut bytes)?;
let last_value = u64::deserialize(&mut bytes)?;
let num_vals = u64::deserialize(&mut bytes)?;
let slope = (last_value as f64 - first_value as f64) / (num_vals as u64 - 1) as f64;
let num_bits = compute_num_bits(rel_max_value);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(LinearinterpolFastFieldReader {
bytes,
first_value,
rel_max_value,
offset,
bit_unpacker,
slope,
})
}
pub fn get_u64(&self, doc: u64) -> u64 {
let calculated_value = self.first_value + (doc as f64 * self.slope) as u64;
calculated_value + self.bit_unpacker.get(doc, &self.bytes) - self.offset
//self.offset + self.min_value + self.bit_unpacker.get(doc, &self.bytes)
}
}
impl<'a> FastFieldDataAccess for &'a [u64] {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
impl FastFieldDataAccess for Vec<u64> {
fn get(&self, doc: u32) -> u64 {
self[doc as usize]
}
}
#[cfg(test)]
mod tests {
use super::*;
fn create_and_validate(data: &[u64]) -> (u64, u64) {
let mut out = vec![];
serialize::LinearInterpolFastFieldSerializer::create(
&mut out,
&data,
crate::tests::stats_from_vec(&data),
data.iter().cloned(),
data.iter().cloned(),
data.iter().cloned(),
)
.unwrap();
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
for (doc, val) in data.iter().enumerate() {
assert_eq!(reader.get_u64(doc as u64), *val);
}
(reader.rel_max_value, reader.offset)
}
#[test]
fn linear_interpol_fast_field_test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
let (rel_max_value, offset) = create_and_validate(&data);
assert_eq!(offset, 0);
assert_eq!(rel_max_value, 0);
}
#[test]
fn linear_interpol_fast_field_test_with_offset() {
//let data = vec![5, 50, 95, 96, 97, 98, 99, 100];
let mut data = vec![5, 6, 7, 8, 9, 10, 99, 100];
create_and_validate(&data);
data.reverse();
create_and_validate(&data);
}
#[test]
fn linear_interpol_fast_field_test_no_structure() {
let mut data = vec![5, 50, 3, 13, 1, 1000, 35];
create_and_validate(&data);
data.reverse();
create_and_validate(&data);
}
#[test]
fn linear_interpol_fast_field_rand() {
for i in 0..50000 {
let mut data = (0..1 + rand::random::<u8>() as usize)
.map(|num| rand::random::<i64>() as u64 / 2 as u64)
.collect::<Vec<_>>();
create_and_validate(&data);
data.reverse();
create_and_validate(&data);
}
}
}

View File

@@ -0,0 +1,87 @@
use crate::FastFieldDataAccess;
use crate::FastFieldSerializerEstimate;
use crate::FastFieldStats;
use common::BinarySerializable;
use std::io::{self, Write};
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitPacker;
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference.
pub struct LinearInterpolFastFieldSerializer {}
// TODO not suitable if max is larger than i64::MAX / 2
impl LinearInterpolFastFieldSerializer {
/// Creates a new fast field serializer.
pub fn create(
write: &mut impl Write,
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
data_iter2: impl Iterator<Item = u64>,
) -> io::Result<()> {
assert!(stats.min_value <= stats.max_value);
//let first_val = stats.min_value;
//let last_val = stats.max_value;
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let slope = (last_val as f64 - first_val as f64) / (stats.num_vals as u64 - 1) as f64;
// offset to ensure all values are positive
let offset = data_iter1
.enumerate()
.map(|(pos, val)| {
let calculated_value = first_val + (pos as f64 * slope) as u64;
val as i64 - calculated_value as i64
})
.min()
.unwrap()
.abs() as u64;
//calc new max
let rel_max = data_iter2
.enumerate()
.map(|(pos, val)| {
let calculated_value = first_val + (pos as f64 * slope) as u64;
(val + offset) - calculated_value
})
.max()
.unwrap();
let amplitude = rel_max;
amplitude.serialize(write)?;
offset.serialize(write)?;
first_val.serialize(write)?;
last_val.serialize(write)?;
stats.num_vals.serialize(write)?;
let num_bits = compute_num_bits(amplitude);
let mut bit_packer = BitPacker::new();
for (pos, val) in data_iter.enumerate() {
let calculated_value = first_val + (pos as f64 * slope) as u64;
let diff = (val + offset) - calculated_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.close(write)?;
Ok(())
}
}
impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
fn estimate(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str) {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::codec_id().0;
(ratio, name)
}
fn codec_id() -> (&'static str, u8) {
("LinearInterpol", 2)
}
}

View File

@@ -1,18 +1,19 @@
mod bitset;
mod composite_file;
mod counting_writer;
mod serialize;
mod vint;
//mod serialize;
//mod vint;
pub use self::bitset::BitSet;
pub(crate) use self::bitset::TinySet;
pub(crate) use self::composite_file::{CompositeFile, CompositeWrite};
pub use self::counting_writer::CountingWriter;
pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use self::vint::{
//pub use self::serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use byteorder::LittleEndian as Endianness;
pub use common::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt,
};
pub use byteorder::LittleEndian as Endianness;
pub use common::{BinarySerializable, DeserializeFrom, FixedSize};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
@@ -103,8 +104,9 @@ pub fn u64_to_f64(val: u64) -> f64 {
#[cfg(test)]
pub(crate) mod test {
pub use super::serialize::test::fixed_size_test;
//pub use super::serialize::test::fixed_size_test;
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
use common::{BinarySerializable, FixedSize};
use proptest::prelude::*;
use std::f64;
use tantivy_bitpacker::compute_num_bits;
@@ -118,6 +120,12 @@ pub(crate) mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {

View File

@@ -56,6 +56,12 @@ impl OwnedBytes {
self.data
}
/// Returns the underlying slice of data.
/// `Deref` and `AsRef` are also available.
#[inline]
pub fn into_slice(self) -> &'static [u8] {
self.data
}
/// Returns the len of the slice.
#[inline]
pub fn len(&self) -> usize {

View File

@@ -215,6 +215,7 @@ mod tests {
use super::*;
use crate::common::CompositeFile;
use crate::common::HasLen;
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::fastfield::BitpackedFastFieldReader;
use crate::merge_policy::NoMergePolicy;
@@ -223,7 +224,6 @@ mod tests {
use crate::schema::FAST;
use crate::schema::{Document, IntOptions};
use crate::{Index, SegmentId, SegmentReader};
use common::HasLen;
use once_cell::sync::Lazy;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;

View File

@@ -8,6 +8,7 @@ use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
@@ -67,9 +68,9 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
impl<Item: FastValue> DynamicFastFieldReader<Item> {
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let mut bytes = file.read_bytes()?;
let bytes = file.read_bytes()?;
let (mut id_bytes, data_bytes) = bytes.split(1);
let id = u8::deserialize(&mut id_bytes)?;
let _id = u8::deserialize(&mut id_bytes)?;
Ok(DynamicFastFieldReader::Bitpacked(
BitpackedFastFieldReader::open_from_bytes(data_bytes)?,
@@ -106,10 +107,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
/// fast field is required.
#[derive(Clone)]
pub struct BitpackedFastFieldReader<Item: FastValue> {
bytes: OwnedBytes,
bit_unpacker: BitUnpacker,
min_value_u64: u64,
max_value_u64: u64,
reader: BitpackedReader<'static>,
_phantom: PhantomData<Item>,
}
@@ -121,22 +119,15 @@ impl<Item: FastValue> BitpackedFastFieldReader<Item> {
Self::open_from_bytes(bytes)
}
/// Opens a fast field given a file.
pub fn open_from_bytes(mut bytes: OwnedBytes) -> crate::Result<Self> {
let min_value = u64::deserialize(&mut bytes)?;
let amplitude = u64::deserialize(&mut bytes)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = BitpackedReader::open_from_bytes(bytes.into_slice())?;
Ok(BitpackedFastFieldReader {
bytes,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
reader,
_phantom: PhantomData,
})
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.min_value_u64 + self.bit_unpacker.get(doc, &self.bytes))
Item::from_u64(self.reader.get_u64(doc))
}
/// Internally `multivalued` also use SingleValue Fast fields.
@@ -194,7 +185,7 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item {
Item::from_u64(self.min_value_u64)
Item::from_u64(self.reader.min_value_u64)
}
/// Returns the maximum value for this fast field.
@@ -203,7 +194,7 @@ impl<Item: FastValue> FastFieldReader<Item> for BitpackedFastFieldReader<Item> {
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.max_value_u64)
Item::from_u64(self.reader.max_value_u64)
}
}

View File

@@ -7,22 +7,13 @@ use crate::common::CountingWriter;
use crate::directory::WritePtr;
use crate::schema::Field;
use crate::DocId;
pub use bitpacked::BitpackedFastFieldSerializer;
//pub use bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
pub use fastfield_codecs::FastFieldSerializerEstimate;
pub use fastfield_codecs::FastFieldStats;
use std::io::{self, Write};
/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldDataAccess: Clone {
//type IteratorType: Iterator<Item = u64>;
/// Return the value associated to the given document.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance reasons.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
fn get(&self, doc: DocId) -> u64;
}
/// `CompositeFastFieldSerializer` is in charge of serializing
/// fastfields on disk.
///
@@ -129,13 +120,6 @@ impl CompositeFastFieldSerializer {
}
}
#[derive(Debug, Clone)]
pub struct FastFieldStats {
pub min_value: u64,
pub max_value: u64,
pub num_vals: u64,
}
/// The FastFieldSerializer trait is the common interface
/// implemented by every fastfield serializer variant.
///
@@ -148,18 +132,6 @@ pub trait FastFieldSerializer {
fn close_field(self) -> io::Result<()>;
}
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldSerializerEstimate {
/// returns an estimate of the compression ratio.
fn estimate(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str);
/// the unique (name, id) of the compressor. Used to distinguish when de/serializing.
fn codec_id() -> (&'static str, u8);
}
pub struct FastBytesFieldSerializer<'a, W: Write> {
write: &'a mut W,
}