add interface to create and read codecs

add CodecReader as common interface in fastfield codec crate
add LinearInterpolation to DynamicFastFieldReader
calc estimation and choose best codec
cleanup
This commit is contained in:
Pascal Seitz
2021-06-04 10:30:11 +02:00
parent 483fdb79cc
commit be2dd41e69
9 changed files with 229 additions and 89 deletions

View File

@@ -1,4 +1,5 @@
use crate::CodecId;
use crate::CodecReader;
use crate::FastFieldDataAccess;
use crate::FastFieldSerializerEstimate;
use crate::FastFieldStats;
@@ -18,9 +19,9 @@ pub struct BitpackedFastFieldReader {
pub max_value_u64: u64,
}
impl<'data> BitpackedFastFieldReader {
impl<'data> CodecReader for BitpackedFastFieldReader {
/// Opens a fast field given a file.
pub fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
@@ -33,9 +34,15 @@ impl<'data> BitpackedFastFieldReader {
bit_unpacker,
})
}
pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &data)
}
fn min_value(&self) -> u64 {
self.min_value_u64
}
fn max_value(&self) -> u64 {
self.max_value_u64
}
}
pub struct BitpackedFastFieldSerializer<'a, W: 'a + Write> {
bit_packer: BitPacker,
@@ -111,16 +118,12 @@ impl<'a, W: Write> BitpackedFastFieldSerializer<'a, W> {
}
impl<'a, W: 'a + Write> FastFieldSerializerEstimate for BitpackedFastFieldSerializer<'a, W> {
fn estimate(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str) {
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::NAME;
(ratio, name)
ratio
}
}
impl<'a, W: 'a + Write> CodecId for BitpackedFastFieldSerializer<'_, W> {

View File

@@ -5,6 +5,16 @@ extern crate more_asserts;
pub mod bitpacked;
pub mod linearinterpol;
pub trait CodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess: Clone {
/// Return the value associated to the given document.
@@ -25,10 +35,7 @@ pub trait FastFieldSerializerEstimate {
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str);
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
}
/// `CodecId` is required by each Codec.
@@ -98,11 +105,11 @@ mod tests {
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation.0, 0.1);
assert_le!(linear_interpol_estimation, 0.1);
let bitpacked_estimation =
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation.0, bitpacked_estimation.0);
assert_le!(linear_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
@@ -110,11 +117,11 @@ mod tests {
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation.0, 0.3);
assert_le!(linear_interpol_estimation, 0.3);
let bitpacked_estimation =
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation.0, linear_interpol_estimation.0);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
@@ -125,11 +132,11 @@ mod tests {
// but the estimator adds some threshold, which leads to estimated worse behavior
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation.0, 0.35);
assert_le!(linear_interpol_estimation, 0.35);
let bitpacked_estimation =
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation.0, 0.32);
assert_le!(bitpacked_estimation.0, linear_interpol_estimation.0);
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
}
}

View File

@@ -1,4 +1,5 @@
use crate::CodecId;
use crate::CodecReader;
use crate::FastFieldDataAccess;
use crate::FastFieldSerializerEstimate;
use crate::FastFieldStats;
@@ -27,6 +28,8 @@ pub struct LinearInterpolFooter {
pub first_val: u64,
pub last_val: u64,
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
}
impl BinarySerializable for LinearInterpolFooter {
@@ -36,6 +39,8 @@ impl BinarySerializable for LinearInterpolFooter {
self.first_val.serialize(write)?;
self.last_val.serialize(write)?;
self.num_vals.serialize(write)?;
self.min_value.serialize(write)?;
self.max_value.serialize(write)?;
Ok(())
}
@@ -46,17 +51,19 @@ impl BinarySerializable for LinearInterpolFooter {
first_val: u64::deserialize(reader)?,
last_val: u64::deserialize(reader)?,
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
})
}
}
impl FixedSize for LinearInterpolFooter {
const SIZE_IN_BYTES: usize = 40;
const SIZE_IN_BYTES: usize = 56;
}
impl LinearinterpolFastFieldReader {
impl CodecReader for LinearinterpolFastFieldReader {
/// Opens a fast field given a file.
pub fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let (_data, mut footer) = bytes.split_at(bytes.len() - LinearInterpolFooter::SIZE_IN_BYTES);
let footer = LinearInterpolFooter::deserialize(&mut footer)?;
let slope = (footer.last_val as f64 - footer.first_val as f64)
@@ -70,10 +77,17 @@ impl LinearinterpolFastFieldReader {
slope,
})
}
pub fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &data)) - self.footer.offset
}
fn min_value(&self) -> u64 {
self.footer.min_value
}
fn max_value(&self) -> u64 {
self.footer.max_value
}
}
/// Fastfield serializer, which tries to guess values by linear interpolation
@@ -131,6 +145,8 @@ impl LinearInterpolFastFieldSerializer {
first_val,
last_val,
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
};
footer.serialize(write)?;
Ok(())
@@ -147,10 +163,7 @@ impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
/// estimation for linear interpolation is hard because, you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> (f32, &'static str) {
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
let first_val = fastfield_accessor.get(0);
let last_val = fastfield_accessor.get(stats.num_vals as u32 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
@@ -187,8 +200,7 @@ impl FastFieldSerializerEstimate for LinearInterpolFastFieldSerializer {
+ LinearInterpolFooter::SIZE_IN_BYTES as u64;
let num_bits_uncompressed = 64 * stats.num_vals;
let ratio = num_bits as f32 / num_bits_uncompressed as f32;
let name = Self::NAME;
(ratio, name)
ratio
}
}
@@ -223,7 +235,6 @@ mod tests {
let reader = LinearinterpolFastFieldReader::open_from_bytes(&out).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
//assert_eq!(reader.get_u64(doc as u64, &out), *val);
let val = reader.get_u64(doc as u64, &out);
if val != *orig_val {
panic!(

View File

@@ -408,7 +408,8 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(&path).unwrap();
assert_eq!(file.len(), 17710 as usize);
//assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175 as usize); // linear interpol size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();

View File

@@ -1,5 +1,4 @@
use crate::fastfield::serializer::BitpackedFastFieldSerializer;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::CompositeFastFieldSerializer;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};

View File

@@ -9,11 +9,14 @@ use crate::schema::Schema;
use crate::schema::FAST;
use crate::DocId;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader as BitpackedReader;
use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearinterpolFastFieldReader;
use fastfield_codecs::CodecId;
use fastfield_codecs::CodecReader;
use std::collections::HashMap;
use std::marker::PhantomData;
use std::path::Path;
use tantivy_bitpacker::compute_num_bits;
use tantivy_bitpacker::BitUnpacker;
/// FastFieldReader is the trait to access fast field data.
pub trait FastFieldReader<Item: FastValue>: Clone {
@@ -62,7 +65,8 @@ pub trait FastFieldReader<Item: FastValue>: Clone {
///
pub enum DynamicFastFieldReader<Item: FastValue> {
/// Bitpacked compressed fastfield data.
Bitpacked(BitpackedFastFieldReader<Item>),
Bitpacked(FastFieldReaderCodecWrapper<Item, BitpackedReader>),
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearinterpolFastFieldReader>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -70,11 +74,31 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
let bytes = file.read_bytes()?;
let (mut id_bytes, data_bytes) = bytes.split(1);
let _id = u8::deserialize(&mut id_bytes)?;
let id = u8::deserialize(&mut id_bytes)?;
Ok(DynamicFastFieldReader::Bitpacked(
BitpackedFastFieldReader::open_from_bytes(data_bytes)?,
))
let reader = match id {
BitpackedFastFieldSerializer::<Vec<u8>>::ID => {
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
Item,
BitpackedReader,
>::open_from_bytes(data_bytes)?)
}
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
Item,
LinearinterpolFastFieldReader,
>::open_from_bytes(
data_bytes
)?)
}
_ => {
panic!(
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
id
)
}
};
Ok(reader)
}
}
@@ -82,25 +106,124 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
fn get(&self, doc: DocId) -> Item {
match self {
Self::Bitpacked(reader) => reader.get(doc),
Self::LinearInterpol(reader) => reader.get(doc),
}
}
fn get_range(&self, start: DocId, output: &mut [Item]) {
match self {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.min_value(),
Self::LinearInterpol(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
match self {
Self::Bitpacked(reader) => reader.max_value(),
Self::LinearInterpol(reader) => reader.max_value(),
}
}
}
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to the read the data.
///
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
reader: CodecReader,
bytes: OwnedBytes,
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, C: CodecReader> FastFieldReaderCodecWrapper<Item, C> {
/// Opens a fast field given the bytes.
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
let reader = C::open_from_bytes(bytes.as_slice())?;
Ok(FastFieldReaderCodecWrapper {
reader,
bytes,
_phantom: PhantomData,
})
}
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
}
/// Internally `multivalued` also use SingleValue Fast fields.
/// It works as follows... A first column contains the list of start index
/// for each document, a second column contains the actual values.
///
/// The values associated to a given doc, are then
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
///
/// Which means single value fast field reader can be indexed internally with
/// something different from a `DocId`. For this use case, we want to use `u64`
/// values.
///
/// See `get_range` for an actual documentation about this method.
pub(crate) fn get_range_u64(&self, start: u64, output: &mut [Item]) {
for (i, out) in output.iter_mut().enumerate() {
*out = self.get_u64(start + (i as u64));
}
}
}
impl<Item: FastValue, C: CodecReader + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
/// Return the value associated to the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: DocId, output: &mut [Item]) {
self.get_range_u64(u64::from(start), output);
}
/// Returns the minimum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn min_value(&self) -> Item {
Item::from_u64(self.reader.min_value())
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take in account of possible
/// deleted document, and should be considered as an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
}
/// Trait for accessing a fastfield.
///
/// Depending on the field type, a different

View File

@@ -6,6 +6,7 @@ use crate::schema::Field;
use fastfield_codecs::CodecId;
//pub use bitpacked::BitpackedFastFieldSerializer;
pub use fastfield_codecs::bitpacked::BitpackedFastFieldSerializer;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
pub use fastfield_codecs::FastFieldDataAccess;
pub use fastfield_codecs::FastFieldSerializerEstimate;
pub use fastfield_codecs::FastFieldStats;
@@ -54,22 +55,52 @@ impl CompositeFastFieldSerializer {
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, 0);
let (_ratio, name, id) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(&fastfield_accessor, stats.clone()),
BitpackedFastFieldSerializer::<Vec<u8>>::NAME,
BitpackedFastFieldSerializer::<Vec<u8>>::ID,
);
let mut estimations = vec![];
{
let (ratio, name, id) = (
BitpackedFastFieldSerializer::<Vec<u8>>::estimate(
&fastfield_accessor,
stats.clone(),
),
BitpackedFastFieldSerializer::<Vec<u8>>::NAME,
BitpackedFastFieldSerializer::<Vec<u8>>::ID,
);
estimations.push((ratio, name, id));
}
{
let (ratio, name, id) = (
LinearInterpolFastFieldSerializer::estimate(&fastfield_accessor, stats.clone()),
LinearInterpolFastFieldSerializer::NAME,
LinearInterpolFastFieldSerializer::ID,
);
estimations.push((ratio, name, id));
}
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, name, id) = estimations[0];
//estimations.sort_by_key(|el| el.0);
id.serialize(field_write)?;
if name == BitpackedFastFieldSerializer::<Vec<u8>>::NAME {
BitpackedFastFieldSerializer::create(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
)?;
} else {
panic!("unknown fastfield serializer {}", name);
match name {
BitpackedFastFieldSerializer::<Vec<u8>>::NAME => {
BitpackedFastFieldSerializer::create(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
)?;
}
LinearInterpolFastFieldSerializer::NAME => {
LinearInterpolFastFieldSerializer::create(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
_ => {
panic!("unknown fastfield serializer {}", name)
}
};
Ok(())

View File

@@ -1,9 +1,7 @@
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::FastFieldDataAccess;
use super::FastFieldReader;
use crate::common;
use crate::fastfield::serializer::FastFieldSerializer;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;

View File

@@ -5,7 +5,6 @@ use crate::fastfield::DeleteBitSet;
use crate::fastfield::DynamicFastFieldReader;
use crate::fastfield::FastFieldDataAccess;
use crate::fastfield::FastFieldReader;
use crate::fastfield::FastFieldSerializer;
use crate::fastfield::FastFieldStats;
use crate::fastfield::MultiValuedFastFieldReader;
use crate::fieldnorm::FieldNormsSerializer;
@@ -346,32 +345,16 @@ impl IndexMerger {
})
.collect::<Vec<_>>();
if let Some(doc_id_mapping) = doc_id_mapping {
//struct SortedDocidAccessor {};
#[derive(Clone)]
struct SortedDocidFieldAccessProvider<'a> {
doc_id_mapping: &'a Vec<(DocId, SegmentReaderWithOrdinal<'a>)>,
fast_field_readers: &'a Vec<DynamicFastFieldReader<u64>>,
}
impl<'a> FastFieldDataAccess for SortedDocidFieldAccessProvider<'a> {
//type IteratorType = ;
//type IteratorType: std::iter::Map<std::slice::Iter<'a, (u32, SegmentReaderWithOrdinal<'_>)>, _>
//type IteratorType =
//std::iter::Map<std::slice::Iter<'a, (u32, SegmentReaderWithOrdinal<'a>)>, u64>;
fn get(&self, doc: DocId) -> u64 {
let (doc_id, reader_with_ordinal) = self.doc_id_mapping[doc as usize];
self.fast_field_readers[reader_with_ordinal.ordinal as usize].get(doc_id)
}
//fn iter(&self) -> Self::IteratorType {
//self.doc_id_mapping
//.iter()
//.map(|(doc_id, reader_with_ordinal)| {
//let fast_field_reader =
//&self.fast_field_readers[reader_with_ordinal.ordinal as usize];
//let val = self.field_reader.get(*doc_id);
//val
//})
//}
}
let stats = FastFieldStats {
min_value,
@@ -395,22 +378,6 @@ impl IndexMerger {
iter,
)?;
//let sorted_doc_ids = doc_id_mapping.iter().map(|(doc_id, reader_with_ordinal)| {
//(
//doc_id,
//&fast_field_readers[reader_with_ordinal.ordinal as usize],
//)
//});
//// add values in order of the new doc_ids
//let mut fast_single_field_serializer =
//fast_field_serializer.new_u64_fast_field(field, min_value, max_value)?;
//for (doc_id, field_reader) in sorted_doc_ids {
//let val = field_reader.get(*doc_id);
//fast_single_field_serializer.add_val(val)?;
//}
//fast_single_field_serializer.close_field()?;
Ok(())
} else {
let u64_readers = self.readers.iter()