merge FastFieldCodecReader with FastFieldDataAccess (#1485)

* num_vals to FastFieldCodecReader

* split open_from_bytes to own trait

* rename get_u64 to get_val

* merge traits
This commit is contained in:
PSeitz
2022-08-27 11:58:28 -07:00
committed by GitHub
parent 3a9727aa91
commit 0dd62169c8
10 changed files with 104 additions and 66 deletions

View File

@@ -25,7 +25,10 @@ mod tests {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn bench_get<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
fn bench_get<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
b: &mut Bencher,
data: &[u64],
) {
@@ -35,7 +38,7 @@ mod tests {
b.iter(|| {
let mut sum = 0u64;
for pos in value_iter() {
let val = reader.get_u64(pos as u64);
let val = reader.get_val(pos as u64);
debug_assert_eq!(data[pos as usize], val);
sum = sum.wrapping_add(val);
}

View File

@@ -5,7 +5,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different
@@ -16,27 +16,32 @@ pub struct BitpackedReader {
bit_unpacker: BitUnpacker,
pub min_value_u64: u64,
pub max_value_u64: u64,
pub num_vals: u64,
}
impl FastFieldCodecReader for BitpackedReader {
impl FastFieldCodecDeserializer for BitpackedReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - 16;
let footer_offset = bytes.len() - 24;
let (data, mut footer) = bytes.split(footer_offset);
let min_value = u64::deserialize(&mut footer)?;
let amplitude = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let max_value = min_value + amplitude;
let num_bits = compute_num_bits(amplitude);
let bit_unpacker = BitUnpacker::new(num_bits);
Ok(BitpackedReader {
data,
bit_unpacker,
min_value_u64: min_value,
max_value_u64: max_value,
bit_unpacker,
num_vals,
})
}
}
impl FastFieldDataAccess for BitpackedReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
}
#[inline]
@@ -47,11 +52,16 @@ impl FastFieldCodecReader for BitpackedReader {
fn max_value(&self) -> u64 {
self.max_value_u64
}
#[inline]
fn num_vals(&self) -> u64 {
self.num_vals
}
}
pub struct BitpackedSerializerLegacy<'a, W: 'a + Write> {
bit_packer: BitPacker,
write: &'a mut W,
min_value: u64,
num_vals: u64,
amplitude: u64,
num_bits: u8,
}
@@ -78,6 +88,7 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
bit_packer,
write,
min_value,
num_vals: 0,
amplitude,
num_bits,
})
@@ -88,12 +99,14 @@ impl<'a, W: Write> BitpackedSerializerLegacy<'a, W> {
let val_to_write: u64 = val - self.min_value;
self.bit_packer
.write(val_to_write, self.num_bits, &mut self.write)?;
self.num_vals += 1;
Ok(())
}
pub fn close_field(mut self) -> io::Result<()> {
self.bit_packer.close(&mut self.write)?;
self.min_value.serialize(&mut self.write)?;
self.amplitude.serialize(&mut self.write)?;
self.num_vals.serialize(&mut self.write)?;
Ok(())
}
}

View File

@@ -19,7 +19,7 @@ use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::linear::{get_calculated_value, get_slope};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
const CHUNK_SIZE: u64 = 512;
@@ -148,7 +148,7 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
&interpolations[get_interpolation_position(doc)]
}
impl FastFieldCodecReader for BlockwiseLinearReader {
impl FastFieldCodecDeserializer for BlockwiseLinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
@@ -157,9 +157,11 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
let footer = BlockwiseLinearFooter::deserialize(&mut footer)?;
Ok(BlockwiseLinearReader { data, footer })
}
}
impl FastFieldDataAccess for BlockwiseLinearReader {
#[inline]
fn get_u64(&self, idx: u64) -> u64 {
fn get_val(&self, idx: u64) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let in_block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value(
@@ -182,6 +184,10 @@ impl FastFieldCodecReader for BlockwiseLinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Same as LinearSerializer, but working on chunks of CHUNK_SIZE elements.

View File

@@ -12,12 +12,21 @@ pub mod bitpacked;
pub mod blockwise_linear;
pub mod linear;
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
pub trait FastFieldCodecDeserializer: Sized {
/// Reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>
where Self: FastFieldDataAccess;
}
pub trait FastFieldDataAccess {
fn get_val(&self, doc: u64) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
fn num_vals(&self) -> u64;
/// Returns an iterator over the data
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = u64> + 'a> {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
}
}
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
@@ -85,31 +94,6 @@ pub trait FastFieldCodecSerializer {
) -> io::Result<()>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
pub trait FastFieldDataAccess {
/// Return the value associated to the given position.
///
/// Whenever possible use the Iterator passed to the fastfield creation instead, for performance
/// reasons.
///
/// # Panics
///
/// May panic if `position` is greater than the index.
fn get_val(&self, position: u64) -> u64;
/// Returns a iterator over the data
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_>;
/// min value of the data
fn min_value(&self) -> u64;
/// max value of the data
fn max_value(&self) -> u64;
/// num vals
fn num_vals(&self) -> u64;
}
#[derive(Debug, Clone)]
/// Statistics are used in codec detection and stored in the fast field footer.
pub struct FastFieldStats {
@@ -169,7 +153,10 @@ mod tests {
use crate::blockwise_linear::{BlockwiseLinearReader, BlockwiseLinearSerializer};
use crate::linear::{LinearReader, LinearSerializer};
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
pub fn create_and_validate<
S: FastFieldCodecSerializer,
R: FastFieldCodecDeserializer + FastFieldDataAccess,
>(
data: &[u64],
name: &str,
) -> (f32, f32) {
@@ -183,8 +170,9 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
assert_eq!(reader.num_vals(), data.len() as u64);
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64);
let val = reader.get_val(doc as u64);
if val != *orig_val {
panic!(
"val {val:?} does not match orig_val {orig_val:?}, in data set {name}, data \
@@ -228,7 +216,10 @@ mod tests {
data_and_names
}
fn test_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>() {
fn test_codec<
S: FastFieldCodecSerializer,
R: FastFieldDataAccess + FastFieldCodecDeserializer,
>() {
let codec_name = format!("{:?}", S::CODEC_TYPE);
for (data, dataset_name) in get_codec_test_data_sets() {
let (estimate, actual) = crate::tests::create_and_validate::<S, R>(&data, dataset_name);

View File

@@ -6,7 +6,7 @@ use ownedbytes::OwnedBytes;
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{
FastFieldCodecReader, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
FastFieldCodecDeserializer, FastFieldCodecSerializer, FastFieldCodecType, FastFieldDataAccess,
};
/// Depending on the field type, a different
@@ -59,7 +59,7 @@ impl FixedSize for LinearFooter {
const SIZE_IN_BYTES: usize = 56;
}
impl FastFieldCodecReader for LinearReader {
impl FastFieldCodecDeserializer for LinearReader {
/// Opens a fast field given a file.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
let footer_offset = bytes.len() - LinearFooter::SIZE_IN_BYTES;
@@ -75,8 +75,11 @@ impl FastFieldCodecReader for LinearReader {
slope,
})
}
}
impl FastFieldDataAccess for LinearReader {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
fn get_val(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, &self.data)) - self.footer.offset
}
@@ -89,6 +92,10 @@ impl FastFieldCodecReader for LinearReader {
fn max_value(&self) -> u64 {
self.footer.max_value
}
#[inline]
fn num_vals(&self) -> u64 {
self.footer.num_vals
}
}
/// Fastfield serializer, which tries to guess values by linear interpolation

View File

@@ -3,7 +3,7 @@ use std::num::NonZeroU64;
use common::BinarySerializable;
use fastdivide::DividerU64;
use fastfield_codecs::FastFieldCodecReader;
use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldDataAccess};
use ownedbytes::OwnedBytes;
pub const GCD_DEFAULT: u64 = 1;
@@ -15,27 +15,33 @@ pub const GCD_DEFAULT: u64 = 1;
pub struct GCDFastFieldCodec<CodecReader> {
gcd: u64,
min_value: u64,
num_vals: u64,
reader: CodecReader,
}
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
impl<C: FastFieldDataAccess + FastFieldCodecDeserializer + Clone> FastFieldCodecDeserializer
for GCDFastFieldCodec<C>
{
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
let footer_offset = bytes.len() - 16;
let footer_offset = bytes.len() - 24;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
let num_vals = u64::deserialize(&mut footer)?;
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
gcd,
min_value,
num_vals,
reader,
})
}
}
impl<C: FastFieldDataAccess + Clone> FastFieldDataAccess for GCDFastFieldCodec<C> {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
fn get_val(&self, doc: u64) -> u64 {
let mut data = self.reader.get_val(doc);
data *= self.gcd;
data += self.min_value;
data
@@ -48,11 +54,20 @@ impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec
fn max_value(&self) -> u64 {
self.min_value + self.reader.max_value() * self.gcd
}
fn num_vals(&self) -> u64 {
self.num_vals
}
}
pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> {
pub fn write_gcd_header<W: Write>(
field_write: &mut W,
min_value: u64,
gcd: u64,
num_vals: u64,
) -> io::Result<()> {
gcd.serialize(field_write)?;
min_value.serialize(field_write)?;
num_vals.serialize(field_write)?;
Ok(())
}

View File

@@ -326,7 +326,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 37);
assert_eq!(file.len(), 45);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
@@ -357,7 +357,7 @@ mod tests {
serializer.close()?;
}
let file = directory.open_read(path)?;
assert_eq!(file.len(), 62);
assert_eq!(file.len(), 70);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -393,7 +393,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 43);
{
let fast_fields_composite = CompositeFile::open(&file).unwrap();
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -425,7 +425,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 80043);
assert_eq!(file.len(), 80051);
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(*FIELD).unwrap();
@@ -896,7 +896,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 36);
assert_eq!(file.len(), 44);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -932,7 +932,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 48);
assert_eq!(file.len(), 56);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
@@ -966,7 +966,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 35);
assert_eq!(file.len(), 43);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(field).unwrap();
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;

View File

@@ -6,7 +6,7 @@ use common::BinarySerializable;
use fastfield_codecs::bitpacked::BitpackedReader;
use fastfield_codecs::blockwise_linear::BlockwiseLinearReader;
use fastfield_codecs::linear::LinearReader;
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecType};
use fastfield_codecs::{FastFieldCodecDeserializer, FastFieldCodecType, FastFieldDataAccess};
use super::{FastValue, GCDFastFieldCodec};
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
@@ -199,7 +199,9 @@ pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item, C> {
impl<Item: FastValue, C: FastFieldDataAccess + FastFieldCodecDeserializer>
FastFieldReaderCodecWrapper<Item, C>
{
/// Opens a fast field given a file.
pub fn open(file: FileSlice) -> crate::Result<Self> {
let mut bytes = file.read_bytes()?;
@@ -226,7 +228,7 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
let data = self.reader.get_u64(doc);
let data = self.reader.get_val(doc);
Item::from_u64(data)
}
@@ -249,8 +251,8 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
}
}
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
impl<Item: FastValue, C: FastFieldDataAccess + FastFieldCodecDeserializer + Clone>
FastFieldReader<Item> for FastFieldReaderCodecWrapper<Item, C>
{
/// Return the value associated to the given document.
///

View File

@@ -189,7 +189,7 @@ impl CompositeFastFieldSerializer {
field_write,
fastfield_accessor,
)?;
write_gcd_header(field_write, base_value, gcd)?;
write_gcd_header(field_write, base_value, gcd, num_vals)?;
Ok(())
}

View File

@@ -2,12 +2,13 @@ use std::collections::HashMap;
use std::io;
use common;
use fastfield_codecs::FastFieldDataAccess;
use fnv::FnvHashMap;
use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType, FastValue};
use super::{FastFieldType, FastValue};
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;