Experimental refactor

This commit is contained in:
Paul Masurel
2022-08-21 11:19:02 +02:00
parent 6602786db8
commit 0ec2ebd791
10 changed files with 533 additions and 229 deletions

View File

@@ -17,22 +17,7 @@ pub struct BitpackedFastFieldReader {
}
impl FastFieldCodecReader for BitpackedFastFieldReader {
/// Opens a bitpacked fast field from its serialized bytes.
///
/// The last 16 bytes are read as a footer made of two `u64` values:
/// the minimum value followed by the amplitude (`max_value - min_value`).
/// Everything before the footer is kept as the bitpacked payload.
///
/// # Errors
///
/// Returns an error if `bytes` is shorter than the footer, if the footer
/// cannot be deserialized, or if `min_value + amplitude` does not fit in a
/// `u64`.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
    const FOOTER_NUM_BYTES: usize = 16;
    // A plain subtraction would underflow on truncated input.
    let footer_offset = bytes.len().checked_sub(FOOTER_NUM_BYTES).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Bitpacked fast field is too short to contain its footer.",
        )
    })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let min_value = u64::deserialize(&mut footer)?;
    let amplitude = u64::deserialize(&mut footer)?;
    // A corrupted footer must not silently wrap around.
    let max_value = min_value.checked_add(amplitude).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "Bitpacked fast field footer overflows u64.",
        )
    })?;
    let num_bits = compute_num_bits(amplitude);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(BitpackedFastFieldReader {
        data,
        min_value_u64: min_value,
        max_value_u64: max_value,
        bit_unpacker,
    })
}
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
self.min_value_u64 + self.bit_unpacker.get(doc, &self.data)
@@ -96,11 +81,30 @@ impl<'a, W: Write> BitpackedFastFieldSerializerLegacy<'a, W> {
}
}
pub struct BitpackedFastFieldSerializer {}
pub struct BitpackedFastFieldSerializer;
impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
const NAME: &'static str = "Bitpacked";
const ID: u8 = 1;
type Reader = BitpackedFastFieldReader;
/// Opens a bitpacked fast field from its serialized bytes.
///
/// The last 16 bytes are read as a footer made of two `u64` values:
/// the minimum value followed by the amplitude (`max_value - min_value`).
/// Everything before the footer is kept as the bitpacked payload.
///
/// # Errors
///
/// Returns an error if `bytes` is shorter than the footer, if the footer
/// cannot be deserialized, or if `min_value + amplitude` does not fit in a
/// `u64`.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    const FOOTER_NUM_BYTES: usize = 16;
    // A plain subtraction would underflow on truncated input.
    let footer_offset = bytes.len().checked_sub(FOOTER_NUM_BYTES).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Bitpacked fast field is too short to contain its footer.",
        )
    })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let min_value = u64::deserialize(&mut footer)?;
    let amplitude = u64::deserialize(&mut footer)?;
    // A corrupted footer must not silently wrap around.
    let max_value = min_value.checked_add(amplitude).ok_or_else(|| {
        io::Error::new(
            io::ErrorKind::InvalidData,
            "Bitpacked fast field footer overflows u64.",
        )
    })?;
    let num_bits = compute_num_bits(amplitude);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(BitpackedFastFieldReader {
        data,
        min_value_u64: min_value,
        max_value_u64: max_value,
        bit_unpacker,
    })
}
/// Serializes data with the BitpackedFastFieldSerializer.
///
/// The serializer in fact encodes the values by bitpacking
@@ -146,7 +150,7 @@ mod tests {
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) {
crate::tests::create_and_validate::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(
crate::tests::create_and_validate::<BitpackedFastFieldSerializer>(
data, name,
);
}

View File

@@ -0,0 +1,148 @@
// Copyright (C) 2022 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
use std::io;
use std::sync::Arc;
use ownedbytes::OwnedBytes;
use crate::FastFieldCodecSerializer;
use crate::bitpacked::BitpackedFastFieldSerializer;
use crate::linearinterpol::LinearInterpolFastFieldSerializer;
use crate::FastFieldCodecReader;
use crate::gcd::GCDFastFieldCodecSerializer;
use crate::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
/// Codec whose `open_from_bytes` reads a leading codec-code byte and
/// delegates to the matching concrete codec.
struct DynamicFastFieldSerializer;
impl FastFieldCodecSerializer for DynamicFastFieldSerializer {
const NAME: &'static str = "dynamic";
type Reader = DynamicFastFieldReader;
fn is_applicable(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> bool {
todo!()
}
fn estimate(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> f32 {
todo!()
}
fn serialize(
write: &mut impl io::Write,
fastfield_accessor: &dyn crate::FastFieldDataAccess,
stats: crate::FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
todo!()
}
fn open_from_bytes(mut bytes: OwnedBytes) -> io::Result<Self::Reader> {
let codec_code = bytes.read_u8();
let codec_type = CodecType::from_code(codec_code).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Unknown codec code `{codec_code}`"),
)
})?;
let fast_field_reader: Arc<dyn FastFieldCodecReader> = match codec_type {
CodecType::Bitpacked => Arc::new(BitpackedFastFieldSerializer::open_from_bytes(bytes)?),
CodecType::LinearInterpol => {
Arc::new(LinearInterpolFastFieldSerializer::open_from_bytes(bytes)?)
}
CodecType::MultiLinearInterpol => {
Arc::new(MultiLinearInterpolFastFieldSerializer::open_from_bytes(bytes)?)
}
CodecType::Gcd => {
let inner_codec_id = bytes.read_u8();
let inner_codec_type = CodecType::from_code(inner_codec_id).ok_or_else(|| {
io::Error::new(
io::ErrorKind::InvalidData,
format!("Unknown codec code `{codec_code}`"),
)
})?;
match inner_codec_type {
CodecType::Bitpacked => {
Arc::new(GCDFastFieldCodecSerializer::<BitpackedFastFieldSerializer>::open_from_bytes(bytes)?)
}
CodecType::LinearInterpol => {
Arc::new(GCDFastFieldCodecSerializer::<LinearInterpolFastFieldSerializer>::open_from_bytes(bytes)?)
}
CodecType::MultiLinearInterpol => {
Arc::new(GCDFastFieldCodecSerializer::<MultiLinearInterpolFastFieldSerializer>::open_from_bytes(bytes)?)
}
CodecType::Gcd => {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
"A GCD codec may not wrap another GCD codec.",
));
}
}
}
};
Ok(DynamicFastFieldReader(fast_field_reader))
}
}
#[derive(Clone)]
/// DynamicFastFieldReader wraps different readers to access
/// the various encoded fastfield data.
///
/// The concrete reader is stored behind an `Arc<dyn FastFieldCodecReader>`,
/// so clones share the same underlying reader.
pub struct DynamicFastFieldReader(Arc<dyn FastFieldCodecReader>);
/// Codec identifier, stored as a single byte in front of the encoded data.
#[repr(u8)]
#[derive(Debug, Clone, Copy)]
enum CodecType {
    Bitpacked = 0,
    LinearInterpol = 1,
    MultiLinearInterpol = 2,
    Gcd = 3,
}

impl CodecType {
    /// Returns the codec type whose code is `code`, or `None` if no codec
    /// uses that code.
    pub fn from_code(code: u8) -> Option<Self> {
        const KNOWN_CODECS: [CodecType; 4] = [
            CodecType::Bitpacked,
            CodecType::LinearInterpol,
            CodecType::MultiLinearInterpol,
            CodecType::Gcd,
        ];
        KNOWN_CODECS
            .iter()
            .copied()
            .find(|codec_type| codec_type.to_code() == code)
    }

    /// Returns the one-byte code of this codec type (its `repr(u8)`
    /// discriminant).
    pub fn to_code(self) -> u8 {
        self as u8
    }
}
impl FastFieldCodecReader for DynamicFastFieldReader {
fn get_u64(&self, doc: u64) -> u64 {
self.0.get_u64(doc)
}
fn min_value(&self) -> u64 {
self.0.min_value()
}
fn max_value(&self) -> u64 {
self.0.max_value()
}
}

View File

@@ -1,44 +1,71 @@
use std::io::{self, Write};
use std::{io::{self, Write}, marker::PhantomData, num::NonZeroU64};
use common::BinarySerializable;
use fastdivide::DividerU64;
use ownedbytes::OwnedBytes;
use crate::FastFieldCodecReader;
pub const GCD_DEFAULT: u64 = 1;
pub const GCD_CODEC_ID: u8 = 4;
use crate::{FastFieldCodecReader, FastFieldCodecSerializer};
/// Wrapper for accessing a fastfield.
///
/// Holds the data and the codec to read the data.
#[derive(Clone)]
pub struct GCDFastFieldCodec<CodecReader> {
pub struct GCDFastFieldCodecReader<CodecReader> {
gcd: u64,
min_value: u64,
reader: CodecReader,
}
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
/// Opens a fast field given the bytes.
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self> {
/// GCD codec parameterized by the codec it wraps.
///
/// It stores no data: the type parameter only selects which wrapped codec
/// `open_from_bytes` delegates to.
pub struct GCDFastFieldCodecSerializer<WrappedCodecSerializer: FastFieldCodecSerializer> {
// Zero-sized marker that ties the struct to its wrapped codec type.
_wrapped_type: PhantomData<WrappedCodecSerializer>,
}
impl<WrappedCodecSerializer: FastFieldCodecSerializer> GCDFastFieldCodecSerializer<WrappedCodecSerializer> {}
impl<WrappedCodecSerializer: FastFieldCodecSerializer> FastFieldCodecSerializer for GCDFastFieldCodecSerializer<WrappedCodecSerializer> {
// TODO: we would like to include the underlying codec name as well.
const NAME: &'static str = "GCD";
// The reader wraps whatever reader the wrapped codec produces.
type Reader = GCDFastFieldCodecReader<WrappedCodecSerializer::Reader>;
/// Not implemented yet: calling this panics via `todo!()`.
fn is_applicable(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> bool {
todo!()
}
/// Not implemented yet: calling this panics via `todo!()`.
fn estimate(fastfield_accessor: &impl crate::FastFieldDataAccess, stats: crate::FastFieldStats) -> f32 {
todo!()
}
/// Not implemented yet: calling this panics via `todo!()`.
fn serialize(
write: &mut impl Write,
fastfield_accessor: &dyn crate::FastFieldDataAccess,
stats: crate::FastFieldStats,
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
todo!()
}
/// Opens a GCD-encoded fast field.
///
/// The last 16 bytes are read as a footer made of two `u64` values, the
/// gcd followed by the minimum value. The preceding bytes are handed to
/// the wrapped codec.
// NOTE(review): `bytes.len() - 16` underflows when fewer than 16 bytes are
// supplied; consider `checked_sub` and returning an `io::Error` instead.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
let footer_offset = bytes.len() - 16;
let (body, mut footer) = bytes.split(footer_offset);
let gcd = u64::deserialize(&mut footer)?;
let min_value = u64::deserialize(&mut footer)?;
// NOTE(review): the next two lines name `C` and `GCDFastFieldCodec`, which
// this impl does not declare — presumably the pre-refactor variant shown
// next to its replacement; confirm which pair is current.
let reader = C::open_from_bytes(body)?;
Ok(GCDFastFieldCodec {
let reader = WrappedCodecSerializer::open_from_bytes(body)?;
Ok(GCDFastFieldCodecReader {
gcd,
min_value,
reader,
})
}
}
impl<C: FastFieldCodecReader> FastFieldCodecReader for GCDFastFieldCodecReader<C> {
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let mut data = self.reader.get_u64(doc);
data *= self.gcd;
data += self.min_value;
data
self.min_value + self.gcd * self.reader.get_u64(doc)
}
fn min_value(&self) -> u64 {
@@ -64,11 +91,13 @@ fn compute_gcd(mut left: u64, mut right: u64) -> u64 {
}
// Find GCD for iterator of numbers
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<u64> {
//
// If all numbers are `0` (or if there are no numbers), returns `None`.
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<NonZeroU64> {
let mut numbers = numbers.filter(|n| *n != 0);
let mut gcd = numbers.next()?;
if gcd == 1 {
return Some(1);
return NonZeroU64::new(gcd);
}
let mut gcd_divider = DividerU64::divide_by(gcd);
@@ -79,151 +108,150 @@ pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<u64> {
}
gcd = compute_gcd(gcd, val);
if gcd == 1 {
return Some(1);
return NonZeroU64::new(1);
}
gcd_divider = DividerU64::divide_by(gcd);
}
Some(gcd)
NonZeroU64::new(gcd)
}
#[cfg(test)]
mod tests {
/*
TODO Move test
// TODO Move test
//
// use std::collections::HashMap;
// use std::path::Path;
//
// use crate::directory::{CompositeFile, RamDirectory, WritePtr};
// use crate::fastfield::serializer::FastFieldCodecEnableCheck;
// use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
// use super::{
// find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName,
// FastFieldReader, FastFieldsWriter, ALL_CODECS,
// };
// use crate::schema::Schema;
// use crate::Directory;
//
// fn get_index(
// docs: &[crate::Document],
// schema: &Schema,
// codec_enable_checker: FastFieldCodecEnableCheck,
// ) -> crate::Result<RamDirectory> {
// let directory: RamDirectory = RamDirectory::create();
// {
// let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
// let mut serializer =
// CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker)
// .unwrap();
// let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
// for doc in docs {
// fast_field_writers.add_document(doc);
// }
// fast_field_writers
// .serialize(&mut serializer, &HashMap::new(), None)
// .unwrap();
// serializer.close().unwrap();
// }
// Ok(directory)
// }
//
// fn test_fastfield_gcd_i64_with_codec(
// codec_name: FastFieldCodecName,
// num_vals: usize,
// ) -> crate::Result<()> {
// let path = Path::new("test");
// let mut docs = vec![];
// for i in 1..=num_vals {
// let val = i as i64 * 1000i64;
// docs.push(doc!(*FIELDI64=>val));
// }
// let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?;
// let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
// let composite_file = CompositeFile::open(&file)?;
// let file = composite_file.open_read(*FIELD).unwrap();
// let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
// assert_eq!(fast_field_reader.get(0), 1000i64);
// assert_eq!(fast_field_reader.get(1), 2000i64);
// assert_eq!(fast_field_reader.get(2), 3000i64);
// assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000);
// assert_eq!(fast_field_reader.min_value(), 1000i64);
// let file = directory.open_read(path).unwrap();
//
// Can't apply gcd
// let path = Path::new("test");
// docs.pop();
// docs.push(doc!(*FIELDI64=>2001i64));
// let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?;
// let file2 = directory.open_read(path).unwrap();
// assert!(file2.len() > file.len());
//
// Ok(())
// }
//
// #[test]
// fn test_fastfield_gcd_i64() -> crate::Result<()> {
// for codec_name in ALL_CODECS {
// test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?;
// }
// Ok(())
// }
//
// fn test_fastfield_gcd_u64_with_codec(
// codec_name: FastFieldCodecName,
// num_vals: usize,
// ) -> crate::Result<()> {
// let path = Path::new("test");
// let mut docs = vec![];
// for i in 1..=num_vals {
// let val = i as u64 * 1000u64;
// docs.push(doc!(*FIELD=>val));
// }
// let directory = get_index(&docs, &SCHEMA, codec_name.clone().into())?;
// let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
// let composite_file = CompositeFile::open(&file)?;
// let file = composite_file.open_read(*FIELD).unwrap();
// let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
// assert_eq!(fast_field_reader.get(0), 1000u64);
// assert_eq!(fast_field_reader.get(1), 2000u64);
// assert_eq!(fast_field_reader.get(2), 3000u64);
// assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
// assert_eq!(fast_field_reader.min_value(), 1000u64);
// let file = directory.open_read(path).unwrap();
//
// Can't apply gcd
// let path = Path::new("test");
// docs.pop();
// docs.push(doc!(*FIELDI64=>2001u64));
// let directory = get_index(&docs, &SCHEMA, codec_name.into())?;
// let file2 = directory.open_read(path).unwrap();
// assert!(file2.len() > file.len());
//
// Ok(())
// }
//
// #[test]
// fn test_fastfield_gcd_u64() -> crate::Result<()> {
// for codec_name in ALL_CODECS {
// test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?;
// }
// Ok(())
// }
//
// #[test]
// pub fn test_fastfield2() {
// let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
// assert_eq!(test_fastfield.get(0), 100);
// assert_eq!(test_fastfield.get(1), 200);
// assert_eq!(test_fastfield.get(2), 300);
// }
use std::collections::HashMap;
use std::path::Path;
use std::num::NonZeroU64;
use crate::directory::{CompositeFile, RamDirectory, WritePtr};
use crate::fastfield::serializer::FastFieldCodecEnableCheck;
use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
use super::{
find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName,
FastFieldReader, FastFieldsWriter, ALL_CODECS,
};
use crate::schema::Schema;
use crate::Directory;
fn get_index(
docs: &[crate::Document],
schema: &Schema,
codec_enable_checker: FastFieldCodecEnableCheck,
) -> crate::Result<RamDirectory> {
let directory: RamDirectory = RamDirectory::create();
{
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
let mut serializer =
CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker)
.unwrap();
let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
for doc in docs {
fast_field_writers.add_document(doc);
}
fast_field_writers
.serialize(&mut serializer, &HashMap::new(), None)
.unwrap();
serializer.close().unwrap();
}
Ok(directory)
}
fn test_fastfield_gcd_i64_with_codec(
codec_name: FastFieldCodecName,
num_vals: usize,
) -> crate::Result<()> {
let path = Path::new("test");
let mut docs = vec![];
for i in 1..=num_vals {
let val = i as i64 * 1000i64;
docs.push(doc!(*FIELDI64=>val));
}
let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?;
let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 1000i64);
assert_eq!(fast_field_reader.get(1), 2000i64);
assert_eq!(fast_field_reader.get(2), 3000i64);
assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000);
assert_eq!(fast_field_reader.min_value(), 1000i64);
let file = directory.open_read(path).unwrap();
// Can't apply gcd
let path = Path::new("test");
docs.pop();
docs.push(doc!(*FIELDI64=>2001i64));
let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?;
let file2 = directory.open_read(path).unwrap();
assert!(file2.len() > file.len());
Ok(())
}
#[test]
fn test_fastfield_gcd_i64() -> crate::Result<()> {
for codec_name in ALL_CODECS {
test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?;
}
Ok(())
}
fn test_fastfield_gcd_u64_with_codec(
codec_name: FastFieldCodecName,
num_vals: usize,
) -> crate::Result<()> {
let path = Path::new("test");
let mut docs = vec![];
for i in 1..=num_vals {
let val = i as u64 * 1000u64;
docs.push(doc!(*FIELD=>val));
}
let directory = get_index(&docs, &SCHEMA, codec_name.clone().into())?;
let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 118);
let composite_file = CompositeFile::open(&file)?;
let file = composite_file.open_read(*FIELD).unwrap();
let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
assert_eq!(fast_field_reader.get(0), 1000u64);
assert_eq!(fast_field_reader.get(1), 2000u64);
assert_eq!(fast_field_reader.get(2), 3000u64);
assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
assert_eq!(fast_field_reader.min_value(), 1000u64);
let file = directory.open_read(path).unwrap();
// Can't apply gcd
let path = Path::new("test");
docs.pop();
docs.push(doc!(*FIELDI64=>2001u64));
let directory = get_index(&docs, &SCHEMA, codec_name.into())?;
let file2 = directory.open_read(path).unwrap();
assert!(file2.len() > file.len());
Ok(())
}
#[test]
fn test_fastfield_gcd_u64() -> crate::Result<()> {
for codec_name in ALL_CODECS {
test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?;
}
Ok(())
}
#[test]
pub fn test_fastfield2() {
let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
assert_eq!(test_fastfield.get(0), 100);
assert_eq!(test_fastfield.get(1), 200);
assert_eq!(test_fastfield.get(2), 300);
}
*/
use crate::gcd::compute_gcd;
use crate::gcd::find_gcd;
use crate::gcd::{compute_gcd, find_gcd};
#[test]
fn test_compute_gcd() {
@@ -238,16 +266,15 @@ mod tests {
assert_eq!(compute_gcd(25, 25), 25);
}
#[test]
fn find_gcd_test() {
assert_eq!(find_gcd([0].into_iter()), None);
assert_eq!(find_gcd([0, 10].into_iter()), Some(10));
assert_eq!(find_gcd([10, 0].into_iter()), Some(10));
assert_eq!(find_gcd([0, 10].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([10, 0].into_iter()), NonZeroU64::new(10));
assert_eq!(find_gcd([].into_iter()), None);
assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), Some(5));
assert_eq!(find_gcd([15, 16, 10].into_iter()), Some(1));
assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), Some(5));
assert_eq!(find_gcd([0, 0].into_iter()), Some(0));
assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([15, 16, 10].into_iter()), NonZeroU64::new(1));
assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), NonZeroU64::new(5));
assert_eq!(find_gcd([0, 0].into_iter()), None);
}
}

View File

@@ -8,13 +8,13 @@ use std::io::Write;
use ownedbytes::OwnedBytes;
pub mod bitpacked;
pub mod dynamic;
pub mod gcd;
pub mod linearinterpol;
pub mod multilinearinterpol;
pub trait FastFieldCodecReader: Sized {
pub trait FastFieldCodecReader{
/// reads the metadata and returns the CodecReader
fn open_from_bytes(bytes: OwnedBytes) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
@@ -23,10 +23,10 @@ pub trait FastFieldCodecReader: Sized {
/// The FastFieldSerializerEstimate trait is required on all variants
/// of fast field compressions, to decide which one to choose.
pub trait FastFieldCodecSerializer {
/// A codex needs to provide a unique name and id, which is
/// used for debugging and de/serialization.
/// A codec needs to provide a unique name used for debugging and de/serialization.
const NAME: &'static str;
const ID: u8;
type Reader: FastFieldCodecReader;
/// Check if the Codec is able to compress the data
fn is_applicable(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> bool;
@@ -48,6 +48,8 @@ pub trait FastFieldCodecSerializer {
data_iter: impl Iterator<Item = u64>,
data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()>;
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader>;
}
/// FastFieldDataAccess is the trait to access fast field data during serialization and estimation.
@@ -91,7 +93,7 @@ mod tests {
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
};
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
pub fn create_and_validate<S: FastFieldCodecSerializer>(
data: &[u64],
name: &str,
) -> (f32, f32) {
@@ -111,7 +113,7 @@ mod tests {
let actual_compression = out.len() as f32 / (data.len() as f32 * 8.0);
let reader = R::open_from_bytes(OwnedBytes::new(out)).unwrap();
let reader = S::open_from_bytes(OwnedBytes::new(out)).unwrap();
for (doc, orig_val) in data.iter().enumerate() {
let val = reader.get_u64(doc as u64);
if val != *orig_val {
@@ -143,7 +145,7 @@ mod tests {
let codec_name = S::NAME;
for (data, data_set_name) in get_codec_test_data_sets() {
let (estimate, actual) =
crate::tests::create_and_validate::<S, R>(&data, data_set_name);
crate::tests::create_and_validate::<S>(&data, data_set_name);
let result = if estimate == f32::MAX {
"Disabled".to_string()
} else {

View File

@@ -58,21 +58,7 @@ impl FixedSize for LinearInterpolFooter {
}
impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Opens a linear-interpolation fast field from its serialized bytes.
///
/// The last `LinearInterpolFooter::SIZE_IN_BYTES` bytes are deserialized as
/// the footer; everything before it is kept as the bitpacked payload.
///
/// # Errors
///
/// Returns an error if `bytes` is shorter than the footer or if the footer
/// cannot be deserialized.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
    // A plain subtraction would underflow on truncated input.
    let footer_offset = bytes
        .len()
        .checked_sub(LinearInterpolFooter::SIZE_IN_BYTES)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Linear interpolation fast field is too short to contain its footer.",
            )
        })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = LinearInterpolFooter::deserialize(&mut footer)?;
    let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
    let num_bits = compute_num_bits(footer.relative_max_value);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(LinearInterpolFastFieldReader {
        data,
        bit_unpacker,
        footer,
        slope,
    })
}
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
@@ -110,7 +96,25 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
const NAME: &'static str = "LinearInterpol";
const ID: u8 = 2;
type Reader = LinearInterpolFastFieldReader;
/// Opens a linear-interpolation fast field from its serialized bytes.
///
/// The last `LinearInterpolFooter::SIZE_IN_BYTES` bytes are deserialized as
/// the footer; everything before it is kept as the bitpacked payload.
///
/// # Errors
///
/// Returns an error if `bytes` is shorter than the footer or if the footer
/// cannot be deserialized.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    // A plain subtraction would underflow on truncated input.
    let footer_offset = bytes
        .len()
        .checked_sub(LinearInterpolFooter::SIZE_IN_BYTES)
        .ok_or_else(|| {
            io::Error::new(
                io::ErrorKind::UnexpectedEof,
                "Linear interpolation fast field is too short to contain its footer.",
            )
        })?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = LinearInterpolFooter::deserialize(&mut footer)?;
    let slope = get_slope(footer.first_val, footer.last_val, footer.num_vals);
    let num_bits = compute_num_bits(footer.relative_max_value);
    let bit_unpacker = BitUnpacker::new(num_bits);
    Ok(LinearInterpolFastFieldReader {
        data,
        bit_unpacker,
        footer,
        slope,
    })
}
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
@@ -240,7 +244,6 @@ mod tests {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
LinearInterpolFastFieldSerializer,
LinearInterpolFastFieldReader,
>(data, name)
}

View File

@@ -1,7 +1,7 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
// use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
// use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
use prettytable::{Cell, Row, Table};
@@ -12,11 +12,11 @@ fn main() {
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
results.push(res);
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
results.push(res);
let mut results = Vec::new();
// let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
// results.push(res);
// let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
// results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
&data,
);

View File

@@ -146,15 +146,6 @@ fn get_interpolation_function(doc: u64, interpolations: &[Function]) -> &Functio
}
impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
/// Opens a multi-linear-interpolation fast field from its serialized bytes.
///
/// The last 4 bytes hold the footer length as a `u32`. The footer sits
/// right before that length field, and everything before the footer is
/// kept as the payload.
///
/// # Errors
///
/// Returns an error if `bytes` is too short to hold the length field or the
/// footer it announces, or if the footer cannot be deserialized.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self> {
    const FOOTER_LEN_NUM_BYTES: usize = 4;

    // Builds the error reported for truncated or corrupted input.
    fn truncated() -> io::Error {
        io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Multi-linear interpolation fast field is too short to contain its footer.",
        )
    }

    // Both subtractions would underflow on truncated or corrupted input.
    let footer_len_offset = bytes
        .len()
        .checked_sub(FOOTER_LEN_NUM_BYTES)
        .ok_or_else(truncated)?;
    let footer_len: u32 = (&bytes[footer_len_offset..]).deserialize()?;
    let footer_offset = footer_len_offset
        .checked_sub(footer_len as usize)
        .ok_or_else(truncated)?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
    Ok(MultiLinearInterpolFastFieldReader { data, footer })
}
#[inline]
fn get_u64(&self, doc: u64) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
@@ -192,7 +183,18 @@ pub struct MultiLinearInterpolFastFieldSerializer {}
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpol";
const ID: u8 = 3;
type Reader = MultiLinearInterpolFastFieldReader;
/// Opens a multi-linear-interpolation fast field from its serialized bytes.
///
/// The last 4 bytes hold the footer length as a `u32`. The footer sits
/// right before that length field, and everything before the footer is
/// kept as the payload.
///
/// # Errors
///
/// Returns an error if `bytes` is too short to hold the length field or the
/// footer it announces, or if the footer cannot be deserialized.
fn open_from_bytes(bytes: OwnedBytes) -> io::Result<Self::Reader> {
    const FOOTER_LEN_NUM_BYTES: usize = 4;

    // Builds the error reported for truncated or corrupted input.
    fn truncated() -> io::Error {
        io::Error::new(
            io::ErrorKind::UnexpectedEof,
            "Multi-linear interpolation fast field is too short to contain its footer.",
        )
    }

    // Both subtractions would underflow on truncated or corrupted input.
    let footer_len_offset = bytes
        .len()
        .checked_sub(FOOTER_LEN_NUM_BYTES)
        .ok_or_else(truncated)?;
    let footer_len: u32 = (&bytes[footer_len_offset..]).deserialize()?;
    let footer_offset = footer_len_offset
        .checked_sub(footer_len as usize)
        .ok_or_else(truncated)?;
    let (data, mut footer) = bytes.split(footer_offset);
    let footer = MultiLinearInterpolFooter::deserialize(&mut footer)?;
    Ok(MultiLinearInterpolFastFieldReader { data, footer })
}
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
@@ -374,7 +376,6 @@ mod tests {
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
MultiLinearInterpolFastFieldSerializer,
MultiLinearInterpolFastFieldReader,
>(data, name)
}

View File

@@ -5,7 +5,7 @@ use std::path::Path;
use fastfield_codecs::bitpacked::{
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
};
use fastfield_codecs::gcd::{GCDFastFieldCodec, GCD_CODEC_ID};
use fastfield_codecs::gcd::{GCDFastFieldCodecReader, GCD_CODEC_ID};
use fastfield_codecs::linearinterpol::{
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
};
@@ -73,14 +73,14 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
/// GCD and Bitpacked compressed fastfield data.
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BitpackedReader>>),
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodecReader<BitpackedReader>>),
/// GCD and Linear interpolated values + bitpacked
LinearInterpolGCD(
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<LinearInterpolFastFieldReader>>,
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodecReader<LinearInterpolFastFieldReader>>,
),
/// GCD and Blockwise linear interpolated values + bitpacked
MultiLinearInterpolGCD(
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<MultiLinearInterpolFastFieldReader>>,
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodecReader<MultiLinearInterpolFastFieldReader>>,
),
}
@@ -118,7 +118,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
BitpackedFastFieldSerializer::ID => {
DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<BitpackedReader>,
GCDFastFieldCodecReader<BitpackedReader>,
>::open_from_bytes(
bytes
)?)
@@ -126,7 +126,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpolGCD(FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<LinearInterpolFastFieldReader>,
GCDFastFieldCodecReader<LinearInterpolFastFieldReader>,
>::open_from_bytes(
bytes
)?)
@@ -135,7 +135,7 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
DynamicFastFieldReader::MultiLinearInterpolGCD(
FastFieldReaderCodecWrapper::<
Item,
GCDFastFieldCodec<MultiLinearInterpolFastFieldReader>,
GCDFastFieldCodecReader<MultiLinearInterpolFastFieldReader>,
>::open_from_bytes(bytes)?,
)
}

View File

@@ -1,4 +1,5 @@
use std::io::{self, Write};
use std::num::NonZeroU64;
use common::{BinarySerializable, CountingWriter};
pub use fastfield_codecs::bitpacked::{
@@ -141,7 +142,8 @@ impl CompositeFastFieldSerializer {
let field_write = self.composite_write.for_field_with_idx(field, idx);
let gcd = find_gcd(iter_gen().map(|val| val - stats.min_value)).unwrap_or(GCD_DEFAULT);
if gcd <= 1 {
if gcd == 1 {
// No GCD opportunity here.
return Self::create_auto_detect_u64_fast_field_with_idx_gcd(
self.codec_enable_checker.clone(),
field,
@@ -157,7 +159,7 @@ impl CompositeFastFieldSerializer {
struct GCDWrappedFFAccess<T: FastFieldDataAccess> {
fastfield_accessor: T,
min_value: u64,
gcd: u64,
gcd: NonZeroU64,
}
impl<T: FastFieldDataAccess> FastFieldDataAccess for GCDWrappedFFAccess<T> {
fn get_val(&self, position: u64) -> u64 {

117
src/fastfield/wrapper.rs Normal file
View File

@@ -0,0 +1,117 @@
// Copyright (C) 2022 Quickwit, Inc.
//
// Quickwit is offered under the AGPL v3.0 and as commercial software.
// For commercial licensing, contact us at hello@quickwit.io.
//
// AGPL:
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
//
/// Wrapper for accessing a fastfield.
///
/// Holds the codec reader used to read the data, together with a marker for
/// the `Item` type the values are exposed as.
#[derive(Clone)]
pub struct FastFieldReaderCodecWrapper<Item: FastValue, CodecReader> {
// Codec-specific reader; `min_value` and `max_value` are delegated to it.
reader: CodecReader,
// Zero-sized marker tying the wrapper to its `Item` type.
_phantom: PhantomData<Item>,
}
impl<Item: FastValue, C: FastFieldCodecReader + Clone> FastFieldReader<Item>
for FastFieldReaderCodecWrapper<Item, C>
{
/// Returns the value associated with the given document.
///
/// This accessor should return as fast as possible.
///
/// # Panics
///
/// May panic if `doc` is greater than the segment
/// `maxdoc`.
fn get(&self, doc: DocId) -> Item {
self.get_u64(u64::from(doc))
}
/// Fills an output buffer with the fast field values
/// associated with the `DocId` going from
/// `start` to `start + output.len()`.
///
/// Regardless of the type of `Item`, this method works by
/// - transmuting the output array
/// - extracting the `Item`s as if they were `u64`
/// - possibly converting the `u64` value to the right type.
///
/// # Panics
///
/// May panic if `start + output.len()` is greater than
/// the segment's `maxdoc`.
fn get_range(&self, start: u64, output: &mut [Item]) {
self.get_range_u64(start, output);
}
/// Returns the minimum value for this fast field.
///
/// Like `max_value`, this presumably does not take deleted documents
/// into account, so it should be considered a lower bound
/// of the actual minimum value.
fn min_value(&self) -> Item {
Item::from_u64(self.reader.min_value())
}
/// Returns the maximum value for this fast field.
///
/// The max value does not take possible deleted documents
/// into account, and should be considered an upper bound
/// of the actual maximum value.
fn max_value(&self) -> Item {
Item::from_u64(self.reader.max_value())
}
}
impl<Item: FastValue> From<Vec<Item>> for DynamicFastFieldReader<Item> {
    /// Builds a reader over `vals` by serializing them into a single fast
    /// field of an in-memory `RamDirectory` and opening that field again.
    ///
    /// # Panics
    ///
    /// Panics if any write, serialization or read step fails.
    fn from(vals: Vec<Item>) -> DynamicFastFieldReader<Item> {
        // One-field schema: a single u64 fast field named "field".
        let (schema, field) = {
            let mut builder = Schema::builder();
            let fast_field = builder.add_u64_field("field", FAST);
            (builder.build(), fast_field)
        };
        let path = Path::new("__dummy__");
        let directory: RamDirectory = RamDirectory::create();

        // Write phase, scoped so the writer and serializer are gone before
        // the file is read back.
        {
            let write: WritePtr = directory
                .open_write(path)
                .expect("With a RamDirectory, this should never fail.");
            let mut serializer = CompositeFastFieldSerializer::from_write(write)
                .expect("With a RamDirectory, this should never fail.");
            let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
            let field_writer = fast_field_writers
                .get_field_writer_mut(field)
                .expect("With a RamDirectory, this should never fail.");
            vals.into_iter()
                .map(|val| val.to_u64())
                .for_each(|raw_val| {
                    field_writer.add_val(raw_val);
                });
            fast_field_writers
                .serialize(&mut serializer, &HashMap::new(), None)
                .unwrap();
            serializer.close().unwrap();
        }

        // Read phase: reopen the composite file and extract the field.
        let file = directory.open_read(path).expect("Failed to open the file");
        let composite_file =
            CompositeFile::open(&file).expect("Failed to read the composite file");
        let field_file = composite_file
            .open_read(field)
            .expect("File component not found");
        DynamicFastFieldReader::open(field_file).unwrap()
    }
}