mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2025-12-28 04:52:55 +00:00
Compare commits
15 Commits
float
...
fastfield-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14d53851a8 | ||
|
|
2d176e66b6 | ||
|
|
838a332db0 | ||
|
|
defbd9139b | ||
|
|
0c87732459 | ||
|
|
4d66a3f0a0 | ||
|
|
977f01a8a3 | ||
|
|
c14bdd26d4 | ||
|
|
3272f80171 | ||
|
|
23d5ab5656 | ||
|
|
245ed5fed1 | ||
|
|
33bed01168 | ||
|
|
17a5f4f0ff | ||
|
|
c969582308 | ||
|
|
18d2ee5bb7 |
1
fastfield_codecs/.gitignore
vendored
Normal file
1
fastfield_codecs/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
datasets/
|
||||
@@ -6,8 +6,6 @@ license = "MIT"
|
||||
edition = "2018"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
|
||||
@@ -19,6 +17,6 @@ more-asserts = "0.2.1"
|
||||
rand = "0.8.3"
|
||||
|
||||
[features]
|
||||
unstable = [] # useful for benches and experimental codecs.
|
||||
bin = ["prettytable-rs", "rand"]
|
||||
default = ["bin"]
|
||||
|
||||
|
||||
6
fastfield_codecs/Makefile
Normal file
6
fastfield_codecs/Makefile
Normal file
@@ -0,0 +1,6 @@
|
||||
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids nooc_temperatures
|
||||
download:
|
||||
@echo "--- Downloading datasets ---"
|
||||
mkdir -p datasets
|
||||
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done
|
||||
|
||||
@@ -13,6 +13,10 @@ A codec needs to implement 2 traits:
|
||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||
|
||||
### Download real world datasets for codecs comparison
|
||||
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
|
||||
To run with the unstable codecs, execute `cargo run --features unstable`.
|
||||
|
||||
### Tests
|
||||
|
||||
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||
@@ -23,46 +27,101 @@ cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Autoincrement | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.0051544965 | 0.17251475 | 960 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.118189104 | 0.14172314 | 708 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 474 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing concave | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.005955 | 0.18813984 | 885 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.16113 | 0.15734828 | 704 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 478 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing convex | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.00613 | 0.20376484 | 889 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.157175 | 0.17297328 | 706 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 471 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Almost monotonically increasing | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14549863 | 0.17251475 | 923 | 210 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.14943957 | 0.15734814 | 703 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 462 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Random | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14533783 | 0.14126475 | 924 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.13381402 | 0.15734814 | 695 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.12501445 | 0.125 | 422 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.39826187 | 0.4068908 | 5545 | 1086 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.39214826 | 0.40734857 | 5082 | 1073 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2864 | 567 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.032736875 | 0.094390824 | 4942 | 1067 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.02667125 | 0.079223566 | 3626 | 994 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2493 | 566 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HTTP logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.047942877 | 0.20376582 | 5121 | 1065 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.06637425 | 0.18859856 | 3929 | 1093 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.26562786 | 0.265625 | 2221 | 526 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.41900787 | 0.4225158 | 5239 | 1089 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.41504425 | 0.43859857 | 4158 | 1052 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2603 | 513 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.18364687 | 0.25064084 | 5036 | 990 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.21239226 | 0.21984856 | 4087 | 1072 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2702 | 525 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Temperatures | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | | Codec Disabled | 0 | 0 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 1.0088086 | 1.001098 | 1306 | 237 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 1.000012 | 1 | 950 | 108 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
|
||||
```
|
||||
|
||||
@@ -5,11 +5,8 @@ extern crate test;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::*;
|
||||
|
||||
@@ -70,14 +67,9 @@ mod tests {
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
@@ -85,16 +77,9 @@ mod tests {
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(b, &data);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
|
||||
@@ -128,7 +128,10 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
|
||||
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
@@ -0,0 +1,272 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 128;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FORFastFieldReader {
|
||||
num_vals: u64,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
min: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
self.metadata.min + diff
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.min.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let min = u64::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self { min, num_bits })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FORFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for FORFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for FORFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = FORFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
num_vals: footer.num_vals,
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct FORFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for FORFastFieldSerializer {
|
||||
const NAME: &'static str = "FOR";
|
||||
const ID: u8 = 5;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let mut min = block_values[0];
|
||||
let mut max = block_values[0];
|
||||
for ¤t_value in block_values[1..].iter() {
|
||||
min = min.min(current_value);
|
||||
max = max.max(current_value);
|
||||
}
|
||||
let num_bits = compute_num_bits(max - min);
|
||||
for current_value in block_values.iter() {
|
||||
bit_packer.write(current_value - min, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata { min, num_bits });
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = FORFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
stats.num_vals > BLOCK_SIZE
|
||||
}
|
||||
|
||||
/// Estimate compression ratio by compute the ratio of the first block.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let max_distance = (0..last_elem_in_first_chunk)
|
||||
.into_iter()
|
||||
.map(|pos| {
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
actual_value - stats.min_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and multiply by a magic number 3 to select this codec
|
||||
// when we are almost sure that this is relevant.
|
||||
let relative_max_value = max_distance as f32 * 3.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 9 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<FORFastFieldSerializer, FORFastFieldReader>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
println!("{}", actual_compression);
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(actual_compression > 0.006);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -6,15 +6,20 @@ use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
pub mod bitpacked;
|
||||
#[cfg(feature = "unstable")]
|
||||
pub mod frame_of_reference;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
pub mod piecewise_linear;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
/// Reads the metadata and returns the CodecReader.
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
||||
|
||||
/// Read u64 value for indice `idx`.
|
||||
/// `idx` can be either a `DocId` or an index used for
|
||||
/// `multivalued` fast field.
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
|
||||
fn min_value(&self) -> u64;
|
||||
fn max_value(&self) -> u64;
|
||||
}
|
||||
@@ -35,7 +40,10 @@ pub trait FastFieldCodecSerializer {
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
@@ -85,9 +93,8 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
|
||||
use crate::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use crate::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
@@ -97,7 +104,7 @@ mod tests {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let estimation = S::estimate_compression_ratio(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -157,13 +164,10 @@ mod tests {
|
||||
fn test_codec_bitpacking() {
|
||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_codec_interpolation() {
|
||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_multi_interpolation() {
|
||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
||||
fn test_codec_piecewise_linear() {
|
||||
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
|
||||
}
|
||||
|
||||
use super::*;
|
||||
@@ -181,45 +185,50 @@ mod tests {
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
fn estimation_test_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
data.push(1_000_000);
|
||||
|
||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
println!("{}", bitpacked_estimation);
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,9 +71,9 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, idx, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(idx, data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
#[deprecated(
|
||||
note = "Linear interpolation works best only on very rare cases and piecewise linear codec \
|
||||
already works great on them."
|
||||
)]
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[inline]
|
||||
@@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
@@ -182,10 +187,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
if stats.num_vals < 3 {
|
||||
return f32::MAX;
|
||||
}
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
@@ -229,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -289,8 +301,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
|
||||
@@ -1,31 +1,52 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common::f64_to_u64;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
|
||||
#[cfg(feature = "unstable")]
|
||||
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
use rand::prelude::StdRng;
|
||||
use rand::Rng;
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
table.add_row(row![
|
||||
"",
|
||||
"Compression ratio",
|
||||
"Compression ratio estimation",
|
||||
"Compression time (micro)",
|
||||
"Reading time (micro)"
|
||||
]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
let res = serialize_with_codec::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
#[cfg(feature = "unstable")]
|
||||
{
|
||||
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
|
||||
results.push(res);
|
||||
}
|
||||
let res = serialize_with_codec::<
|
||||
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
|
||||
BitpackedFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
|
||||
// let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
@@ -33,7 +54,7 @@ fn main() {
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
@@ -49,6 +70,8 @@ fn main() {
|
||||
Cell::new(name).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
Cell::new(&compression_duration.as_micros().to_string()),
|
||||
Cell::new(&read_duration.as_micros().to_string()),
|
||||
]));
|
||||
}
|
||||
}
|
||||
@@ -70,7 +93,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
@@ -83,22 +105,79 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1);
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.map(|num| num + rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|_| rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Random"));
|
||||
|
||||
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
|
||||
data_and_names.push((data.clone(), "HDFS logs timestamps"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "HDFS logs timestamps SORTED"));
|
||||
|
||||
let data = load_dataset("datasets/http_logs_timestamps.txt");
|
||||
data_and_names.push((data, "HTTP logs timestamps SORTED"));
|
||||
|
||||
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
|
||||
data_and_names.push((data.clone(), "Amazon review product ids"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "Amazon review product ids SORTED"));
|
||||
|
||||
let data = load_float_dataset("datasets/nooc_temperatures.txt");
|
||||
data_and_names.push((data, "Temperatures"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
pub fn load_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let l = line.unwrap();
|
||||
data.push(l.parse::<u64>().unwrap());
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load float dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let line_string = line.unwrap();
|
||||
let value = line_string.parse::<f64>().unwrap();
|
||||
data.push(f64_to_u64(value));
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
) -> (bool, f32, f32, &'static str, Duration, Duration) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
return (
|
||||
false,
|
||||
0.0,
|
||||
0.0,
|
||||
S::NAME,
|
||||
Duration::from_secs(0),
|
||||
Duration::from_secs(0),
|
||||
);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let start_time_compression = Instant::now();
|
||||
let estimation = S::estimate_compression_ratio(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -108,9 +187,22 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let elasped_time_compression = start_time_compression.elapsed();
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
let start_time_read = Instant::now();
|
||||
for doc in 0..data.len() {
|
||||
reader.get_u64(doc as u64, &out);
|
||||
}
|
||||
let elapsed_time_read = start_time_read.elapsed();
|
||||
(
|
||||
true,
|
||||
estimation,
|
||||
actual_compression,
|
||||
S::NAME,
|
||||
elasped_time_compression,
|
||||
elapsed_time_read,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -155,14 +155,17 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
|
||||
let block_idx = idx - interpolation.start_pos;
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
block_idx,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
.get(block_idx, &data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
@@ -187,8 +190,13 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
#[deprecated(
|
||||
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is \
|
||||
a little bit more optimized."
|
||||
)]
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
@@ -311,10 +319,13 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
@@ -366,6 +377,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(deprecated)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
@@ -0,0 +1,365 @@
|
||||
//! PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to predict
|
||||
//! values and fast field values. The difference with real fast field values is then stored.
|
||||
//! For every block, the linear function can be expressed as
|
||||
//! `computed_value = slope * block_position + first_value + positive_offset`
|
||||
//! where:
|
||||
//! - `block_position` is the position inside of the block from 0 to 511
|
||||
//! - `first_value` is the first value on the block
|
||||
//! - `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is
|
||||
//! always positive.
|
||||
//!
|
||||
//! 21 bytes is needed to store the block metadata, it adds an overhead of 21 * 8 / 512 = 0,33 bits
|
||||
//! per element.
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 512;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PiecewiseLinearFastFieldReader {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
/// Block that stores metadata to predict value with a linear
|
||||
/// function `predicted_value = slope * position + first_value + positive_offset`
|
||||
/// where `positive_offset` is comupted such that predicted values
|
||||
/// are always positive.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
first_value: u64,
|
||||
positive_offset: u64,
|
||||
slope: f32,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
let predicted_value =
|
||||
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
|
||||
(predicted_value + diff) - self.metadata.positive_offset
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.first_value.serialize(write)?;
|
||||
self.positive_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let first_value = u64::deserialize(reader)?;
|
||||
let positive_offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PiecewiseLinearFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for PiecewiseLinearFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = PiecewiseLinearFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas.into_iter() {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
|
||||
pub struct PiecewiseLinearFastFieldSerializer;
|
||||
|
||||
impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
|
||||
const NAME: &'static str = "PiecewiseLinear";
|
||||
const ID: u8 = 4;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let slope = if block_num_vals == 1 {
|
||||
0f32
|
||||
} else {
|
||||
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64)
|
||||
/ (block_num_vals - 1) as f64) as f32
|
||||
};
|
||||
let first_value = block_values[0];
|
||||
let mut positive_offset = 0;
|
||||
let mut max_delta = 0;
|
||||
for (pos, ¤t_value) in block_values[1..].iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
|
||||
if computed_value > current_value {
|
||||
positive_offset = positive_offset.max(computed_value - current_value);
|
||||
} else {
|
||||
max_delta = max_delta.max(current_value - computed_value);
|
||||
}
|
||||
}
|
||||
let num_bits = compute_num_bits(max_delta + positive_offset);
|
||||
for (pos, current_value) in block_values.iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64, slope);
|
||||
let diff = (current_value + positive_offset) - computed_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
});
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = PiecewiseLinearFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 10 * BLOCK_SIZE {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theroretical offset and add this to the max value.
|
||||
// If this doesn't overflow the algortihm should be fine
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64)
|
||||
/ (stats.num_vals - 1) as f64) as f32;
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|&pos| {
|
||||
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 21 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
y - x
|
||||
} else {
|
||||
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -392,8 +392,7 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
||||
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
|
||||
@@ -6,12 +6,17 @@ use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
||||
|
||||
use super::FastValue;
|
||||
@@ -71,6 +76,8 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||
/// Piecewise linear interpolated values + bitpacked
|
||||
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
|
||||
}
|
||||
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
@@ -86,12 +93,14 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
BitpackedReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
@@ -100,6 +109,12 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
PiecewiseLinearFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::PiecewiseLinear(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||
@@ -118,6 +133,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
Self::LinearInterpol(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
Self::PiecewiseLinear(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
@@ -126,6 +142,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
@@ -133,6 +150,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::LinearInterpol(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
@@ -140,6 +158,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::LinearInterpol(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,9 +195,12 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
|
||||
/// Get u64 for indice `idx`.
|
||||
/// `idx` can be either a `DocId` or an index used for
|
||||
/// `multivalued` fast field. See [`get_range`] for more details.
|
||||
pub(crate) fn get_u64(&self, idx: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(idx, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
|
||||
@@ -4,9 +4,9 @@ use common::{BinarySerializable, CountingWriter};
|
||||
pub use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
|
||||
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
@@ -35,18 +35,31 @@ pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
#[derive(Debug)]
|
||||
pub struct CodecEstimationResult<'a> {
|
||||
pub ratio: f32,
|
||||
pub name: &'a str,
|
||||
pub id: u8,
|
||||
}
|
||||
|
||||
// TODO: use this when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
) -> CodecEstimationResult {
|
||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
return CodecEstimationResult {
|
||||
ratio: f32::MAX,
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
};
|
||||
}
|
||||
CodecEstimationResult {
|
||||
ratio: T::estimate_compression_ratio(fastfield_accessor, stats),
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
}
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
@@ -59,7 +72,7 @@ impl CompositeFastFieldSerializer {
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
pub fn new_u64_fast_field_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
@@ -67,7 +80,7 @@ impl CompositeFastFieldSerializer {
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
self.create_auto_detect_u64_fast_field_with_idx(
|
||||
self.new_u64_fast_field_with_idx_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -78,7 +91,7 @@ impl CompositeFastFieldSerializer {
|
||||
}
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
||||
pub fn new_u64_fast_field_with_idx_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
@@ -88,42 +101,29 @@ impl CompositeFastFieldSerializer {
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
|
||||
let mut estimations = vec![];
|
||||
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables
|
||||
// codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
let estimations = vec![
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(stats.clone(), &fastfield_accessor),
|
||||
codec_estimation::<PiecewiseLinearFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
),
|
||||
];
|
||||
let best_codec_result = estimations
|
||||
.iter()
|
||||
.sorted_by(|result_a, result_b| {
|
||||
result_a
|
||||
.ratio
|
||||
.partial_cmp(&result_b.ratio)
|
||||
.expect("Ratio cannot be nan.")
|
||||
})
|
||||
.next()
|
||||
.expect("A codec must be present.");
|
||||
debug!(
|
||||
"choosing fast field codec {} for field_id {:?}",
|
||||
name, field
|
||||
); // todo print actual field name
|
||||
id.serialize(field_write)?;
|
||||
match name {
|
||||
"Choosing fast field codec {} for field_id {:?} among {:?}",
|
||||
best_codec_result.name, field, estimations,
|
||||
);
|
||||
best_codec_result.id.serialize(field_write)?;
|
||||
match best_codec_result.name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
@@ -133,17 +133,8 @@ impl CompositeFastFieldSerializer {
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
PiecewiseLinearFastFieldSerializer::NAME => {
|
||||
PiecewiseLinearFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
@@ -152,7 +143,7 @@ impl CompositeFastFieldSerializer {
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
panic!("unknown fastfield serializer {}", best_codec_result.name)
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
@@ -216,3 +207,45 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::FastFieldStats;
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::CompositeFastFieldSerializer;
|
||||
use crate::directory::{RamDirectory, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::Directory;
|
||||
|
||||
#[test]
|
||||
fn new_u64_fast_field_with_best_codec() -> crate::Result<()> {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let path = Path::new("test");
|
||||
let write: WritePtr = directory.open_write(path)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let vals = (0..10000u64).into_iter().collect_vec();
|
||||
let stats = FastFieldStats {
|
||||
min_value: 0,
|
||||
max_value: 9999,
|
||||
num_vals: vals.len() as u64,
|
||||
};
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
Field::from_field_id(0),
|
||||
stats,
|
||||
vals.clone(),
|
||||
vals.clone().into_iter(),
|
||||
vals.into_iter(),
|
||||
)?;
|
||||
serializer.close()?;
|
||||
// get the codecs id
|
||||
let mut bytes = directory.open_read(path)?.read_bytes()?;
|
||||
let codec_id = u8::deserialize(&mut bytes)?;
|
||||
// Codec id = 4 is piecewise linear.
|
||||
assert_eq!(codec_id, 4);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -298,7 +298,7 @@ impl IntFastFieldWriter {
|
||||
let iter = doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize));
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -306,7 +306,7 @@ impl IntFastFieldWriter {
|
||||
iter,
|
||||
)?;
|
||||
} else {
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
|
||||
@@ -384,7 +384,7 @@ impl IndexMerger {
|
||||
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
|
||||
fast_field_reader.get(*doc_id)
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -551,7 +551,7 @@ impl IndexMerger {
|
||||
}
|
||||
offsets.push(offset);
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
&offsets[..],
|
||||
@@ -771,7 +771,7 @@ impl IndexMerger {
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
||||
fast_field_serializer.new_u64_fast_field_with_idx_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
|
||||
Reference in New Issue
Block a user