mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-04 08:12:54 +00:00
Compare commits
15 Commits
missing_te
...
fastfield-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14d53851a8 | ||
|
|
2d176e66b6 | ||
|
|
838a332db0 | ||
|
|
defbd9139b | ||
|
|
0c87732459 | ||
|
|
4d66a3f0a0 | ||
|
|
977f01a8a3 | ||
|
|
c14bdd26d4 | ||
|
|
3272f80171 | ||
|
|
23d5ab5656 | ||
|
|
245ed5fed1 | ||
|
|
33bed01168 | ||
|
|
17a5f4f0ff | ||
|
|
c969582308 | ||
|
|
18d2ee5bb7 |
1
fastfield_codecs/.gitignore
vendored
Normal file
1
fastfield_codecs/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
|||||||
|
datasets/
|
||||||
@@ -6,8 +6,6 @@ license = "MIT"
|
|||||||
edition = "2018"
|
edition = "2018"
|
||||||
description = "Fast field codecs used by tantivy"
|
description = "Fast field codecs used by tantivy"
|
||||||
|
|
||||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
|
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
|
||||||
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
|
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
|
||||||
@@ -19,6 +17,6 @@ more-asserts = "0.2.1"
|
|||||||
rand = "0.8.3"
|
rand = "0.8.3"
|
||||||
|
|
||||||
[features]
|
[features]
|
||||||
|
unstable = [] # useful for benches and experimental codecs.
|
||||||
bin = ["prettytable-rs", "rand"]
|
bin = ["prettytable-rs", "rand"]
|
||||||
default = ["bin"]
|
default = ["bin"]
|
||||||
|
|
||||||
|
|||||||
6
fastfield_codecs/Makefile
Normal file
6
fastfield_codecs/Makefile
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids nooc_temperatures
|
||||||
|
download:
|
||||||
|
@echo "--- Downloading datasets ---"
|
||||||
|
mkdir -p datasets
|
||||||
|
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done
|
||||||
|
|
||||||
@@ -13,6 +13,10 @@ A codec needs to implement 2 traits:
|
|||||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||||
|
|
||||||
|
### Download real world datasets for codecs comparison
|
||||||
|
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
|
||||||
|
To run with the unstable codecs, execute `cargo run --features unstable`.
|
||||||
|
|
||||||
### Tests
|
### Tests
|
||||||
|
|
||||||
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||||
@@ -23,46 +27,101 @@ cargo run --features bin
|
|||||||
```
|
```
|
||||||
|
|
||||||
### TODO
|
### TODO
|
||||||
- Add real world data sets in comparison
|
|
||||||
- Add codec to cover sparse data sets
|
- Add codec to cover sparse data sets
|
||||||
|
|
||||||
|
|
||||||
### Codec Comparison
|
### Codec Comparison
|
||||||
```
|
```
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| | Compression Ratio | Compression Estimation |
|
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Autoincrement | | |
|
| Autoincrement | | | | |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
| PiecewiseLinear | 0.0051544965 | 0.17251475 | 960 | 211 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
| FOR | 0.118189104 | 0.14172314 | 708 | 212 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Bitpacked | 0.28126493 | 0.28125 |
|
| Bitpacked | 0.28126493 | 0.28125 | 474 | 112 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Monotonically increasing concave | | |
|
| Monotonically increasing concave | | | | |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
| PiecewiseLinear | 0.005955 | 0.18813984 | 885 | 211 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
| FOR | 0.16113 | 0.15734828 | 704 | 212 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Bitpacked | 0.31251436 | 0.3125 |
|
| Bitpacked | 0.31251436 | 0.3125 | 478 | 113 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Monotonically increasing convex | | |
|
| Monotonically increasing convex | | | | |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
| PiecewiseLinear | 0.00613 | 0.20376484 | 889 | 211 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
| FOR | 0.157175 | 0.17297328 | 706 | 212 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Bitpacked | 0.31251436 | 0.3125 |
|
| Bitpacked | 0.31251436 | 0.3125 | 471 | 113 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Almost monotonically increasing | | |
|
| Almost monotonically increasing | | | | |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
| PiecewiseLinear | 0.14549863 | 0.17251475 | 923 | 210 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
| FOR | 0.14943957 | 0.15734814 | 703 | 211 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
| Bitpacked | 0.28126493 | 0.28125 |
|
| Bitpacked | 0.28126493 | 0.28125 | 462 | 112 |
|
||||||
+----------------------------------+-------------------+------------------------+
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Random | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.14533783 | 0.14126475 | 924 | 211 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.13381402 | 0.15734814 | 695 | 211 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.12501445 | 0.125 | 422 | 112 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| HDFS logs timestamps | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.39826187 | 0.4068908 | 5545 | 1086 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.39214826 | 0.40734857 | 5082 | 1073 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.39062786 | 0.390625 | 2864 | 567 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| HDFS logs timestamps SORTED | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.032736875 | 0.094390824 | 4942 | 1067 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.02667125 | 0.079223566 | 3626 | 994 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.39062786 | 0.390625 | 2493 | 566 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| HTTP logs timestamps SORTED | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.047942877 | 0.20376582 | 5121 | 1065 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.06637425 | 0.18859856 | 3929 | 1093 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.26562786 | 0.265625 | 2221 | 526 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Amazon review product ids | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.41900787 | 0.4225158 | 5239 | 1089 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.41504425 | 0.43859857 | 4158 | 1052 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.40625286 | 0.40625 | 2603 | 513 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Amazon review product ids SORTED | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | 0.18364687 | 0.25064084 | 5036 | 990 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 0.21239226 | 0.21984856 | 4087 | 1072 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 0.40625286 | 0.40625 | 2702 | 525 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Temperatures | | | | |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| PiecewiseLinear | | Codec Disabled | 0 | 0 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| FOR | 1.0088086 | 1.001098 | 1306 | 237 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
| Bitpacked | 1.000012 | 1 | 950 | 108 |
|
||||||
|
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||||
|
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -5,11 +5,8 @@ extern crate test;
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||||
use fastfield_codecs::linearinterpol::{
|
use fastfield_codecs::piecewise_linear::{
|
||||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||||
};
|
|
||||||
use fastfield_codecs::multilinearinterpol::{
|
|
||||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
|
||||||
};
|
};
|
||||||
use fastfield_codecs::*;
|
use fastfield_codecs::*;
|
||||||
|
|
||||||
@@ -70,14 +67,9 @@ mod tests {
|
|||||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||||
}
|
}
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
|
||||||
let data: Vec<_> = get_data();
|
let data: Vec<_> = get_data();
|
||||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
|
||||||
}
|
|
||||||
#[bench]
|
|
||||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
|
||||||
let data: Vec<_> = get_data();
|
|
||||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
|
||||||
}
|
}
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||||
@@ -85,16 +77,9 @@ mod tests {
|
|||||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||||
}
|
}
|
||||||
#[bench]
|
#[bench]
|
||||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
|
||||||
let data: Vec<_> = get_data();
|
let data: Vec<_> = get_data();
|
||||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(b, &data);
|
||||||
}
|
|
||||||
#[bench]
|
|
||||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
|
||||||
let data: Vec<_> = get_data();
|
|
||||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
|
||||||
b, &data,
|
|
||||||
);
|
|
||||||
}
|
}
|
||||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||||
|
|||||||
@@ -128,7 +128,10 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
|||||||
) -> bool {
|
) -> bool {
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
fn estimate_compression_ratio(
|
||||||
|
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32 {
|
||||||
let amplitude = stats.max_value - stats.min_value;
|
let amplitude = stats.max_value - stats.min_value;
|
||||||
let num_bits = compute_num_bits(amplitude);
|
let num_bits = compute_num_bits(amplitude);
|
||||||
let num_bits_uncompressed = 64;
|
let num_bits_uncompressed = 64;
|
||||||
|
|||||||
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
@@ -0,0 +1,272 @@
|
|||||||
|
use std::io::{self, Read, Write};
|
||||||
|
|
||||||
|
use common::{BinarySerializable, DeserializeFrom};
|
||||||
|
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||||
|
|
||||||
|
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||||
|
|
||||||
|
const BLOCK_SIZE: u64 = 128;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct FORFastFieldReader {
|
||||||
|
num_vals: u64,
|
||||||
|
min_value: u64,
|
||||||
|
max_value: u64,
|
||||||
|
block_readers: Vec<BlockReader>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
struct BlockMetadata {
|
||||||
|
min: u64,
|
||||||
|
num_bits: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
struct BlockReader {
|
||||||
|
metadata: BlockMetadata,
|
||||||
|
start_offset: u64,
|
||||||
|
bit_unpacker: BitUnpacker,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BlockReader {
|
||||||
|
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||||
|
metadata,
|
||||||
|
start_offset,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||||
|
let diff = self
|
||||||
|
.bit_unpacker
|
||||||
|
.get(block_pos, &data[self.start_offset as usize..]);
|
||||||
|
self.metadata.min + diff
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BinarySerializable for BlockMetadata {
|
||||||
|
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||||
|
self.min.serialize(write)?;
|
||||||
|
self.num_bits.serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
|
let min = u64::deserialize(reader)?;
|
||||||
|
let num_bits = u8::deserialize(reader)?;
|
||||||
|
Ok(Self { min, num_bits })
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct FORFooter {
|
||||||
|
pub num_vals: u64,
|
||||||
|
pub min_value: u64,
|
||||||
|
pub max_value: u64,
|
||||||
|
block_metadatas: Vec<BlockMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BinarySerializable for FORFooter {
|
||||||
|
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||||
|
let mut out = vec![];
|
||||||
|
self.num_vals.serialize(&mut out)?;
|
||||||
|
self.min_value.serialize(&mut out)?;
|
||||||
|
self.max_value.serialize(&mut out)?;
|
||||||
|
self.block_metadatas.serialize(&mut out)?;
|
||||||
|
write.write_all(&out)?;
|
||||||
|
(out.len() as u32).serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
|
let footer = Self {
|
||||||
|
num_vals: u64::deserialize(reader)?,
|
||||||
|
min_value: u64::deserialize(reader)?,
|
||||||
|
max_value: u64::deserialize(reader)?,
|
||||||
|
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||||
|
};
|
||||||
|
Ok(footer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FastFieldCodecReader for FORFastFieldReader {
|
||||||
|
/// Opens a fast field given a file.
|
||||||
|
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||||
|
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||||
|
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||||
|
let footer = FORFooter::deserialize(&mut footer)?;
|
||||||
|
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||||
|
let mut current_data_offset = 0;
|
||||||
|
for block_metadata in footer.block_metadatas {
|
||||||
|
let num_bits = block_metadata.num_bits;
|
||||||
|
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||||
|
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||||
|
}
|
||||||
|
Ok(Self {
|
||||||
|
num_vals: footer.num_vals,
|
||||||
|
min_value: footer.min_value,
|
||||||
|
max_value: footer.max_value,
|
||||||
|
block_readers,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||||
|
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||||
|
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||||
|
let block_reader = &self.block_readers[block_idx];
|
||||||
|
block_reader.get_u64(block_pos, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn min_value(&self) -> u64 {
|
||||||
|
self.min_value
|
||||||
|
}
|
||||||
|
#[inline]
|
||||||
|
fn max_value(&self) -> u64 {
|
||||||
|
self.max_value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||||
|
pub struct FORFastFieldSerializer {}
|
||||||
|
|
||||||
|
impl FastFieldCodecSerializer for FORFastFieldSerializer {
|
||||||
|
const NAME: &'static str = "FOR";
|
||||||
|
const ID: u8 = 5;
|
||||||
|
/// Creates a new fast field serializer.
|
||||||
|
fn serialize(
|
||||||
|
write: &mut impl Write,
|
||||||
|
_: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
data_iter: impl Iterator<Item = u64>,
|
||||||
|
_data_iter1: impl Iterator<Item = u64>,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
let data = data_iter.collect::<Vec<_>>();
|
||||||
|
let mut bit_packer = BitPacker::new();
|
||||||
|
let mut block_metadatas = Vec::new();
|
||||||
|
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||||
|
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||||
|
let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||||
|
let mut min = block_values[0];
|
||||||
|
let mut max = block_values[0];
|
||||||
|
for ¤t_value in block_values[1..].iter() {
|
||||||
|
min = min.min(current_value);
|
||||||
|
max = max.max(current_value);
|
||||||
|
}
|
||||||
|
let num_bits = compute_num_bits(max - min);
|
||||||
|
for current_value in block_values.iter() {
|
||||||
|
bit_packer.write(current_value - min, num_bits, write)?;
|
||||||
|
}
|
||||||
|
bit_packer.flush(write)?;
|
||||||
|
block_metadatas.push(BlockMetadata { min, num_bits });
|
||||||
|
}
|
||||||
|
bit_packer.close(write)?;
|
||||||
|
|
||||||
|
let footer = FORFooter {
|
||||||
|
num_vals: stats.num_vals,
|
||||||
|
min_value: stats.min_value,
|
||||||
|
max_value: stats.max_value,
|
||||||
|
block_metadatas,
|
||||||
|
};
|
||||||
|
footer.serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_applicable(
|
||||||
|
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> bool {
|
||||||
|
stats.num_vals > BLOCK_SIZE
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimate compression ratio by compute the ratio of the first block.
|
||||||
|
fn estimate_compression_ratio(
|
||||||
|
fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32 {
|
||||||
|
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||||
|
let max_distance = (0..last_elem_in_first_chunk)
|
||||||
|
.into_iter()
|
||||||
|
.map(|pos| {
|
||||||
|
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||||
|
actual_value - stats.min_value
|
||||||
|
})
|
||||||
|
.max()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Estimate one block and multiply by a magic number 3 to select this codec
|
||||||
|
// when we are almost sure that this is relevant.
|
||||||
|
let relative_max_value = max_distance as f32 * 3.0;
|
||||||
|
|
||||||
|
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||||
|
// function metadata per block
|
||||||
|
+ 9 * (stats.num_vals / BLOCK_SIZE);
|
||||||
|
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||||
|
num_bits as f32 / num_bits_uncompressed as f32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::tests::get_codec_test_data_sets;
|
||||||
|
|
||||||
|
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||||
|
crate::tests::create_and_validate::<FORFastFieldSerializer, FORFastFieldReader>(data, name)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_compression() {
|
||||||
|
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||||
|
let (estimate, actual_compression) =
|
||||||
|
create_and_validate(&data, "simple monotonically large");
|
||||||
|
println!("{}", actual_compression);
|
||||||
|
assert!(actual_compression < 0.2);
|
||||||
|
assert!(actual_compression > 0.006);
|
||||||
|
assert!(estimate < 0.20);
|
||||||
|
assert!(estimate > 0.10);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_with_codec_data_sets() {
|
||||||
|
let data_sets = get_codec_test_data_sets();
|
||||||
|
for (mut data, name) in data_sets {
|
||||||
|
create_and_validate(&data, name);
|
||||||
|
data.reverse();
|
||||||
|
create_and_validate(&data, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn test_simple() {
|
||||||
|
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "simple monotonically");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn border_cases_1() {
|
||||||
|
let data = (0..1024).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "border case");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn border_case_2() {
|
||||||
|
let data = (0..1025).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "border case");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn rand() {
|
||||||
|
for _ in 0..10 {
|
||||||
|
let mut data = (5_000..20_000)
|
||||||
|
.map(|_| rand::random::<u32>() as u64)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||||
|
dbg!(estimate);
|
||||||
|
dbg!(actual_compression);
|
||||||
|
|
||||||
|
data.reverse();
|
||||||
|
create_and_validate(&data, "random");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -6,15 +6,20 @@ use std::io;
|
|||||||
use std::io::Write;
|
use std::io::Write;
|
||||||
|
|
||||||
pub mod bitpacked;
|
pub mod bitpacked;
|
||||||
|
#[cfg(feature = "unstable")]
|
||||||
|
pub mod frame_of_reference;
|
||||||
pub mod linearinterpol;
|
pub mod linearinterpol;
|
||||||
pub mod multilinearinterpol;
|
pub mod multilinearinterpol;
|
||||||
|
pub mod piecewise_linear;
|
||||||
|
|
||||||
pub trait FastFieldCodecReader: Sized {
|
pub trait FastFieldCodecReader: Sized {
|
||||||
/// reads the metadata and returns the CodecReader
|
/// Reads the metadata and returns the CodecReader.
|
||||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||||
|
|
||||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
/// Read u64 value for indice `idx`.
|
||||||
|
/// `idx` can be either a `DocId` or an index used for
|
||||||
|
/// `multivalued` fast field.
|
||||||
|
fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
|
||||||
fn min_value(&self) -> u64;
|
fn min_value(&self) -> u64;
|
||||||
fn max_value(&self) -> u64;
|
fn max_value(&self) -> u64;
|
||||||
}
|
}
|
||||||
@@ -35,7 +40,10 @@ pub trait FastFieldCodecSerializer {
|
|||||||
///
|
///
|
||||||
/// It could make sense to also return a value representing
|
/// It could make sense to also return a value representing
|
||||||
/// computational complexity.
|
/// computational complexity.
|
||||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
fn estimate_compression_ratio(
|
||||||
|
fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32;
|
||||||
|
|
||||||
/// Serializes the data using the serializer into write.
|
/// Serializes the data using the serializer into write.
|
||||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||||
@@ -85,9 +93,8 @@ impl FastFieldDataAccess for Vec<u64> {
|
|||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||||
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
|
use crate::piecewise_linear::{
|
||||||
use crate::multilinearinterpol::{
|
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||||
@@ -97,7 +104,7 @@ mod tests {
|
|||||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||||
return (f32::MAX, 0.0);
|
return (f32::MAX, 0.0);
|
||||||
}
|
}
|
||||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
let estimation = S::estimate_compression_ratio(&data, crate::tests::stats_from_vec(data));
|
||||||
let mut out = vec![];
|
let mut out = vec![];
|
||||||
S::serialize(
|
S::serialize(
|
||||||
&mut out,
|
&mut out,
|
||||||
@@ -157,13 +164,10 @@ mod tests {
|
|||||||
fn test_codec_bitpacking() {
|
fn test_codec_bitpacking() {
|
||||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_codec_interpolation() {
|
fn test_codec_piecewise_linear() {
|
||||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
|
||||||
}
|
|
||||||
#[test]
|
|
||||||
fn test_codec_multi_interpolation() {
|
|
||||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -181,45 +185,50 @@ mod tests {
|
|||||||
fn estimation_good_interpolation_case() {
|
fn estimation_good_interpolation_case() {
|
||||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||||
|
|
||||||
let linear_interpol_estimation =
|
let piecewise_interpol_estimation =
|
||||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||||
assert_le!(linear_interpol_estimation, 0.01);
|
&data,
|
||||||
|
stats_from_vec(&data),
|
||||||
let multi_linear_interpol_estimation =
|
);
|
||||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
|
||||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
|
||||||
|
|
||||||
let bitpacked_estimation =
|
let bitpacked_estimation =
|
||||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn estimation_test_bad_interpolation_case() {
|
fn estimation_test_bad_interpolation_case() {
|
||||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||||
|
|
||||||
let linear_interpol_estimation =
|
let piecewise_interpol_estimation =
|
||||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||||
assert_le!(linear_interpol_estimation, 0.32);
|
&data,
|
||||||
|
stats_from_vec(&data),
|
||||||
|
);
|
||||||
|
assert_le!(piecewise_interpol_estimation, 0.32);
|
||||||
|
|
||||||
let bitpacked_estimation =
|
let bitpacked_estimation =
|
||||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
|
||||||
}
|
}
|
||||||
#[test]
|
#[test]
|
||||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
fn estimation_test_interpolation_case_monotonically_increasing() {
|
||||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||||
data.push(1_000_000);
|
data.push(1_000_000);
|
||||||
|
|
||||||
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
// in this case the linear interpolation can't in fact not be worse than bitpacking,
|
||||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||||
let linear_interpol_estimation =
|
let piecewise_interpol_estimation =
|
||||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||||
assert_le!(linear_interpol_estimation, 0.35);
|
&data,
|
||||||
|
stats_from_vec(&data),
|
||||||
|
);
|
||||||
|
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||||
|
|
||||||
let bitpacked_estimation =
|
let bitpacked_estimation =
|
||||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||||
|
println!("{}", bitpacked_estimation);
|
||||||
assert_le!(bitpacked_estimation, 0.32);
|
assert_le!(bitpacked_estimation, 0.32);
|
||||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -71,9 +71,9 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
#[inline]
|
#[inline]
|
||||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
let calculated_value = get_calculated_value(self.footer.first_val, idx, self.slope);
|
||||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
(calculated_value + self.bit_unpacker.get(idx, data)) - self.footer.offset
|
||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
|||||||
|
|
||||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||||
/// and stores the difference bitpacked.
|
/// and stores the difference bitpacked.
|
||||||
|
#[deprecated(
|
||||||
|
note = "Linear interpolation works best only on very rare cases and piecewise linear codec \
|
||||||
|
already works great on them."
|
||||||
|
)]
|
||||||
pub struct LinearInterpolFastFieldSerializer {}
|
pub struct LinearInterpolFastFieldSerializer {}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
|||||||
first_val + (pos as f32 * slope) as u64
|
first_val + (pos as f32 * slope) as u64
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(deprecated)]
|
||||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||||
const NAME: &'static str = "LinearInterpol";
|
const NAME: &'static str = "LinearInterpol";
|
||||||
const ID: u8 = 2;
|
const ID: u8 = 2;
|
||||||
@@ -182,10 +187,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
|||||||
}
|
}
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
/// estimation for linear interpolation is hard because, you don't know
|
/// Estimation for linear interpolation is hard because, you don't know
|
||||||
/// where the local maxima for the deviation of the calculated value are and
|
/// where the local maxima for the deviation of the calculated value are and
|
||||||
/// the offset to shift all values to >=0 is also unknown.
|
/// the offset to shift all values to >=0 is also unknown.
|
||||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
fn estimate_compression_ratio(
|
||||||
|
fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32 {
|
||||||
|
if stats.num_vals < 3 {
|
||||||
|
return f32::MAX;
|
||||||
|
}
|
||||||
let first_val = fastfield_accessor.get_val(0);
|
let first_val = fastfield_accessor.get_val(0);
|
||||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||||
@@ -229,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[allow(deprecated)]
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
@@ -289,8 +301,10 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn linear_interpol_fast_field_rand() {
|
fn linear_interpol_fast_field_rand() {
|
||||||
for _ in 0..5000 {
|
for _ in 0..10 {
|
||||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
let mut data = (5_000..20_000)
|
||||||
|
.map(|_| rand::random::<u32>() as u64)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
create_and_validate(&data, "random");
|
create_and_validate(&data, "random");
|
||||||
|
|
||||||
data.reverse();
|
data.reverse();
|
||||||
|
|||||||
@@ -1,31 +1,52 @@
|
|||||||
#[macro_use]
|
#[macro_use]
|
||||||
extern crate prettytable;
|
extern crate prettytable;
|
||||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
use std::fs::File;
|
||||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
use std::io;
|
||||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
use std::io::BufRead;
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
|
|
||||||
|
use common::f64_to_u64;
|
||||||
|
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
|
||||||
|
#[cfg(feature = "unstable")]
|
||||||
|
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
|
||||||
|
use fastfield_codecs::piecewise_linear::{
|
||||||
|
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||||
|
};
|
||||||
|
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};
|
||||||
use prettytable::{Cell, Row, Table};
|
use prettytable::{Cell, Row, Table};
|
||||||
|
use rand::prelude::StdRng;
|
||||||
|
use rand::Rng;
|
||||||
|
|
||||||
fn main() {
|
fn main() {
|
||||||
let mut table = Table::new();
|
let mut table = Table::new();
|
||||||
|
|
||||||
// Add a row per time
|
// Add a row per time
|
||||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
table.add_row(row![
|
||||||
|
"",
|
||||||
|
"Compression ratio",
|
||||||
|
"Compression ratio estimation",
|
||||||
|
"Compression time (micro)",
|
||||||
|
"Reading time (micro)"
|
||||||
|
]);
|
||||||
|
|
||||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||||
let mut results = vec![];
|
let mut results = vec![];
|
||||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
let res = serialize_with_codec::<
|
||||||
|
PiecewiseLinearFastFieldSerializer,
|
||||||
|
PiecewiseLinearFastFieldReader,
|
||||||
|
>(&data);
|
||||||
results.push(res);
|
results.push(res);
|
||||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
#[cfg(feature = "unstable")]
|
||||||
results.push(res);
|
{
|
||||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
|
||||||
&data,
|
results.push(res);
|
||||||
);
|
}
|
||||||
|
let res = serialize_with_codec::<
|
||||||
|
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
|
||||||
|
BitpackedFastFieldReader,
|
||||||
|
>(&data);
|
||||||
results.push(res);
|
results.push(res);
|
||||||
|
|
||||||
// let best_estimation_codec = results
|
|
||||||
//.iter()
|
|
||||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
|
||||||
//.unwrap();
|
|
||||||
let best_compression_ratio_codec = results
|
let best_compression_ratio_codec = results
|
||||||
.iter()
|
.iter()
|
||||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||||
@@ -33,7 +54,7 @@ fn main() {
|
|||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||||
for (is_applicable, est, comp, name) in results {
|
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
|
||||||
let (est_cell, ratio_cell) = if !is_applicable {
|
let (est_cell, ratio_cell) = if !is_applicable {
|
||||||
("Codec Disabled".to_string(), "".to_string())
|
("Codec Disabled".to_string(), "".to_string())
|
||||||
} else {
|
} else {
|
||||||
@@ -49,6 +70,8 @@ fn main() {
|
|||||||
Cell::new(name).style_spec("bFg"),
|
Cell::new(name).style_spec("bFg"),
|
||||||
Cell::new(&ratio_cell).style_spec(style),
|
Cell::new(&ratio_cell).style_spec(style),
|
||||||
Cell::new(&est_cell).style_spec(""),
|
Cell::new(&est_cell).style_spec(""),
|
||||||
|
Cell::new(&compression_duration.as_micros().to_string()),
|
||||||
|
Cell::new(&read_duration.as_micros().to_string()),
|
||||||
]));
|
]));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -70,7 +93,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
|||||||
current_cumulative
|
current_cumulative
|
||||||
})
|
})
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
|
||||||
data_and_names.push((data, "Monotonically increasing concave"));
|
data_and_names.push((data, "Monotonically increasing concave"));
|
||||||
|
|
||||||
let mut current_cumulative = 0;
|
let mut current_cumulative = 0;
|
||||||
@@ -83,22 +105,79 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
|||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
data_and_names.push((data, "Monotonically increasing convex"));
|
data_and_names.push((data, "Monotonically increasing convex"));
|
||||||
|
|
||||||
|
let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1);
|
||||||
let data = (1000..=200_000_u64)
|
let data = (1000..=200_000_u64)
|
||||||
.map(|num| num + rand::random::<u8>() as u64)
|
.map(|num| num + rng.gen::<u8>() as u64)
|
||||||
.collect::<Vec<_>>();
|
.collect::<Vec<_>>();
|
||||||
data_and_names.push((data, "Almost monotonically increasing"));
|
data_and_names.push((data, "Almost monotonically increasing"));
|
||||||
|
|
||||||
|
let data = (1000..=200_000_u64)
|
||||||
|
.map(|_| rng.gen::<u8>() as u64)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
data_and_names.push((data, "Random"));
|
||||||
|
|
||||||
|
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
|
||||||
|
data_and_names.push((data.clone(), "HDFS logs timestamps"));
|
||||||
|
|
||||||
|
data.sort_unstable();
|
||||||
|
data_and_names.push((data, "HDFS logs timestamps SORTED"));
|
||||||
|
|
||||||
|
let data = load_dataset("datasets/http_logs_timestamps.txt");
|
||||||
|
data_and_names.push((data, "HTTP logs timestamps SORTED"));
|
||||||
|
|
||||||
|
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
|
||||||
|
data_and_names.push((data.clone(), "Amazon review product ids"));
|
||||||
|
|
||||||
|
data.sort_unstable();
|
||||||
|
data_and_names.push((data, "Amazon review product ids SORTED"));
|
||||||
|
|
||||||
|
let data = load_float_dataset("datasets/nooc_temperatures.txt");
|
||||||
|
data_and_names.push((data, "Temperatures"));
|
||||||
|
|
||||||
data_and_names
|
data_and_names
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
pub fn load_dataset(file_path: &str) -> Vec<u64> {
|
||||||
|
println!("Load dataset from `{}`", file_path);
|
||||||
|
let file = File::open(file_path).expect("Error when opening file.");
|
||||||
|
let lines = io::BufReader::new(file).lines();
|
||||||
|
let mut data = Vec::new();
|
||||||
|
for line in lines {
|
||||||
|
let l = line.unwrap();
|
||||||
|
data.push(l.parse::<u64>().unwrap());
|
||||||
|
}
|
||||||
|
data
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
|
||||||
|
println!("Load float dataset from `{}`", file_path);
|
||||||
|
let file = File::open(file_path).expect("Error when opening file.");
|
||||||
|
let lines = io::BufReader::new(file).lines();
|
||||||
|
let mut data = Vec::new();
|
||||||
|
for line in lines {
|
||||||
|
let line_string = line.unwrap();
|
||||||
|
let value = line_string.parse::<f64>().unwrap();
|
||||||
|
data.push(f64_to_u64(value));
|
||||||
|
}
|
||||||
|
data
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||||
data: &[u64],
|
data: &[u64],
|
||||||
) -> (bool, f32, f32, &'static str) {
|
) -> (bool, f32, f32, &'static str, Duration, Duration) {
|
||||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||||
if !is_applicable {
|
if !is_applicable {
|
||||||
return (false, 0.0, 0.0, S::NAME);
|
return (
|
||||||
|
false,
|
||||||
|
0.0,
|
||||||
|
0.0,
|
||||||
|
S::NAME,
|
||||||
|
Duration::from_secs(0),
|
||||||
|
Duration::from_secs(0),
|
||||||
|
);
|
||||||
}
|
}
|
||||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
let start_time_compression = Instant::now();
|
||||||
|
let estimation = S::estimate_compression_ratio(&data, stats_from_vec(data));
|
||||||
let mut out = vec![];
|
let mut out = vec![];
|
||||||
S::serialize(
|
S::serialize(
|
||||||
&mut out,
|
&mut out,
|
||||||
@@ -108,9 +187,22 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
|||||||
data.iter().cloned(),
|
data.iter().cloned(),
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
let elasped_time_compression = start_time_compression.elapsed();
|
||||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||||
(true, estimation, actual_compression, S::NAME)
|
let reader = R::open_from_bytes(&out).unwrap();
|
||||||
|
let start_time_read = Instant::now();
|
||||||
|
for doc in 0..data.len() {
|
||||||
|
reader.get_u64(doc as u64, &out);
|
||||||
|
}
|
||||||
|
let elapsed_time_read = start_time_read.elapsed();
|
||||||
|
(
|
||||||
|
true,
|
||||||
|
estimation,
|
||||||
|
actual_compression,
|
||||||
|
S::NAME,
|
||||||
|
elasped_time_compression,
|
||||||
|
elapsed_time_read,
|
||||||
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||||
|
|||||||
@@ -155,14 +155,17 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
|
||||||
let doc = doc - interpolation.start_pos;
|
let block_idx = idx - interpolation.start_pos;
|
||||||
let calculated_value =
|
let calculated_value = get_calculated_value(
|
||||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
interpolation.value_start_pos,
|
||||||
|
block_idx,
|
||||||
|
interpolation.slope,
|
||||||
|
);
|
||||||
let diff = interpolation
|
let diff = interpolation
|
||||||
.bit_unpacker
|
.bit_unpacker
|
||||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
.get(block_idx, &data[interpolation.data_start_offset as usize..]);
|
||||||
(calculated_value + diff) - interpolation.positive_val_offset
|
(calculated_value + diff) - interpolation.positive_val_offset
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -187,8 +190,13 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||||
|
#[deprecated(
|
||||||
|
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is \
|
||||||
|
a little bit more optimized."
|
||||||
|
)]
|
||||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||||
|
|
||||||
|
#[allow(deprecated)]
|
||||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||||
const NAME: &'static str = "MultiLinearInterpol";
|
const NAME: &'static str = "MultiLinearInterpol";
|
||||||
const ID: u8 = 3;
|
const ID: u8 = 3;
|
||||||
@@ -311,10 +319,13 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
|||||||
}
|
}
|
||||||
true
|
true
|
||||||
}
|
}
|
||||||
/// estimation for linear interpolation is hard because, you don't know
|
/// Estimation for linear interpolation is hard because, you don't know
|
||||||
/// where the local maxima are for the deviation of the calculated value and
|
/// where the local maxima are for the deviation of the calculated value and
|
||||||
/// the offset is also unknown.
|
/// the offset is also unknown.
|
||||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
fn estimate_compression_ratio(
|
||||||
|
fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32 {
|
||||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||||
let last_val_in_first_block =
|
let last_val_in_first_block =
|
||||||
@@ -366,6 +377,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
#[allow(deprecated)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::tests::get_codec_test_data_sets;
|
use crate::tests::get_codec_test_data_sets;
|
||||||
|
|||||||
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
@@ -0,0 +1,365 @@
|
|||||||
|
//! PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to predict
|
||||||
|
//! values and fast field values. The difference with real fast field values is then stored.
|
||||||
|
//! For every block, the linear function can be expressed as
|
||||||
|
//! `computed_value = slope * block_position + first_value + positive_offset`
|
||||||
|
//! where:
|
||||||
|
//! - `block_position` is the position inside of the block from 0 to 511
|
||||||
|
//! - `first_value` is the first value on the block
|
||||||
|
//! - `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is
|
||||||
|
//! always positive.
|
||||||
|
//!
|
||||||
|
//! 21 bytes is needed to store the block metadata, it adds an overhead of 21 * 8 / 512 = 0,33 bits
|
||||||
|
//! per element.
|
||||||
|
|
||||||
|
use std::io::{self, Read, Write};
|
||||||
|
use std::ops::Sub;
|
||||||
|
|
||||||
|
use common::{BinarySerializable, DeserializeFrom};
|
||||||
|
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||||
|
|
||||||
|
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||||
|
|
||||||
|
const BLOCK_SIZE: u64 = 512;
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub struct PiecewiseLinearFastFieldReader {
|
||||||
|
min_value: u64,
|
||||||
|
max_value: u64,
|
||||||
|
block_readers: Vec<BlockReader>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Block that stores metadata to predict value with a linear
|
||||||
|
/// function `predicted_value = slope * position + first_value + positive_offset`
|
||||||
|
/// where `positive_offset` is comupted such that predicted values
|
||||||
|
/// are always positive.
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
struct BlockMetadata {
|
||||||
|
first_value: u64,
|
||||||
|
positive_offset: u64,
|
||||||
|
slope: f32,
|
||||||
|
num_bits: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Default)]
|
||||||
|
struct BlockReader {
|
||||||
|
metadata: BlockMetadata,
|
||||||
|
start_offset: u64,
|
||||||
|
bit_unpacker: BitUnpacker,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BlockReader {
|
||||||
|
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||||
|
Self {
|
||||||
|
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||||
|
metadata,
|
||||||
|
start_offset,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||||
|
let diff = self
|
||||||
|
.bit_unpacker
|
||||||
|
.get(block_pos, &data[self.start_offset as usize..]);
|
||||||
|
let predicted_value =
|
||||||
|
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
|
||||||
|
(predicted_value + diff) - self.metadata.positive_offset
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BinarySerializable for BlockMetadata {
|
||||||
|
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||||
|
self.first_value.serialize(write)?;
|
||||||
|
self.positive_offset.serialize(write)?;
|
||||||
|
self.slope.serialize(write)?;
|
||||||
|
self.num_bits.serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
|
let first_value = u64::deserialize(reader)?;
|
||||||
|
let positive_offset = u64::deserialize(reader)?;
|
||||||
|
let slope = f32::deserialize(reader)?;
|
||||||
|
let num_bits = u8::deserialize(reader)?;
|
||||||
|
Ok(Self {
|
||||||
|
first_value,
|
||||||
|
positive_offset,
|
||||||
|
slope,
|
||||||
|
num_bits,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct PiecewiseLinearFooter {
|
||||||
|
pub num_vals: u64,
|
||||||
|
pub min_value: u64,
|
||||||
|
pub max_value: u64,
|
||||||
|
block_metadatas: Vec<BlockMetadata>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BinarySerializable for PiecewiseLinearFooter {
|
||||||
|
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||||
|
let mut out = vec![];
|
||||||
|
self.num_vals.serialize(&mut out)?;
|
||||||
|
self.min_value.serialize(&mut out)?;
|
||||||
|
self.max_value.serialize(&mut out)?;
|
||||||
|
self.block_metadatas.serialize(&mut out)?;
|
||||||
|
write.write_all(&out)?;
|
||||||
|
(out.len() as u32).serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||||
|
let footer = Self {
|
||||||
|
num_vals: u64::deserialize(reader)?,
|
||||||
|
min_value: u64::deserialize(reader)?,
|
||||||
|
max_value: u64::deserialize(reader)?,
|
||||||
|
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||||
|
};
|
||||||
|
Ok(footer)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
|
||||||
|
/// Opens a fast field given a file.
|
||||||
|
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||||
|
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||||
|
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||||
|
let footer = PiecewiseLinearFooter::deserialize(&mut footer)?;
|
||||||
|
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||||
|
let mut current_data_offset = 0;
|
||||||
|
for block_metadata in footer.block_metadatas.into_iter() {
|
||||||
|
let num_bits = block_metadata.num_bits;
|
||||||
|
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||||
|
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||||
|
}
|
||||||
|
Ok(Self {
|
||||||
|
min_value: footer.min_value,
|
||||||
|
max_value: footer.max_value,
|
||||||
|
block_readers,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||||
|
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||||
|
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||||
|
let block_reader = &self.block_readers[block_idx];
|
||||||
|
block_reader.get_u64(block_pos, data)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn min_value(&self) -> u64 {
|
||||||
|
self.min_value
|
||||||
|
}
|
||||||
|
#[inline]
|
||||||
|
fn max_value(&self) -> u64 {
|
||||||
|
self.max_value
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[inline]
|
||||||
|
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||||
|
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||||
|
}
|
||||||
|
|
||||||
|
pub struct PiecewiseLinearFastFieldSerializer;
|
||||||
|
|
||||||
|
impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
|
||||||
|
const NAME: &'static str = "PiecewiseLinear";
|
||||||
|
const ID: u8 = 4;
|
||||||
|
/// Creates a new fast field serializer.
|
||||||
|
fn serialize(
|
||||||
|
write: &mut impl Write,
|
||||||
|
_: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
data_iter: impl Iterator<Item = u64>,
|
||||||
|
_data_iter1: impl Iterator<Item = u64>,
|
||||||
|
) -> io::Result<()> {
|
||||||
|
let mut data = data_iter.collect::<Vec<_>>();
|
||||||
|
let mut bit_packer = BitPacker::new();
|
||||||
|
let mut block_metadatas = Vec::new();
|
||||||
|
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||||
|
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||||
|
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||||
|
let slope = if block_num_vals == 1 {
|
||||||
|
0f32
|
||||||
|
} else {
|
||||||
|
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64)
|
||||||
|
/ (block_num_vals - 1) as f64) as f32
|
||||||
|
};
|
||||||
|
let first_value = block_values[0];
|
||||||
|
let mut positive_offset = 0;
|
||||||
|
let mut max_delta = 0;
|
||||||
|
for (pos, ¤t_value) in block_values[1..].iter().enumerate() {
|
||||||
|
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
|
||||||
|
if computed_value > current_value {
|
||||||
|
positive_offset = positive_offset.max(computed_value - current_value);
|
||||||
|
} else {
|
||||||
|
max_delta = max_delta.max(current_value - computed_value);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let num_bits = compute_num_bits(max_delta + positive_offset);
|
||||||
|
for (pos, current_value) in block_values.iter().enumerate() {
|
||||||
|
let computed_value = predict_value(first_value, pos as u64, slope);
|
||||||
|
let diff = (current_value + positive_offset) - computed_value;
|
||||||
|
bit_packer.write(diff, num_bits, write)?;
|
||||||
|
}
|
||||||
|
bit_packer.flush(write)?;
|
||||||
|
block_metadatas.push(BlockMetadata {
|
||||||
|
first_value,
|
||||||
|
positive_offset,
|
||||||
|
slope,
|
||||||
|
num_bits,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
bit_packer.close(write)?;
|
||||||
|
|
||||||
|
let footer = PiecewiseLinearFooter {
|
||||||
|
num_vals: stats.num_vals,
|
||||||
|
min_value: stats.min_value,
|
||||||
|
max_value: stats.max_value,
|
||||||
|
block_metadatas,
|
||||||
|
};
|
||||||
|
footer.serialize(write)?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_applicable(
|
||||||
|
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> bool {
|
||||||
|
if stats.num_vals < 10 * BLOCK_SIZE {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// On serialization the offset is added to the actual value.
|
||||||
|
// We need to make sure this won't run into overflow calculation issues.
|
||||||
|
// For this we take the maximum theroretical offset and add this to the max value.
|
||||||
|
// If this doesn't overflow the algortihm should be fine
|
||||||
|
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||||
|
if stats
|
||||||
|
.max_value
|
||||||
|
.checked_add(theorethical_maximum_offset)
|
||||||
|
.is_none()
|
||||||
|
{
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Estimation for linear interpolation is hard because, you don't know
|
||||||
|
/// where the local maxima are for the deviation of the calculated value and
|
||||||
|
/// the offset is also unknown.
|
||||||
|
fn estimate_compression_ratio(
|
||||||
|
fastfield_accessor: &impl FastFieldDataAccess,
|
||||||
|
stats: FastFieldStats,
|
||||||
|
) -> f32 {
|
||||||
|
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||||
|
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||||
|
let last_val_in_first_block =
|
||||||
|
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||||
|
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64)
|
||||||
|
/ (stats.num_vals - 1) as f64) as f32;
|
||||||
|
|
||||||
|
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
|
||||||
|
let sample_positions = (0..20)
|
||||||
|
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
let max_distance = sample_positions
|
||||||
|
.iter()
|
||||||
|
.map(|&pos| {
|
||||||
|
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
|
||||||
|
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||||
|
distance(calculated_value, actual_value)
|
||||||
|
})
|
||||||
|
.max()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
// Estimate one block and extrapolate the cost to all blocks.
|
||||||
|
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||||
|
// 50% threshold.
|
||||||
|
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||||
|
// below. So the offset would = max_distance
|
||||||
|
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||||
|
|
||||||
|
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||||
|
// function metadata per block
|
||||||
|
+ 21 * (stats.num_vals / BLOCK_SIZE);
|
||||||
|
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||||
|
num_bits as f32 / num_bits_uncompressed as f32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||||
|
if x < y {
|
||||||
|
y - x
|
||||||
|
} else {
|
||||||
|
x - y
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use super::*;
|
||||||
|
use crate::tests::get_codec_test_data_sets;
|
||||||
|
|
||||||
|
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||||
|
crate::tests::create_and_validate::<
|
||||||
|
PiecewiseLinearFastFieldSerializer,
|
||||||
|
PiecewiseLinearFastFieldReader,
|
||||||
|
>(data, name)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_compression() {
|
||||||
|
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||||
|
let (estimate, actual_compression) =
|
||||||
|
create_and_validate(&data, "simple monotonically large");
|
||||||
|
assert!(actual_compression < 0.2);
|
||||||
|
assert!(estimate < 0.20);
|
||||||
|
assert!(estimate > 0.15);
|
||||||
|
assert!(actual_compression > 0.001);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_with_codec_data_sets() {
|
||||||
|
let data_sets = get_codec_test_data_sets();
|
||||||
|
for (mut data, name) in data_sets {
|
||||||
|
create_and_validate(&data, name);
|
||||||
|
data.reverse();
|
||||||
|
create_and_validate(&data, name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn test_simple() {
|
||||||
|
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "simple monotonically");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn border_cases_1() {
|
||||||
|
let data = (0..1024).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "border case");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn border_case_2() {
|
||||||
|
let data = (0..1025).collect::<Vec<_>>();
|
||||||
|
create_and_validate(&data, "border case");
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn rand() {
|
||||||
|
for _ in 0..10 {
|
||||||
|
let mut data = (5_000..20_000)
|
||||||
|
.map(|_| rand::random::<u32>() as u64)
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||||
|
dbg!(estimate);
|
||||||
|
dbg!(actual_compression);
|
||||||
|
|
||||||
|
data.reverse();
|
||||||
|
create_and_validate(&data, "random");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -392,8 +392,7 @@ mod tests {
|
|||||||
serializer.close().unwrap();
|
serializer.close().unwrap();
|
||||||
}
|
}
|
||||||
let file = directory.open_read(path).unwrap();
|
let file = directory.open_read(path).unwrap();
|
||||||
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
|
||||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
|
||||||
{
|
{
|
||||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||||
|
|||||||
@@ -6,12 +6,17 @@ use common::BinarySerializable;
|
|||||||
use fastfield_codecs::bitpacked::{
|
use fastfield_codecs::bitpacked::{
|
||||||
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
||||||
};
|
};
|
||||||
|
#[allow(deprecated)]
|
||||||
use fastfield_codecs::linearinterpol::{
|
use fastfield_codecs::linearinterpol::{
|
||||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||||
};
|
};
|
||||||
|
#[allow(deprecated)]
|
||||||
use fastfield_codecs::multilinearinterpol::{
|
use fastfield_codecs::multilinearinterpol::{
|
||||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||||
};
|
};
|
||||||
|
use fastfield_codecs::piecewise_linear::{
|
||||||
|
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||||
|
};
|
||||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
||||||
|
|
||||||
use super::FastValue;
|
use super::FastValue;
|
||||||
@@ -71,6 +76,8 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
|||||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||||
/// Blockwise linear interpolated values + bitpacked
|
/// Blockwise linear interpolated values + bitpacked
|
||||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||||
|
/// Piecewise linear interpolated values + bitpacked
|
||||||
|
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||||
@@ -86,12 +93,14 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
|||||||
BitpackedReader,
|
BitpackedReader,
|
||||||
>::open_from_bytes(bytes)?)
|
>::open_from_bytes(bytes)?)
|
||||||
}
|
}
|
||||||
|
#[allow(deprecated)]
|
||||||
LinearInterpolFastFieldSerializer::ID => {
|
LinearInterpolFastFieldSerializer::ID => {
|
||||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||||
Item,
|
Item,
|
||||||
LinearInterpolFastFieldReader,
|
LinearInterpolFastFieldReader,
|
||||||
>::open_from_bytes(bytes)?)
|
>::open_from_bytes(bytes)?)
|
||||||
}
|
}
|
||||||
|
#[allow(deprecated)]
|
||||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||||
Item,
|
Item,
|
||||||
@@ -100,6 +109,12 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
|||||||
bytes
|
bytes
|
||||||
)?)
|
)?)
|
||||||
}
|
}
|
||||||
|
PiecewiseLinearFastFieldSerializer::ID => {
|
||||||
|
DynamicFastFieldReader::PiecewiseLinear(FastFieldReaderCodecWrapper::<
|
||||||
|
Item,
|
||||||
|
PiecewiseLinearFastFieldReader,
|
||||||
|
>::open_from_bytes(bytes)?)
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
panic!(
|
panic!(
|
||||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||||
@@ -118,6 +133,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
|||||||
Self::Bitpacked(reader) => reader.get(doc),
|
Self::Bitpacked(reader) => reader.get(doc),
|
||||||
Self::LinearInterpol(reader) => reader.get(doc),
|
Self::LinearInterpol(reader) => reader.get(doc),
|
||||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||||
|
Self::PiecewiseLinear(reader) => reader.get(doc),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -126,6 +142,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
|||||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||||
|
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn min_value(&self) -> Item {
|
fn min_value(&self) -> Item {
|
||||||
@@ -133,6 +150,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
|||||||
Self::Bitpacked(reader) => reader.min_value(),
|
Self::Bitpacked(reader) => reader.min_value(),
|
||||||
Self::LinearInterpol(reader) => reader.min_value(),
|
Self::LinearInterpol(reader) => reader.min_value(),
|
||||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||||
|
Self::PiecewiseLinear(reader) => reader.min_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
fn max_value(&self) -> Item {
|
fn max_value(&self) -> Item {
|
||||||
@@ -140,6 +158,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
|||||||
Self::Bitpacked(reader) => reader.max_value(),
|
Self::Bitpacked(reader) => reader.max_value(),
|
||||||
Self::LinearInterpol(reader) => reader.max_value(),
|
Self::LinearInterpol(reader) => reader.max_value(),
|
||||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||||
|
Self::PiecewiseLinear(reader) => reader.max_value(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -176,9 +195,12 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
|||||||
_phantom: PhantomData,
|
_phantom: PhantomData,
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
#[inline]
|
|
||||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
/// Get u64 for indice `idx`.
|
||||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
/// `idx` can be either a `DocId` or an index used for
|
||||||
|
/// `multivalued` fast field. See [`get_range`] for more details.
|
||||||
|
pub(crate) fn get_u64(&self, idx: u64) -> Item {
|
||||||
|
Item::from_u64(self.reader.get_u64(idx, self.bytes.as_slice()))
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||||
|
|||||||
@@ -4,9 +4,9 @@ use common::{BinarySerializable, CountingWriter};
|
|||||||
pub use fastfield_codecs::bitpacked::{
|
pub use fastfield_codecs::bitpacked::{
|
||||||
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
||||||
};
|
};
|
||||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
|
||||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
|
||||||
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
use crate::directory::{CompositeWrite, WritePtr};
|
use crate::directory::{CompositeWrite, WritePtr};
|
||||||
use crate::schema::Field;
|
use crate::schema::Field;
|
||||||
@@ -35,18 +35,31 @@ pub struct CompositeFastFieldSerializer {
|
|||||||
composite_write: CompositeWrite<WritePtr>,
|
composite_write: CompositeWrite<WritePtr>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
#[derive(Debug)]
|
||||||
|
pub struct CodecEstimationResult<'a> {
|
||||||
|
pub ratio: f32,
|
||||||
|
pub name: &'a str,
|
||||||
|
pub id: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: use this when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||||
// https://github.com/rust-lang/rust/pull/86176
|
// https://github.com/rust-lang/rust/pull/86176
|
||||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||||
stats: FastFieldStats,
|
stats: FastFieldStats,
|
||||||
fastfield_accessor: &A,
|
fastfield_accessor: &A,
|
||||||
estimations: &mut Vec<(f32, &str, u8)>,
|
) -> CodecEstimationResult {
|
||||||
) {
|
|
||||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||||
return;
|
return CodecEstimationResult {
|
||||||
|
ratio: f32::MAX,
|
||||||
|
name: T::NAME,
|
||||||
|
id: T::ID,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
CodecEstimationResult {
|
||||||
|
ratio: T::estimate_compression_ratio(fastfield_accessor, stats),
|
||||||
|
name: T::NAME,
|
||||||
|
id: T::ID,
|
||||||
}
|
}
|
||||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
|
||||||
estimations.push((ratio, name, id));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl CompositeFastFieldSerializer {
|
impl CompositeFastFieldSerializer {
|
||||||
@@ -59,7 +72,7 @@ impl CompositeFastFieldSerializer {
|
|||||||
|
|
||||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||||
/// automatically.
|
/// automatically.
|
||||||
pub fn create_auto_detect_u64_fast_field(
|
pub fn new_u64_fast_field_with_best_codec(
|
||||||
&mut self,
|
&mut self,
|
||||||
field: Field,
|
field: Field,
|
||||||
stats: FastFieldStats,
|
stats: FastFieldStats,
|
||||||
@@ -67,7 +80,7 @@ impl CompositeFastFieldSerializer {
|
|||||||
data_iter_1: impl Iterator<Item = u64>,
|
data_iter_1: impl Iterator<Item = u64>,
|
||||||
data_iter_2: impl Iterator<Item = u64>,
|
data_iter_2: impl Iterator<Item = u64>,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
self.create_auto_detect_u64_fast_field_with_idx(
|
self.new_u64_fast_field_with_idx_with_best_codec(
|
||||||
field,
|
field,
|
||||||
stats,
|
stats,
|
||||||
fastfield_accessor,
|
fastfield_accessor,
|
||||||
@@ -78,7 +91,7 @@ impl CompositeFastFieldSerializer {
|
|||||||
}
|
}
|
||||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||||
/// automatically.
|
/// automatically.
|
||||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
pub fn new_u64_fast_field_with_idx_with_best_codec(
|
||||||
&mut self,
|
&mut self,
|
||||||
field: Field,
|
field: Field,
|
||||||
stats: FastFieldStats,
|
stats: FastFieldStats,
|
||||||
@@ -88,42 +101,29 @@ impl CompositeFastFieldSerializer {
|
|||||||
idx: usize,
|
idx: usize,
|
||||||
) -> io::Result<()> {
|
) -> io::Result<()> {
|
||||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||||
|
let estimations = vec![
|
||||||
let mut estimations = vec![];
|
codec_estimation::<BitpackedFastFieldSerializer, _>(stats.clone(), &fastfield_accessor),
|
||||||
|
codec_estimation::<PiecewiseLinearFastFieldSerializer, _>(
|
||||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
stats.clone(),
|
||||||
stats.clone(),
|
&fastfield_accessor,
|
||||||
&fastfield_accessor,
|
),
|
||||||
&mut estimations,
|
];
|
||||||
);
|
let best_codec_result = estimations
|
||||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
.iter()
|
||||||
stats.clone(),
|
.sorted_by(|result_a, result_b| {
|
||||||
&fastfield_accessor,
|
result_a
|
||||||
&mut estimations,
|
.ratio
|
||||||
);
|
.partial_cmp(&result_b.ratio)
|
||||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
.expect("Ratio cannot be nan.")
|
||||||
stats.clone(),
|
})
|
||||||
&fastfield_accessor,
|
.next()
|
||||||
&mut estimations,
|
.expect("A codec must be present.");
|
||||||
);
|
|
||||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
|
||||||
{
|
|
||||||
warn!(
|
|
||||||
"broken estimation for fast field codec {}",
|
|
||||||
broken_estimation.1
|
|
||||||
);
|
|
||||||
}
|
|
||||||
// removing nan values for codecs with broken calculations, and max values which disables
|
|
||||||
// codecs
|
|
||||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
|
||||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
|
||||||
let (_ratio, name, id) = estimations[0];
|
|
||||||
debug!(
|
debug!(
|
||||||
"choosing fast field codec {} for field_id {:?}",
|
"Choosing fast field codec {} for field_id {:?} among {:?}",
|
||||||
name, field
|
best_codec_result.name, field, estimations,
|
||||||
); // todo print actual field name
|
);
|
||||||
id.serialize(field_write)?;
|
best_codec_result.id.serialize(field_write)?;
|
||||||
match name {
|
match best_codec_result.name {
|
||||||
BitpackedFastFieldSerializer::NAME => {
|
BitpackedFastFieldSerializer::NAME => {
|
||||||
BitpackedFastFieldSerializer::serialize(
|
BitpackedFastFieldSerializer::serialize(
|
||||||
field_write,
|
field_write,
|
||||||
@@ -133,17 +133,8 @@ impl CompositeFastFieldSerializer {
|
|||||||
data_iter_2,
|
data_iter_2,
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
LinearInterpolFastFieldSerializer::NAME => {
|
PiecewiseLinearFastFieldSerializer::NAME => {
|
||||||
LinearInterpolFastFieldSerializer::serialize(
|
PiecewiseLinearFastFieldSerializer::serialize(
|
||||||
field_write,
|
|
||||||
&fastfield_accessor,
|
|
||||||
stats,
|
|
||||||
data_iter_1,
|
|
||||||
data_iter_2,
|
|
||||||
)?;
|
|
||||||
}
|
|
||||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
|
||||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
|
||||||
field_write,
|
field_write,
|
||||||
&fastfield_accessor,
|
&fastfield_accessor,
|
||||||
stats,
|
stats,
|
||||||
@@ -152,7 +143,7 @@ impl CompositeFastFieldSerializer {
|
|||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
_ => {
|
_ => {
|
||||||
panic!("unknown fastfield serializer {}", name)
|
panic!("unknown fastfield serializer {}", best_codec_result.name)
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
field_write.flush()?;
|
field_write.flush()?;
|
||||||
@@ -216,3 +207,45 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
|||||||
self.write.flush()
|
self.write.flush()
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
use std::path::Path;
|
||||||
|
|
||||||
|
use common::BinarySerializable;
|
||||||
|
use fastfield_codecs::FastFieldStats;
|
||||||
|
use itertools::Itertools;
|
||||||
|
|
||||||
|
use super::CompositeFastFieldSerializer;
|
||||||
|
use crate::directory::{RamDirectory, WritePtr};
|
||||||
|
use crate::schema::Field;
|
||||||
|
use crate::Directory;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn new_u64_fast_field_with_best_codec() -> crate::Result<()> {
|
||||||
|
let directory: RamDirectory = RamDirectory::create();
|
||||||
|
let path = Path::new("test");
|
||||||
|
let write: WritePtr = directory.open_write(path)?;
|
||||||
|
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||||
|
let vals = (0..10000u64).into_iter().collect_vec();
|
||||||
|
let stats = FastFieldStats {
|
||||||
|
min_value: 0,
|
||||||
|
max_value: 9999,
|
||||||
|
num_vals: vals.len() as u64,
|
||||||
|
};
|
||||||
|
serializer.new_u64_fast_field_with_best_codec(
|
||||||
|
Field::from_field_id(0),
|
||||||
|
stats,
|
||||||
|
vals.clone(),
|
||||||
|
vals.clone().into_iter(),
|
||||||
|
vals.into_iter(),
|
||||||
|
)?;
|
||||||
|
serializer.close()?;
|
||||||
|
// get the codecs id
|
||||||
|
let mut bytes = directory.open_read(path)?.read_bytes()?;
|
||||||
|
let codec_id = u8::deserialize(&mut bytes)?;
|
||||||
|
// Codec id = 4 is piecewise linear.
|
||||||
|
assert_eq!(codec_id, 4);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|||||||
@@ -298,7 +298,7 @@ impl IntFastFieldWriter {
|
|||||||
let iter = doc_id_map
|
let iter = doc_id_map
|
||||||
.iter_old_doc_ids()
|
.iter_old_doc_ids()
|
||||||
.map(|doc_id| self.vals.get(doc_id as usize));
|
.map(|doc_id| self.vals.get(doc_id as usize));
|
||||||
serializer.create_auto_detect_u64_fast_field(
|
serializer.new_u64_fast_field_with_best_codec(
|
||||||
self.field,
|
self.field,
|
||||||
stats,
|
stats,
|
||||||
fastfield_accessor,
|
fastfield_accessor,
|
||||||
@@ -306,7 +306,7 @@ impl IntFastFieldWriter {
|
|||||||
iter,
|
iter,
|
||||||
)?;
|
)?;
|
||||||
} else {
|
} else {
|
||||||
serializer.create_auto_detect_u64_fast_field(
|
serializer.new_u64_fast_field_with_best_codec(
|
||||||
self.field,
|
self.field,
|
||||||
stats,
|
stats,
|
||||||
fastfield_accessor,
|
fastfield_accessor,
|
||||||
|
|||||||
@@ -384,7 +384,7 @@ impl IndexMerger {
|
|||||||
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
|
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
|
||||||
fast_field_reader.get(*doc_id)
|
fast_field_reader.get(*doc_id)
|
||||||
});
|
});
|
||||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||||
field,
|
field,
|
||||||
stats,
|
stats,
|
||||||
fastfield_accessor,
|
fastfield_accessor,
|
||||||
@@ -551,7 +551,7 @@ impl IndexMerger {
|
|||||||
}
|
}
|
||||||
offsets.push(offset);
|
offsets.push(offset);
|
||||||
|
|
||||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||||
field,
|
field,
|
||||||
stats,
|
stats,
|
||||||
&offsets[..],
|
&offsets[..],
|
||||||
@@ -771,7 +771,7 @@ impl IndexMerger {
|
|||||||
ff_reader.get_vals(*doc_id, &mut vals);
|
ff_reader.get_vals(*doc_id, &mut vals);
|
||||||
vals.into_iter()
|
vals.into_iter()
|
||||||
});
|
});
|
||||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
fast_field_serializer.new_u64_fast_field_with_idx_with_best_codec(
|
||||||
field,
|
field,
|
||||||
stats,
|
stats,
|
||||||
fastfield_accessor,
|
fastfield_accessor,
|
||||||
|
|||||||
Reference in New Issue
Block a user