Compare commits

..

15 Commits

Author  SHA1  Message  Date
François Massot  14d53851a8  Fix clippy, clean comment.  2022-03-28 01:17:07 +02:00
François Massot  2d176e66b6  Format.  2022-03-26 22:29:43 +01:00
François Massot  838a332db0  Fix fmt.  2022-03-26 21:33:08 +01:00
François Massot  defbd9139b  Update fastfield codecs readme.  2022-03-26 21:33:08 +01:00
François Massot  0c87732459  Fix makefile.  2022-03-26 21:33:08 +01:00
François Massot  4d66a3f0a0  Put deprecated attributes on deprecated codecs. Clean.  2022-03-26 21:33:06 +01:00
François Massot  977f01a8a3  Deprecate linear and multilinear fast field codecs, add piecewise linear and FOR. Update tests and clean.  2022-03-26 21:27:15 +01:00
François Massot  c14bdd26d4  Clean.  2022-03-26 21:18:13 +01:00
François Massot  3272f80171  Fix clippy.  2022-03-26 21:17:32 +01:00
François Massot  23d5ab5656  Rename new codecs.  2022-03-26 21:17:32 +01:00
François Massot  245ed5fed1  Add float dataset for comparing fast field codecs.  2022-03-26 21:17:32 +01:00
François Massot  33bed01168  Clean frame of ref codec.  2022-03-26 21:17:32 +01:00
François Massot  17a5f4f0ff  Seed random datasets in fast field codecs comparison.  2022-03-26 21:17:30 +01:00
François Massot  c969582308  Add frame of reference codecs.  2022-03-26 21:16:50 +01:00
François Massot  18d2ee5bb7  Add another multilinear interpolation and real world dataset.  2022-03-26 21:15:50 +01:00
70 changed files with 1858 additions and 3300 deletions


@@ -13,11 +13,12 @@ jobs:
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly --component llvm-tools-preview
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Install cargo-llvm-cov
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
- name: Generate code coverage
run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
uses: codecov/codecov-action@v2
with:
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
files: lcov.info


@@ -8,9 +8,6 @@ Unreleased
- Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
If this is not desired do the time zone conversion yourself and use `time::PrimitiveDateTime`
directly instead.
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
- Add support for fastfield on text fields (@PSeitz)
- Add terms aggregation (@PSeitz)
Tantivy 0.17
================================
@@ -22,7 +19,7 @@ Tantivy 0.17
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)


@@ -31,7 +31,7 @@ serde_json = "1.0.64"
num_cpus = "1.13"
fs2={ version = "0.4.3", optional = true }
levenshtein_automata = "0.2"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
uuid = { version = "0.8.2", features = ["v4", "serde"] }
crossbeam = "0.8.1"
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
@@ -70,7 +70,7 @@ proptest = "1.0"
criterion = "0.3.5"
test-log = "0.2.8"
env_logger = "0.9.0"
pprof = {version= "0.8", features=["flamegraph", "criterion"]}
pprof = {version= "0.7", features=["flamegraph", "criterion"]}
futures = "0.3.15"
[dev-dependencies.fail]


@@ -128,13 +128,10 @@ $ gdb run
# Companies Using Tantivy
<p align="left">
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />&nbsp; &nbsp;
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p>
<img align="center" src="doc/assets/images/Nuclia.png" alt="Nuclia" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.png" alt="Humanfirst.ai" height="30" width="auto" />&nbsp;
<img align="center" src="doc/assets/images/element.io.svg" alt="Element.io" height="25" width="auto" />
</p>
# FAQ

3 binary image files not shown (removed: 56 KiB, 23 KiB, and 7.8 KiB).


@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::to_value(&agg_res)?;
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
println!("{}", serde_json::to_string_pretty(&res)?);
Ok(())
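For context on the conversion change in this hunk: both paths produce the same `serde_json::Value`, and `to_value` simply skips the intermediate JSON string. A standalone sketch (hypothetical aggregation output, plain serde_json, not the tantivy example itself):

```rust
use serde_json::Value;

fn main() -> serde_json::Result<()> {
    // Stand-in for the aggregation result serialized in the example above.
    let agg_res = serde_json::json!({ "average": { "value": 42.5 } });
    // Direct conversion: no intermediate string.
    let direct: Value = serde_json::to_value(&agg_res)?;
    // Round-trip through a JSON string, as on the other side of the diff.
    let via_string: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    assert_eq!(direct, via_string);
    Ok(())
}
```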

fastfield_codecs/.gitignore vendored Normal file

@@ -0,0 +1 @@
datasets/


@@ -6,8 +6,6 @@ license = "MIT"
edition = "2018"
description = "Fast field codecs used by tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
@@ -19,6 +17,6 @@ more-asserts = "0.2.1"
rand = "0.8.3"
[features]
unstable = [] # useful for benches and experimental codecs.
bin = ["prettytable-rs", "rand"]
default = ["bin"]


@@ -0,0 +1,6 @@
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids nooc_temperatures
download:
@echo "--- Downloading datasets ---"
mkdir -p datasets
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done


@@ -13,6 +13,10 @@ A codec needs to implement 2 traits:
- A reader implementing `FastFieldCodecReader` to read the codec.
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
### Download real-world datasets for codec comparison
Before comparing codecs, execute `make download` to fetch the real-world datasets hosted on AWS S3.
To run with the unstable codecs, execute `cargo run --features unstable`.
### Tests
Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
@@ -23,46 +27,101 @@ cargo run --features bin
```
### TODO
- Add real world data sets in comparison
- Add codec to cover sparse data sets
### Codec Comparison
```
+----------------------------------+-------------------+------------------------+
| | Compression Ratio | Compression Estimation |
+----------------------------------+-------------------+------------------------+
| Autoincrement | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.000039572664 | 0.000004396963 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing concave | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.26562938 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.190665 | 0.1883836 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Monotonically increasing convex | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.25003937 | 0.28125438 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.18676 | 0.2040086 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.31251436 | 0.3125 |
+----------------------------------+-------------------+------------------------+
| Almost monotonically increasing | | |
+----------------------------------+-------------------+------------------------+
| LinearInterpol | 0.14066513 | 0.1562544 |
+----------------------------------+-------------------+------------------------+
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
+----------------------------------+-------------------+------------------------+
| Bitpacked | 0.28126493 | 0.28125 |
+----------------------------------+-------------------+------------------------+
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Autoincrement | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.0051544965 | 0.17251475 | 960 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.118189104 | 0.14172314 | 708 | 212 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.28126493 | 0.28125 | 474 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Monotonically increasing concave | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.005955 | 0.18813984 | 885 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.16113 | 0.15734828 | 704 | 212 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.31251436 | 0.3125 | 478 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Monotonically increasing convex | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.00613 | 0.20376484 | 889 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.157175 | 0.17297328 | 706 | 212 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.31251436 | 0.3125 | 471 | 113 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Almost monotonically increasing | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.14549863 | 0.17251475 | 923 | 210 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.14943957 | 0.15734814 | 703 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.28126493 | 0.28125 | 462 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Random | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.14533783 | 0.14126475 | 924 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.13381402 | 0.15734814 | 695 | 211 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.12501445 | 0.125 | 422 | 112 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HDFS logs timestamps | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.39826187 | 0.4068908 | 5545 | 1086 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.39214826 | 0.40734857 | 5082 | 1073 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.39062786 | 0.390625 | 2864 | 567 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HDFS logs timestamps SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.032736875 | 0.094390824 | 4942 | 1067 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.02667125 | 0.079223566 | 3626 | 994 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.39062786 | 0.390625 | 2493 | 566 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| HTTP logs timestamps SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.047942877 | 0.20376582 | 5121 | 1065 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.06637425 | 0.18859856 | 3929 | 1093 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.26562786 | 0.265625 | 2221 | 526 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Amazon review product ids | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.41900787 | 0.4225158 | 5239 | 1089 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.41504425 | 0.43859857 | 4158 | 1052 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.40625286 | 0.40625 | 2603 | 513 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Amazon review product ids SORTED | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | 0.18364687 | 0.25064084 | 5036 | 990 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 0.21239226 | 0.21984856 | 4087 | 1072 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 0.40625286 | 0.40625 | 2702 | 525 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Temperatures | | | | |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| PiecewiseLinear | | Codec Disabled | 0 | 0 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| FOR | 1.0088086 | 1.001098 | 1306 | 237 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
| Bitpacked | 1.000012 | 1 | 950 | 108 |
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
```
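To make the two-trait contract described at the top of this README concrete, here is a deliberately simplified, self-contained sketch. The trait shapes are abbreviated (the real definitions, with `FastFieldStats` and data-access iterators, live in `fastfield_codecs/src/lib.rs`), and `RawCodec` is a hypothetical codec that stores every value as 8 raw little-endian bytes:

```rust
use std::convert::TryInto;
use std::io::{self, Write};

// Abbreviated stand-ins for FastFieldCodecReader / FastFieldCodecSerializer.
trait CodecReader: Sized {
    fn open_from_bytes(bytes: &[u8]) -> io::Result<Self>;
    fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
}

trait CodecSerializer {
    const NAME: &'static str;
    fn serialize(write: &mut impl Write, data: &[u64]) -> io::Result<()>;
    fn estimate_compression_ratio(data: &[u64]) -> f32;
}

/// Hypothetical "codec" that stores every value as 8 raw little-endian bytes.
struct RawCodec;

impl CodecSerializer for RawCodec {
    const NAME: &'static str = "Raw";
    fn serialize(write: &mut impl Write, data: &[u64]) -> io::Result<()> {
        for &val in data {
            write.write_all(&val.to_le_bytes())?;
        }
        Ok(())
    }
    fn estimate_compression_ratio(_data: &[u64]) -> f32 {
        1.0 // raw storage never compresses
    }
}

struct RawReader;

impl CodecReader for RawReader {
    fn open_from_bytes(_bytes: &[u8]) -> io::Result<Self> {
        Ok(RawReader) // a real codec would parse its footer here
    }
    fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
        let start = idx as usize * 8;
        u64::from_le_bytes(data[start..start + 8].try_into().unwrap())
    }
}

fn main() -> io::Result<()> {
    let values = [3u64, 7, 42];
    let mut out = Vec::new();
    RawCodec::serialize(&mut out, &values)?;
    let reader = RawReader::open_from_bytes(&out)?;
    assert_eq!(reader.get_u64(2, &out), 42);
    Ok(())
}
```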


@@ -5,11 +5,8 @@ extern crate test;
#[cfg(test)]
mod tests {
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
use fastfield_codecs::linearinterpol::{
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
};
use fastfield_codecs::multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
use fastfield_codecs::piecewise_linear::{
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
};
use fastfield_codecs::*;
@@ -70,14 +67,9 @@ mod tests {
bench_create::<BitpackedFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
}
#[bench]
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
@@ -85,16 +77,9 @@ mod tests {
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
}
#[bench]
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
let data: Vec<_> = get_data();
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
b, &data,
);
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(b, &data);
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
let min_value = data.iter().cloned().min().unwrap_or(0);


@@ -128,7 +128,10 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
) -> bool {
true
}
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
fn estimate_compression_ratio(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32 {
let amplitude = stats.max_value - stats.min_value;
let num_bits = compute_num_bits(amplitude);
let num_bits_uncompressed = 64;
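Worked numbers for the estimate above, as a standalone sketch. The `compute_num_bits` here is a stand-in with the same meaning as the one from `tantivy_bitpacker` (bits required to represent the amplitude); the input values are hypothetical:

```rust
// Stand-in for tantivy_bitpacker::compute_num_bits.
fn compute_num_bits(amplitude: u64) -> u8 {
    (64 - amplitude.leading_zeros()) as u8
}

fn main() {
    let (min_value, max_value) = (1_000u64, 1_255u64);
    let amplitude = max_value - min_value; // 255
    let num_bits = compute_num_bits(amplitude); // 8 bits per value
    let ratio = num_bits as f32 / 64.0; // 0.125: 8x smaller than raw u64s
    println!("estimated compression ratio: {}", ratio);
}
```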


@@ -0,0 +1,272 @@
use std::io::{self, Read, Write};
use common::{BinarySerializable, DeserializeFrom};
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
const BLOCK_SIZE: u64 = 128;
#[derive(Clone)]
pub struct FORFastFieldReader {
num_vals: u64,
min_value: u64,
max_value: u64,
block_readers: Vec<BlockReader>,
}
#[derive(Clone, Debug, Default)]
struct BlockMetadata {
min: u64,
num_bits: u8,
}
#[derive(Clone, Debug, Default)]
struct BlockReader {
metadata: BlockMetadata,
start_offset: u64,
bit_unpacker: BitUnpacker,
}
impl BlockReader {
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
Self {
bit_unpacker: BitUnpacker::new(metadata.num_bits),
metadata,
start_offset,
}
}
#[inline]
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
let diff = self
.bit_unpacker
.get(block_pos, &data[self.start_offset as usize..]);
self.metadata.min + diff
}
}
impl BinarySerializable for BlockMetadata {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.min.serialize(write)?;
self.num_bits.serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let min = u64::deserialize(reader)?;
let num_bits = u8::deserialize(reader)?;
Ok(Self { min, num_bits })
}
}
#[derive(Clone, Debug)]
pub struct FORFooter {
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
block_metadatas: Vec<BlockMetadata>,
}
impl BinarySerializable for FORFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
let mut out = vec![];
self.num_vals.serialize(&mut out)?;
self.min_value.serialize(&mut out)?;
self.max_value.serialize(&mut out)?;
self.block_metadatas.serialize(&mut out)?;
write.write_all(&out)?;
(out.len() as u32).serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let footer = Self {
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
};
Ok(footer)
}
}
impl FastFieldCodecReader for FORFastFieldReader {
/// Opens a fast field given its raw bytes.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
let footer = FORFooter::deserialize(&mut footer)?;
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
let mut current_data_offset = 0;
for block_metadata in footer.block_metadatas {
let num_bits = block_metadata.num_bits;
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
}
Ok(Self {
num_vals: footer.num_vals,
min_value: footer.min_value,
max_value: footer.max_value,
block_readers,
})
}
#[inline]
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
let block_idx = (idx / BLOCK_SIZE) as usize;
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
let block_reader = &self.block_readers[block_idx];
block_reader.get_u64(block_pos, data)
}
#[inline]
fn min_value(&self) -> u64 {
self.min_value
}
#[inline]
fn max_value(&self) -> u64 {
self.max_value
}
}
/// Frame-of-reference serializer: works on blocks of BLOCK_SIZE elements, storing for each block its minimum and the bitpacked deltas from it.
pub struct FORFastFieldSerializer {}
impl FastFieldCodecSerializer for FORFastFieldSerializer {
const NAME: &'static str = "FOR";
const ID: u8 = 5;
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
_: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
let data = data_iter.collect::<Vec<_>>();
let mut bit_packer = BitPacker::new();
let mut block_metadatas = Vec::new();
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals];
let mut min = block_values[0];
let mut max = block_values[0];
for &current_value in block_values[1..].iter() {
min = min.min(current_value);
max = max.max(current_value);
}
let num_bits = compute_num_bits(max - min);
for current_value in block_values.iter() {
bit_packer.write(current_value - min, num_bits, write)?;
}
bit_packer.flush(write)?;
block_metadatas.push(BlockMetadata { min, num_bits });
}
bit_packer.close(write)?;
let footer = FORFooter {
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
block_metadatas,
};
footer.serialize(write)?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
stats.num_vals > BLOCK_SIZE
}
/// Estimate the compression ratio by computing the ratio on the first block.
fn estimate_compression_ratio(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32 {
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
let max_distance = (0..last_elem_in_first_chunk)
.into_iter()
.map(|pos| {
let actual_value = fastfield_accessor.get_val(pos as u64);
actual_value - stats.min_value
})
.max()
.unwrap();
// Estimate on the first block only and multiply by a magic number 3 so that this
// codec is selected only when we are almost sure it is beneficial.
let relative_max_value = max_distance as f32 * 3.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
// function metadata per block
+ 9 * (stats.num_vals / BLOCK_SIZE);
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<FORFastFieldSerializer, FORFastFieldReader>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
println!("{}", actual_compression);
assert!(actual_compression < 0.2);
assert!(actual_compression > 0.006);
assert!(estimate < 0.20);
assert!(estimate > 0.10);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
create_and_validate(&data, "simple monotonically");
}
#[test]
fn border_cases_1() {
let data = (0..1024).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn border_case_2() {
let data = (0..1025).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) = create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse();
create_and_validate(&data, "random");
}
}
}
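A toy, standalone illustration of the frame-of-reference scheme implemented above. The real codec works on blocks of 128 values and bitpacks the deltas with `BitPacker`; this sketch keeps plain `u64` deltas and hypothetical values so the arithmetic stays visible:

```rust
fn main() {
    let block = [1_000_u64, 1_003, 1_001, 1_015, 1_002];
    // Frame of reference: store the block minimum once.
    let min = *block.iter().min().unwrap(); // 1000
    let deltas: Vec<u64> = block.iter().map(|&v| v - min).collect();
    assert_eq!(deltas, vec![0, 3, 1, 15, 2]);
    // Largest delta is 15, so 4 bits per value suffice instead of 64.
    let num_bits = 64 - deltas.iter().max().unwrap().leading_zeros();
    assert_eq!(num_bits, 4);
    // Decoding is just the reverse: value = min + delta.
    assert_eq!(min + deltas[3], block[3]);
}
```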


@@ -6,15 +6,20 @@ use std::io;
use std::io::Write;
pub mod bitpacked;
#[cfg(feature = "unstable")]
pub mod frame_of_reference;
pub mod linearinterpol;
pub mod multilinearinterpol;
pub mod piecewise_linear;
pub trait FastFieldCodecReader: Sized {
/// reads the metadata and returns the CodecReader
/// Reads the metadata and returns the CodecReader.
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
/// Reads the u64 value at index `idx`.
/// `idx` can be either a `DocId` or an index used for
/// multivalued fast fields.
fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
fn min_value(&self) -> u64;
fn max_value(&self) -> u64;
}
@@ -35,7 +40,10 @@ pub trait FastFieldCodecSerializer {
///
/// It could make sense to also return a value representing
/// computational complexity.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
fn estimate_compression_ratio(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32;
/// Serializes the data using the serializer into write.
/// There are multiple iterators, in case the codec needs to read the data multiple times.
@@ -85,9 +93,8 @@ impl FastFieldDataAccess for Vec<u64> {
#[cfg(test)]
mod tests {
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
use crate::multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
use crate::piecewise_linear::{
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
};
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
@@ -97,7 +104,7 @@ mod tests {
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
return (f32::MAX, 0.0);
}
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
let estimation = S::estimate_compression_ratio(&data, crate::tests::stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
@@ -157,13 +164,10 @@ mod tests {
fn test_codec_bitpacking() {
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
}
#[test]
fn test_codec_interpolation() {
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
}
#[test]
fn test_codec_multi_interpolation() {
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
fn test_codec_piecewise_linear() {
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
}
use super::*;
@@ -181,45 +185,50 @@ mod tests {
fn estimation_good_interpolation_case() {
let data = (10..=20000_u64).collect::<Vec<_>>();
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.01);
let multi_linear_interpol_estimation =
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(multi_linear_interpol_estimation, 0.2);
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(piecewise_interpol_estimation, 0.2);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, bitpacked_estimation);
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case() {
let data = vec![200, 10, 10, 10, 10, 1000, 20];
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.32);
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(piecewise_interpol_estimation, 0.32);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, linear_interpol_estimation);
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
}
#[test]
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
fn estimation_test_interpolation_case_monotonically_increasing() {
let mut data = (200..=20000_u64).collect::<Vec<_>>();
data.push(1_000_000);
// In this case the linear interpolation can in fact not be worse than bitpacking,
// but the estimator adds some threshold, which leads to a worse estimate.
let linear_interpol_estimation =
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
assert_le!(linear_interpol_estimation, 0.35);
let piecewise_interpol_estimation =
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
&data,
stats_from_vec(&data),
);
assert_le!(piecewise_interpol_estimation, 0.2);
let bitpacked_estimation =
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
println!("{}", bitpacked_estimation);
assert_le!(bitpacked_estimation, 0.32);
assert_le!(bitpacked_estimation, linear_interpol_estimation);
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
}
}
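Both `FORFooter` above and `PiecewiseLinearFooter` later in this diff follow the same layout convention that `open_from_bytes` relies on: serialize the footer into a buffer, write it after the data blocks, then append its byte length as a trailing `u32` so a reader can locate it from the end. A standalone sketch of that convention (little-endian is assumed here for concreteness; the real byte order is whatever `BinarySerializable` uses):

```rust
use std::convert::TryInto;

fn main() {
    let footer_body = b"footer-bytes-go-here".to_vec();
    let mut file = Vec::new();
    file.extend_from_slice(b"...bitpacked data blocks...");
    file.extend_from_slice(&footer_body);
    // The trailing length makes the footer discoverable without knowing the data size.
    file.extend_from_slice(&(footer_body.len() as u32).to_le_bytes());

    // Reader side, mirroring open_from_bytes: the last 4 bytes give the footer length.
    let len_bytes: [u8; 4] = file[file.len() - 4..].try_into().unwrap();
    let footer_len = u32::from_le_bytes(len_bytes) as usize;
    let footer = &file[file.len() - 4 - footer_len..file.len() - 4];
    assert_eq!(footer, footer_body.as_slice());
}
```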


@@ -71,9 +71,9 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
})
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
let calculated_value = get_calculated_value(self.footer.first_val, idx, self.slope);
(calculated_value + self.bit_unpacker.get(idx, data)) - self.footer.offset
}
#[inline]
@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
/// Fastfield serializer, which tries to guess values by linear interpolation
/// and stores the difference bitpacked.
#[deprecated(
note = "Linear interpolation works best only on very rare cases and piecewise linear codec \
already works great on them."
)]
pub struct LinearInterpolFastFieldSerializer {}
#[inline]
@@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
first_val + (pos as f32 * slope) as u64
}
#[allow(deprecated)]
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
const NAME: &'static str = "LinearInterpol";
const ID: u8 = 2;
@@ -182,10 +187,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
}
true
}
/// estimation for linear interpolation is hard because, you don't know
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima for the deviation of the calculated value are and
/// the offset to shift all values to >=0 is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
fn estimate_compression_ratio(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32 {
if stats.num_vals < 3 {
return f32::MAX;
}
let first_val = fastfield_accessor.get_val(0);
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
let slope = get_slope(first_val, last_val, stats.num_vals);
@@ -229,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
}
}
#[allow(deprecated)]
#[cfg(test)]
mod tests {
use super::*;
@@ -289,8 +301,10 @@ mod tests {
#[test]
fn linear_interpol_fast_field_rand() {
for _ in 0..5000 {
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
create_and_validate(&data, "random");
data.reverse();
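The `#[deprecated]` attribute added above warns at every use site, which is why the impl block and the tests are wrapped in `#[allow(deprecated)]`. A minimal sketch of that interaction, with hypothetical names:

```rust
#[deprecated(note = "use the piecewise linear codec instead")]
pub struct LegacySerializer;

// Without this allow, merely naming LegacySerializer here would emit a warning.
#[allow(deprecated)]
fn build() -> LegacySerializer {
    LegacySerializer
}

fn main() {
    // Statement-level allow silences the deprecation warning at this use site.
    #[allow(deprecated)]
    let _serializer: LegacySerializer = build();
}
```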


@@ -1,31 +1,52 @@
#[macro_use]
extern crate prettytable;
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
use std::fs::File;
use std::io;
use std::io::BufRead;
use std::time::{Duration, Instant};
use common::f64_to_u64;
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
#[cfg(feature = "unstable")]
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
use fastfield_codecs::piecewise_linear::{
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
};
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};
use prettytable::{Cell, Row, Table};
use rand::prelude::StdRng;
use rand::Rng;
fn main() {
let mut table = Table::new();
// Add a row per time
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
table.add_row(row![
"",
"Compression ratio",
"Compression ratio estimation",
"Compression time (micro)",
"Reading time (micro)"
]);
for (data, data_set_name) in get_codec_test_data_sets() {
let mut results = vec![];
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
let res = serialize_with_codec::<
PiecewiseLinearFastFieldSerializer,
PiecewiseLinearFastFieldReader,
>(&data);
results.push(res);
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
results.push(res);
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
&data,
);
#[cfg(feature = "unstable")]
{
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
results.push(res);
}
let res = serialize_with_codec::<
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
BitpackedFastFieldReader,
>(&data);
results.push(res);
// let best_estimation_codec = results
//.iter()
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
//.unwrap();
let best_compression_ratio_codec = results
.iter()
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
@@ -33,7 +54,7 @@ fn main() {
.unwrap();
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
for (is_applicable, est, comp, name) in results {
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
let (est_cell, ratio_cell) = if !is_applicable {
("Codec Disabled".to_string(), "".to_string())
} else {
@@ -49,6 +70,8 @@ fn main() {
Cell::new(name).style_spec("bFg"),
Cell::new(&ratio_cell).style_spec(style),
Cell::new(&est_cell).style_spec(""),
Cell::new(&compression_duration.as_micros().to_string()),
Cell::new(&read_duration.as_micros().to_string()),
]));
}
}
@@ -70,7 +93,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
current_cumulative
})
.collect::<Vec<_>>();
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing concave"));
let mut current_cumulative = 0;
@@ -83,22 +105,79 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
.collect::<Vec<_>>();
data_and_names.push((data, "Monotonically increasing convex"));
let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1);
let data = (1000..=200_000_u64)
.map(|num| num + rand::random::<u8>() as u64)
.map(|num| num + rng.gen::<u8>() as u64)
.collect::<Vec<_>>();
data_and_names.push((data, "Almost monotonically increasing"));
let data = (1000..=200_000_u64)
.map(|_| rng.gen::<u8>() as u64)
.collect::<Vec<_>>();
data_and_names.push((data, "Random"));
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
data_and_names.push((data.clone(), "HDFS logs timestamps"));
data.sort_unstable();
data_and_names.push((data, "HDFS logs timestamps SORTED"));
let data = load_dataset("datasets/http_logs_timestamps.txt");
data_and_names.push((data, "HTTP logs timestamps SORTED"));
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
data_and_names.push((data.clone(), "Amazon review product ids"));
data.sort_unstable();
data_and_names.push((data, "Amazon review product ids SORTED"));
let data = load_float_dataset("datasets/nooc_temperatures.txt");
data_and_names.push((data, "Temperatures"));
data_and_names
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
pub fn load_dataset(file_path: &str) -> Vec<u64> {
println!("Load dataset from `{}`", file_path);
let file = File::open(file_path).expect("Error when opening file.");
let lines = io::BufReader::new(file).lines();
let mut data = Vec::new();
for line in lines {
let l = line.unwrap();
data.push(l.parse::<u64>().unwrap());
}
data
}
pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
println!("Load float dataset from `{}`", file_path);
let file = File::open(file_path).expect("Error when opening file.");
let lines = io::BufReader::new(file).lines();
let mut data = Vec::new();
for line in lines {
let line_string = line.unwrap();
let value = line_string.parse::<f64>().unwrap();
data.push(f64_to_u64(value));
}
data
}
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
data: &[u64],
) -> (bool, f32, f32, &'static str) {
) -> (bool, f32, f32, &'static str, Duration, Duration) {
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
if !is_applicable {
return (false, 0.0, 0.0, S::NAME);
return (
false,
0.0,
0.0,
S::NAME,
Duration::from_secs(0),
Duration::from_secs(0),
);
}
let estimation = S::estimate(&data, stats_from_vec(data));
let start_time_compression = Instant::now();
let estimation = S::estimate_compression_ratio(&data, stats_from_vec(data));
let mut out = vec![];
S::serialize(
&mut out,
@@ -108,9 +187,22 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
data.iter().cloned(),
)
.unwrap();
let elapsed_time_compression = start_time_compression.elapsed();
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
(true, estimation, actual_compression, S::NAME)
let reader = R::open_from_bytes(&out).unwrap();
let start_time_read = Instant::now();
for doc in 0..data.len() {
reader.get_u64(doc as u64, &out);
}
let elapsed_time_read = start_time_read.elapsed();
(
true,
estimation,
actual_compression,
S::NAME,
elapsed_time_compression,
elapsed_time_read,
)
}
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
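The timing columns added in this file boil down to wrapping serialization and a full sequential read in `Instant::now()` / `elapsed()`, plus the compression ratio `out.len() / (data.len() * 8)`. A std-only sketch of that measurement pattern, with raw little-endian encoding standing in for a real codec:

```rust
use std::convert::TryInto;
use std::time::Instant;

fn main() {
    let data: Vec<u64> = (0..1_000_000u64).collect();

    let start = Instant::now();
    let encoded: Vec<u8> = data.iter().flat_map(|v| v.to_le_bytes()).collect();
    let compression_time = start.elapsed();

    let start = Instant::now();
    let mut checksum = 0u64;
    for chunk in encoded.chunks_exact(8) {
        checksum = checksum.wrapping_add(u64::from_le_bytes(chunk.try_into().unwrap()));
    }
    let read_time = start.elapsed();

    let ratio = encoded.len() as f32 / (data.len() * 8) as f32; // 1.0 for raw storage
    println!(
        "ratio: {}, compression: {} micros, read: {} micros (checksum: {})",
        ratio,
        compression_time.as_micros(),
        read_time.as_micros(),
        checksum
    );
}
```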


@@ -155,14 +155,17 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
}
#[inline]
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
let doc = doc - interpolation.start_pos;
let calculated_value =
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
let block_idx = idx - interpolation.start_pos;
let calculated_value = get_calculated_value(
interpolation.value_start_pos,
block_idx,
interpolation.slope,
);
let diff = interpolation
.bit_unpacker
.get(doc, &data[interpolation.data_start_offset as usize..]);
.get(block_idx, &data[interpolation.data_start_offset as usize..]);
(calculated_value + diff) - interpolation.positive_val_offset
}
@@ -187,8 +190,13 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
}
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
#[deprecated(
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is \
a little bit more optimized."
)]
pub struct MultiLinearInterpolFastFieldSerializer {}
#[allow(deprecated)]
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
const NAME: &'static str = "MultiLinearInterpol";
const ID: u8 = 3;
@@ -311,10 +319,13 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
}
true
}
/// estimation for linear interpolation is hard because, you don't know
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima are for the deviation of the calculated value and
/// the offset is also unknown.
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
fn estimate_compression_ratio(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32 {
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
let last_val_in_first_block =
@@ -366,6 +377,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
}
#[cfg(test)]
#[allow(deprecated)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;


@@ -0,0 +1,365 @@
//! The PiecewiseLinear codec uses a piecewise linear function for every block of 512 values to
//! predict fast field values. The difference from the real fast field values is then stored.
//! For every block, the linear function can be expressed as
//! `computed_value = slope * block_position + first_value - positive_offset`
//! where:
//! - `block_position` is the position inside the block, from 0 to 511
//! - `first_value` is the first value in the block
//! - `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is
//!   always positive.
//!
//! 21 bytes are needed to store the block metadata, which adds an overhead of 21 * 8 / 512 = 0.33
//! bits per element.
use std::io::{self, Read, Write};
use std::ops::Sub;
use common::{BinarySerializable, DeserializeFrom};
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
const BLOCK_SIZE: u64 = 512;
#[derive(Clone)]
pub struct PiecewiseLinearFastFieldReader {
min_value: u64,
max_value: u64,
block_readers: Vec<BlockReader>,
}
/// Block that stores the metadata needed to predict a value with the linear
/// function `predicted_value = slope * position + first_value - positive_offset`,
/// where `positive_offset` is computed such that the stored diffs
/// are always positive.
#[derive(Clone, Debug, Default)]
struct BlockMetadata {
first_value: u64,
positive_offset: u64,
slope: f32,
num_bits: u8,
}
#[derive(Clone, Debug, Default)]
struct BlockReader {
metadata: BlockMetadata,
start_offset: u64,
bit_unpacker: BitUnpacker,
}
impl BlockReader {
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
Self {
bit_unpacker: BitUnpacker::new(metadata.num_bits),
metadata,
start_offset,
}
}
#[inline]
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
let diff = self
.bit_unpacker
.get(block_pos, &data[self.start_offset as usize..]);
let predicted_value =
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
(predicted_value + diff) - self.metadata.positive_offset
}
}
impl BinarySerializable for BlockMetadata {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
self.first_value.serialize(write)?;
self.positive_offset.serialize(write)?;
self.slope.serialize(write)?;
self.num_bits.serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let first_value = u64::deserialize(reader)?;
let positive_offset = u64::deserialize(reader)?;
let slope = f32::deserialize(reader)?;
let num_bits = u8::deserialize(reader)?;
Ok(Self {
first_value,
positive_offset,
slope,
num_bits,
})
}
}
#[derive(Clone, Debug)]
pub struct PiecewiseLinearFooter {
pub num_vals: u64,
pub min_value: u64,
pub max_value: u64,
block_metadatas: Vec<BlockMetadata>,
}
impl BinarySerializable for PiecewiseLinearFooter {
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
let mut out = vec![];
self.num_vals.serialize(&mut out)?;
self.min_value.serialize(&mut out)?;
self.max_value.serialize(&mut out)?;
self.block_metadatas.serialize(&mut out)?;
write.write_all(&out)?;
(out.len() as u32).serialize(write)?;
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let footer = Self {
num_vals: u64::deserialize(reader)?,
min_value: u64::deserialize(reader)?,
max_value: u64::deserialize(reader)?,
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
};
Ok(footer)
}
}
impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
/// Opens a fast field given its raw bytes.
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
let footer = PiecewiseLinearFooter::deserialize(&mut footer)?;
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
let mut current_data_offset = 0;
for block_metadata in footer.block_metadatas.into_iter() {
let num_bits = block_metadata.num_bits;
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
}
Ok(Self {
min_value: footer.min_value,
max_value: footer.max_value,
block_readers,
})
}
#[inline]
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
let block_idx = (idx / BLOCK_SIZE) as usize;
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
let block_reader = &self.block_readers[block_idx];
block_reader.get_u64(block_pos, data)
}
#[inline]
fn min_value(&self) -> u64 {
self.min_value
}
#[inline]
fn max_value(&self) -> u64 {
self.max_value
}
}
#[inline]
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
(first_val as i64 + (pos as f32 * slope) as i64) as u64
}
pub struct PiecewiseLinearFastFieldSerializer;
impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
const NAME: &'static str = "PiecewiseLinear";
const ID: u8 = 4;
/// Creates a new fast field serializer.
fn serialize(
write: &mut impl Write,
_: &impl FastFieldDataAccess,
stats: FastFieldStats,
data_iter: impl Iterator<Item = u64>,
_data_iter1: impl Iterator<Item = u64>,
) -> io::Result<()> {
let mut data = data_iter.collect::<Vec<_>>();
let mut bit_packer = BitPacker::new();
let mut block_metadatas = Vec::new();
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
let slope = if block_num_vals == 1 {
0f32
} else {
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64)
/ (block_num_vals - 1) as f64) as f32
};
let first_value = block_values[0];
let mut positive_offset = 0;
let mut max_delta = 0;
for (pos, &current_value) in block_values[1..].iter().enumerate() {
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
if computed_value > current_value {
positive_offset = positive_offset.max(computed_value - current_value);
} else {
max_delta = max_delta.max(current_value - computed_value);
}
}
let num_bits = compute_num_bits(max_delta + positive_offset);
for (pos, current_value) in block_values.iter().enumerate() {
let computed_value = predict_value(first_value, pos as u64, slope);
let diff = (current_value + positive_offset) - computed_value;
bit_packer.write(diff, num_bits, write)?;
}
bit_packer.flush(write)?;
block_metadatas.push(BlockMetadata {
first_value,
positive_offset,
slope,
num_bits,
});
}
bit_packer.close(write)?;
let footer = PiecewiseLinearFooter {
num_vals: stats.num_vals,
min_value: stats.min_value,
max_value: stats.max_value,
block_metadatas,
};
footer.serialize(write)?;
Ok(())
}
fn is_applicable(
_fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> bool {
if stats.num_vals < 10 * BLOCK_SIZE {
return false;
}
// On serialization the offset is added to the actual value.
// We need to make sure this won't run into overflow calculation issues.
// For this we take the maximum theoretical offset and add it to the max value.
// If this doesn't overflow, the algorithm should be fine.
let theoretical_maximum_offset = stats.max_value - stats.min_value;
if stats
.max_value
.checked_add(theoretical_maximum_offset)
.is_none()
{
return false;
}
true
}
/// Estimation for linear interpolation is hard because you don't know
/// where the local maxima are for the deviation of the calculated value, and
/// the offset is also unknown.
fn estimate_compression_ratio(
fastfield_accessor: &impl FastFieldDataAccess,
stats: FastFieldStats,
) -> f32 {
let first_val_in_first_block = fastfield_accessor.get_val(0);
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
let last_val_in_first_block =
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64)
/ (stats.num_vals - 1) as f64) as f32;
// let's sample at 0%, 5%, 10% .. 95%, 100%, but for the first block only
let sample_positions = (0..20)
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
.collect::<Vec<_>>();
let max_distance = sample_positions
.iter()
.map(|&pos| {
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
let actual_value = fastfield_accessor.get_val(pos as u64);
distance(calculated_value, actual_value)
})
.max()
.unwrap();
// Estimate one block and extrapolate the cost to all blocks.
// The theory is that we don't have the actual max_distance, but we are close within a
// 50% threshold.
// It is multiplied by 2 because in the worst case the line would be as much above as
// below, so the offset would equal max_distance.
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
// function metadata per block
+ 21 * (stats.num_vals / BLOCK_SIZE);
let num_bits_uncompressed = 64 * stats.num_vals;
num_bits as f32 / num_bits_uncompressed as f32
}
}
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
if x < y {
y - x
} else {
x - y
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::tests::get_codec_test_data_sets;
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
crate::tests::create_and_validate::<
PiecewiseLinearFastFieldSerializer,
PiecewiseLinearFastFieldReader,
>(data, name)
}
#[test]
fn test_compression() {
let data = (10..=6_000_u64).collect::<Vec<_>>();
let (estimate, actual_compression) =
create_and_validate(&data, "simple monotonically large");
assert!(actual_compression < 0.2);
assert!(estimate < 0.20);
assert!(estimate > 0.15);
assert!(actual_compression > 0.001);
}
#[test]
fn test_with_codec_data_sets() {
let data_sets = get_codec_test_data_sets();
for (mut data, name) in data_sets {
create_and_validate(&data, name);
data.reverse();
create_and_validate(&data, name);
}
}
#[test]
fn test_simple() {
let data = (10..=20_u64).collect::<Vec<_>>();
create_and_validate(&data, "simple monotonically");
}
#[test]
fn border_cases_1() {
let data = (0..1024).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn border_case_2() {
let data = (0..1025).collect::<Vec<_>>();
create_and_validate(&data, "border case");
}
#[test]
fn rand() {
for _ in 0..10 {
let mut data = (5_000..20_000)
.map(|_| rand::random::<u32>() as u64)
.collect::<Vec<_>>();
let (estimate, actual_compression) = create_and_validate(&data, "random");
dbg!(estimate);
dbg!(actual_compression);
data.reverse();
create_and_validate(&data, "random");
}
}
}
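A standalone walk-through of the per-block model implemented above, using a hypothetical 5-value block (the real codec uses 512-value blocks and bitpacks the diffs): the slope is fixed from the block endpoints, and `positive_offset` absorbs any overshoot so every stored diff is non-negative.

```rust
// Same prediction function as in the codec above.
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    (first_val as i64 + (pos as f32 * slope) as i64) as u64
}

fn main() {
    let block = [100_u64, 110, 118, 131, 140];
    let first_value = block[0];
    // Slope fixed from the block endpoints: (140 - 100) / 4 = 10.
    let slope = (block[block.len() - 1] as f64 - first_value as f64) as f32
        / (block.len() - 1) as f32;
    // Find how far predictions overshoot, so all diffs can be shifted positive.
    let mut positive_offset = 0u64;
    for (pos, &value) in block.iter().enumerate() {
        let predicted = predict_value(first_value, pos as u64, slope);
        if predicted > value {
            positive_offset = positive_offset.max(predicted - value);
        }
    }
    assert_eq!(positive_offset, 2); // prediction 120 overshoots actual 118
    // Stored per value: (value + positive_offset) - predicted, always >= 0.
    for (pos, &value) in block.iter().enumerate() {
        let predicted = predict_value(first_value, pos as u64, slope);
        let diff = (value + positive_offset) - predicted;
        // Decoding inverts this: value = predicted + diff - positive_offset.
        assert_eq!(predicted + diff - positive_offset, value);
    }
}
```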


@@ -48,8 +48,8 @@ use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
use super::bucket::HistogramAggregation;
pub use super::bucket::RangeAggregation;
use super::bucket::{HistogramAggregation, TermsAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
@@ -100,27 +100,12 @@ pub(crate) struct BucketAggregationInternal {
}
impl BucketAggregationInternal {
pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
pub(crate) fn as_histogram(&self) -> &HistogramAggregation {
match &self.bucket_agg {
BucketAggregationType::Histogram(histogram) => Some(histogram),
_ => None,
BucketAggregationType::Range(_) => panic!("unexpected aggregation"),
BucketAggregationType::Histogram(histogram) => histogram,
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self.bucket_agg {
BucketAggregationType::Terms(terms) => Some(terms),
_ => None,
}
}
}
/// Extract all fields, where the term directory is used in the tree.
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
let mut term_dict_field_names = Default::default();
for el in aggs.values() {
el.get_term_dict_field_names(&mut term_dict_field_names)
}
term_dict_field_names
}
/// Extract all fast field names used in the tree.
@@ -145,12 +130,6 @@ pub enum Aggregation {
}
impl Aggregation {
fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
if let Aggregation::Bucket(bucket) = self {
bucket.get_term_dict_field_names(term_field_names)
}
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
@@ -183,12 +162,6 @@ pub struct BucketAggregation {
}
impl BucketAggregation {
fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
term_dict_field_names.insert(terms.field.to_string());
}
term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
self.bucket_agg.get_fast_field_names(fast_field_names);
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
@@ -204,15 +177,11 @@ pub enum BucketAggregationType {
/// Put data into buckets of user-defined ranges.
#[serde(rename = "histogram")]
Histogram(HistogramAggregation),
/// Put data into buckets of terms.
#[serde(rename = "terms")]
Terms(TermsAggregation),
}
impl BucketAggregationType {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
BucketAggregationType::Histogram(histogram) => {
fast_field_names.insert(histogram.field.to_string())


@@ -1,16 +1,12 @@
//! This module enhances the request tree with access to fast fields and metadata.
use std::sync::Arc;
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
use super::bucket::{HistogramAggregation, RangeAggregation};
use super::metric::{AverageAggregation, StatsAggregation};
use super::VecWithNames;
use crate::fastfield::{
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
};
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
use crate::schema::{Cardinality, Type};
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
use crate::{SegmentReader, TantivyError};
#[derive(Clone, Default)]
pub(crate) struct AggregationsWithAccessor {
@@ -31,32 +27,11 @@ impl AggregationsWithAccessor {
}
}
#[derive(Clone)]
pub(crate) enum FastFieldAccessor {
Multi(MultiValuedFastFieldReader<u64>),
Single(DynamicFastFieldReader<u64>),
}
impl FastFieldAccessor {
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
match self {
FastFieldAccessor::Multi(_) => None,
FastFieldAccessor::Single(reader) => Some(reader),
}
}
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
match self {
FastFieldAccessor::Multi(reader) => Some(reader),
FastFieldAccessor::Single(_) => None,
}
}
}
#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
/// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. So eventually this needs to be an Option or moved.
pub(crate) accessor: FastFieldAccessor,
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
pub(crate) accessor: DynamicFastFieldReader<u64>,
pub(crate) field_type: Type,
pub(crate) bucket_agg: BucketAggregationType,
pub(crate) sub_aggregation: AggregationsWithAccessor,
@@ -68,25 +43,14 @@ impl BucketAggregationWithAccessor {
sub_aggregation: &Aggregations,
reader: &SegmentReader,
) -> crate::Result<BucketAggregationWithAccessor> {
let mut inverted_index = None;
let (accessor, field_type) = match &bucket {
BucketAggregationType::Range(RangeAggregation {
field: field_name,
ranges: _,
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
}) => get_ff_reader_and_validate(reader, field_name)?,
BucketAggregationType::Histogram(HistogramAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
let field = reader
.schema()
.get_field(field_name)
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
inverted_index = Some(reader.inverted_index(field)?);
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
}
}) => get_ff_reader_and_validate(reader, field_name)?,
};
let sub_aggregation = sub_aggregation.clone();
Ok(BucketAggregationWithAccessor {
@@ -94,7 +58,6 @@ impl BucketAggregationWithAccessor {
field_type,
sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
bucket_agg: bucket.clone(),
inverted_index,
})
}
}
@@ -115,14 +78,10 @@ impl MetricAggregationWithAccessor {
match &metric {
MetricAggregation::Average(AverageAggregation { field: field_name })
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
let (accessor, field_type) =
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;
Ok(MetricAggregationWithAccessor {
accessor: accessor
.as_single()
.expect("unexpected fast field cardinality")
.clone(),
accessor,
field_type,
metric: metric.clone(),
})
@@ -159,45 +118,32 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
))
}
/// Get the fast field reader with the given cardinality.
fn get_ff_reader_and_validate(
reader: &SegmentReader,
field_name: &str,
cardinality: Cardinality,
) -> crate::Result<(FastFieldAccessor, Type)> {
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
let field = reader
.schema()
.get_field(field_name)
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
let field_type = reader.schema().get_field_entry(field).field_type();
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
if ff_type == FastType::Date {
return Err(TantivyError::InvalidArgument(
"Unsupported field type date in aggregation".to_string(),
));
}
if cardinality != field_cardinality {
if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
return Err(TantivyError::InvalidArgument(format!(
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
field_name, cardinality, field_cardinality
"Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
field_type.value_type()
)));
}
} else {
return Err(TantivyError::InvalidArgument(format!(
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
"Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
field_type.value_type()
)));
};
let ff_fields = reader.fast_fields();
match cardinality {
Cardinality::SingleValue => ff_fields
.u64_lenient(field)
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
Cardinality::MultiValues => ff_fields
.u64s_lenient(field)
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
}
ff_fields
.u64_lenient(field)
.map(|field| (field, field_type.value_type()))
}


@@ -7,134 +7,86 @@
use std::cmp::Ordering;
use std::collections::HashMap;
use itertools::Itertools;
use serde::{Deserialize, Serialize};
use super::agg_req::{
Aggregations, AggregationsInternal, BucketAggregationInternal, MetricAggregation,
};
use super::bucket::{intermediate_buckets_to_final_buckets, GetDocCount};
use super::agg_req::{Aggregations, AggregationsInternal, BucketAggregationInternal};
use super::bucket::intermediate_buckets_to_final_buckets;
use super::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
IntermediateMetricResult, IntermediateRangeBucketEntry,
};
use super::metric::{SingleMetricResult, Stats};
use super::{Key, VecWithNames};
use crate::TantivyError;
use super::Key;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
/// The final aggregation result.
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
impl AggregationResults {
pub(crate) fn get_value_from_aggregation(
&self,
name: &str,
agg_property: &str,
) -> crate::Result<Option<f64>> {
if let Some(agg) = self.0.get(name) {
agg.get_value_from_aggregation(name, agg_property)
} else {
// Validation is done during request parsing, so we can't reach this state.
Err(TantivyError::InternalError(format!(
"Can't find aggregation {:?} in sub_aggregations",
name
)))
}
}
/// Convert an intermediate result and its aggregation request into the final result
pub fn from_intermediate_and_req(
results: IntermediateAggregationResults,
agg: Aggregations,
) -> crate::Result<Self> {
) -> Self {
AggregationResults::from_intermediate_and_req_internal(results, &(agg.into()))
}
/// Convert an intermediate result and its aggregation request into the final result
///
/// Internal function: CollectorAggregations is used instead of Aggregations, which is optimized
/// for internal processing by splitting metrics and buckets into separate groups.
pub(crate) fn from_intermediate_and_req_internal(
intermediate_results: IntermediateAggregationResults,
/// for internal processing
fn from_intermediate_and_req_internal(
results: IntermediateAggregationResults,
req: &AggregationsInternal,
) -> crate::Result<Self> {
) -> Self {
let mut result = HashMap::default();
// Important assumption:
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
// request
let mut results: HashMap<String, AggregationResult> = HashMap::new();
if let Some(buckets) = intermediate_results.buckets {
add_coverted_final_buckets_to_result(&mut results, buckets, &req.buckets)?
if let Some(buckets) = results.buckets {
result.extend(buckets.into_iter().zip(req.buckets.values()).map(
|((key, bucket), req)| {
(
key,
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
bucket, req,
)),
)
},
));
} else {
// When there are no buckets, we create empty buckets, so that the serialized json
// format is constant
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
};
if let Some(metrics) = intermediate_results.metrics {
add_converted_final_metrics_to_result(&mut results, metrics);
} else {
// When there are no metrics, we create empty metric results, so that the serialized
// json format is constant
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
result.extend(req.buckets.iter().map(|(key, req)| {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
(
key.to_string(),
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
empty_bucket,
req,
)),
)
}));
}
Ok(Self(results))
if let Some(metrics) = results.metrics {
result.extend(
metrics
.into_iter()
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
);
} else {
result.extend(req.metrics.iter().map(|(key, req)| {
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
(
key.to_string(),
AggregationResult::MetricResult(empty_bucket.into()),
)
}));
}
Self(result)
}
}
fn add_converted_final_metrics_to_result(
results: &mut HashMap<String, AggregationResult>,
metrics: VecWithNames<IntermediateMetricResult>,
) {
results.extend(
metrics
.into_iter()
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
);
}
fn add_empty_final_metrics_to_result(
results: &mut HashMap<String, AggregationResult>,
req_metrics: &VecWithNames<MetricAggregation>,
) -> crate::Result<()> {
results.extend(req_metrics.iter().map(|(key, req)| {
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
(
key.to_string(),
AggregationResult::MetricResult(empty_bucket.into()),
)
}));
Ok(())
}
fn add_empty_final_buckets_to_result(
results: &mut HashMap<String, AggregationResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> {
let requested_buckets = req_buckets.iter();
for (key, req) in requested_buckets {
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
results.insert(key.to_string(), empty_bucket);
}
Ok(())
}
fn add_coverted_final_buckets_to_result(
results: &mut HashMap<String, AggregationResult>,
buckets: VecWithNames<IntermediateBucketResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
) -> crate::Result<()> {
assert_eq!(buckets.len(), req_buckets.len());
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
for ((key, bucket), req) in buckets_with_request {
let result =
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(bucket, req)?);
results.insert(key, result);
}
Ok(())
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// An aggregation is either a bucket or a metric.
@@ -145,23 +97,6 @@ pub enum AggregationResult {
MetricResult(MetricResult),
}
impl AggregationResult {
pub(crate) fn get_value_from_aggregation(
&self,
_name: &str,
agg_property: &str,
) -> crate::Result<Option<f64>> {
match self {
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
"Tried to retrieve value from bucket aggregation. This is not supported and \
should not happen during collection, but should be catched during validation"
.to_string(),
)),
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
}
}
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
/// MetricResult
@@ -172,14 +107,6 @@ pub enum MetricResult {
Stats(Stats),
}
impl MetricResult {
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
match self {
MetricResult::Average(avg) => Ok(avg.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
}
}
}
impl From<IntermediateMetricResult> for MetricResult {
fn from(metric: IntermediateMetricResult) -> Self {
match metric {
@@ -213,65 +140,39 @@ pub enum BucketResult {
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
buckets: Vec<BucketEntry>,
},
/// This is the term result
Terms {
/// The buckets.
///
/// See [TermsAggregation](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didn't make it into the top N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
}
impl BucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
Ok(BucketResult::from_intermediate_and_req(empty_bucket, req)?)
}
fn from_intermediate_and_req(
bucket_result: IntermediateBucketResult,
req: &BucketAggregationInternal,
) -> crate::Result<Self> {
) -> Self {
match bucket_result {
IntermediateBucketResult::Range(range_res) => {
let mut buckets: Vec<RangeBucketEntry> = range_res
.buckets
IntermediateBucketResult::Range(range_map) => {
let mut buckets: Vec<RangeBucketEntry> = range_map
.into_iter()
.map(|(_, bucket)| {
RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation)
})
.collect::<crate::Result<Vec<_>>>()?;
.collect_vec();
buckets.sort_by(|left, right| {
// TODO use total_cmp in the next stable rust release
left.from
buckets.sort_by(|a, b| {
a.from
.unwrap_or(f64::MIN)
.partial_cmp(&right.from.unwrap_or(f64::MIN))
.partial_cmp(&b.from.unwrap_or(f64::MIN))
.unwrap_or(Ordering::Equal)
});
Ok(BucketResult::Range { buckets })
BucketResult::Range { buckets }
}
IntermediateBucketResult::Histogram { buckets } => {
let buckets = intermediate_buckets_to_final_buckets(
buckets,
req.as_histogram()
.expect("unexpected aggregation, expected histogram aggregation"),
req.as_histogram(),
&req.sub_aggregation,
)?;
);
Ok(BucketResult::Histogram { buckets })
BucketResult::Histogram { buckets }
}
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
req.as_term()
.expect("unexpected aggregation, expected term aggregation"),
&req.sub_aggregation,
),
}
}
}
@@ -309,7 +210,7 @@ pub struct BucketEntry {
/// Number of documents in the bucket.
pub doc_count: u64,
#[serde(flatten)]
/// Sub-aggregations in this bucket.
/// sub-aggregations in this bucket.
pub sub_aggregation: AggregationResults,
}
@@ -317,25 +218,15 @@ impl BucketEntry {
pub(crate) fn from_intermediate_and_req(
entry: IntermediateHistogramBucketEntry,
req: &AggregationsInternal,
) -> crate::Result<Self> {
Ok(BucketEntry {
) -> Self {
BucketEntry {
key: Key::F64(entry.key),
doc_count: entry.doc_count,
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
req,
)?,
})
}
}
impl GetDocCount for &BucketEntry {
fn doc_count(&self) -> u64 {
self.doc_count
}
}
impl GetDocCount for BucketEntry {
fn doc_count(&self) -> u64 {
self.doc_count
),
}
}
}
@@ -390,16 +281,16 @@ impl RangeBucketEntry {
fn from_intermediate_and_req(
entry: IntermediateRangeBucketEntry,
req: &AggregationsInternal,
) -> crate::Result<Self> {
Ok(RangeBucketEntry {
) -> Self {
RangeBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
req,
)?,
),
to: entry.to,
from: entry.from,
})
}
}
}


@@ -13,7 +13,9 @@ use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::segment_agg_result::{
SegmentAggregationResultsCollector, SegmentHistogramBucketEntry,
};
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type;
use crate::{DocId, TantivyError};
@@ -56,7 +58,7 @@ use crate::{DocId, TantivyError};
/// "prices": {
/// "histogram": {
/// "field": "price",
/// "interval": 10
/// "interval": 10,
/// }
/// }
/// }
@@ -69,17 +71,16 @@ use crate::{DocId, TantivyError};
pub struct HistogramAggregation {
/// The field to aggregate on.
pub field: String,
/// The interval to chunk your data range. Each bucket spans a value range of [0..interval).
/// The interval to chunk your data range. The buckets span ranges of [0..interval).
/// Must be a positive value.
pub interval: f64,
/// The interval implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
/// 1))`.
///
/// Offset makes it possible to shift this grid into
/// `[offset + interval * k, offset + interval * (k + 1))`. Offset has to be in the range [0,
/// interval).
/// Offset makes it possible to shift this grid into `[offset + interval * k, offset + interval
/// * (k + 1))`. Offset has to be in the range [0, interval).
///
/// As an example, if there are two documents with value 9 and 12 and interval 10.0, they would
/// As an example, if there are two documents with values 8 and 12 and interval 10.0, they would
/// fall into the buckets with the key 0 and 10.
/// With offset 5 and interval 10, they would both fall into the bucket with the key 5 and the
/// range [5..15)
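A minimal sketch of the bucket-key computation this doc comment describes (`bucket_key` is a hypothetical helper for illustration, not this crate's API):

fn bucket_key(value: f64, interval: f64, offset: f64) -> f64 {
    // Snap the value onto the grid [offset + interval * k, offset + interval * (k + 1)).
    offset + interval * ((value - offset) / interval).floor()
}

// interval 10.0, offset 0.0: values 9.0 and 12.0 land in buckets 0.0 and 10.0.
// interval 10.0, offset 5.0: both land in bucket 5.0, i.e. the range [5..15).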
@@ -92,22 +93,6 @@ pub struct HistogramAggregation {
///
/// hard_bounds only limits the buckets, to force a range set both extended_bounds and
/// hard_bounds to the same range.
///
/// ## Example
/// ```json
/// {
/// "prices": {
/// "histogram": {
/// "field": "price",
/// "interval": 10,
/// "hard_bounds": {
/// "min": 0,
/// "max": 100
/// }
/// }
/// }
/// }
/// ```
pub hard_bounds: Option<HistogramBounds>,
/// Can be set to extend your bounds. The range of the buckets is by default defined by the
/// data range of the values of the documents. As the name suggests, this can only be used to
@@ -174,27 +159,6 @@ impl HistogramBounds {
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentHistogramBucketEntry {
pub key: f64,
pub doc_count: u64,
}
impl SegmentHistogramBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
sub_aggregation: SegmentAggregationResultsCollector,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateHistogramBucketEntry> {
Ok(IntermediateHistogramBucketEntry {
key: self.key,
doc_count: self.doc_count,
sub_aggregation: sub_aggregation
.into_intermediate_aggregations_result(agg_with_accessor)?,
})
}
}
/// The collector puts values from the fast field into the correct buckets and does a conversion to
/// the correct datatype.
#[derive(Clone, Debug, PartialEq)]
@@ -210,10 +174,7 @@ pub struct SegmentHistogramCollector {
}
impl SegmentHistogramCollector {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
let mut buckets = Vec::with_capacity(
self.buckets
.iter()
@@ -227,20 +188,13 @@ impl SegmentHistogramCollector {
//
// Empty buckets may be added later again in the final result, depending on the request.
if let Some(sub_aggregations) = self.sub_aggregations {
for bucket_res in self
.buckets
.into_iter()
.zip(sub_aggregations.into_iter())
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
.map(|(bucket, sub_aggregation)| {
bucket.into_intermediate_bucket_entry(
sub_aggregation,
&agg_with_accessor.sub_aggregation,
)
})
{
buckets.push(bucket_res?);
}
buckets.extend(
self.buckets
.into_iter()
.zip(sub_aggregations.into_iter())
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
.map(|(bucket, sub_aggregation)| (bucket, sub_aggregation).into()),
)
} else {
buckets.extend(
self.buckets
@@ -250,7 +204,7 @@ impl SegmentHistogramCollector {
);
};
Ok(IntermediateBucketResult::Histogram { buckets })
IntermediateBucketResult::Histogram { buckets }
}
pub(crate) fn from_req_and_validate(
@@ -319,16 +273,12 @@ impl SegmentHistogramCollector {
let get_bucket_num =
|val| (get_bucket_num_f64(val, interval, offset) as i64 - first_bucket_num) as usize;
let accessor = bucket_with_accessor
.accessor
.as_single()
.expect("unexpected fast field cardinatility");
let mut iter = doc.chunks_exact(4);
for docs in iter.by_ref() {
let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0]));
let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1]));
let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2]));
let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3]));
let val0 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[0]));
let val1 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[1]));
let val2 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[2]));
let val3 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[3]));
let bucket_pos0 = get_bucket_num(val0);
let bucket_pos1 = get_bucket_num(val1);
@@ -365,7 +315,8 @@ impl SegmentHistogramCollector {
);
}
for doc in iter.remainder() {
let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type);
let val =
f64_from_fastfield_u64(bucket_with_accessor.accessor.get(*doc), &self.field_type);
if !bounds.contains(val) {
continue;
}
@@ -442,7 +393,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
) -> crate::Result<Vec<BucketEntry>> {
) -> Vec<BucketEntry> {
// Generate the full list of buckets without gaps.
//
// The bounds are the min max from the current buckets, optionally extended by
@@ -485,7 +436,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
.map(|intermediate_bucket| {
BucketEntry::from_intermediate_and_req(intermediate_bucket, sub_aggregation)
})
.collect::<crate::Result<Vec<_>>>()
.collect_vec()
}
// Convert to BucketEntry
@@ -493,7 +444,7 @@ pub(crate) fn intermediate_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &AggregationsInternal,
) -> crate::Result<Vec<BucketEntry>> {
) -> Vec<BucketEntry> {
if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no
// gaps, since intermediate result does not contain empty buckets (filtered to
@@ -505,7 +456,7 @@ pub(crate) fn intermediate_buckets_to_final_buckets(
.into_iter()
.filter(|bucket| bucket.doc_count >= histogram_req.min_doc_count())
.map(|bucket| BucketEntry::from_intermediate_and_req(bucket, sub_aggregation))
.collect::<crate::Result<Vec<_>>>()
.collect_vec()
}
}
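A self-contained sketch (assumed names, not this crate's API) of what the gap filling above produces for min_doc_count == 0: every grid position between the first and last occupied bucket gets an entry, with doc_count 0 where the intermediate result had no bucket.

fn fill_gaps(existing: &[(f64, u64)], interval: f64) -> Vec<(f64, u64)> {
    // Assumes `existing` is non-empty and sorted by key, on the same grid.
    let (min, max) = (existing[0].0, existing[existing.len() - 1].0);
    let mut out = Vec::new();
    let mut iter = existing.iter().peekable();
    let mut key = min;
    while key <= max {
        match iter.peek() {
            // An intermediate bucket exists at this grid position: keep it.
            Some(&&(k, count)) if (k - key).abs() < interval / 2.0 => {
                out.push((k, count));
                iter.next();
            }
            // No bucket at this position: emit an empty one.
            _ => out.push((key, 0)),
        }
        key += interval;
    }
    out
}

// fill_gaps(&[(10.0, 3), (30.0, 1)], 10.0) == [(10.0, 3), (20.0, 0), (30.0, 1)]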
@@ -679,9 +630,41 @@ mod tests {
};
use crate::aggregation::metric::{AverageAggregation, StatsAggregation};
use crate::aggregation::tests::{
exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_from_values, get_test_index_with_num_docs,
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
};
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::IndexRecordOption;
use crate::{Index, Term};
fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
exec_request_with_query(agg_req, index, None)
}
fn exec_request_with_query(
agg_req: Aggregations,
index: &Index,
query: Option<(&str, &str)>,
) -> crate::Result<Value> {
let collector = AggregationCollector::from_aggs(agg_req);
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res = if let Some((field, term)) = query {
let text_field = reader.searcher().schema().get_field(field).unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, term),
IndexRecordOption::Basic,
);
searcher.search(&term_query, &collector)?
} else {
searcher.search(&AllQuery, &collector)?
};
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
Ok(res)
}
#[test]
fn histogram_test_crooked_values() -> crate::Result<()> {


@@ -9,132 +9,8 @@
mod histogram;
mod range;
mod term_agg;
use std::collections::HashMap;
pub(crate) use histogram::SegmentHistogramCollector;
pub use histogram::*;
pub(crate) use range::SegmentRangeCollector;
pub use range::*;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
pub use term_agg::*;
/// Order for buckets in a bucket aggregation.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
pub enum Order {
/// Asc order
#[serde(rename = "asc")]
Asc,
/// Desc order
#[serde(rename = "desc")]
Desc,
}
impl Default for Order {
fn default() -> Self {
Order::Desc
}
}
#[derive(Clone, Debug, PartialEq)]
/// Order property by which to apply the order
pub enum OrderTarget {
/// The key of the bucket
Key,
/// The doc count of the bucket
Count,
/// Order by the value of the sub aggregation metric identified by the given `String`.
///
/// Only single value metrics are supported currently
SubAggregation(String),
}
impl Default for OrderTarget {
fn default() -> Self {
OrderTarget::Count
}
}
impl From<&str> for OrderTarget {
fn from(val: &str) -> Self {
match val {
"_key" => OrderTarget::Key,
"_count" => OrderTarget::Count,
_ => OrderTarget::SubAggregation(val.to_string()),
}
}
}
impl ToString for OrderTarget {
fn to_string(&self) -> String {
match self {
OrderTarget::Key => "_key".to_string(),
OrderTarget::Count => "_count".to_string(),
OrderTarget::SubAggregation(agg) => agg.to_string(),
}
}
}
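The mapping above round-trips; a quick illustration (runnable wherever OrderTarget is in scope):

assert_eq!(OrderTarget::from("_key").to_string(), "_key");
assert!(matches!(
    OrderTarget::from("avg_price"),
    OrderTarget::SubAggregation(name) if name == "avg_price"
));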
/// Set the order. target is either "_count", "_key", or the name of
/// a metric sub_aggregation.
///
/// De/Serializes to Elasticsearch-compatible JSON.
///
/// Examples in JSON format:
/// { "_count": "asc" }
/// { "_key": "asc" }
/// { "average_price": "asc" }
#[derive(Clone, Default, Debug, PartialEq)]
pub struct CustomOrder {
/// The target property by which to sort by
pub target: OrderTarget,
/// The order asc or desc
pub order: Order,
}
impl Serialize for CustomOrder {
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
where S: Serializer {
let map: HashMap<String, Order> =
std::iter::once((self.target.to_string(), self.order)).collect();
map.serialize(serializer)
}
}
impl<'de> Deserialize<'de> for CustomOrder {
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
where D: Deserializer<'de> {
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
if let Some((key, value)) = map.into_iter().next() {
Ok(CustomOrder {
target: key.as_str().into(),
order: value,
})
} else {
Err(de::Error::custom(
"unexpected empty map in order".to_string(),
))
}
})
}
}
#[test]
fn custom_order_serde_test() {
let order = CustomOrder {
target: OrderTarget::Key,
order: Order::Desc,
};
let order_str = serde_json::to_string(&order).unwrap();
assert_eq!(order_str, "{\"_key\":\"desc\"}");
let order_deser = serde_json::from_str(&order_str).unwrap();
assert_eq!(order, order_deser);
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
assert!(order_deser.is_err());
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
assert!(order_deser.is_err());
}


@@ -1,4 +1,3 @@
use std::fmt::Debug;
use std::ops::Range;
use serde::{Deserialize, Serialize};
@@ -6,10 +5,10 @@ use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
use crate::aggregation::segment_agg_result::{
SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
use crate::fastfield::FastFieldReader;
use crate::schema::Type;
@@ -39,12 +38,12 @@ use crate::{DocId, TantivyError};
/// # Request JSON Format
/// ```json
/// {
/// "my_ranges": {
/// "range": {
/// "field": "score",
/// "ranges": [
/// { "to": 3.0 },
/// { "from": 3.0, "to": 7.0 },
/// { "from": 7.0, "to": 20.0 },
/// { "from": 7.0, "to": 20.0 }
/// { "from": 20.0 }
/// ]
/// }
@@ -103,72 +102,22 @@ pub struct SegmentRangeCollector {
field_type: Type,
}
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
/// The from range of the bucket. Equals f64::MIN when None.
pub from: Option<f64>,
/// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
/// inclusive.
pub to: Option<f64>,
}
impl Debug for SegmentRangeBucketEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeBucketEntry")
.field("key", &self.key)
.field("doc_count", &self.doc_count)
.field("from", &self.from)
.field("to", &self.to)
.finish()
}
}
impl SegmentRangeBucketEntry {
pub(crate) fn into_intermediate_bucket_entry(
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateRangeBucketEntry> {
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry {
key: self.key,
doc_count: self.doc_count,
sub_aggregation,
from: self.from,
to: self.to,
})
}
}
impl SegmentRangeCollector {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
let field_type = self.field_type;
let buckets = self
.buckets
.into_iter()
.map(move |range_bucket| {
Ok((
(
range_to_string(&range_bucket.range, &field_type),
range_bucket
.bucket
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
))
range_bucket.bucket.into(),
)
})
.collect::<crate::Result<_>>()?;
.collect();
Ok(IntermediateBucketResult::Range(
IntermediateRangeBucketResult { buckets },
))
IntermediateBucketResult::Range(buckets)
}
pub(crate) fn from_req_and_validate(
@@ -226,15 +175,11 @@ impl SegmentRangeCollector {
force_flush: bool,
) {
let mut iter = doc.chunks_exact(4);
let accessor = bucket_with_accessor
.accessor
.as_single()
.expect("unexpected fast field cardinatility");
for docs in iter.by_ref() {
let val1 = accessor.get(docs[0]);
let val2 = accessor.get(docs[1]);
let val3 = accessor.get(docs[2]);
let val4 = accessor.get(docs[3]);
let val1 = bucket_with_accessor.accessor.get(docs[0]);
let val2 = bucket_with_accessor.accessor.get(docs[1]);
let val3 = bucket_with_accessor.accessor.get(docs[2]);
let val4 = bucket_with_accessor.accessor.get(docs[3]);
let bucket_pos1 = self.get_bucket_pos(val1);
let bucket_pos2 = self.get_bucket_pos(val2);
let bucket_pos3 = self.get_bucket_pos(val3);
@@ -246,7 +191,7 @@ impl SegmentRangeCollector {
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
}
for doc in iter.remainder() {
let val = accessor.get(*doc);
let val = bucket_with_accessor.accessor.get(*doc);
let bucket_pos = self.get_bucket_pos(val);
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
}
@@ -401,8 +346,7 @@ mod tests {
ranges,
};
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type)
.expect("unexpected error")
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
}
#[test]
@@ -543,7 +487,11 @@ mod tests {
#[test]
fn range_binary_search_test_f64() {
let ranges = vec![(10.0..100.0).into()];
let ranges = vec![
//(f64::MIN..10.0).into(),
(10.0..100.0).into(),
//(100.0..f64::MAX).into(),
];
let collector = get_collector_from_ranges(ranges, Type::F64);
let search = |val: u64| collector.get_bucket_pos(val);

File diff suppressed because it is too large.


@@ -86,18 +86,17 @@ impl Collector for AggregationCollector {
&self,
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
AggregationResults::from_intermediate_and_req(res, self.agg.clone())
merge_fruits(segment_fruits)
.map(|res| AggregationResults::from_intermediate_and_req(res, self.agg.clone()))
}
}
fn merge_fruits(
mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
mut segment_fruits: Vec<IntermediateAggregationResults>,
) -> crate::Result<IntermediateAggregationResults> {
if let Some(fruit) = segment_fruits.pop() {
let mut fruit = fruit?;
if let Some(mut fruit) = segment_fruits.pop() {
for next_fruit in segment_fruits {
fruit.merge_fruits(next_fruit?);
fruit.merge_fruits(next_fruit);
}
Ok(fruit)
} else {
@@ -107,7 +106,7 @@ fn merge_fruits(
/// AggregationSegmentCollector does the aggregation collection on a segment.
pub struct AggregationSegmentCollector {
aggs_with_accessor: AggregationsWithAccessor,
aggs: AggregationsWithAccessor,
result: SegmentAggregationResultsCollector,
}
@@ -122,24 +121,22 @@ impl AggregationSegmentCollector {
let result =
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
Ok(AggregationSegmentCollector {
aggs_with_accessor,
aggs: aggs_with_accessor,
result,
})
}
}
impl SegmentCollector for AggregationSegmentCollector {
type Fruit = crate::Result<IntermediateAggregationResults>;
type Fruit = IntermediateAggregationResults;
#[inline]
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
self.result.collect(doc, &self.aggs_with_accessor);
self.result.collect(doc, &self.aggs);
}
fn harvest(mut self) -> Self::Fruit {
self.result
.flush_staged_docs(&self.aggs_with_accessor, true);
self.result
.into_intermediate_aggregations_result(&self.aggs_with_accessor)
self.result.flush_staged_docs(&self.aggs, true);
self.result.into()
}
}


@@ -9,16 +9,12 @@ use itertools::Itertools;
use serde::{Deserialize, Serialize};
use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation};
use super::agg_result::BucketResult;
use super::bucket::{
cut_off_buckets, get_agg_name_and_property, GetDocCount, Order, OrderTarget,
SegmentHistogramBucketEntry, TermsAggregation,
};
use super::metric::{IntermediateAverage, IntermediateStats};
use super::segment_agg_result::SegmentMetricResultCollector;
use super::segment_agg_result::{
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentHistogramBucketEntry,
SegmentMetricResultCollector, SegmentRangeBucketEntry,
};
use super::{Key, SerializedKey, VecWithNames};
use crate::aggregation::agg_result::{AggregationResults, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
/// intermediate results.
@@ -28,6 +24,15 @@ pub struct IntermediateAggregationResults {
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
}
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
fn from(tree: SegmentAggregationResultsCollector) -> Self {
let metrics = tree.metrics.map(VecWithNames::from_other);
let buckets = tree.buckets.map(VecWithNames::from_other);
Self { metrics, buckets }
}
}
impl IntermediateAggregationResults {
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
let metrics = if req.metrics.is_empty() {
@@ -157,21 +162,29 @@ impl IntermediateMetricResult {
pub enum IntermediateBucketResult {
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
/// sub_aggregations.
Range(IntermediateRangeBucketResult),
Range(FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>),
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
Histogram {
/// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>,
},
/// Term aggregation
Terms(IntermediateTermBucketResult),
}
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
fn from(collector: SegmentBucketResultCollector) -> Self {
match collector {
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
SegmentBucketResultCollector::Histogram(histogram) => {
histogram.into_intermediate_bucket_result()
}
}
}
}
impl IntermediateBucketResult {
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
match req {
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
BucketAggregationType::Histogram(_) => {
IntermediateBucketResult::Histogram { buckets: vec![] }
@@ -181,34 +194,24 @@ impl IntermediateBucketResult {
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
match (self, other) {
(
IntermediateBucketResult::Terms(term_res_left),
IntermediateBucketResult::Terms(term_res_right),
IntermediateBucketResult::Range(entries_left),
IntermediateBucketResult::Range(entries_right),
) => {
merge_maps(&mut term_res_left.entries, term_res_right.entries);
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
term_res_left.doc_count_error_upper_bound +=
term_res_right.doc_count_error_upper_bound;
}
(
IntermediateBucketResult::Range(range_res_left),
IntermediateBucketResult::Range(range_res_right),
) => {
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
merge_maps(entries_left, entries_right);
}
(
IntermediateBucketResult::Histogram {
buckets: buckets_left,
buckets: entries_left,
..
},
IntermediateBucketResult::Histogram {
buckets: buckets_right,
buckets: entries_right,
..
},
) => {
let buckets = buckets_left
let mut buckets = entries_left
.drain(..)
.merge_join_by(buckets_right.into_iter(), |left, right| {
.merge_join_by(entries_right.into_iter(), |left, right| {
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
})
.map(|either| match either {
@@ -221,7 +224,7 @@ impl IntermediateBucketResult {
})
.collect();
*buckets_left = buckets;
std::mem::swap(entries_left, &mut buckets);
}
(IntermediateBucketResult::Range(_), _) => {
panic!("try merge on different types")
@@ -229,118 +232,10 @@ impl IntermediateBucketResult {
(IntermediateBucketResult::Histogram { .. }, _) => {
panic!("try merge on different types")
}
(IntermediateBucketResult::Terms { .. }, _) => {
panic!("try merge on different types")
}
}
}
}
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Range aggregation including error counts
pub struct IntermediateRangeBucketResult {
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
}
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts
pub struct IntermediateTermBucketResult {
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64,
}
impl IntermediateTermBucketResult {
pub(crate) fn into_final_result(
self,
req: &TermsAggregation,
sub_aggregation_req: &AggregationsInternal,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
.entries
.into_iter()
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
.map(|(key, entry)| {
Ok(BucketEntry {
key: Key::Str(key),
doc_count: entry.doc_count,
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
entry.sub_aggregation,
sub_aggregation_req,
)?,
})
})
.collect::<crate::Result<_>>()?;
let order = req.order.order;
match req.order.target {
OrderTarget::Key => {
buckets.sort_by(|left, right| {
if req.order.order == Order::Desc {
left.key.partial_cmp(&right.key)
} else {
right.key.partial_cmp(&left.key)
}
.expect("expected type string, which is always sortable")
});
}
OrderTarget::Count => {
if req.order.order == Order::Desc {
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
} else {
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
}
}
OrderTarget::SubAggregation(name) => {
let (agg_name, agg_property) = get_agg_name_and_property(&name);
let mut buckets_with_val = buckets
.into_iter()
.map(|bucket| {
let val = bucket
.sub_aggregation
.get_value_from_aggregation(agg_name, agg_property)?
.unwrap_or(f64::NAN);
Ok((bucket, val))
})
.collect::<crate::Result<Vec<_>>>()?;
buckets_with_val.sort_by(|(_, val1), (_, val2)| {
// TODO use total_cmp in next rust stable release
match &order {
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
}
});
buckets = buckets_with_val
.into_iter()
.map(|(bucket, _val)| bucket)
.collect_vec();
}
}
// We ignore _term_doc_count_before_cutoff here, because it increases the upper-bound error
// only for terms that didn't make it into the top N.
//
// This can be interesting, as a value of quality of the results, but not good to check the
// actual error count for the returned terms.
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
cut_off_buckets(&mut buckets, req.size as usize);
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
Some(self.doc_count_error_upper_bound)
} else {
None
};
Ok(BucketResult::Terms {
buckets,
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
doc_count_error_upper_bound,
})
}
}
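A minimal sketch (assumed semantics, not this crate's exact signature) of the cut-off step used above: keep the top `size` buckets and report the doc counts that fell out, which feeds sum_other_doc_count.

fn cut_off<T: GetDocCount>(buckets: &mut Vec<T>, size: usize) -> u64 {
    // Sum the doc counts of every bucket beyond the requested size...
    let cut_off_count: u64 = buckets.iter().skip(size).map(|b| b.doc_count()).sum();
    // ...then drop those buckets.
    buckets.truncate(size);
    cut_off_count
}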
trait MergeFruits {
fn merge_fruits(&mut self, other: Self);
}
@@ -382,6 +277,26 @@ impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
}
}
impl
From<(
SegmentHistogramBucketEntry,
SegmentAggregationResultsCollector,
)> for IntermediateHistogramBucketEntry
{
fn from(
entry: (
SegmentHistogramBucketEntry,
SegmentAggregationResultsCollector,
),
) -> Self {
IntermediateHistogramBucketEntry {
key: entry.0.key,
doc_count: entry.0.doc_count,
sub_aggregation: entry.1.into(),
}
}
}
/// This is the range entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
@@ -390,6 +305,7 @@ pub struct IntermediateRangeBucketEntry {
pub key: Key,
/// The number of documents in the bucket.
pub doc_count: u64,
pub(crate) values: Option<Vec<u64>>,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
/// The from range of the bucket. Equals f64::MIN when None.
@@ -400,20 +316,22 @@ pub struct IntermediateRangeBucketEntry {
pub to: Option<f64>,
}
/// This is the term entry for a bucket, which contains a count, and optionally
/// sub_aggregations.
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateTermBucketEntry {
/// The number of documents in the bucket.
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
}
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
fn from(entry: SegmentRangeBucketEntry) -> Self {
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
sub_aggregation.into()
} else {
Default::default()
};
impl MergeFruits for IntermediateTermBucketEntry {
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
IntermediateRangeBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
values: None,
sub_aggregation,
to: entry.to,
from: entry.from,
}
}
}
@@ -448,6 +366,7 @@ mod tests {
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
doc_count: *doc_count,
values: None,
sub_aggregation: Default::default(),
from: None,
to: None,
@@ -456,7 +375,7 @@ mod tests {
}
map.insert(
"my_agg_level2".to_string(),
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
IntermediateBucketResult::Range(buckets),
);
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
@@ -475,6 +394,7 @@ mod tests {
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
doc_count: *doc_count,
values: None,
from: None,
to: None,
sub_aggregation: get_sub_test_tree(&[(
@@ -486,7 +406,7 @@ mod tests {
}
map.insert(
"my_agg_level1".to_string(),
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
IntermediateBucketResult::Range(buckets),
);
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),


@@ -19,7 +19,7 @@ use crate::DocId;
/// "avg": {
/// "field": "score",
/// }
/// }
/// }
/// ```
pub struct AverageAggregation {
/// The field name to compute the stats on.


@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
use crate::aggregation::f64_from_fastfield_u64;
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
use crate::schema::Type;
use crate::{DocId, TantivyError};
use crate::DocId;
/// A multi-value metric aggregation that computes stats of numeric values that are
/// extracted from the aggregated documents.
@@ -53,23 +53,6 @@ pub struct Stats {
pub avg: Option<f64>,
}
impl Stats {
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
match agg_property {
"count" => Ok(Some(self.count as f64)),
"sum" => Ok(Some(self.sum)),
"standard_deviation" => Ok(self.standard_deviation),
"min" => Ok(self.min),
"max" => Ok(self.max),
"avg" => Ok(self.avg),
_ => Err(TantivyError::InvalidArgument(format!(
"unknown property {} on stats metric aggregation",
agg_property
))),
}
}
}
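A hedged usage sketch of the property lookup above (the field values are illustrative, and direct struct construction assumes all Stats fields mirror the properties handled in get_value):

let stats = Stats {
    count: 2,
    sum: 3.0,
    standard_deviation: Some(0.5),
    min: Some(1.0),
    max: Some(2.0),
    avg: Some(1.5),
};
assert_eq!(stats.get_value("avg").unwrap(), Some(1.5));
assert_eq!(stats.get_value("count").unwrap(), Some(2.0));
assert!(stats.get_value("median").is_err()); // unknown property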
/// IntermediateStats contains the mergeable version for stats.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {


@@ -37,7 +37,6 @@
//! - [Bucket](bucket)
//! - [Histogram](bucket::HistogramAggregation)
//! - [Range](bucket::RangeAggregation)
//! - [Terms](bucket::TermsAggregation)
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
@@ -148,8 +147,7 @@
//! IntermediateAggregationResults provides the
//! [merge_fruits](intermediate_agg_result::IntermediateAggregationResults::merge_fruits) method to
//! merge multiple results. The merged result can then be converted into
//! [agg_result::AggregationResults] via the
//! [agg_result::AggregationResults::from_intermediate_and_req] method.
//! [agg_result::AggregationResults] via the [Into] trait.
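A hedged sketch of the distributed flow described above (identifiers from this module; gathering the per-node results and error handling are elided, and the final conversion may return a Result depending on the version):

// Each node/segment produced an IntermediateAggregationResults.
let mut merged = partial_results.pop().expect("at least one partial result");
for other in partial_results {
    merged.merge_fruits(other);
}
// Convert the merged intermediate tree into the user-facing result.
let final_result = AggregationResults::from_intermediate_and_req(merged, agg_req);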
pub mod agg_req;
mod agg_req_with_accessor;
@@ -247,14 +245,6 @@ impl<T: Clone> VecWithNames<T> {
fn is_empty(&self) -> bool {
self.keys.is_empty()
}
fn len(&self) -> usize {
self.keys.len()
}
fn get(&self, name: &str) -> Option<&T> {
self.keys()
.position(|key| key == name)
.map(|pos| &self.values[pos])
}
}
/// The serialized key is used in a HashMap.
@@ -321,16 +311,13 @@ mod tests {
use super::bucket::RangeAggregation;
use super::collector::AggregationCollector;
use super::metric::AverageAggregation;
use crate::aggregation::agg_req::{
get_term_dict_field_names, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::bucket::TermsAggregation;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
@@ -349,80 +336,17 @@ mod tests {
)
}
pub fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
exec_request_with_query(agg_req, index, None)
}
pub fn exec_request_with_query(
agg_req: Aggregations,
index: &Index,
query: Option<(&str, &str)>,
) -> crate::Result<Value> {
let collector = AggregationCollector::from_aggs(agg_req);
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res = if let Some((field, term)) = query {
let text_field = reader.searcher().schema().get_field(field).unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, term),
IndexRecordOption::Basic,
);
searcher.search(&term_query, &collector)?
} else {
searcher.search(&AllQuery, &collector)?
};
// Test serialization/deserialization roundtrip
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
Ok(res)
}
pub fn get_test_index_from_values(
merge_segments: bool,
values: &[f64],
) -> crate::Result<Index> {
// Every value gets its own segment
let mut segment_and_values = vec![];
for value in values {
segment_and_values.push(vec![(*value, value.to_string())]);
}
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
}
pub fn get_test_index_from_terms(
merge_segments: bool,
values: &[Vec<&str>],
) -> crate::Result<Index> {
// Every value gets its own segment
let segment_and_values = values
.iter()
.map(|terms| {
terms
.iter()
.enumerate()
.map(|(i, term)| (i as f64, term.to_string()))
.collect()
})
.collect::<Vec<_>>();
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
}
pub fn get_test_index_from_values_and_terms(
merge_segments: bool,
segment_and_values: &[Vec<(f64, String)>],
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast()
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -435,20 +359,15 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
for values in segment_and_values {
for (i, term) in values {
let i = *i;
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
text_field_id => term.to_string(),
string_field_id => term.to_string(),
score_field => i as u64,
score_field_f64 => i as f64,
score_field_i64 => i as i64,
fraction_field => i as f64/100.0,
))?;
}
for &i in values {
// writing the segment
index_writer.add_document(doc!(
text_field => "cool",
score_field => i as u64,
score_field_f64 => i as f64,
score_field_i64 => i as i64,
fraction_field => i as f64/100.0,
))?;
index_writer.commit()?;
}
}
@@ -469,13 +388,15 @@ mod tests {
merge_segments: bool,
use_distributed_collector: bool,
) -> crate::Result<()> {
let mut values_and_terms = (0..80)
.map(|val| vec![(val as f64, "terma".to_string())])
.collect::<Vec<_>>();
values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string();
let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?;
let index = get_test_index_with_num_docs(merge_segments, 80)?;
let reader = index.reader()?;
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
assert_eq!(DOC_BLOCK_SIZE, 64);
// In the tree we cache documents up to DOC_BLOCK_SIZE, before passing them down as one block.
@@ -520,19 +441,6 @@ mod tests {
}
}
}
},
"term_agg_test":{
"terms": {
"field": "string_id"
},
"aggs": {
"bucketsL2": {
"histogram": {
"field": "score",
"interval": 70.0
}
}
}
}
});
@@ -545,15 +453,14 @@ mod tests {
let searcher = reader.searcher();
AggregationResults::from_intermediate_and_req(
searcher.search(&AllQuery, &collector).unwrap(),
searcher.search(&term_query, &collector).unwrap(),
agg_req,
)
.unwrap()
} else {
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
searcher.search(&term_query, &collector).unwrap()
};
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
@@ -583,46 +490,6 @@ mod tests {
);
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70);
assert_eq!(
res["term_agg_test"],
json!(
{
"buckets": [
{
"bucketsL2": {
"buckets": [
{
"doc_count": 70,
"key": 0.0
},
{
"doc_count": 9,
"key": 70.0
}
]
},
"doc_count": 79,
"key": "terma"
},
{
"bucketsL2": {
"buckets": [
{
"doc_count": 1,
"key": 70.0
}
]
},
"doc_count": 1,
"key": "termb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
)
);
Ok(())
}
@@ -640,10 +507,8 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast()
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
schema_builder.add_text_field("dummy_text", STRING);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -848,21 +713,10 @@ mod tests {
IndexRecordOption::Basic,
);
let sub_agg_req: Aggregations = vec![
("average_in_range".to_string(), get_avg_req("score")),
(
"term_agg".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
),
]
.into_iter()
.collect();
let sub_agg_req: Aggregations =
vec![("average_in_range".to_string(), get_avg_req("score"))]
.into_iter()
.collect();
let agg_req: Aggregations = if use_elastic_json_req {
let elasticsearch_compatible_json_req = r#"
{
@@ -878,8 +732,7 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
"average_in_range": { "avg": { "field": "score" } }
}
},
"rangei64": {
@@ -894,8 +747,7 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": {
@@ -913,8 +765,7 @@ mod tests {
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } },
"term_agg": { "terms": { "field": "text" } }
"average_in_range": { "avg": { "field": "score" } }
}
}
}
@@ -973,9 +824,6 @@ mod tests {
agg_req
};
let field_names = get_term_dict_field_names(&agg_req);
assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
let agg_res: AggregationResults = if use_distributed_collector {
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
@@ -984,7 +832,7 @@ mod tests {
// Test de/serialization roundtrip on intermediate_agg_result
let res: IntermediateAggregationResults =
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
AggregationResults::from_intermediate_and_req(res, agg_req.clone()).unwrap()
AggregationResults::from_intermediate_and_req(res, agg_req.clone())
} else {
let collector = AggregationCollector::from_aggs(agg_req.clone());
@@ -1116,10 +964,10 @@ mod tests {
searcher.search(&AllQuery, &collector).unwrap_err()
};
let agg_res = avg_on_field("dummy_text");
let agg_res = avg_on_field("text");
assert_eq!(
format!("{:?}", agg_res),
r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#
r#"InvalidArgument("Only single value fast fields of type f64, u64, i64 are supported, but got Str ")"#
);
let agg_res = avg_on_field("not_exist_field");
@@ -1131,7 +979,7 @@ mod tests {
let agg_res = avg_on_field("scores_i64");
assert_eq!(
format!("{:?}", agg_res),
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
r#"InvalidArgument("Invalid field type in aggregation I64, only Cardinality::SingleValue supported")"#
);
Ok(())
@@ -1140,12 +988,11 @@ mod tests {
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::SliceRandom;
use rand::{thread_rng, Rng};
use test::{self, Bencher};
use super::*;
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds, TermsAggregation};
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds};
use crate::aggregation::metric::StatsAggregation;
use crate::query::AllQuery;
@@ -1157,10 +1004,6 @@ mod tests {
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let text_field_many_terms =
schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms =
schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype =
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
@@ -1168,10 +1011,6 @@ mod tests {
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let many_terms_data = (0..15_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = thread_rng();
let mut index_writer = index.writer_for_tests()?;
@@ -1180,8 +1019,6 @@ mod tests {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
index_writer.add_document(doc!(
text_field => "cool",
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => val as f64,
score_field_i64 => val as i64,
@@ -1333,64 +1170,6 @@ mod tests {
});
}
#[bench]
fn bench_aggregation_terms_few(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_few_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_terms_many(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req);
let searcher = reader.searcher();
let agg_res: AggregationResults =
searcher.search(&AllQuery, &collector).unwrap().into();
agg_res
});
}
#[bench]
fn bench_aggregation_range_only(b: &mut Bencher) {
let index = get_test_index_bench(false).unwrap();

View File

@@ -9,12 +9,11 @@ use super::agg_req::MetricAggregation;
use super::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
};
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector};
use super::metric::{
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
};
use super::VecWithNames;
use super::{Key, VecWithNames};
use crate::aggregation::agg_req::BucketAggregationType;
use crate::DocId;
@@ -29,17 +28,6 @@ pub(crate) struct SegmentAggregationResultsCollector {
num_staged_docs: usize,
}
impl Default for SegmentAggregationResultsCollector {
fn default() -> Self {
Self {
metrics: Default::default(),
buckets: Default::default(),
staged_docs: [0; DOC_BLOCK_SIZE],
num_staged_docs: Default::default(),
}
}
}
impl Debug for SegmentAggregationResultsCollector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentAggregationResultsCollector")
@@ -52,25 +40,6 @@ impl Debug for SegmentAggregationResultsCollector {
}
impl SegmentAggregationResultsCollector {
pub fn into_intermediate_aggregations_result(
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
let buckets = if let Some(buckets) = self.buckets {
let entries = buckets
.into_iter()
.zip(agg_with_accessor.buckets.values())
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
.collect::<crate::Result<Vec<(String, _)>>>()?;
Some(VecWithNames::from_entries(entries))
} else {
None
};
let metrics = self.metrics.map(VecWithNames::from_other);
Ok(IntermediateAggregationResults { metrics, buckets })
}
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
let buckets = req
.buckets
@@ -128,9 +97,6 @@ impl SegmentAggregationResultsCollector {
agg_with_accessor: &AggregationsWithAccessor,
force_flush: bool,
) {
if self.num_staged_docs == 0 {
return;
}
if let Some(metrics) = &mut self.metrics {
for (collector, agg_with_accessor) in
metrics.values_mut().zip(agg_with_accessor.metrics.values())
@@ -196,40 +162,12 @@ impl SegmentMetricResultCollector {
#[derive(Clone, Debug, PartialEq)]
pub(crate) enum SegmentBucketResultCollector {
Range(SegmentRangeCollector),
Histogram(Box<SegmentHistogramCollector>),
Terms(Box<SegmentTermCollector>),
Histogram(SegmentHistogramCollector),
}
impl SegmentBucketResultCollector {
pub fn into_intermediate_bucket_result(
self,
agg_with_accessor: &BucketAggregationWithAccessor,
) -> crate::Result<IntermediateBucketResult> {
match self {
SegmentBucketResultCollector::Terms(terms) => {
terms.into_intermediate_bucket_result(agg_with_accessor)
}
SegmentBucketResultCollector::Range(range) => {
range.into_intermediate_bucket_result(agg_with_accessor)
}
SegmentBucketResultCollector::Histogram(histogram) => {
histogram.into_intermediate_bucket_result(agg_with_accessor)
}
}
}
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
match &req.bucket_agg {
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
SegmentTermCollector::from_req_and_validate(
terms_req,
&req.sub_aggregation,
req.field_type,
req.accessor
.as_multi()
.expect("unexpected fast field cardinality"),
)?,
))),
BucketAggregationType::Range(range_req) => {
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
range_req,
@@ -237,16 +175,14 @@ impl SegmentBucketResultCollector {
req.field_type,
)?))
}
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(
SegmentHistogramCollector::from_req_and_validate(
histogram,
&req.sub_aggregation,
req.field_type,
req.accessor
.as_single()
.expect("unexpected fast field cardinality"),
&req.accessor,
)?,
))),
)),
}
}
@@ -264,9 +200,34 @@ impl SegmentBucketResultCollector {
SegmentBucketResultCollector::Histogram(histogram) => {
histogram.collect_block(doc, bucket_with_accessor, force_flush)
}
SegmentBucketResultCollector::Terms(terms) => {
terms.collect_block(doc, bucket_with_accessor, force_flush)
}
}
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentHistogramBucketEntry {
pub key: f64,
pub doc_count: u64,
}
#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
pub key: Key,
pub doc_count: u64,
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
/// The lower bound of the bucket's range. Equals f64::MIN when None.
pub from: Option<f64>,
/// The upper bound of the bucket's range. Equals f64::MAX when None.
pub to: Option<f64>,
}
impl Debug for SegmentRangeBucketEntry {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentRangeBucketEntry")
.field("key", &self.key)
.field("doc_count", &self.doc_count)
.field("from", &self.from)
.field("to", &self.to)
.finish()
}
}
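The `from`/`to` semantics are easiest to see on an open-ended range bucket. A hypothetical sketch (the values and the `Key::Str` variant name are assumptions for illustration, not taken from this diff):

    // Bucket for an Elasticsearch-style range "*-10": no lower bound.
    let bucket = SegmentRangeBucketEntry {
        key: Key::Str("*-10".to_string()), // assumed Key variant
        doc_count: 3,
        sub_aggregation: None,
        from: None,        // no lower bound, treated as f64::MIN
        to: Some(10.0),
    };
    assert_eq!(bucket.from.unwrap_or(f64::MIN), f64::MIN);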

View File

@@ -273,18 +273,18 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
let all_query = AllQuery;
let week_histogram_collector = HistogramCollector::new(
date_field,
DateTime::from_primitive(
DateTime::new_primitive(
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
),
3600 * 24 * 365, // roughly one year in seconds; leap years are ignored in this unit test

View File

@@ -26,11 +26,11 @@ pub fn test_filter_collector() -> crate::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::new_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::new_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
index_writer.commit()?;
let reader = index.reader()?;
@@ -55,7 +55,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
assert_eq!(filtered_top_docs.len(), 0);
fn date_filter(value: DateTime) -> bool {
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
(value.to_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
.whole_weeks()
> 0
}

View File

@@ -898,7 +898,7 @@ mod tests {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests()?;
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
let pr_birthday = DateTime::new_utc(OffsetDateTime::parse(
"1898-04-09T00:00:00+00:00",
&Rfc3339,
)?);
@@ -906,7 +906,7 @@ mod tests {
name => "Paul Robeson",
birthday => pr_birthday,
))?;
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
let mr_birthday = DateTime::new_utc(OffsetDateTime::parse(
"1947-11-08T00:00:00+00:00",
&Rfc3339,
)?);

View File

@@ -401,7 +401,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();

View File

@@ -35,7 +35,7 @@ const ZERO_ARRAY: [u8; 8] = [0u8; 8];
#[cfg(test)]
fn create_uuid() -> Uuid {
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY)
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
}
#[cfg(not(test))]
@@ -57,7 +57,7 @@ impl SegmentId {
/// Picking the first 8 chars is enough to identify
/// segments in a display message (e.g. a5c4dfcb).
pub fn short_uuid_string(&self) -> String {
(&self.0.as_simple().to_string()[..8]).to_string()
(&self.0.to_simple_ref().to_string()[..8]).to_string()
}
/// Returns a segment uuid string.
@@ -65,7 +65,7 @@ impl SegmentId {
/// It consists of 32 lowercase hexadecimal chars
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
pub fn uuid_string(&self) -> String {
self.0.as_simple().to_string()
self.0.to_simple_ref().to_string()
}
/// Build a `SegmentId` string from the full uuid string.

View File

@@ -169,7 +169,7 @@ impl SegmentReader {
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
let fast_fields_readers =
let fast_field_readers =
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
@@ -196,7 +196,7 @@ impl SegmentReader {
max_doc,
termdict_composite,
postings_composite,
fast_fields_readers,
fast_fields_readers: fast_field_readers,
fieldnorm_readers,
segment_id: segment.id(),
delete_opstamp: segment.meta().delete_opstamp(),

View File

@@ -97,10 +97,6 @@ pub enum TantivyError {
/// Index incompatible with current version of Tantivy.
#[error("{0:?}")]
IncompatibleIndex(Incompatibility),
/// An internal error occurred. These are internal states that should not be reached.
/// e.g. a data structure is incorrectly initialized.
#[error("Internal error: '{0}'")]
InternalError(String),
}
#[cfg(feature = "quickwit")]

View File

@@ -188,14 +188,14 @@ mod bench {
}
#[bench]
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alive_bitset_access(bench: &mut Bencher) {
fn bench_deletebitset_access(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
bench.iter(|| {
@@ -206,14 +206,14 @@ mod bench {
}
#[bench]
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
}
#[bench]
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
bench.iter(|| {

View File

@@ -167,7 +167,7 @@ impl FastValue for DateTime {
}
fn to_u64(&self) -> u64 {
self.into_unix_timestamp().to_u64()
self.to_unix_timestamp().to_u64()
}
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
@@ -178,7 +178,7 @@ impl FastValue for DateTime {
}
fn as_u64(&self) -> u64 {
self.into_unix_timestamp().as_u64()
self.to_unix_timestamp().as_u64()
}
fn to_type() -> Type {
@@ -196,31 +196,10 @@ fn value_to_u64(value: &Value) -> u64 {
}
}
/// The fast field type
pub enum FastFieldType {
/// Numeric type, e.g. f64.
Numeric,
/// Fast field stores string ids.
String,
/// Fast field stores string ids for facets.
Facet,
}
impl FastFieldType {
fn is_storing_term_ids(&self) -> bool {
matches!(self, FastFieldType::String | FastFieldType::Facet)
}
fn is_facet(&self) -> bool {
matches!(self, FastFieldType::Facet)
}
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use std::ops::Range;
use std::path::Path;
use common::HasLen;
@@ -232,7 +211,7 @@ mod tests {
use super::*;
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT};
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
use crate::time::OffsetDateTime;
use crate::{Index, SegmentId, SegmentReader};
@@ -254,7 +233,7 @@ mod tests {
#[test]
pub fn test_fastfield_i64_u64() {
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
let datetime = DateTime::new_utc(OffsetDateTime::UNIX_EPOCH);
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
}
@@ -413,8 +392,7 @@ mod tests {
serializer.close().unwrap();
}
let file = directory.open_read(path).unwrap();
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
assert_eq!(file.len(), 10175_usize); // linear interpol size
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
{
let fast_fields_composite = CompositeFile::open(&file)?;
let data = fast_fields_composite.open_read(i64_field).unwrap();
@@ -511,7 +489,7 @@ mod tests {
let mut index_writer = index.writer_for_tests().unwrap();
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer
.add_document(doc!(date_field =>DateTime::from_utc(OffsetDateTime::now_utc())))?;
.add_document(doc!(date_field =>DateTime::new_utc(OffsetDateTime::now_utc())))?;
index_writer.commit()?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
@@ -531,206 +509,7 @@ mod tests {
#[test]
fn test_default_datetime() {
assert_eq!(0, DateTime::make_zero().into_unix_timestamp());
}
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
let mut all = vec![];
for doc in docs {
let mut out = vec![];
ff.get_vals(doc, &mut out);
all.extend(out);
}
all
}
#[test]
fn test_text_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// first segment
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "BBBBB AAAAA", // term_ord 1,2
))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "AAAAA BBBBB", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..5),
vec![1, 0, 0, 0, 1, 2]
);
let mut out = vec![];
text_fast_field.get_vals(3, &mut out);
assert_eq!(out, vec![0, 1]);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 3);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
// default tokenizer applies lower case
assert_eq!(bytes, "aaaaa".as_bytes());
}
{
// second segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "CCCCC AAAAA", // term_ord 1, after merge 2
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let segment_reader = searcher.segment_reader(1);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..3), vec![0, 1, 0]);
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..8),
vec![1, 0, 0, 0, 1, 3 /* next segment */, 0, 2, 0]
);
Ok(())
}
#[test]
fn test_string_fastfield() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", STRING | FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
// first segment
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
text_field => "BBBBB", // term_ord 1
))?;
index_writer.add_document(doc!())?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
let inverted_index = segment_reader.inverted_index(text_field)?;
assert_eq!(inverted_index.terms().num_terms(), 3);
let mut bytes = vec![];
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
assert_eq!(bytes, "AAAAA".as_bytes());
}
{
// second segment
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(
text_field => "AAAAA", // term_ord 0
))?;
index_writer.add_document(doc!(
text_field => "CCCCC", // term_ord 1, after merge 2
))?;
index_writer.add_document(doc!())?;
index_writer.commit()?;
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 2);
let segment_reader = searcher.segment_reader(1);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let text_fast_field = fast_fields.u64s(text_field).unwrap();
assert_eq!(
get_vals_for_docs(&text_fast_field, 0..9),
vec![1, 0, 0, 3 /* next segment */, 0, 2]
);
Ok(())
assert_eq!(0, DateTime::make_zero().to_unix_timestamp());
}
#[test]
@@ -768,23 +547,23 @@ mod tests {
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
let mut dates = vec![];
{
assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64);
assert_eq!(date_fast_field.get(0u32).to_unix_timestamp(), 1i64);
dates_fast_field.get_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_unix_timestamp(), 2i64);
assert_eq!(dates[1].into_unix_timestamp(), 3i64);
assert_eq!(dates[0].to_unix_timestamp(), 2i64);
assert_eq!(dates[1].to_unix_timestamp(), 3i64);
}
{
assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64);
assert_eq!(date_fast_field.get(1u32).to_unix_timestamp(), 4i64);
dates_fast_field.get_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64);
assert_eq!(date_fast_field.get(2u32).to_unix_timestamp(), 0i64);
dates_fast_field.get_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_unix_timestamp(), 5i64);
assert_eq!(dates[1].into_unix_timestamp(), 6i64);
assert_eq!(dates[0].to_unix_timestamp(), 5i64);
assert_eq!(dates[1].to_unix_timestamp(), 6i64);
}
Ok(())
}

View File

@@ -71,24 +71,24 @@ mod tests {
let mut index_writer = index.writer_for_tests()?;
let first_time_stamp = OffsetDateTime::now_utc();
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp),
date_field => DateTime::from_utc(first_time_stamp),
date_field => DateTime::new_utc(first_time_stamp),
date_field => DateTime::new_utc(first_time_stamp),
time_i=>1i64))?;
index_writer.add_document(doc!(time_i => 0i64))?;
// add one second
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(1)),
time_i => 2i64))?;
// add another second
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
index_writer.add_document(doc!(
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::from_utc(two_secs_ahead),
date_field => DateTime::new_utc(two_secs_ahead),
date_field => DateTime::new_utc(two_secs_ahead),
date_field => DateTime::new_utc(two_secs_ahead),
time_i => 3i64))?;
// add three seconds
index_writer.add_document(doc!(
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(3)),
time_i => 4i64))?;
index_writer.commit()?;
@@ -113,7 +113,7 @@ mod tests {
.expect("cannot find value")
.as_date()
.unwrap(),
DateTime::from_utc(first_time_stamp),
DateTime::new_utc(first_time_stamp),
);
assert_eq!(
retrieved_doc
@@ -140,7 +140,7 @@ mod tests {
.expect("cannot find value")
.as_date()
.unwrap(),
DateTime::from_utc(two_secs_ahead)
DateTime::new_utc(two_secs_ahead)
);
assert_eq!(
retrieved_doc
@@ -181,7 +181,7 @@ mod tests {
.expect("cannot find value")
.as_date()
.expect("value not of Date type"),
DateTime::from_utc(first_time_stamp + Duration::seconds(offset_sec)),
DateTime::new_utc(first_time_stamp + Duration::seconds(offset_sec)),
);
assert_eq!(
retrieved_doc

View File

@@ -27,28 +27,22 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}
}
/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let end = self.idx_reader.get(doc + 1);
start..end
}
/// Returns the array of values associated to the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
let stop = self.idx_reader.get(doc + 1);
start..stop
}
/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
self.get_vals_for_range(range, vals);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
}
/// Returns the minimum value for this fast field.
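The `range`/`get_vals` pair above implements a classic two-level layout: `idx_reader` stores one start offset per document and `vals_reader` stores all values flattened. A sketch with hypothetical data, where `reader` is assumed to be a `MultiValuedFastFieldReader<u64>`:

    // idx_reader:  [0, 2, 2, 3]  -> doc i's values live at idx[i]..idx[i + 1]
    // vals_reader: [7, 3, 9]     -> per doc: [7, 3], [], [9]
    let mut vals: Vec<u64> = Vec::new();
    reader.get_vals(0, &mut vals);
    assert_eq!(vals, vec![7, 3]);
    reader.get_vals(1, &mut vals);
    assert!(vals.is_empty()); // range 2..2 is empty
    reader.get_vals(2, &mut vals);
    assert_eq!(vals, vec![9]);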

View File

@@ -4,7 +4,7 @@ use fnv::FnvHashMap;
use tantivy_bitpacker::minmax;
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
use crate::schema::{Document, Field};
@@ -38,17 +38,17 @@ pub struct MultiValuedFastFieldWriter {
field: Field,
vals: Vec<UnorderedTermId>,
doc_index: Vec<u64>,
fast_field_type: FastFieldType,
is_facet: bool,
}
impl MultiValuedFastFieldWriter {
/// Creates a new `MultiValuedFastFieldWriter`
pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self {
/// Creates a new `MultiValuedFastFieldWriter`
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
MultiValuedFastFieldWriter {
field,
vals: Vec::new(),
doc_index: Vec::new(),
fast_field_type,
is_facet,
}
}
@@ -77,13 +77,12 @@ impl MultiValuedFastFieldWriter {
/// all of the matching field values present in the document.
pub fn add_document(&mut self, doc: &Document) {
self.next_doc();
// facets/texts are indexed in the `SegmentWriter` as we encode their unordered id.
if self.fast_field_type.is_storing_term_ids() {
return;
}
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
if !self.is_facet {
for field_value in doc.field_values() {
if field_value.field == self.field {
self.add_val(value_to_u64(field_value.value()));
}
}
}
}
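For the non-facet path above, every matching value in the document is recorded immediately; facet term ids arrive later from the `SegmentWriter` via `add_val`. A hedged sketch against the `is_facet` signature shown above (the `field` and `doc!` setup is assumed from the usual test scaffolding):

    // Numeric multi-valued field: values are taken straight from the doc.
    let mut writer = MultiValuedFastFieldWriter::new(field, /* is_facet: */ false);
    let document = doc!(field => 3u64, field => 7u64);
    writer.add_document(&document); // records [3, 7] for this doc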
@@ -159,15 +158,15 @@ impl MultiValuedFastFieldWriter {
{
// writing the values themselves.
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
if let Some(mapping) = mapping_opt {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
match mapping_opt {
Some(mapping) => {
value_serializer = serializer.new_u64_fast_field_with_idx(
self.field,
0u64,
mapping.len() as u64,
1,
)?;
if self.fast_field_type.is_facet() {
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
for vals in self.get_ordered_values(doc_id_map) {
doc_vals.clear();
@@ -180,27 +179,19 @@ impl MultiValuedFastFieldWriter {
value_serializer.add_val(val)?;
}
}
} else {
}
None => {
let val_min_max = minmax(self.vals.iter().cloned());
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for vals in self.get_ordered_values(doc_id_map) {
let remapped_vals = vals
.iter()
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
for val in remapped_vals {
// sort values in case of remapped doc_ids?
for &val in vals {
value_serializer.add_val(val)?;
}
}
}
} else {
let val_min_max = minmax(self.vals.iter().cloned());
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
value_serializer =
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
for vals in self.get_ordered_values(doc_id_map) {
// sort values in case of remapped doc_ids?
for &val in vals {
value_serializer.add_val(val)?;
}
}
}
value_serializer.close_field()?;
}

View File

@@ -6,12 +6,17 @@ use common::BinarySerializable;
use fastfield_codecs::bitpacked::{
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
};
#[allow(deprecated)]
use fastfield_codecs::linearinterpol::{
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
};
#[allow(deprecated)]
use fastfield_codecs::multilinearinterpol::{
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
};
use fastfield_codecs::piecewise_linear::{
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
};
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
use super::FastValue;
@@ -71,6 +76,8 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
/// Blockwise linear interpolated values + bitpacked
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
/// Piecewise linear interpolated values + bitpacked
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
}
impl<Item: FastValue> DynamicFastFieldReader<Item> {
@@ -86,12 +93,14 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
BitpackedReader,
>::open_from_bytes(bytes)?)
}
#[allow(deprecated)]
LinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
Item,
LinearInterpolFastFieldReader,
>::open_from_bytes(bytes)?)
}
#[allow(deprecated)]
MultiLinearInterpolFastFieldSerializer::ID => {
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
Item,
@@ -100,6 +109,12 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
bytes
)?)
}
PiecewiseLinearFastFieldSerializer::ID => {
DynamicFastFieldReader::PiecewiseLinear(FastFieldReaderCodecWrapper::<
Item,
PiecewiseLinearFastFieldReader,
>::open_from_bytes(bytes)?)
}
_ => {
panic!(
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
@@ -118,6 +133,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::Bitpacked(reader) => reader.get(doc),
Self::LinearInterpol(reader) => reader.get(doc),
Self::MultiLinearInterpol(reader) => reader.get(doc),
Self::PiecewiseLinear(reader) => reader.get(doc),
}
}
#[inline]
@@ -126,6 +142,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::Bitpacked(reader) => reader.get_range(start, output),
Self::LinearInterpol(reader) => reader.get_range(start, output),
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
}
}
fn min_value(&self) -> Item {
@@ -133,6 +150,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::Bitpacked(reader) => reader.min_value(),
Self::LinearInterpol(reader) => reader.min_value(),
Self::MultiLinearInterpol(reader) => reader.min_value(),
Self::PiecewiseLinear(reader) => reader.min_value(),
}
}
fn max_value(&self) -> Item {
@@ -140,6 +158,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
Self::Bitpacked(reader) => reader.max_value(),
Self::LinearInterpol(reader) => reader.max_value(),
Self::MultiLinearInterpol(reader) => reader.max_value(),
Self::PiecewiseLinear(reader) => reader.max_value(),
}
}
}
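Every call on `DynamicFastFieldReader` dispatches on the variant chosen when the field was opened: the first serialized byte identifies the codec, and the matching wrapper is built once. A minimal usage sketch, assuming `bytes` holds a serialized fast field and that the dispatching constructor above is exposed as `open_from_bytes` (an assumption, the surrounding signature is not shown in this diff):

    // The codec is picked once at open time; `get` then decodes without
    // re-inspecting the codec id on every call.
    let reader: DynamicFastFieldReader<u64> = DynamicFastFieldReader::open_from_bytes(bytes)?;
    let first_val = reader.get(0u32);
    let (min, max) = (reader.min_value(), reader.max_value());
    assert!(min <= first_val && first_val <= max);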
@@ -176,9 +195,12 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
_phantom: PhantomData,
})
}
#[inline]
pub(crate) fn get_u64(&self, doc: u64) -> Item {
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
/// Gets the u64 at index `idx`.
/// `idx` can be either a `DocId` or an index into a
/// `multivalued` fast field. See [`get_range`] for more details.
pub(crate) fn get_u64(&self, idx: u64) -> Item {
Item::from_u64(self.reader.get_u64(idx, self.bytes.as_slice()))
}
/// Internally, `multivalued` fast fields also use single-value fast fields.

View File

@@ -39,9 +39,6 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType,
.get_fastfield_cardinality()
.map(|cardinality| (FastType::Date, cardinality)),
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
FieldType::Str(options) if options.is_fast() => {
Some((FastType::U64, Cardinality::MultiValues))
}
_ => None,
}
}

View File

@@ -4,9 +4,9 @@ use common::{BinarySerializable, CountingWriter};
pub use fastfield_codecs::bitpacked::{
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
};
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
use itertools::Itertools;
use crate::directory::{CompositeWrite, WritePtr};
use crate::schema::Field;
@@ -35,18 +35,31 @@ pub struct CompositeFastFieldSerializer {
composite_write: CompositeWrite<WritePtr>,
}
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
#[derive(Debug)]
pub struct CodecEstimationResult<'a> {
pub ratio: f32,
pub name: &'a str,
pub id: u8,
}
// TODO: use this once explicit_generic_args_with_impl_trait is merged and stabilized:
// https://github.com/rust-lang/rust/pull/86176
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
stats: FastFieldStats,
fastfield_accessor: &A,
estimations: &mut Vec<(f32, &str, u8)>,
) {
) -> CodecEstimationResult {
if !T::is_applicable(fastfield_accessor, stats.clone()) {
return;
return CodecEstimationResult {
ratio: f32::MAX,
name: T::NAME,
id: T::ID,
};
}
CodecEstimationResult {
ratio: T::estimate_compression_ratio(fastfield_accessor, stats),
name: T::NAME,
id: T::ID,
}
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
estimations.push((ratio, name, id));
}
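With this signature every candidate codec yields exactly one `CodecEstimationResult`, and inapplicable codecs lose automatically through `ratio: f32::MAX`. A sketch of a single estimation pass, assuming (as in the serializer test further down, which passes a `Vec<u64>` as accessor) that plain vectors satisfy `FastFieldDataAccess`:

    let vals: Vec<u64> = (0..100u64).map(|i| i * 3).collect();
    let stats = FastFieldStats { min_value: 0, max_value: 297, num_vals: 100 };
    let est = codec_estimation::<BitpackedFastFieldSerializer, _>(stats, &vals);
    // `est.ratio` is f32::MAX when the codec is not applicable, so the
    // min-by-ratio selection below can never pick it in that case.
    println!("codec {} (id {}) estimated ratio {}", est.name, est.id, est.ratio);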
impl CompositeFastFieldSerializer {
@@ -59,7 +72,7 @@ impl CompositeFastFieldSerializer {
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field(
pub fn new_u64_fast_field_with_best_codec(
&mut self,
field: Field,
stats: FastFieldStats,
@@ -67,7 +80,7 @@ impl CompositeFastFieldSerializer {
data_iter_1: impl Iterator<Item = u64>,
data_iter_2: impl Iterator<Item = u64>,
) -> io::Result<()> {
self.create_auto_detect_u64_fast_field_with_idx(
self.new_u64_fast_field_with_idx_with_best_codec(
field,
stats,
fastfield_accessor,
@@ -78,7 +91,7 @@ impl CompositeFastFieldSerializer {
}
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
/// automatically.
pub fn create_auto_detect_u64_fast_field_with_idx(
pub fn new_u64_fast_field_with_idx_with_best_codec(
&mut self,
field: Field,
stats: FastFieldStats,
@@ -88,42 +101,29 @@ impl CompositeFastFieldSerializer {
idx: usize,
) -> io::Result<()> {
let field_write = self.composite_write.for_field_with_idx(field, idx);
let mut estimations = vec![];
codec_estimation::<BitpackedFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
&mut estimations,
);
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
{
warn!(
"broken estimation for fast field codec {}",
broken_estimation.1
);
}
// removing nan values for codecs with broken calculations, and max values which disables
// codecs
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
let (_ratio, name, id) = estimations[0];
let estimations = vec![
codec_estimation::<BitpackedFastFieldSerializer, _>(stats.clone(), &fastfield_accessor),
codec_estimation::<PiecewiseLinearFastFieldSerializer, _>(
stats.clone(),
&fastfield_accessor,
),
];
let best_codec_result = estimations
.iter()
.sorted_by(|result_a, result_b| {
result_a
.ratio
.partial_cmp(&result_b.ratio)
.expect("Ratio cannot be nan.")
})
.next()
.expect("A codec must be present.");
debug!(
"choosing fast field codec {} for field_id {:?}",
name, field
); // todo print actual field name
id.serialize(field_write)?;
match name {
"Choosing fast field codec {} for field_id {:?} among {:?}",
best_codec_result.name, field, estimations,
);
best_codec_result.id.serialize(field_write)?;
match best_codec_result.name {
BitpackedFastFieldSerializer::NAME => {
BitpackedFastFieldSerializer::serialize(
field_write,
@@ -133,17 +133,8 @@ impl CompositeFastFieldSerializer {
data_iter_2,
)?;
}
LinearInterpolFastFieldSerializer::NAME => {
LinearInterpolFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
data_iter_1,
data_iter_2,
)?;
}
MultiLinearInterpolFastFieldSerializer::NAME => {
MultiLinearInterpolFastFieldSerializer::serialize(
PiecewiseLinearFastFieldSerializer::NAME => {
PiecewiseLinearFastFieldSerializer::serialize(
field_write,
&fastfield_accessor,
stats,
@@ -152,7 +143,7 @@ impl CompositeFastFieldSerializer {
)?;
}
_ => {
panic!("unknown fastfield serializer {}", name)
panic!("unknown fastfield serializer {}", best_codec_result.name)
}
};
field_write.flush()?;
@@ -216,3 +207,45 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
self.write.flush()
}
}
#[cfg(test)]
mod tests {
use std::path::Path;
use common::BinarySerializable;
use fastfield_codecs::FastFieldStats;
use itertools::Itertools;
use super::CompositeFastFieldSerializer;
use crate::directory::{RamDirectory, WritePtr};
use crate::schema::Field;
use crate::Directory;
#[test]
fn new_u64_fast_field_with_best_codec() -> crate::Result<()> {
let directory: RamDirectory = RamDirectory::create();
let path = Path::new("test");
let write: WritePtr = directory.open_write(path)?;
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
let vals = (0..10000u64).into_iter().collect_vec();
let stats = FastFieldStats {
min_value: 0,
max_value: 9999,
num_vals: vals.len() as u64,
};
serializer.new_u64_fast_field_with_best_codec(
Field::from_field_id(0),
stats,
vals.clone(),
vals.clone().into_iter(),
vals.into_iter(),
)?;
serializer.close()?;
// get the codecs id
let mut bytes = directory.open_read(path)?.read_bytes()?;
let codec_id = u8::deserialize(&mut bytes)?;
// Codec id = 4 is piecewise linear.
assert_eq!(codec_id, 4);
Ok(())
}
}

View File

@@ -7,7 +7,7 @@ use tantivy_bitpacker::BlockedBitpacker;
use super::multivalued::MultiValuedFastFieldWriter;
use super::serializer::FastFieldStats;
use super::{FastFieldDataAccess, FastFieldType};
use super::FastFieldDataAccess;
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::UnorderedTermId;
@@ -16,7 +16,6 @@ use crate::termdict::TermOrdinal;
/// The `FastFieldsWriter` groups all of the fast field writers.
pub struct FastFieldsWriter {
term_id_writers: Vec<MultiValuedFastFieldWriter>,
single_value_writers: Vec<IntFastFieldWriter>,
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
bytes_value_writers: Vec<BytesFastFieldWriter>,
@@ -34,7 +33,6 @@ impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
let mut single_value_writers = Vec::new();
let mut term_id_writers = Vec::new();
let mut multi_values_writers = Vec::new();
let mut bytes_value_writers = Vec::new();
@@ -52,22 +50,15 @@ impl FastFieldsWriter {
single_value_writers.push(fast_field_writer);
}
Some(Cardinality::MultiValues) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric);
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
multi_values_writers.push(fast_field_writer);
}
None => {}
}
}
FieldType::Facet(_) => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet);
term_id_writers.push(fast_field_writer);
}
FieldType::Str(_) if field_entry.is_fast() => {
let fast_field_writer =
MultiValuedFastFieldWriter::new(field, FastFieldType::String);
term_id_writers.push(fast_field_writer);
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
multi_values_writers.push(fast_field_writer);
}
FieldType::Bytes(bytes_option) => {
if bytes_option.is_fast() {
@@ -79,7 +70,6 @@ impl FastFieldsWriter {
}
}
FastFieldsWriter {
term_id_writers,
single_value_writers,
multi_values_writers,
bytes_value_writers,
@@ -88,15 +78,10 @@ impl FastFieldsWriter {
/// The memory used (inclusive childs)
pub fn mem_usage(&self) -> usize {
self.term_id_writers
self.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.single_value_writers
.iter()
.map(|w| w.mem_usage())
.sum::<usize>()
+ self
.multi_values_writers
.iter()
@@ -109,14 +94,6 @@ impl FastFieldsWriter {
.sum::<usize>()
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter()
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
// TODO optimize
@@ -133,17 +110,6 @@ impl FastFieldsWriter {
.find(|field_writer| field_writer.field() == field)
}
/// Get the `FastFieldWriter` associated to a field.
pub fn get_term_id_writer_mut(
&mut self,
field: Field,
) -> Option<&mut MultiValuedFastFieldWriter> {
// TODO optimize
self.term_id_writers
.iter_mut()
.find(|field_writer| field_writer.field() == field)
}
/// Returns the fast field multi-value writer for the given field.
///
/// Returns None if the field does not exist, or is not
@@ -171,9 +137,6 @@ impl FastFieldsWriter {
/// Indexes all of the fastfields of a new document.
pub fn add_document(&mut self, doc: &Document) {
for field_writer in &mut self.term_id_writers {
field_writer.add_document(doc);
}
for field_writer in &mut self.single_value_writers {
field_writer.add_document(doc);
}
@@ -193,10 +156,6 @@ impl FastFieldsWriter {
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
for field_writer in &self.term_id_writers {
let field = field_writer.field();
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
}
for field_writer in &self.single_value_writers {
field_writer.serialize(serializer, doc_id_map)?;
}
@@ -285,10 +244,6 @@ impl IntFastFieldWriter {
self.val_count += 1;
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
///
///
/// Extract the value associated to the fast field for
/// this document.
///
@@ -299,17 +254,18 @@ impl IntFastFieldWriter {
/// instead.
/// If the document has more than one value for the given field,
/// only the first one is taken into account.
///
/// Values for string fast fields are skipped.
pub fn add_document(&mut self, doc: &Document) {
fn extract_val(&self, doc: &Document) -> u64 {
match doc.get_first(self.field) {
Some(v) => {
self.add_val(super::value_to_u64(v));
}
None => {
self.add_val(self.val_if_missing);
}
};
Some(v) => super::value_to_u64(v),
None => self.val_if_missing,
}
}
/// Extract the fast field value from the document
/// (or use the default value) and records it.
pub fn add_document(&mut self, doc: &Document) {
let val = self.extract_val(doc);
self.add_val(val);
}
/// get iterator over the data
@@ -328,7 +284,6 @@ impl IntFastFieldWriter {
} else {
(self.val_min, self.val_max)
};
let fastfield_accessor = WriterFastFieldAccessProvider {
doc_id_map,
vals: &self.vals,
@@ -343,7 +298,7 @@ impl IntFastFieldWriter {
let iter = doc_id_map
.iter_old_doc_ids()
.map(|doc_id| self.vals.get(doc_id as usize));
serializer.create_auto_detect_u64_fast_field(
serializer.new_u64_fast_field_with_best_codec(
self.field,
stats,
fastfield_accessor,
@@ -351,7 +306,7 @@ impl IntFastFieldWriter {
iter,
)?;
} else {
serializer.create_auto_detect_u64_fast_field(
serializer.new_u64_fast_field_with_best_codec(
self.field,
stats,
fastfield_accessor,

View File

@@ -116,14 +116,14 @@ pub fn demux(
) -> crate::Result<Vec<Index>> {
let mut indices = vec![];
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
let alive_bitset = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
.into_iter()
.map(Some)
.collect_vec();
let index = merge_filtered_segments(
segments,
target_settings.clone(),
alive_bitset,
delete_bitsets,
output_directory,
)?;
indices.push(index);
@@ -141,7 +141,7 @@ mod tests {
use crate::{DocAddress, Term};
#[test]
fn test_demux_map_to_alive_bitset() {
fn test_demux_map_to_deletebitset() {
let max_value = 2;
let mut demux_mapping = DemuxMapping::default();
// segment ordinal 0 mapping

View File

@@ -57,7 +57,7 @@ struct IndexingPositionsPerPath {
impl IndexingPositionsPerPath {
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.value_bytes()))
.entry(murmurhash2(term.as_slice()))
.or_insert_with(Default::default)
}
}
@@ -149,11 +149,10 @@ fn index_json_value<'a>(
json_term_writer.term_buffer,
ctx,
indexing_position,
None,
);
}
TextOrDateTime::DateTime(dt) => {
json_term_writer.set_fast_value(DateTime::from_utc(dt));
json_term_writer.set_fast_value(DateTime::new_utc(dt));
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
},
@@ -208,7 +207,7 @@ impl<'a> JsonTermWriter<'a> {
pub fn wrap(term_buffer: &'a mut Term) -> Self {
term_buffer.clear_with_type(Type::Json);
let mut path_stack = Vec::with_capacity(10);
path_stack.push(5); // magic number?
path_stack.push(5);
Self {
term_buffer,
path_stack,
@@ -250,8 +249,8 @@ impl<'a> JsonTermWriter<'a> {
/// Returns the json path of the term being currently built.
#[cfg(test)]
pub(crate) fn path(&self) -> &[u8] {
let end_of_path = self.path_stack.last().cloned().unwrap_or(6); // TODO remove magic number
&self.term().value_bytes()[..end_of_path - 1]
let end_of_path = self.path_stack.last().cloned().unwrap_or(6);
&self.term().as_slice()[5..end_of_path - 1]
}
pub fn set_fast_value<T: FastValue>(&mut self, val: T) {
@@ -321,7 +320,10 @@ mod tests {
let mut json_writer = JsonTermWriter::wrap(&mut term);
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}
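The expected byte strings in these updated assertions all follow one layout, as read from the tests themselves (a hypothetical breakdown, not a documented format): four big-endian field-id bytes, a type byte (`j` for JSON), the path with segments separated by `\x01` and terminated by `\x00`, then a value-type byte (`s`, `i`, `u` or `f`) followed by the value bytes:

    // b"\x00\x00\x00\x01" + "j" + "attribute" + "\x01" + "color" + "\x00" + "s" + "red"
    //   field id (u32 BE)   type   path segment  sep      segment   end     value type + value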
#[test]
@@ -333,8 +335,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
json_writer.term().value_bytes(),
b"color\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
)
}
@@ -347,8 +349,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
json_writer.term().value_bytes(),
b"color\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
)
}
@@ -361,8 +363,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
json_writer.term().value_bytes(),
b"color\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
)
}
@@ -377,8 +379,8 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().value_bytes(),
b"attribute\x01color\x00sred"
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
)
}
@@ -392,7 +394,10 @@ mod tests {
json_writer.push_path_segment("hue");
json_writer.pop_path_segment();
json_writer.set_str("red");
assert_eq!(json_writer.term().value_bytes(), b"color\x00sred")
assert_eq!(
json_writer.term().as_slice(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}
#[test]

View File

@@ -170,8 +170,8 @@ impl IndexMerger {
index_settings: IndexSettings,
segments: &[Segment],
) -> crate::Result<IndexMerger> {
let alive_bitset = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
}
// Create merge with a custom delete set.
@@ -180,7 +180,7 @@ impl IndexMerger {
// corresponds to the segment index.
//
// If `None` is provided for custom alive set, the regular alive set will be used.
// If an alive_bitset is provided, the union between the provided and regular
// If delete_bitsets are provided, the union between the provided and regular
// alive set will be used.
//
// This can be used to merge but also apply an additional filter.
@@ -283,12 +283,12 @@ impl IndexMerger {
for (field, field_entry) in self.schema.fields() {
let field_type = field_entry.field_type();
match field_type {
FieldType::Facet(_) | FieldType::Str(_) if field_type.is_fast() => {
FieldType::Facet(_) => {
let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
"Logic Error in Tantivy (Please report). Facet field should have required \
a `term_ordinal_mapping`.",
);
self.write_term_id_fast_field(
self.write_hierarchical_facet_field(
field,
&term_ordinal_mapping,
fast_field_serializer,
@@ -312,8 +312,8 @@ impl IndexMerger {
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
}
}
_ => {
// We don't handle json fast field for the moment
FieldType::Str(_) | FieldType::JsonObject(_) => {
// We don't handle json / string fast field for the moment
// They can be implemented using what is done
// for facets in the future
}
@@ -384,7 +384,7 @@ impl IndexMerger {
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
fast_field_reader.get(*doc_id)
});
fast_field_serializer.create_auto_detect_u64_fast_field(
fast_field_serializer.new_u64_fast_field_with_best_codec(
field,
stats,
fastfield_accessor,
@@ -551,7 +551,7 @@ impl IndexMerger {
}
offsets.push(offset);
fast_field_serializer.create_auto_detect_u64_fast_field(
fast_field_serializer.new_u64_fast_field_with_best_codec(
field,
stats,
&offsets[..],
@@ -590,14 +590,14 @@ impl IndexMerger {
)
}
fn write_term_id_fast_field(
fn write_hierarchical_facet_field(
&self,
field: Field,
term_ordinal_mappings: &TermOrdinalMapping,
fast_field_serializer: &mut CompositeFastFieldSerializer,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
debug_time!("write-term-id-fast-field");
debug_time!("write-hierarchical-facet-field");
// Multifastfield consists of 2 fastfields.
// The first serves as an index into the second one and is strictly increasing.
@@ -771,7 +771,7 @@ impl IndexMerger {
ff_reader.get_vals(*doc_id, &mut vals);
vals.into_iter()
});
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
fast_field_serializer.new_u64_fast_field_with_idx_with_best_codec(
field,
stats,
fastfield_accessor,
@@ -848,9 +848,6 @@ impl IndexMerger {
let mut term_ord_mapping_opt = match field_type {
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
FieldType::Str(options) if options.is_fast() => {
Some(TermOrdinalMapping::new(max_term_ords))
}
_ => None,
};
@@ -1177,7 +1174,7 @@ mod tests {
index_writer.add_document(doc!(
text_field => "af b",
score_field => 3u64,
date_field => DateTime::from_utc(curr_time),
date_field => DateTime::new_utc(curr_time),
bytes_score_field => 3u32.to_be_bytes().as_ref()
))?;
index_writer.add_document(doc!(
@@ -1194,7 +1191,7 @@ mod tests {
// writing the segment
index_writer.add_document(doc!(
text_field => "af b",
date_field => DateTime::from_utc(curr_time),
date_field => DateTime::new_utc(curr_time),
score_field => 11u64,
bytes_score_field => 11u32.to_be_bytes().as_ref()
))?;
@@ -1252,7 +1249,7 @@ mod tests {
assert_eq!(
get_doc_ids(vec![Term::from_field_date(
date_field,
DateTime::from_utc(curr_time)
DateTime::new_utc(curr_time)
)])?,
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
);

View File

@@ -6,7 +6,8 @@ use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
use crate::indexer::json_term_writer::index_json_values;
use crate::indexer::segment_serializer::SegmentSerializer;
use crate::postings::{
serialize_postings, IndexingContext, IndexingPosition, PerFieldPostingsWriter, PostingsWriter,
compute_table_size, serialize_postings, IndexingContext, IndexingPosition,
PerFieldPostingsWriter, PostingsWriter,
};
use crate::schema::{FieldEntry, FieldType, FieldValue, Schema, Term, Value};
use crate::store::{StoreReader, StoreWriter};
@@ -15,6 +16,25 @@ use crate::tokenizer::{
};
use crate::{DocId, Document, Opstamp, SegmentComponent};
/// Computes the initial size of the hash table.
///
/// Returns the recommended initial table size as a power of 2.
///
/// Note: this is a very dumb way to compute log2, but it is easier to proofread that way.
fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<usize> {
let table_memory_upper_bound = per_thread_memory_budget / 3;
(10..20) // We cap it at 2^19 = 512K capacity.
.map(|power| 1 << power)
.take_while(|capacity| compute_table_size(*capacity) < table_memory_upper_bound)
.last()
.ok_or_else(|| {
crate::TantivyError::InvalidArgument(format!(
"per thread memory budget (={per_thread_memory_budget}) is too small. Raise the \
memory budget or lower the number of threads."
))
})
}
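To see the capping in numbers, here is a standalone re-derivation (assuming each table slot costs 16 bytes, the size of the `KeyValue` struct further down in this diff: a u32 `Addr`, a u32 hash and a u64 term id). A 1 MB budget reserves a third (~333 KB) for the table; 2^14 slots cost 256 KB and fit, while 2^15 cost 512 KB and do not, which matches `test_hashmap_size` below.

// Standalone sketch of the budget math, assuming 16-byte slots.
fn initial_table_size(per_thread_memory_budget: usize) -> Option<usize> {
    let table_memory_upper_bound = per_thread_memory_budget / 3;
    (10..20) // capped at 2^19 slots
        .map(|power| 1usize << power)
        .take_while(|capacity| capacity * 16 < table_memory_upper_bound)
        .last()
}

fn main() {
    assert_eq!(initial_table_size(1_000_000), Some(1 << 14)); // 2^14 * 16 B = 256 KB
    assert_eq!(initial_table_size(4_000_000_000), Some(1 << 19)); // hits the 2^19 cap
}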
fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,
@@ -58,11 +78,12 @@ impl SegmentWriter {
/// - segment: The segment being written
/// - schema
pub fn for_segment(
_memory_budget_in_bytes: usize,
memory_budget_in_bytes: usize,
segment: Segment,
schema: Schema,
) -> crate::Result<SegmentWriter> {
let tokenizer_manager = segment.index().tokenizers().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
let per_field_text_analyzers = schema
@@ -85,7 +106,7 @@ impl SegmentWriter {
.collect();
Ok(SegmentWriter {
max_doc: 0,
ctx: IndexingContext::new(),
ctx: IndexingContext::new(table_size),
per_field_postings_writers,
fieldnorms_writer: FieldNormsWriter::for_schema(&schema),
segment_serializer,
@@ -128,7 +149,6 @@ impl SegmentWriter {
pub fn mem_usage(&self) -> usize {
self.ctx.mem_usage()
+ self.fieldnorms_writer.mem_usage()
+ self.per_field_postings_writers.mem_usage()
+ self.fast_field_writers.mem_usage()
+ self.segment_serializer.mem_usage()
}
@@ -168,7 +188,7 @@ impl SegmentWriter {
});
if let Some(unordered_term_id) = unordered_term_id_opt {
self.fast_field_writers
.get_term_id_writer_mut(field)
.get_multivalue_writer_mut(field)
.expect("writer for facet missing")
.add_val(unordered_term_id);
}
@@ -201,22 +221,18 @@ impl SegmentWriter {
}
let mut indexing_position = IndexingPosition::default();
for mut token_stream in token_streams {
// assert_eq!(term_buffer.as_slice().len(), 5);
assert_eq!(term_buffer.as_slice().len(), 5);
postings_writer.index_text(
doc_id,
&mut *token_stream,
term_buffer,
ctx,
&mut indexing_position,
self.fast_field_writers.get_term_id_writer_mut(field),
);
}
if field_entry.has_fieldnorms() {
self.fieldnorms_writer
.record(doc_id, field, indexing_position.num_tokens);
}
self.fieldnorms_writer
.record(doc_id, field, indexing_position.num_tokens);
}
FieldType::U64(_) => {
for value in values {
@@ -398,6 +414,7 @@ pub fn prepare_doc_for_store(doc: Document, schema: &Schema) -> Document {
#[cfg(test)]
mod tests {
use super::compute_initial_table_size;
use crate::collector::Count;
use crate::indexer::json_term_writer::JsonTermWriter;
use crate::postings::TermInfo;
@@ -408,6 +425,15 @@ mod tests {
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{DateTime, DocAddress, DocSet, Document, Index, Postings, Term, TERMINATED};
#[test]
fn test_hashmap_size() {
assert_eq!(compute_initial_table_size(100_000).unwrap(), 1 << 11);
assert_eq!(compute_initial_table_size(1_000_000).unwrap(), 1 << 14);
assert_eq!(compute_initial_table_size(10_000_000).unwrap(), 1 << 17);
assert_eq!(compute_initial_table_size(1_000_000_000).unwrap(), 1 << 19);
assert_eq!(compute_initial_table_size(4_000_000_000).unwrap(), 1 << 19);
}
#[test]
fn test_prepare_for_store() {
let mut schema_builder = Schema::builder();
@@ -497,7 +523,7 @@ mod tests {
json_term_writer.pop_path_segment();
json_term_writer.pop_path_segment();
json_term_writer.push_path_segment("date");
json_term_writer.set_fast_value(DateTime::from_utc(
json_term_writer.set_fast_value(DateTime::new_utc(
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
));
assert!(term_stream.advance());

View File

@@ -158,7 +158,7 @@ impl DateTime {
///
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub const fn from_utc(dt: OffsetDateTime) -> Self {
pub const fn new_utc(dt: OffsetDateTime) -> Self {
Self::from_unix_timestamp(dt.unix_timestamp())
}
@@ -166,19 +166,19 @@ impl DateTime {
///
/// Implicitly assumes that the given date/time is in UTC!
/// Otherwise the original value can only be recovered with
/// [`Self::into_primitive()`].
pub const fn from_primitive(dt: PrimitiveDateTime) -> Self {
Self::from_utc(dt.assume_utc())
/// [`to_primitive()`].
pub const fn new_primitive(dt: PrimitiveDateTime) -> Self {
Self::new_utc(dt.assume_utc())
}
/// Convert to UNIX timestamp
pub const fn into_unix_timestamp(self) -> i64 {
pub const fn to_unix_timestamp(self) -> i64 {
let Self { unix_timestamp } = self;
unix_timestamp
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
pub fn to_utc(self) -> OffsetDateTime {
let Self { unix_timestamp } = self;
let utc_datetime =
OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp");
@@ -187,16 +187,16 @@ impl DateTime {
}
/// Convert to `OffsetDateTime` with the given time zone
pub fn into_offset(self, offset: UtcOffset) -> OffsetDateTime {
self.into_utc().to_offset(offset)
pub fn to_offset(self, offset: UtcOffset) -> OffsetDateTime {
self.to_utc().to_offset(offset)
}
/// Convert to `PrimitiveDateTime` without any time zone
///
/// The value should have been constructed with [`Self::from_primitive()`].
/// The value should have been constructed with [`new_primitive()`].
/// Otherwise the time zone is implicitly assumed to be UTC.
pub fn into_primitive(self) -> PrimitiveDateTime {
let utc_datetime = self.into_utc();
pub fn to_primitive(self) -> PrimitiveDateTime {
let utc_datetime = self.to_utc();
// Discard the UTC time zone offset
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time())
@@ -205,7 +205,7 @@ impl DateTime {
impl fmt::Debug for DateTime {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
let utc_rfc3339 = self.to_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
f.write_str(&utc_rfc3339)
}
}
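A short usage sketch of the renamed `DateTime` API (assuming tantivy's public `DateTime` re-export and the `time` crate used throughout this diff):

use tantivy::DateTime; // assumed re-export
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, UtcOffset};

fn main() {
    let odt = OffsetDateTime::parse("1985-04-12T23:20:50Z", &Rfc3339).unwrap();
    let dt = DateTime::new_utc(odt); // was DateTime::from_utc
    assert_eq!(dt.to_unix_timestamp(), odt.unix_timestamp()); // was into_unix_timestamp
    assert_eq!(dt.to_utc().offset(), UtcOffset::UTC); // was into_utc
}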

View File

@@ -1,24 +1,27 @@
use crate::postings::stacker::MemoryArena;
use crate::postings::stacker::{MemoryArena, TermHashMap};
/// IndexingContext contains all of the transient memory arenas
/// required for building the inverted index.
pub(crate) struct IndexingContext {
/// The term index is an ad-hoc hashmap,
/// itself backed by a dedicated memory arena.
pub term_index: TermHashMap,
/// Arena is a memory arena that stores posting lists / term frequencies / positions.
pub arena: MemoryArena,
pub arena_terms: MemoryArena,
}
impl IndexingContext {
/// Create a new IndexingContext given the size of the term hash map.
pub(crate) fn new() -> IndexingContext {
pub(crate) fn new(table_size: usize) -> IndexingContext {
let term_index = TermHashMap::new(table_size);
IndexingContext {
arena: MemoryArena::new(),
arena_terms: MemoryArena::new(),
term_index,
}
}
/// Returns the memory usage for the inverted index memory arenas, in bytes.
pub(crate) fn mem_usage(&self) -> usize {
self.arena.mem_usage() + self.arena_terms.mem_usage()
self.term_index.mem_usage() + self.arena.mem_usage()
}
}
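Putting the two pieces together (a sketch; `IndexingContext` and `compute_initial_table_size` are crate-private, so this mirrors `SegmentWriter::for_segment` above rather than a public API):

// Hedged wiring sketch using the crate-internal names from the hunks above.
fn build_context(memory_budget_in_bytes: usize) -> crate::Result<IndexingContext> {
    let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
    // mem_usage() on the result covers the term hash table plus the postings arena.
    Ok(IndexingContext::new(table_size))
}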

View File

@@ -1,7 +1,5 @@
use std::io;
use super::stacker::TermHashMap;
use crate::fastfield::MultiValuedFastFieldWriter;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
@@ -27,14 +25,6 @@ impl<Rec: Recorder> From<JsonPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
}
impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
fn mem_usage(&self) -> usize {
self.str_posting_writer.mem_usage() + self.non_str_posting_writer.mem_usage()
}
fn term_map(&self) -> &TermHashMap {
self.str_posting_writer.term_map()
}
fn subscribe(
&mut self,
doc: crate::DocId,
@@ -52,7 +42,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
) {
self.str_posting_writer.index_text(
doc_id,
@@ -60,7 +49,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer,
ctx,
indexing_position,
None,
);
}
@@ -83,7 +71,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
doc_id_map,
&mut buffer_lender,
ctx,
&self.str_posting_writer.term_map,
serializer,
)?;
} else {
@@ -93,7 +80,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
doc_id_map,
&mut buffer_lender,
ctx,
&self.str_posting_writer.term_map,
serializer,
)?;
}

View File

@@ -26,6 +26,7 @@ pub(crate) use self::postings_writer::{serialize_postings, IndexingPosition, Pos
pub use self::segment_postings::SegmentPostings;
pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::skip::{BlockInfo, SkipReader};
pub(crate) use self::stacker::compute_table_size;
pub use self::term_info::TermInfo;
pub(crate) type UnorderedTermId = u64;

View File

@@ -10,10 +10,9 @@ pub(crate) struct PerFieldPostingsWriter {
impl PerFieldPostingsWriter {
pub fn for_schema(schema: &Schema) -> Self {
let num_fields = schema.num_fields();
let per_field_postings_writers = schema
.fields()
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry, num_fields))
.map(|(_, field_entry)| posting_writer_from_field_entry(field_entry))
.collect();
PerFieldPostingsWriter {
per_field_postings_writers,
@@ -27,19 +26,9 @@ impl PerFieldPostingsWriter {
pub(crate) fn get_for_field_mut(&mut self, field: Field) -> &mut dyn PostingsWriter {
self.per_field_postings_writers[field.field_id() as usize].as_mut()
}
pub(crate) fn mem_usage(&self) -> usize {
self.per_field_postings_writers
.iter()
.map(|postings_writer| postings_writer.mem_usage())
.sum()
}
}
fn posting_writer_from_field_entry(
field_entry: &FieldEntry,
_num_fields: usize,
) -> Box<dyn PostingsWriter> {
fn posting_writer_from_field_entry(field_entry: &FieldEntry) -> Box<dyn PostingsWriter> {
match *field_entry.field_type() {
FieldType::Str(ref text_options) => text_options
.get_indexing_options()

View File

@@ -1,11 +1,11 @@
use std::collections::HashMap;
use std::io;
use std::marker::PhantomData;
use std::ops::Range;
use fnv::FnvHashMap;
use super::stacker::{Addr, TermHashMap};
use crate::fastfield::MultiValuedFastFieldWriter;
use super::stacker::Addr;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::recorder::{BufferLender, Recorder};
@@ -20,6 +20,31 @@ use crate::DocId;
const POSITION_GAP: u32 = 1;
fn make_field_partition(
term_offsets: &[(Term<&[u8]>, Addr, UnorderedTermId)],
) -> Vec<(Field, Range<usize>)> {
let term_offsets_it = term_offsets
.iter()
.map(|(term, _, _)| term.field())
.enumerate();
let mut prev_field_opt = None;
let mut fields = vec![];
let mut offsets = vec![];
for (offset, field) in term_offsets_it {
if Some(field) != prev_field_opt {
prev_field_opt = Some(field);
fields.push(field);
offsets.push(offset);
}
}
offsets.push(term_offsets.len());
let mut field_offsets = vec![];
for i in 0..fields.len() {
field_offsets.push((fields[i], offsets[i]..offsets[i + 1]));
}
field_offsets
}
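A standalone example of the same partitioning over plain field ids (illustrative only): because `term_offsets` is sorted on field-prefixed term bytes, each field's terms form one contiguous run, and the function just records where each run starts and ends.

use std::ops::Range;

// Standalone re-implementation of the run detection over bare field ids.
fn partition(fields_of_terms: &[u32]) -> Vec<(u32, Range<usize>)> {
    let mut out: Vec<(u32, Range<usize>)> = Vec::new();
    for (offset, &field) in fields_of_terms.iter().enumerate() {
        match out.last_mut() {
            Some((prev, range)) if *prev == field => range.end = offset + 1,
            _ => out.push((field, offset..offset + 1)),
        }
    }
    out
}

fn main() {
    // Three terms for field 0, then one term for field 2.
    assert_eq!(partition(&[0, 0, 0, 2]), vec![(0, 0..3), (2, 3..4)]);
}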
/// Serialize the inverted index.
/// It pushes all terms, one field at a time, towards the
/// postings serializer.
@@ -31,23 +56,23 @@ pub(crate) fn serialize_postings(
schema: &Schema,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>> {
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(ctx.term_index.len());
term_offsets.extend(ctx.term_index.iter());
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let mut unordered_term_mappings: HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>> =
HashMap::new();
for (field, _) in schema.fields() {
let postings_writer = per_field_postings_writers.get_for_field(field);
let mut term_offsets: Vec<(Term<&[u8]>, Addr, UnorderedTermId)> =
Vec::with_capacity(postings_writer.term_map().len());
term_offsets.extend(postings_writer.term_map().iter(&ctx.arena_terms));
term_offsets.sort_unstable_by_key(|(k, _, _)| k.clone());
let field_offsets = make_field_partition(&term_offsets);
for (field, byte_offsets) in field_offsets {
let field_entry = schema.get_field_entry(field);
match *field_entry.field_type() {
FieldType::Str(_) | FieldType::Facet(_) => {
// populating the (unordered term ord) -> (ordered term ord) mapping
// for the field.
let unordered_term_ids = term_offsets.iter().map(|&(_, _, bucket)| bucket);
let unordered_term_ids = term_offsets[byte_offsets.clone()]
.iter()
.map(|&(_, _, bucket)| bucket);
let mapping: FnvHashMap<UnorderedTermId, TermOrdinal> = unordered_term_ids
.enumerate()
.map(|(term_ord, unord_term_id)| {
@@ -61,10 +86,16 @@ pub(crate) fn serialize_postings(
FieldType::JsonObject(_) => {}
}
let postings_writer = per_field_postings_writers.get_for_field(field);
let fieldnorm_reader = fieldnorm_readers.get_field(field)?;
let mut field_serializer =
serializer.new_field(field, postings_writer.total_num_tokens(), fieldnorm_reader)?;
postings_writer.serialize(&term_offsets, doc_id_map, &ctx, &mut field_serializer)?;
postings_writer.serialize(
&term_offsets[byte_offsets],
doc_id_map,
&ctx,
&mut field_serializer,
)?;
field_serializer.close()?;
}
Ok(unordered_term_mappings)
@@ -96,10 +127,6 @@ pub(crate) trait PostingsWriter {
ctx: &mut IndexingContext,
) -> UnorderedTermId;
fn mem_usage(&self) -> usize;
fn term_map(&self) -> &TermHashMap;
/// Serializes the postings on disk.
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
@@ -118,9 +145,8 @@ pub(crate) trait PostingsWriter {
term_buffer: &mut Term,
ctx: &mut IndexingContext,
indexing_position: &mut IndexingPosition,
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
) {
let end_of_path_idx = term_buffer.value_bytes().len();
let end_of_path_idx = term_buffer.as_slice().len();
let mut num_tokens = 0;
let mut end_position = 0;
token_stream.process(&mut |token: &Token| {
@@ -138,14 +164,9 @@ pub(crate) trait PostingsWriter {
term_buffer.append_bytes(token.text.as_bytes());
let start_position = indexing_position.end_position + token.position as u32;
end_position = start_position + token.position_length as u32;
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
term_id_fast_field_writer.add_val(unordered_term_id);
}
self.subscribe(doc_id, start_position, term_buffer, ctx);
num_tokens += 1;
});
indexing_position.end_position = end_position + POSITION_GAP;
indexing_position.num_tokens += num_tokens;
term_buffer.truncate(end_of_path_idx);
@@ -160,7 +181,6 @@ pub(crate) trait PostingsWriter {
pub(crate) struct SpecializedPostingsWriter<Rec: Recorder> {
total_num_tokens: u64,
_recorder_type: PhantomData<Rec>,
pub(crate) term_map: TermHashMap,
}
impl<Rec: Recorder> From<SpecializedPostingsWriter<Rec>> for Box<dyn PostingsWriter> {
@@ -179,10 +199,9 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
doc_id_map: Option<&DocIdMapping>,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
term_index: &TermHashMap,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let recorder: Rec = term_index.read(addr, &ctx.arena_terms);
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term.value_bytes(), term_doc_freq)?;
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
@@ -192,14 +211,6 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
}
impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
fn mem_usage(&self) -> usize {
self.term_map.mem_usage()
}
fn term_map(&self) -> &TermHashMap {
&self.term_map
}
fn subscribe(
&mut self,
doc: DocId,
@@ -207,30 +218,25 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
term: &Term,
ctx: &mut IndexingContext,
) -> UnorderedTermId {
//debug_assert!(term.value_bytes().len() >= 1);
debug_assert!(term.as_slice().len() >= 4);
self.total_num_tokens += 1;
let arena = &mut ctx.arena;
let arena_terms = &mut ctx.arena_terms;
self.term_map.mutate_or_create(
term.value_bytes(),
arena_terms,
|opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(arena);
recorder.new_doc(doc, arena);
}
recorder.record_position(position, arena);
recorder
} else {
let mut recorder = Rec::default();
let (term_index, arena) = (&mut ctx.term_index, &mut ctx.arena);
term_index.mutate_or_create(term.as_slice(), |opt_recorder: Option<Rec>| {
if let Some(mut recorder) = opt_recorder {
let current_doc = recorder.current_doc();
if current_doc != doc {
recorder.close_doc(arena);
recorder.new_doc(doc, arena);
recorder.record_position(position, arena);
recorder
}
},
) as UnorderedTermId
recorder.record_position(position, arena);
recorder
} else {
let mut recorder = Rec::default();
recorder.new_doc(doc, arena);
recorder.record_position(position, arena);
recorder
}
}) as UnorderedTermId
}
fn serialize(
@@ -242,15 +248,7 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (term, addr, _) in term_addrs {
Self::serialize_one_term(
term,
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
&self.term_map,
serializer,
)?;
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
}
Ok(())
}

View File

@@ -46,7 +46,6 @@ impl Addr {
}
/// Returns the `Addr` object for `addr + offset`
#[inline]
pub fn offset(self, offset: u32) -> Addr {
Addr(self.0.wrapping_add(offset))
}
@@ -55,24 +54,20 @@ impl Addr {
Addr((page_id << NUM_BITS_PAGE_ADDR | local_addr) as u32)
}
#[inline]
fn page_id(self) -> usize {
(self.0 as usize) >> NUM_BITS_PAGE_ADDR
}
#[inline]
fn page_local_addr(self) -> usize {
(self.0 as usize) & (PAGE_SIZE - 1)
}
/// Returns true if and only if the `Addr` is null.
#[inline]
pub fn is_null(self) -> bool {
self.0 == u32::max_value()
}
}
#[inline]
pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
assert_eq!(dest.len(), std::mem::size_of::<Item>());
unsafe {
@@ -80,7 +75,6 @@ pub fn store<Item: Copy + 'static>(dest: &mut [u8], val: Item) {
}
}
#[inline]
pub fn load<Item: Copy + 'static>(data: &[u8]) -> Item {
assert_eq!(data.len(), std::mem::size_of::<Item>());
unsafe { ptr::read_unaligned(data.as_ptr() as *const Item) }
@@ -116,7 +110,6 @@ impl MemoryArena {
self.pages.len() * PAGE_SIZE
}
#[inline]
pub fn write_at<Item: Copy + 'static>(&mut self, addr: Addr, val: Item) {
let dest = self.slice_mut(addr, std::mem::size_of::<Item>());
store(dest, val);
@@ -127,7 +120,6 @@ impl MemoryArena {
/// # Panics
///
/// If the address is erroneous
#[inline]
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
load(self.slice(addr, mem::size_of::<Item>()))
}
@@ -136,7 +128,6 @@ impl MemoryArena {
self.pages[addr.page_id()].slice(addr.page_local_addr(), len)
}
#[inline]
pub fn slice_from(&self, addr: Addr) -> &[u8] {
self.pages[addr.page_id()].slice_from(addr.page_local_addr())
}

View File

@@ -4,4 +4,4 @@ mod term_hashmap;
pub(crate) use self::expull::ExpUnrolledLinkedList;
pub(crate) use self::memory_arena::{Addr, MemoryArena};
pub(crate) use self::term_hashmap::TermHashMap;
pub(crate) use self::term_hashmap::{compute_table_size, TermHashMap};

View File

@@ -1,6 +1,6 @@
use std::convert::TryInto;
use std::{iter, mem, slice};
use byteorder::{ByteOrder, NativeEndian};
use murmurhash32::murmurhash2;
use super::{Addr, MemoryArena};
@@ -8,6 +8,13 @@ use crate::postings::stacker::memory_arena::store;
use crate::postings::UnorderedTermId;
use crate::Term;
/// Returns the actual memory size in bytes
/// required to create a table with a given capacity.
/// required to create a table of the given size.
pub(crate) fn compute_table_size(capacity: usize) -> usize {
capacity * mem::size_of::<KeyValue>()
}
/// `KeyValue` is the item stored in the hash table.
/// The key is actually a `BytesRef` object stored in an external memory arena.
/// The `value_addr` also points to an address in the memory arena.
@@ -29,7 +36,6 @@ impl Default for KeyValue {
}
impl KeyValue {
#[inline]
fn is_empty(self) -> bool {
self.key_value_addr.is_null()
}
@@ -45,17 +51,12 @@ impl KeyValue {
/// or copying the key as long as there is no insert.
pub struct TermHashMap {
table: Box<[KeyValue]>,
memory_arena: MemoryArena,
mask: usize,
occupied: Vec<usize>,
len: usize,
}
impl Default for TermHashMap {
fn default() -> Self {
Self::new(1 << 10)
}
}
struct QuadraticProbing {
hash: usize,
i: usize,
@@ -74,21 +75,18 @@ impl QuadraticProbing {
}
}
pub struct Iter<'a, 'm> {
pub struct Iter<'a> {
hashmap: &'a TermHashMap,
memory_arena: &'m MemoryArena,
inner: slice::Iter<'a, usize>,
}
impl<'a, 'm> Iterator for Iter<'a, 'm> {
type Item = (Term<&'m [u8]>, Addr, UnorderedTermId);
impl<'a> Iterator for Iter<'a> {
type Item = (Term<&'a [u8]>, Addr, UnorderedTermId);
fn next(&mut self) -> Option<Self::Item> {
self.inner.next().cloned().map(move |bucket: usize| {
let kv = self.hashmap.table[bucket];
let (key, offset): (&'m [u8], Addr) = self
.hashmap
.get_key_value(kv.key_value_addr, self.memory_arena);
let (key, offset): (&'a [u8], Addr) = self.hashmap.get_key_value(kv.key_value_addr);
(Term::wrap(key), offset, kv.unordered_term_id)
})
}
@@ -108,19 +106,21 @@ impl TermHashMap {
pub(crate) fn new(table_size: usize) -> TermHashMap {
assert!(table_size > 0);
let table_size_power_of_2 = compute_previous_power_of_two(table_size);
let memory_arena = MemoryArena::new();
let table: Vec<KeyValue> = iter::repeat(KeyValue::default())
.take(table_size_power_of_2)
.collect();
TermHashMap {
table: table.into_boxed_slice(),
memory_arena,
mask: table_size_power_of_2 - 1,
occupied: Vec::with_capacity(table_size_power_of_2 / 2),
len: 0,
}
}
pub fn read<Item: Copy + 'static>(&self, addr: Addr, memory_arena: &MemoryArena) -> Item {
memory_arena.read(addr)
pub fn read<Item: Copy + 'static>(&self, addr: Addr) -> Item {
self.memory_arena.read(addr)
}
fn probe(&self, hash: u32) -> QuadraticProbing {
@@ -129,8 +129,6 @@ impl TermHashMap {
pub fn mem_usage(&self) -> usize {
self.table.len() * mem::size_of::<KeyValue>()
+ self.occupied.len()
* std::mem::size_of_val(&self.occupied.get(0).cloned().unwrap_or_default())
}
fn is_saturated(&self) -> bool {
@@ -138,22 +136,16 @@ impl TermHashMap {
}
#[inline]
fn get_key_value<'m>(&self, addr: Addr, memory_arena: &'m MemoryArena) -> (&'m [u8], Addr) {
let data = memory_arena.slice_from(addr);
let (key_bytes_len_enc, data) = data.split_at(2);
let key_bytes_len: u16 = u16::from_ne_bytes(key_bytes_len_enc.try_into().unwrap());
let key_bytes: &[u8] = &data[..key_bytes_len as usize];
fn get_key_value(&self, addr: Addr) -> (&[u8], Addr) {
let data = self.memory_arena.slice_from(addr);
let key_bytes_len = NativeEndian::read_u16(data) as usize;
let key_bytes: &[u8] = &data[2..][..key_bytes_len];
(key_bytes, addr.offset(2u32 + key_bytes_len as u32))
}
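The entry decoded here is laid out as a 2-byte native-endian key length, the key bytes, then the serialized value; a standalone sketch of that decoding (using `u16::from_ne_bytes`, equivalent to the `NativeEndian::read_u16` call above):

// Sketch of the arena entry layout: [key_len: u16 ne][key bytes][value bytes].
fn split_key(entry: &[u8]) -> (&[u8], usize) {
    let key_len = u16::from_ne_bytes([entry[0], entry[1]]) as usize;
    (&entry[2..2 + key_len], 2 + key_len) // (key, offset where the value starts)
}

fn main() {
    let mut entry = 3u16.to_ne_bytes().to_vec();
    entry.extend_from_slice(b"abc");
    entry.extend_from_slice(&7u32.to_ne_bytes()); // the stored value
    let (key, value_offset) = split_key(&entry);
    assert_eq!(key, b"abc");
    assert_eq!(value_offset, 5);
}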
#[inline]
fn get_value_addr_if_key_match(
&self,
target_key: &[u8],
addr: Addr,
memory_arena: &mut MemoryArena,
) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr, memory_arena);
fn get_value_addr_if_key_match(&self, target_key: &[u8], addr: Addr) -> Option<Addr> {
let (stored_key, value_addr) = self.get_key_value(addr);
if stored_key == target_key {
Some(value_addr)
} else {
@@ -177,11 +169,10 @@ impl TermHashMap {
self.len
}
pub fn iter<'a, 'm>(&'a self, memory_arena: &'m MemoryArena) -> Iter<'a, 'm> {
pub fn iter(&self) -> Iter<'_> {
Iter {
inner: self.occupied.iter(),
hashmap: self,
memory_arena,
}
}
@@ -218,7 +209,6 @@ impl TermHashMap {
pub fn mutate_or_create<V, TMutator>(
&mut self,
key: &[u8],
memory_arena: &mut MemoryArena,
mut updater: TMutator,
) -> UnorderedTermId
where
@@ -229,33 +219,28 @@ impl TermHashMap {
self.resize();
}
let hash = murmurhash2(key);
let mut probe = self.probe(hash);
loop {
let bucket = probe.next_probe();
let kv: KeyValue = self.table[bucket];
if kv.is_empty() {
// The key does not exist yet.
let val = updater(None);
let num_bytes = std::mem::size_of::<u16>() + key.len() + std::mem::size_of::<V>();
let key_addr = memory_arena.allocate_space(num_bytes);
let key_addr = self.memory_arena.allocate_space(num_bytes);
{
let data = memory_arena.slice_mut(key_addr, num_bytes);
let (key_len, data) = data.split_at_mut(2);
key_len.copy_from_slice(&(key.len() as u16).to_le_bytes());
let stop = key.len();
data[..key.len()].copy_from_slice(key);
let data = self.memory_arena.slice_mut(key_addr, num_bytes);
NativeEndian::write_u16(data, key.len() as u16);
let stop = 2 + key.len();
data[2..stop].copy_from_slice(key);
store(&mut data[stop..], val);
}
return self.set_bucket(hash, key_addr, bucket);
} else if kv.hash == hash {
if let Some(val_addr) =
self.get_value_addr_if_key_match(key, kv.key_value_addr, memory_arena)
{
let v = memory_arena.read(val_addr);
if let Some(val_addr) = self.get_value_addr_if_key_match(key, kv.key_value_addr) {
let v = self.memory_arena.read(val_addr);
let new_v = updater(Some(v));
memory_arena.write_at(val_addr, new_v);
self.memory_arena.write_at(val_addr, new_v);
return kv.unordered_term_id;
}
}
@@ -269,28 +254,26 @@ mod tests {
use std::collections::HashMap;
use super::{compute_previous_power_of_two, TermHashMap};
use crate::postings::stacker::MemoryArena;
#[test]
fn test_hash_map() {
let mut arena = MemoryArena::new();
let mut hash_map: TermHashMap = TermHashMap::new(1 << 18);
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
3u32
});
hash_map.mutate_or_create(b"abcd", &mut arena, |opt_val: Option<u32>| {
hash_map.mutate_or_create(b"abcd", |opt_val: Option<u32>| {
assert_eq!(opt_val, None);
4u32
});
hash_map.mutate_or_create(b"abc", &mut arena, |opt_val: Option<u32>| {
hash_map.mutate_or_create(b"abc", |opt_val: Option<u32>| {
assert_eq!(opt_val, Some(3u32));
5u32
});
let mut vanilla_hash_map = HashMap::new();
let iter_values = hash_map.iter(&arena);
let iter_values = hash_map.iter();
for (key, addr, _) in iter_values {
let val: u32 = arena.read(addr);
let val: u32 = hash_map.memory_arena.read(addr);
vanilla_hash_map.insert(key.to_owned(), val);
}
assert_eq!(vanilla_hash_map.len(), 2);

View File

@@ -247,7 +247,7 @@ impl MoreLikeThis {
let unix_timestamp = value
.as_date()
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
.into_unix_timestamp();
.to_unix_timestamp();
if !self.is_noise_word(unix_timestamp.to_string()) {
let term = Term::from_field_i64(field, unix_timestamp);
*term_frequencies.entry(term).or_insert(0) += 1;

View File

@@ -24,8 +24,8 @@ use crate::{DateTime, Score};
#[derive(Debug, PartialEq, Eq, Error)]
pub enum QueryParserError {
/// Error in the query syntax
#[error("Syntax Error: {0}")]
SyntaxError(String),
#[error("Syntax Error")]
SyntaxError,
/// This query is unsupported.
#[error("Unsupported query: {0}")]
UnsupportedQuery(String),
@@ -273,8 +273,8 @@ impl QueryParser {
/// Parse the user query into an AST.
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
let user_input_ast = tantivy_query_grammar::parse_query(query)
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
let user_input_ast =
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
self.compute_logical_ast(user_input_ast)
}
@@ -334,7 +334,7 @@ impl QueryParser {
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
Ok(Term::from_field_date(field, DateTime::new_utc(dt)))
}
FieldType::Str(ref str_options) => {
let option = str_options.get_indexing_options().ok_or_else(|| {
@@ -408,7 +408,7 @@ impl QueryParser {
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
let dt_term = Term::from_field_date(field, DateTime::new_utc(dt));
Ok(vec![LogicalLiteral::Term(dt_term)])
}
FieldType::Str(ref str_options) => {
@@ -547,7 +547,7 @@ impl QueryParser {
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
.collect();
if triplets.is_empty() {
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
return Err(QueryParserError::FieldDoesNotExist(field_name.to_string()));
}
Ok(triplets)
}
@@ -711,14 +711,14 @@ fn generate_literals_for_json_object(
json_term_writer.set_fast_value(f64_val);
}
NumValue::DateTime(dt_val) => {
json_term_writer.set_fast_value(DateTime::from_utc(dt_val));
json_term_writer.set_fast_value(DateTime::new_utc(dt_val));
}
}
logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
}
json_term_writer.close_path_and_set_type(Type::Str);
drop(json_term_writer);
let term_num_bytes = term.value_bytes().len();
let term_num_bytes = term.as_slice().len();
let mut token_stream = text_analyzer.token_stream(phrase);
let mut terms: Vec<(usize, Term)> = Vec::new();
token_stream.process(&mut |token| {
@@ -1220,11 +1220,9 @@ mod test {
#[test]
pub fn test_query_parser_field_does_not_exist() {
let query_parser = make_query_parser();
assert_eq!(
query_parser
.parse_query("boujou:\"18446744073709551615\"")
.unwrap_err(),
QueryParserError::FieldDoesNotExist("boujou".to_string())
assert_matches!(
query_parser.parse_query("boujou:\"18446744073709551615\""),
Err(QueryParserError::FieldDoesNotExist(_))
);
}

View File

@@ -93,7 +93,13 @@ impl FieldEntry {
/// Returns true if the field is an int (signed or unsigned) fast field
pub fn is_fast(&self) -> bool {
self.field_type.is_fast()
match self.field_type {
FieldType::U64(ref options)
| FieldType::I64(ref options)
| FieldType::Date(ref options)
| FieldType::F64(ref options) => options.is_fast(),
_ => false,
}
}
/// Returns true if the field is stored
@@ -138,8 +144,7 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
"stored": false
}
}"#;
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();

View File

@@ -185,20 +185,6 @@ impl FieldType {
}
}
/// returns true if the field is fast.
pub fn is_fast(&self) -> bool {
match *self {
FieldType::Bytes(ref bytes_options) => bytes_options.is_fast(),
FieldType::Str(ref text_options) => text_options.is_fast(),
FieldType::U64(ref int_options)
| FieldType::I64(ref int_options)
| FieldType::F64(ref int_options)
| FieldType::Date(ref int_options) => int_options.get_fastfield_cardinality().is_some(),
FieldType::Facet(_) => true,
FieldType::JsonObject(_) => false,
}
}
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
pub fn has_fieldnorms(&self) -> bool {
match *self {
@@ -268,7 +254,7 @@ impl FieldType {
expected: "rfc3339 format",
json: JsonValue::String(field_text),
})?;
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
Ok(DateTime::new_utc(dt_with_fixed_tz).into())
}
FieldType::Str(_) => Ok(Value::Str(field_text)),
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
@@ -388,7 +374,7 @@ mod tests {
let naive_date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
let naive_time = Time::from_hms(13, 20, 0).unwrap();
let date_time = PrimitiveDateTime::new(naive_date, naive_time);
doc.add_date(date_field, DateTime::from_primitive(date_time));
doc.add_date(date_field, DateTime::new_primitive(date_time));
let doc_json = schema.to_json(&doc);
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
}

View File

@@ -417,7 +417,6 @@ mod tests {
use std::collections::BTreeMap;
use matches::{assert_matches, matches};
use pretty_assertions::assert_eq;
use serde_json;
use crate::schema::field_type::ValueParsingError;
@@ -470,8 +469,7 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
"stored": false
}
},
{
@@ -483,8 +481,7 @@ mod tests {
"fieldnorms": false,
"tokenizer": "raw"
},
"stored": false,
"fast": false
"stored": false
}
},
{
@@ -787,8 +784,7 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
"stored": false
}
},
{
@@ -820,8 +816,7 @@ mod tests {
"fieldnorms": true,
"tokenizer": "raw"
},
"stored": true,
"fast": false
"stored": true
}
},
{
@@ -843,8 +838,7 @@ mod tests {
"fieldnorms": true,
"tokenizer": "default"
},
"stored": false,
"fast": false
"stored": false
}
},
{

View File

@@ -17,7 +17,7 @@ use crate::DateTime;
///
/// - <value> is, if this is not a JSON term, a binary representation specific to the type.
/// If it is a JSON term, then it is prepended with the path that leads to this leaf value.
const FAST_VALUE_TERM_LEN: usize = 8;
const FAST_VALUE_TERM_LEN: usize = 4 + 1 + 8;
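With the new layout, a fast-value term is 13 bytes: a 4-byte big-endian field id, a 1-byte type code, then the 8-byte big-endian value. A standalone sketch (the type code byte here is illustrative; the real codes come from `Type::to_code()`):

// Sketch of the term layout: [field_id: u32 be][type code: u8][value: u64 be].
fn encode_u64_term(field_id: u32, type_code: u8, val: u64) -> Vec<u8> {
    let mut data = Vec::with_capacity(4 + 1 + 8);
    data.extend_from_slice(&field_id.to_be_bytes());
    data.push(type_code);
    data.extend_from_slice(&val.to_be_bytes());
    data
}

fn main() {
    let term = encode_u64_term(1, b'u', 983); // b'u' stands in for the U64 code
    assert_eq!(term.len(), 13); // FAST_VALUE_TERM_LEN
    assert_eq!(&term[5..], &983u64.to_be_bytes()[..]);
}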
/// Separates the different segments of
/// the json path.
@@ -33,33 +33,22 @@ pub const JSON_END_OF_PATH: u8 = 0u8;
///
/// It actually wraps a `Vec<u8>`.
#[derive(Clone)]
pub struct Term<B = Vec<u8>> {
data: B,
field: Field,
field_type: Type,
}
pub struct Term<B = Vec<u8>>(B)
where B: AsRef<[u8]>;
impl AsMut<Vec<u8>> for Term {
fn as_mut(&mut self) -> &mut Vec<u8> {
&mut self.data
&mut self.0
}
}
impl Term {
pub(crate) fn new() -> Term {
Self::with_capacity(32)
}
pub(crate) fn with_capacity(cap: usize) -> Term {
Term {
data: Vec::with_capacity(cap),
field: Field::from_field_id(0),
field_type: Type::Str,
}
Term(Vec::with_capacity(100))
}
fn from_fast_value<T: FastValue>(field: Field, val: &T) -> Term {
let mut term = Term::with_capacity(FAST_VALUE_TERM_LEN);
let mut term = Term(vec![0u8; FAST_VALUE_TERM_LEN]);
term.set_field(T::to_type(), field);
term.set_u64(val.to_u64());
term
@@ -97,9 +86,9 @@ impl Term {
}
fn create_bytes_term(typ: Type, field: Field, bytes: &[u8]) -> Term {
let mut term = Term::with_capacity(bytes.len());
let mut term = Term(vec![0u8; 5 + bytes.len()]);
term.set_field(typ, field);
term.data.extend_from_slice(bytes);
term.0.extend_from_slice(bytes);
term
}
@@ -109,9 +98,10 @@ impl Term {
}
pub(crate) fn set_field(&mut self, typ: Type, field: Field) {
self.field = field;
self.field_type = typ;
self.data.clear();
self.0.clear();
self.0
.extend_from_slice(field.field_id().to_be_bytes().as_ref());
self.0.push(typ.to_code());
}
/// Sets a u64 value in the term.
@@ -122,9 +112,11 @@ impl Term {
/// the natural order of the values.
pub fn set_u64(&mut self, val: u64) {
self.set_fast_value(val);
self.set_bytes(val.to_be_bytes().as_ref());
}
fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.0.resize(FAST_VALUE_TERM_LEN, 0u8);
self.set_bytes(val.to_u64().to_be_bytes().as_ref());
}
@@ -145,8 +137,8 @@ impl Term {
/// Sets the value of a `Bytes` field.
pub fn set_bytes(&mut self, bytes: &[u8]) {
self.data.clear();
self.data.extend(bytes);
self.0.resize(5, 0u8);
self.0.extend(bytes);
}
/// Set the texts only, keeping the field untouched.
@@ -156,18 +148,18 @@ impl Term {
/// Removes the value_bytes and set the type code.
pub fn clear_with_type(&mut self, typ: Type) {
self.data.clear();
self.field_type = typ;
self.truncate(5);
self.0[4] = typ.to_code();
}
/// Truncates the term's byte buffer to the first `len` bytes.
pub fn truncate(&mut self, len: usize) {
self.data.truncate(len);
self.0.truncate(len);
}
/// Appends bytes to the end of the term.
pub fn append_bytes(&mut self, bytes: &[u8]) {
self.data.extend_from_slice(bytes);
self.0.extend_from_slice(bytes);
}
}
@@ -175,7 +167,7 @@ impl<B> Ord for Term<B>
where B: AsRef<[u8]>
{
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
self.value_bytes().cmp(other.value_bytes())
self.as_slice().cmp(other.as_slice())
}
}
@@ -191,7 +183,7 @@ impl<B> PartialEq for Term<B>
where B: AsRef<[u8]>
{
fn eq(&self, other: &Self) -> bool {
self.value_bytes() == other.value_bytes()
self.as_slice() == other.as_slice()
}
}
@@ -201,7 +193,7 @@ impl<B> Hash for Term<B>
where B: AsRef<[u8]>
{
fn hash<H: Hasher>(&self, state: &mut H) {
self.data.as_ref().hash(state)
self.0.as_ref().hash(state)
}
}
@@ -210,15 +202,14 @@ where B: AsRef<[u8]>
{
/// Wraps a object holding bytes
pub fn wrap(data: B) -> Term<B> {
Term {
data,
field: Field::from_field_id(0),
field_type: Type::Str,
}
Term(data)
}
fn typ_code(&self) -> u8 {
self.field_type as u8
*self
.as_slice()
.get(4)
.expect("the byte representation is too short")
}
/// Return the type of the term.
@@ -228,7 +219,55 @@ where B: AsRef<[u8]>
/// Returns the field.
pub fn field(&self) -> Field {
self.field
let mut field_id_bytes = [0u8; 4];
field_id_bytes.copy_from_slice(&self.0.as_ref()[..4]);
Field::from_field_id(u32::from_be_bytes(field_id_bytes))
}
/// Returns the `u64` value stored in a term.
///
/// Returns None if the term is not of the u64 type, or if the term byte representation
/// is invalid.
pub fn as_u64(&self) -> Option<u64> {
self.get_fast_type::<u64>()
}
fn get_fast_type<T: FastValue>(&self) -> Option<T> {
if self.typ() != T::to_type() {
return None;
}
let mut value_bytes = [0u8; 8];
let bytes = self.value_bytes();
if bytes.len() != 8 {
return None;
}
value_bytes.copy_from_slice(self.value_bytes());
let value_u64 = u64::from_be_bytes(value_bytes);
Some(FastValue::from_u64(value_u64))
}
/// Returns the `i64` value stored in a term.
///
/// Returns None if the term is not of the i64 type, or if the term byte representation
/// is invalid.
pub fn as_i64(&self) -> Option<i64> {
self.get_fast_type::<i64>()
}
/// Returns the `f64` value stored in a term.
///
/// Returns None if the term is not of the f64 type, or if the term byte representation
/// is invalid.
pub fn as_f64(&self) -> Option<f64> {
self.get_fast_type::<f64>()
}
/// Returns the `Date` value stored in a term.
///
/// Returns None if the term is not of the Date type, or if the term byte representation
/// is invalid.
pub fn as_date(&self) -> Option<DateTime> {
self.get_fast_type::<DateTime>()
}
/// Returns the text associated with the term.
@@ -236,12 +275,43 @@ where B: AsRef<[u8]>
/// Returns None if the field is not of string type
/// or if the bytes are not valid utf-8.
pub fn as_str(&self) -> Option<&str> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Str {
return None;
}
str::from_utf8(self.value_bytes()).ok()
}
/// Returns the facet associated with the term.
///
/// Returns None if the field is not of facet type
/// or if the bytes are not valid utf-8.
pub fn as_facet(&self) -> Option<Facet> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Facet {
return None;
}
let facet_encode_str = str::from_utf8(self.value_bytes()).ok()?;
Some(Facet::from_encoded_string(facet_encode_str.to_string()))
}
/// Returns the bytes associated with the term.
///
/// Returns None if the field is not of bytes type.
pub fn as_bytes(&self) -> Option<&[u8]> {
if self.as_slice().len() < 5 {
return None;
}
if self.typ() != Type::Bytes {
return None;
}
Some(self.value_bytes())
}
/// Returns the serialized value of the term.
/// (this does not include the field.)
///
@@ -249,7 +319,15 @@ where B: AsRef<[u8]>
/// If the term is a u64, its value is encoded according
/// to `byteorder::LittleEndian`.
pub fn value_bytes(&self) -> &[u8] {
&self.data.as_ref()
&self.0.as_ref()[5..]
}
/// Returns the underlying `&[u8]`.
///
/// Do NOT rely on this byte representation in the index.
/// This value is likely to change in the future.
pub(crate) fn as_slice(&self) -> &[u8] {
self.0.as_ref()
}
}
@@ -356,6 +434,7 @@ mod tests {
let term = Term::from_field_u64(count_field, 983u64);
assert_eq!(term.field(), count_field);
assert_eq!(term.typ(), Type::U64);
assert_eq!(term.value_bytes().len(), super::FAST_VALUE_TERM_LEN);
assert_eq!(term.as_slice().len(), super::FAST_VALUE_TERM_LEN);
assert_eq!(term.as_u64(), Some(983u64))
}
}

View File

@@ -3,7 +3,6 @@ use std::ops::BitOr;
use serde::{Deserialize, Serialize};
use super::flags::FastFlag;
use crate::schema::flags::{SchemaFlagList, StoredFlag};
use crate::schema::IndexRecordOption;
@@ -15,8 +14,6 @@ pub struct TextOptions {
indexing: Option<TextFieldIndexing>,
#[serde(default)]
stored: bool,
#[serde(default)]
fast: bool,
}
impl TextOptions {
@@ -30,25 +27,6 @@ impl TextOptions {
self.stored
}
/// Returns true iff the value is a fast field.
pub fn is_fast(&self) -> bool {
self.fast
}
/// Set the field as a fast field.
///
/// Fast fields are designed for random access.
/// Access times are similar to a random lookup in an array.
/// Text fast fields will have the term ids stored in the fast field.
/// The fast field will be a multivalued fast field.
///
/// The original text can be retrieved via `ord_to_term` from the dictionary.
#[must_use]
pub fn set_fast(mut self) -> TextOptions {
self.fast = true;
self
}
/// Sets the field as stored
#[must_use]
pub fn set_stored(mut self) -> TextOptions {
@@ -67,13 +45,9 @@ impl TextOptions {
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
struct TokenizerName(Cow<'static, str>);
const DEFAULT_TOKENIZER_NAME: &str = "default";
const NO_TOKENIZER_NAME: &str = "raw";
impl Default for TokenizerName {
fn default() -> Self {
TokenizerName::from_static(DEFAULT_TOKENIZER_NAME)
TokenizerName::from_static("default")
}
}
@@ -167,23 +141,21 @@ impl TextFieldIndexing {
/// The field will be untokenized and indexed.
pub const STRING: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: TokenizerName::from_static(NO_TOKENIZER_NAME),
tokenizer: TokenizerName::from_static("raw"),
fieldnorms: true,
record: IndexRecordOption::Basic,
}),
stored: false,
fast: false,
};
/// The field will be tokenized and indexed.
pub const TEXT: TextOptions = TextOptions {
indexing: Some(TextFieldIndexing {
tokenizer: TokenizerName::from_static(DEFAULT_TOKENIZER_NAME),
tokenizer: TokenizerName::from_static("default"),
fieldnorms: true,
record: IndexRecordOption::WithFreqsAndPositions,
}),
stored: false,
fast: false,
};
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
@@ -194,7 +166,6 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
TextOptions {
indexing: self.indexing.or(other.indexing),
stored: self.stored | other.stored,
fast: self.fast | other.fast,
}
}
}
@@ -210,17 +181,6 @@ impl From<StoredFlag> for TextOptions {
TextOptions {
indexing: None,
stored: true,
fast: false,
}
}
}
impl From<FastFlag> for TextOptions {
fn from(_: FastFlag) -> TextOptions {
TextOptions {
indexing: None,
stored: false,
fast: true,
}
}
}

View File

@@ -43,7 +43,7 @@ impl Serialize for Value {
Value::U64(u) => serializer.serialize_u64(u),
Value::I64(u) => serializer.serialize_i64(u),
Value::F64(u) => serializer.serialize_f64(u),
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.to_utc(), serializer),
Value::Facet(ref facet) => facet.serialize(serializer),
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
Value::JsonObject(ref obj) => obj.serialize(serializer),
@@ -409,12 +409,12 @@ mod tests {
#[test]
fn test_serialize_date() {
let value = Value::from(DateTime::from_utc(
let value = Value::from(DateTime::new_utc(
OffsetDateTime::parse("1996-12-20T00:39:57+00:00", &Rfc3339).unwrap(),
));
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57Z""#);
let value = Value::from(DateTime::from_utc(
let value = Value::from(DateTime::new_utc(
OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339).unwrap(),
));
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();