mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-02 15:22:55 +00:00
Compare commits
15 Commits
ip_fastfie
...
fastfield-
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
14d53851a8 | ||
|
|
2d176e66b6 | ||
|
|
838a332db0 | ||
|
|
defbd9139b | ||
|
|
0c87732459 | ||
|
|
4d66a3f0a0 | ||
|
|
977f01a8a3 | ||
|
|
c14bdd26d4 | ||
|
|
3272f80171 | ||
|
|
23d5ab5656 | ||
|
|
245ed5fed1 | ||
|
|
33bed01168 | ||
|
|
17a5f4f0ff | ||
|
|
c969582308 | ||
|
|
18d2ee5bb7 |
7
.github/workflows/coverage.yml
vendored
7
.github/workflows/coverage.yml
vendored
@@ -13,11 +13,12 @@ jobs:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install Rust
|
||||
run: rustup toolchain install nightly --component llvm-tools-preview
|
||||
- uses: taiki-e/install-action@cargo-llvm-cov
|
||||
- name: Install cargo-llvm-cov
|
||||
run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
|
||||
- name: Generate code coverage
|
||||
run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
|
||||
run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
|
||||
- name: Upload coverage to Codecov
|
||||
uses: codecov/codecov-action@v3
|
||||
uses: codecov/codecov-action@v2
|
||||
with:
|
||||
token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
|
||||
files: lcov.info
|
||||
|
||||
15
.github/workflows/long_running.yml
vendored
15
.github/workflows/long_running.yml
vendored
@@ -9,21 +9,16 @@ env:
|
||||
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
|
||||
|
||||
jobs:
|
||||
test:
|
||||
|
||||
functional_test_unsorted:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Install stable
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
components: rustfmt, clippy
|
||||
|
||||
- name: Run indexing_unsorted
|
||||
run: cargo test indexing_unsorted -- --ignored
|
||||
functional_test_sorted:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Run indexing_sorted
|
||||
run: cargo test indexing_sorted -- --ignored
|
||||
|
||||
|
||||
9
.github/workflows/test.yml
vendored
9
.github/workflows/test.yml
vendored
@@ -16,6 +16,8 @@ jobs:
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v3
|
||||
- name: Build
|
||||
run: cargo build --verbose --workspace
|
||||
- name: Install latest nightly to test also against unstable feature flag
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
@@ -23,18 +25,15 @@ jobs:
|
||||
override: true
|
||||
components: rustfmt
|
||||
|
||||
- name: Install stable
|
||||
- name: Install latest nightly to test also against unstable feature flag
|
||||
uses: actions-rs/toolchain@v1
|
||||
with:
|
||||
toolchain: stable
|
||||
override: true
|
||||
components: rustfmt, clippy
|
||||
|
||||
- name: Build
|
||||
run: cargo build --verbose --workspace
|
||||
|
||||
- name: Run tests
|
||||
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace
|
||||
run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace
|
||||
|
||||
- name: Run tests quickwit feature
|
||||
run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace
|
||||
|
||||
@@ -57,8 +57,8 @@ For a better idea of how indexing works, you may read the [following blog post](
|
||||
|
||||
Deletes happen by deleting a "term". Tantivy does not offer any notion of primary id, so it is up to the user to use a field in their schema as if it was a primary id, and delete the associated term if they want to delete only one specific document.
|
||||
|
||||
On commit, tantivy will find all of the segments with documents matching this existing term and remove from [alive bitset file](src/fastfield/alive_bitset.rs) that represents the bitset of the alive document ids.
|
||||
Like all segment files, this file is immutable. Because it is possible to have more than one alive bitset file at a given instant, the alive bitset filename has the format ``` segment_id . commit_opstamp . del```.
|
||||
On commit, tantivy will find all of the segments with documents matching this existing term and create a [tombstone file](src/fastfield/delete.rs) that represents the bitset of the document that are deleted.
|
||||
Like all segment files, this file is immutable. Because it is possible to have more than one tombstone file at a given instant, the tombstone filename has the format ``` segment_id . commit_opstamp . del```.
|
||||
|
||||
An opstamp is simply an incremental id that identifies any operation applied to the index. For instance, performing a commit or adding a document.
|
||||
|
||||
@@ -249,7 +249,7 @@ For instance, when the phrase query "the art of war" does not match "the war of
|
||||
To make it possible, it is possible to specify in the schema that a field should store positions in addition to being indexed.
|
||||
|
||||
The token positions of all of the terms are then stored in a separate file with the extension `.pos`.
|
||||
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
|
||||
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate throught the docset,
|
||||
we advance the position reader by the number of term frequencies of the current document.
|
||||
|
||||
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
|
||||
|
||||
17
CHANGELOG.md
17
CHANGELOG.md
@@ -1,11 +1,4 @@
|
||||
Tantivy 0.19
|
||||
================================
|
||||
- Updated [Date Field Type](https://github.com/quickwit-oss/tantivy/pull/1396)
|
||||
The `DateTime` type has been updated to hold timestamps with microseconds precision.
|
||||
`DateOptions` and `DatePrecision` have been added to configure Date fields. The precision is used to hint on fast values compression. Otherwise, seconds precision is used everywhere else (i.e terms, indexing).
|
||||
- Remove Searcher pool and make `Searcher` cloneable.
|
||||
|
||||
Tantivy 0.18
|
||||
Unreleased
|
||||
================================
|
||||
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
|
||||
- The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
|
||||
@@ -15,10 +8,6 @@ Tantivy 0.18
|
||||
- Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
|
||||
If this is not desired do the time zone conversion yourself and use `time::PrimitiveDateTime`
|
||||
directly instead.
|
||||
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
|
||||
- Add support for fastfield on text fields (@PSeitz)
|
||||
- Add terms aggregation (@PSeitz)
|
||||
- Add support for zstd compression (@kryesh)
|
||||
|
||||
Tantivy 0.17
|
||||
================================
|
||||
@@ -30,13 +19,13 @@ Tantivy 0.17
|
||||
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
|
||||
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
|
||||
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
|
||||
- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
|
||||
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
|
||||
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
|
||||
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)
|
||||
|
||||
Tantivy 0.16.2
|
||||
================================
|
||||
- Bugfix in FuzzyTermQuery. (transposition_cost_one was not doing anything)
|
||||
- Bugfix in FuzzyTermQuery. (tranposition_cost_one was not doing anything)
|
||||
|
||||
Tantivy 0.16.1
|
||||
========================
|
||||
|
||||
101
Cargo.toml
101
Cargo.toml
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.18.0"
|
||||
version = "0.17.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -10,75 +10,71 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
oneshot = "0.1.3"
|
||||
base64 = "0.13.0"
|
||||
oneshot = "0.1"
|
||||
base64 = "0.13"
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.3.2"
|
||||
once_cell = "1.10.0"
|
||||
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
|
||||
tantivy-fst = "0.3.0"
|
||||
memmap2 = { version = "0.5.3", optional = true }
|
||||
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3.4", optional = true }
|
||||
zstd = { version = "0.11", optional = true }
|
||||
crc32fast = "1.2.1"
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
tantivy-fst = "0.3"
|
||||
memmap2 = {version = "0.5", optional=true}
|
||||
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3", optional = true }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.3.0", optional = true }
|
||||
log = "0.4.16"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = "1.0.79"
|
||||
num_cpus = "1.13.1"
|
||||
tempfile = { version = "3.2", optional = true }
|
||||
log = "0.4.14"
|
||||
serde = { version = "1.0.126", features = ["derive"] }
|
||||
serde_json = "1.0.64"
|
||||
num_cpus = "1.13"
|
||||
fs2={ version = "0.4.3", optional = true }
|
||||
levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
crossbeam-channel = "0.5.4"
|
||||
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
|
||||
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.3", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2.0"
|
||||
rust-stemmers = "1.2.0"
|
||||
downcast-rs = "1.2.0"
|
||||
levenshtein_automata = "0.2"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8.1"
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version = "0.2", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.2", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
|
||||
census = "0.4.0"
|
||||
census = "0.4"
|
||||
fnv = "1.0.7"
|
||||
thiserror = "1.0.30"
|
||||
thiserror = "1.0.24"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.5.0"
|
||||
murmurhash32 = "0.2.0"
|
||||
time = { version = "0.3.10", features = ["serde-well-known"] }
|
||||
smallvec = "1.8.0"
|
||||
rayon = "1.5.2"
|
||||
lru = "0.7.5"
|
||||
fastdivide = "0.4.0"
|
||||
itertools = "0.10.3"
|
||||
measure_time = "0.8.2"
|
||||
pretty_assertions = "1.2.1"
|
||||
serde_cbor = { version = "0.11.2", optional = true }
|
||||
async-trait = "0.1.53"
|
||||
arc-swap = "1.5.0"
|
||||
gcd = "2.1.0"
|
||||
roaring = "0.9.0"
|
||||
fail = "0.5"
|
||||
murmurhash32 = "0.2"
|
||||
time = { version = "0.3.7", features = ["serde-well-known"] }
|
||||
smallvec = "1.6.1"
|
||||
rayon = "1.5"
|
||||
lru = "0.7.0"
|
||||
fastdivide = "0.4"
|
||||
itertools = "0.10.0"
|
||||
measure_time = "0.8.0"
|
||||
pretty_assertions = "1.1.0"
|
||||
serde_cbor = {version="0.11", optional=true}
|
||||
async-trait = "0.1"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8.5"
|
||||
rand = "0.8.3"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
proptest = "1.0.0"
|
||||
matches = "0.1.8"
|
||||
proptest = "1.0"
|
||||
criterion = "0.3.5"
|
||||
test-log = "0.2.10"
|
||||
test-log = "0.2.8"
|
||||
env_logger = "0.9.0"
|
||||
pprof = { version = "0.10.0", features = ["flamegraph", "criterion"] }
|
||||
futures = "0.3.21"
|
||||
pprof = {version= "0.7", features=["flamegraph", "criterion"]}
|
||||
futures = "0.3.15"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.5.0"
|
||||
version = "0.5"
|
||||
features = ["failpoints"]
|
||||
|
||||
[profile.release]
|
||||
@@ -97,7 +93,6 @@ mmap = ["fs2", "tempfile", "memmap2"]
|
||||
brotli-compression = ["brotli"]
|
||||
lz4-compression = ["lz4_flex"]
|
||||
snappy-compression = ["snap"]
|
||||
zstd-compression = ["zstd"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
|
||||
22
README.md
22
README.md
@@ -128,13 +128,10 @@ $ gdb run
|
||||
# Companies Using Tantivy
|
||||
|
||||
<p align="left">
|
||||
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
|
||||
</p>
|
||||
<img align="center" src="doc/assets/images/Nuclia.png" alt="Nuclia" height="25" width="auto" />
|
||||
<img align="center" src="doc/assets/images/humanfirst.png" alt="Humanfirst.ai" height="30" width="auto" />
|
||||
<img align="center" src="doc/assets/images/element.io.svg" alt="Element.io" height="25" width="auto" />
|
||||
</p>
|
||||
|
||||
|
||||
# FAQ
|
||||
@@ -152,13 +149,4 @@ You can also find other bindings on [GitHub](https://github.com/search?q=tantivy
|
||||
- and [more](https://github.com/search?q=tantivy)!
|
||||
|
||||
### On average, how much faster is Tantivy compared to Lucene?
|
||||
- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
|
||||
|
||||
### Does tantivy support incremental indexing?
|
||||
- Yes.
|
||||
|
||||
### How can I edit documents?
|
||||
- Data in tantivy is immutable. To edit a document, the document needs to be deleted and reindexed.
|
||||
|
||||
### When will my documents be searchable during indexing?
|
||||
- Documents will be searchable after a `commit` is called on an `IndexWriter`. Existing `IndexReader`s will also need to be reloaded in order to reflect the changes. Finally, changes are only visible to newly acquired `Searcher`.
|
||||
- According to our [search latency benchmark](https://tantivy-search.github.io/bench/), Tantivy is approximately 2x faster than Lucene.
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
name = "tantivy-bitpacker"
|
||||
version = "0.2.0"
|
||||
edition = "2021"
|
||||
version = "0.1.1"
|
||||
edition = "2018"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = []
|
||||
|
||||
@@ -1,16 +1,16 @@
|
||||
[package]
|
||||
name = "tantivy-common"
|
||||
version = "0.3.0"
|
||||
version = "0.2.0"
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
edition = "2018"
|
||||
description = "common traits and utility functions used by multiple tantivy subcrates"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
byteorder = "1.4.3"
|
||||
ownedbytes = { version="0.3", path="../ownedbytes" }
|
||||
ownedbytes = { version="0.2", path="../ownedbytes" }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.0.0"
|
||||
|
||||
@@ -11,10 +11,7 @@ mod writer;
|
||||
|
||||
pub use bitset::*;
|
||||
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
|
||||
pub use vint::{
|
||||
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
|
||||
serialize_vint_u32, write_u32_vint, VInt,
|
||||
};
|
||||
pub use vint::{read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt};
|
||||
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
|
||||
|
||||
/// Has length trait
|
||||
@@ -107,6 +104,8 @@ pub fn u64_to_f64(val: u64) -> f64 {
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
|
||||
use std::f64;
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
|
||||
@@ -136,11 +135,11 @@ pub mod test {
|
||||
|
||||
#[test]
|
||||
fn test_i64_converter() {
|
||||
assert_eq!(i64_to_u64(i64::MIN), u64::MIN);
|
||||
assert_eq!(i64_to_u64(i64::MAX), u64::MAX);
|
||||
assert_eq!(i64_to_u64(i64::min_value()), u64::min_value());
|
||||
assert_eq!(i64_to_u64(i64::max_value()), u64::max_value());
|
||||
test_i64_converter_helper(0i64);
|
||||
test_i64_converter_helper(i64::MIN);
|
||||
test_i64_converter_helper(i64::MAX);
|
||||
test_i64_converter_helper(i64::min_value());
|
||||
test_i64_converter_helper(i64::max_value());
|
||||
for i in -1000i64..1000i64 {
|
||||
test_i64_converter_helper(i);
|
||||
}
|
||||
|
||||
@@ -229,7 +229,7 @@ pub mod test {
|
||||
fixed_size_test::<u32>();
|
||||
assert_eq!(4, serialize_test(3u32));
|
||||
assert_eq!(4, serialize_test(5u32));
|
||||
assert_eq!(4, serialize_test(u32::MAX));
|
||||
assert_eq!(4, serialize_test(u32::max_value()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -247,11 +247,6 @@ pub mod test {
|
||||
fixed_size_test::<u64>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_bool() {
|
||||
fixed_size_test::<bool>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_string() {
|
||||
assert_eq!(serialize_test(String::from("")), 1);
|
||||
@@ -277,6 +272,6 @@ pub mod test {
|
||||
assert_eq!(serialize_test(VInt(1234u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_383u64)), 2);
|
||||
assert_eq!(serialize_test(VInt(16_384u64)), 3);
|
||||
assert_eq!(serialize_test(VInt(u64::MAX)), 10);
|
||||
assert_eq!(serialize_test(VInt(u64::max_value())), 10);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,40 +5,6 @@ use byteorder::{ByteOrder, LittleEndian};
|
||||
|
||||
use super::BinarySerializable;
|
||||
|
||||
/// Variable int serializes a u128 number
|
||||
pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
|
||||
loop {
|
||||
let next_byte: u8 = (val % 128u128) as u8;
|
||||
val /= 128u128;
|
||||
if val == 0 {
|
||||
output.push(next_byte | STOP_BIT);
|
||||
return;
|
||||
} else {
|
||||
output.push(next_byte);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Deserializes a u128 number
|
||||
///
|
||||
/// Returns the number and the slice after the vint
|
||||
pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
|
||||
let mut result = 0u128;
|
||||
let mut shift = 0u64;
|
||||
for i in 0..19 {
|
||||
let b = data[i];
|
||||
result |= u128::from(b % 128u8) << shift;
|
||||
if b >= STOP_BIT {
|
||||
return Ok((result, &data[i + 1..]));
|
||||
}
|
||||
shift += 7;
|
||||
}
|
||||
Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"Failed to deserialize u128 vint",
|
||||
))
|
||||
}
|
||||
|
||||
/// Wrapper over a `u64` that serializes as a variable int.
|
||||
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
|
||||
pub struct VInt(pub u64);
|
||||
@@ -210,7 +176,6 @@ impl BinarySerializable for VInt {
|
||||
mod tests {
|
||||
|
||||
use super::{serialize_vint_u32, BinarySerializable, VInt};
|
||||
use crate::vint::{deserialize_vint_u128, serialize_vint_u128};
|
||||
|
||||
fn aux_test_vint(val: u64) {
|
||||
let mut v = [14u8; 10];
|
||||
@@ -234,7 +199,7 @@ mod tests {
|
||||
aux_test_vint(0);
|
||||
aux_test_vint(1);
|
||||
aux_test_vint(5);
|
||||
aux_test_vint(u64::MAX);
|
||||
aux_test_vint(u64::max_value());
|
||||
for i in 1..9 {
|
||||
let power_of_128 = 1u64 << (7 * i);
|
||||
aux_test_vint(power_of_128 - 1u64);
|
||||
@@ -252,21 +217,6 @@ mod tests {
|
||||
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
|
||||
}
|
||||
|
||||
fn aux_test_vint_u128(val: u128) {
|
||||
let mut data = vec![];
|
||||
serialize_vint_u128(val, &mut data);
|
||||
let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
|
||||
assert_eq!(val, deser_val);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vint_u128() {
|
||||
aux_test_vint_u128(0);
|
||||
aux_test_vint_u128(1);
|
||||
aux_test_vint_u128(u128::MAX / 3);
|
||||
aux_test_vint_u128(u128::MAX);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_vint_u32() {
|
||||
aux_test_serialize_vint_u32(0);
|
||||
@@ -278,6 +228,6 @@ mod tests {
|
||||
aux_test_serialize_vint_u32(power_of_128);
|
||||
aux_test_serialize_vint_u32(power_of_128 + 1u32);
|
||||
}
|
||||
aux_test_serialize_vint_u32(u32::MAX);
|
||||
aux_test_serialize_vint_u32(u32::max_value());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -62,7 +62,7 @@ impl<W: TerminatingWrite> TerminatingWrite for CountingWriter<W> {
|
||||
pub struct AntiCallToken(());
|
||||
|
||||
/// Trait used to indicate when no more write need to be done on a writer
|
||||
pub trait TerminatingWrite: Write + Send {
|
||||
pub trait TerminatingWrite: Write {
|
||||
/// Indicate that the writer will no longer be used. Internally call terminate_ref.
|
||||
fn terminate(mut self) -> io::Result<()>
|
||||
where Self: Sized {
|
||||
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 56 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 23 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 7.8 KiB |
@@ -38,7 +38,7 @@ Note: Tantivy 0.16 does not do this optimization yet.
|
||||
In principle there are many algorithms possible that exploit the monotonically increasing nature. (aggregations maybe?)
|
||||
|
||||
## Usage
|
||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
|
||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantvy 0.16 only fast fields are allowed to be used.
|
||||
|
||||
```
|
||||
let settings = IndexSettings {
|
||||
|
||||
@@ -86,7 +86,7 @@ If one more json field is defined, things get even more complicated.
|
||||
## Default json field
|
||||
|
||||
If the schema contains a text field called "text" and a json field that is set as a default field:
|
||||
`text:hello` could be reasonably interpreted as targeting the text field or as targeting the json field called `json_dynamic` with the json_path "text".
|
||||
`text:hello` could be reasonably interpreted as targetting the text field or as targetting the json field called `json_dynamic` with the json_path "text".
|
||||
|
||||
If there is such an ambiguity, we decide to only search in the "text" field: `text:hello`.
|
||||
|
||||
|
||||
@@ -110,7 +110,6 @@ fn main() -> tantivy::Result<()> {
|
||||
(9f64..14f64).into(),
|
||||
(14f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req_1.clone(),
|
||||
}),
|
||||
@@ -118,12 +117,12 @@ fn main() -> tantivy::Result<()> {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::to_value(&agg_res)?;
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
println!("{}", serde_json::to_string_pretty(&res)?);
|
||||
|
||||
Ok(())
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
// # DateTime field example
|
||||
//
|
||||
// This example shows how the DateTime field can be used
|
||||
|
||||
use tantivy::collector::TopDocs;
|
||||
use tantivy::query::QueryParser;
|
||||
use tantivy::schema::{Cardinality, DateOptions, Schema, Value, INDEXED, STORED, STRING};
|
||||
use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
let mut schema_builder = Schema::builder();
|
||||
let opts = DateOptions::from(INDEXED)
|
||||
.set_stored()
|
||||
.set_fast(Cardinality::SingleValue)
|
||||
.set_precision(tantivy::DatePrecision::Seconds);
|
||||
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
|
||||
let event_type = schema_builder.add_text_field("event", STRING | STORED);
|
||||
let schema = schema_builder.build();
|
||||
|
||||
// # Indexing documents
|
||||
let index = Index::create_in_ram(schema.clone());
|
||||
|
||||
let mut index_writer = index.writer(50_000_000)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"occurred_at": "2022-06-22T12:53:50.53Z",
|
||||
"event": "pull-request"
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
let doc = schema.parse_document(
|
||||
r#"{
|
||||
"occurred_at": "2022-06-22T13:00:00.22Z",
|
||||
"event": "comment"
|
||||
}"#,
|
||||
)?;
|
||||
index_writer.add_document(doc)?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// # Default fields: event_type
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type]);
|
||||
{
|
||||
let query = query_parser.parse_query("event:comment")?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(5))?;
|
||||
assert_eq!(count_docs.len(), 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser
|
||||
.parse_query(r#"occurred_at:[2022-06-22T12:58:00Z TO 2022-06-23T00:00:00Z}"#)?;
|
||||
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
|
||||
assert_eq!(count_docs.len(), 1);
|
||||
for (_score, doc_address) in count_docs {
|
||||
let retrieved_doc = searcher.doc(doc_address)?;
|
||||
assert!(matches!(
|
||||
retrieved_doc.get_first(occurred_at),
|
||||
Some(Value::Date(_))
|
||||
));
|
||||
assert_eq!(
|
||||
schema.to_json(&retrieved_doc),
|
||||
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
@@ -1,8 +1,7 @@
|
||||
// # Json field example
|
||||
//
|
||||
// This example shows how the json field can be used
|
||||
// to make tantivy partially schemaless by setting it as
|
||||
// default query parser field.
|
||||
// to make tantivy partially schemaless.
|
||||
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
@@ -11,6 +10,10 @@ use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// We need two fields:
|
||||
// - a timestamp
|
||||
// - a json object field
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_date_field("timestamp", FAST | STORED);
|
||||
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
|
||||
@@ -40,8 +43,7 @@ fn main() -> tantivy::Result<()> {
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 133},
|
||||
"description": "das keyboard",
|
||||
"event_type": "holiday-sale"
|
||||
"description": "das keyboard"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
@@ -51,9 +53,6 @@ fn main() -> tantivy::Result<()> {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// # Default fields: event_type and attributes
|
||||
// By setting attributes as a default field it allows omitting attributes itself, e.g. "target",
|
||||
// instead of "attributes.target"
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit-button")?;
|
||||
@@ -71,34 +70,10 @@ fn main() -> tantivy::Result<()> {
|
||||
assert_eq!(count_docs, 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("click AND cart.product_id:133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
{
|
||||
// The sub-fields in the json field marked as default field still need to be explicitly
|
||||
// addressed
|
||||
let query = query_parser.parse_query("click AND 133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
{
|
||||
// Default json fields are ignored if they collide with the schema
|
||||
let query = query_parser.parse_query("event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
// # Query via full attribute path
|
||||
{
|
||||
// This only searches in our schema's `event_type` field
|
||||
let query = query_parser.parse_query("event_type:click")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 2);
|
||||
}
|
||||
{
|
||||
// Default json fields can still be accessed by full path
|
||||
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
let query = query_parser
|
||||
.parse_query("event_type:click AND cart.product_id:133")
|
||||
.unwrap();
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
Ok(())
|
||||
|
||||
@@ -145,7 +145,11 @@ fn main() -> tantivy::Result<()> {
|
||||
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
|
||||
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
|
||||
)];
|
||||
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
|
||||
let reader: IndexReader = index
|
||||
.reader_builder()
|
||||
.warmers(warmers)
|
||||
.num_searchers(1)
|
||||
.try_into()?;
|
||||
reader.reload()?;
|
||||
|
||||
let query_parser = QueryParser::for_index(&index, vec![text]);
|
||||
|
||||
1
fastfield_codecs/.gitignore
vendored
Normal file
1
fastfield_codecs/.gitignore
vendored
Normal file
@@ -0,0 +1 @@
|
||||
datasets/
|
||||
@@ -1,27 +1,22 @@
|
||||
[package]
|
||||
name = "fastfield_codecs"
|
||||
version = "0.2.0"
|
||||
version = "0.1.0"
|
||||
authors = ["Pascal Seitz <pascal@quickwit.io>"]
|
||||
license = "MIT"
|
||||
edition = "2021"
|
||||
edition = "2018"
|
||||
description = "Fast field codecs used by tantivy"
|
||||
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
[dependencies]
|
||||
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
|
||||
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
|
||||
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
|
||||
prettytable-rs = {version="0.8.0", optional= true}
|
||||
rand = { version="0.8.3", optional= true}
|
||||
itertools = { version="0.10.3", optional=true}
|
||||
measure_time = { version="0.8.2", optional=true}
|
||||
rand = {version="0.8.3", optional= true}
|
||||
|
||||
[dev-dependencies]
|
||||
more-asserts = "0.3.0"
|
||||
proptest = "1.0.0"
|
||||
more-asserts = "0.2.1"
|
||||
rand = "0.8.3"
|
||||
|
||||
[features]
|
||||
bin = ["prettytable-rs", "rand", "itertools", "measure_time"]
|
||||
unstable = [] # useful for benches and experimental codecs.
|
||||
bin = ["prettytable-rs", "rand"]
|
||||
default = ["bin"]
|
||||
|
||||
|
||||
6
fastfield_codecs/Makefile
Normal file
6
fastfield_codecs/Makefile
Normal file
@@ -0,0 +1,6 @@
|
||||
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids nooc_temperatures
|
||||
download:
|
||||
@echo "--- Downloading datasets ---"
|
||||
mkdir -p datasets
|
||||
@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done
|
||||
|
||||
@@ -13,6 +13,10 @@ A codec needs to implement 2 traits:
|
||||
- A reader implementing `FastFieldCodecReader` to read the codec.
|
||||
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.
|
||||
|
||||
### Download real world datasets for codecs comparison
|
||||
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
|
||||
To run with the unstable codecs, execute `cargo run --features unstable`.
|
||||
|
||||
### Tests
|
||||
|
||||
Once the traits are implemented, test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
|
||||
@@ -23,46 +27,101 @@ cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Autoincrement | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.0051544965 | 0.17251475 | 960 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.118189104 | 0.14172314 | 708 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 474 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing concave | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.005955 | 0.18813984 | 885 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.16113 | 0.15734828 | 704 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 478 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing convex | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.00613 | 0.20376484 | 889 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.157175 | 0.17297328 | 706 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 471 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Almost monotonically increasing | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14549863 | 0.17251475 | 923 | 210 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.14943957 | 0.15734814 | 703 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 462 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Random | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14533783 | 0.14126475 | 924 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.13381402 | 0.15734814 | 695 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.12501445 | 0.125 | 422 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.39826187 | 0.4068908 | 5545 | 1086 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.39214826 | 0.40734857 | 5082 | 1073 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2864 | 567 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.032736875 | 0.094390824 | 4942 | 1067 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.02667125 | 0.079223566 | 3626 | 994 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2493 | 566 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HTTP logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.047942877 | 0.20376582 | 5121 | 1065 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.06637425 | 0.18859856 | 3929 | 1093 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.26562786 | 0.265625 | 2221 | 526 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.41900787 | 0.4225158 | 5239 | 1089 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.41504425 | 0.43859857 | 4158 | 1052 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2603 | 513 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.18364687 | 0.25064084 | 5036 | 990 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.21239226 | 0.21984856 | 4087 | 1072 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2702 | 525 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Temperatures | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | | Codec Disabled | 0 | 0 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 1.0088086 | 1.001098 | 1306 | 237 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 1.000012 | 1 | 950 | 108 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
|
||||
```
|
||||
|
||||
@@ -5,11 +5,8 @@ extern crate test;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::*;
|
||||
|
||||
@@ -70,14 +67,9 @@ mod tests {
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
@@ -85,16 +77,9 @@ mod tests {
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(b, &data);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
|
||||
@@ -14,7 +14,7 @@ pub struct BitpackedFastFieldReader {
|
||||
pub max_value_u64: u64,
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
impl<'data> FastFieldCodecReader for BitpackedFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let (_data, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
@@ -107,7 +107,7 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
/// values.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
@@ -128,7 +128,10 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
|
||||
) -> bool {
|
||||
true
|
||||
}
|
||||
fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let amplitude = stats.max_value - stats.min_value;
|
||||
let num_bits = compute_num_bits(amplitude);
|
||||
let num_bits_uncompressed = 64;
|
||||
|
||||
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
272
fastfield_codecs/src/frame_of_reference.rs
Normal file
@@ -0,0 +1,272 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 128;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FORFastFieldReader {
|
||||
num_vals: u64,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
min: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
self.metadata.min + diff
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.min.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let min = u64::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self { min, num_bits })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FORFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for FORFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for FORFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = FORFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
num_vals: footer.num_vals,
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Frame-of-reference serializer: values are split into blocks of BLOCK_SIZE elements, and each block stores its minimum plus the bit-packed offsets from that minimum.
|
||||
pub struct FORFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for FORFastFieldSerializer {
|
||||
const NAME: &'static str = "FOR";
|
||||
const ID: u8 = 5;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let mut min = block_values[0];
|
||||
let mut max = block_values[0];
|
||||
for &current_value in block_values[1..].iter() {
|
||||
min = min.min(current_value);
|
||||
max = max.max(current_value);
|
||||
}
|
||||
let num_bits = compute_num_bits(max - min);
|
||||
for current_value in block_values.iter() {
|
||||
bit_packer.write(current_value - min, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata { min, num_bits });
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = FORFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
stats.num_vals > BLOCK_SIZE
|
||||
}
|
||||
|
||||
/// Estimate the compression ratio by computing the ratio of the first block.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let max_distance = (0..last_elem_in_first_chunk)
|
||||
.into_iter()
|
||||
.map(|pos| {
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
actual_value - stats.min_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and multiply by a magic number 3 to select this codec
|
||||
// when we are almost sure that this is relevant.
|
||||
let relative_max_value = max_distance as f32 * 3.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 9 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<FORFastFieldSerializer, FORFastFieldReader>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
println!("{}", actual_compression);
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(actual_compression > 0.006);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,729 +0,0 @@
|
||||
/// This codec takes a large number space (u128) and reduces it to a compact number space.
|
||||
///
|
||||
/// It will find spaces in the number range. For example:
|
||||
///
|
||||
/// 100, 101, 102, 103, 104, 50000, 50001
|
||||
/// could be mapped to
|
||||
/// 100..104 -> 0..4
|
||||
/// 50000..50001 -> 5..6
|
||||
///
|
||||
/// Compact space 0..6 requires much less bits than 100..50001
|
||||
///
|
||||
/// The codec is created to compress ip addresses, but may be employed in other use cases.
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
collections::BinaryHeap,
|
||||
io::{self, Write},
|
||||
net::{IpAddr, Ipv6Addr},
|
||||
ops::RangeInclusive,
|
||||
};
|
||||
|
||||
use common::{deserialize_vint_u128, serialize_vint_u128};
|
||||
use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::FastFieldCodecReaderU128;
|
||||
|
||||
pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
|
||||
let ip_addr_v6: Ipv6Addr = match ip_addr {
|
||||
IpAddr::V4(v4) => v4.to_ipv6_mapped(),
|
||||
IpAddr::V6(v6) => v6,
|
||||
};
|
||||
u128::from_be_bytes(ip_addr_v6.octets())
|
||||
}
|
||||
|
||||
const INTERVAL_COST_IN_BITS: usize = 64;
|
||||
|
||||
#[derive(Default, Debug)]
|
||||
pub struct IntervalEncoding();
|
||||
|
||||
pub struct IntervalCompressor {
|
||||
pub null_value: u128,
|
||||
min_value: u128,
|
||||
max_value: u128,
|
||||
compact_space: CompactSpace,
|
||||
pub num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Debug, Eq, PartialEq)]
|
||||
struct DeltaAndPos {
|
||||
delta: u128,
|
||||
pos: usize,
|
||||
}
|
||||
impl DeltaAndPos {
|
||||
fn new(ip: u128, pos: usize) -> Self {
|
||||
DeltaAndPos { delta: ip, pos }
|
||||
}
|
||||
}
|
||||
|
||||
impl Ord for DeltaAndPos {
|
||||
fn cmp(&self, other: &Self) -> std::cmp::Ordering {
|
||||
self.delta.cmp(&other.delta)
|
||||
}
|
||||
}
|
||||
impl PartialOrd for DeltaAndPos {
|
||||
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
|
||||
self.delta.partial_cmp(&other.delta)
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_delta_and_pos_sort() {
|
||||
let mut deltas: BinaryHeap<DeltaAndPos> = BinaryHeap::new();
|
||||
deltas.push(DeltaAndPos::new(10, 1));
|
||||
deltas.push(DeltaAndPos::new(100, 10));
|
||||
deltas.push(DeltaAndPos::new(1, 10));
|
||||
assert_eq!(deltas.pop().unwrap().delta, 100);
|
||||
assert_eq!(deltas.pop().unwrap().delta, 10);
|
||||
}
|
||||
|
||||
/// Put the deltas for the sorted ip addresses into a binary heap
|
||||
fn get_deltas(ip_addrs_sorted: &[u128]) -> BinaryHeap<DeltaAndPos> {
|
||||
let mut prev_opt = None;
|
||||
let mut deltas: BinaryHeap<DeltaAndPos> = BinaryHeap::new();
|
||||
for (pos, ip_addr) in ip_addrs_sorted.iter().cloned().enumerate() {
|
||||
let delta = if let Some(prev) = prev_opt {
|
||||
ip_addr - prev
|
||||
} else {
|
||||
ip_addr + 1
|
||||
};
|
||||
// skip too small deltas
|
||||
if delta > 2 {
|
||||
deltas.push(DeltaAndPos::new(delta, pos));
|
||||
}
|
||||
prev_opt = Some(ip_addr);
|
||||
}
|
||||
deltas
|
||||
}
|
||||
|
||||
/// Will collect blanks and add them to compact space if it will affect the number of bits used on
|
||||
/// the compact space.
|
||||
fn get_compact_space(ip_addrs_sorted: &[u128], cost_per_interval: usize) -> CompactSpace {
|
||||
let max_val = *ip_addrs_sorted.last().unwrap_or(&0u128) + 1;
|
||||
let mut deltas = get_deltas(ip_addrs_sorted);
|
||||
let mut amplitude_compact_space = max_val;
|
||||
let mut amplitude_bits: u8 = (amplitude_compact_space as f64).log2().ceil() as u8;
|
||||
let mut staged_blanks = vec![];
|
||||
|
||||
let mut compact_space = CompactSpaceBuilder::new();
|
||||
|
||||
// We will stage blanks until they reduce the compact space by 1 bit.
|
||||
// Binary heap to process the gaps by their size
|
||||
while let Some(ip_addr_and_pos) = deltas.pop() {
|
||||
let delta = ip_addr_and_pos.delta;
|
||||
let pos = ip_addr_and_pos.pos;
|
||||
staged_blanks.push((delta, pos));
|
||||
let staged_spaces_sum: u128 = staged_blanks.iter().map(|(delta, _)| delta - 1).sum();
|
||||
// +1 for later added null value
|
||||
let amplitude_new_compact_space = amplitude_compact_space - staged_spaces_sum + 1;
|
||||
let amplitude_new_bits = (amplitude_new_compact_space as f64).log2().ceil() as u8;
|
||||
if amplitude_bits == amplitude_new_bits {
|
||||
continue;
|
||||
}
|
||||
let saved_bits = (amplitude_bits - amplitude_new_bits) as usize * ip_addrs_sorted.len();
|
||||
let cost = staged_blanks.len() * cost_per_interval;
|
||||
if cost >= saved_bits {
|
||||
// Continue here, since although we walk over the deltas by size,
|
||||
// we can potentially save a lot at the last bits, which are smaller deltas
|
||||
//
|
||||
// E.g. if the first range reduces the compact space by 1000 from 2000 to 1000, which
|
||||
// saves 11-10=1 bit and the next range reduces the compact space by 950 to
|
||||
// 50, which saves 10-6=4 bit
|
||||
continue;
|
||||
}
|
||||
|
||||
amplitude_compact_space = amplitude_new_compact_space;
|
||||
amplitude_bits = amplitude_new_bits;
|
||||
for (_, pos) in staged_blanks.drain(..) {
|
||||
let ip_addr = ip_addrs_sorted[pos];
|
||||
if pos == 0 {
|
||||
compact_space.add_hole(0..=ip_addr - 1);
|
||||
} else {
|
||||
compact_space.add_hole(ip_addrs_sorted[pos - 1] + 1..=ip_addr - 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
compact_space.add_hole(max_val..=u128::MAX);
|
||||
|
||||
compact_space.finish()
|
||||
}
|
||||
|
||||
#[test]
fn compact_space_test() {
    // Small gaps between the values are not turned into holes here.
    let ips: Vec<u128> = vec![
        2, 4, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
    ];
    let compact_space = get_compact_space(&ips, 11);

    assert_eq!(compact_space.null_value, 5);
    assert_eq!(compact_space.amplitude_compact_space(), 20);
    assert_eq!(2, compact_space.to_compact(2).unwrap());

    // 100 is not part of the compact space, so the lookup reports an error position.
    assert_eq!(compact_space.to_compact(100).unwrap_err(), 0);
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct CompactSpaceBuilder {
|
||||
covered_space: Vec<std::ops::RangeInclusive<u128>>,
|
||||
}
|
||||
|
||||
impl CompactSpaceBuilder {
|
||||
fn new() -> Self {
|
||||
Self {
|
||||
covered_space: vec![0..=u128::MAX],
|
||||
}
|
||||
}
|
||||
|
||||
// Will extend the first range and add a null value to it.
|
||||
fn assign_and_return_null(&mut self) -> u128 {
|
||||
self.covered_space[0] = *self.covered_space[0].start()..=*self.covered_space[0].end() + 1;
|
||||
*self.covered_space[0].end()
|
||||
}
|
||||
|
||||
// Assumes that repeated add_hole calls don't overlap.
|
||||
fn add_hole(&mut self, hole: std::ops::RangeInclusive<u128>) {
|
||||
let position = self
|
||||
.covered_space
|
||||
.iter()
|
||||
.position(|range| range.start() <= hole.start() && range.end() >= hole.end());
|
||||
if let Some(position) = position {
|
||||
let old_range = self.covered_space.remove(position);
|
||||
if old_range == hole {
|
||||
return;
|
||||
}
|
||||
let new_range_end = hole.end().saturating_add(1)..=*old_range.end();
|
||||
if old_range.start() == hole.start() {
|
||||
self.covered_space.insert(position, new_range_end);
|
||||
return;
|
||||
}
|
||||
let new_range_start = *old_range.start()..=hole.start().saturating_sub(1);
|
||||
if old_range.end() == hole.end() {
|
||||
self.covered_space.insert(position, new_range_start);
|
||||
return;
|
||||
}
|
||||
self.covered_space.insert(position, new_range_end);
|
||||
self.covered_space.insert(position, new_range_start);
|
||||
}
|
||||
}
|
||||
fn finish(mut self) -> CompactSpace {
|
||||
let null_value = self.assign_and_return_null();
|
||||
|
||||
let mut compact_start: u64 = 0;
|
||||
let mut ranges_and_compact_start = vec![];
|
||||
for cov in self.covered_space {
|
||||
let covered_range_len = cov.end() - cov.start();
|
||||
ranges_and_compact_start.push((cov, compact_start));
|
||||
compact_start += covered_range_len as u64 + 1;
|
||||
}
|
||||
CompactSpace {
|
||||
ranges_and_compact_start,
|
||||
null_value,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone, Eq, PartialEq)]
|
||||
struct CompactSpace {
|
||||
ranges_and_compact_start: Vec<(std::ops::RangeInclusive<u128>, u64)>,
|
||||
pub null_value: u128,
|
||||
}
|
||||
impl CompactSpace {
|
||||
fn amplitude_compact_space(&self) -> u128 {
|
||||
let last_range = &self.ranges_and_compact_start[self.ranges_and_compact_start.len() - 1];
|
||||
last_range.1 as u128 + (last_range.0.end() - last_range.0.start()) + 1
|
||||
}
|
||||
|
||||
fn get_range_and_compact_start(&self, pos: usize) -> &(std::ops::RangeInclusive<u128>, u64) {
|
||||
&self.ranges_and_compact_start[pos]
|
||||
}
|
||||
fn serialize(&self, output: &mut Vec<u8>) {
|
||||
serialize_vint_u128(self.null_value as u128, output);
|
||||
serialize_vint_u128(self.ranges_and_compact_start.len() as u128, output);
|
||||
let mut prev_ip = 0;
|
||||
for (ip_range, _compact) in &self.ranges_and_compact_start {
|
||||
let delta_ip = ip_range.start() - prev_ip;
|
||||
serialize_vint_u128(delta_ip as u128, output);
|
||||
prev_ip = *ip_range.start();
|
||||
|
||||
let delta_ip = ip_range.end() - prev_ip;
|
||||
serialize_vint_u128(delta_ip as u128, output);
|
||||
prev_ip = *ip_range.end();
|
||||
}
|
||||
}
|
||||
|
||||
fn deserialize(data: &[u8]) -> io::Result<(&[u8], Self)> {
|
||||
let (null_value, data) = deserialize_vint_u128(data)?;
|
||||
let (num_ip_addrs, mut data) = deserialize_vint_u128(data)?;
|
||||
let mut ip_addr = 0u128;
|
||||
let mut compact = 0u64;
|
||||
let mut ranges_and_compact_start: Vec<(std::ops::RangeInclusive<u128>, u64)> = vec![];
|
||||
for _ in 0..num_ip_addrs {
|
||||
let (ip_addr_delta, new_data) = deserialize_vint_u128(data)?;
|
||||
data = new_data;
|
||||
ip_addr += ip_addr_delta;
|
||||
let ip_addr_start = ip_addr;
|
||||
|
||||
let (ip_addr_delta, new_data) = deserialize_vint_u128(data)?;
|
||||
data = new_data;
|
||||
ip_addr += ip_addr_delta;
|
||||
let ip_addr_end = ip_addr;
|
||||
|
||||
let compact_delta = ip_addr_end - ip_addr_start + 1;
|
||||
|
||||
ranges_and_compact_start.push((ip_addr_start..=ip_addr_end, compact));
|
||||
compact += compact_delta as u64;
|
||||
}
|
||||
Ok((
|
||||
data,
|
||||
Self {
|
||||
null_value,
|
||||
ranges_and_compact_start,
|
||||
},
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns either Ok(the value in the compact space) or if it is outside the compact space the
|
||||
/// Err(position on the next larger range above the value)
|
||||
fn to_compact(&self, ip: u128) -> Result<u64, usize> {
|
||||
self.ranges_and_compact_start
|
||||
.binary_search_by(|probe| {
|
||||
let ip_range = &probe.0;
|
||||
if *ip_range.start() <= ip && *ip_range.end() >= ip {
|
||||
return Ordering::Equal;
|
||||
} else if ip < *ip_range.start() {
|
||||
return Ordering::Greater;
|
||||
} else if ip > *ip_range.end() {
|
||||
return Ordering::Less;
|
||||
}
|
||||
panic!("not covered all ranges in check");
|
||||
})
|
||||
.map(|pos| {
|
||||
let (range, compact_start) = &self.ranges_and_compact_start[pos];
|
||||
compact_start + (ip - range.start()) as u64
|
||||
})
|
||||
.map_err(|pos| pos - 1)
|
||||
}
|
||||
|
||||
/// Unpacks a ip from compact space to u128 space
|
||||
fn unpack_ip(&self, compact: u64) -> u128 {
|
||||
let pos = self
|
||||
.ranges_and_compact_start
|
||||
.binary_search_by_key(&compact, |probe| probe.1)
|
||||
.map_or_else(|e| e - 1, |v| v);
|
||||
|
||||
let range_and_compact_start = &self.ranges_and_compact_start[pos];
|
||||
let diff = compact - self.ranges_and_compact_start[pos].1;
|
||||
range_and_compact_start.0.start() + diff as u128
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
fn ranges_and_compact_start_test() {
    let ips: Vec<u128> = vec![
        2, 4, 1000, 1001, 1002, 1003, 1004, 1005, 1008, 1010, 1012, 1260,
    ];
    let compact_space = get_compact_space(&ips, 11);
    assert_eq!(compact_space.null_value, 5);

    // Serializing and deserializing must reproduce the same compact space.
    let mut serialized = vec![];
    compact_space.serialize(&mut serialized);
    let (_rest, deserialized) = CompactSpace::deserialize(&serialized).unwrap();
    assert_eq!(compact_space, deserialized);

    // Every input value must survive the trip through the compact space.
    for &ip in ips.iter() {
        let compact = compact_space.to_compact(ip).unwrap();
        assert_eq!(compact_space.unpack_ip(compact), ip);
    }
}
|
||||
|
||||
pub fn train(ip_addrs_sorted: &[u128]) -> IntervalCompressor {
|
||||
let ranges_and_compact_start = get_compact_space(ip_addrs_sorted, INTERVAL_COST_IN_BITS);
|
||||
let null_value = ranges_and_compact_start.null_value;
|
||||
let amplitude_compact_space = ranges_and_compact_start.amplitude_compact_space();
|
||||
|
||||
assert!(
|
||||
amplitude_compact_space <= u64::MAX as u128,
|
||||
"case unsupported."
|
||||
);
|
||||
|
||||
let num_bits = tantivy_bitpacker::compute_num_bits(amplitude_compact_space as u64);
|
||||
let min_value = *ip_addrs_sorted.first().unwrap_or(&0);
|
||||
let max_value = *ip_addrs_sorted.last().unwrap_or(&0);
|
||||
let compressor = IntervalCompressor {
|
||||
null_value,
|
||||
min_value,
|
||||
max_value,
|
||||
compact_space: ranges_and_compact_start,
|
||||
num_bits,
|
||||
};
|
||||
|
||||
let max_value = *ip_addrs_sorted.last().unwrap_or(&0u128).max(&null_value);
|
||||
assert_eq!(
|
||||
compressor.to_compact(max_value) + 1,
|
||||
amplitude_compact_space as u64
|
||||
);
|
||||
compressor
|
||||
}
|
||||
|
||||
impl IntervalCompressor {
|
||||
/// Taking the vals as Vec may cost a lot of memory.
|
||||
/// It is used to sort the vals.
|
||||
///
|
||||
/// Less memory alternative: We could just store the index (u32), and use that as sorting.
|
||||
pub fn from_vals(mut vals: Vec<u128>) -> Self {
|
||||
vals.sort();
|
||||
train(&vals)
|
||||
}
|
||||
|
||||
fn to_compact(&self, ip_addr: u128) -> u64 {
|
||||
self.compact_space.to_compact(ip_addr).unwrap()
|
||||
}
|
||||
|
||||
fn write_footer(&self, write: &mut impl Write, num_vals: u128) -> io::Result<()> {
|
||||
let mut footer = vec![];
|
||||
|
||||
// header flags for future optional dictionary encoding
|
||||
let header_flags = 0u64;
|
||||
footer.extend_from_slice(&header_flags.to_le_bytes());
|
||||
|
||||
let null_value = self
|
||||
.compact_space
|
||||
.to_compact(self.null_value)
|
||||
.expect("could not convert null to compact space");
|
||||
serialize_vint_u128(null_value as u128, &mut footer);
|
||||
serialize_vint_u128(self.min_value, &mut footer);
|
||||
serialize_vint_u128(self.max_value, &mut footer);
|
||||
|
||||
self.compact_space.serialize(&mut footer);
|
||||
|
||||
footer.push(self.num_bits);
|
||||
serialize_vint_u128(num_vals as u128, &mut footer);
|
||||
|
||||
write.write_all(&footer)?;
|
||||
let footer_len = footer.len() as u32;
|
||||
write.write_all(&footer_len.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn compress(&self, vals: &[u128]) -> io::Result<Vec<u8>> {
|
||||
let mut output = vec![];
|
||||
self.compress_into(vals.iter().cloned(), &mut output)?;
|
||||
Ok(output)
|
||||
}
|
||||
pub fn compress_into(
|
||||
&self,
|
||||
vals: impl Iterator<Item = u128>,
|
||||
write: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
let mut bitpacker = BitPacker::default();
|
||||
let mut num_vals = 0;
|
||||
for ip_addr in vals {
|
||||
let compact = self.to_compact(ip_addr);
|
||||
bitpacker.write(compact, self.num_bits, write).unwrap();
|
||||
num_vals += 1;
|
||||
}
|
||||
bitpacker.close(write).unwrap();
|
||||
self.write_footer(write, num_vals as u128)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct IntervallDecompressor {
|
||||
compact_space: CompactSpace,
|
||||
bit_unpacker: BitUnpacker,
|
||||
null_compact_space: u64,
|
||||
min_value: u128,
|
||||
max_value: u128,
|
||||
num_vals: usize,
|
||||
}
|
||||
|
||||
impl FastFieldCodecReaderU128 for IntervallDecompressor {
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self> {
|
||||
Self::open(bytes)
|
||||
}
|
||||
|
||||
fn get(&self, doc: u64, data: &[u8]) -> Option<u128> {
|
||||
self.get(doc, data)
|
||||
}
|
||||
|
||||
fn get_between_vals(&self, range: RangeInclusive<u128>, data: &[u8]) -> Vec<usize> {
|
||||
self.get_range(range, data)
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u128 {
|
||||
self.min_value()
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u128 {
|
||||
self.max_value()
|
||||
}
|
||||
|
||||
/// The computed and assigned number for null values
|
||||
fn null_value(&self) -> u128 {
|
||||
self.compact_space.null_value
|
||||
}
|
||||
|
||||
fn iter<'a>(&'a self, data: &'a [u8]) -> Box<dyn Iterator<Item = Option<u128>> + 'a> {
|
||||
Box::new(self.iter(data))
|
||||
}
|
||||
}
|
||||
|
||||
impl IntervallDecompressor {
|
||||
pub fn open(data: &[u8]) -> io::Result<IntervallDecompressor> {
|
||||
let (data, footer_len_bytes) = data.split_at(data.len() - 4);
|
||||
let footer_len = u32::from_le_bytes(footer_len_bytes.try_into().unwrap());
|
||||
|
||||
let data = &data[data.len() - footer_len as usize..];
|
||||
let (_header_flags, data) = data.split_at(8);
|
||||
let (null_compact_space, data) = deserialize_vint_u128(data)?;
|
||||
let (min_value, data) = deserialize_vint_u128(data)?;
|
||||
let (max_value, data) = deserialize_vint_u128(data)?;
|
||||
let (mut data, compact_space) = CompactSpace::deserialize(data).unwrap();
|
||||
|
||||
let num_bits = data[0];
|
||||
data = &data[1..];
|
||||
let (num_vals, _data) = deserialize_vint_u128(data)?;
|
||||
let decompressor = IntervallDecompressor {
|
||||
null_compact_space: null_compact_space as u64,
|
||||
min_value,
|
||||
max_value,
|
||||
compact_space,
|
||||
num_vals: num_vals as usize,
|
||||
bit_unpacker: BitUnpacker::new(num_bits),
|
||||
};
|
||||
|
||||
Ok(decompressor)
|
||||
}
|
||||
|
||||
/// Converting to compact space for the decompressor is more complex, since we may get values
|
||||
/// which are outside the compact space. e.g. if we map
|
||||
/// 1000 => 5
|
||||
/// 2000 => 6
|
||||
///
|
||||
/// and we want a mapping for 1005, there is no equivalent compact space. We instead return an
|
||||
/// error with the index of the next range.
|
||||
fn to_compact(&self, ip_addr: u128) -> Result<u64, usize> {
|
||||
self.compact_space.to_compact(ip_addr)
|
||||
}
|
||||
|
||||
fn compact_to_ip_addr(&self, compact: u64) -> u128 {
|
||||
self.compact_space.unpack_ip(compact)
|
||||
}
|
||||
|
||||
/// Comparing on compact space: 1.2 GElements/s
|
||||
///
|
||||
/// Comparing on original space: .06 GElements/s (not completely optimized)
|
||||
pub fn get_range(&self, range: RangeInclusive<u128>, data: &[u8]) -> Vec<usize> {
|
||||
let from_ip_addr = *range.start();
|
||||
let to_ip_addr = *range.end();
|
||||
assert!(to_ip_addr >= from_ip_addr);
|
||||
let compact_from = self.to_compact(from_ip_addr);
|
||||
let compact_to = self.to_compact(to_ip_addr);
|
||||
// Quick return, if both ranges fall into the same non-mapped space, the range can't cover
|
||||
// any values, so we can early exit
|
||||
match (compact_to, compact_from) {
|
||||
(Err(pos1), Err(pos2)) if pos1 == pos2 => return vec![],
|
||||
_ => {}
|
||||
}
|
||||
|
||||
let compact_from = compact_from.unwrap_or_else(|pos| {
|
||||
let range_and_compact_start = self.compact_space.get_range_and_compact_start(pos);
|
||||
let compact_end = range_and_compact_start.1
|
||||
+ (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
|
||||
compact_end + 1
|
||||
});
|
||||
// If there is no compact space, we go to the closest upperbound compact space
|
||||
let compact_to = compact_to.unwrap_or_else(|pos| {
|
||||
let range_and_compact_start = self.compact_space.get_range_and_compact_start(pos);
|
||||
let compact_end = range_and_compact_start.1
|
||||
+ (range_and_compact_start.0.end() - range_and_compact_start.0.start()) as u64;
|
||||
compact_end
|
||||
});
|
||||
|
||||
let range = compact_from..=compact_to;
|
||||
let mut positions = vec![];
|
||||
|
||||
for (pos, compact_ip) in self
|
||||
.iter_compact(data)
|
||||
.enumerate()
|
||||
.filter(|(_pos, val)| *val != self.null_compact_space)
|
||||
{
|
||||
if range.contains(&compact_ip) {
|
||||
positions.push(pos);
|
||||
}
|
||||
}
|
||||
|
||||
positions
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn iter_compact<'a>(&'a self, data: &'a [u8]) -> impl Iterator<Item = u64> + 'a {
|
||||
(0..self.num_vals).map(move |idx| self.bit_unpacker.get(idx as u64, data) as u64)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn iter<'a>(&'a self, data: &'a [u8]) -> impl Iterator<Item = Option<u128>> + 'a {
|
||||
// TODO: Performance. It would be better to iterate on the ranges and check existence via
|
||||
// the bit_unpacker.
|
||||
self.iter_compact(data).map(|compact| {
|
||||
if compact == self.null_compact_space {
|
||||
None
|
||||
} else {
|
||||
Some(self.compact_to_ip_addr(compact))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
pub fn get(&self, idx: u64, data: &[u8]) -> Option<u128> {
|
||||
let compact = self.bit_unpacker.get(idx, data);
|
||||
if compact == self.null_compact_space {
|
||||
None
|
||||
} else {
|
||||
Some(self.compact_to_ip_addr(compact))
|
||||
}
|
||||
}
|
||||
|
||||
pub fn min_value(&self) -> u128 {
|
||||
self.min_value
|
||||
}
|
||||
|
||||
pub fn max_value(&self) -> u128 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
impl IntervalEncoding {
|
||||
pub fn train(&self, mut vals: Vec<u128>) -> IntervalCompressor {
|
||||
vals.sort();
|
||||
train(&vals)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {

    use proptest::prelude::*;

    use super::*;

    /// Decodes every non-null value stored in `data`, in position order.
    fn decode_all(data: &[u8]) -> Vec<u128> {
        let decompressor = IntervallDecompressor::open(data).unwrap();
        (0..decompressor.num_vals)
            .filter_map(|idx| decompressor.get(idx as u64, data))
            .collect()
    }

    /// Compresses `u128_vals`, checks that decoding returns them unchanged and hands back the
    /// compressed bytes.
    fn test_aux_vals(encoder: &IntervalEncoding, u128_vals: &[u128]) -> Vec<u8> {
        let compressor = encoder.train(u128_vals.to_vec());
        let data = compressor.compress(u128_vals).unwrap();
        let decoded_val = decode_all(&data);
        assert_eq!(&decoded_val, u128_vals);
        data
    }

    #[test]
    fn test_range_1() {
        let vals: &[u128] = &[
            1,
            100,
            3,
            99999,
            100000,
            100001,
            4_000_211_221,
            4_000_211_222,
            333,
        ];
        let data = test_aux_vals(&IntervalEncoding::default(), vals);
        let decomp = IntervallDecompressor::open(&data).unwrap();

        // (queried value range, expected positions)
        let expectations: Vec<(std::ops::RangeInclusive<u128>, Vec<usize>)> = vec![
            (0..=1, vec![0]),
            (0..=2, vec![0]),
            (0..=3, vec![0, 2]),
            (99999..=99999, vec![3]),
            (99998..=100000, vec![3, 4]),
            (99998..=99999, vec![3]),
            (99998..=99998, vec![]),
            (333..=333, vec![8]),
            (332..=333, vec![8]),
            (332..=334, vec![8]),
            (333..=334, vec![8]),
            (4_000_211_221..=5_000_000_000, vec![6, 7]),
        ];
        for (value_range, expected_positions) in expectations {
            assert_eq!(decomp.get_range(value_range, &data), expected_positions);
        }
    }

    #[test]
    fn test_empty() {
        let vals = &[];
        let data = test_aux_vals(&IntervalEncoding::default(), vals);
        let _decomp = IntervallDecompressor::open(&data).unwrap();
    }

    #[test]
    fn test_range_2() {
        let vals: &[u128] = &[
            100,
            99999,
            100000,
            100001,
            4_000_211_221,
            4_000_211_222,
            333,
        ];
        let data = test_aux_vals(&IntervalEncoding::default(), vals);
        let decomp = IntervallDecompressor::open(&data).unwrap();

        let expectations: Vec<(std::ops::RangeInclusive<u128>, Vec<usize>)> = vec![
            (0..=5, vec![]),
            (0..=100, vec![0]),
            (0..=105, vec![0]),
        ];
        for (value_range, expected_positions) in expectations {
            assert_eq!(decomp.get_range(value_range, &data), expected_positions);
        }
    }

    #[test]
    fn test_null() {
        let compressor = IntervalEncoding::default().train(vec![2u128]);
        // Store the null value in front of the only real value.
        let vals_with_null = vec![compressor.null_value, 2u128];
        let data = compressor.compress(&vals_with_null).unwrap();
        let decomp = IntervallDecompressor::open(&data).unwrap();

        let no_positions: Vec<usize> = vec![];
        assert_eq!(decomp.get_range(0..=1, &data), no_positions);
        assert_eq!(decomp.get_range(2..=2, &data), vec![1]);
    }

    #[test]
    fn test_first_large_gaps() {
        let vals = &[1_000_000_000u128; 100];
        let _data = test_aux_vals(&IntervalEncoding::default(), vals);
    }

    proptest! {

        #[test]
        fn compress_decompress_random(vals in proptest::collection::vec(any::<u128>()
            , 1..1000)) {
            let interval_encoding = IntervalEncoding::default();
            let _data = test_aux_vals(&interval_encoding, &vals);
        }
    }
}
|
||||
@@ -4,49 +4,30 @@ extern crate more_asserts;
|
||||
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::RangeInclusive;
|
||||
|
||||
pub mod bitpacked;
|
||||
pub mod ip_codec;
|
||||
#[cfg(feature = "unstable")]
|
||||
pub mod frame_of_reference;
|
||||
pub mod linearinterpol;
|
||||
pub mod multilinearinterpol;
|
||||
pub mod piecewise_linear;
|
||||
|
||||
pub trait FastFieldCodecReader: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
/// Reads the metadata and returns the CodecReader.
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64;
|
||||
|
||||
/// Read u64 value for index `idx`.
|
||||
/// `idx` can be either a `DocId` or an index used for
|
||||
/// `multivalued` fast field.
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
|
||||
fn min_value(&self) -> u64;
|
||||
fn max_value(&self) -> u64;
|
||||
}
|
||||
|
||||
pub trait FastFieldCodecReaderU128: Sized {
|
||||
/// reads the metadata and returns the CodecReader
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;
|
||||
|
||||
/// Get value for doc
|
||||
fn get(&self, doc: u64, data: &[u8]) -> Option<u128>;
|
||||
|
||||
/// Iterator
|
||||
///
|
||||
/// Replace with opaque type after: https://github.com/rust-lang/rust/issues/63063
|
||||
fn iter<'a>(&'a self, data: &'a [u8]) -> Box<dyn Iterator<Item = Option<u128>> + 'a>;
|
||||
|
||||
/// Get positions (=docs in single value) for provided value range
|
||||
fn get_between_vals(&self, range: RangeInclusive<u128>, data: &[u8]) -> Vec<usize>;
|
||||
|
||||
/// The computed and assigned number value for null values
|
||||
fn null_value(&self) -> u128;
|
||||
|
||||
fn min_value(&self) -> u128;
|
||||
fn max_value(&self) -> u128;
|
||||
}
|
||||
|
||||
/// The FastFieldSerializerEstimate trait is required on all variants
|
||||
/// of fast field compressions, to decide which one to choose.
|
||||
pub trait FastFieldCodecSerializer {
|
||||
/// A codec needs to provide a unique name and id, which is
|
||||
/// A codex needs to provide a unique name and id, which is
|
||||
/// used for debugging and de/serialization.
|
||||
const NAME: &'static str;
|
||||
const ID: u8;
|
||||
@@ -59,14 +40,17 @@ pub trait FastFieldCodecSerializer {
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
/// The iterators should be preferred over using fastfield_accessor for performance reasons.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
@@ -109,9 +93,8 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
|
||||
use crate::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use crate::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
@@ -121,7 +104,7 @@ mod tests {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let estimation = S::estimate_compression_ratio(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -181,13 +164,10 @@ mod tests {
|
||||
fn test_codec_bitpacking() {
|
||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_codec_interpolation() {
|
||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_multi_interpolation() {
|
||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
||||
fn test_codec_piecewise_linear() {
|
||||
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
|
||||
}
|
||||
|
||||
use super::*;
|
||||
@@ -205,45 +185,50 @@ mod tests {
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
fn estimation_test_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
data.push(1_000_000);
|
||||
|
||||
// in this case the linear interpolation can't, in fact, be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
println!("{}", bitpacked_estimation);
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,9 +71,9 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, idx, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(idx, data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
#[deprecated(
|
||||
note = "Linear interpolation works best only on very rare cases and piecewise linear codec \
|
||||
already works great on them."
|
||||
)]
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[inline]
|
||||
@@ -105,13 +109,14 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
data_iter1: impl Iterator<Item = u64>,
|
||||
@@ -182,10 +187,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
if stats.num_vals < 3 {
|
||||
return f32::MAX;
|
||||
}
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
@@ -229,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -289,8 +301,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
|
||||
@@ -1,137 +1,52 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use std::collections::HashSet;
|
||||
use std::env;
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use std::net::{IpAddr, Ipv6Addr};
|
||||
use std::str::FromStr;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use fastfield_codecs::ip_codec::{IntervalEncoding, IntervallDecompressor};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
||||
use itertools::Itertools;
|
||||
use measure_time::print_time;
|
||||
use common::f64_to_u64;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
|
||||
#[cfg(feature = "unstable")]
|
||||
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
|
||||
fn print_set_stats(ip_addrs: &[u128]) {
|
||||
println!("NumIps\t{}", ip_addrs.len());
|
||||
let ip_addr_set: HashSet<u128> = ip_addrs.iter().cloned().collect();
|
||||
println!("NumUniqueIps\t{}", ip_addr_set.len());
|
||||
let ratio_unique = ip_addr_set.len() as f64 / ip_addrs.len() as f64;
|
||||
println!("RatioUniqueOverTotal\t{ratio_unique:.4}");
|
||||
|
||||
// histogram
|
||||
let mut ip_addrs = ip_addrs.to_vec();
|
||||
ip_addrs.sort();
|
||||
let mut cnts: Vec<usize> = ip_addrs
|
||||
.into_iter()
|
||||
.dedup_with_count()
|
||||
.map(|(cnt, _)| cnt)
|
||||
.collect();
|
||||
cnts.sort();
|
||||
|
||||
let top_256_cnt: usize = cnts.iter().rev().take(256).sum();
|
||||
let top_128_cnt: usize = cnts.iter().rev().take(128).sum();
|
||||
let top_64_cnt: usize = cnts.iter().rev().take(64).sum();
|
||||
let top_8_cnt: usize = cnts.iter().rev().take(8).sum();
|
||||
let total: usize = cnts.iter().sum();
|
||||
|
||||
println!("{}", total);
|
||||
println!("{}", top_256_cnt);
|
||||
println!("{}", top_128_cnt);
|
||||
println!("Percentage Top8 {:02}", top_8_cnt as f32 / total as f32);
|
||||
println!("Percentage Top64 {:02}", top_64_cnt as f32 / total as f32);
|
||||
println!("Percentage Top128 {:02}", top_128_cnt as f32 / total as f32);
|
||||
println!("Percentage Top256 {:02}", top_256_cnt as f32 / total as f32);
|
||||
|
||||
let mut cnts: Vec<(usize, usize)> = cnts.into_iter().dedup_with_count().collect();
|
||||
cnts.sort_by(|a, b| {
|
||||
if a.1 == b.1 {
|
||||
a.0.cmp(&b.0)
|
||||
} else {
|
||||
b.1.cmp(&a.1)
|
||||
}
|
||||
});
|
||||
|
||||
println!("\n\n----\nIP Address histogram");
|
||||
println!("IPAddrCount\tFrequency");
|
||||
for (ip_addr_count, times) in cnts {
|
||||
println!("{}\t{}", ip_addr_count, times);
|
||||
}
|
||||
}
|
||||
|
||||
fn ip_dataset() -> Vec<u128> {
|
||||
let mut ip_addr_v4 = 0;
|
||||
|
||||
let stdin = std::io::stdin();
|
||||
let ip_addrs: Vec<u128> = stdin
|
||||
.lock()
|
||||
.lines()
|
||||
.flat_map(|line| {
|
||||
let line = line.unwrap();
|
||||
let line = line.trim();
|
||||
let ip_addr = IpAddr::from_str(line.trim()).ok()?;
|
||||
if ip_addr.is_ipv4() {
|
||||
ip_addr_v4 += 1;
|
||||
}
|
||||
let ip_addr_v6: Ipv6Addr = match ip_addr {
|
||||
IpAddr::V4(v4) => v4.to_ipv6_mapped(),
|
||||
IpAddr::V6(v6) => v6,
|
||||
};
|
||||
Some(ip_addr_v6)
|
||||
})
|
||||
.map(|ip_v6| u128::from_be_bytes(ip_v6.octets()))
|
||||
.collect();
|
||||
|
||||
println!("IpAddrsAny\t{}", ip_addrs.len());
|
||||
println!("IpAddrsV4\t{}", ip_addr_v4);
|
||||
|
||||
ip_addrs
|
||||
}
|
||||
|
||||
fn bench_ip() {
|
||||
let encoding = IntervalEncoding();
|
||||
let dataset = ip_dataset();
|
||||
print_set_stats(&dataset);
|
||||
|
||||
let compressor = encoding.train(dataset.to_vec());
|
||||
let data = compressor.compress(&dataset).unwrap();
|
||||
|
||||
let decompressor = IntervallDecompressor::open(&data).unwrap();
|
||||
|
||||
for i in 11100..11150 {
|
||||
print_time!("get range");
|
||||
let doc_values = decompressor.get_range(dataset[i]..=dataset[i], &data);
|
||||
println!("{:?}", doc_values.len());
|
||||
}
|
||||
}
|
||||
use rand::prelude::StdRng;
|
||||
use rand::Rng;
|
||||
|
||||
fn main() {
|
||||
if env::args().nth(1).unwrap() == "bench" {
|
||||
bench_ip();
|
||||
return;
|
||||
}
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
table.add_row(row![
|
||||
"",
|
||||
"Compression ratio",
|
||||
"Compression ratio estimation",
|
||||
"Compression time (micro)",
|
||||
"Reading time (micro)"
|
||||
]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
let res = serialize_with_codec::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
#[cfg(feature = "unstable")]
|
||||
{
|
||||
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
|
||||
results.push(res);
|
||||
}
|
||||
let res = serialize_with_codec::<
|
||||
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
|
||||
BitpackedFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
|
||||
// let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
@@ -139,7 +54,7 @@ fn main() {
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
@@ -155,6 +70,8 @@ fn main() {
|
||||
Cell::new(name).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
Cell::new(&compression_duration.as_micros().to_string()),
|
||||
Cell::new(&read_duration.as_micros().to_string()),
|
||||
]));
|
||||
}
|
||||
}
|
||||
@@ -176,7 +93,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
@@ -189,22 +105,79 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1);
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.map(|num| num + rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|_| rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Random"));
|
||||
|
||||
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
|
||||
data_and_names.push((data.clone(), "HDFS logs timestamps"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "HDFS logs timestamps SORTED"));
|
||||
|
||||
let data = load_dataset("datasets/http_logs_timestamps.txt");
|
||||
data_and_names.push((data, "HTTP logs timestamps SORTED"));
|
||||
|
||||
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
|
||||
data_and_names.push((data.clone(), "Amazon review product ids"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "Amazon review product ids SORTED"));
|
||||
|
||||
let data = load_float_dataset("datasets/nooc_temperatures.txt");
|
||||
data_and_names.push((data, "Temperatures"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
pub fn load_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let l = line.unwrap();
|
||||
data.push(l.parse::<u64>().unwrap());
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load float dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let line_string = line.unwrap();
|
||||
let value = line_string.parse::<f64>().unwrap();
|
||||
data.push(f64_to_u64(value));
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
) -> (bool, f32, f32, &'static str, Duration, Duration) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
return (
|
||||
false,
|
||||
0.0,
|
||||
0.0,
|
||||
S::NAME,
|
||||
Duration::from_secs(0),
|
||||
Duration::from_secs(0),
|
||||
);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let start_time_compression = Instant::now();
|
||||
let estimation = S::estimate_compression_ratio(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -214,9 +187,22 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let elasped_time_compression = start_time_compression.elapsed();
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
let start_time_read = Instant::now();
|
||||
for doc in 0..data.len() {
|
||||
reader.get_u64(doc as u64, &out);
|
||||
}
|
||||
let elapsed_time_read = start_time_read.elapsed();
|
||||
(
|
||||
true,
|
||||
estimation,
|
||||
actual_compression,
|
||||
S::NAME,
|
||||
elasped_time_compression,
|
||||
elapsed_time_read,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -155,14 +155,17 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
|
||||
let block_idx = idx - interpolation.start_pos;
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
block_idx,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
.get(block_idx, &data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
@@ -187,15 +190,20 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
#[deprecated(
|
||||
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is \
|
||||
a little bit more optimized."
|
||||
)]
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
fastfield_accessor: &dyn FastFieldDataAccess,
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
@@ -311,10 +319,13 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
@@ -366,6 +377,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(deprecated)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
@@ -0,0 +1,365 @@
|
||||
//! PiecewiseLinear codec uses piecewise linear functions for every block of 512 values to predict
|
||||
//! fast field values. The difference with the real fast field values is then stored.
|
||||
//! For every block, the linear function can be expressed as
|
||||
//! `computed_value = slope * block_position + first_value - positive_offset`
|
||||
//! where:
|
||||
//! - `block_position` is the position inside of the block from 0 to 511
|
||||
//! - `first_value` is the first value on the block
|
||||
//! - `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is
|
||||
//! always positive.
|
||||
//!
|
||||
//! 21 bytes are needed to store the block metadata, which adds an overhead of 21 * 8 / 512 = 0.33 bits
|
||||
//! per element.
|
||||
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 512;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PiecewiseLinearFastFieldReader {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
/// Block that stores metadata to predict value with a linear
|
||||
/// function `predicted_value = slope * position + first_value + positive_offset`
|
||||
/// where `positive_offset` is computed such that predicted values
|
||||
/// are always positive.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
first_value: u64,
|
||||
positive_offset: u64,
|
||||
slope: f32,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
let predicted_value =
|
||||
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
|
||||
(predicted_value + diff) - self.metadata.positive_offset
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.first_value.serialize(write)?;
|
||||
self.positive_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let first_value = u64::deserialize(reader)?;
|
||||
let positive_offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PiecewiseLinearFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for PiecewiseLinearFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = PiecewiseLinearFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas.into_iter() {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Predicts the value at position `pos` of a block from the linear function
/// `first_val + pos * slope`.
///
/// The computation goes through `i64` so that a negative slope can move the prediction
/// below `first_val`. The float-to-integer cast saturates (and maps NaN to 0).
///
/// The addition wraps on purpose: with a plain `+`, builds that enable overflow checks
/// panic as soon as `first_val` is close to `i64::MAX` and the slope is positive, while
/// builds without overflow checks silently wrap. Wrapping explicitly gives the same
/// result in both build modes, so the serializer and the reader always agree on the
/// prediction.
#[inline]
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
    let slope_offset = (pos as f32 * slope) as i64;
    (first_val as i64).wrapping_add(slope_offset) as u64
}
|
||||
|
||||
pub struct PiecewiseLinearFastFieldSerializer;
|
||||
|
||||
impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
|
||||
const NAME: &'static str = "PiecewiseLinear";
|
||||
const ID: u8 = 4;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let slope = if block_num_vals == 1 {
|
||||
0f32
|
||||
} else {
|
||||
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64)
|
||||
/ (block_num_vals - 1) as f64) as f32
|
||||
};
|
||||
let first_value = block_values[0];
|
||||
let mut positive_offset = 0;
|
||||
let mut max_delta = 0;
|
||||
for (pos, &current_value) in block_values[1..].iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
|
||||
if computed_value > current_value {
|
||||
positive_offset = positive_offset.max(computed_value - current_value);
|
||||
} else {
|
||||
max_delta = max_delta.max(current_value - computed_value);
|
||||
}
|
||||
}
|
||||
let num_bits = compute_num_bits(max_delta + positive_offset);
|
||||
for (pos, current_value) in block_values.iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64, slope);
|
||||
let diff = (current_value + positive_offset) - computed_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
});
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = PiecewiseLinearFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 10 * BLOCK_SIZE {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theoretical offset and add this to the max value.
|
||||
// If this doesn't overflow, the algorithm should be fine.
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64)
|
||||
/ (stats.num_vals - 1) as f64) as f32;
|
||||
|
||||
// let's sample at 0%, 5%, 10% .. 95%, but for the first block only
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|&pos| {
|
||||
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a log case scenario the line would be as much above as
|
||||
// below. So the offset would = max_distance
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 21 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the absolute difference between `x` and `y`.
///
/// The smaller operand is always subtracted from the larger one, so the result is the
/// same regardless of argument order.
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
// `y` is the larger operand here.
y - x
|
||||
} else {
|
||||
// `x` is larger than or equal to `y`.
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,8 +1,8 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.3.0"
|
||||
edition = "2021"
|
||||
version = "0.2.0"
|
||||
edition = "2018"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.18.0"
|
||||
version = "0.15.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -9,9 +9,9 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
combine = {version="4", default-features=false, features=[] }
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
|
||||
@@ -2,11 +2,11 @@ use std::fmt;
|
||||
use std::fmt::Write;
|
||||
|
||||
/// Defines whether a term in a query must be present,
|
||||
/// should be present or must not be present.
|
||||
/// should be present or must be not present.
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||
pub enum Occur {
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the terms with the Should or the Must
|
||||
/// at least one of the document with the Should or the Must
|
||||
/// Occur constraint must be within the document.
|
||||
Should,
|
||||
/// Document without the term are excluded from the search.
|
||||
|
||||
@@ -16,9 +16,9 @@ use crate::Occur;
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
];
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|!|\\|\*|\s)"#;
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
|
||||
|
||||
/// Parses a field_name
|
||||
/// A field name must have at least one character and be followed by a colon.
|
||||
@@ -34,8 +34,7 @@ fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
take_while(|c| !SPECIAL_CHARS.contains(&c)),
|
||||
),
|
||||
'\\',
|
||||
satisfy(|_| true), /* if the next character is not a special char, the \ will be treated
|
||||
* as the \ character. */
|
||||
satisfy(|c| SPECIAL_CHARS.contains(&c)),
|
||||
))
|
||||
.skip(char(':'))
|
||||
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
|
||||
@@ -120,36 +119,22 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
|
||||
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
|
||||
negative_number().or(phrase.or(word()))
|
||||
phrase.or(word())
|
||||
}
|
||||
|
||||
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
|
||||
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
|
||||
let term_val_with_field = negative_number().or(term_val());
|
||||
(field_name(), term_val_with_field).map(|(field_name, phrase)| UserInputLiteral {
|
||||
field_name: Some(field_name),
|
||||
phrase,
|
||||
slop,
|
||||
})
|
||||
}
|
||||
|
||||
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
|
||||
let slop =
|
||||
(char('~'), many1(digit())).and_then(|(_, slop): (_, String)| match slop.parse::<u32>() {
|
||||
Ok(d) => Ok(d),
|
||||
_ => Err(StringStreamError::UnexpectedParse),
|
||||
});
|
||||
optional(slop).map(|slop| match slop {
|
||||
Some(d) => d,
|
||||
_ => 0,
|
||||
})
|
||||
}
|
||||
|
||||
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
|
||||
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
|
||||
let term_default_field = term_val().map(|phrase| UserInputLiteral {
|
||||
field_name: None,
|
||||
phrase,
|
||||
slop,
|
||||
});
|
||||
|
||||
attempt(term_query())
|
||||
.or(term_default_field)
|
||||
.map(UserInputLeaf::from)
|
||||
@@ -299,7 +284,7 @@ fn boost<'a>() -> impl Parser<&'a str, Output = f64> {
|
||||
|
||||
fn boosted_leaf<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
|
||||
(leaf(), optional(boost())).map(|(leaf, boost_opt)| match boost_opt {
|
||||
Some(boost) if (boost - 1.0).abs() > f64::EPSILON => {
|
||||
Some(boost) if (boost - 1.0).abs() > std::f64::EPSILON => {
|
||||
UserInputAst::Boost(Box::new(leaf), boost)
|
||||
}
|
||||
_ => leaf,
|
||||
@@ -531,18 +516,14 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_name() {
|
||||
fn test_field_name() -> TestParseResult {
|
||||
assert_eq!(
|
||||
super::field_name().parse(".my.field.name:a"),
|
||||
Ok((".my.field.name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"にんじん:a"#),
|
||||
Ok(("にんじん".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\field:a"#),
|
||||
Ok((r#"my\field"#.to_string(), "a"))
|
||||
super::field_name().parse("my\\ field\\ name:a"),
|
||||
Ok(("my field name".to_string(), "a"))
|
||||
);
|
||||
assert!(super::field_name().parse("my field:a").is_err());
|
||||
assert_eq!(
|
||||
@@ -553,32 +534,14 @@ mod test {
|
||||
super::field_name().parse("my_field_name:a"),
|
||||
Ok(("my_field_name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("myfield.b:hello").unwrap(),
|
||||
("myfield.b".to_string(), "hello")
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"myfield\.b:hello"#).unwrap(),
|
||||
(r#"myfield\.b"#.to_string(), "hello")
|
||||
);
|
||||
assert!(super::field_name().parse("my_field_name").is_err());
|
||||
assert!(super::field_name().parse(":a").is_err());
|
||||
assert!(super::field_name().parse("-my_field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field_name().parse("_my_field:a"),
|
||||
Ok(("_my_field".to_string(), "a"))
|
||||
super::field_name().parse("_my_field:a")?,
|
||||
("_my_field".to_string(), "a")
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("~my~field:a"),
|
||||
Ok(("~my~field".to_string(), "a"))
|
||||
);
|
||||
for special_char in SPECIAL_CHARS.iter() {
|
||||
let query = &format!("\\{special_char}my\\{special_char}field:a");
|
||||
assert_eq!(
|
||||
super::field_name().parse(query),
|
||||
Ok((format!("{special_char}my{special_char}field"), "a"))
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -731,22 +694,4 @@ mod test {
|
||||
);
|
||||
test_is_parse_err("abc + ");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_slop() {
|
||||
assert!(parse_to_ast().parse("\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
|
||||
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
|
||||
|
||||
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
|
||||
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
|
||||
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
|
||||
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
|
||||
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
|
||||
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
|
||||
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,19 +40,14 @@ impl Debug for UserInputLeaf {
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
pub phrase: String,
|
||||
pub slop: u32,
|
||||
}
|
||||
|
||||
impl fmt::Debug for UserInputLiteral {
|
||||
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
|
||||
if let Some(ref field) = self.field_name {
|
||||
write!(formatter, "\"{}\":", field)?;
|
||||
match self.field_name {
|
||||
Some(ref field_name) => write!(formatter, "\"{}\":\"{}\"", field_name, self.phrase),
|
||||
None => write!(formatter, "\"{}\"", self.phrase),
|
||||
}
|
||||
write!(formatter, "\"{}\"", self.phrase)?;
|
||||
if self.slop > 0 {
|
||||
write!(formatter, "~{}", self.slop)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -12,7 +12,7 @@ Tantivy's aggregations have been designed to mimic the
|
||||
The code is organized in submodules:
|
||||
|
||||
## bucket
|
||||
Contains all bucket aggregations, like range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggregations.
|
||||
Contains all bucket aggregations, like range aggregation. These bucket aggregations group documents into buckets and can contain sub-aggegations.
|
||||
|
||||
## metric
|
||||
Contains all metric aggregations, like average aggregation. Metric aggregations do not have sub aggregations.
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
|
||||
//! field: "score".to_string(),
|
||||
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
//! keyed: false,
|
||||
//! }),
|
||||
//! sub_aggregation: Default::default(),
|
||||
//! }),
|
||||
@@ -49,8 +48,8 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::bucket::HistogramAggregation;
|
||||
pub use super::bucket::RangeAggregation;
|
||||
use super::bucket::{HistogramAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::VecWithNames;
|
||||
|
||||
@@ -101,33 +100,12 @@ pub(crate) struct BucketAggregationInternal {
|
||||
}
|
||||
|
||||
impl BucketAggregationInternal {
|
||||
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
|
||||
pub(crate) fn as_histogram(&self) -> &HistogramAggregation {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Range(range) => Some(range),
|
||||
_ => None,
|
||||
BucketAggregationType::Range(_) => panic!("unexpected aggregation"),
|
||||
BucketAggregationType::Histogram(histogram) => histogram,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Histogram(histogram) => Some(histogram),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Terms(terms) => Some(terms),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all fields where the term dictionary is used in the tree.
|
||||
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
let mut term_dict_field_names = Default::default();
|
||||
for el in aggs.values() {
|
||||
el.get_term_dict_field_names(&mut term_dict_field_names)
|
||||
}
|
||||
term_dict_field_names
|
||||
}
|
||||
|
||||
/// Extract all fast field names used in the tree.
|
||||
@@ -152,12 +130,6 @@ pub enum Aggregation {
|
||||
}
|
||||
|
||||
impl Aggregation {
|
||||
fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
|
||||
if let Aggregation::Bucket(bucket) = self {
|
||||
bucket.get_term_dict_field_names(term_field_names)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
|
||||
@@ -190,12 +162,6 @@ pub struct BucketAggregation {
|
||||
}
|
||||
|
||||
impl BucketAggregation {
|
||||
fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
|
||||
if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
|
||||
term_dict_field_names.insert(terms.field.to_string());
|
||||
}
|
||||
term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
|
||||
}
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
self.bucket_agg.get_fast_field_names(fast_field_names);
|
||||
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
|
||||
@@ -211,15 +177,11 @@ pub enum BucketAggregationType {
|
||||
/// Put data into buckets of a fixed interval (histogram).
|
||||
#[serde(rename = "histogram")]
|
||||
Histogram(HistogramAggregation),
|
||||
/// Put data into buckets of terms.
|
||||
#[serde(rename = "terms")]
|
||||
Terms(TermsAggregation),
|
||||
}
|
||||
|
||||
impl BucketAggregationType {
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
|
||||
BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
|
||||
BucketAggregationType::Histogram(histogram) => {
|
||||
fast_field_names.insert(histogram.field.to_string())
|
||||
@@ -271,7 +233,6 @@ mod tests {
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -298,8 +259,7 @@ mod tests {
|
||||
{
|
||||
"from": 20.0
|
||||
}
|
||||
],
|
||||
"keyed": true
|
||||
]
|
||||
}
|
||||
}
|
||||
}"#;
|
||||
@@ -321,7 +281,6 @@ mod tests {
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -347,7 +306,6 @@ mod tests {
|
||||
(7f64..20f64).into(),
|
||||
(20f64..f64::MAX).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: agg_req2,
|
||||
}),
|
||||
|
||||
@@ -1,19 +1,12 @@
|
||||
//! This will enhance the request tree with access to the fastfield and metadata.
|
||||
|
||||
use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::segment_agg_result::BucketCount;
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{
|
||||
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub(crate) struct AggregationsWithAccessor {
|
||||
@@ -34,36 +27,14 @@ impl AggregationsWithAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(DynamicFastFieldReader<u64>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
}
|
||||
}
|
||||
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(reader) => Some(reader),
|
||||
FastFieldAccessor::Single(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BucketAggregationWithAccessor {
|
||||
/// In general there can be buckets without fast field access, e.g. buckets that are created
|
||||
/// based on search terms. So eventually this needs to be Option or moved.
|
||||
pub(crate) accessor: FastFieldAccessor,
|
||||
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
|
||||
pub(crate) accessor: DynamicFastFieldReader<u64>,
|
||||
pub(crate) field_type: Type,
|
||||
pub(crate) bucket_agg: BucketAggregationType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
pub(crate) bucket_count: BucketCount,
|
||||
}
|
||||
|
||||
impl BucketAggregationWithAccessor {
|
||||
@@ -71,44 +42,22 @@ impl BucketAggregationWithAccessor {
|
||||
bucket: &BucketAggregationType,
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
bucket_count: Rc<AtomicU32>,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<BucketAggregationWithAccessor> {
|
||||
let mut inverted_index = None;
|
||||
let (accessor, field_type) = match &bucket {
|
||||
BucketAggregationType::Range(RangeAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
field: field_name,
|
||||
ranges: _,
|
||||
}) => get_ff_reader_and_validate(reader, field_name)?,
|
||||
BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Terms(TermsAggregation {
|
||||
field: field_name, ..
|
||||
}) => {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
inverted_index = Some(reader.inverted_index(field)?);
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
|
||||
}
|
||||
}) => get_ff_reader_and_validate(reader, field_name)?,
|
||||
};
|
||||
let sub_aggregation = sub_aggregation.clone();
|
||||
Ok(BucketAggregationWithAccessor {
|
||||
accessor,
|
||||
field_type,
|
||||
sub_aggregation: get_aggs_with_accessor_and_validate(
|
||||
&sub_aggregation,
|
||||
reader,
|
||||
bucket_count.clone(),
|
||||
max_bucket_count,
|
||||
)?,
|
||||
sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
|
||||
bucket_agg: bucket.clone(),
|
||||
inverted_index,
|
||||
bucket_count: BucketCount {
|
||||
bucket_count,
|
||||
max_bucket_count,
|
||||
},
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -129,14 +78,10 @@ impl MetricAggregationWithAccessor {
|
||||
match &metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: field_name })
|
||||
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
|
||||
let (accessor, field_type) =
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
|
||||
let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;
|
||||
|
||||
Ok(MetricAggregationWithAccessor {
|
||||
accessor: accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality")
|
||||
.clone(),
|
||||
accessor,
|
||||
field_type,
|
||||
metric: metric.clone(),
|
||||
})
|
||||
@@ -148,8 +93,6 @@ impl MetricAggregationWithAccessor {
|
||||
pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
aggs: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
bucket_count: Rc<AtomicU32>,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<AggregationsWithAccessor> {
|
||||
let mut metrics = vec![];
|
||||
let mut buckets = vec![];
|
||||
@@ -161,8 +104,6 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
&bucket.bucket_agg,
|
||||
&bucket.sub_aggregation,
|
||||
reader,
|
||||
Rc::clone(&bucket_count),
|
||||
max_bucket_count,
|
||||
)?,
|
||||
)),
|
||||
Aggregation::Metric(metric) => metrics.push((
|
||||
@@ -177,45 +118,32 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
))
|
||||
}
|
||||
|
||||
/// Get fast field reader with given cardinality.
|
||||
fn get_ff_reader_and_validate(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
cardinality: Cardinality,
|
||||
) -> crate::Result<(FastFieldAccessor, Type)> {
|
||||
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
let field_type = reader.schema().get_field_entry(field).field_type();
|
||||
|
||||
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Unsupported field type date in aggregation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if cardinality != field_cardinality {
|
||||
if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
|
||||
field_name, cardinality, field_cardinality
|
||||
"Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
|
||||
field_type.value_type()
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
"Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
field_type.value_type()
|
||||
)));
|
||||
};
|
||||
|
||||
let ff_fields = reader.fast_fields();
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
|
||||
Cardinality::MultiValues => ff_fields
|
||||
.u64s_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
|
||||
}
|
||||
ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (field, field_type.value_type()))
|
||||
}
|
||||
|
||||
@@ -4,37 +4,86 @@
|
||||
//! intermediate average results, which is the sum and the number of values. The actual average is
|
||||
//! calculated on the step from intermediate to final aggregation result tree.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::BucketAggregationInternal;
|
||||
use super::bucket::GetDocCount;
|
||||
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
|
||||
use super::agg_req::{Aggregations, AggregationsInternal, BucketAggregationInternal};
|
||||
use super::bucket::intermediate_buckets_to_final_buckets;
|
||||
use super::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
IntermediateMetricResult, IntermediateRangeBucketEntry,
|
||||
};
|
||||
use super::metric::{SingleMetricResult, Stats};
|
||||
use super::Key;
|
||||
use crate::TantivyError;
|
||||
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The final aggregation result.
|
||||
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
|
||||
|
||||
impl AggregationResults {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
if let Some(agg) = self.0.get(name) {
|
||||
agg.get_value_from_aggregation(name, agg_property)
|
||||
/// Convert an intermediate result and its aggregation request to the final result
|
||||
pub fn from_intermediate_and_req(
|
||||
results: IntermediateAggregationResults,
|
||||
agg: Aggregations,
|
||||
) -> Self {
|
||||
AggregationResults::from_intermediate_and_req_internal(results, &(agg.into()))
|
||||
}
|
||||
/// Convert an intermediate result and its aggregation request to the final result
|
||||
///
|
||||
/// Internal function, CollectorAggregations is used instead of Aggregations, which is optimized
|
||||
/// for internal processing
|
||||
fn from_intermediate_and_req_internal(
|
||||
results: IntermediateAggregationResults,
|
||||
req: &AggregationsInternal,
|
||||
) -> Self {
|
||||
let mut result = HashMap::default();
|
||||
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
// request
|
||||
if let Some(buckets) = results.buckets {
|
||||
result.extend(buckets.into_iter().zip(req.buckets.values()).map(
|
||||
|((key, bucket), req)| {
|
||||
(
|
||||
key,
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
|
||||
bucket, req,
|
||||
)),
|
||||
)
|
||||
},
|
||||
));
|
||||
} else {
|
||||
// Validation is done during request parsing, so we can't reach this state.
|
||||
Err(TantivyError::InternalError(format!(
|
||||
"Can't find aggregation {:?} in sub_aggregations",
|
||||
name
|
||||
)))
|
||||
result.extend(req.buckets.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
|
||||
empty_bucket,
|
||||
req,
|
||||
)),
|
||||
)
|
||||
}));
|
||||
}
|
||||
|
||||
if let Some(metrics) = results.metrics {
|
||||
result.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
} else {
|
||||
result.extend(req.metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
}
|
||||
Self(result)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -48,24 +97,6 @@ pub enum AggregationResult {
|
||||
MetricResult(MetricResult),
|
||||
}
|
||||
|
||||
impl AggregationResult {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
_name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
|
||||
"Tried to retrieve value from bucket aggregation. This is not supported and \
|
||||
should not happen during collection phase, but should be catched during \
|
||||
validation"
|
||||
.to_string(),
|
||||
)),
|
||||
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// MetricResult
|
||||
@@ -76,14 +107,6 @@ pub enum MetricResult {
|
||||
Stats(Stats),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
MetricResult::Average(avg) => Ok(avg.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<IntermediateMetricResult> for MetricResult {
|
||||
fn from(metric: IntermediateMetricResult) -> Self {
|
||||
match metric {
|
||||
@@ -105,7 +128,7 @@ pub enum BucketResult {
|
||||
/// sub_aggregations.
|
||||
Range {
|
||||
/// The range buckets sorted by range.
|
||||
buckets: BucketEntries<RangeBucketEntry>,
|
||||
buckets: Vec<RangeBucketEntry>,
|
||||
},
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
@@ -115,38 +138,43 @@ pub enum BucketResult {
|
||||
/// Whether there are holes depends on the request: if min_doc_count is 0, then there are no
|
||||
/// holes between the first and last bucket.
|
||||
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
|
||||
buckets: BucketEntries<BucketEntry>,
|
||||
},
|
||||
/// This is the term result
|
||||
Terms {
|
||||
/// The buckets.
|
||||
///
|
||||
/// See [TermsAggregation](super::bucket::TermsAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
/// The number of documents that didn’t make it into the TOP N due to shard_size or size
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
}
|
||||
|
||||
impl BucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
empty_bucket.into_final_bucket_result(req)
|
||||
}
|
||||
}
|
||||
fn from_intermediate_and_req(
|
||||
bucket_result: IntermediateBucketResult,
|
||||
req: &BucketAggregationInternal,
|
||||
) -> Self {
|
||||
match bucket_result {
|
||||
IntermediateBucketResult::Range(range_map) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_map
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| {
|
||||
RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation)
|
||||
})
|
||||
.collect_vec();
|
||||
|
||||
/// This is the wrapper of buckets entries, which can be vector or hashmap
|
||||
/// depending on if it's keyed or not.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
pub enum BucketEntries<T> {
|
||||
/// Vector format bucket entries
|
||||
Vec(Vec<T>),
|
||||
/// HashMap format bucket entries
|
||||
HashMap(FnvHashMap<String, T>),
|
||||
buckets.sort_by(|a, b| {
|
||||
a.from
|
||||
.unwrap_or(f64::MIN)
|
||||
.partial_cmp(&b.from.unwrap_or(f64::MIN))
|
||||
.unwrap_or(Ordering::Equal)
|
||||
});
|
||||
BucketResult::Range { buckets }
|
||||
}
|
||||
IntermediateBucketResult::Histogram { buckets } => {
|
||||
let buckets = intermediate_buckets_to_final_buckets(
|
||||
buckets,
|
||||
req.as_histogram(),
|
||||
&req.sub_aggregation,
|
||||
);
|
||||
|
||||
BucketResult::Histogram { buckets }
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the default entry for a bucket, which contains a key, count, and optionally
|
||||
@@ -182,17 +210,23 @@ pub struct BucketEntry {
|
||||
/// Number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
#[serde(flatten)]
|
||||
/// Sub-aggregations in this bucket.
|
||||
/// sub-aggregations in this bucket.
|
||||
pub sub_aggregation: AggregationResults,
|
||||
}
|
||||
impl GetDocCount for &BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
impl GetDocCount for BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
|
||||
impl BucketEntry {
|
||||
pub(crate) fn from_intermediate_and_req(
|
||||
entry: IntermediateHistogramBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> Self {
|
||||
BucketEntry {
|
||||
key: Key::F64(entry.key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -242,3 +276,21 @@ pub struct RangeBucketEntry {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl RangeBucketEntry {
|
||||
fn from_intermediate_and_req(
|
||||
entry: IntermediateRangeBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> Self {
|
||||
RangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
),
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,9 @@ use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentHistogramBucketEntry,
|
||||
};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
@@ -48,13 +50,15 @@ use crate::{DocId, TantivyError};
|
||||
///
|
||||
/// # Limitations/Compatibility
|
||||
///
|
||||
/// The keyed parameter (elasticsearch) is not yet supported.
|
||||
///
|
||||
/// # JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "prices": {
|
||||
/// "histogram": {
|
||||
/// "field": "price",
|
||||
/// "interval": 10
|
||||
/// "interval": 10,
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
@@ -67,17 +71,16 @@ use crate::{DocId, TantivyError};
|
||||
pub struct HistogramAggregation {
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// The interval to chunk your data range. Each bucket spans a value range of [0..interval).
|
||||
/// The interval to chunk your data range. The buckets span ranges of [0..interval).
|
||||
/// Must be a positive value.
|
||||
pub interval: f64,
|
||||
/// The interval implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
|
||||
/// 1))`.
|
||||
///
|
||||
/// Offset makes it possible to shift this grid into
|
||||
/// `[offset + interval * k, offset + interval * (k + 1))`. Offset has to be in the range [0,
|
||||
/// interval).
|
||||
/// Offset makes it possible to shift this grid into `[offset + interval * k, offset + interval
|
||||
/// * (k + 1)) Offset has to be in the range [0, interval).
|
||||
///
|
||||
/// As an example, if there are two documents with value 9 and 12 and interval 10.0, they would
|
||||
/// As an example. If there are two documents with value 8 and 12 and interval 10.0, they would
|
||||
/// fall into the buckets with the key 0 and 10.
|
||||
/// With offset 5 and interval 10, they would both fall into the bucket with the key 5 and the
|
||||
/// range [5..15)
|
||||
@@ -90,22 +93,6 @@ pub struct HistogramAggregation {
|
||||
///
|
||||
/// hard_bounds only limits the buckets, to force a range set both extended_bounds and
|
||||
/// hard_bounds to the same range.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```json
|
||||
/// {
|
||||
/// "prices": {
|
||||
/// "histogram": {
|
||||
/// "field": "price",
|
||||
/// "interval": 10,
|
||||
/// "hard_bounds": {
|
||||
/// "min": 0,
|
||||
/// "max": 100
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub hard_bounds: Option<HistogramBounds>,
|
||||
/// Can be set to extend your bounds. The range of the buckets is by default defined by the
|
||||
/// data range of the values of the documents. As the name suggests, this can only be used to
|
||||
@@ -115,9 +102,6 @@ pub struct HistogramAggregation {
|
||||
/// Cannot be set in conjunction with min_doc_count > 0, since the empty buckets from extended
|
||||
/// bounds would not be returned.
|
||||
pub extended_bounds: Option<HistogramBounds>,
|
||||
/// Whether to return the buckets as a hash map
|
||||
#[serde(default)]
|
||||
pub keyed: bool,
|
||||
}
|
||||
|
||||
impl HistogramAggregation {
|
||||
@@ -175,27 +159,6 @@ impl HistogramBounds {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentHistogramBucketEntry {
|
||||
pub key: f64,
|
||||
pub doc_count: u64,
|
||||
}
|
||||
|
||||
impl SegmentHistogramBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
sub_aggregation: SegmentAggregationResultsCollector,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateHistogramBucketEntry> {
|
||||
Ok(IntermediateHistogramBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: sub_aggregation
|
||||
.into_intermediate_aggregations_result(agg_with_accessor)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
@@ -211,10 +174,7 @@ pub struct SegmentHistogramCollector {
|
||||
}
|
||||
|
||||
impl SegmentHistogramCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
let mut buckets = Vec::with_capacity(
|
||||
self.buckets
|
||||
.iter()
|
||||
@@ -228,20 +188,13 @@ impl SegmentHistogramCollector {
|
||||
//
|
||||
// Empty buckets may be added later again in the final result, depending on the request.
|
||||
if let Some(sub_aggregations) = self.sub_aggregations {
|
||||
for bucket_res in self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.zip(sub_aggregations.into_iter())
|
||||
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
|
||||
.map(|(bucket, sub_aggregation)| {
|
||||
bucket.into_intermediate_bucket_entry(
|
||||
sub_aggregation,
|
||||
&agg_with_accessor.sub_aggregation,
|
||||
)
|
||||
})
|
||||
{
|
||||
buckets.push(bucket_res?);
|
||||
}
|
||||
buckets.extend(
|
||||
self.buckets
|
||||
.into_iter()
|
||||
.zip(sub_aggregations.into_iter())
|
||||
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
|
||||
.map(|(bucket, sub_aggregation)| (bucket, sub_aggregation).into()),
|
||||
)
|
||||
} else {
|
||||
buckets.extend(
|
||||
self.buckets
|
||||
@@ -251,12 +204,7 @@ impl SegmentHistogramCollector {
|
||||
);
|
||||
};
|
||||
|
||||
agg_with_accessor
|
||||
.bucket_count
|
||||
.add_count(buckets.len() as u32);
|
||||
agg_with_accessor.bucket_count.validate_bucket_count()?;
|
||||
|
||||
Ok(IntermediateBucketResult::Histogram { buckets })
|
||||
IntermediateBucketResult::Histogram { buckets }
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
@@ -317,7 +265,7 @@ impl SegmentHistogramCollector {
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
let bounds = self.bounds;
|
||||
let interval = self.interval;
|
||||
let offset = self.offset;
|
||||
@@ -325,16 +273,12 @@ impl SegmentHistogramCollector {
|
||||
let get_bucket_num =
|
||||
|val| (get_bucket_num_f64(val, interval, offset) as i64 - first_bucket_num) as usize;
|
||||
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0]));
|
||||
let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1]));
|
||||
let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2]));
|
||||
let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3]));
|
||||
let val0 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[0]));
|
||||
let val1 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[1]));
|
||||
let val2 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[2]));
|
||||
let val3 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[3]));
|
||||
|
||||
let bucket_pos0 = get_bucket_num(val0);
|
||||
let bucket_pos1 = get_bucket_num(val1);
|
||||
@@ -347,31 +291,32 @@ impl SegmentHistogramCollector {
|
||||
bucket_pos0,
|
||||
docs[0],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val1,
|
||||
&bounds,
|
||||
bucket_pos1,
|
||||
docs[1],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val2,
|
||||
&bounds,
|
||||
bucket_pos2,
|
||||
docs[2],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
);
|
||||
self.increment_bucket_if_in_bounds(
|
||||
val3,
|
||||
&bounds,
|
||||
bucket_pos3,
|
||||
docs[3],
|
||||
&bucket_with_accessor.sub_aggregation,
|
||||
)?;
|
||||
);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type);
|
||||
let val =
|
||||
f64_from_fastfield_u64(bucket_with_accessor.accessor.get(*doc), &self.field_type);
|
||||
if !bounds.contains(val) {
|
||||
continue;
|
||||
}
|
||||
@@ -382,17 +327,16 @@ impl SegmentHistogramCollector {
|
||||
self.buckets[bucket_pos].key,
|
||||
get_bucket_val(val, self.interval, self.offset) as f64
|
||||
);
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
if force_flush {
|
||||
if let Some(sub_aggregations) = self.sub_aggregations.as_mut() {
|
||||
for sub_aggregation in sub_aggregations {
|
||||
sub_aggregation
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?;
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -403,16 +347,15 @@ impl SegmentHistogramCollector {
|
||||
bucket_pos: usize,
|
||||
doc: DocId,
|
||||
bucket_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
if bounds.contains(val) {
|
||||
debug_assert_eq!(
|
||||
self.buckets[bucket_pos].key,
|
||||
get_bucket_val(val, self.interval, self.offset) as f64
|
||||
);
|
||||
|
||||
self.increment_bucket(bucket_pos, doc, bucket_with_accessor)?;
|
||||
self.increment_bucket(bucket_pos, doc, bucket_with_accessor);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -421,13 +364,12 @@ impl SegmentHistogramCollector {
|
||||
bucket_pos: usize,
|
||||
doc: DocId,
|
||||
bucket_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
let bucket = &mut self.buckets[bucket_pos];
|
||||
bucket.doc_count += 1;
|
||||
if let Some(sub_aggregation) = self.sub_aggregations.as_mut() {
|
||||
(&mut sub_aggregation[bucket_pos]).collect(doc, bucket_with_accessor)?;
|
||||
(&mut sub_aggregation[bucket_pos]).collect(doc, bucket_with_accessor);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn f64_from_fastfield_u64(&self, val: u64) -> f64 {
|
||||
@@ -451,7 +393,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
) -> Vec<BucketEntry> {
|
||||
// Generate the full list of buckets without gaps.
|
||||
//
|
||||
// The bounds are the min max from the current buckets, optionally extended by
|
||||
@@ -491,16 +433,18 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
sub_aggregation: empty_sub_aggregation.clone(),
|
||||
},
|
||||
})
|
||||
.map(|intermediate_bucket| intermediate_bucket.into_final_bucket_entry(sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
.map(|intermediate_bucket| {
|
||||
BucketEntry::from_intermediate_and_req(intermediate_bucket, sub_aggregation)
|
||||
})
|
||||
.collect_vec()
|
||||
}
|
||||
|
||||
// Convert to BucketEntry
|
||||
pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
||||
pub(crate) fn intermediate_buckets_to_final_buckets(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
) -> Vec<BucketEntry> {
|
||||
if histogram_req.min_doc_count() == 0 {
|
||||
// With min_doc_count == 0, we may need to add buckets, so that there are no
|
||||
// gaps, since intermediate result does not contain empty buckets (filtered to
|
||||
@@ -510,9 +454,9 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
||||
} else {
|
||||
buckets
|
||||
.into_iter()
|
||||
.filter(|histogram_bucket| histogram_bucket.doc_count >= histogram_req.min_doc_count())
|
||||
.map(|histogram_bucket| histogram_bucket.into_final_bucket_entry(sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
.filter(|bucket| bucket.doc_count >= histogram_req.min_doc_count())
|
||||
.map(|bucket| BucketEntry::from_intermediate_and_req(bucket, sub_aggregation))
|
||||
.collect_vec()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -553,7 +497,7 @@ pub(crate) fn generate_buckets_with_opt_minmax(
|
||||
let offset = req.offset.unwrap_or(0.0);
|
||||
let first_bucket_num = get_bucket_num_f64(min, req.interval, offset) as i64;
|
||||
let last_bucket_num = get_bucket_num_f64(max, req.interval, offset) as i64;
|
||||
let mut buckets = Vec::with_capacity((first_bucket_num..=last_bucket_num).count());
|
||||
let mut buckets = vec![];
|
||||
for bucket_pos in first_bucket_num..=last_bucket_num {
|
||||
let bucket_key = bucket_pos as f64 * req.interval + offset;
|
||||
buckets.push(bucket_key);
|
||||
@@ -686,9 +630,41 @@ mod tests {
|
||||
};
|
||||
use crate::aggregation::metric::{AverageAggregation, StatsAggregation};
|
||||
use crate::aggregation::tests::{
|
||||
exec_request, exec_request_with_query, get_test_index_2_segments,
|
||||
get_test_index_from_values, get_test_index_with_num_docs,
|
||||
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
|
||||
};
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{Index, Term};
|
||||
|
||||
fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
|
||||
exec_request_with_query(agg_req, index, None)
|
||||
}
|
||||
fn exec_request_with_query(
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = if let Some((field, term)) = query {
|
||||
let text_field = reader.searcher().schema().get_field(field).unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
searcher.search(&term_query, &collector)?
|
||||
} else {
|
||||
searcher.search(&AllQuery, &collector)?
|
||||
};
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_test_crooked_values() -> crate::Result<()> {
|
||||
@@ -1371,71 +1347,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_invalid_request() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(true)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"histogram".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 0.0,
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_res = exec_request(agg_req, &index);
|
||||
|
||||
assert!(agg_res.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"histogram".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 50.0,
|
||||
keyed: true,
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request(agg_req, &index)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"histogram": {
|
||||
"buckets": {
|
||||
"0": {
|
||||
"key": 0.0,
|
||||
"doc_count": 50
|
||||
},
|
||||
"50": {
|
||||
"key": 50.0,
|
||||
"doc_count": 50
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,132 +9,8 @@
|
||||
|
||||
mod histogram;
|
||||
mod range;
|
||||
mod term_agg;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub(crate) use histogram::SegmentHistogramCollector;
|
||||
pub use histogram::*;
|
||||
pub(crate) use range::SegmentRangeCollector;
|
||||
pub use range::*;
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
pub use term_agg::*;
|
||||
|
||||
/// Order for buckets in a bucket aggregation.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum Order {
|
||||
/// Asc order
|
||||
#[serde(rename = "asc")]
|
||||
Asc,
|
||||
/// Desc order
|
||||
#[serde(rename = "desc")]
|
||||
Desc,
|
||||
}
|
||||
|
||||
impl Default for Order {
|
||||
fn default() -> Self {
|
||||
Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Order property by which to apply the order
|
||||
pub enum OrderTarget {
|
||||
/// The key of the bucket
|
||||
Key,
|
||||
/// The doc count of the bucket
|
||||
Count,
|
||||
/// Order by the value of the sub aggregation metric identified by the given `String`.
|
||||
///
|
||||
/// Only single value metrics are supported currently
|
||||
SubAggregation(String),
|
||||
}
|
||||
|
||||
impl Default for OrderTarget {
|
||||
fn default() -> Self {
|
||||
OrderTarget::Count
|
||||
}
|
||||
}
|
||||
impl From<&str> for OrderTarget {
|
||||
fn from(val: &str) -> Self {
|
||||
match val {
|
||||
"_key" => OrderTarget::Key,
|
||||
"_count" => OrderTarget::Count,
|
||||
_ => OrderTarget::SubAggregation(val.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl ToString for OrderTarget {
|
||||
fn to_string(&self) -> String {
|
||||
match self {
|
||||
OrderTarget::Key => "_key".to_string(),
|
||||
OrderTarget::Count => "_count".to_string(),
|
||||
OrderTarget::SubAggregation(agg) => agg.to_string(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Set the order. target is either "_count", "_key", or the name of
|
||||
/// a metric sub_aggregation.
|
||||
///
|
||||
/// De/Serializes to elasticsearch compatible JSON.
|
||||
///
|
||||
/// Examples in JSON format:
|
||||
/// { "_count": "asc" }
|
||||
/// { "_key": "asc" }
|
||||
/// { "average_price": "asc" }
|
||||
#[derive(Clone, Default, Debug, PartialEq)]
|
||||
pub struct CustomOrder {
|
||||
/// The target property by which to sort by
|
||||
pub target: OrderTarget,
|
||||
/// The order asc or desc
|
||||
pub order: Order,
|
||||
}
|
||||
|
||||
impl Serialize for CustomOrder {
|
||||
fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
|
||||
where S: Serializer {
|
||||
let map: HashMap<String, Order> =
|
||||
std::iter::once((self.target.to_string(), self.order)).collect();
|
||||
map.serialize(serializer)
|
||||
}
|
||||
}
|
||||
|
||||
impl<'de> Deserialize<'de> for CustomOrder {
|
||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
||||
if let Some((key, value)) = map.into_iter().next() {
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order: value,
|
||||
})
|
||||
} else {
|
||||
Err(de::Error::custom(
|
||||
"unexpected empty map in order".to_string(),
|
||||
))
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn custom_order_serde_test() {
|
||||
let order = CustomOrder {
|
||||
target: OrderTarget::Key,
|
||||
order: Order::Desc,
|
||||
};
|
||||
|
||||
let order_str = serde_json::to_string(&order).unwrap();
|
||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||
|
||||
assert_eq!(order, order_deser);
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||
assert!(order_deser.is_err());
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||
assert!(order_deser.is_err());
|
||||
}
|
||||
|
||||
@@ -1,24 +1,22 @@
|
||||
use std::fmt::Debug;
|
||||
use std::ops::Range;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor,
|
||||
};
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::{BucketCount, SegmentAggregationResultsCollector};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key, SerializedKey};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
|
||||
use crate::fastfield::FastFieldReader;
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
|
||||
/// Provide user-defined buckets to aggregate on.
|
||||
/// Two special buckets will automatically be created to cover the whole range of values.
|
||||
/// The provided buckets have to be continuous.
|
||||
/// The provided buckets have to be continous.
|
||||
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
||||
/// against each bucket range. Note that this aggregation includes the from value and excludes the
|
||||
/// to value for each range.
|
||||
@@ -35,38 +33,34 @@ use crate::{DocId, TantivyError};
|
||||
/// # Limitations/Compatibility
|
||||
/// Overlapping ranges are not yet supported.
|
||||
///
|
||||
/// The keyed parameter (elasticsearch) is not yet supported.
|
||||
///
|
||||
/// # Request JSON Format
|
||||
/// ```json
|
||||
/// {
|
||||
/// "my_ranges": {
|
||||
/// "range": {
|
||||
/// "field": "score",
|
||||
/// "ranges": [
|
||||
/// { "to": 3.0 },
|
||||
/// { "from": 3.0, "to": 7.0 },
|
||||
/// { "from": 7.0, "to": 20.0 },
|
||||
/// { "from": 7.0, "to": 20.0 }
|
||||
/// { "from": 20.0 }
|
||||
/// ]
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
#[derive(Clone, Debug, Default, PartialEq, Serialize, Deserialize)]
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct RangeAggregation {
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// Note that this aggregation includes the from value and excludes the to value for each
|
||||
/// range. Extra buckets will be created until the first to, and last from, if necessary.
|
||||
pub ranges: Vec<RangeAggregationRange>,
|
||||
/// Whether to return the buckets as a hash map
|
||||
#[serde(default)]
|
||||
pub keyed: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The range for one range bucket.
|
||||
pub struct RangeAggregationRange {
|
||||
/// Custom key for the range bucket
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
pub key: Option<String>,
|
||||
/// The from range value, which is inclusive in the range.
|
||||
/// None equals to an open ended interval.
|
||||
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||
@@ -89,26 +83,7 @@ impl From<Range<f64>> for RangeAggregationRange {
|
||||
} else {
|
||||
Some(range.end)
|
||||
};
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from,
|
||||
to,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Internally used u64 range for one range bucket.
|
||||
pub(crate) struct InternalRangeAggregationRange {
|
||||
/// Custom key for the range bucket
|
||||
key: Option<String>,
|
||||
/// u64 range value
|
||||
range: Range<u64>,
|
||||
}
|
||||
|
||||
impl From<Range<u64>> for InternalRangeAggregationRange {
|
||||
fn from(range: Range<u64>) -> Self {
|
||||
InternalRangeAggregationRange { key: None, range }
|
||||
RangeAggregationRange { from, to }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,100 +102,44 @@ pub struct SegmentRangeCollector {
|
||||
field_type: Type,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
|
||||
/// inclusive.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
impl SegmentRangeBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateRangeBucketEntry> {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
|
||||
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
Ok(IntermediateRangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation,
|
||||
from: self.from,
|
||||
to: self.to,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentRangeCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
let field_type = self.field_type;
|
||||
|
||||
let buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
|
||||
let buckets = self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
Ok((
|
||||
(
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_bucket
|
||||
.bucket
|
||||
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
|
||||
))
|
||||
range_bucket.bucket.into(),
|
||||
)
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
.collect();
|
||||
|
||||
Ok(IntermediateBucketResult::Range(
|
||||
IntermediateRangeBucketResult { buckets },
|
||||
))
|
||||
IntermediateBucketResult::Range(buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &RangeAggregation,
|
||||
sub_aggregation: &AggregationsWithAccessor,
|
||||
bucket_count: &BucketCount,
|
||||
field_type: Type,
|
||||
) -> crate::Result<Self> {
|
||||
// The range input on the request is f64.
|
||||
// We need to convert to u64 ranges, because we read the values as u64.
|
||||
// The mapping from the conversion is monotonic so ordering is preserved.
|
||||
let buckets: Vec<_> = extend_validate_ranges(&req.ranges, &field_type)?
|
||||
let buckets = extend_validate_ranges(&req.ranges, &field_type)?
|
||||
.iter()
|
||||
.map(|range| {
|
||||
let key = range
|
||||
.key
|
||||
.clone()
|
||||
.map(|key| Key::Str(key))
|
||||
.unwrap_or(range_to_key(&range.range, &field_type));
|
||||
let to = if range.range.end == u64::MAX {
|
||||
let to = if range.end == u64::MAX {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.range.end, &field_type))
|
||||
Some(f64_from_fastfield_u64(range.end, &field_type))
|
||||
};
|
||||
let from = if range.range.start == u64::MIN {
|
||||
let from = if range.start == u64::MIN {
|
||||
None
|
||||
} else {
|
||||
Some(f64_from_fastfield_u64(range.range.start, &field_type))
|
||||
Some(f64_from_fastfield_u64(range.start, &field_type))
|
||||
};
|
||||
let sub_aggregation = if sub_aggregation.is_empty() {
|
||||
None
|
||||
@@ -230,11 +149,11 @@ impl SegmentRangeCollector {
|
||||
)?)
|
||||
};
|
||||
Ok(SegmentRangeAndBucketEntry {
|
||||
range: range.range.clone(),
|
||||
range: range.clone(),
|
||||
bucket: SegmentRangeBucketEntry {
|
||||
key: range_to_key(range, &field_type),
|
||||
doc_count: 0,
|
||||
sub_aggregation,
|
||||
key,
|
||||
from,
|
||||
to,
|
||||
},
|
||||
@@ -242,9 +161,6 @@ impl SegmentRangeCollector {
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
bucket_count.add_count(buckets.len() as u32);
|
||||
bucket_count.validate_bucket_count()?;
|
||||
|
||||
Ok(SegmentRangeCollector {
|
||||
buckets,
|
||||
field_type,
|
||||
@@ -257,41 +173,36 @@ impl SegmentRangeCollector {
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = accessor.get(docs[0]);
|
||||
let val2 = accessor.get(docs[1]);
|
||||
let val3 = accessor.get(docs[2]);
|
||||
let val4 = accessor.get(docs[3]);
|
||||
let val1 = bucket_with_accessor.accessor.get(docs[0]);
|
||||
let val2 = bucket_with_accessor.accessor.get(docs[1]);
|
||||
let val3 = bucket_with_accessor.accessor.get(docs[2]);
|
||||
let val4 = bucket_with_accessor.accessor.get(docs[3]);
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
let bucket_pos4 = self.get_bucket_pos(val4);
|
||||
|
||||
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos1, docs[0], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos2, docs[1], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos3, docs[2], &bucket_with_accessor.sub_aggregation);
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = accessor.get(*doc);
|
||||
let val = bucket_with_accessor.accessor.get(*doc);
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation)?;
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
if force_flush {
|
||||
for bucket in &mut self.buckets {
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush)?;
|
||||
.flush_staged_docs(&bucket_with_accessor.sub_aggregation, force_flush);
|
||||
}
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -300,14 +211,13 @@ impl SegmentRangeCollector {
|
||||
bucket_pos: usize,
|
||||
doc: DocId,
|
||||
bucket_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
let bucket = &mut self.buckets[bucket_pos];
|
||||
|
||||
bucket.bucket.doc_count += 1;
|
||||
if let Some(sub_aggregation) = &mut bucket.bucket.sub_aggregation {
|
||||
sub_aggregation.collect(doc, bucket_with_accessor)?;
|
||||
sub_aggregation.collect(doc, bucket_with_accessor);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -333,10 +243,7 @@ impl SegmentRangeCollector {
|
||||
/// fast field.
|
||||
/// The alternative would be that every value read would be converted to the f64 range, but that is
|
||||
/// more computationally expensive when many documents are hit.
|
||||
fn to_u64_range(
|
||||
range: &RangeAggregationRange,
|
||||
field_type: &Type,
|
||||
) -> crate::Result<InternalRangeAggregationRange> {
|
||||
fn to_u64_range(range: &RangeAggregationRange, field_type: &Type) -> crate::Result<Range<u64>> {
|
||||
let start = if let Some(from) = range.from {
|
||||
f64_to_fastfield_u64(from, field_type)
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid field type".to_string()))?
|
||||
@@ -351,43 +258,39 @@ fn to_u64_range(
|
||||
u64::MAX
|
||||
};
|
||||
|
||||
Ok(InternalRangeAggregationRange {
|
||||
key: range.key.clone(),
|
||||
range: start..end,
|
||||
})
|
||||
Ok(start..end)
|
||||
}
|
||||
|
||||
/// Extends the provided buckets to contain the whole value range, by inserting buckets at the
|
||||
/// beginning and end and filling gaps.
|
||||
/// beginning and end.
|
||||
fn extend_validate_ranges(
|
||||
buckets: &[RangeAggregationRange],
|
||||
field_type: &Type,
|
||||
) -> crate::Result<Vec<InternalRangeAggregationRange>> {
|
||||
) -> crate::Result<Vec<Range<u64>>> {
|
||||
let mut converted_buckets = buckets
|
||||
.iter()
|
||||
.map(|range| to_u64_range(range, field_type))
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
converted_buckets.sort_by_key(|bucket| bucket.range.start);
|
||||
if converted_buckets[0].range.start != u64::MIN {
|
||||
converted_buckets.insert(0, (u64::MIN..converted_buckets[0].range.start).into());
|
||||
converted_buckets.sort_by_key(|bucket| bucket.start);
|
||||
if converted_buckets[0].start != u64::MIN {
|
||||
converted_buckets.insert(0, u64::MIN..converted_buckets[0].start);
|
||||
}
|
||||
|
||||
if converted_buckets[converted_buckets.len() - 1].range.end != u64::MAX {
|
||||
converted_buckets
|
||||
.push((converted_buckets[converted_buckets.len() - 1].range.end..u64::MAX).into());
|
||||
if converted_buckets[converted_buckets.len() - 1].end != u64::MAX {
|
||||
converted_buckets.push(converted_buckets[converted_buckets.len() - 1].end..u64::MAX);
|
||||
}
|
||||
|
||||
// fill up holes in the ranges
|
||||
let find_hole = |converted_buckets: &[InternalRangeAggregationRange]| {
|
||||
let find_hole = |converted_buckets: &[Range<u64>]| {
|
||||
for (pos, ranges) in converted_buckets.windows(2).enumerate() {
|
||||
if ranges[0].range.end > ranges[1].range.start {
|
||||
if ranges[0].end > ranges[1].start {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Overlapping ranges not supported range {:?}, range+1 {:?}",
|
||||
ranges[0], ranges[1]
|
||||
)));
|
||||
}
|
||||
if ranges[0].range.end != ranges[1].range.start {
|
||||
if ranges[0].end != ranges[1].start {
|
||||
return Ok(Some(pos));
|
||||
}
|
||||
}
|
||||
@@ -395,9 +298,8 @@ fn extend_validate_ranges(
|
||||
};
|
||||
|
||||
while let Some(hole_pos) = find_hole(&converted_buckets)? {
|
||||
let new_range =
|
||||
converted_buckets[hole_pos].range.end..converted_buckets[hole_pos + 1].range.start;
|
||||
converted_buckets.insert(hole_pos + 1, new_range.into());
|
||||
let new_range = converted_buckets[hole_pos].end..converted_buckets[hole_pos + 1].start;
|
||||
converted_buckets.insert(hole_pos + 1, new_range);
|
||||
}
|
||||
|
||||
Ok(converted_buckets)
|
||||
@@ -405,7 +307,7 @@ fn extend_validate_ranges(
|
||||
|
||||
pub(crate) fn range_to_string(range: &Range<u64>, field_type: &Type) -> String {
|
||||
// is_start is there for malformed requests, e.g. ig the user passes the range u64::MIN..0.0,
|
||||
// it should be rendered as "*-0" and not "*-*"
|
||||
// it should be rendererd as "*-0" and not "*-*"
|
||||
let to_str = |val: u64, is_start: bool| {
|
||||
if (is_start && val == u64::MIN) || (!is_start && val == u64::MAX) {
|
||||
"*".to_string()
|
||||
@@ -424,12 +326,16 @@ pub(crate) fn range_to_key(range: &Range<u64>, field_type: &Type) -> Key {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use serde_json::Value;
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::agg_req::{
|
||||
Aggregation, Aggregations, BucketAggregation, BucketAggregationType,
|
||||
};
|
||||
use crate::aggregation::tests::{exec_request_with_query, get_test_index_with_num_docs};
|
||||
use crate::aggregation::tests::get_test_index_with_num_docs;
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::query::AllQuery;
|
||||
|
||||
pub fn get_collector_from_ranges(
|
||||
ranges: Vec<RangeAggregationRange>,
|
||||
@@ -438,16 +344,9 @@ mod tests {
|
||||
let req = RangeAggregation {
|
||||
field: "dummy".to_string(),
|
||||
ranges,
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
SegmentRangeCollector::from_req_and_validate(
|
||||
&req,
|
||||
&Default::default(),
|
||||
&Default::default(),
|
||||
field_type,
|
||||
)
|
||||
.expect("unexpected error")
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -460,7 +359,6 @@ mod tests {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -468,7 +366,13 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = searcher.search(&AllQuery, &collector).unwrap();
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
|
||||
assert_eq!(res["range"]["buckets"][0]["key"], "*-0");
|
||||
assert_eq!(res["range"]["buckets"][0]["doc_count"], 0);
|
||||
@@ -482,131 +386,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": {
|
||||
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
"0-0.1": {"key": "0-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
"0.1-0.2": {"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
|
||||
"0.2-*": {"key": "0.2-*", "doc_count": 80, "from": 0.2},
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_custom_key_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![
|
||||
RangeAggregationRange {
|
||||
key: Some("custom-key-0-to-0.1".to_string()),
|
||||
from: Some(0f64),
|
||||
to: Some(0.1f64),
|
||||
},
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
from: Some(0.1f64),
|
||||
to: Some(0.2f64),
|
||||
},
|
||||
],
|
||||
keyed: false,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": [
|
||||
{"key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
{"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
{"key": "0.1-0.2", "doc_count": 10, "from": 0.1, "to": 0.2},
|
||||
{"key": "0.2-*", "doc_count": 80, "from": 0.2}
|
||||
]
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
|
||||
let index = get_test_index_with_num_docs(false, 100)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"range".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "fraction_f64".to_string(),
|
||||
ranges: vec![RangeAggregationRange {
|
||||
key: Some("custom-key-0-to-0.1".to_string()),
|
||||
from: Some(0f64),
|
||||
to: Some(0.1f64),
|
||||
}],
|
||||
keyed: true,
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
assert_eq!(
|
||||
res,
|
||||
json!({
|
||||
"range": {
|
||||
"buckets": {
|
||||
"*-0": { "key": "*-0", "doc_count": 0, "to": 0.0},
|
||||
"custom-key-0-to-0.1": {"key": "custom-key-0-to-0.1", "doc_count": 10, "from": 0.0, "to": 0.1},
|
||||
"0.1-*": {"key": "0.1-*", "doc_count": 90, "from": 0.1},
|
||||
}
|
||||
}
|
||||
})
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn bucket_test_extend_range_hole() {
|
||||
let buckets = vec![(10f64..20f64).into(), (30f64..40f64).into()];
|
||||
@@ -685,7 +464,6 @@ mod tests {
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
@@ -695,13 +473,11 @@ mod tests {
|
||||
|
||||
let ranges = vec![
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: Some(10.0),
|
||||
from: None,
|
||||
},
|
||||
(10.0..100.0).into(),
|
||||
RangeAggregationRange {
|
||||
key: None,
|
||||
to: None,
|
||||
from: Some(100.0),
|
||||
},
|
||||
@@ -711,7 +487,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
let ranges = vec![
|
||||
//(f64::MIN..10.0).into(),
|
||||
(10.0..100.0).into(),
|
||||
//(100.0..f64::MAX).into(),
|
||||
];
|
||||
|
||||
let collector = get_collector_from_ranges(ranges, Type::F64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,5 +1,3 @@
|
||||
use std::rc::Rc;
|
||||
|
||||
use super::agg_req::Aggregations;
|
||||
use super::agg_req_with_accessor::AggregationsWithAccessor;
|
||||
use super::agg_result::AggregationResults;
|
||||
@@ -7,29 +5,19 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
/// The default max bucket count, before the aggregation fails.
|
||||
pub const MAX_BUCKET_COUNT: u32 = 65000;
|
||||
use crate::SegmentReader;
|
||||
|
||||
/// Collector for aggregations.
|
||||
///
|
||||
/// The collector collects all aggregations by the underlying aggregation request.
|
||||
pub struct AggregationCollector {
|
||||
agg: Aggregations,
|
||||
max_bucket_count: u32,
|
||||
}
|
||||
|
||||
impl AggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
///
|
||||
/// Aggregation fails when the total bucket count is higher than max_bucket_count.
|
||||
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
|
||||
Self {
|
||||
agg,
|
||||
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
|
||||
}
|
||||
pub fn from_aggs(agg: Aggregations) -> Self {
|
||||
Self { agg }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -40,21 +28,15 @@ impl AggregationCollector {
|
||||
/// # Purpose
|
||||
/// AggregationCollector returns `IntermediateAggregationResults` and not the final
|
||||
/// `AggregationResults`, so that results from differenct indices can be merged and then converted
|
||||
/// into the final `AggregationResults` via the `into_final_result()` method.
|
||||
/// into the final `AggregationResults` via the `into()` method.
|
||||
pub struct DistributedAggregationCollector {
|
||||
agg: Aggregations,
|
||||
max_bucket_count: u32,
|
||||
}
|
||||
|
||||
impl DistributedAggregationCollector {
|
||||
/// Create collector from aggregation request.
|
||||
///
|
||||
/// max_bucket_count will default to `MAX_BUCKET_COUNT` (65000) when unset
|
||||
pub fn from_aggs(agg: Aggregations, max_bucket_count: Option<u32>) -> Self {
|
||||
Self {
|
||||
agg,
|
||||
max_bucket_count: max_bucket_count.unwrap_or(MAX_BUCKET_COUNT),
|
||||
}
|
||||
pub fn from_aggs(agg: Aggregations) -> Self {
|
||||
Self { agg }
|
||||
}
|
||||
}
|
||||
|
||||
@@ -68,11 +50,7 @@ impl Collector for DistributedAggregationCollector {
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
reader,
|
||||
self.max_bucket_count,
|
||||
)
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -97,11 +75,7 @@ impl Collector for AggregationCollector {
|
||||
_segment_local_id: crate::SegmentOrdinal,
|
||||
reader: &crate::SegmentReader,
|
||||
) -> crate::Result<Self::Child> {
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(
|
||||
&self.agg,
|
||||
reader,
|
||||
self.max_bucket_count,
|
||||
)
|
||||
AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader)
|
||||
}
|
||||
|
||||
fn requires_scoring(&self) -> bool {
|
||||
@@ -112,18 +86,17 @@ impl Collector for AggregationCollector {
|
||||
&self,
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
let res = merge_fruits(segment_fruits)?;
|
||||
res.into_final_bucket_result(self.agg.clone())
|
||||
merge_fruits(segment_fruits)
|
||||
.map(|res| AggregationResults::from_intermediate_and_req(res, self.agg.clone()))
|
||||
}
|
||||
}
|
||||
|
||||
fn merge_fruits(
|
||||
mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
|
||||
mut segment_fruits: Vec<IntermediateAggregationResults>,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
if let Some(fruit) = segment_fruits.pop() {
|
||||
let mut fruit = fruit?;
|
||||
if let Some(mut fruit) = segment_fruits.pop() {
|
||||
for next_fruit in segment_fruits {
|
||||
fruit.merge_fruits(next_fruit?);
|
||||
fruit.merge_fruits(next_fruit);
|
||||
}
|
||||
Ok(fruit)
|
||||
} else {
|
||||
@@ -133,9 +106,8 @@ fn merge_fruits(
|
||||
|
||||
/// AggregationSegmentCollector does the aggregation collection on a segment.
|
||||
pub struct AggregationSegmentCollector {
|
||||
aggs_with_accessor: AggregationsWithAccessor,
|
||||
aggs: AggregationsWithAccessor,
|
||||
result: SegmentAggregationResultsCollector,
|
||||
error: Option<TantivyError>,
|
||||
}
|
||||
|
||||
impl AggregationSegmentCollector {
|
||||
@@ -144,40 +116,27 @@ impl AggregationSegmentCollector {
|
||||
pub fn from_agg_req_and_reader(
|
||||
agg: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
max_bucket_count: u32,
|
||||
) -> crate::Result<Self> {
|
||||
let aggs_with_accessor =
|
||||
get_aggs_with_accessor_and_validate(agg, reader, Rc::default(), max_bucket_count)?;
|
||||
let aggs_with_accessor = get_aggs_with_accessor_and_validate(agg, reader)?;
|
||||
let result =
|
||||
SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
|
||||
Ok(AggregationSegmentCollector {
|
||||
aggs_with_accessor,
|
||||
aggs: aggs_with_accessor,
|
||||
result,
|
||||
error: None,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl SegmentCollector for AggregationSegmentCollector {
|
||||
type Fruit = crate::Result<IntermediateAggregationResults>;
|
||||
type Fruit = IntermediateAggregationResults;
|
||||
|
||||
#[inline]
|
||||
fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
|
||||
if self.error.is_some() {
|
||||
return;
|
||||
}
|
||||
if let Err(err) = self.result.collect(doc, &self.aggs_with_accessor) {
|
||||
self.error = Some(err);
|
||||
}
|
||||
self.result.collect(doc, &self.aggs);
|
||||
}
|
||||
|
||||
fn harvest(mut self) -> Self::Fruit {
|
||||
if let Some(err) = self.error {
|
||||
return Err(err);
|
||||
}
|
||||
self.result
|
||||
.flush_staged_docs(&self.aggs_with_accessor, true)?;
|
||||
self.result
|
||||
.into_intermediate_aggregations_result(&self.aggs_with_accessor)
|
||||
self.result.flush_staged_docs(&self.aggs, true);
|
||||
self.result.into()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,75 +3,37 @@
|
||||
//! indices.
|
||||
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use fnv::FnvHashMap;
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{
|
||||
Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType,
|
||||
MetricAggregation,
|
||||
};
|
||||
use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry};
|
||||
use super::bucket::{
|
||||
cut_off_buckets, get_agg_name_and_property, intermediate_histogram_buckets_to_final_buckets,
|
||||
GetDocCount, Order, OrderTarget, SegmentHistogramBucketEntry, TermsAggregation,
|
||||
};
|
||||
use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation};
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::SegmentMetricResultCollector;
|
||||
use super::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentHistogramBucketEntry,
|
||||
SegmentMetricResultCollector, SegmentRangeBucketEntry,
|
||||
};
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
|
||||
}
|
||||
|
||||
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
|
||||
fn from(tree: SegmentAggregationResultsCollector) -> Self {
|
||||
let metrics = tree.metrics.map(VecWithNames::from_other);
|
||||
let buckets = tree.buckets.map(VecWithNames::from_other);
|
||||
|
||||
Self { metrics, buckets }
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
pub fn into_final_bucket_result(self, req: Aggregations) -> crate::Result<AggregationResults> {
|
||||
self.into_final_bucket_result_internal(&(req.into()))
|
||||
}
|
||||
|
||||
/// Convert intermediate result and its aggregation request to the final result.
|
||||
///
|
||||
/// Internal function, AggregationsInternal is used instead Aggregations, which is optimized
|
||||
/// for internal processing, by splitting metric and buckets into seperate groups.
|
||||
pub(crate) fn into_final_bucket_result_internal(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
// request
|
||||
let mut results: HashMap<String, AggregationResult> = HashMap::new();
|
||||
|
||||
if let Some(buckets) = self.buckets {
|
||||
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets)?
|
||||
} else {
|
||||
// When there are no buckets, we create empty buckets, so that the serialized json
|
||||
// format is constant
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
|
||||
};
|
||||
|
||||
if let Some(metrics) = self.metrics {
|
||||
convert_and_add_final_metrics_to_result(&mut results, metrics);
|
||||
} else {
|
||||
// When there are no metrics, we create empty metric results, so that the serialized
|
||||
// json format is constant
|
||||
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
|
||||
}
|
||||
|
||||
Ok(AggregationResults(results))
|
||||
}
|
||||
|
||||
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
|
||||
let metrics = if req.metrics.is_empty() {
|
||||
None
|
||||
@@ -131,58 +93,6 @@ impl IntermediateAggregationResults {
|
||||
}
|
||||
}
|
||||
|
||||
fn convert_and_add_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
metrics: VecWithNames<IntermediateMetricResult>,
|
||||
) {
|
||||
results.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
}
|
||||
|
||||
fn add_empty_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_metrics: &VecWithNames<MetricAggregation>,
|
||||
) -> crate::Result<()> {
|
||||
results.extend(req_metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_empty_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
let requested_buckets = req_buckets.iter();
|
||||
for (key, req) in requested_buckets {
|
||||
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
|
||||
results.insert(key.to_string(), empty_bucket);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn convert_and_add_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
buckets: VecWithNames<IntermediateBucketResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
assert_eq!(buckets.len(), req_buckets.len());
|
||||
|
||||
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
|
||||
for ((key, bucket), req) in buckets_with_request {
|
||||
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req)?);
|
||||
results.insert(key, result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum IntermediateAggregationResult {
|
||||
@@ -252,83 +162,29 @@ impl IntermediateMetricResult {
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(IntermediateRangeBucketResult),
|
||||
Range(FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>),
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
},
|
||||
/// Term aggregation
|
||||
Terms(IntermediateTermBucketResult),
|
||||
}
|
||||
|
||||
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
|
||||
fn from(collector: SegmentBucketResultCollector) -> Self {
|
||||
match collector {
|
||||
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
pub(crate) fn into_final_bucket_result(
|
||||
self,
|
||||
req: &BucketAggregationInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
match self {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_res
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| bucket.into_final_bucket_entry(&req.sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets.sort_by(|left, right| {
|
||||
left.from
|
||||
.unwrap_or(f64::MIN)
|
||||
.total_cmp(&right.from.unwrap_or(f64::MIN))
|
||||
});
|
||||
|
||||
let is_keyed = req
|
||||
.as_range()
|
||||
.expect("unexpected aggregation, expected range aggregation")
|
||||
.keyed;
|
||||
let buckets = if is_keyed {
|
||||
let mut bucket_map =
|
||||
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
|
||||
for bucket in buckets {
|
||||
bucket_map.insert(bucket.key.to_string(), bucket);
|
||||
}
|
||||
BucketEntries::HashMap(bucket_map)
|
||||
} else {
|
||||
BucketEntries::Vec(buckets)
|
||||
};
|
||||
Ok(BucketResult::Range { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Histogram { buckets } => {
|
||||
let buckets = intermediate_histogram_buckets_to_final_buckets(
|
||||
buckets,
|
||||
req.as_histogram()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
&req.sub_aggregation,
|
||||
)?;
|
||||
|
||||
let buckets = if req.as_histogram().unwrap().keyed {
|
||||
let mut bucket_map =
|
||||
FnvHashMap::with_capacity_and_hasher(buckets.len(), Default::default());
|
||||
for bucket in buckets {
|
||||
bucket_map.insert(bucket.key.to_string(), bucket);
|
||||
}
|
||||
BucketEntries::HashMap(bucket_map)
|
||||
} else {
|
||||
BucketEntries::Vec(buckets)
|
||||
};
|
||||
Ok(BucketResult::Histogram { buckets })
|
||||
}
|
||||
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
|
||||
req.as_term()
|
||||
.expect("unexpected aggregation, expected term aggregation"),
|
||||
&req.sub_aggregation,
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
|
||||
match req {
|
||||
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
|
||||
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
|
||||
BucketAggregationType::Histogram(_) => {
|
||||
IntermediateBucketResult::Histogram { buckets: vec![] }
|
||||
@@ -338,34 +194,24 @@ impl IntermediateBucketResult {
|
||||
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Terms(term_res_left),
|
||||
IntermediateBucketResult::Terms(term_res_right),
|
||||
IntermediateBucketResult::Range(entries_left),
|
||||
IntermediateBucketResult::Range(entries_right),
|
||||
) => {
|
||||
merge_maps(&mut term_res_left.entries, term_res_right.entries);
|
||||
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
|
||||
term_res_left.doc_count_error_upper_bound +=
|
||||
term_res_right.doc_count_error_upper_bound;
|
||||
}
|
||||
|
||||
(
|
||||
IntermediateBucketResult::Range(range_res_left),
|
||||
IntermediateBucketResult::Range(range_res_right),
|
||||
) => {
|
||||
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
|
||||
merge_maps(entries_left, entries_right);
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_left,
|
||||
buckets: entries_left,
|
||||
..
|
||||
},
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_right,
|
||||
buckets: entries_right,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
let buckets = buckets_left
|
||||
let mut buckets = entries_left
|
||||
.drain(..)
|
||||
.merge_join_by(buckets_right.into_iter(), |left, right| {
|
||||
.merge_join_by(entries_right.into_iter(), |left, right| {
|
||||
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
|
||||
})
|
||||
.map(|either| match either {
|
||||
@@ -378,7 +224,7 @@ impl IntermediateBucketResult {
|
||||
})
|
||||
.collect();
|
||||
|
||||
*buckets_left = buckets;
|
||||
std::mem::swap(entries_left, &mut buckets);
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
@@ -386,114 +232,10 @@ impl IntermediateBucketResult {
|
||||
(IntermediateBucketResult::Histogram { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Terms { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Range aggregation including error counts
|
||||
pub struct IntermediateRangeBucketResult {
|
||||
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Term aggregation including error counts
|
||||
pub struct IntermediateTermBucketResult {
|
||||
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
|
||||
pub(crate) sum_other_doc_count: u64,
|
||||
pub(crate) doc_count_error_upper_bound: u64,
|
||||
}
|
||||
|
||||
impl IntermediateTermBucketResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
.entries
|
||||
.into_iter()
|
||||
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
|
||||
.map(|(key, entry)| {
|
||||
Ok(BucketEntry {
|
||||
key: Key::Str(key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: entry
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(sub_aggregation_req)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
let order = req.order.order;
|
||||
match req.order.target {
|
||||
OrderTarget::Key => {
|
||||
buckets.sort_by(|left, right| {
|
||||
if req.order.order == Order::Desc {
|
||||
left.key.partial_cmp(&right.key)
|
||||
} else {
|
||||
right.key.partial_cmp(&left.key)
|
||||
}
|
||||
.expect("expected type string, which is always sortable")
|
||||
});
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
if req.order.order == Order::Desc {
|
||||
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
|
||||
} else {
|
||||
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
|
||||
}
|
||||
}
|
||||
OrderTarget::SubAggregation(name) => {
|
||||
let (agg_name, agg_property) = get_agg_name_and_property(&name);
|
||||
let mut buckets_with_val = buckets
|
||||
.into_iter()
|
||||
.map(|bucket| {
|
||||
let val = bucket
|
||||
.sub_aggregation
|
||||
.get_value_from_aggregation(agg_name, agg_property)?
|
||||
.unwrap_or(f64::NAN);
|
||||
Ok((bucket, val))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets_with_val.sort_by(|(_, val1), (_, val2)| match &order {
|
||||
Order::Desc => val2.total_cmp(val1),
|
||||
Order::Asc => val1.total_cmp(val2),
|
||||
});
|
||||
buckets = buckets_with_val
|
||||
.into_iter()
|
||||
.map(|(bucket, _val)| bucket)
|
||||
.collect_vec();
|
||||
}
|
||||
}
|
||||
|
||||
// We ignore _term_doc_count_before_cutoff here, because it increases the upperbound error
|
||||
// only for terms that didn't make it into the top N.
|
||||
//
|
||||
// This can be interesting, as a value of quality of the results, but not good to check the
|
||||
// actual error count for the returned terms.
|
||||
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
|
||||
cut_off_buckets(&mut buckets, req.size as usize);
|
||||
|
||||
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
|
||||
Some(self.doc_count_error_upper_bound)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(BucketResult::Terms {
|
||||
buckets,
|
||||
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
|
||||
doc_count_error_upper_bound,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait MergeFruits {
|
||||
fn merge_fruits(&mut self, other: Self);
|
||||
}
|
||||
@@ -525,21 +267,6 @@ pub struct IntermediateHistogramBucketEntry {
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
impl IntermediateHistogramBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketEntry> {
|
||||
Ok(BucketEntry {
|
||||
key: Key::F64(self.key),
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
|
||||
fn from(entry: SegmentHistogramBucketEntry) -> Self {
|
||||
IntermediateHistogramBucketEntry {
|
||||
@@ -550,6 +277,26 @@ impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl
|
||||
From<(
|
||||
SegmentHistogramBucketEntry,
|
||||
SegmentAggregationResultsCollector,
|
||||
)> for IntermediateHistogramBucketEntry
|
||||
{
|
||||
fn from(
|
||||
entry: (
|
||||
SegmentHistogramBucketEntry,
|
||||
SegmentAggregationResultsCollector,
|
||||
),
|
||||
) -> Self {
|
||||
IntermediateHistogramBucketEntry {
|
||||
key: entry.0.key,
|
||||
doc_count: entry.0.doc_count,
|
||||
sub_aggregation: entry.1.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -558,6 +305,7 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
pub(crate) values: Option<Vec<u64>>,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
@@ -568,37 +316,22 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl IntermediateRangeBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<RangeBucketEntry> {
|
||||
Ok(RangeBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: self
|
||||
.sub_aggregation
|
||||
.into_final_bucket_result_internal(req)?,
|
||||
to: self.to,
|
||||
from: self.from,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
|
||||
fn from(entry: SegmentRangeBucketEntry) -> Self {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
|
||||
sub_aggregation.into()
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
/// This is the term entry for a bucket, which contains a count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateTermBucketEntry {
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
|
||||
impl MergeFruits for IntermediateTermBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
IntermediateRangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
values: None,
|
||||
sub_aggregation,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -633,6 +366,7 @@ mod tests {
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
@@ -641,7 +375,7 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
IntermediateBucketResult::Range(buckets),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
@@ -660,6 +394,7 @@ mod tests {
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
@@ -671,7 +406,7 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
IntermediateBucketResult::Range(buckets),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
|
||||
@@ -19,7 +19,7 @@ use crate::DocId;
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
use crate::DocId;
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
@@ -53,23 +53,6 @@ pub struct Stats {
|
||||
pub avg: Option<f64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match agg_property {
|
||||
"count" => Ok(Some(self.count as f64)),
|
||||
"sum" => Ok(Some(self.sum)),
|
||||
"standard_deviation" => Ok(self.standard_deviation),
|
||||
"min" => Ok(self.min),
|
||||
"max" => Ok(self.max),
|
||||
"avg" => Ok(self.avg),
|
||||
_ => Err(TantivyError::InvalidArgument(format!(
|
||||
"unknown property {} on stats metric aggregation",
|
||||
agg_property
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// IntermediateStats contains the mergeable version for stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
@@ -222,7 +205,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
@@ -285,7 +268,6 @@ mod tests {
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: iter::once((
|
||||
"stats".to_string(),
|
||||
@@ -300,7 +282,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
|
||||
@@ -20,15 +20,14 @@
|
||||
//!
|
||||
//! #### Limitations
|
||||
//!
|
||||
//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and
|
||||
//! fast fields on text fields.
|
||||
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
|
||||
//!
|
||||
//! # JSON Format
|
||||
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
|
||||
//!
|
||||
//! ```verbatim
|
||||
//! let agg_req: Aggregations = serde_json::from_str(json_request_string).unwrap();
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req);
|
||||
//! let searcher = reader.searcher();
|
||||
//! let agg_res = searcher.search(&term_query, &collector).unwrap_err();
|
||||
//! let json_response_string: String = &serde_json::to_string(&agg_res)?;
|
||||
@@ -38,7 +37,6 @@
|
||||
//! - [Bucket](bucket)
|
||||
//! - [Histogram](bucket::HistogramAggregation)
|
||||
//! - [Range](bucket::RangeAggregation)
|
||||
//! - [Terms](bucket::TermsAggregation)
|
||||
//! - [Metric](metric)
|
||||
//! - [Average](metric::AverageAggregation)
|
||||
//! - [Stats](metric::StatsAggregation)
|
||||
@@ -68,7 +66,7 @@
|
||||
//! .into_iter()
|
||||
//! .collect();
|
||||
//!
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
//! let collector = AggregationCollector::from_aggs(agg_req);
|
||||
//!
|
||||
//! let searcher = reader.searcher();
|
||||
//! let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
|
||||
@@ -132,7 +130,6 @@
|
||||
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
|
||||
//! field: "score".to_string(),
|
||||
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
//! keyed: false,
|
||||
//! }),
|
||||
//! sub_aggregation: sub_agg_req_1.clone(),
|
||||
//! }),
|
||||
@@ -150,8 +147,7 @@
|
||||
//! IntermediateAggregationResults provides the
|
||||
//! [merge_fruits](intermediate_agg_result::IntermediateAggregationResults::merge_fruits) method to
|
||||
//! merge multiple results. The merged result can then be converted into
|
||||
//! [agg_result::AggregationResults] via the
|
||||
//! [agg_result::AggregationResults::from_intermediate_and_req] method.
|
||||
//! [agg_result::AggregationResults] via the [Into] trait.
|
||||
|
||||
pub mod agg_req;
|
||||
mod agg_req_with_accessor;
|
||||
@@ -167,7 +163,6 @@ use std::fmt::Display;
|
||||
|
||||
pub use collector::{
|
||||
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
||||
MAX_BUCKET_COUNT,
|
||||
};
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
@@ -250,14 +245,6 @@ impl<T: Clone> VecWithNames<T> {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.keys.is_empty()
|
||||
}
|
||||
fn len(&self) -> usize {
|
||||
self.keys.len()
|
||||
}
|
||||
fn get(&self, name: &str) -> Option<&T> {
|
||||
self.keys()
|
||||
.position(|key| key == name)
|
||||
.map(|pos| &self.values[pos])
|
||||
}
|
||||
}
|
||||
|
||||
/// The serialized key is used in a HashMap.
|
||||
@@ -324,16 +311,13 @@ mod tests {
|
||||
use super::bucket::RangeAggregation;
|
||||
use super::collector::AggregationCollector;
|
||||
use super::metric::AverageAggregation;
|
||||
use crate::aggregation::agg_req::{
|
||||
get_term_dict_field_names, BucketAggregationType, MetricAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::bucket::TermsAggregation;
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use crate::{Index, Term};
|
||||
|
||||
fn get_avg_req(field_name: &str) -> Aggregation {
|
||||
@@ -352,82 +336,17 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
|
||||
exec_request_with_query(agg_req, index, None)
|
||||
}
|
||||
pub fn exec_request_with_query(
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = if let Some((field, term)) = query {
|
||||
let text_field = reader.searcher().schema().get_field(field).unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
searcher.search(&term_query, &collector)?
|
||||
} else {
|
||||
searcher.search(&AllQuery, &collector)?
|
||||
};
|
||||
|
||||
// Test serialization/deserialization roundtrip
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_values(
|
||||
merge_segments: bool,
|
||||
values: &[f64],
|
||||
) -> crate::Result<Index> {
|
||||
// Every value gets its own segment
|
||||
let mut segment_and_values = vec![];
|
||||
for value in values {
|
||||
segment_and_values.push(vec![(*value, value.to_string())]);
|
||||
}
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_terms(
|
||||
merge_segments: bool,
|
||||
values: &[Vec<&str>],
|
||||
) -> crate::Result<Index> {
|
||||
// Every value gets its own segment
|
||||
let segment_and_values = values
|
||||
.iter()
|
||||
.map(|terms| {
|
||||
terms
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, term)| (i as f64, term.to_string()))
|
||||
.collect()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_values_and_terms(
|
||||
merge_segments: bool,
|
||||
segment_and_values: &[Vec<(f64, String)>],
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default()
|
||||
.set_index_option(IndexRecordOption::Basic)
|
||||
.set_fieldnorms(false),
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -439,22 +358,16 @@ mod tests {
|
||||
);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
// let mut index_writer = index.writer_for_tests()?;
|
||||
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
|
||||
for values in segment_and_values {
|
||||
for (i, term) in values {
|
||||
let i = *i;
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_id => term.to_string(),
|
||||
string_field_id => term.to_string(),
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
}
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for &i in values {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
}
|
||||
@@ -462,11 +375,9 @@ mod tests {
|
||||
let segment_ids = index
|
||||
.searchable_segment_ids()
|
||||
.expect("Searchable segments failed.");
|
||||
if segment_ids.len() > 1 {
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
Ok(index)
|
||||
@@ -477,13 +388,15 @@ mod tests {
|
||||
merge_segments: bool,
|
||||
use_distributed_collector: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut values_and_terms = (0..80)
|
||||
.map(|val| vec![(val as f64, "terma".to_string())])
|
||||
.collect::<Vec<_>>();
|
||||
values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string();
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?;
|
||||
let index = get_test_index_with_num_docs(merge_segments, 80)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
assert_eq!(DOC_BLOCK_SIZE, 64);
|
||||
// In the tree we cache Documents of DOC_BLOCK_SIZE, before passing them down as one block.
|
||||
@@ -518,20 +431,7 @@ mod tests {
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0,
|
||||
"offset": 3.0
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"term_agg_test":{
|
||||
"terms": {
|
||||
"field": "string_id"
|
||||
"offset": 3.0,
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
@@ -549,18 +449,18 @@ mod tests {
|
||||
.unwrap();
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), None);
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
|
||||
intermediate_agg_result
|
||||
.into_final_bucket_result(agg_req)
|
||||
.unwrap()
|
||||
AggregationResults::from_intermediate_and_req(
|
||||
searcher.search(&term_query, &collector).unwrap(),
|
||||
agg_req,
|
||||
)
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
};
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
@@ -590,46 +490,6 @@ mod tests {
|
||||
);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70);
|
||||
|
||||
assert_eq!(
|
||||
res["term_agg_test"],
|
||||
json!(
|
||||
{
|
||||
"buckets": [
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 70,
|
||||
"key": 0.0
|
||||
},
|
||||
{
|
||||
"doc_count": 9,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 79,
|
||||
"key": "terma"
|
||||
},
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 1,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 1,
|
||||
"key": "termb"
|
||||
}
|
||||
],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
)
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -647,10 +507,8 @@ mod tests {
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
schema_builder.add_text_field("dummy_text", STRING);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -766,7 +624,6 @@ mod tests {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score".to_string(),
|
||||
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -777,7 +634,6 @@ mod tests {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -788,7 +644,6 @@ mod tests {
|
||||
bucket_agg: BucketAggregationType::Range(RangeAggregation {
|
||||
field: "score_i64".to_string(),
|
||||
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -797,7 +652,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
|
||||
@@ -858,21 +713,10 @@ mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let sub_agg_req: Aggregations = vec![
|
||||
("average_in_range".to_string(), get_avg_req("score")),
|
||||
(
|
||||
"term_agg".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let sub_agg_req: Aggregations =
|
||||
vec![("average_in_range".to_string(), get_avg_req("score"))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let agg_req: Aggregations = if use_elastic_json_req {
|
||||
let elasticsearch_compatible_json_req = r#"
|
||||
{
|
||||
@@ -888,8 +732,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"rangei64": {
|
||||
@@ -904,8 +747,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"average": {
|
||||
@@ -923,8 +765,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -945,7 +786,6 @@ mod tests {
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req.clone(),
|
||||
}),
|
||||
@@ -960,7 +800,6 @@ mod tests {
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req.clone(),
|
||||
}),
|
||||
@@ -975,7 +814,6 @@ mod tests {
|
||||
(7f64..19f64).into(),
|
||||
(19f64..20f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req,
|
||||
}),
|
||||
@@ -986,20 +824,17 @@ mod tests {
|
||||
agg_req
|
||||
};
|
||||
|
||||
let field_names = get_term_dict_field_names(&agg_req);
|
||||
assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone(), None);
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let res = searcher.search(&term_query, &collector).unwrap();
|
||||
// Test de/serialization roundtrip on intermediate_agg_result
|
||||
let res: IntermediateAggregationResults =
|
||||
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
|
||||
res.into_final_bucket_result(agg_req.clone()).unwrap()
|
||||
AggregationResults::from_intermediate_and_req(res, agg_req.clone())
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone(), None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
@@ -1057,7 +892,7 @@ mod tests {
|
||||
);
|
||||
|
||||
// Test empty result set
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&query_with_no_hits, &collector).unwrap();
|
||||
|
||||
@@ -1122,17 +957,17 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
|
||||
searcher.search(&AllQuery, &collector).unwrap_err()
|
||||
};
|
||||
|
||||
let agg_res = avg_on_field("dummy_text");
|
||||
let agg_res = avg_on_field("text");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
r#"InvalidArgument("Only single value fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
);
|
||||
|
||||
let agg_res = avg_on_field("not_exist_field");
|
||||
@@ -1144,7 +979,7 @@ mod tests {
|
||||
let agg_res = avg_on_field("scores_i64");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
|
||||
r#"InvalidArgument("Invalid field type in aggregation I64, only Cardinality::SingleValue supported")"#
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1153,12 +988,11 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use test::{self, Bencher};
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds, TermsAggregation};
|
||||
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds};
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::query::AllQuery;
|
||||
|
||||
@@ -1170,10 +1004,6 @@ mod tests {
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let text_field_many_terms =
|
||||
schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||
let text_field_few_terms =
|
||||
schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -1181,10 +1011,6 @@ mod tests {
|
||||
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
let many_terms_data = (0..15_000)
|
||||
.map(|num| format!("author{}", num))
|
||||
.collect::<Vec<_>>();
|
||||
{
|
||||
let mut rng = thread_rng();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -1193,8 +1019,6 @@ mod tests {
|
||||
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
score_field => val as u64,
|
||||
score_field_f64 => val as f64,
|
||||
score_field_i64 => val as i64,
|
||||
@@ -1235,7 +1059,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1266,7 +1090,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1297,7 +1121,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1336,7 +1160,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1346,64 +1170,6 @@ mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_few(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_few_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_many(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_many_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req, None);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_range_only(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
@@ -1423,7 +1189,6 @@ mod tests {
|
||||
(40000f64..50000f64).into(),
|
||||
(50000f64..60000f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
@@ -1431,7 +1196,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1466,7 +1231,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1505,7 +1270,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1535,7 +1300,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
@@ -1583,7 +1348,6 @@ mod tests {
|
||||
(7000f64..20000f64).into(),
|
||||
(20000f64..60000f64).into(),
|
||||
],
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: sub_agg_req_1.clone(),
|
||||
}),
|
||||
@@ -1592,7 +1356,7 @@ mod tests {
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1, None);
|
||||
let collector = AggregationCollector::from_aggs(agg_req_1);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
|
||||
@@ -4,22 +4,18 @@
|
||||
//! merging.
|
||||
|
||||
use std::fmt::Debug;
|
||||
use std::rc::Rc;
|
||||
use std::sync::atomic::AtomicU32;
|
||||
|
||||
use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
use super::collector::MAX_BUCKET_COUNT;
|
||||
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector};
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::VecWithNames;
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::{DocId, TantivyError};
|
||||
use crate::DocId;
|
||||
|
||||
pub(crate) const DOC_BLOCK_SIZE: usize = 64;
|
||||
pub(crate) type DocBlock = [DocId; DOC_BLOCK_SIZE];
|
||||
@@ -32,17 +28,6 @@ pub(crate) struct SegmentAggregationResultsCollector {
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Default for SegmentAggregationResultsCollector {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics: Default::default(),
|
||||
buckets: Default::default(),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
@@ -55,25 +40,6 @@ impl Debug for SegmentAggregationResultsCollector {
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub fn into_intermediate_aggregations_result(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
let buckets = if let Some(buckets) = self.buckets {
|
||||
let entries = buckets
|
||||
.into_iter()
|
||||
.zip(agg_with_accessor.buckets.values())
|
||||
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
Some(VecWithNames::from_entries(entries))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let metrics = self.metrics.map(VecWithNames::from_other);
|
||||
|
||||
Ok(IntermediateAggregationResults { metrics, buckets })
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
@@ -118,23 +84,19 @@ impl SegmentAggregationResultsCollector {
|
||||
&mut self,
|
||||
doc: crate::DocId,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
self.staged_docs[self.num_staged_docs] = doc;
|
||||
self.num_staged_docs += 1;
|
||||
if self.num_staged_docs == self.staged_docs.len() {
|
||||
self.flush_staged_docs(agg_with_accessor, false)?;
|
||||
self.flush_staged_docs(agg_with_accessor, false);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub(crate) fn flush_staged_docs(
|
||||
&mut self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
if self.num_staged_docs == 0 {
|
||||
return Ok(());
|
||||
}
|
||||
) {
|
||||
if let Some(metrics) = &mut self.metrics {
|
||||
for (collector, agg_with_accessor) in
|
||||
metrics.values_mut().zip(agg_with_accessor.metrics.values())
|
||||
@@ -152,12 +114,11 @@ impl SegmentAggregationResultsCollector {
|
||||
&self.staged_docs[..self.num_staged_docs],
|
||||
agg_with_accessor,
|
||||
force_flush,
|
||||
)?;
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
self.num_staged_docs = 0;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -201,58 +162,27 @@ impl SegmentMetricResultCollector {
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
Histogram(Box<SegmentHistogramCollector>),
|
||||
Terms(Box<SegmentTermCollector>),
|
||||
Histogram(SegmentHistogramCollector),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
|
||||
SegmentTermCollector::from_req_and_validate(
|
||||
terms_req,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_multi()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
BucketAggregationType::Range(range_req) => {
|
||||
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
&req.sub_aggregation,
|
||||
&req.bucket_count,
|
||||
req.field_type,
|
||||
)?))
|
||||
}
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(
|
||||
SegmentHistogramCollector::from_req_and_validate(
|
||||
histogram,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
&req.accessor,
|
||||
)?,
|
||||
))),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -262,52 +192,42 @@ impl SegmentBucketResultCollector {
|
||||
doc: &[DocId],
|
||||
bucket_with_accessor: &BucketAggregationWithAccessor,
|
||||
force_flush: bool,
|
||||
) -> crate::Result<()> {
|
||||
) {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
range.collect_block(doc, bucket_with_accessor, force_flush);
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
}
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.collect_block(doc, bucket_with_accessor, force_flush)?;
|
||||
histogram.collect_block(doc, bucket_with_accessor, force_flush)
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) struct BucketCount {
|
||||
/// The counter which is shared between the aggregations for one request.
|
||||
pub(crate) bucket_count: Rc<AtomicU32>,
|
||||
pub(crate) max_bucket_count: u32,
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentHistogramBucketEntry {
|
||||
pub key: f64,
|
||||
pub doc_count: u64,
|
||||
}
|
||||
|
||||
impl Default for BucketCount {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
bucket_count: Default::default(),
|
||||
max_bucket_count: MAX_BUCKET_COUNT,
|
||||
}
|
||||
}
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl BucketCount {
|
||||
pub(crate) fn validate_bucket_count(&self) -> crate::Result<()> {
|
||||
if self.get_count() > self.max_bucket_count {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Aborting aggregation because too many buckets were created".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
pub(crate) fn add_count(&self, count: u32) {
|
||||
self.bucket_count
|
||||
.fetch_add(count as u32, std::sync::atomic::Ordering::Relaxed);
|
||||
}
|
||||
pub(crate) fn get_count(&self) -> u32 {
|
||||
self.bucket_count.load(std::sync::atomic::Ordering::Relaxed)
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -271,8 +271,8 @@ impl Collector for FacetCollector {
|
||||
let mut facet_streamer = facet_reader.facet_dict().range().into_stream()?;
|
||||
if facet_streamer.advance() {
|
||||
'outer: loop {
|
||||
// at the beginning of this loop, facet_streamer
|
||||
// is positioned on a term that has not been processed yet.
|
||||
// at the begining of this loop, facet_streamer
|
||||
// is positionned on a term that has not been processed yet.
|
||||
let skip_result = skip(facet_streamer.key(), &mut collapse_facet_it);
|
||||
match skip_result {
|
||||
SkipResult::Found => {
|
||||
|
||||
@@ -72,7 +72,8 @@ impl HistogramComputer {
|
||||
return;
|
||||
}
|
||||
let delta = value - self.min_value;
|
||||
let bucket_id: usize = self.divider.divide(delta) as usize;
|
||||
let delta_u64 = delta.to_u64();
|
||||
let bucket_id: usize = self.divider.divide(delta_u64) as usize;
|
||||
if bucket_id < self.counts.len() {
|
||||
self.counts[bucket_id] += 1;
|
||||
}
|
||||
@@ -272,21 +273,21 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(
|
||||
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let all_query = AllQuery;
|
||||
let week_histogram_collector = HistogramCollector::new(
|
||||
date_field,
|
||||
DateTime::from_primitive(
|
||||
DateTime::new_primitive(
|
||||
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
|
||||
),
|
||||
3_600_000_000 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
10,
|
||||
);
|
||||
let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;
|
||||
|
||||
@@ -92,7 +92,7 @@ mod histogram_collector;
|
||||
pub use histogram_collector::HistogramCollector;
|
||||
|
||||
mod multi_collector;
|
||||
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
|
||||
pub use self::multi_collector::MultiCollector;
|
||||
|
||||
mod top_collector;
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ use super::{Collector, SegmentCollector};
|
||||
use crate::collector::Fruit;
|
||||
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};
|
||||
|
||||
/// MultiFruit keeps Fruits from every nested Collector
|
||||
pub struct MultiFruit {
|
||||
sub_fruits: Vec<Option<Box<dyn Fruit>>>,
|
||||
}
|
||||
@@ -80,17 +79,12 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
|
||||
}
|
||||
}
|
||||
|
||||
/// FruitHandle stores reference to the corresponding collector inside MultiCollector
|
||||
pub struct FruitHandle<TFruit: Fruit> {
|
||||
pos: usize,
|
||||
_phantom: PhantomData<TFruit>,
|
||||
}
|
||||
|
||||
impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// Extract a typed fruit off a multifruit.
|
||||
///
|
||||
/// This function involves downcasting and can panic if the multifruit was
|
||||
/// created using faulty code.
|
||||
pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
|
||||
let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
|
||||
*boxed_fruit
|
||||
|
||||
@@ -26,11 +26,11 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::new_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::new_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -55,7 +55,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
assert_eq!(filtered_top_docs.len(), 0);
|
||||
|
||||
fn date_filter(value: DateTime) -> bool {
|
||||
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
(value.to_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
> 0
|
||||
}
|
||||
@@ -69,8 +69,10 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
|
||||
/// Stores all of the doc ids.
|
||||
/// This collector is only used for tests.
|
||||
/// It is unusable in practise, as it does
|
||||
/// not store the segment ordinals
|
||||
/// It is unusable in pr
|
||||
///
|
||||
/// actise, as it does not store
|
||||
/// the segment ordinals
|
||||
pub struct TestCollector {
|
||||
pub compute_score: bool,
|
||||
}
|
||||
@@ -263,7 +265,7 @@ impl SegmentCollector for BytesFastFieldSegmentCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn make_test_searcher() -> crate::Result<Searcher> {
|
||||
fn make_test_searcher() -> crate::Result<crate::LeasedItem<Searcher>> {
|
||||
let schema = Schema::builder().build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
@@ -137,7 +137,7 @@ where T: PartialOrd + Clone
|
||||
/// sorted by type `T`.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
pub(crate) struct TopSegmentCollector<T> {
|
||||
limit: usize,
|
||||
|
||||
@@ -79,7 +79,7 @@ where
|
||||
/// sorted by their score.
|
||||
///
|
||||
/// The implementation is based on a `BinaryHeap`.
|
||||
/// The theoretical complexity for collecting the top `K` out of `n` documents
|
||||
/// The theorical complexity for collecting the top `K` out of `n` documents
|
||||
/// is `O(n log K)`.
|
||||
///
|
||||
/// This collector guarantees a stable sorting in case of a tie on the
|
||||
@@ -283,7 +283,7 @@ impl TopDocs {
|
||||
///
|
||||
/// # See also
|
||||
///
|
||||
/// To comfortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// To confortably work with `u64`s, `i64`s, `f64`s, or `date`s, please refer to
|
||||
/// [.order_by_fast_field(...)](#method.order_by_fast_field) method.
|
||||
pub fn order_by_u64_field(
|
||||
self,
|
||||
@@ -898,7 +898,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
let pr_birthday = DateTime::new_utc(OffsetDateTime::parse(
|
||||
"1898-04-09T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
@@ -906,7 +906,7 @@ mod tests {
|
||||
name => "Paul Robeson",
|
||||
birthday => pr_birthday,
|
||||
))?;
|
||||
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
let mr_birthday = DateTime::new_utc(OffsetDateTime::parse(
|
||||
"1947-11-08T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crossbeam::channel;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Search executor whether search request are single thread or multithread.
|
||||
///
|
||||
/// We don't expose Rayon thread pool directly here for several reasons.
|
||||
@@ -48,19 +47,16 @@ impl Executor {
|
||||
match self {
|
||||
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
|
||||
Executor::ThreadPool(pool) => {
|
||||
let args: Vec<A> = args.collect();
|
||||
let num_fruits = args.len();
|
||||
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
|
||||
let num_fruits = args_with_indices.len();
|
||||
let fruit_receiver = {
|
||||
let (fruit_sender, fruit_receiver) = crossbeam_channel::unbounded();
|
||||
let (fruit_sender, fruit_receiver) = channel::unbounded();
|
||||
pool.scope(|scope| {
|
||||
for (idx, arg) in args.into_iter().enumerate() {
|
||||
// We name references for f and fruit_sender_ref because we do not
|
||||
// want these two to be moved into the closure.
|
||||
let f_ref = &f;
|
||||
let fruit_sender_ref = &fruit_sender;
|
||||
scope.spawn(move |_| {
|
||||
let fruit = f_ref(arg);
|
||||
if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
|
||||
for arg_with_idx in args_with_indices {
|
||||
scope.spawn(|_| {
|
||||
let (idx, arg) = arg_with_idx;
|
||||
let fruit = f(arg);
|
||||
if let Err(err) = fruit_sender.send((idx, fruit)) {
|
||||
error!(
|
||||
"Failed to send search task. It probably means all search \
|
||||
threads have panicked. {:?}",
|
||||
@@ -75,19 +71,18 @@ impl Executor {
|
||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||
// terminate.
|
||||
};
|
||||
let mut result_placeholders: Vec<Option<R>> =
|
||||
std::iter::repeat_with(|| None).take(num_fruits).collect();
|
||||
// This is lame, but safe.
|
||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||
for (pos, fruit_res) in fruit_receiver {
|
||||
let fruit = fruit_res?;
|
||||
result_placeholders[pos] = Some(fruit);
|
||||
results_with_position.push((pos, fruit));
|
||||
}
|
||||
let results: Vec<R> = result_placeholders.into_iter().flatten().collect();
|
||||
if results.len() != num_fruits {
|
||||
return Err(TantivyError::InternalError(
|
||||
"One of the mapped execution failed.".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(results)
|
||||
results_with_position.sort_by_key(|(pos, _)| *pos);
|
||||
assert_eq!(results_with_position.len(), num_fruits);
|
||||
Ok(results_with_position
|
||||
.into_iter()
|
||||
.map(|(_, fruit)| fruit)
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -74,7 +74,6 @@ fn load_metas(
|
||||
pub struct IndexBuilder {
|
||||
schema: Option<Schema>,
|
||||
index_settings: IndexSettings,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
}
|
||||
impl Default for IndexBuilder {
|
||||
fn default() -> Self {
|
||||
@@ -87,7 +86,6 @@ impl IndexBuilder {
|
||||
Self {
|
||||
schema: None,
|
||||
index_settings: IndexSettings::default(),
|
||||
tokenizer_manager: TokenizerManager::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,12 +103,6 @@ impl IndexBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the tokenizers .
|
||||
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
|
||||
self.tokenizer_manager = tokenizers;
|
||||
self
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
@@ -162,8 +154,7 @@ impl IndexBuilder {
|
||||
if !Index::exists(&*dir)? {
|
||||
return self.create(dir);
|
||||
}
|
||||
let mut index = Index::open(dir)?;
|
||||
index.set_tokenizers(self.tokenizer_manager.clone());
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == self.get_expect_schema()? {
|
||||
Ok(index)
|
||||
} else {
|
||||
@@ -185,8 +176,7 @@ impl IndexBuilder {
|
||||
)?;
|
||||
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
|
||||
metas.index_settings = self.index_settings;
|
||||
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
index.set_tokenizers(self.tokenizer_manager);
|
||||
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
@@ -232,7 +222,7 @@ impl Index {
|
||||
}
|
||||
|
||||
/// Replace the default single thread search executor pool
|
||||
/// by a thread pool with as many threads as there are CPUs on the system.
|
||||
/// by a thread pool with a given number of threads.
|
||||
pub fn set_default_multithread_executor(&mut self) -> crate::Result<()> {
|
||||
let default_num_threads = num_cpus::get();
|
||||
self.set_multithread_executor(default_num_threads)
|
||||
@@ -314,11 +304,6 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/// Setter for the tokenizer manager.
|
||||
pub fn set_tokenizers(&mut self, tokenizers: TokenizerManager) {
|
||||
self.tokenizers = tokenizers;
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
@@ -329,31 +314,20 @@ impl Index {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let indexing_options_opt = match field_type {
|
||||
FieldType::JsonObject(options) => options.get_text_indexing_options(),
|
||||
FieldType::Str(options) => options.get_indexing_options(),
|
||||
_ => {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
)))
|
||||
}
|
||||
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
|
||||
_ => None,
|
||||
};
|
||||
let indexing_options = indexing_options_opt.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No indexing options set for field {:?}",
|
||||
field_entry
|
||||
))
|
||||
})?;
|
||||
|
||||
tokenizer_manager
|
||||
.get(indexing_options.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No Tokenizer found for field {:?}",
|
||||
field_entry
|
||||
))
|
||||
})
|
||||
match tokenizer_name_opt {
|
||||
Some(tokenizer) => Ok(tokenizer),
|
||||
None => Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a default `IndexReader` for the given index.
|
||||
@@ -366,7 +340,8 @@ impl Index {
|
||||
/// Create a `IndexReader` for the given index.
|
||||
///
|
||||
/// Most project should create at most one reader for a given index.
|
||||
/// This method is typically called only once per `Index` instance.
|
||||
/// This method is typically called only once per `Index` instance,
|
||||
/// over the lifetime of most problem.
|
||||
pub fn reader_builder(&self) -> IndexReaderBuilder {
|
||||
IndexReaderBuilder::new(self.clone())
|
||||
}
|
||||
@@ -582,8 +557,7 @@ impl fmt::Debug for Index {
|
||||
mod tests {
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::schema::{Field, Schema, INDEXED, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
|
||||
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -599,21 +573,6 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_set_tokenizer_manager() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("num_likes", INDEXED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = IndexBuilder::new()
|
||||
// set empty tokenizer manager
|
||||
.tokenizers(TokenizerManager::new())
|
||||
.schema(schema)
|
||||
.create_in_ram()
|
||||
.unwrap();
|
||||
assert!(index.tokenizers().get("raw").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_exists() {
|
||||
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
|
||||
@@ -743,7 +702,7 @@ mod tests {
|
||||
.try_into()?;
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64))?;
|
||||
let (sender, receiver) = crossbeam_channel::unbounded();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
|
||||
let _ = sender.send(());
|
||||
}));
|
||||
@@ -778,7 +737,7 @@ mod tests {
|
||||
reader: &IndexReader,
|
||||
) -> crate::Result<()> {
|
||||
let mut reader_index = reader.index();
|
||||
let (sender, receiver) = crossbeam_channel::unbounded();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _watch_handle = reader_index
|
||||
.directory_mut()
|
||||
.watch(WatchCallback::new(move || {
|
||||
|
||||
@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
|
||||
///
|
||||
/// Contains settings which are applied on the whole
|
||||
/// index, like presort documents.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
|
||||
pub struct IndexSettings {
|
||||
/// Sorts the documents by information
|
||||
/// provided in `IndexSortByField`
|
||||
@@ -248,29 +248,10 @@ pub struct IndexSettings {
|
||||
/// The `Compressor` used to compress the doc store.
|
||||
#[serde(default)]
|
||||
pub docstore_compression: Compressor,
|
||||
#[serde(default = "default_docstore_blocksize")]
|
||||
/// The size of each block that will be compressed and written to disk
|
||||
pub docstore_blocksize: usize,
|
||||
}
|
||||
|
||||
/// Must be a function to be compatible with serde defaults
|
||||
fn default_docstore_blocksize() -> usize {
|
||||
16_384
|
||||
}
|
||||
|
||||
impl Default for IndexSettings {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
sort_by_field: None,
|
||||
docstore_compression: Compressor::default(),
|
||||
docstore_blocksize: default_docstore_blocksize(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Settings to presort the documents in an index
|
||||
///
|
||||
/// Presorting documents can greatly improve performance
|
||||
/// Presorting documents can greatly performance
|
||||
/// in some scenarios, by applying top n
|
||||
/// optimizations.
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
|
||||
@@ -326,7 +307,7 @@ pub struct IndexMeta {
|
||||
pub payload: Option<String>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize, Debug)]
|
||||
#[derive(Deserialize)]
|
||||
struct UntrackedIndexMeta {
|
||||
pub segments: Vec<InnerSegmentMeta>,
|
||||
#[serde(default)]
|
||||
@@ -395,7 +376,6 @@ mod tests {
|
||||
use super::IndexMeta;
|
||||
use crate::core::index_meta::UntrackedIndexMeta;
|
||||
use crate::schema::{Schema, TEXT};
|
||||
use crate::store::ZstdCompressor;
|
||||
use crate::{IndexSettings, IndexSortByField, Order};
|
||||
|
||||
#[test]
|
||||
@@ -421,7 +401,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
@@ -429,60 +409,4 @@ mod tests {
|
||||
assert_eq!(index_metas.schema, deser_meta.schema);
|
||||
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas_zstd_compressor() {
|
||||
let schema = {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("text", TEXT);
|
||||
schema_builder.build()
|
||||
};
|
||||
let index_metas = IndexMeta {
|
||||
index_settings: IndexSettings {
|
||||
sort_by_field: Some(IndexSortByField {
|
||||
field: "text".to_string(),
|
||||
order: Order::Asc,
|
||||
}),
|
||||
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
|
||||
compression_level: Some(4),
|
||||
}),
|
||||
docstore_blocksize: 1_000_000,
|
||||
},
|
||||
segments: Vec::new(),
|
||||
schema,
|
||||
opstamp: 0u64,
|
||||
payload: None,
|
||||
};
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
assert_eq!(index_metas.index_settings, deser_meta.index_settings);
|
||||
assert_eq!(index_metas.schema, deser_meta.schema);
|
||||
assert_eq!(index_metas.opstamp, deser_meta.opstamp);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_serialize_metas_invalid_comp() {
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown variant `zsstd`, expected one of `none`, `lz4`, `brotli`, `snappy`, `zstd`, \
|
||||
`zstd(compression_level=5)` at line 1 column 96"
|
||||
.to_string()
|
||||
);
|
||||
|
||||
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
|
||||
|
||||
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
"unknown zstd option \"bla\" at line 1 column 103".to_string()
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
use std::collections::BTreeMap;
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
|
||||
use crate::collector::Collector;
|
||||
@@ -7,7 +6,7 @@ use crate::core::{Executor, SegmentReader};
|
||||
use crate::query::Query;
|
||||
use crate::schema::{Document, Schema, Term};
|
||||
use crate::space_usage::SearcherSpaceUsage;
|
||||
use crate::store::{CacheStats, StoreReader};
|
||||
use crate::store::StoreReader;
|
||||
use crate::{DocAddress, Index, Opstamp, SegmentId, TrackedObject};
|
||||
|
||||
/// Identifies the searcher generation accessed by a [Searcher].
|
||||
@@ -63,20 +62,43 @@ impl SearcherGeneration {
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// the destruction of the `Searcher`.
|
||||
#[derive(Clone)]
|
||||
pub struct Searcher {
|
||||
inner: Arc<SearcherInner>,
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
store_readers: Vec<StoreReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl Searcher {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
) -> io::Result<Searcher> {
|
||||
let store_readers: Vec<StoreReader> = segment_readers
|
||||
.iter()
|
||||
.map(SegmentReader::get_store_reader)
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
Ok(Searcher {
|
||||
schema,
|
||||
index,
|
||||
segment_readers,
|
||||
store_readers,
|
||||
generation,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the `Index` associated to the `Searcher`
|
||||
pub fn index(&self) -> &Index {
|
||||
&self.inner.index
|
||||
&self.index
|
||||
}
|
||||
|
||||
/// [SearcherGeneration] which identifies the version of the snapshot held by this `Searcher`.
|
||||
pub fn generation(&self) -> &SearcherGeneration {
|
||||
self.inner.generation.as_ref()
|
||||
self.generation.as_ref()
|
||||
}
|
||||
|
||||
/// Fetches a document from tantivy's store given a `DocAddress`.
|
||||
@@ -84,39 +106,25 @@ impl Searcher {
|
||||
/// The searcher uses the segment ordinal to route the
|
||||
/// the request to the right `Segment`.
|
||||
pub fn doc(&self, doc_address: DocAddress) -> crate::Result<Document> {
|
||||
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
|
||||
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
|
||||
store_reader.get(doc_address.doc_id)
|
||||
}
|
||||
|
||||
/// The cache stats for the underlying store reader.
|
||||
///
|
||||
/// Aggregates the sum for each segment store reader.
|
||||
pub fn doc_store_cache_stats(&self) -> CacheStats {
|
||||
let cache_stats: CacheStats = self
|
||||
.inner
|
||||
.store_readers
|
||||
.iter()
|
||||
.map(|reader| reader.cache_stats())
|
||||
.sum();
|
||||
cache_stats
|
||||
}
|
||||
|
||||
/// Fetches a document in an asynchronous manner.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub async fn doc_async(&self, doc_address: DocAddress) -> crate::Result<Document> {
|
||||
let store_reader = &self.inner.store_readers[doc_address.segment_ord as usize];
|
||||
let store_reader = &self.store_readers[doc_address.segment_ord as usize];
|
||||
store_reader.get_async(doc_address.doc_id).await
|
||||
}
|
||||
|
||||
/// Access the schema associated to the index of this searcher.
|
||||
pub fn schema(&self) -> &Schema {
|
||||
&self.inner.schema
|
||||
&self.schema
|
||||
}
|
||||
|
||||
/// Returns the overall number of documents in the index.
|
||||
pub fn num_docs(&self) -> u64 {
|
||||
self.inner
|
||||
.segment_readers
|
||||
self.segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| u64::from(segment_reader.num_docs()))
|
||||
.sum::<u64>()
|
||||
@@ -126,7 +134,7 @@ impl Searcher {
|
||||
/// the given term.
|
||||
pub fn doc_freq(&self, term: &Term) -> crate::Result<u64> {
|
||||
let mut total_doc_freq = 0;
|
||||
for segment_reader in &self.inner.segment_readers {
|
||||
for segment_reader in &self.segment_readers {
|
||||
let inverted_index = segment_reader.inverted_index(term.field())?;
|
||||
let doc_freq = inverted_index.doc_freq(term)?;
|
||||
total_doc_freq += u64::from(doc_freq);
|
||||
@@ -136,12 +144,12 @@ impl Searcher {
|
||||
|
||||
/// Return the list of segment readers
|
||||
pub fn segment_readers(&self) -> &[SegmentReader] {
|
||||
&self.inner.segment_readers
|
||||
&self.segment_readers
|
||||
}
|
||||
|
||||
/// Returns the segment_reader associated with the given segment_ord
|
||||
pub fn segment_reader(&self, segment_ord: u32) -> &SegmentReader {
|
||||
&self.inner.segment_readers[segment_ord as usize]
|
||||
&self.segment_readers[segment_ord as usize]
|
||||
}
|
||||
|
||||
/// Runs a query on the segment readers wrapped by the searcher.
|
||||
@@ -163,7 +171,7 @@ impl Searcher {
|
||||
query: &dyn Query,
|
||||
collector: &C,
|
||||
) -> crate::Result<C::Fruit> {
|
||||
let executor = self.inner.index.search_executor();
|
||||
let executor = self.index.search_executor();
|
||||
self.search_with_executor(query, collector, executor)
|
||||
}
|
||||
|
||||
@@ -200,59 +208,17 @@ impl Searcher {
|
||||
/// Summarize total space usage of this searcher.
|
||||
pub fn space_usage(&self) -> io::Result<SearcherSpaceUsage> {
|
||||
let mut space_usage = SearcherSpaceUsage::new();
|
||||
for segment_reader in self.segment_readers() {
|
||||
for segment_reader in &self.segment_readers {
|
||||
space_usage.add_segment(segment_reader.space_usage()?);
|
||||
}
|
||||
Ok(space_usage)
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Arc<SearcherInner>> for Searcher {
|
||||
fn from(inner: Arc<SearcherInner>) -> Self {
|
||||
Searcher { inner }
|
||||
}
|
||||
}
|
||||
|
||||
/// Holds a list of `SegmentReader`s ready for search.
|
||||
///
|
||||
/// It guarantees that the `Segment` will not be removed before
|
||||
/// the destruction of the `Searcher`.
|
||||
pub(crate) struct SearcherInner {
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
store_readers: Vec<StoreReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
}
|
||||
|
||||
impl SearcherInner {
|
||||
/// Creates a new `Searcher`
|
||||
pub(crate) fn new(
|
||||
schema: Schema,
|
||||
index: Index,
|
||||
segment_readers: Vec<SegmentReader>,
|
||||
generation: TrackedObject<SearcherGeneration>,
|
||||
doc_store_cache_size: usize,
|
||||
) -> io::Result<SearcherInner> {
|
||||
let store_readers: Vec<StoreReader> = segment_readers
|
||||
.iter()
|
||||
.map(|segment_reader| segment_reader.get_store_reader(doc_store_cache_size))
|
||||
.collect::<io::Result<Vec<_>>>()?;
|
||||
|
||||
Ok(SearcherInner {
|
||||
schema,
|
||||
index,
|
||||
segment_readers,
|
||||
store_readers,
|
||||
generation,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl fmt::Debug for Searcher {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let segment_ids = self
|
||||
.segment_readers()
|
||||
.segment_readers
|
||||
.iter()
|
||||
.map(SegmentReader::segment_id)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
@@ -24,8 +24,7 @@ pub enum SegmentComponent {
|
||||
Store,
|
||||
/// Temporary storage of the documents, before streamed to `Store`.
|
||||
TempStore,
|
||||
/// Bitset describing which document of the segment is alive.
|
||||
/// (It was representing deleted docs but changed to represent alive docs from v0.17)
|
||||
/// Bitset describing which document of the segment is deleted.
|
||||
Delete,
|
||||
}
|
||||
|
||||
|
||||
@@ -35,7 +35,7 @@ const ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
#[cfg(test)]
|
||||
fn create_uuid() -> Uuid {
|
||||
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY)
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
@@ -57,7 +57,7 @@ impl SegmentId {
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message (e.g. a5c4dfcb).
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.as_simple().to_string()[..8]).to_string()
|
||||
(&self.0.to_simple_ref().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
@@ -65,7 +65,7 @@ impl SegmentId {
|
||||
/// It consists in 32 lowercase hexadecimal chars
|
||||
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.as_simple().to_string()
|
||||
self.0.to_simple_ref().to_string()
|
||||
}
|
||||
|
||||
/// Build a `SegmentId` string from the full uuid string.
|
||||
|
||||
@@ -128,14 +128,13 @@ impl SegmentReader {
|
||||
})
|
||||
}
|
||||
|
||||
#[doc(hidden)]
|
||||
pub fn fieldnorms_readers(&self) -> &FieldNormReaders {
|
||||
pub(crate) fn fieldnorms_readers(&self) -> &FieldNormReaders {
|
||||
&self.fieldnorm_readers
|
||||
}
|
||||
|
||||
/// Accessor to the segment's `StoreReader`.
|
||||
pub fn get_store_reader(&self, cache_size: usize) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone(), cache_size)
|
||||
pub fn get_store_reader(&self) -> io::Result<StoreReader> {
|
||||
StoreReader::open(self.store_file.clone())
|
||||
}
|
||||
|
||||
/// Open a new segment for reading.
|
||||
@@ -170,15 +169,15 @@ impl SegmentReader {
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_fields_readers =
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
|
||||
let original_bitset = if segment.meta().has_deletes() {
|
||||
let alive_doc_file_slice = segment.open_read(SegmentComponent::Delete)?;
|
||||
let alive_doc_data = alive_doc_file_slice.read_bytes()?;
|
||||
Some(AliveBitSet::open(alive_doc_data))
|
||||
let delete_file_slice = segment.open_read(SegmentComponent::Delete)?;
|
||||
let delete_data = delete_file_slice.read_bytes()?;
|
||||
Some(AliveBitSet::open(delete_data))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -197,7 +196,7 @@ impl SegmentReader {
|
||||
max_doc,
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
delete_opstamp: segment.meta().delete_opstamp(),
|
||||
@@ -216,7 +215,7 @@ impl SegmentReader {
|
||||
/// term dictionary associated to a specific field,
|
||||
/// and opening the posting list associated to any term.
|
||||
///
|
||||
/// If the field is not marked as index, a warn is logged and an empty `InvertedIndexReader`
|
||||
/// If the field is marked as index, a warn is logged and an empty `InvertedIndexReader`
|
||||
/// is returned.
|
||||
/// Similarly if the field is marked as indexed but no term has been indexed for the given
|
||||
/// index. an empty `InvertedIndexReader` is returned (but no warning is logged).
|
||||
@@ -296,7 +295,8 @@ impl SegmentReader {
|
||||
self.delete_opstamp
|
||||
}
|
||||
|
||||
/// Returns the bitset representing the alive `DocId`s.
|
||||
/// Returns the bitset representing
|
||||
/// the documents that have been deleted.
|
||||
pub fn alive_bitset(&self) -> Option<&AliveBitSet> {
|
||||
self.alive_bitset_opt.as_ref()
|
||||
}
|
||||
@@ -305,7 +305,7 @@ impl SegmentReader {
|
||||
/// as deleted.
|
||||
pub fn is_deleted(&self, doc: DocId) -> bool {
|
||||
self.alive_bitset()
|
||||
.map(|alive_bitset| alive_bitset.is_deleted(doc))
|
||||
.map(|delete_set| delete_set.is_deleted(doc))
|
||||
.unwrap_or(false)
|
||||
}
|
||||
|
||||
@@ -327,7 +327,7 @@ impl SegmentReader {
|
||||
self.positions_composite.space_usage(),
|
||||
self.fast_fields_readers.space_usage(),
|
||||
self.fieldnorm_readers.space_usage(),
|
||||
self.get_store_reader(0)?.space_usage(),
|
||||
self.get_store_reader()?.space_usage(),
|
||||
self.alive_bitset_opt
|
||||
.as_ref()
|
||||
.map(AliveBitSet::space_usage)
|
||||
|
||||
@@ -38,7 +38,7 @@ impl BinarySerializable for FileAddr {
|
||||
/// A `CompositeWrite` is used to write a `CompositeFile`.
|
||||
pub struct CompositeWrite<W = WritePtr> {
|
||||
write: CountingWriter<W>,
|
||||
offsets: Vec<(FileAddr, u64)>,
|
||||
offsets: HashMap<FileAddr, u64>,
|
||||
}
|
||||
|
||||
impl<W: TerminatingWrite + Write> CompositeWrite<W> {
|
||||
@@ -47,7 +47,7 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
|
||||
pub fn wrap(w: W) -> CompositeWrite<W> {
|
||||
CompositeWrite {
|
||||
write: CountingWriter::wrap(w),
|
||||
offsets: Vec::new(),
|
||||
offsets: HashMap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -60,8 +60,8 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
|
||||
pub fn for_field_with_idx(&mut self, field: Field, idx: usize) -> &mut CountingWriter<W> {
|
||||
let offset = self.write.written_bytes();
|
||||
let file_addr = FileAddr::new(field, idx);
|
||||
assert!(!self.offsets.iter().any(|el| el.0 == file_addr));
|
||||
self.offsets.push((file_addr, offset));
|
||||
assert!(!self.offsets.contains_key(&file_addr));
|
||||
self.offsets.insert(file_addr, offset);
|
||||
&mut self.write
|
||||
}
|
||||
|
||||
@@ -73,8 +73,16 @@ impl<W: TerminatingWrite + Write> CompositeWrite<W> {
|
||||
let footer_offset = self.write.written_bytes();
|
||||
VInt(self.offsets.len() as u64).serialize(&mut self.write)?;
|
||||
|
||||
let mut offset_fields: Vec<_> = self
|
||||
.offsets
|
||||
.iter()
|
||||
.map(|(file_addr, offset)| (*offset, *file_addr))
|
||||
.collect();
|
||||
|
||||
offset_fields.sort();
|
||||
|
||||
let mut prev_offset = 0;
|
||||
for (file_addr, offset) in self.offsets {
|
||||
for (offset, file_addr) in offset_fields {
|
||||
VInt((offset - prev_offset) as u64).serialize(&mut self.write)?;
|
||||
file_addr.serialize(&mut self.write)?;
|
||||
prev_offset = offset;
|
||||
@@ -98,14 +106,6 @@ pub struct CompositeFile {
|
||||
offsets_index: HashMap<FileAddr, Range<usize>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for CompositeFile {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("CompositeFile")
|
||||
.field("offsets_index", &self.offsets_index)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
impl CompositeFile {
|
||||
/// Opens a composite file stored in a given
|
||||
/// `FileSlice`.
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use std::io::Write;
|
||||
use std::marker::{Send, Sync};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Duration;
|
||||
use std::{fmt, io, thread};
|
||||
|
||||
@@ -63,12 +62,7 @@ impl Drop for DirectoryLockGuard {
|
||||
|
||||
enum TryAcquireLockError {
|
||||
FileExists,
|
||||
IoError(Arc<io::Error>),
|
||||
}
|
||||
impl From<io::Error> for TryAcquireLockError {
|
||||
fn from(io_error: io::Error) -> Self {
|
||||
Self::IoError(Arc::new(io_error))
|
||||
}
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
fn try_acquire_lock(
|
||||
@@ -79,7 +73,7 @@ fn try_acquire_lock(
|
||||
OpenWriteError::FileAlreadyExists(_) => TryAcquireLockError::FileExists,
|
||||
OpenWriteError::IoError { io_error, .. } => TryAcquireLockError::IoError(io_error),
|
||||
})?;
|
||||
write.flush().map_err(TryAcquireLockError::from)?;
|
||||
write.flush().map_err(TryAcquireLockError::IoError)?;
|
||||
Ok(DirectoryLock::from(Box::new(DirectoryLockGuard {
|
||||
directory: directory.box_clone(),
|
||||
path: filepath.to_owned(),
|
||||
@@ -111,7 +105,7 @@ pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
///
|
||||
/// Users of `Directory` should typically call `Directory::open_read(...)`,
|
||||
/// while `Directory` implementor should implement `get_file_handle()`.
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError>;
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError>;
|
||||
|
||||
/// Once a virtual file is open, its data may not
|
||||
/// change.
|
||||
|
||||
@@ -1,11 +1,10 @@
|
||||
use std::path::PathBuf;
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
|
||||
use crate::Version;
|
||||
|
||||
/// Error while trying to acquire a directory lock.
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum LockError {
|
||||
/// Failed to acquired a lock as it is already held by another
|
||||
/// client.
|
||||
@@ -17,18 +16,11 @@ pub enum LockError {
|
||||
LockBusy,
|
||||
/// Trying to acquire a lock failed with an `IoError`
|
||||
#[error("Failed to acquire the lock due to an io:Error.")]
|
||||
IoError(Arc<io::Error>),
|
||||
}
|
||||
|
||||
impl LockError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error) -> Self {
|
||||
Self::IoError(Arc::new(io_error))
|
||||
}
|
||||
IoError(io::Error),
|
||||
}
|
||||
|
||||
/// Error that may occur when opening a directory
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum OpenDirectoryError {
|
||||
/// The underlying directory does not exists.
|
||||
#[error("Directory does not exist: '{0}'.")]
|
||||
@@ -38,12 +30,12 @@ pub enum OpenDirectoryError {
|
||||
NotADirectory(PathBuf),
|
||||
/// Failed to create a temp directory.
|
||||
#[error("Failed to create a temporary directory: '{0}'.")]
|
||||
FailedToCreateTempDir(Arc<io::Error>),
|
||||
FailedToCreateTempDir(io::Error),
|
||||
/// IoError
|
||||
#[error("IoError '{io_error:?}' while create directory in: '{directory_path:?}'.")]
|
||||
IoError {
|
||||
/// underlying io Error.
|
||||
io_error: Arc<io::Error>,
|
||||
io_error: io::Error,
|
||||
/// directory we tried to open.
|
||||
directory_path: PathBuf,
|
||||
},
|
||||
@@ -53,14 +45,14 @@ impl OpenDirectoryError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, directory_path: PathBuf) -> Self {
|
||||
Self::IoError {
|
||||
io_error: Arc::new(io_error),
|
||||
io_error,
|
||||
directory_path,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Error that may occur when starting to write in a file
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum OpenWriteError {
|
||||
/// Our directory is WORM, writing an existing file is forbidden.
|
||||
/// Checkout the `Directory` documentation.
|
||||
@@ -71,7 +63,7 @@ pub enum OpenWriteError {
|
||||
#[error("IoError '{io_error:?}' while opening file for write: '{filepath}'.")]
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: Arc<io::Error>,
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for write.
|
||||
filepath: PathBuf,
|
||||
},
|
||||
@@ -80,15 +72,11 @@ pub enum OpenWriteError {
|
||||
impl OpenWriteError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IoError {
|
||||
io_error: Arc::new(io_error),
|
||||
filepath,
|
||||
}
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Type of index incompatibility between the library and the index found on disk
|
||||
/// Used to catch and provide a hint to solve this incompatibility issue
|
||||
#[derive(Clone)]
|
||||
pub enum Incompatibility {
|
||||
/// This library cannot decompress the index found on disk
|
||||
CompressionMismatch {
|
||||
@@ -147,7 +135,7 @@ impl fmt::Debug for Incompatibility {
|
||||
}
|
||||
|
||||
/// Error that may occur when accessing a file read
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum OpenReadError {
|
||||
/// The file does not exists.
|
||||
#[error("Files does not exists: {0:?}")]
|
||||
@@ -158,7 +146,7 @@ pub enum OpenReadError {
|
||||
)]
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: Arc<io::Error>,
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to open for read.
|
||||
filepath: PathBuf,
|
||||
},
|
||||
@@ -170,14 +158,11 @@ pub enum OpenReadError {
|
||||
impl OpenReadError {
|
||||
/// Wraps an io error.
|
||||
pub fn wrap_io_error(io_error: io::Error, filepath: PathBuf) -> Self {
|
||||
Self::IoError {
|
||||
io_error: Arc::new(io_error),
|
||||
filepath,
|
||||
}
|
||||
Self::IoError { io_error, filepath }
|
||||
}
|
||||
}
|
||||
/// Error that may occur when trying to delete a file
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum DeleteError {
|
||||
/// The file does not exists.
|
||||
#[error("File does not exists: '{0}'.")]
|
||||
@@ -187,7 +172,7 @@ pub enum DeleteError {
|
||||
#[error("The following IO error happened while deleting file '{filepath}': '{io_error:?}'.")]
|
||||
IoError {
|
||||
/// The underlying `io::Error`.
|
||||
io_error: Arc<io::Error>,
|
||||
io_error: io::Error,
|
||||
/// File path of the file that tantivy failed to delete.
|
||||
filepath: PathBuf,
|
||||
},
|
||||
|
||||
@@ -54,7 +54,7 @@ impl<B> From<B> for FileSlice
|
||||
where B: StableDeref + Deref<Target = [u8]> + 'static + Send + Sync
|
||||
{
|
||||
fn from(bytes: B) -> FileSlice {
|
||||
FileSlice::new(Arc::new(OwnedBytes::new(bytes)))
|
||||
FileSlice::new(Box::new(OwnedBytes::new(bytes)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -75,7 +75,7 @@ impl fmt::Debug for FileSlice {
|
||||
|
||||
impl FileSlice {
|
||||
/// Wraps a FileHandle.
|
||||
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
|
||||
pub fn new(file_handle: Box<dyn FileHandle>) -> Self {
|
||||
let num_bytes = file_handle.len();
|
||||
FileSlice::new_with_num_bytes(file_handle, num_bytes)
|
||||
}
|
||||
@@ -83,9 +83,9 @@ impl FileSlice {
|
||||
/// Wraps a FileHandle.
|
||||
#[doc(hidden)]
|
||||
#[must_use]
|
||||
pub fn new_with_num_bytes(file_handle: Arc<dyn FileHandle>, num_bytes: usize) -> Self {
|
||||
pub fn new_with_num_bytes(file_handle: Box<dyn FileHandle>, num_bytes: usize) -> Self {
|
||||
FileSlice {
|
||||
data: file_handle,
|
||||
data: Arc::from(file_handle),
|
||||
range: 0..num_bytes,
|
||||
}
|
||||
}
|
||||
@@ -235,7 +235,6 @@ impl FileHandle for OwnedBytes {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::HasLen;
|
||||
|
||||
@@ -243,7 +242,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_file_slice() -> io::Result<()> {
|
||||
let file_slice = FileSlice::new(Arc::new(b"abcdef".as_ref()));
|
||||
let file_slice = FileSlice::new(Box::new(b"abcdef".as_ref()));
|
||||
assert_eq!(file_slice.len(), 6);
|
||||
assert_eq!(file_slice.slice_from(2).read_bytes()?.as_slice(), b"cdef");
|
||||
assert_eq!(file_slice.slice_to(2).read_bytes()?.as_slice(), b"ab");
|
||||
@@ -287,7 +286,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_slice_simple_read() -> io::Result<()> {
|
||||
let slice = FileSlice::new(Arc::new(&b"abcdef"[..]));
|
||||
let slice = FileSlice::new(Box::new(&b"abcdef"[..]));
|
||||
assert_eq!(slice.len(), 6);
|
||||
assert_eq!(slice.read_bytes()?.as_ref(), b"abcdef");
|
||||
assert_eq!(slice.slice(1..4).read_bytes()?.as_ref(), b"bcd");
|
||||
@@ -296,7 +295,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_slice_read_slice() -> io::Result<()> {
|
||||
let slice_deref = FileSlice::new(Arc::new(&b"abcdef"[..]));
|
||||
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
|
||||
assert_eq!(slice_deref.read_bytes_slice(1..4)?.as_ref(), b"bcd");
|
||||
Ok(())
|
||||
}
|
||||
@@ -304,7 +303,7 @@ mod tests {
|
||||
#[test]
|
||||
#[should_panic(expected = "end of requested range exceeds the fileslice length (10 > 6)")]
|
||||
fn test_slice_read_slice_invalid_range_exceeds() {
|
||||
let slice_deref = FileSlice::new(Arc::new(&b"abcdef"[..]));
|
||||
let slice_deref = FileSlice::new(Box::new(&b"abcdef"[..]));
|
||||
assert_eq!(
|
||||
slice_deref.read_bytes_slice(0..10).unwrap().as_ref(),
|
||||
b"bcd"
|
||||
|
||||
@@ -110,7 +110,7 @@ mod tests {
|
||||
let tmp_file = tmp_dir.path().join("watched.txt");
|
||||
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(100);
|
||||
|
||||
let watcher = FileWatcher::new(&tmp_file);
|
||||
@@ -153,7 +153,7 @@ mod tests {
|
||||
let tmp_file = tmp_dir.path().join("watched.txt");
|
||||
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(100);
|
||||
|
||||
let watcher = FileWatcher::new(&tmp_file);
|
||||
|
||||
@@ -156,7 +156,6 @@ impl<W: TerminatingWrite> TerminatingWrite for FooterProxy<W> {
|
||||
mod tests {
|
||||
|
||||
use std::io;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
@@ -169,7 +168,7 @@ mod tests {
|
||||
let footer = Footer::new(123);
|
||||
footer.append_footer(&mut buf).unwrap();
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
let fileslice = FileSlice::new(Arc::new(owned_bytes));
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let (footer_deser, _body) = Footer::extract_footer(fileslice).unwrap();
|
||||
assert_eq!(footer_deser.crc(), footer.crc());
|
||||
}
|
||||
@@ -182,7 +181,7 @@ mod tests {
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Arc::new(owned_bytes));
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(
|
||||
err.to_string(),
|
||||
@@ -199,7 +198,7 @@ mod tests {
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Arc::new(owned_bytes));
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::UnexpectedEof);
|
||||
assert_eq!(
|
||||
@@ -218,7 +217,7 @@ mod tests {
|
||||
|
||||
let owned_bytes = OwnedBytes::new(buf);
|
||||
|
||||
let fileslice = FileSlice::new(Arc::new(owned_bytes));
|
||||
let fileslice = FileSlice::new(Box::new(owned_bytes));
|
||||
let err = Footer::extract_footer(fileslice).unwrap_err();
|
||||
assert_eq!(err.kind(), io::ErrorKind::InvalidData);
|
||||
assert_eq!(
|
||||
|
||||
@@ -242,13 +242,16 @@ impl ManagedDirectory {
|
||||
/// Verify checksum of a managed file
|
||||
pub fn validate_checksum(&self, path: &Path) -> result::Result<bool, OpenReadError> {
|
||||
let reader = self.directory.open_read(path)?;
|
||||
let (footer, data) = Footer::extract_footer(reader)
|
||||
.map_err(|io_error| OpenReadError::wrap_io_error(io_error, path.to_path_buf()))?;
|
||||
let (footer, data) =
|
||||
Footer::extract_footer(reader).map_err(|io_error| OpenReadError::IoError {
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
let bytes = data
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
io_error: Arc::new(io_error),
|
||||
filepath: path.to_path_buf(),
|
||||
io_error,
|
||||
})?;
|
||||
let mut hasher = Hasher::new();
|
||||
hasher.update(bytes.as_slice());
|
||||
@@ -269,9 +272,9 @@ impl ManagedDirectory {
|
||||
}
|
||||
|
||||
impl Directory for ManagedDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError> {
|
||||
let file_slice = self.open_read(path)?;
|
||||
Ok(Arc::new(file_slice))
|
||||
Ok(Box::new(file_slice))
|
||||
}
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
|
||||
|
||||
@@ -174,8 +174,7 @@ impl MmapDirectory {
|
||||
/// This is mostly useful to test the MmapDirectory itself.
|
||||
/// For your unit tests, prefer the RamDirectory.
|
||||
pub fn create_from_tempdir() -> Result<MmapDirectory, OpenDirectoryError> {
|
||||
let tempdir = TempDir::new()
|
||||
.map_err(|io_err| OpenDirectoryError::FailedToCreateTempDir(Arc::new(io_err)))?;
|
||||
let tempdir = TempDir::new().map_err(OpenDirectoryError::FailedToCreateTempDir)?;
|
||||
Ok(MmapDirectory::new(
|
||||
tempdir.path().to_path_buf(),
|
||||
Some(tempdir),
|
||||
@@ -310,7 +309,7 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
|
||||
}
|
||||
|
||||
impl Directory for MmapDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> result::Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> result::Result<Box<dyn FileHandle>, OpenReadError> {
|
||||
debug!("Open Read {:?}", path);
|
||||
let full_path = self.resolve_path(path);
|
||||
|
||||
@@ -331,7 +330,7 @@ impl Directory for MmapDirectory {
|
||||
})
|
||||
.unwrap_or_else(OwnedBytes::empty);
|
||||
|
||||
Ok(Arc::new(owned_bytes))
|
||||
Ok(Box::new(owned_bytes))
|
||||
}
|
||||
|
||||
/// Any entry associated to the path in the mmap will be
|
||||
@@ -343,7 +342,7 @@ impl Directory for MmapDirectory {
|
||||
DeleteError::FileDoesNotExist(path.to_owned())
|
||||
} else {
|
||||
DeleteError::IoError {
|
||||
io_error: Arc::new(e),
|
||||
io_error: e,
|
||||
filepath: path.to_path_buf(),
|
||||
}
|
||||
}
|
||||
@@ -423,9 +422,9 @@ impl Directory for MmapDirectory {
|
||||
.write(true)
|
||||
.create(true) //< if the file does not exist yet, create it.
|
||||
.open(&full_path)
|
||||
.map_err(LockError::wrap_io_error)?;
|
||||
.map_err(LockError::IoError)?;
|
||||
if lock.is_blocking {
|
||||
file.lock_exclusive().map_err(LockError::wrap_io_error)?;
|
||||
file.lock_exclusive().map_err(LockError::IoError)?;
|
||||
} else {
|
||||
file.try_lock_exclusive().map_err(|_| LockError::LockBusy)?
|
||||
}
|
||||
|
||||
@@ -160,9 +160,9 @@ impl RamDirectory {
|
||||
}
|
||||
|
||||
impl Directory for RamDirectory {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
|
||||
fn get_file_handle(&self, path: &Path) -> Result<Box<dyn FileHandle>, OpenReadError> {
|
||||
let file_slice = self.open_read(path)?;
|
||||
Ok(Arc::new(file_slice))
|
||||
Ok(Box::new(file_slice))
|
||||
}
|
||||
|
||||
fn open_read(&self, path: &Path) -> result::Result<FileSlice, OpenReadError> {
|
||||
@@ -172,7 +172,7 @@ impl Directory for RamDirectory {
|
||||
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
|
||||
fail_point!("RamDirectory::delete", |_| {
|
||||
Err(DeleteError::IoError {
|
||||
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
|
||||
io_error: io::Error::from(io::ErrorKind::Other),
|
||||
filepath: path.to_path_buf(),
|
||||
})
|
||||
});
|
||||
@@ -184,7 +184,7 @@ impl Directory for RamDirectory {
|
||||
.fs
|
||||
.read()
|
||||
.map_err(|e| OpenReadError::IoError {
|
||||
io_error: Arc::new(io::Error::new(io::ErrorKind::Other, e.to_string())),
|
||||
io_error: io::Error::new(io::ErrorKind::Other, e.to_string()),
|
||||
filepath: path.to_path_buf(),
|
||||
})?
|
||||
.exists(path))
|
||||
@@ -208,7 +208,7 @@ impl Directory for RamDirectory {
|
||||
self.open_read(path)?
|
||||
.read_bytes()
|
||||
.map_err(|io_error| OpenReadError::IoError {
|
||||
io_error: Arc::new(io_error),
|
||||
io_error,
|
||||
filepath: path.to_path_buf(),
|
||||
})?;
|
||||
Ok(bytes.as_slice().to_owned())
|
||||
|
||||
@@ -181,7 +181,7 @@ fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
|
||||
|
||||
fn test_watch(directory: &dyn Directory) {
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(500);
|
||||
|
||||
let handle = directory
|
||||
|
||||
@@ -7,7 +7,7 @@ use crate::DocId;
|
||||
///
|
||||
/// This is not u32::MAX as one would have expected, due to the lack of SSE2 instructions
|
||||
/// to compare [u32; 4].
|
||||
pub const TERMINATED: DocId = i32::MAX as u32;
|
||||
pub const TERMINATED: DocId = std::i32::MAX as u32;
|
||||
|
||||
/// Represents an iterable set of sorted doc ids.
|
||||
pub trait DocSet: Send {
|
||||
@@ -24,6 +24,7 @@ pub trait DocSet: Send {
|
||||
///
|
||||
/// Calling `.advance()` on a terminated DocSet should be supported, and TERMINATED should
|
||||
/// be returned.
|
||||
/// TODO Test existing docsets.
|
||||
fn advance(&mut self) -> DocId;
|
||||
|
||||
/// Advances the DocSet forward until reaching the target, or going to the
|
||||
|
||||
18
src/error.rs
18
src/error.rs
@@ -1,7 +1,7 @@
|
||||
//! Definition of Tantivy's errors and results.
|
||||
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, PoisonError};
|
||||
use std::sync::PoisonError;
|
||||
use std::{fmt, io};
|
||||
|
||||
use thiserror::Error;
|
||||
@@ -15,7 +15,6 @@ use crate::{query, schema};
|
||||
/// Represents a `DataCorruption` error.
|
||||
///
|
||||
/// When facing data corruption, tantivy actually panics or returns this error.
|
||||
#[derive(Clone)]
|
||||
pub struct DataCorruption {
|
||||
filepath: Option<PathBuf>,
|
||||
comment: String,
|
||||
@@ -51,7 +50,7 @@ impl fmt::Debug for DataCorruption {
|
||||
}
|
||||
|
||||
/// The library's error enum
|
||||
#[derive(Debug, Clone, Error)]
|
||||
#[derive(Debug, Error)]
|
||||
pub enum TantivyError {
|
||||
/// Failed to open the directory.
|
||||
#[error("Failed to open the directory: '{0:?}'")]
|
||||
@@ -70,7 +69,7 @@ pub enum TantivyError {
|
||||
LockFailure(LockError, Option<String>),
|
||||
/// IO Error.
|
||||
#[error("An IO error occurred: '{0}'")]
|
||||
IoError(Arc<io::Error>),
|
||||
IoError(#[from] io::Error),
|
||||
/// Data corruption.
|
||||
#[error("Data corrupted: '{0:?}'")]
|
||||
DataCorruption(DataCorruption),
|
||||
@@ -98,10 +97,6 @@ pub enum TantivyError {
|
||||
/// Index incompatible with current version of Tantivy.
|
||||
#[error("{0:?}")]
|
||||
IncompatibleIndex(Incompatibility),
|
||||
/// An internal error occurred. These are internal states that should not be reached.
|
||||
/// e.g. a data structure is incorrectly initialized.
|
||||
#[error("Internal error: '{0}'")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
@@ -126,11 +121,6 @@ impl From<AsyncIoError> for TantivyError {
|
||||
}
|
||||
}
|
||||
|
||||
impl From<io::Error> for TantivyError {
|
||||
fn from(io_err: io::Error) -> TantivyError {
|
||||
TantivyError::IoError(Arc::new(io_err))
|
||||
}
|
||||
}
|
||||
impl From<DataCorruption> for TantivyError {
|
||||
fn from(data_corruption: DataCorruption) -> TantivyError {
|
||||
TantivyError::DataCorruption(data_corruption)
|
||||
@@ -185,7 +175,7 @@ impl From<schema::DocParsingError> for TantivyError {
|
||||
|
||||
impl From<serde_json::Error> for TantivyError {
|
||||
fn from(error: serde_json::Error) -> TantivyError {
|
||||
TantivyError::IoError(Arc::new(error.into()))
|
||||
TantivyError::IoError(error.into())
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -188,14 +188,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_access(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_access(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
@@ -206,14 +206,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
|
||||
@@ -6,6 +6,8 @@ pub use self::writer::BytesFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Deref;
|
||||
|
||||
use crate::query::TermQuery;
|
||||
use crate::schema::{BytesOptions, IndexRecordOption, Schema, Value, FAST, INDEXED, STORED};
|
||||
use crate::{DocAddress, DocSet, Index, Searcher, Term};
|
||||
@@ -35,7 +37,9 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn create_index_for_test<T: Into<BytesOptions>>(byte_options: T) -> crate::Result<Searcher> {
|
||||
fn create_index_for_test<T: Into<BytesOptions>>(
|
||||
byte_options: T,
|
||||
) -> crate::Result<impl Deref<Target = Searcher>> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_bytes_field("string_bytes", byte_options.into());
|
||||
let schema = schema_builder.build();
|
||||
@@ -82,7 +86,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight = term_query.specialized_weight(&searcher, true)?;
|
||||
let term_weight = term_query.specialized_weight(&*searcher, true)?;
|
||||
let term_scorer = term_weight.specialized_scorer(searcher.segment_reader(0), 1.0)?;
|
||||
assert_eq!(term_scorer.doc(), 0u32);
|
||||
Ok(())
|
||||
@@ -95,7 +99,7 @@ mod tests {
|
||||
let field = searcher.schema().get_field("string_bytes").unwrap();
|
||||
let term = Term::from_field_bytes(field, b"lucene".as_ref());
|
||||
let term_query = TermQuery::new(term, IndexRecordOption::Basic);
|
||||
let term_weight_err = term_query.specialized_weight(&searcher, false);
|
||||
let term_weight_err = term_query.specialized_weight(&*searcher, false);
|
||||
assert!(matches!(
|
||||
term_weight_err,
|
||||
Err(crate::TantivyError::SchemaError(_))
|
||||
|
||||
@@ -52,11 +52,6 @@ impl BytesFastFieldReader {
|
||||
}
|
||||
|
||||
impl MultiValueLength for BytesFastFieldReader {
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
|
||||
let (start, stop) = self.range(doc_id);
|
||||
start as u64..stop as u64
|
||||
}
|
||||
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_bytes(doc_id) as u64
|
||||
}
|
||||
|
||||
@@ -1,241 +0,0 @@
|
||||
use std::net::{IpAddr, Ipv6Addr};
|
||||
|
||||
use crate::schema::{Cardinality, FieldType, Type};
|
||||
use crate::DateTime;
|
||||
|
||||
/// Maps an IP address onto the `u128` formed by its 16 IPv6 octets, read big-endian.
///
/// An IPv4 address is first converted to its IPv4-mapped IPv6 form
/// (`::ffff:a.b.c.d`), so both address families land in the same numeric space.
pub fn ip_to_u128(ip_addr: IpAddr) -> u128 {
    let ip_addr_v6: Ipv6Addr = match ip_addr {
        IpAddr::V4(v4) => v4.to_ipv6_mapped(),
        IpAddr::V6(v6) => v6,
    };
    u128::from_be_bytes(ip_addr_v6.octets())
}
|
||||
|
||||
/// Trait for large types that are allowed for fast fields: u128, IpAddr
|
||||
pub trait FastValueU128: Clone + Copy + Send + Sync + PartialOrd + 'static {
|
||||
/// Converts a value from u128
|
||||
///
|
||||
/// Internally all fast field values are encoded as u128.
|
||||
fn from_u128(val: u128) -> Self;
|
||||
|
||||
/// Converts a value to u128.
|
||||
///
|
||||
/// Internally all fast field values are encoded as u128.
|
||||
fn to_u128(&self) -> u128;
|
||||
|
||||
/// Cast value to `u128`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u128(&self) -> u128;
|
||||
|
||||
/// Returns the `schema::Type` for this FastValue.
|
||||
fn to_type() -> Type;
|
||||
|
||||
/// Build a default value. This default value is never used, so the value does not
|
||||
/// really matter.
|
||||
fn make_zero() -> Self {
|
||||
Self::from_u128(0u128)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValueU128 for u128 {
|
||||
fn from_u128(val: u128) -> Self {
|
||||
val
|
||||
}
|
||||
|
||||
fn to_u128(&self) -> u128 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn as_u128(&self) -> u128 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::U128
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValueU128 for IpAddr {
|
||||
fn from_u128(val: u128) -> Self {
|
||||
IpAddr::from(val.to_be_bytes())
|
||||
}
|
||||
|
||||
fn to_u128(&self) -> u128 {
|
||||
ip_to_u128(*self)
|
||||
}
|
||||
|
||||
fn as_u128(&self) -> u128 {
|
||||
ip_to_u128(*self)
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::Ip
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for types that are allowed for fast fields:
|
||||
/// (u64, i64 and f64, bool, DateTime).
|
||||
pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd + 'static {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
/// **Note: To be used for converting encoded Term, Posting values.**
|
||||
fn from_u64(val: u64) -> Self;
|
||||
|
||||
/// Converts a value to u64.
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn to_u64(&self) -> u64;
|
||||
|
||||
/// Returns the fast field cardinality that can be extracted from the given
|
||||
/// `FieldType`.
|
||||
///
|
||||
/// If the type is not a fast field, `None` is returned.
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
|
||||
|
||||
/// Cast value to `u64`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u64(&self) -> u64;
|
||||
|
||||
/// Build a default value. This default value is never used, so the value does not
|
||||
/// really matter.
|
||||
fn make_zero() -> Self {
|
||||
Self::from_u64(0i64.to_u64())
|
||||
}
|
||||
|
||||
/// Returns the `schema::Type` for this FastValue.
|
||||
fn to_type() -> Type;
|
||||
}
|
||||
|
||||
impl FastValue for u64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
val
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
FieldType::Facet(_) => Some(Cardinality::MultiValues),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::U64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_i64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::i64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self as u64
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::I64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for f64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_f64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::f64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::F64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.to_bits()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::F64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for bool {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
val != 0u64
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
match self {
|
||||
false => 0,
|
||||
true => 1,
|
||||
}
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::Bool(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self as u64
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::Bool
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for DateTime {
|
||||
/// Converts a timestamp microseconds into DateTime.
|
||||
///
|
||||
/// **Note the timestamps is expected to be in microseconds.**
|
||||
fn from_u64(timestamp_micros_u64: u64) -> Self {
|
||||
let timestamp_micros = i64::from_u64(timestamp_micros_u64);
|
||||
Self::from_timestamp_micros(timestamp_micros)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::i64_to_u64(self.into_timestamp_micros())
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::Date(ref options) => options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.into_timestamp_micros().as_u64()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::Date
|
||||
}
|
||||
}
|
||||
@@ -1,224 +0,0 @@
|
||||
use std::io::{self, Write};
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastdivide::DividerU64;
|
||||
use fastfield_codecs::FastFieldCodecReader;
|
||||
use gcd::Gcd;
|
||||
|
||||
pub const GCD_DEFAULT: u64 = 1;
|
||||
pub const GCD_CODEC_ID: u8 = 4;
|
||||
|
||||
/// Wrapper for accessing a fastfield.
///
/// Holds the gcd, the minimum value and the codec reader used to read the data.
/// Stored values are decoded as `min_value + stored * gcd`.
#[derive(Clone)]
pub struct GCDFastFieldCodec<CodecReader> {
    /// Factor every stored value is multiplied by when decoding.
    gcd: u64,
    /// Offset added back to every stored value when decoding.
    min_value: u64,
    /// Wrapped codec that decodes the stored values.
    reader: CodecReader,
}
|
||||
impl<C: FastFieldCodecReader + Clone> FastFieldCodecReader for GCDFastFieldCodec<C> {
|
||||
/// Opens a fast field given the bytes.
|
||||
fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self> {
|
||||
let (header, mut footer) = bytes.split_at(bytes.len() - 16);
|
||||
let gcd = u64::deserialize(&mut footer)?;
|
||||
let min_value = u64::deserialize(&mut footer)?;
|
||||
let reader = C::open_from_bytes(header)?;
|
||||
|
||||
Ok(GCDFastFieldCodec {
|
||||
gcd,
|
||||
min_value,
|
||||
reader,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let mut data = self.reader.get_u64(doc, data);
|
||||
data *= self.gcd;
|
||||
data += self.min_value;
|
||||
data
|
||||
}
|
||||
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value + self.reader.min_value() * self.gcd
|
||||
}
|
||||
|
||||
fn max_value(&self) -> u64 {
|
||||
self.min_value + self.reader.max_value() * self.gcd
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes the gcd and then the minimum value to `field_write`.
///
/// `GCDFastFieldCodec::open_from_bytes` reads the two values back in the same order
/// from the end of the field data, so despite the name this is presumably written
/// after the wrapped codec's payload — confirm against the caller.
///
/// # Errors
///
/// Propagates the first I/O error returned while serializing either value.
pub fn write_gcd_header<W: Write>(field_write: &mut W, min_value: u64, gcd: u64) -> io::Result<()> {
    gcd.serialize(field_write)?;
    min_value.serialize(field_write)?;
    Ok(())
}
|
||||
|
||||
// Find GCD for iterator of numbers
|
||||
pub fn find_gcd(numbers: impl Iterator<Item = u64>) -> Option<u64> {
|
||||
let mut numbers = numbers.filter(|n| *n != 0);
|
||||
let mut gcd = numbers.next()?;
|
||||
if gcd == 1 {
|
||||
return Some(1);
|
||||
}
|
||||
|
||||
let mut gcd_divider = DividerU64::divide_by(gcd);
|
||||
for val in numbers {
|
||||
let remainder = val - (gcd_divider.divide(val)) * gcd;
|
||||
if remainder == 0 {
|
||||
continue;
|
||||
}
|
||||
gcd = gcd.gcd(val);
|
||||
if gcd == 1 {
|
||||
return Some(1);
|
||||
}
|
||||
|
||||
gcd_divider = DividerU64::divide_by(gcd);
|
||||
}
|
||||
Some(gcd)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use std::collections::HashMap;
    use std::path::Path;

    use common::HasLen;

    use crate::directory::{CompositeFile, RamDirectory, WritePtr};
    use crate::fastfield::serializer::FastFieldCodecEnableCheck;
    use crate::fastfield::tests::{FIELD, FIELDI64, SCHEMA, SCHEMAI64};
    use crate::fastfield::{
        find_gcd, CompositeFastFieldSerializer, DynamicFastFieldReader, FastFieldCodecName,
        FastFieldReader, FastFieldsWriter, ALL_CODECS,
    };
    use crate::schema::Schema;
    use crate::Directory;

    /// Serializes the fast fields of `docs` into the file `"test"` of a fresh
    /// `RamDirectory`, using the codecs allowed by `codec_enable_checker`.
    fn get_index(
        docs: &[crate::Document],
        schema: &Schema,
        codec_enable_checker: FastFieldCodecEnableCheck,
    ) -> crate::Result<RamDirectory> {
        let directory: RamDirectory = RamDirectory::create();
        {
            let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
            let mut serializer =
                CompositeFastFieldSerializer::from_write_with_codec(write, codec_enable_checker)
                    .unwrap();
            let mut fast_field_writers = FastFieldsWriter::from_schema(schema);
            for doc in docs {
                fast_field_writers.add_document(doc);
            }
            fast_field_writers
                .serialize(&mut serializer, &HashMap::new(), None)
                .unwrap();
            serializer.close().unwrap();
        }
        Ok(directory)
    }

    /// Writes `1000, 2000, ..` as i64 values, reads them back, then checks that the
    /// file grows once a value that is not a multiple of 1000 is introduced.
    fn test_fastfield_gcd_i64_with_codec(
        codec_name: FastFieldCodecName,
        num_vals: usize,
    ) -> crate::Result<()> {
        let path = Path::new("test");
        let mut docs = vec![];
        for i in 1..=num_vals {
            let val = i as i64 * 1000i64;
            docs.push(doc!(*FIELDI64=>val));
        }
        let directory = get_index(&docs, &SCHEMAI64, codec_name.clone().into())?;
        let file = directory.open_read(path).unwrap();
        let composite_file = CompositeFile::open(&file)?;
        // Use the i64 schema's own field handle (this used to pass `*FIELD`).
        let file = composite_file.open_read(*FIELDI64).unwrap();
        let fast_field_reader = DynamicFastFieldReader::<i64>::open(file)?;
        assert_eq!(fast_field_reader.get(0), 1000i64);
        assert_eq!(fast_field_reader.get(1), 2000i64);
        assert_eq!(fast_field_reader.get(2), 3000i64);
        assert_eq!(fast_field_reader.max_value(), num_vals as i64 * 1000);
        assert_eq!(fast_field_reader.min_value(), 1000i64);
        let file = directory.open_read(path).unwrap();

        // Can't apply gcd
        docs.pop();
        docs.push(doc!(*FIELDI64=>2001i64));
        let directory = get_index(&docs, &SCHEMAI64, codec_name.into())?;
        let file2 = directory.open_read(path).unwrap();
        assert!(file2.len() > file.len());

        Ok(())
    }

    #[test]
    fn test_fastfield_gcd_i64() -> crate::Result<()> {
        for codec_name in ALL_CODECS {
            test_fastfield_gcd_i64_with_codec(codec_name.clone(), 5005)?;
        }
        Ok(())
    }

    /// Writes `1000, 2000, ..` as u64 values, reads them back, then checks that the
    /// file grows once a value that is not a multiple of 1000 is introduced.
    fn test_fastfield_gcd_u64_with_codec(
        codec_name: FastFieldCodecName,
        num_vals: usize,
    ) -> crate::Result<()> {
        let path = Path::new("test");
        let mut docs = vec![];
        for i in 1..=num_vals {
            let val = i as u64 * 1000u64;
            docs.push(doc!(*FIELD=>val));
        }
        let directory = get_index(&docs, &SCHEMA, codec_name.clone().into())?;
        let file = directory.open_read(path).unwrap();
        let composite_file = CompositeFile::open(&file)?;
        let file = composite_file.open_read(*FIELD).unwrap();
        let fast_field_reader = DynamicFastFieldReader::<u64>::open(file)?;
        assert_eq!(fast_field_reader.get(0), 1000u64);
        assert_eq!(fast_field_reader.get(1), 2000u64);
        assert_eq!(fast_field_reader.get(2), 3000u64);
        assert_eq!(fast_field_reader.max_value(), num_vals as u64 * 1000);
        assert_eq!(fast_field_reader.min_value(), 1000u64);
        let file = directory.open_read(path).unwrap();

        // Can't apply gcd
        docs.pop();
        // Use the u64 schema's own field handle (this used to pass `*FIELDI64`).
        docs.push(doc!(*FIELD=>2001u64));
        let directory = get_index(&docs, &SCHEMA, codec_name.into())?;
        let file2 = directory.open_read(path).unwrap();
        assert!(file2.len() > file.len());

        Ok(())
    }

    #[test]
    fn test_fastfield_gcd_u64() -> crate::Result<()> {
        for codec_name in ALL_CODECS {
            test_fastfield_gcd_u64_with_codec(codec_name.clone(), 5005)?;
        }
        Ok(())
    }

    #[test]
    pub fn test_fastfield2() {
        let test_fastfield = DynamicFastFieldReader::<u64>::from(vec![100, 200, 300]);
        assert_eq!(test_fastfield.get(0), 100);
        assert_eq!(test_fastfield.get(1), 200);
        assert_eq!(test_fastfield.get(2), 300);
    }

    #[test]
    fn find_gcd_test() {
        assert_eq!(find_gcd([0].into_iter()), None);
        assert_eq!(find_gcd([0, 10].into_iter()), Some(10));
        assert_eq!(find_gcd([10, 0].into_iter()), Some(10));
        assert_eq!(find_gcd([].into_iter()), None);
        assert_eq!(find_gcd([15, 30, 5, 10].into_iter()), Some(5));
        assert_eq!(find_gcd([15, 16, 10].into_iter()), Some(1));
        assert_eq!(find_gcd([0, 5, 5, 5].into_iter()), Some(5));
    }
}
|
||||
@@ -20,88 +20,179 @@
|
||||
//!
|
||||
//! Read access performance is comparable to that of an array lookup.
|
||||
|
||||
use std::collections::btree_map::Range;
|
||||
|
||||
pub use self::alive_bitset::{intersect_alive_bitsets, write_alive_bitset, AliveBitSet};
|
||||
pub use self::bytes::{BytesFastFieldReader, BytesFastFieldWriter};
|
||||
pub use self::error::{FastFieldNotAvailableError, Result};
|
||||
pub use self::facet_reader::FacetReader;
|
||||
pub use self::fast_value::{FastValue, FastValueU128};
|
||||
pub(crate) use self::gcd::{find_gcd, GCDFastFieldCodec, GCD_CODEC_ID, GCD_DEFAULT};
|
||||
pub use self::multivalued::{
|
||||
MultiValuedFastFieldReader, MultiValuedFastFieldWriter, MultiValuedU128FastFieldReader,
|
||||
};
|
||||
pub use self::reader::{DynamicFastFieldReader, FastFieldReader, FastFieldReaderCodecWrapperU128};
|
||||
pub use self::multivalued::{MultiValuedFastFieldReader, MultiValuedFastFieldWriter};
|
||||
pub use self::reader::{DynamicFastFieldReader, FastFieldReader};
|
||||
pub use self::readers::FastFieldReaders;
|
||||
pub(crate) use self::readers::{type_and_cardinality, FastType};
|
||||
pub use self::serializer::{CompositeFastFieldSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
pub use self::writer::{FastFieldsWriter, IntFastFieldWriter};
|
||||
use crate::schema::Value;
|
||||
use crate::DocId;
|
||||
use crate::schema::{Cardinality, FieldType, Type, Value};
|
||||
use crate::{DateTime, DocId};
|
||||
|
||||
mod alive_bitset;
|
||||
mod bytes;
|
||||
mod error;
|
||||
mod facet_reader;
|
||||
mod fast_value;
|
||||
mod gcd;
|
||||
mod multivalued;
|
||||
mod reader;
|
||||
mod readers;
|
||||
mod serializer;
|
||||
mod writer;
|
||||
|
||||
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone)]
|
||||
pub(crate) enum FastFieldCodecName {
|
||||
Bitpacked,
|
||||
LinearInterpol,
|
||||
BlockwiseLinearInterpol,
|
||||
}
|
||||
pub(crate) const ALL_CODECS: &[FastFieldCodecName; 3] = &[
|
||||
FastFieldCodecName::Bitpacked,
|
||||
FastFieldCodecName::LinearInterpol,
|
||||
FastFieldCodecName::BlockwiseLinearInterpol,
|
||||
];
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match value {
|
||||
Value::U64(val) => val.to_u64(),
|
||||
Value::I64(val) => val.to_u64(),
|
||||
Value::F64(val) => val.to_u64(),
|
||||
Value::Bool(val) => val.to_u64(),
|
||||
Value::Date(val) => val.to_u64(),
|
||||
_ => panic!("Expected a u64/i64/f64/bool/date field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
|
||||
/// Trait for `BytesFastFieldReader` and `MultiValuedFastFieldReader` to return the length of data
|
||||
/// for a doc_id
|
||||
pub trait MultiValueLength {
|
||||
/// returns the positions of values associated to a doc_id
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64>;
|
||||
/// returns the num of values associated to a doc_id
|
||||
fn get_len(&self, doc_id: DocId) -> u64;
|
||||
/// returns the sum of num values for all doc_ids
|
||||
fn get_total_len(&self) -> u64;
|
||||
}
|
||||
|
||||
/// The fast field type
|
||||
pub enum FastFieldType {
|
||||
/// Numeric type, e.g. f64.
|
||||
Numeric,
|
||||
/// Fast field stores string ids.
|
||||
String,
|
||||
/// Fast field stores string ids for facets.
|
||||
Facet,
|
||||
}
|
||||
/// Trait for types that are allowed for fast fields: (u64, i64 and f64).
|
||||
pub trait FastValue: Clone + Copy + Send + Sync + PartialOrd + 'static {
|
||||
/// Converts a value from u64
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn from_u64(val: u64) -> Self;
|
||||
|
||||
impl FastFieldType {
|
||||
fn is_storing_term_ids(&self) -> bool {
|
||||
matches!(self, FastFieldType::String | FastFieldType::Facet)
|
||||
/// Converts a value to u64.
|
||||
///
|
||||
/// Internally all fast field values are encoded as u64.
|
||||
fn to_u64(&self) -> u64;
|
||||
|
||||
/// Returns the fast field cardinality that can be extracted from the given
|
||||
/// `FieldType`.
|
||||
///
|
||||
/// If the type is not a fast field, `None` is returned.
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality>;
|
||||
|
||||
/// Cast value to `u64`.
|
||||
/// The value is just reinterpreted in memory.
|
||||
fn as_u64(&self) -> u64;
|
||||
|
||||
/// Build a default value. This default value is never used, so the value does not
|
||||
/// really matter.
|
||||
fn make_zero() -> Self {
|
||||
Self::from_u64(0i64.to_u64())
|
||||
}
|
||||
|
||||
fn is_facet(&self) -> bool {
|
||||
matches!(self, FastFieldType::Facet)
|
||||
/// Returns the `schema::Type` for this FastValue.
|
||||
fn to_type() -> Type;
|
||||
}
|
||||
|
||||
impl FastValue for u64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
val
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::U64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
FieldType::Facet(_) => Some(Cardinality::MultiValues),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::U64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for i64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_i64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::i64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::I64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
*self as u64
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::I64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for f64 {
|
||||
fn from_u64(val: u64) -> Self {
|
||||
common::u64_to_f64(val)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
common::f64_to_u64(*self)
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::F64(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.to_bits()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::F64
|
||||
}
|
||||
}
|
||||
|
||||
impl FastValue for DateTime {
|
||||
fn from_u64(timestamp_u64: u64) -> Self {
|
||||
let unix_timestamp = i64::from_u64(timestamp_u64);
|
||||
Self::from_unix_timestamp(unix_timestamp)
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
self.to_unix_timestamp().to_u64()
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
match *field_type {
|
||||
FieldType::Date(ref integer_options) => integer_options.get_fastfield_cardinality(),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.to_unix_timestamp().as_u64()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
Type::Date
|
||||
}
|
||||
}
|
||||
|
||||
fn value_to_u64(value: &Value) -> u64 {
|
||||
match value {
|
||||
Value::U64(val) => val.to_u64(),
|
||||
Value::I64(val) => val.to_u64(),
|
||||
Value::F64(val) => val.to_u64(),
|
||||
Value::Date(val) => val.to_u64(),
|
||||
_ => panic!("Expected a u64/i64/f64/date field, got {:?} ", value),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -109,8 +200,6 @@ impl FastFieldType {
|
||||
mod tests {
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::net::IpAddr;
|
||||
use std::ops::Range;
|
||||
use std::path::Path;
|
||||
|
||||
use common::HasLen;
|
||||
@@ -122,11 +211,9 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{
|
||||
self, Cardinality, Document, Field, IpOptions, Schema, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{DateOptions, DatePrecision, DateTime, Index, SegmentId, SegmentReader};
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
|
||||
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -134,14 +221,7 @@ mod tests {
|
||||
schema_builder.build()
|
||||
});
|
||||
|
||||
pub static SCHEMAI64: Lazy<Schema> = Lazy::new(|| {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_i64_field("field", FAST);
|
||||
schema_builder.build()
|
||||
});
|
||||
|
||||
pub static FIELD: Lazy<Field> = Lazy::new(|| SCHEMA.get_field("field").unwrap());
|
||||
pub static FIELDI64: Lazy<Field> = Lazy::new(|| SCHEMAI64.get_field("field").unwrap());
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield() {
|
||||
@@ -152,8 +232,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_datetime_conversion() {
|
||||
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
|
||||
pub fn test_fastfield_i64_u64() {
|
||||
let datetime = DateTime::new_utc(OffsetDateTime::UNIX_EPOCH);
|
||||
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
|
||||
}
|
||||
|
||||
@@ -290,7 +370,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_signed_intfastfield_normal() -> crate::Result<()> {
|
||||
fn test_signed_intfastfield() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -312,8 +392,7 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
||||
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
@@ -370,15 +449,10 @@ mod tests {
|
||||
permutation
|
||||
}
|
||||
|
||||
// Warning: this generates the same permutation at each call
|
||||
pub fn generate_permutation_gcd() -> Vec<u64> {
|
||||
let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();
|
||||
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
|
||||
permutation
|
||||
}
|
||||
|
||||
fn test_intfastfield_permutation_with_data(permutation: Vec<u64>) -> crate::Result<()> {
|
||||
#[test]
|
||||
fn test_intfastfield_permutation() -> crate::Result<()> {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation();
|
||||
let n = permutation.len();
|
||||
let directory = RamDirectory::create();
|
||||
{
|
||||
@@ -397,27 +471,15 @@ mod tests {
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data)?;
|
||||
|
||||
for a in 0..n {
|
||||
let mut a = 0u64;
|
||||
for _ in 0..n {
|
||||
assert_eq!(fast_field_reader.get(a as u32), permutation[a as usize]);
|
||||
a = fast_field_reader.get(a as u32);
|
||||
}
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_permutation_gcd() -> crate::Result<()> {
|
||||
let permutation = generate_permutation_gcd();
|
||||
test_intfastfield_permutation_with_data(permutation)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_intfastfield_permutation() -> crate::Result<()> {
|
||||
let permutation = generate_permutation();
|
||||
test_intfastfield_permutation_with_data(permutation)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_merge_missing_date_fast_field() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -427,7 +489,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer
|
||||
.add_document(doc!(date_field =>DateTime::from_utc(OffsetDateTime::now_utc())))?;
|
||||
.add_document(doc!(date_field =>DateTime::new_utc(OffsetDateTime::now_utc())))?;
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
@@ -446,301 +508,18 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_default_date() {
|
||||
assert_eq!(0, DateTime::make_zero().into_timestamp_secs());
|
||||
}
|
||||
|
||||
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
|
||||
let mut all = vec![];
|
||||
|
||||
for doc in docs {
|
||||
let mut out = vec![];
|
||||
ff.get_vals(doc, &mut out);
|
||||
all.extend(out);
|
||||
}
|
||||
all
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_ip_fastfield_minimal() -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_field("ip", FAST | INDEXED | STORED);
|
||||
|
||||
let ips_field = schema_builder.add_ip_field(
|
||||
"ips",
|
||||
IpOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
|
||||
let schema = schema_builder.build();
|
||||
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let ip1 = IpAddr::from((1_u128).to_be_bytes());
|
||||
let ip2 = IpAddr::from((2_u128).to_be_bytes());
|
||||
let ip3 = IpAddr::from((3_u128).to_be_bytes());
|
||||
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
ip_field => ip2,
|
||||
ips_field => ip2,
|
||||
ips_field => ip2,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
|
||||
// single value
|
||||
let ip_addr_fast_field = fast_fields.ip_addr(ip_field).unwrap();
|
||||
assert_eq!(ip_addr_fast_field.get_val(0), None);
|
||||
assert_eq!(ip_addr_fast_field.get_val(1), Some(ip2));
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip2..=ip2), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip1..=ip2), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip2..=ip3), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip1..=ip3), vec![1]);
|
||||
assert_eq!(
|
||||
ip_addr_fast_field.get_between_vals(ip1..=ip1),
|
||||
vec![] as Vec<usize>
|
||||
);
|
||||
assert_eq!(
|
||||
ip_addr_fast_field.get_between_vals(ip3..=ip3),
|
||||
vec![] as Vec<usize>
|
||||
);
|
||||
|
||||
// multi value
|
||||
let ip_addr_fast_field = fast_fields.ip_addrs(ips_field).unwrap();
|
||||
assert_eq!(ip_addr_fast_field.get_first_val(0), None);
|
||||
assert_eq!(ip_addr_fast_field.get_first_val(1), Some(ip2));
|
||||
|
||||
let mut out = vec![];
|
||||
ip_addr_fast_field.get_vals(0, &mut out);
|
||||
assert_eq!(out, vec![] as Vec<IpAddr>);
|
||||
let mut out = vec![];
|
||||
ip_addr_fast_field.get_vals(1, &mut out);
|
||||
assert_eq!(out, vec![ip2, ip2]);
|
||||
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip2..=ip2), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip1..=ip2), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip2..=ip3), vec![1]);
|
||||
assert_eq!(ip_addr_fast_field.get_between_vals(ip1..=ip3), vec![1]);
|
||||
assert_eq!(
|
||||
ip_addr_fast_field.get_between_vals(ip1..=ip1),
|
||||
vec![] as Vec<usize>
|
||||
);
|
||||
assert_eq!(
|
||||
ip_addr_fast_field.get_between_vals(ip3..=ip3),
|
||||
vec![] as Vec<usize>
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB AAAAA", // term_ord 1,2
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA BBBBB", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..5),
|
||||
vec![1, 0, 0, 0, 1, 2]
|
||||
);
|
||||
|
||||
let mut out = vec![];
|
||||
text_fast_field.get_vals(3, &mut out);
|
||||
assert_eq!(out, vec![0, 1]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
// default tokenizer applies lower case
|
||||
assert_eq!(bytes, "aaaaa".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC AAAAA", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..3), vec![0, 1, 0]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..8),
|
||||
vec![1, 0, 0, 0, 1, 3 /* next segment */, 0, 2, 0]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB", // term_ord 1
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(bytes, "AAAAA".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..9),
|
||||
vec![1, 0, 0, 3 /* next segment */, 0, 2]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
fn test_default_datetime() {
|
||||
assert_eq!(0, DateTime::make_zero().to_unix_timestamp());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_datefastfield() -> crate::Result<()> {
|
||||
use crate::fastfield::FastValue;
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"date",
|
||||
DateOptions::from(FAST).set_precision(DatePrecision::Microseconds),
|
||||
);
|
||||
let date_field = schema_builder.add_date_field("date", FAST);
|
||||
let multi_date_field = schema_builder.add_date_field(
|
||||
"multi_date",
|
||||
DateOptions::default()
|
||||
.set_precision(DatePrecision::Microseconds)
|
||||
.set_fast(Cardinality::MultiValues),
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
@@ -768,138 +547,26 @@ mod tests {
|
||||
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
|
||||
let mut dates = vec![];
|
||||
{
|
||||
assert_eq!(date_fast_field.get(0u32).into_timestamp_micros(), 1i64);
|
||||
assert_eq!(date_fast_field.get(0u32).to_unix_timestamp(), 1i64);
|
||||
dates_fast_field.get_vals(0u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
|
||||
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
|
||||
assert_eq!(dates[0].to_unix_timestamp(), 2i64);
|
||||
assert_eq!(dates[1].to_unix_timestamp(), 3i64);
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(1u32).into_timestamp_micros(), 4i64);
|
||||
assert_eq!(date_fast_field.get(1u32).to_unix_timestamp(), 4i64);
|
||||
dates_fast_field.get_vals(1u32, &mut dates);
|
||||
assert!(dates.is_empty());
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(2u32).into_timestamp_micros(), 0i64);
|
||||
assert_eq!(date_fast_field.get(2u32).to_unix_timestamp(), 0i64);
|
||||
dates_fast_field.get_vals(2u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
|
||||
assert_eq!(dates[1].into_timestamp_micros(), 6i64);
|
||||
assert_eq!(dates[0].to_unix_timestamp(), 5i64);
|
||||
assert_eq!(dates[1].to_unix_timestamp(), 6i64);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_bool() {
|
||||
let test_fastfield = DynamicFastFieldReader::<bool>::from(vec![true, false, true, false]);
|
||||
assert_eq!(test_fastfield.get(0), true);
|
||||
assert_eq!(test_fastfield.get(1), false);
|
||||
assert_eq!(test_fastfield.get(2), true);
|
||||
assert_eq!(test_fastfield.get(3), false);
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_bool_small() -> crate::Result<()> {
|
||||
let path = Path::new("test_bool");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_bool_field("field_bool", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let field = schema.get_field("field_bool").unwrap();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(path).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
fast_field_writers.add_document(&doc!(field=>true));
|
||||
fast_field_writers.add_document(&doc!(field=>false));
|
||||
fast_field_writers.add_document(&doc!(field=>true));
|
||||
fast_field_writers.add_document(&doc!(field=>false));
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 36);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get(0), true);
|
||||
assert_eq!(fast_field_reader.get(1), false);
|
||||
assert_eq!(fast_field_reader.get(2), true);
|
||||
assert_eq!(fast_field_reader.get(3), false);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_bool_large() -> crate::Result<()> {
|
||||
let path = Path::new("test_bool");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_bool_field("field_bool", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let field = schema.get_field("field_bool").unwrap();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(path).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
for _ in 0..50 {
|
||||
fast_field_writers.add_document(&doc!(field=>true));
|
||||
fast_field_writers.add_document(&doc!(field=>false));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 48);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
for i in 0..25 {
|
||||
assert_eq!(fast_field_reader.get(i * 2), true);
|
||||
assert_eq!(fast_field_reader.get(i * 2 + 1), false);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_bool_default_value() -> crate::Result<()> {
|
||||
let path = Path::new("test_bool");
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_bool_field("field_bool", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let field = schema.get_field("field_bool").unwrap();
|
||||
|
||||
{
|
||||
let write: WritePtr = directory.open_write(path).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema);
|
||||
let doc = Document::default();
|
||||
fast_field_writers.add_document(&doc);
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
assert_eq!(file.len(), 35);
|
||||
let composite_file = CompositeFile::open(&file)?;
|
||||
let file = composite_file.open_read(field).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<bool>::open(file)?;
|
||||
assert_eq!(fast_field_reader.get(0), false);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
@@ -912,7 +579,6 @@ mod bench {
|
||||
use super::tests::{generate_permutation, FIELD, SCHEMA};
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::fastfield::tests::generate_permutation_gcd;
|
||||
use crate::fastfield::FastFieldReader;
|
||||
|
||||
#[bench]
|
||||
@@ -999,42 +665,10 @@ mod bench {
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let n = test::black_box(1000u32);
|
||||
let mut a = 0u32;
|
||||
for i in 0u32..permutation.len() as u32 {
|
||||
a = fast_field_reader.get(i) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_intfastfield_fflookup_gcd(b: &mut Bencher) {
|
||||
let path = Path::new("test");
|
||||
let permutation = generate_permutation_gcd();
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
{
|
||||
let write: WritePtr = directory.open_write(Path::new("test")).unwrap();
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write).unwrap();
|
||||
let mut fast_field_writers = FastFieldsWriter::from_schema(&SCHEMA);
|
||||
for &x in &permutation {
|
||||
fast_field_writers.add_document(&doc!(*FIELD=>x));
|
||||
}
|
||||
fast_field_writers
|
||||
.serialize(&mut serializer, &HashMap::new(), None)
|
||||
.unwrap();
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(&path).unwrap();
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file).unwrap();
|
||||
let data = fast_fields_composite.open_read(*FIELD).unwrap();
|
||||
let fast_field_reader = DynamicFastFieldReader::<u64>::open(data).unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let mut a = 0u32;
|
||||
for i in 0u32..permutation.len() as u32 {
|
||||
a = fast_field_reader.get(i) as u32;
|
||||
for _ in 0u32..n {
|
||||
a = fast_field_reader.get(a) as u32;
|
||||
}
|
||||
a
|
||||
});
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use self::reader::{MultiValuedFastFieldReader, MultiValuedU128FastFieldReader};
|
||||
pub use self::writer::{MultiValuedFastFieldWriter, U128MultiValueFastFieldWriter};
|
||||
pub use self::reader::MultiValuedFastFieldReader;
|
||||
pub use self::writer::MultiValuedFastFieldWriter;
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
@@ -13,7 +13,7 @@ mod tests {
|
||||
use crate::collector::TopDocs;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::QueryParser;
|
||||
use crate::schema::{Cardinality, DateOptions, Facet, FacetOptions, NumericOptions, Schema};
|
||||
use crate::schema::{Cardinality, Facet, FacetOptions, NumericOptions, Schema};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{Duration, OffsetDateTime};
|
||||
use crate::{DateTime, Document, Index, Term};
|
||||
@@ -58,7 +58,7 @@ mod tests {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let date_field = schema_builder.add_date_field(
|
||||
"multi_date_field",
|
||||
DateOptions::default()
|
||||
NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_indexed()
|
||||
.set_fieldnorm()
|
||||
@@ -71,24 +71,24 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let first_time_stamp = OffsetDateTime::now_utc();
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
date_field => DateTime::new_utc(first_time_stamp),
|
||||
date_field => DateTime::new_utc(first_time_stamp),
|
||||
time_i=>1i64))?;
|
||||
index_writer.add_document(doc!(time_i => 0i64))?;
|
||||
// add one second
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
|
||||
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(1)),
|
||||
time_i => 2i64))?;
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
time_i => 3i64))?;
|
||||
// add three seconds
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
|
||||
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(3)),
|
||||
time_i => 4i64))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -113,7 +113,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(first_time_stamp),
|
||||
DateTime::new_utc(first_time_stamp),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -140,7 +140,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(two_secs_ahead)
|
||||
DateTime::new_utc(two_secs_ahead)
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -181,7 +181,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.expect("value not of Date type"),
|
||||
DateTime::from_utc(first_time_stamp + Duration::seconds(offset_sec)),
|
||||
DateTime::new_utc(first_time_stamp + Duration::seconds(offset_sec)),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -226,38 +226,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_bool() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let bool_field = schema_builder.add_bool_field(
|
||||
"multifield",
|
||||
NumericOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.add_document(doc!(bool_field=> true, bool_field => false))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(bool_field=> false))?;
|
||||
index_writer
|
||||
.add_document(doc!(bool_field=> true, bool_field => true, bool_field => false))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let searcher = index.reader()?.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let mut vals = Vec::new();
|
||||
let multi_value_reader = segment_reader.fast_fields().bools(bool_field).unwrap();
|
||||
multi_value_reader.get_vals(2, &mut vals);
|
||||
assert_eq!(&vals, &[false]);
|
||||
multi_value_reader.get_vals(0, &mut vals);
|
||||
assert_eq!(&vals, &[true, false]);
|
||||
multi_value_reader.get_vals(1, &mut vals);
|
||||
assert!(vals.is_empty());
|
||||
multi_value_reader.get_vals(3, &mut vals);
|
||||
assert_eq!(&vals, &[true, true, false]);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn test_multivalued_no_panic(ops: &[IndexingOp]) -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field = schema_builder.add_u64_field(
|
||||
@@ -346,13 +314,6 @@ mod tests {
|
||||
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_multivalued_proptest_gcd() {
|
||||
use IndexingOp::*;
|
||||
let ops = [AddDoc { id: 9 }, AddDoc { id: 9 }, Merge];
|
||||
|
||||
assert!(test_multivalued_no_panic(&ops[..]).is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_proptest_off_by_one_bug_1151() {
|
||||
|
||||
@@ -1,11 +1,6 @@
|
||||
use std::ops::{Range, RangeInclusive};
|
||||
use std::ops::Range;
|
||||
|
||||
use fastfield_codecs::ip_codec::IntervallDecompressor;
|
||||
|
||||
use crate::fastfield::{
|
||||
DynamicFastFieldReader, FastFieldReader, FastFieldReaderCodecWrapperU128, FastValue,
|
||||
FastValueU128, MultiValueLength,
|
||||
};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader, FastValue, MultiValueLength};
|
||||
use crate::DocId;
|
||||
|
||||
/// Reader for a multivalued `u64` fast field.
|
||||
@@ -32,28 +27,22 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)`, such that the values associated
|
||||
/// to the given document are `start..end`.
|
||||
/// Returns `(start, stop)`, such that the values associated
|
||||
/// to the given document are `start..stop`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u64> {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let end = self.idx_reader.get(doc + 1);
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
let stop = self.idx_reader.get(doc + 1);
|
||||
start..stop
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let range = self.range(doc);
|
||||
self.get_vals_for_range(range, vals);
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
@@ -89,155 +78,6 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
|
||||
}
|
||||
|
||||
impl<Item: FastValue> MultiValueLength for MultiValuedFastFieldReader<Item> {
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
|
||||
self.range(doc_id)
|
||||
}
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_vals(doc_id) as u64
|
||||
}
|
||||
fn get_total_len(&self) -> u64 {
|
||||
self.total_num_vals() as u64
|
||||
}
|
||||
}
|
||||
|
||||
/// Reader for a multivalued `u128` fast field.
|
||||
///
|
||||
/// The reader is implemented as a `u64` fast field for the index and a `u128` fast field.
|
||||
///
|
||||
/// The `vals_reader` will access the concatenated list of all
|
||||
/// values for all reader.
|
||||
/// The `idx_reader` associates, for each document, the index of its first value.
|
||||
#[derive(Clone)]
|
||||
pub struct MultiValuedU128FastFieldReader<Item: FastValueU128> {
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: FastFieldReaderCodecWrapperU128<Item, IntervallDecompressor>,
|
||||
}
|
||||
|
||||
impl<Item: FastValueU128> MultiValuedU128FastFieldReader<Item> {
|
||||
pub(crate) fn open(
|
||||
idx_reader: DynamicFastFieldReader<u64>,
|
||||
vals_reader: FastFieldReaderCodecWrapperU128<Item, IntervallDecompressor>,
|
||||
) -> MultiValuedU128FastFieldReader<Item> {
|
||||
Self {
|
||||
idx_reader,
|
||||
vals_reader,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)`, such that the values associated
|
||||
/// to the given document are `start..end`.
|
||||
#[inline]
|
||||
fn range(&self, doc: DocId) -> Range<u64> {
|
||||
let start = self.idx_reader.get(doc);
|
||||
let end = self.idx_reader.get(doc + 1);
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the first value associated to the given `doc`, or `None` if it has no values.
|
||||
#[inline]
|
||||
pub fn get_first_val(&self, doc: DocId) -> Option<Item> {
|
||||
let range = self.range(doc);
|
||||
if range.is_empty() {
|
||||
return None;
|
||||
}
|
||||
self.vals_reader.get_val(range.start)
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
|
||||
let len = (range.end - range.start) as usize;
|
||||
vals.resize(len, Item::make_zero());
|
||||
self.vals_reader.get_range(range.start, &mut vals[..]);
|
||||
}
|
||||
|
||||
/// Returns the array of values associated to the given `doc`.
|
||||
#[inline]
|
||||
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
|
||||
let range = self.range(doc);
|
||||
self.get_vals_for_range(range, vals);
|
||||
}
|
||||
|
||||
/// Returns all docids which are in the provided value range
|
||||
pub fn get_between_vals(&self, range: RangeInclusive<Item>) -> Vec<DocId> {
|
||||
let positions = self.vals_reader.get_between_vals(range);
|
||||
|
||||
positions_to_docids(&positions, self)
|
||||
}
|
||||
|
||||
/// Iterates over all elements in the fast field
|
||||
pub fn iter(&self) -> impl Iterator<Item = Option<Item>> + '_ {
|
||||
self.vals_reader.iter()
|
||||
}
|
||||
|
||||
/// Returns the minimum value for this fast field.
|
||||
///
|
||||
/// The min value does not take in account of possible
|
||||
/// deleted document, and should be considered as a lower bound
|
||||
/// of the actual mimimum value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
self.vals_reader.min_value()
|
||||
}
|
||||
|
||||
/// Returns the maximum value for this fast field.
|
||||
///
|
||||
/// The max value does not take in account of possible
|
||||
/// deleted document, and should be considered as an upper bound
|
||||
/// of the actual maximum value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
self.vals_reader.max_value()
|
||||
}
|
||||
|
||||
/// Returns the number of values associated with the document `DocId`.
|
||||
#[inline]
|
||||
pub fn num_vals(&self, doc: DocId) -> usize {
|
||||
let range = self.range(doc);
|
||||
(range.end - range.start) as usize
|
||||
}
|
||||
|
||||
/// Returns the overall number of values in this field .
|
||||
#[inline]
|
||||
pub fn total_num_vals(&self) -> u64 {
|
||||
self.idx_reader.max_value()
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of positions of values in a 1:n index to the corresponding list of DocIds.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the index.
|
||||
///
|
||||
/// Correctness: positions needs to be sorted.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ a binary search to match a docid to its value
|
||||
/// position.
|
||||
fn positions_to_docids<T: MultiValueLength>(positions: &[usize], multival_idx: &T) -> Vec<DocId> {
|
||||
let mut docs = vec![];
|
||||
let mut cur_doc = 0u32;
|
||||
let mut last_doc = None;
|
||||
|
||||
for pos in positions {
|
||||
loop {
|
||||
let range = multival_idx.get_range(cur_doc);
|
||||
if range.contains(&(*pos as u64)) {
|
||||
// avoid duplicates
|
||||
if Some(cur_doc) == last_doc {
|
||||
break;
|
||||
}
|
||||
docs.push(cur_doc);
|
||||
last_doc = Some(cur_doc);
|
||||
break;
|
||||
}
|
||||
cur_doc += 1;
|
||||
}
|
||||
}
|
||||
|
||||
docs
|
||||
}
|
||||
|
||||
impl<Item: FastValueU128> MultiValueLength for MultiValuedU128FastFieldReader<Item> {
|
||||
fn get_range(&self, doc_id: DocId) -> std::ops::Range<u64> {
|
||||
self.range(doc_id)
|
||||
}
|
||||
fn get_len(&self, doc_id: DocId) -> u64 {
|
||||
self.num_vals(doc_id) as u64
|
||||
}
|
||||
@@ -246,7 +86,6 @@ impl<Item: FastValueU128> MultiValueLength for MultiValuedU128FastFieldReader<It
|
||||
self.total_num_vals() as u64
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
|
||||
@@ -1,16 +1,15 @@
|
||||
use std::io;
|
||||
|
||||
use fastfield_codecs::ip_codec::{ip_to_u128, IntervalCompressor};
|
||||
use fnv::FnvHashMap;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType, FastValue};
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field, Value};
|
||||
use crate::schema::{Document, Field};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::{DatePrecision, DocId};
|
||||
use crate::DocId;
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// int fast field.
|
||||
@@ -37,25 +36,19 @@ use crate::{DatePrecision, DocId};
|
||||
/// term ids when the segment is getting serialized.
|
||||
pub struct MultiValuedFastFieldWriter {
|
||||
field: Field,
|
||||
precision_opt: Option<DatePrecision>,
|
||||
vals: Vec<UnorderedTermId>,
|
||||
doc_index: Vec<u64>,
|
||||
fast_field_type: FastFieldType,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
impl MultiValuedFastFieldWriter {
|
||||
/// Creates a new `MultiValuedFastFieldWriter`
|
||||
pub(crate) fn new(
|
||||
field: Field,
|
||||
fast_field_type: FastFieldType,
|
||||
precision_opt: Option<DatePrecision>,
|
||||
) -> Self {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValuedFastFieldWriter {
|
||||
field,
|
||||
precision_opt,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
fast_field_type,
|
||||
is_facet,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -84,20 +77,12 @@ impl MultiValuedFastFieldWriter {
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
// facets/texts are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if self.fast_field_type.is_storing_term_ids() {
|
||||
return;
|
||||
}
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field == self.field {
|
||||
let value = field_value.value();
|
||||
let value_u64 = match (self.precision_opt, value) {
|
||||
(Some(precision), Value::Date(date_val)) => {
|
||||
date_val.truncate(precision).to_u64()
|
||||
}
|
||||
_ => value_to_u64(value),
|
||||
};
|
||||
self.add_val(value_u64);
|
||||
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field == self.field {
|
||||
self.add_val(value_to_u64(field_value.value()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -121,9 +106,25 @@ impl MultiValuedFastFieldWriter {
|
||||
&'a self,
|
||||
doc_id_map: Option<&'b DocIdMapping>,
|
||||
) -> impl Iterator<Item = &'b [u64]> {
|
||||
get_ordered_values(&self.vals, &self.doc_index, doc_id_map)
|
||||
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
|
||||
Box::new(doc_id_map.iter_old_doc_ids())
|
||||
} else {
|
||||
let max_doc = self.doc_index.len() as DocId;
|
||||
Box::new(0..max_doc)
|
||||
};
|
||||
doc_id_iter.map(move |doc_id| self.get_values_for_doc_id(doc_id))
|
||||
}
|
||||
|
||||
/// returns all values for a doc_ids
|
||||
fn get_values_for_doc_id(&self, doc_id: u32) -> &[u64] {
|
||||
let start_pos = self.doc_index[doc_id as usize] as usize;
|
||||
let end_pos = self
|
||||
.doc_index
|
||||
.get(doc_id as usize + 1)
|
||||
.cloned()
|
||||
.unwrap_or(self.vals.len() as u64) as usize; // special case, last doc_id has no offset information
|
||||
&self.vals[start_pos..end_pos]
|
||||
}
|
||||
/// Serializes fast field values by pushing them to the `FastFieldSerializer`.
|
||||
///
|
||||
/// If a mapping is given, the values are remapped *and sorted* before serialization.
|
||||
@@ -157,15 +158,15 @@ impl MultiValuedFastFieldWriter {
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
if let Some(mapping) = mapping_opt {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
|
||||
if self.fast_field_type.is_facet() {
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_vals.clear();
|
||||
@@ -178,159 +179,22 @@ impl MultiValuedFastFieldWriter {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
}
|
||||
None => {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
let remapped_vals = vals
|
||||
.iter()
|
||||
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
|
||||
for val in remapped_vals {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
value_serializer.close_field()?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Writer for multi-valued (as in, more than one value per document)
|
||||
/// `u128` fast field.
|
||||
///
|
||||
/// This `Writer` is only useful for advanced users.
|
||||
/// The normal way to get your multivalued int in your index
|
||||
/// is to
|
||||
/// - declare your field with fast set to `Cardinality::MultiValues`
|
||||
/// in your schema
|
||||
/// - add your document simply by calling `.add_document(...)`.
|
||||
///
|
||||
/// The `MultiValuedFastFieldWriter` can be acquired from the
|
||||
|
||||
pub struct U128MultiValueFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u128>,
|
||||
doc_index: Vec<u64>,
|
||||
}
|
||||
|
||||
impl U128MultiValueFastFieldWriter {
|
||||
/// Creates a new `U128MultiValueFastFieldWriter`
|
||||
pub(crate) fn new(field: Field) -> Self {
|
||||
U128MultiValueFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.capacity() * std::mem::size_of::<UnorderedTermId>()
|
||||
+ self.doc_index.capacity() * std::mem::size_of::<u64>()
|
||||
}
|
||||
|
||||
/// Finalize the current document.
|
||||
pub(crate) fn next_doc(&mut self) {
|
||||
self.doc_index.push(self.vals.len() as u64);
|
||||
}
|
||||
|
||||
/// Pushes a new value to the current document.
|
||||
pub(crate) fn add_val(&mut self, val: u128) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Shift to the next document and adds
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field == self.field {
|
||||
let value = field_value.value();
|
||||
let ip_addr = value.as_ip().unwrap();
|
||||
let value = ip_to_u128(ip_addr);
|
||||
self.add_val(value);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over values per doc_id in ascending doc_id order.
|
||||
///
|
||||
/// Normally the order is simply iterating self.doc_id_index.
|
||||
/// With doc_id_map it accounts for the new mapping, returning values in the order of the
|
||||
/// new doc_ids.
|
||||
fn get_ordered_values<'a: 'b, 'b>(
|
||||
&'a self,
|
||||
doc_id_map: Option<&'b DocIdMapping>,
|
||||
) -> impl Iterator<Item = &'b [u128]> {
|
||||
get_ordered_values(&self.vals, &self.doc_index, doc_id_map)
|
||||
}
|
||||
|
||||
/// Serializes fast field values.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
{
|
||||
// writing the offset index
|
||||
let mut doc_index_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, 0, self.vals.len() as u64, 0)?;
|
||||
|
||||
let mut offset = 0;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_index_serializer.add_val(offset)?;
|
||||
offset += vals.len() as u64;
|
||||
}
|
||||
doc_index_serializer.add_val(self.vals.len() as u64)?;
|
||||
|
||||
doc_index_serializer.close_field()?;
|
||||
}
|
||||
{
|
||||
let field_write = serializer.get_field_writer(self.field, 1);
|
||||
let compressor = IntervalCompressor::from_vals(self.vals.to_vec());
|
||||
let iter = self.get_ordered_values(doc_id_map).flatten().cloned();
|
||||
compressor.compress_into(iter, field_write)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns an iterator over values per doc_id in ascending doc_id order.
|
||||
///
|
||||
/// Normally the order is simply iterating self.doc_id_index.
|
||||
/// With doc_id_map it accounts for the new mapping, returning values in the order of the
|
||||
/// new doc_ids.
|
||||
fn get_ordered_values<'a: 'b, 'b, T>(
|
||||
vals: &'a [T],
|
||||
doc_index: &'a [u64],
|
||||
doc_id_map: Option<&'b DocIdMapping>,
|
||||
) -> impl Iterator<Item = &'b [T]> {
|
||||
let doc_id_iter: Box<dyn Iterator<Item = u32>> = if let Some(doc_id_map) = doc_id_map {
|
||||
Box::new(doc_id_map.iter_old_doc_ids())
|
||||
} else {
|
||||
let max_doc = doc_index.len() as DocId;
|
||||
Box::new(0..max_doc)
|
||||
};
|
||||
doc_id_iter.map(move |doc_id| get_values_for_doc_id(doc_id, vals, doc_index))
|
||||
}
|
||||
|
||||
/// Returns the slice of `vals` that belongs to `doc_id`.
///
/// `doc_index[doc_id]` is the offset of the document's first value; its values end where
/// the values of the next document begin.
fn get_values_for_doc_id<'a, T>(doc_id: u32, vals: &'a [T], doc_index: &'a [u64]) -> &'a [T] {
    let doc = doc_id as usize;
    let start = doc_index[doc] as usize;
    // The last document has no successor entry: its values run to the end of `vals`.
    let end = match doc_index.get(doc + 1) {
        Some(&next_start) => next_start as usize,
        None => vals.len(),
    };
    &vals[start..end]
}
|
||||
|
||||
@@ -1,20 +1,25 @@
|
||||
use std::collections::HashMap;
|
||||
use std::marker::PhantomData;
|
||||
use std::ops::RangeInclusive;
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecReaderU128, FastFieldCodecSerializer};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
||||
|
||||
use super::{FastValue, FastValueU128, GCDFastFieldCodec, GCD_CODEC_ID};
|
||||
use super::FastValue;
|
||||
use crate::directory::{CompositeFile, Directory, FileSlice, OwnedBytes, RamDirectory, WritePtr};
|
||||
use crate::fastfield::{CompositeFastFieldSerializer, FastFieldsWriter};
|
||||
use crate::schema::{Schema, FAST};
|
||||
@@ -71,38 +76,31 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||
|
||||
/// GCD and Bitpacked compressed fastfield data.
|
||||
BitpackedGCD(FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<BitpackedReader>>),
|
||||
/// GCD and Linear interpolated values + bitpacked
|
||||
LinearInterpolGCD(
|
||||
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<LinearInterpolFastFieldReader>>,
|
||||
),
|
||||
/// GCD and Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpolGCD(
|
||||
FastFieldReaderCodecWrapper<Item, GCDFastFieldCodec<MultiLinearInterpolFastFieldReader>>,
|
||||
),
|
||||
/// Piecewise linear interpolated values + bitpacked
|
||||
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
|
||||
}
|
||||
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open_from_id(
|
||||
mut bytes: OwnedBytes,
|
||||
codec_id: u8,
|
||||
) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let reader = match codec_id {
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let id = bytes.read_u8();
|
||||
|
||||
let reader = match id {
|
||||
BitpackedFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::Bitpacked(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
BitpackedReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
@@ -111,59 +109,21 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
_ if codec_id == GCD_CODEC_ID => {
|
||||
let codec_id = bytes.read_u8();
|
||||
|
||||
match codec_id {
|
||||
BitpackedFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::BitpackedGCD(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
GCDFastFieldCodec<BitpackedReader>,
|
||||
>::open_from_bytes(
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpolGCD(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
GCDFastFieldCodec<LinearInterpolFastFieldReader>,
|
||||
>::open_from_bytes(
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpolGCD(
|
||||
FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
GCDFastFieldCodec<MultiLinearInterpolFastFieldReader>,
|
||||
>::open_from_bytes(bytes)?,
|
||||
)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield codec id {:?}. Data corrupted or using old tantivy \
|
||||
version.",
|
||||
codec_id
|
||||
)
|
||||
}
|
||||
}
|
||||
PiecewiseLinearFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::PiecewiseLinear(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield codec id {:?}. Data corrupted or using old tantivy version.",
|
||||
codec_id
|
||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||
id
|
||||
)
|
||||
}
|
||||
};
|
||||
Ok(reader)
|
||||
}
|
||||
/// Returns correct the reader wrapped in the `DynamicFastFieldReader` enum for the data.
|
||||
pub fn open(file: FileSlice) -> crate::Result<DynamicFastFieldReader<Item>> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let codec_id = bytes.read_u8();
|
||||
|
||||
Self::open_from_id(bytes, codec_id)
|
||||
}
|
||||
}
|
||||
|
||||
impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
@@ -173,9 +133,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
Self::LinearInterpol(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
Self::BitpackedGCD(reader) => reader.get(doc),
|
||||
Self::LinearInterpolGCD(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpolGCD(reader) => reader.get(doc),
|
||||
Self::PiecewiseLinear(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
@@ -184,9 +142,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::BitpackedGCD(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpolGCD(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpolGCD(reader) => reader.get_range(start, output),
|
||||
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
@@ -194,9 +150,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::LinearInterpol(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||
Self::BitpackedGCD(reader) => reader.min_value(),
|
||||
Self::LinearInterpolGCD(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpolGCD(reader) => reader.min_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
@@ -204,85 +158,11 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::LinearInterpol(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||
Self::BitpackedGCD(reader) => reader.max_value(),
|
||||
Self::LinearInterpolGCD(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpolGCD(reader) => reader.max_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec used to read the data.
|
||||
#[derive(Clone)]
|
||||
pub struct FastFieldReaderCodecWrapperU128<Item: FastValueU128, CodecReader> {
|
||||
reader: CodecReader,
|
||||
bytes: OwnedBytes,
|
||||
_phantom: PhantomData<Item>,
|
||||
}
|
||||
|
||||
impl<Item: FastValueU128, C: FastFieldCodecReaderU128> FastFieldReaderCodecWrapperU128<Item, C> {
|
||||
/// Opens a fast field given the bytes.
|
||||
pub fn open_from_bytes(bytes: OwnedBytes) -> crate::Result<Self> {
|
||||
let reader = C::open_from_bytes(bytes.as_slice())?;
|
||||
Ok(Self {
|
||||
reader,
|
||||
bytes,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
|
||||
/// Returns the item for the docid, if present
|
||||
pub fn get_val(&self, doc: u64) -> Option<Item> {
|
||||
self.reader
|
||||
.get(doc, self.bytes.as_slice())
|
||||
.map(|el| Item::from_u128(el))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
/// It works as follows... A first column contains the list of start index
|
||||
/// for each document, a second column contains the actual values.
|
||||
///
|
||||
/// The values associated to a given doc, are then
|
||||
/// `second_column[first_column.get(doc)..first_column.get(doc+1)]`.
|
||||
///
|
||||
/// Which means single value fast field reader can be indexed internally with
|
||||
/// something different from a `DocId`. For this use case, we want to use `u64`
|
||||
/// values.
|
||||
///
|
||||
/// See `get_range` for an actual documentation about this method.
|
||||
pub(crate) fn get_range(&self, start: u64, output: &mut [Item]) {
|
||||
for (i, out) in output.iter_mut().enumerate() {
|
||||
if let Some(val) = self.get_val(start + (i as u64)) {
|
||||
*out = val
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Iterates over all elements in the fast field
|
||||
pub fn iter(&self) -> impl Iterator<Item = Option<Item>> + '_ {
|
||||
self.reader
|
||||
.iter(self.bytes.as_slice())
|
||||
.map(|el| el.map(Item::from_u128))
|
||||
}
|
||||
|
||||
/// Returns all docids which are in the provided value range
|
||||
pub fn get_between_vals(&self, range: RangeInclusive<Item>) -> Vec<usize> {
|
||||
let range = range.start().to_u128()..=range.end().to_u128();
|
||||
self.reader.get_between_vals(range, self.bytes.as_slice())
|
||||
}
|
||||
|
||||
/// Return min_value.
|
||||
pub fn min_value(&self) -> Item {
|
||||
Item::from_u128(self.reader.min_value())
|
||||
}
|
||||
|
||||
/// Return max_value.
|
||||
pub fn max_value(&self) -> Item {
|
||||
Item::from_u128(self.reader.max_value())
|
||||
}
|
||||
}
|
||||
|
||||
/// Wrapper for accessing a fastfield.
|
||||
///
|
||||
/// Holds the data and the codec used to read the data.
|
||||
@@ -297,10 +177,10 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
/// Opens a fast field given a file.
|
||||
pub fn open(file: FileSlice) -> crate::Result<Self> {
|
||||
let mut bytes = file.read_bytes()?;
|
||||
let codec_id = bytes.read_u8();
|
||||
let id = u8::deserialize(&mut bytes)?;
|
||||
assert_eq!(
|
||||
BitpackedFastFieldSerializer::ID,
|
||||
codec_id,
|
||||
id,
|
||||
"Tried to open fast field as bitpacked encoded (id=1), but got serializer with \
|
||||
different id"
|
||||
);
|
||||
@@ -315,10 +195,12 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
let data = self.reader.get_u64(doc, self.bytes.as_slice());
|
||||
Item::from_u64(data)
|
||||
|
||||
/// Get u64 for indice `idx`.
|
||||
/// `idx` can be either a `DocId` or an index used for
|
||||
/// `multivalued` fast field. See [`get_range`] for more details.
|
||||
pub(crate) fn get_u64(&self, idx: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(idx, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally `multivalued` also use SingleValue Fast fields.
|
||||
|
||||
@@ -1,9 +1,4 @@
|
||||
use std::net::IpAddr;
|
||||
|
||||
use fastfield_codecs::ip_codec::IntervallDecompressor;
|
||||
|
||||
use super::multivalued::MultiValuedU128FastFieldReader;
|
||||
use super::reader::{DynamicFastFieldReader, FastFieldReaderCodecWrapperU128};
|
||||
use super::reader::DynamicFastFieldReader;
|
||||
use crate::directory::{CompositeFile, FileSlice};
|
||||
use crate::fastfield::{
|
||||
BytesFastFieldReader, FastFieldNotAvailableError, FastValue, MultiValuedFastFieldReader,
|
||||
@@ -25,9 +20,7 @@ pub struct FastFieldReaders {
|
||||
pub(crate) enum FastType {
|
||||
I64,
|
||||
U64,
|
||||
U128,
|
||||
F64,
|
||||
Bool,
|
||||
Date,
|
||||
}
|
||||
|
||||
@@ -42,19 +35,10 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType,
|
||||
FieldType::F64(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::F64, cardinality)),
|
||||
FieldType::Bool(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Bool, cardinality)),
|
||||
FieldType::Date(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some((FastType::U64, Cardinality::MultiValues))
|
||||
}
|
||||
FieldType::Ip(options) => options
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::U128, cardinality)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
@@ -146,69 +130,6 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `ip` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn ip_addr(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<FastFieldReaderCodecWrapperU128<IpAddr, IntervallDecompressor>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::SingleValue)?;
|
||||
let fast_field_slice = self.fast_field_data(field, 0)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
FastFieldReaderCodecWrapperU128::<IpAddr, IntervallDecompressor>::open_from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// Returns the `ip` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn ip_addrs(&self, field: Field) -> crate::Result<MultiValuedU128FastFieldReader<IpAddr>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: DynamicFastFieldReader<u64> = self.typed_fast_field_reader(field)?;
|
||||
|
||||
let fast_field_slice = self.fast_field_data(field, 1)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
|
||||
let vals_reader =
|
||||
FastFieldReaderCodecWrapperU128::<IpAddr, IntervallDecompressor>::open_from_bytes(
|
||||
bytes,
|
||||
)?;
|
||||
Ok(MultiValuedU128FastFieldReader::open(
|
||||
idx_reader,
|
||||
vals_reader,
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the `u128` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 fast field, this method returns an Error.
|
||||
pub fn u128(
|
||||
&self,
|
||||
field: Field,
|
||||
) -> crate::Result<FastFieldReaderCodecWrapperU128<u128, IntervallDecompressor>> {
|
||||
let fast_field_slice = self.fast_field_data(field, 0)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
FastFieldReaderCodecWrapperU128::<u128, IntervallDecompressor>::open_from_bytes(bytes)
|
||||
}
|
||||
|
||||
/// Returns the `u128` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u128 multi-valued fast field, this method returns an Error.
|
||||
pub fn u128s(&self, field: Field) -> crate::Result<MultiValuedU128FastFieldReader<u128>> {
|
||||
self.check_type(field, FastType::U128, Cardinality::MultiValues)?;
|
||||
let idx_reader: DynamicFastFieldReader<u64> = self.typed_fast_field_reader(field)?;
|
||||
|
||||
let fast_field_slice = self.fast_field_data(field, 1)?;
|
||||
let bytes = fast_field_slice.read_bytes()?;
|
||||
|
||||
let vals_reader =
|
||||
FastFieldReaderCodecWrapperU128::<u128, IntervallDecompressor>::open_from_bytes(bytes)?;
|
||||
Ok(MultiValuedU128FastFieldReader::open(
|
||||
idx_reader,
|
||||
vals_reader,
|
||||
))
|
||||
}
|
||||
|
||||
/// Returns the `u64` fast field reader reader associated to `field`, regardless of whether the
|
||||
/// given field is effectively of type `u64` or not.
|
||||
///
|
||||
@@ -242,14 +163,6 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns the `bool` fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bool fast field, this method returns an Error.
|
||||
pub fn bool(&self, field: Field) -> crate::Result<DynamicFastFieldReader<bool>> {
|
||||
self.check_type(field, FastType::Bool, Cardinality::SingleValue)?;
|
||||
self.typed_fast_field_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `u64s` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a u64 multi-valued fast field, this method returns an Error.
|
||||
@@ -282,14 +195,6 @@ impl FastFieldReaders {
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `bools` multi-valued fast field reader reader associated to `field`.
|
||||
///
|
||||
/// If `field` is not a bool multi-valued fast field, this method returns an Error.
|
||||
pub fn bools(&self, field: Field) -> crate::Result<MultiValuedFastFieldReader<bool>> {
|
||||
self.check_type(field, FastType::Bool, Cardinality::MultiValues)?;
|
||||
self.typed_fast_field_multi_reader(field)
|
||||
}
|
||||
|
||||
/// Returns a `time::OffsetDateTime` multi-valued fast field reader reader associated to
|
||||
/// `field`.
|
||||
///
|
||||
|
||||
@@ -4,14 +4,11 @@ use common::{BinarySerializable, CountingWriter};
|
||||
pub use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
|
||||
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::{find_gcd, FastFieldCodecName, ALL_CODECS, GCD_DEFAULT};
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::fastfield::gcd::write_gcd_header;
|
||||
use crate::fastfield::GCD_CODEC_ID;
|
||||
use crate::schema::Field;
|
||||
|
||||
/// `CompositeFastFieldSerializer` is in charge of serializing
|
||||
@@ -36,262 +33,124 @@ use crate::schema::Field;
|
||||
/// * `close()`
|
||||
pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
codec_enable_checker: FastFieldCodecEnableCheck,
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
pub struct FastFieldCodecEnableCheck {
|
||||
enabled_codecs: Vec<FastFieldCodecName>,
|
||||
}
|
||||
impl FastFieldCodecEnableCheck {
|
||||
fn allow_all() -> Self {
|
||||
FastFieldCodecEnableCheck {
|
||||
enabled_codecs: ALL_CODECS.to_vec(),
|
||||
}
|
||||
}
|
||||
fn is_enabled(&self, codec_name: FastFieldCodecName) -> bool {
|
||||
self.enabled_codecs.contains(&codec_name)
|
||||
}
|
||||
#[derive(Debug)]
|
||||
pub struct CodecEstimationResult<'a> {
|
||||
pub ratio: f32,
|
||||
pub name: &'a str,
|
||||
pub id: u8,
|
||||
}
|
||||
|
||||
impl From<FastFieldCodecName> for FastFieldCodecEnableCheck {
|
||||
fn from(codec_name: FastFieldCodecName) -> Self {
|
||||
FastFieldCodecEnableCheck {
|
||||
enabled_codecs: vec![codec_name],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// TODO: use this when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
) -> CodecEstimationResult {
|
||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
return CodecEstimationResult {
|
||||
ratio: f32::MAX,
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
};
|
||||
}
|
||||
CodecEstimationResult {
|
||||
ratio: T::estimate_compression_ratio(fastfield_accessor, stats),
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
}
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
/// Constructor
|
||||
pub fn from_write(write: WritePtr) -> io::Result<CompositeFastFieldSerializer> {
|
||||
Self::from_write_with_codec(write, FastFieldCodecEnableCheck::allow_all())
|
||||
}
|
||||
|
||||
/// Constructor
|
||||
pub fn from_write_with_codec(
|
||||
write: WritePtr,
|
||||
codec_enable_checker: FastFieldCodecEnableCheck,
|
||||
) -> io::Result<CompositeFastFieldSerializer> {
|
||||
// just making room for the pointer to header.
|
||||
let composite_write = CompositeWrite::wrap(write);
|
||||
Ok(CompositeFastFieldSerializer {
|
||||
composite_write,
|
||||
codec_enable_checker,
|
||||
})
|
||||
Ok(CompositeFastFieldSerializer { composite_write })
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field<F, I>(
|
||||
pub fn new_u64_fast_field_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
iter_gen: F,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
F: Fn() -> I,
|
||||
I: Iterator<Item = u64>,
|
||||
{
|
||||
self.create_auto_detect_u64_fast_field_with_idx(
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
self.new_u64_fast_field_with_idx_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter_gen,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
0,
|
||||
)
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn write_header<W: Write>(field_write: &mut W, codec_id: u8) -> io::Result<()> {
|
||||
codec_id.serialize(field_write)?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx<F, I>(
|
||||
pub fn new_u64_fast_field_with_idx_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
iter_gen: F,
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
idx: usize,
|
||||
) -> io::Result<()>
|
||||
where
|
||||
F: Fn() -> I,
|
||||
I: Iterator<Item = u64>,
|
||||
{
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
let gcd = find_gcd(iter_gen().map(|val| val - stats.min_value)).unwrap_or(GCD_DEFAULT);
|
||||
|
||||
if gcd == 1 {
|
||||
return Self::create_auto_detect_u64_fast_field_with_idx_gcd(
|
||||
self.codec_enable_checker.clone(),
|
||||
field,
|
||||
field_write,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter_gen(),
|
||||
iter_gen(),
|
||||
);
|
||||
}
|
||||
|
||||
Self::write_header(field_write, GCD_CODEC_ID)?;
|
||||
struct GCDWrappedFFAccess<T: FastFieldDataAccess> {
|
||||
fastfield_accessor: T,
|
||||
min_value: u64,
|
||||
gcd: u64,
|
||||
}
|
||||
impl<T: FastFieldDataAccess> FastFieldDataAccess for GCDWrappedFFAccess<T> {
|
||||
fn get_val(&self, position: u64) -> u64 {
|
||||
(self.fastfield_accessor.get_val(position) - self.min_value) / self.gcd
|
||||
}
|
||||
}
|
||||
|
||||
let fastfield_accessor = GCDWrappedFFAccess {
|
||||
fastfield_accessor,
|
||||
min_value: stats.min_value,
|
||||
gcd,
|
||||
};
|
||||
|
||||
let min_value = stats.min_value;
|
||||
let stats = FastFieldStats {
|
||||
min_value: 0,
|
||||
max_value: (stats.max_value - stats.min_value) / gcd,
|
||||
num_vals: stats.num_vals,
|
||||
};
|
||||
let iter1 = iter_gen().map(|val| (val - min_value) / gcd);
|
||||
let iter2 = iter_gen().map(|val| (val - min_value) / gcd);
|
||||
Self::create_auto_detect_u64_fast_field_with_idx_gcd(
|
||||
self.codec_enable_checker.clone(),
|
||||
field,
|
||||
field_write,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter1,
|
||||
iter2,
|
||||
)?;
|
||||
write_gcd_header(field_write, min_value, gcd)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx_gcd<W: Write>(
|
||||
codec_enable_checker: FastFieldCodecEnableCheck,
|
||||
field: Field,
|
||||
field_write: &mut CountingWriter<W>,
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: impl FastFieldDataAccess,
|
||||
iter1: impl Iterator<Item = u64>,
|
||||
iter2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut estimations = vec![];
|
||||
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::Bitpacked) {
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
let estimations = vec![
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(stats.clone(), &fastfield_accessor),
|
||||
codec_estimation::<PiecewiseLinearFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::LinearInterpol) {
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
}
|
||||
if codec_enable_checker.is_enabled(FastFieldCodecName::BlockwiseLinearInterpol) {
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
}
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables
|
||||
// codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
),
|
||||
];
|
||||
let best_codec_result = estimations
|
||||
.iter()
|
||||
.sorted_by(|result_a, result_b| {
|
||||
result_a
|
||||
.ratio
|
||||
.partial_cmp(&result_b.ratio)
|
||||
.expect("Ratio cannot be nan.")
|
||||
})
|
||||
.next()
|
||||
.expect("A codec must be present.");
|
||||
debug!(
|
||||
"choosing fast field codec {} for field_id {:?}",
|
||||
name, field
|
||||
); // todo print actual field name
|
||||
|
||||
Self::write_header(field_write, id)?;
|
||||
match name {
|
||||
"Choosing fast field codec {} for field_id {:?} among {:?}",
|
||||
best_codec_result.name, field, estimations,
|
||||
);
|
||||
best_codec_result.id.serialize(field_write)?;
|
||||
match best_codec_result.name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
iter1,
|
||||
iter2,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
PiecewiseLinearFastFieldSerializer::NAME => {
|
||||
PiecewiseLinearFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
iter1,
|
||||
iter2,
|
||||
)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
iter1,
|
||||
iter2,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
panic!("unknown fastfield serializer {}", best_codec_result.name)
|
||||
}
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn serialize_into(
|
||||
&mut self,
|
||||
field: Field,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
) -> io::Result<BitpackedFastFieldSerializerLegacy<'_, CountingWriter<WritePtr>>> {
|
||||
self.new_u64_fast_field_with_idx(field, min_value, max_value, 0)
|
||||
}
|
||||
|
||||
/// Start serializing a new u64 fast field
|
||||
pub fn new_u64_fast_field(
|
||||
&mut self,
|
||||
@@ -327,11 +186,6 @@ impl CompositeFastFieldSerializer {
|
||||
FastBytesFieldSerializer { write: field_write }
|
||||
}
|
||||
|
||||
/// Gets the underlying writer
|
||||
pub fn get_field_writer(&mut self, field: Field, idx: usize) -> &mut impl Write {
|
||||
self.composite_write.for_field_with_idx(field, idx)
|
||||
}
|
||||
|
||||
/// Closes the serializer
|
||||
///
|
||||
/// After this call the data must be persistently saved on disk.
|
||||
@@ -353,3 +207,45 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::FastFieldStats;
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::CompositeFastFieldSerializer;
|
||||
use crate::directory::{RamDirectory, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::Directory;
|
||||
|
||||
#[test]
|
||||
fn new_u64_fast_field_with_best_codec() -> crate::Result<()> {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let path = Path::new("test");
|
||||
let write: WritePtr = directory.open_write(path)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let vals = (0..10000u64).into_iter().collect_vec();
|
||||
let stats = FastFieldStats {
|
||||
min_value: 0,
|
||||
max_value: 9999,
|
||||
num_vals: vals.len() as u64,
|
||||
};
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
Field::from_field_id(0),
|
||||
stats,
|
||||
vals.clone(),
|
||||
vals.clone().into_iter(),
|
||||
vals.into_iter(),
|
||||
)?;
|
||||
serializer.close()?;
|
||||
// get the codecs id
|
||||
let mut bytes = directory.open_read(path)?.read_bytes()?;
|
||||
let codec_id = u8::deserialize(&mut bytes)?;
|
||||
// Codec id = 4 is piecewise linear.
|
||||
assert_eq!(codec_id, 4);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,27 +2,21 @@ use std::collections::HashMap;
|
||||
use std::io;
|
||||
|
||||
use common;
|
||||
use fastfield_codecs::ip_codec::{ip_to_u128, IntervalCompressor};
|
||||
use fnv::FnvHashMap;
|
||||
use roaring::RoaringBitmap;
|
||||
use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::multivalued::{MultiValuedFastFieldWriter, U128MultiValueFastFieldWriter};
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::{FastFieldDataAccess, FastFieldType, FastValue};
|
||||
use super::FastFieldDataAccess;
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema, Value};
|
||||
use crate::schema::{Cardinality, Document, Field, FieldEntry, FieldType, Schema};
|
||||
use crate::termdict::TermOrdinal;
|
||||
use crate::DatePrecision;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
term_id_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
u128_value_writers: Vec<U128FastFieldWriter>,
|
||||
u128_multi_value_writers: Vec<U128MultiValueFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
}
|
||||
@@ -38,10 +32,7 @@ fn fast_field_default_value(field_entry: &FieldEntry) -> u64 {
|
||||
impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut u128_value_writers = Vec::new();
|
||||
let mut u128_multi_value_writers = Vec::new();
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut term_id_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
@@ -50,52 +41,24 @@ impl FastFieldsWriter {
|
||||
FieldType::I64(ref int_options)
|
||||
| FieldType::U64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Bool(ref int_options) => {
|
||||
| FieldType::Date(ref int_options) => {
|
||||
match int_options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field, None);
|
||||
let mut fast_field_writer = IntFastFieldWriter::new(field);
|
||||
let default_value = fast_field_default_value(field_entry);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(
|
||||
field,
|
||||
FastFieldType::Numeric,
|
||||
None,
|
||||
);
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
FieldType::Date(ref options) => match options.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let mut fast_field_writer =
|
||||
IntFastFieldWriter::new(field, Some(options.get_precision()));
|
||||
let default_value = fast_field_default_value(field_entry);
|
||||
fast_field_writer.set_val_if_missing(default_value);
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(
|
||||
field,
|
||||
FastFieldType::Numeric,
|
||||
Some(options.get_precision()),
|
||||
);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
},
|
||||
FieldType::Facet(_) => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet, None);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Str(_) if field_entry.is_fast() => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::String, None);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes(bytes_option) => {
|
||||
if bytes_option.is_fast() {
|
||||
@@ -103,28 +66,10 @@ impl FastFieldsWriter {
|
||||
bytes_value_writers.push(fast_field_writer);
|
||||
}
|
||||
}
|
||||
FieldType::Ip(opt) => {
|
||||
if opt.is_fast() {
|
||||
match opt.get_fastfield_cardinality() {
|
||||
Some(Cardinality::SingleValue) => {
|
||||
let fast_field_writer = U128FastFieldWriter::new(field);
|
||||
u128_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer = U128MultiValueFastFieldWriter::new(field);
|
||||
u128_multi_value_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
u128_value_writers,
|
||||
u128_multi_value_writers,
|
||||
term_id_writers,
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
@@ -133,15 +78,10 @@ impl FastFieldsWriter {
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.term_id_writers
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
@@ -152,24 +92,6 @@ impl FastFieldsWriter {
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.u128_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.u128_multi_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
@@ -188,17 +110,6 @@ impl FastFieldsWriter {
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
@@ -223,11 +134,9 @@ impl FastFieldsWriter {
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.term_id_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
@@ -237,12 +146,6 @@ impl FastFieldsWriter {
|
||||
for field_writer in &mut self.bytes_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.u128_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.u128_multi_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
}
|
||||
|
||||
/// Serializes all of the `FastFieldWriter`s by pushing them in
|
||||
@@ -253,10 +156,6 @@ impl FastFieldsWriter {
|
||||
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.term_id_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
@@ -268,129 +167,6 @@ impl FastFieldsWriter {
|
||||
for field_writer in &self.bytes_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.u128_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.u128_multi_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
/// Fast field writer for u128 values.
|
||||
/// The fast field writer just keeps the values in memory.
|
||||
///
|
||||
/// Only when the segment writer can be closed and
|
||||
/// persisted on disc, the fast field writer is
|
||||
/// sent to a `FastFieldSerializer` via the `.serialize(...)`
|
||||
/// method.
|
||||
///
|
||||
/// We cannot serialize earlier as the values are
|
||||
/// compressed to a compact number space and the number of
|
||||
/// bits required for bitpacking can only been known once
|
||||
/// we have seen all of the values.
|
||||
pub struct U128FastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<u128>,
|
||||
val_count: u32,
|
||||
|
||||
null_values: RoaringBitmap,
|
||||
}
|
||||
|
||||
impl U128FastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field) -> Self {
|
||||
Self {
|
||||
field,
|
||||
vals: vec![],
|
||||
val_count: 0,
|
||||
null_values: RoaringBitmap::new(),
|
||||
}
|
||||
}
|
||||
|
||||
/// The memory used (inclusive childs)
|
||||
pub fn mem_usage(&self) -> usize {
|
||||
self.vals.len() * 16
|
||||
}
|
||||
|
||||
/// Records a new value.
|
||||
///
|
||||
/// The n-th value being recorded is implicitely
|
||||
/// associated to the document with the `DocId` n.
|
||||
/// (Well, `n-1` actually because of 0-indexing)
|
||||
pub fn add_val(&mut self, val: u128) {
|
||||
self.vals.push(val);
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
let ip_addr = v.as_ip().unwrap();
|
||||
let value = ip_to_u128(ip_addr);
|
||||
self.add_val(value);
|
||||
}
|
||||
None => {
|
||||
self.null_values.insert(self.val_count as u32);
|
||||
}
|
||||
};
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
/// Push the fast fields value to the `FastFieldWriter`.
|
||||
pub fn serialize(
|
||||
&self,
|
||||
serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
let mut field_write = serializer.get_field_writer(self.field, 0);
|
||||
let compressor = IntervalCompressor::from_vals(self.vals.to_vec());
|
||||
|
||||
let mut val_idx = 0;
|
||||
let mut get_val = |idx| {
|
||||
if self.null_values.contains(idx as u32) {
|
||||
compressor.null_value
|
||||
} else {
|
||||
let val = self.vals[val_idx];
|
||||
val_idx += 1;
|
||||
val
|
||||
}
|
||||
};
|
||||
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
// To get the actual value, we could materialize the vec with u128 including nulls, but
|
||||
// that could cost a lot of memory. Instead we just compute the index for of
|
||||
// the values
|
||||
let mut idx_to_val_idx = vec![];
|
||||
idx_to_val_idx.resize(self.val_count as usize, 0);
|
||||
|
||||
let mut val_idx = 0;
|
||||
for idx in 0..self.val_count {
|
||||
if !self.null_values.contains(idx as u32) {
|
||||
idx_to_val_idx[idx as usize] = val_idx as u32;
|
||||
val_idx += 1;
|
||||
}
|
||||
}
|
||||
|
||||
let iter = doc_id_map.iter_old_doc_ids().map(|idx| {
|
||||
if self.null_values.contains(idx as u32) {
|
||||
compressor.null_value
|
||||
} else {
|
||||
self.vals[idx_to_val_idx[idx as usize] as usize]
|
||||
}
|
||||
});
|
||||
compressor.compress_into(iter, &mut field_write)?;
|
||||
} else {
|
||||
let iter = (0..self.val_count).map(&mut get_val);
|
||||
compressor.compress_into(iter, &mut field_write)?;
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
@@ -412,7 +188,6 @@ impl U128FastFieldWriter {
|
||||
/// using `common::i64_to_u64` and `common::f64_to_u64`.
|
||||
pub struct IntFastFieldWriter {
|
||||
field: Field,
|
||||
precision_opt: Option<DatePrecision>,
|
||||
vals: BlockedBitpacker,
|
||||
val_count: usize,
|
||||
val_if_missing: u64,
|
||||
@@ -422,14 +197,13 @@ pub struct IntFastFieldWriter {
|
||||
|
||||
impl IntFastFieldWriter {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub fn new(field: Field, precision_opt: Option<DatePrecision>) -> IntFastFieldWriter {
|
||||
pub fn new(field: Field) -> IntFastFieldWriter {
|
||||
IntFastFieldWriter {
|
||||
field,
|
||||
precision_opt,
|
||||
vals: BlockedBitpacker::new(),
|
||||
val_count: 0,
|
||||
val_if_missing: 0u64,
|
||||
val_min: u64::MAX,
|
||||
val_min: u64::max_value(),
|
||||
val_max: 0,
|
||||
}
|
||||
}
|
||||
@@ -439,7 +213,7 @@ impl IntFastFieldWriter {
|
||||
self.vals.mem_usage()
|
||||
}
|
||||
|
||||
/// Returns the field that this writer is targeting.
|
||||
/// Returns the field that this writer is targetting.
|
||||
pub fn field(&self) -> Field {
|
||||
self.field
|
||||
}
|
||||
@@ -470,10 +244,6 @@ impl IntFastFieldWriter {
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
///
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
@@ -484,23 +254,18 @@ impl IntFastFieldWriter {
|
||||
/// instead.
|
||||
/// If the document has more than one value for the given field,
|
||||
/// only the first one is taken in account.
|
||||
///
|
||||
/// Values on text fast fields are skipped.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
let value = match (self.precision_opt, v) {
|
||||
(Some(precision), Value::Date(date_val)) => {
|
||||
date_val.truncate(precision).to_u64()
|
||||
}
|
||||
_ => super::value_to_u64(v),
|
||||
};
|
||||
self.add_val(value);
|
||||
}
|
||||
None => {
|
||||
self.add_val(self.val_if_missing);
|
||||
}
|
||||
};
|
||||
Some(v) => super::value_to_u64(v),
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
let val = self.extract_val(doc);
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// get iterator over the data
|
||||
@@ -519,7 +284,6 @@ impl IntFastFieldWriter {
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
@@ -531,25 +295,23 @@ impl IntFastFieldWriter {
|
||||
};
|
||||
|
||||
if let Some(doc_id_map) = doc_id_map {
|
||||
let iter_gen = || {
|
||||
doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize))
|
||||
};
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
let iter = doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize));
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter_gen,
|
||||
iter.clone(),
|
||||
iter,
|
||||
)?;
|
||||
} else {
|
||||
let iter_gen = || self.vals.iter();
|
||||
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
iter_gen,
|
||||
self.vals.iter(),
|
||||
self.vals.iter(),
|
||||
)?;
|
||||
};
|
||||
Ok(())
|
||||
|
||||
@@ -294,7 +294,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_u32_max() {
|
||||
assert_eq!(fieldnorm_to_id(u32::MAX), u8::MAX);
|
||||
assert_eq!(fieldnorm_to_id(u32::max_value()), u8::max_value());
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -2,12 +2,12 @@
|
||||
//! a given Field of a given document.
|
||||
//!
|
||||
//! This metric is important to compute the score of a
|
||||
//! document: a document having a query word in one of its short fields
|
||||
//! document : a document having a query word in one its short fields
|
||||
//! (e.g. title) is likely to be more relevant than in one of its longer field
|
||||
//! (e.g. body).
|
||||
//!
|
||||
//! It encodes `fieldnorm` on one byte with some precision loss,
|
||||
//! using the exact same scheme as Lucene. Each value is placed on a log-scale
|
||||
//! using the exact same scheme as Lucene. Each value is place on a log-scale
|
||||
//! that takes values from `0` to `255`.
|
||||
//!
|
||||
//! A value on this scale is identified by a `fieldnorm_id`.
|
||||
@@ -112,7 +112,7 @@ mod tests {
|
||||
Term::from_field_text(text, "hello"),
|
||||
IndexRecordOption::WithFreqs,
|
||||
);
|
||||
let weight = query.weight(&searcher, true)?;
|
||||
let weight = query.weight(&*searcher, true)?;
|
||||
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
|
||||
assert_eq!(scorer.doc(), 0);
|
||||
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
Term::from_field_text(text, "hello"),
|
||||
IndexRecordOption::WithFreqs,
|
||||
);
|
||||
let weight = query.weight(&searcher, true)?;
|
||||
let weight = query.weight(&*searcher, true)?;
|
||||
let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0f32)?;
|
||||
assert_eq!(scorer.doc(), 0);
|
||||
assert!((scorer.score() - 0.22920431).abs() < 0.001f32);
|
||||
|
||||
@@ -40,17 +40,25 @@ impl FieldNormReaders {
|
||||
pub fn space_usage(&self) -> PerFieldSpaceUsage {
|
||||
self.data.space_usage()
|
||||
}
|
||||
|
||||
/// Returns a handle to inner file
|
||||
pub fn get_inner_file(&self) -> Arc<CompositeFile> {
|
||||
self.data.clone()
|
||||
}
|
||||
}
|
||||
|
||||
/// Reads the fieldnorm associated to a document.
|
||||
///
|
||||
/// The [fieldnorm](FieldNormReader::fieldnorm) represents the length associated to
|
||||
/// The fieldnorm represents the length associated to
|
||||
/// a given Field of a given document.
|
||||
///
|
||||
/// This metric is important to compute the score of a
|
||||
/// document : a document having a query word in one its short fields
|
||||
/// (e.g. title) is likely to be more relevant than in one of its longer field
|
||||
/// (e.g. body).
|
||||
///
|
||||
/// tantivy encodes `fieldnorm` on one byte with some precision loss,
|
||||
/// using the same scheme as Lucene. Each value is place on a log-scale
|
||||
/// that takes values from `0` to `255`.
|
||||
///
|
||||
/// A value on this scale is identified by a `fieldnorm_id`.
|
||||
/// Apart from compression, this scale also makes it possible to
|
||||
/// precompute computationally expensive functions of the fieldnorm
|
||||
/// in a very short array.
|
||||
#[derive(Clone)]
|
||||
pub struct FieldNormReader(ReaderImplEnum);
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
||||
assert!(searcher.segment_readers().len() < 20);
|
||||
assert_eq!(searcher.num_docs() as usize, vals.len());
|
||||
for segment_reader in searcher.segment_readers() {
|
||||
let store_reader = segment_reader.get_store_reader(1)?;
|
||||
let store_reader = segment_reader.get_store_reader()?;
|
||||
for doc_id in 0..segment_reader.max_doc() {
|
||||
let _doc = store_reader.get(doc_id)?;
|
||||
}
|
||||
|
||||
@@ -116,14 +116,14 @@ pub fn demux(
|
||||
) -> crate::Result<Vec<Index>> {
|
||||
let mut indices = vec![];
|
||||
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
|
||||
let alive_bitset = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
.into_iter()
|
||||
.map(Some)
|
||||
.collect_vec();
|
||||
let index = merge_filtered_segments(
|
||||
segments,
|
||||
target_settings.clone(),
|
||||
alive_bitset,
|
||||
delete_bitsets,
|
||||
output_directory,
|
||||
)?;
|
||||
indices.push(index);
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
use crate::{DocAddress, Term};
|
||||
|
||||
#[test]
|
||||
fn test_demux_map_to_alive_bitset() {
|
||||
fn test_demux_map_to_deletebitset() {
|
||||
let max_value = 2;
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
// segment ordinal 0 mapping
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
use common::BitSet;
|
||||
use crossbeam::channel;
|
||||
use smallvec::smallvec;
|
||||
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
@@ -29,7 +30,7 @@ pub const MARGIN_IN_BYTES: usize = 1_000_000;
|
||||
|
||||
// We impose the memory per thread to be at least 3 MB.
|
||||
pub const MEMORY_ARENA_NUM_BYTES_MIN: usize = ((MARGIN_IN_BYTES as u32) * 3u32) as usize;
|
||||
pub const MEMORY_ARENA_NUM_BYTES_MAX: usize = u32::MAX as usize - MARGIN_IN_BYTES;
|
||||
pub const MEMORY_ARENA_NUM_BYTES_MAX: usize = u32::max_value() as usize - MARGIN_IN_BYTES;
|
||||
|
||||
// We impose the number of index writter thread to be at most this.
|
||||
pub const MAX_NUM_THREAD: usize = 8;
|
||||
@@ -158,9 +159,9 @@ pub(crate) fn advance_deletes(
|
||||
if num_deleted_docs > num_deleted_docs_before {
|
||||
// There are new deletes. We need to write a new delete file.
|
||||
segment = segment.with_delete_meta(num_deleted_docs as u32, target_opstamp);
|
||||
let mut alive_doc_file = segment.open_write(SegmentComponent::Delete)?;
|
||||
write_alive_bitset(&alive_bitset, &mut alive_doc_file)?;
|
||||
alive_doc_file.terminate()?;
|
||||
let mut delete_file = segment.open_write(SegmentComponent::Delete)?;
|
||||
write_alive_bitset(&alive_bitset, &mut delete_file)?;
|
||||
delete_file.terminate()?;
|
||||
}
|
||||
|
||||
segment_entry.set_meta(segment.meta().clone());
|
||||
@@ -288,7 +289,7 @@ impl IndexWriter {
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
||||
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
@@ -325,7 +326,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
fn drop_sender(&mut self) {
|
||||
let (sender, _receiver) = crossbeam_channel::bounded(1);
|
||||
let (sender, _receiver) = channel::bounded(1);
|
||||
self.operation_sender = sender;
|
||||
}
|
||||
|
||||
@@ -531,7 +532,7 @@ impl IndexWriter {
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) {
|
||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
||||
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
self.operation_sender = document_sender;
|
||||
self.index_writer_status = IndexWriterStatus::from(document_receiver);
|
||||
}
|
||||
@@ -776,7 +777,6 @@ impl Drop for IndexWriter {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::net::IpAddr;
|
||||
|
||||
use proptest::prelude::*;
|
||||
use proptest::prop_oneof;
|
||||
@@ -790,10 +790,9 @@ mod tests {
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::query::{QueryParser, TermQuery};
|
||||
use crate::schema::{
|
||||
self, Cardinality, Facet, FacetOptions, IndexRecordOption, IpOptions, NumericOptions,
|
||||
self, Cardinality, Facet, FacetOptions, IndexRecordOption, NumericOptions,
|
||||
TextFieldIndexing, TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
|
||||
};
|
||||
use crate::store::DOCSTORE_CACHE_CAPACITY;
|
||||
use crate::{DocAddress, Index, IndexSettings, IndexSortByField, Order, ReloadPolicy, Term};
|
||||
|
||||
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do \
|
||||
@@ -1385,14 +1384,8 @@ mod tests {
|
||||
force_end_merge: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut schema_builder = schema::Schema::builder();
|
||||
let ip_field = schema_builder.add_ip_field("ip", FAST | INDEXED | STORED);
|
||||
let ips_field = schema_builder.add_ip_field(
|
||||
"ips",
|
||||
IpOptions::default().set_fast(Cardinality::MultiValues),
|
||||
);
|
||||
let id_field = schema_builder.add_u64_field("id", FAST | INDEXED | STORED);
|
||||
let bytes_field = schema_builder.add_bytes_field("bytes", FAST | INDEXED | STORED);
|
||||
let bool_field = schema_builder.add_bool_field("bool", FAST | INDEXED | STORED);
|
||||
let text_field = schema_builder.add_text_field(
|
||||
"text_field",
|
||||
TextOptions::default()
|
||||
@@ -1411,12 +1404,6 @@ mod tests {
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_stored(),
|
||||
);
|
||||
let multi_bools = schema_builder.add_bool_field(
|
||||
"multi_bools",
|
||||
NumericOptions::default()
|
||||
.set_fast(Cardinality::MultiValues)
|
||||
.set_stored(),
|
||||
);
|
||||
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
|
||||
let schema = schema_builder.build();
|
||||
let settings = if sort_index {
|
||||
@@ -1445,37 +1432,14 @@ mod tests {
|
||||
match op {
|
||||
IndexingOp::AddDoc { id } => {
|
||||
let facet = Facet::from(&("/cola/".to_string() + &id.to_string()));
|
||||
let ip_from_id = IpAddr::from((id as u128).to_be_bytes());
|
||||
|
||||
if id % 3 == 0 {
|
||||
// every 3rd doc has no ip field
|
||||
index_writer.add_document(doc!(id_field=>id,
|
||||
bytes_field => id.to_le_bytes().as_slice(),
|
||||
multi_numbers=> id,
|
||||
multi_numbers => id,
|
||||
bool_field => (id % 2u64) != 0,
|
||||
multi_bools => (id % 2u64) != 0,
|
||||
multi_bools => (id % 2u64) == 0,
|
||||
text_field => id.to_string(),
|
||||
facet_field => facet,
|
||||
large_text_field=> LOREM
|
||||
))?;
|
||||
} else {
|
||||
index_writer.add_document(doc!(id_field=>id,
|
||||
bytes_field => id.to_le_bytes().as_slice(),
|
||||
ip_field => ip_from_id,
|
||||
ips_field => ip_from_id,
|
||||
ips_field => ip_from_id,
|
||||
multi_numbers=> id,
|
||||
multi_numbers => id,
|
||||
bool_field => (id % 2u64) != 0,
|
||||
multi_bools => (id % 2u64) != 0,
|
||||
multi_bools => (id % 2u64) == 0,
|
||||
text_field => id.to_string(),
|
||||
facet_field => facet,
|
||||
large_text_field=> LOREM
|
||||
))?;
|
||||
}
|
||||
index_writer.add_document(doc!(id_field=>id,
|
||||
bytes_field => id.to_le_bytes().as_slice(),
|
||||
multi_numbers=> id,
|
||||
multi_numbers => id,
|
||||
text_field => id.to_string(),
|
||||
facet_field => facet,
|
||||
large_text_field=> LOREM
|
||||
))?;
|
||||
}
|
||||
IndexingOp::DeleteDoc { id } => {
|
||||
index_writer.delete_term(Term::from_field_u64(id_field, id));
|
||||
@@ -1534,104 +1498,47 @@ mod tests {
|
||||
})
|
||||
.collect();
|
||||
|
||||
let (expected_ids_and_num_occurrences, deleted_ids) = expected_ids(ops);
|
||||
let num_docs_expected = expected_ids_and_num_occurrences
|
||||
let (expected_ids_and_num_occurences, deleted_ids) = expected_ids(ops);
|
||||
let num_docs_expected = expected_ids_and_num_occurences
|
||||
.iter()
|
||||
.map(|(_, id_occurrences)| *id_occurrences as usize)
|
||||
.map(|(_, id_occurences)| *id_occurences as usize)
|
||||
.sum::<usize>();
|
||||
assert_eq!(searcher.num_docs() as usize, num_docs_expected);
|
||||
assert_eq!(old_searcher.num_docs() as usize, num_docs_expected);
|
||||
assert_eq!(
|
||||
ids_old_searcher,
|
||||
expected_ids_and_num_occurrences
|
||||
expected_ids_and_num_occurences
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>()
|
||||
);
|
||||
assert_eq!(
|
||||
ids,
|
||||
expected_ids_and_num_occurrences
|
||||
expected_ids_and_num_occurences
|
||||
.keys()
|
||||
.cloned()
|
||||
.collect::<HashSet<_>>()
|
||||
);
|
||||
|
||||
// Check ip addr
|
||||
let ips: HashSet<Option<IpAddr>> = searcher
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader.fast_fields().ip_addr(ip_field).unwrap();
|
||||
segment_reader
|
||||
.doc_ids_alive()
|
||||
.map(move |doc| ff_reader.get_val(doc as u64))
|
||||
})
|
||||
.collect();
|
||||
|
||||
let expected_ips = expected_ids_and_num_occurrences
|
||||
.keys()
|
||||
.map(|id| {
|
||||
if id % 3 == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(IpAddr::from((*id as u128).to_be_bytes()))
|
||||
}
|
||||
})
|
||||
.collect::<HashSet<_>>();
|
||||
assert_eq!(ips, expected_ips);
|
||||
|
||||
let expected_ips = expected_ids_and_num_occurrences
|
||||
.keys()
|
||||
.filter_map(|id| {
|
||||
if id % 3 == 0 {
|
||||
None
|
||||
} else {
|
||||
Some(IpAddr::from((*id as u128).to_be_bytes()))
|
||||
}
|
||||
})
|
||||
.collect::<HashSet<_>>();
|
||||
let ips: HashSet<IpAddr> = searcher
|
||||
.segment_readers()
|
||||
.iter()
|
||||
.flat_map(|segment_reader| {
|
||||
let ff_reader = segment_reader.fast_fields().ip_addrs(ips_field).unwrap();
|
||||
segment_reader.doc_ids_alive().flat_map(move |doc| {
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
vals
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
assert_eq!(ips, expected_ips);
|
||||
|
||||
// multivalue fast field tests
|
||||
for segment_reader in searcher.segment_readers().iter() {
|
||||
let ff_reader = segment_reader.fast_fields().u64s(multi_numbers).unwrap();
|
||||
let bool_ff_reader = segment_reader.fast_fields().bools(multi_bools).unwrap();
|
||||
for doc in segment_reader.doc_ids_alive() {
|
||||
let mut vals = vec![];
|
||||
ff_reader.get_vals(doc, &mut vals);
|
||||
assert_eq!(vals.len(), 2);
|
||||
assert_eq!(vals[0], vals[1]);
|
||||
|
||||
let mut bool_vals = vec![];
|
||||
bool_ff_reader.get_vals(doc, &mut bool_vals);
|
||||
assert_eq!(bool_vals.len(), 2);
|
||||
assert_ne!(bool_vals[0], bool_vals[1]);
|
||||
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&vals[0]));
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&vals[0]));
|
||||
}
|
||||
}
|
||||
|
||||
// doc store tests
|
||||
for segment_reader in searcher.segment_readers().iter() {
|
||||
let store_reader = segment_reader
|
||||
.get_store_reader(DOCSTORE_CACHE_CAPACITY)
|
||||
.unwrap();
|
||||
let store_reader = segment_reader.get_store_reader().unwrap();
|
||||
// test store iterator
|
||||
for doc in store_reader.iter(segment_reader.alive_bitset()) {
|
||||
let id = doc.unwrap().get_first(id_field).unwrap().as_u64().unwrap();
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&id));
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
}
|
||||
// test store random access
|
||||
for doc_id in segment_reader.doc_ids_alive() {
|
||||
@@ -1642,7 +1549,7 @@ mod tests {
|
||||
.unwrap()
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
assert!(expected_ids_and_num_occurrences.contains_key(&id));
|
||||
assert!(expected_ids_and_num_occurences.contains_key(&id));
|
||||
let id2 = store_reader
|
||||
.get(doc_id)
|
||||
.unwrap()
|
||||
@@ -1651,18 +1558,6 @@ mod tests {
|
||||
.as_u64()
|
||||
.unwrap();
|
||||
assert_eq!(id, id2);
|
||||
let bool = store_reader
|
||||
.get(doc_id)
|
||||
.unwrap()
|
||||
.get_first(bool_field)
|
||||
.unwrap()
|
||||
.as_bool()
|
||||
.unwrap();
|
||||
let doc = store_reader.get(doc_id).unwrap();
|
||||
let mut bool2 = doc.get_all(multi_bools);
|
||||
assert_eq!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
assert_ne!(bool, bool2.next().unwrap().as_bool().unwrap());
|
||||
assert_eq!(None, bool2.next())
|
||||
}
|
||||
}
|
||||
// test search
|
||||
@@ -1678,7 +1573,7 @@ mod tests {
|
||||
top_docs.iter().map(|el| el.1).collect::<Vec<_>>()
|
||||
};
|
||||
|
||||
for (existing_id, count) in expected_ids_and_num_occurrences {
|
||||
for (existing_id, count) in expected_ids_and_num_occurences {
|
||||
assert_eq!(do_search(&existing_id.to_string()).len() as u64, count);
|
||||
}
|
||||
for existing_id in deleted_ids {
|
||||
@@ -1705,31 +1600,6 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_minimal() {
|
||||
assert!(test_operation_strategy(
|
||||
&[
|
||||
IndexingOp::AddDoc { id: 23 },
|
||||
IndexingOp::AddDoc { id: 13 },
|
||||
IndexingOp::DeleteDoc { id: 13 }
|
||||
],
|
||||
true,
|
||||
false
|
||||
)
|
||||
.is_ok());
|
||||
|
||||
assert!(test_operation_strategy(
|
||||
&[
|
||||
IndexingOp::AddDoc { id: 23 },
|
||||
IndexingOp::AddDoc { id: 13 },
|
||||
IndexingOp::DeleteDoc { id: 13 }
|
||||
],
|
||||
false,
|
||||
false
|
||||
)
|
||||
.is_ok());
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(20))]
|
||||
#[test]
|
||||
|
||||
@@ -92,7 +92,7 @@ impl Drop for IndexWriterBomb {
|
||||
mod tests {
|
||||
use std::mem;
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
use crossbeam::channel;
|
||||
|
||||
use super::IndexWriterStatus;
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user