Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2026-01-01 15:02:55 +00:00

Compare commits (15 commits): 0.18 ... fastfield-
| Author | SHA1 | Date |
|---|---|---|
| | 14d53851a8 | |
| | 2d176e66b6 | |
| | 838a332db0 | |
| | defbd9139b | |
| | 0c87732459 | |
| | 4d66a3f0a0 | |
| | 977f01a8a3 | |
| | c14bdd26d4 | |
| | 3272f80171 | |
| | 23d5ab5656 | |
| | 245ed5fed1 | |
| | 33bed01168 | |
| | 17a5f4f0ff | |
| | c969582308 | |
| | 18d2ee5bb7 | |
.github/workflows/coverage.yml (vendored, 7 changed lines)
@@ -13,11 +13,12 @@ jobs:
      - uses: actions/checkout@v3
      - name: Install Rust
        run: rustup toolchain install nightly --component llvm-tools-preview
      - uses: taiki-e/install-action@cargo-llvm-cov
      - name: Install cargo-llvm-cov
        run: curl -LsSf https://github.com/taiki-e/cargo-llvm-cov/releases/latest/download/cargo-llvm-cov-x86_64-unknown-linux-gnu.tar.gz | tar xzf - -C ~/.cargo/bin
      - name: Generate code coverage
        run: cargo +nightly llvm-cov --all-features --workspace --lcov --output-path lcov.info
        run: cargo llvm-cov --all-features --workspace --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        uses: codecov/codecov-action@v2
        with:
          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos
          files: lcov.info
.github/workflows/test.yml (vendored, 2 changed lines)
@@ -33,7 +33,7 @@ jobs:
          components: rustfmt, clippy

      - name: Run tests
        run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints --verbose --workspace
        run: cargo +stable test --features mmap,brotli-compression,lz4-compression,snappy-compression,failpoints --verbose --workspace

      - name: Run tests quickwit feature
        run: cargo +stable test --features mmap,quickwit,failpoints --verbose --workspace
@@ -1,4 +1,4 @@
Tantivy 0.18
Unreleased
================================
- For date values `chrono` has been replaced with `time` (@uklotzde) #1304 :
  - The `time` crate is re-exported as `tantivy::time` instead of `tantivy::chrono`.
@@ -8,10 +8,6 @@ Tantivy 0.18
  - Converting a `time::OffsetDateTime` to `Value::Date` implicitly converts the value into UTC.
    If this is not desired do the time zone conversion yourself and use `time::PrimitiveDateTime`
    directly instead.
- Add [histogram](https://github.com/quickwit-oss/tantivy/pull/1306) aggregation (@PSeitz)
- Add support for fastfield on text fields (@PSeitz)
- Add terms aggregation (@PSeitz)
- Add support for zstd compression (@kryesh)
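A minimal, illustrative sketch of the `time` handling the changelog entry above describes (not part of the changelog; the exact `Value::Date` construction in 0.18 may differ):

```rust
use tantivy::time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};

fn keep_wall_clock_time() -> PrimitiveDateTime {
    // An OffsetDateTime carrying a +02:00 offset gets normalized to UTC when it
    // is stored as a Value::Date. To keep the local wall-clock time instead,
    // drop the offset yourself and index the resulting PrimitiveDateTime.
    let local = OffsetDateTime::now_utc().to_offset(UtcOffset::from_hms(2, 0, 0).unwrap());
    PrimitiveDateTime::new(local.date(), local.time())
}
```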
Tantivy 0.17
================================
@@ -23,7 +19,7 @@ Tantivy 0.17
- Schema now offers not indexing fieldnorms (@lpouget) [#922](https://github.com/quickwit-oss/tantivy/issues/922)
- Reduce the number of fsync calls [#1225](https://github.com/quickwit-oss/tantivy/issues/1225)
- Fix opening bytes index with dynamic codec (@PSeitz) [#1278](https://github.com/quickwit-oss/tantivy/issues/1278)
- Added an aggregation collector for range, average and stats compatible with Elasticsearch. (@PSeitz)
- Added an aggregation collector compatible with Elasticsearch (@PSeitz)
- Added a JSON schema type @fulmicoton [#1251](https://github.com/quickwit-oss/tantivy/issues/1251)
- Added support for slop in phrase queries @halvorboe [#1068](https://github.com/quickwit-oss/tantivy/issues/1068)
Cargo.toml (98 changed lines)
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy"
|
||||
version = "0.18.0"
|
||||
version = "0.17.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -10,72 +10,71 @@ homepage = "https://github.com/quickwit-oss/tantivy"
|
||||
repository = "https://github.com/quickwit-oss/tantivy"
|
||||
readme = "README.md"
|
||||
keywords = ["search", "information", "retrieval"]
|
||||
edition = "2021"
|
||||
edition = "2018"
|
||||
|
||||
[dependencies]
|
||||
oneshot = "0.1.3"
|
||||
base64 = "0.13.0"
|
||||
oneshot = "0.1"
|
||||
base64 = "0.13"
|
||||
byteorder = "1.4.3"
|
||||
crc32fast = "1.3.2"
|
||||
once_cell = "1.10.0"
|
||||
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
|
||||
tantivy-fst = "0.3.0"
|
||||
memmap2 = { version = "0.5.3", optional = true }
|
||||
lz4_flex = { version = "0.9.2", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3.4", optional = true }
|
||||
zstd = { version = "0.11", optional = true }
|
||||
crc32fast = "1.2.1"
|
||||
once_cell = "1.7.2"
|
||||
regex ={ version = "1.5.4", default-features = false, features = ["std"] }
|
||||
tantivy-fst = "0.3"
|
||||
memmap2 = {version = "0.5", optional=true}
|
||||
lz4_flex = { version = "0.9", default-features = false, features = ["checked-decode"], optional = true }
|
||||
brotli = { version = "3.3", optional = true }
|
||||
snap = { version = "1.0.5", optional = true }
|
||||
tempfile = { version = "3.3.0", optional = true }
|
||||
log = "0.4.16"
|
||||
serde = { version = "1.0.136", features = ["derive"] }
|
||||
serde_json = "1.0.79"
|
||||
num_cpus = "1.13.1"
|
||||
tempfile = { version = "3.2", optional = true }
|
||||
log = "0.4.14"
|
||||
serde = { version = "1.0.126", features = ["derive"] }
|
||||
serde_json = "1.0.64"
|
||||
num_cpus = "1.13"
|
||||
fs2={ version = "0.4.3", optional = true }
|
||||
levenshtein_automata = "0.2.1"
|
||||
uuid = { version = "1.0.0", features = ["v4", "serde"] }
|
||||
crossbeam-channel = "0.5.4"
|
||||
tantivy-query-grammar = { version="0.18.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.2", path="./bitpacker" }
|
||||
common = { version = "0.3", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.2", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.3", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2.0"
|
||||
rust-stemmers = "1.2.0"
|
||||
downcast-rs = "1.2.0"
|
||||
levenshtein_automata = "0.2"
|
||||
uuid = { version = "0.8.2", features = ["v4", "serde"] }
|
||||
crossbeam = "0.8.1"
|
||||
tantivy-query-grammar = { version="0.15.0", path="./query-grammar" }
|
||||
tantivy-bitpacker = { version="0.1", path="./bitpacker" }
|
||||
common = { version = "0.2", path = "./common/", package = "tantivy-common" }
|
||||
fastfield_codecs = { version="0.1", path="./fastfield_codecs", default-features = false }
|
||||
ownedbytes = { version="0.2", path="./ownedbytes" }
|
||||
stable_deref_trait = "1.2"
|
||||
rust-stemmers = "1.2"
|
||||
downcast-rs = "1.2"
|
||||
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
|
||||
census = "0.4.0"
|
||||
census = "0.4"
|
||||
fnv = "1.0.7"
|
||||
thiserror = "1.0.30"
|
||||
thiserror = "1.0.24"
|
||||
htmlescape = "0.3.1"
|
||||
fail = "0.5.0"
|
||||
murmurhash32 = "0.2.0"
|
||||
time = { version = "0.3.9", features = ["serde-well-known"] }
|
||||
smallvec = "1.8.0"
|
||||
rayon = "1.5.2"
|
||||
lru = "0.7.5"
|
||||
fastdivide = "0.4.0"
|
||||
itertools = "0.10.3"
|
||||
measure_time = "0.8.2"
|
||||
pretty_assertions = "1.2.1"
|
||||
serde_cbor = { version = "0.11.2", optional = true }
|
||||
async-trait = "0.1.53"
|
||||
fail = "0.5"
|
||||
murmurhash32 = "0.2"
|
||||
time = { version = "0.3.7", features = ["serde-well-known"] }
|
||||
smallvec = "1.6.1"
|
||||
rayon = "1.5"
|
||||
lru = "0.7.0"
|
||||
fastdivide = "0.4"
|
||||
itertools = "0.10.0"
|
||||
measure_time = "0.8.0"
|
||||
pretty_assertions = "1.1.0"
|
||||
serde_cbor = {version="0.11", optional=true}
|
||||
async-trait = "0.1"
|
||||
|
||||
[target.'cfg(windows)'.dependencies]
|
||||
winapi = "0.3.9"
|
||||
|
||||
[dev-dependencies]
|
||||
rand = "0.8.5"
|
||||
rand = "0.8.3"
|
||||
maplit = "1.0.2"
|
||||
matches = "0.1.9"
|
||||
proptest = "1.0.0"
|
||||
matches = "0.1.8"
|
||||
proptest = "1.0"
|
||||
criterion = "0.3.5"
|
||||
test-log = "0.2.10"
|
||||
test-log = "0.2.8"
|
||||
env_logger = "0.9.0"
|
||||
pprof = { version = "0.9.0", features = ["flamegraph", "criterion"] }
|
||||
futures = "0.3.21"
|
||||
pprof = {version= "0.7", features=["flamegraph", "criterion"]}
|
||||
futures = "0.3.15"
|
||||
|
||||
[dev-dependencies.fail]
|
||||
version = "0.5.0"
|
||||
version = "0.5"
|
||||
features = ["failpoints"]
|
||||
|
||||
[profile.release]
|
||||
@@ -94,7 +93,6 @@ mmap = ["fs2", "tempfile", "memmap2"]
|
||||
brotli-compression = ["brotli"]
|
||||
lz4-compression = ["lz4_flex"]
|
||||
snappy-compression = ["snap"]
|
||||
zstd-compression = ["zstd"]
|
||||
|
||||
failpoints = ["fail/failpoints"]
|
||||
unstable = [] # useful for benches.
|
||||
|
||||
README.md (11 changed lines)
@@ -128,13 +128,10 @@ $ gdb run
# Companies Using Tantivy

<p align="left">
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
<img align="center" src="doc/assets/images/nuclia-dark-theme.png#gh-dark-mode-only" alt="Nuclia" height="35" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.ai-dark-theme.png#gh-dark-mode-only" alt="Humanfirst.ai" height="25" width="auto" />
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p>
<img align="center" src="doc/assets/images/Nuclia.png" alt="Nuclia" height="25" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.png" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg" alt="Element.io" height="25" width="auto" />
</p>

# FAQ
bitpacker/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.2.0"
version = "0.1.1"
edition = "2018"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
common/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.3.0"
version = "0.2.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
@@ -10,7 +10,7 @@ description = "common traits and utility functions used by multiple tantivy subc

[dependencies]
byteorder = "1.4.3"
ownedbytes = { version="0.3", path="../ownedbytes" }
ownedbytes = { version="0.2", path="../ownedbytes" }

[dev-dependencies]
proptest = "1.0.0"
Binary file not shown. Before: 56 KiB
Binary file not shown. Before: 23 KiB
Binary file not shown. Before: 7.8 KiB
@@ -122,7 +122,7 @@ fn main() -> tantivy::Result<()> {
    let searcher = reader.searcher();
    let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();

    let res: Value = serde_json::to_value(&agg_res)?;
    let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
    println!("{}", serde_json::to_string_pretty(&res)?);

    Ok(())
@@ -1,8 +1,7 @@
|
||||
// # Json field example
|
||||
//
|
||||
// This example shows how the json field can be used
|
||||
// to make tantivy partially schemaless by setting it as
|
||||
// default query parser field.
|
||||
// to make tantivy partially schemaless.
|
||||
|
||||
use tantivy::collector::{Count, TopDocs};
|
||||
use tantivy::query::QueryParser;
|
||||
@@ -11,6 +10,10 @@ use tantivy::Index;
|
||||
|
||||
fn main() -> tantivy::Result<()> {
|
||||
// # Defining the schema
|
||||
//
|
||||
// We need two fields:
|
||||
// - a timestamp
|
||||
// - a json object field
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_date_field("timestamp", FAST | STORED);
|
||||
let event_type = schema_builder.add_text_field("event_type", STRING | STORED);
|
||||
@@ -40,8 +43,7 @@ fn main() -> tantivy::Result<()> {
|
||||
"attributes": {
|
||||
"target": "submit-button",
|
||||
"cart": {"product_id": 133},
|
||||
"description": "das keyboard",
|
||||
"event_type": "holiday-sale"
|
||||
"description": "das keyboard"
|
||||
}
|
||||
}"#,
|
||||
)?;
|
||||
@@ -51,9 +53,6 @@ fn main() -> tantivy::Result<()> {
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
|
||||
// # Default fields: event_type and attributes
|
||||
// By setting attributes as a default field it allows omitting attributes itself, e.g. "target",
|
||||
// instead of "attributes.target"
|
||||
let query_parser = QueryParser::for_index(&index, vec![event_type, attributes]);
|
||||
{
|
||||
let query = query_parser.parse_query("target:submit-button")?;
|
||||
@@ -71,34 +70,10 @@ fn main() -> tantivy::Result<()> {
|
||||
assert_eq!(count_docs, 1);
|
||||
}
|
||||
{
|
||||
let query = query_parser.parse_query("click AND cart.product_id:133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
{
|
||||
// The sub-fields in the json field marked as default field still need to be explicitly
|
||||
// addressed
|
||||
let query = query_parser.parse_query("click AND 133")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
{
|
||||
// Default json fields are ignored if they collide with the schema
|
||||
let query = query_parser.parse_query("event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 0);
|
||||
}
|
||||
// # Query via full attribute path
|
||||
{
|
||||
// This only searches in our schema's `event_type` field
|
||||
let query = query_parser.parse_query("event_type:click")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
assert_eq!(hits.len(), 2);
|
||||
}
|
||||
{
|
||||
// Default json fields can still be accessed by full path
|
||||
let query = query_parser.parse_query("attributes.event_type:holiday-sale")?;
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2))?;
|
||||
let query = query_parser
|
||||
.parse_query("event_type:click AND cart.product_id:133")
|
||||
.unwrap();
|
||||
let hits = searcher.search(&*query, &TopDocs::with_limit(2)).unwrap();
|
||||
assert_eq!(hits.len(), 1);
|
||||
}
|
||||
Ok(())
|
||||
|
||||
fastfield_codecs/.gitignore (vendored, new file, 1 line)
@@ -0,0 +1 @@
datasets/
fastfield_codecs/Cargo.toml
@@ -1,16 +1,14 @@
[package]
name = "fastfield_codecs"
version = "0.2.0"
version = "0.1.0"
authors = ["Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2018"
description = "Fast field codecs used by tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
common = { version = "0.3", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.2", path = "../bitpacker/" }
common = { version = "0.2", path = "../common/", package = "tantivy-common" }
tantivy-bitpacker = { version="0.1.1", path = "../bitpacker/" }
prettytable-rs = {version="0.8.0", optional= true}
rand = {version="0.8.3", optional= true}

@@ -19,6 +17,6 @@ more-asserts = "0.2.1"
rand = "0.8.3"

[features]
unstable = [] # useful for benches and experimental codecs.
bin = ["prettytable-rs", "rand"]
default = ["bin"]
fastfield_codecs/Makefile (new file, 6 lines)
@@ -0,0 +1,6 @@
DATASETS ?= hdfs_logs_timestamps http_logs_timestamps amazon_reviews_product_ids nooc_temperatures
download:
	@echo "--- Downloading datasets ---"
	mkdir -p datasets
	@for dataset in $(DATASETS); do curl -o - https://quickwit-datasets-public.s3.amazonaws.com/benchmarks/fastfields/$$dataset.txt.gz | gunzip > datasets/$$dataset.txt; done
@@ -13,6 +13,10 @@ A codec needs to implement 2 traits:
- A reader implementing `FastFieldCodecReader` to read the codec.
- A serializer implementing `FastFieldCodecSerializer` for compression estimation and codec name + id.

### Download real world datasets for codecs comparison
Before comparing codecs, you need to execute `make download` to download real world datasets hosted on AWS S3.
To run with the unstable codecs, execute `cargo run --features unstable`.

### Tests

Once the traits are implemented test and benchmark integration is pretty easy (see `test_with_codec_data_sets` and `bench.rs`).
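For orientation, a minimal round-trip through these two traits might look like the sketch below. It follows the trait signatures visible in this diff; the public field names of `FastFieldStats` (`min_value`, `max_value`, `num_vals`) are an assumption based on how the crate's own tests build stats.

```rust
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};

fn bitpacked_roundtrip() -> std::io::Result<()> {
    let data: Vec<u64> = (1_000..1_256).collect();
    // Assumed public fields, mirroring the tests' stats_from_vec helper.
    let stats = FastFieldStats {
        min_value: 1_000,
        max_value: 1_255,
        num_vals: data.len() as u64,
    };
    let mut out = Vec::new();
    // The serializer gets an accessor plus two iterators, in case a codec
    // needs to read the data more than once.
    BitpackedFastFieldSerializer::serialize(
        &mut out,
        &data, // Vec<u64> implements FastFieldDataAccess
        stats,
        data.iter().copied(),
        data.iter().copied(),
    )?;
    let reader = BitpackedFastFieldReader::open_from_bytes(&out)?;
    assert_eq!(reader.get_u64(0, &out), data[0]);
    Ok(())
}
```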
@@ -23,46 +27,101 @@ cargo run --features bin
|
||||
```
|
||||
|
||||
### TODO
|
||||
- Add real world data sets in comparison
|
||||
- Add codec to cover sparse data sets
|
||||
|
||||
|
||||
### Codec Comparison
|
||||
```
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| | Compression Ratio | Compression Estimation |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Autoincrement | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.000039572664 | 0.000004396963 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.1477348 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing concave | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.26562938 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.190665 | 0.1883836 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Monotonically increasing convex | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.25003937 | 0.28125438 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.18676 | 0.2040086 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Almost monotonically increasing | | |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| LinearInterpol | 0.14066513 | 0.1562544 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| MultiLinearInterpol | 0.16335973 | 0.17275847 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 |
|
||||
+----------------------------------+-------------------+------------------------+
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| | Compression ratio | Compression ratio estimation | Compression time (micro) | Reading time (micro) |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Autoincrement | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.0051544965 | 0.17251475 | 960 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.118189104 | 0.14172314 | 708 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 474 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing concave | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.005955 | 0.18813984 | 885 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.16113 | 0.15734828 | 704 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 478 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Monotonically increasing convex | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.00613 | 0.20376484 | 889 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.157175 | 0.17297328 | 706 | 212 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.31251436 | 0.3125 | 471 | 113 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Almost monotonically increasing | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14549863 | 0.17251475 | 923 | 210 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.14943957 | 0.15734814 | 703 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.28126493 | 0.28125 | 462 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Random | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.14533783 | 0.14126475 | 924 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.13381402 | 0.15734814 | 695 | 211 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.12501445 | 0.125 | 422 | 112 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.39826187 | 0.4068908 | 5545 | 1086 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.39214826 | 0.40734857 | 5082 | 1073 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2864 | 567 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HDFS logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.032736875 | 0.094390824 | 4942 | 1067 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.02667125 | 0.079223566 | 3626 | 994 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.39062786 | 0.390625 | 2493 | 566 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| HTTP logs timestamps SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.047942877 | 0.20376582 | 5121 | 1065 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.06637425 | 0.18859856 | 3929 | 1093 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.26562786 | 0.265625 | 2221 | 526 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.41900787 | 0.4225158 | 5239 | 1089 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.41504425 | 0.43859857 | 4158 | 1052 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2603 | 513 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Amazon review product ids SORTED | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | 0.18364687 | 0.25064084 | 5036 | 990 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 0.21239226 | 0.21984856 | 4087 | 1072 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 0.40625286 | 0.40625 | 2702 | 525 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Temperatures | | | | |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| PiecewiseLinear | | Codec Disabled | 0 | 0 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| FOR | 1.0088086 | 1.001098 | 1306 | 237 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
| Bitpacked | 1.000012 | 1 | 950 | 108 |
|
||||
+----------------------------------+-------------------+------------------------------+--------------------------+----------------------+
|
||||
|
||||
```
|
||||
|
||||
@@ -5,11 +5,8 @@ extern crate test;
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use fastfield_codecs::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::*;
|
||||
|
||||
@@ -70,14 +67,9 @@ mod tests {
|
||||
bench_create::<BitpackedFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_create(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<LinearInterpolFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_create(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_create::<MultiLinearInterpolFastFieldSerializer>(b, &data);
|
||||
bench_create::<PiecewiseLinearFastFieldSerializer>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_bitpack_get(b: &mut Bencher) {
|
||||
@@ -85,16 +77,9 @@ mod tests {
|
||||
bench_get::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_linearinterpol_get(b: &mut Bencher) {
|
||||
fn bench_fastfield_piecewise_linear_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>(b, &data);
|
||||
}
|
||||
#[bench]
|
||||
fn bench_fastfield_multilinearinterpol_get(b: &mut Bencher) {
|
||||
let data: Vec<_> = get_data();
|
||||
bench_get::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>(
|
||||
b, &data,
|
||||
);
|
||||
bench_get::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>(b, &data);
|
||||
}
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
let min_value = data.iter().cloned().min().unwrap_or(0);
|
||||
|
||||
fastfield_codecs/src/bitpacked.rs
@@ -128,7 +128,10 @@ impl FastFieldCodecSerializer for BitpackedFastFieldSerializer {
    ) -> bool {
        true
    }
    fn estimate(_fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
    fn estimate_compression_ratio(
        _fastfield_accessor: &impl FastFieldDataAccess,
        stats: FastFieldStats,
    ) -> f32 {
        let amplitude = stats.max_value - stats.min_value;
        let num_bits = compute_num_bits(amplitude);
        let num_bits_uncompressed = 64;
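The bitpacked estimate therefore reduces to `compute_num_bits(max - min) / 64`. A quick illustrative check (numbers chosen here, not taken from the crate): values spanning an amplitude of 255 need 8 bits each, so the estimate is 8/64 = 0.125, which matches the Bitpacked estimate reported for the random u8 dataset in the comparison table earlier in this diff.

```rust
fn bitpacked_estimate_example() {
    // Illustrative only: an amplitude of 255 needs 8 bits per value.
    let amplitude: u64 = 255;
    let num_bits = 64 - amplitude.leading_zeros() as u64; // what compute_num_bits(255) yields
    assert_eq!(num_bits as f32 / 64.0, 0.125);
}
```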
fastfield_codecs/src/frame_of_reference.rs (new file, 272 lines)
@@ -0,0 +1,272 @@
|
||||
use std::io::{self, Read, Write};
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 128;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct FORFastFieldReader {
|
||||
num_vals: u64,
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
min: u64,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
self.metadata.min + diff
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.min.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let min = u64::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self { min, num_bits })
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct FORFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for FORFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
|
||||
impl FastFieldCodecReader for FORFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = FORFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
num_vals: footer.num_vals,
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
pub struct FORFastFieldSerializer {}
|
||||
|
||||
impl FastFieldCodecSerializer for FORFastFieldSerializer {
|
||||
const NAME: &'static str = "FOR";
|
||||
const ID: u8 = 5;
|
||||
/// Creates a new fast field serializer.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let mut min = block_values[0];
|
||||
let mut max = block_values[0];
|
||||
for &current_value in block_values[1..].iter() {
|
||||
min = min.min(current_value);
|
||||
max = max.max(current_value);
|
||||
}
|
||||
let num_bits = compute_num_bits(max - min);
|
||||
for current_value in block_values.iter() {
|
||||
bit_packer.write(current_value - min, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata { min, num_bits });
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = FORFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
stats.num_vals > BLOCK_SIZE
|
||||
}
|
||||
|
||||
/// Estimate compression ratio by compute the ratio of the first block.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let max_distance = (0..last_elem_in_first_chunk)
|
||||
.into_iter()
|
||||
.map(|pos| {
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
actual_value - stats.min_value
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and multiply by a magic number 3 to select this codec
|
||||
// when we are almost sure that this is relevant.
|
||||
let relative_max_value = max_distance as f32 * 3.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 9 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<FORFastFieldSerializer, FORFastFieldReader>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
println!("{}", actual_compression);
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(actual_compression > 0.006);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.10);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
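To make the frame-of-reference layout above concrete, here is a small sizing sketch (hypothetical numbers, not output from the crate): each 128-value block stores its own minimum, so the bitpacked deltas only need to cover the in-block spread, plus 9 bytes of `BlockMetadata` (a `u64` minimum and a `u8` bit width) per block in the footer.

```rust
fn for_sizing_example() {
    // 1,024 consecutive values, BLOCK_SIZE = 128 (illustrative numbers only).
    let num_vals: u64 = 1024;
    let blocks = num_vals / 128;                  // 8 full blocks
    let payload_bytes = blocks * 128 * 7 / 8;     // deltas fit in 0..=127 -> 7 bits each
    let metadata_bytes = blocks * 9;              // u64 min + u8 num_bits per block
    let ratio = (payload_bytes + metadata_bytes) as f32 / (num_vals * 8) as f32;
    assert!(ratio < 0.13);                        // vs. 8 bytes per value uncompressed
    // The fixed footer fields (num_vals, min, max, length prefix) are ignored here.
}
```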
fastfield_codecs/src/lib.rs
@@ -6,15 +6,20 @@ use std::io;
use std::io::Write;

pub mod bitpacked;
#[cfg(feature = "unstable")]
pub mod frame_of_reference;
pub mod linearinterpol;
pub mod multilinearinterpol;
pub mod piecewise_linear;

pub trait FastFieldCodecReader: Sized {
    /// reads the metadata and returns the CodecReader
    /// Reads the metadata and returns the CodecReader.
    fn open_from_bytes(bytes: &[u8]) -> std::io::Result<Self>;

    fn get_u64(&self, doc: u64, data: &[u8]) -> u64;

    /// Read u64 value for indice `idx`.
    /// `idx` can be either a `DocId` or an index used for
    /// `multivalued` fast field.
    fn get_u64(&self, idx: u64, data: &[u8]) -> u64;
    fn min_value(&self) -> u64;
    fn max_value(&self) -> u64;
}
@@ -35,7 +40,10 @@ pub trait FastFieldCodecSerializer {
|
||||
///
|
||||
/// It could make sense to also return a value representing
|
||||
/// computational complexity.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32;
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32;
|
||||
|
||||
/// Serializes the data using the serializer into write.
|
||||
/// There are multiple iterators, in case the codec needs to read the data multiple times.
|
||||
@@ -85,9 +93,8 @@ impl FastFieldDataAccess for Vec<u64> {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::bitpacked::{BitpackedFastFieldReader, BitpackedFastFieldSerializer};
|
||||
use crate::linearinterpol::{LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer};
|
||||
use crate::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
use crate::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
|
||||
pub fn create_and_validate<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
@@ -97,7 +104,7 @@ mod tests {
|
||||
if !S::is_applicable(&data, crate::tests::stats_from_vec(data)) {
|
||||
return (f32::MAX, 0.0);
|
||||
}
|
||||
let estimation = S::estimate(&data, crate::tests::stats_from_vec(data));
|
||||
let estimation = S::estimate_compression_ratio(&data, crate::tests::stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -157,13 +164,10 @@ mod tests {
|
||||
fn test_codec_bitpacking() {
|
||||
test_codec::<BitpackedFastFieldSerializer, BitpackedFastFieldReader>();
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_codec_interpolation() {
|
||||
test_codec::<LinearInterpolFastFieldSerializer, LinearInterpolFastFieldReader>();
|
||||
}
|
||||
#[test]
|
||||
fn test_codec_multi_interpolation() {
|
||||
test_codec::<MultiLinearInterpolFastFieldSerializer, MultiLinearInterpolFastFieldReader>();
|
||||
fn test_codec_piecewise_linear() {
|
||||
test_codec::<PiecewiseLinearFastFieldSerializer, PiecewiseLinearFastFieldReader>();
|
||||
}
|
||||
|
||||
use super::*;
|
||||
@@ -181,45 +185,50 @@ mod tests {
|
||||
fn estimation_good_interpolation_case() {
|
||||
let data = (10..=20000_u64).collect::<Vec<_>>();
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.01);
|
||||
|
||||
let multi_linear_interpol_estimation =
|
||||
MultiLinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(multi_linear_interpol_estimation, 0.2);
|
||||
assert_le!(linear_interpol_estimation, multi_linear_interpol_estimation);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, bitpacked_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case() {
|
||||
let data = vec![200, 10, 10, 10, 10, 1000, 20];
|
||||
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.32);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.32);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
assert_le!(bitpacked_estimation, piecewise_interpol_estimation);
|
||||
}
|
||||
#[test]
|
||||
fn estimation_test_bad_interpolation_case_monotonically_increasing() {
|
||||
fn estimation_test_interpolation_case_monotonically_increasing() {
|
||||
let mut data = (200..=20000_u64).collect::<Vec<_>>();
|
||||
data.push(1_000_000);
|
||||
|
||||
// in this case the linear interpolation can't in fact be worse than bitpacking,
|
||||
// but the estimator adds some threshold, which leads to estimated worse behavior
|
||||
let linear_interpol_estimation =
|
||||
LinearInterpolFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
assert_le!(linear_interpol_estimation, 0.35);
|
||||
let piecewise_interpol_estimation =
|
||||
PiecewiseLinearFastFieldSerializer::estimate_compression_ratio(
|
||||
&data,
|
||||
stats_from_vec(&data),
|
||||
);
|
||||
assert_le!(piecewise_interpol_estimation, 0.2);
|
||||
|
||||
let bitpacked_estimation =
|
||||
BitpackedFastFieldSerializer::estimate(&data, stats_from_vec(&data));
|
||||
BitpackedFastFieldSerializer::estimate_compression_ratio(&data, stats_from_vec(&data));
|
||||
println!("{}", bitpacked_estimation);
|
||||
assert_le!(bitpacked_estimation, 0.32);
|
||||
assert_le!(bitpacked_estimation, linear_interpol_estimation);
|
||||
assert_le!(piecewise_interpol_estimation, bitpacked_estimation);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -71,9 +71,9 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, doc, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(doc, data)) - self.footer.offset
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let calculated_value = get_calculated_value(self.footer.first_val, idx, self.slope);
|
||||
(calculated_value + self.bit_unpacker.get(idx, data)) - self.footer.offset
|
||||
}
|
||||
|
||||
#[inline]
|
||||
@@ -88,6 +88,10 @@ impl FastFieldCodecReader for LinearInterpolFastFieldReader {
|
||||
|
||||
/// Fastfield serializer, which tries to guess values by linear interpolation
|
||||
/// and stores the difference bitpacked.
|
||||
#[deprecated(
|
||||
note = "Linear interpolation works best only on very rare cases and piecewise linear codec \
|
||||
already works great on them."
|
||||
)]
|
||||
pub struct LinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[inline]
|
||||
@@ -105,6 +109,7 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
first_val + (pos as f32 * slope) as u64
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "LinearInterpol";
|
||||
const ID: u8 = 2;
|
||||
@@ -182,10 +187,16 @@ impl FastFieldCodecSerializer for LinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because, you don't know
|
||||
/// where the local maxima for the deviation of the calculated value are and
|
||||
/// the offset to shift all values to >=0 is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
if stats.num_vals < 3 {
|
||||
return f32::MAX;
|
||||
}
|
||||
let first_val = fastfield_accessor.get_val(0);
|
||||
let last_val = fastfield_accessor.get_val(stats.num_vals as u64 - 1);
|
||||
let slope = get_slope(first_val, last_val, stats.num_vals);
|
||||
@@ -229,6 +240,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(deprecated)]
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -289,8 +301,10 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn linear_interpol_fast_field_rand() {
|
||||
for _ in 0..5000 {
|
||||
let mut data = (0..50).map(|_| rand::random::<u64>()).collect::<Vec<_>>();
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
create_and_validate(&data, "random");
|
||||
|
||||
data.reverse();
|
||||
|
||||
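As a toy illustration of the reconstruction arithmetic used by `get_u64` in the linear-interpolation codec above (local names and constants are made up for this sketch): the codec stores a global positive offset plus, per value, the bitpacked deviation from the fitted line.

```rust
fn linear_interpol_example() {
    fn calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
        first_val + (pos as f32 * slope) as u64
    }
    // Toy numbers: fitted line starts at 100 with slope 2.0, global offset 3.
    let (first_val, slope, offset) = (100_u64, 2.0_f32, 3_u64);
    let stored_diff = 5_u64; // what the BitUnpacker would hand back for this position
    let idx = 10_u64;
    let value = (calculated_value(first_val, idx, slope) + stored_diff) - offset;
    assert_eq!(value, 122); // 100 + 20 + 5 - 3
}
```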
@@ -1,31 +1,52 @@
|
||||
#[macro_use]
|
||||
extern crate prettytable;
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::{FastFieldCodecSerializer, FastFieldStats};
|
||||
use std::fs::File;
|
||||
use std::io;
|
||||
use std::io::BufRead;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use common::f64_to_u64;
|
||||
use fastfield_codecs::bitpacked::BitpackedFastFieldReader;
|
||||
#[cfg(feature = "unstable")]
|
||||
use fastfield_codecs::frame_of_reference::{FORFastFieldReader, FORFastFieldSerializer};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldStats};
|
||||
use prettytable::{Cell, Row, Table};
|
||||
use rand::prelude::StdRng;
|
||||
use rand::Rng;
|
||||
|
||||
fn main() {
|
||||
let mut table = Table::new();
|
||||
|
||||
// Add a row per time
|
||||
table.add_row(row!["", "Compression Ratio", "Compression Estimation"]);
|
||||
table.add_row(row![
|
||||
"",
|
||||
"Compression ratio",
|
||||
"Compression ratio estimation",
|
||||
"Compression time (micro)",
|
||||
"Reading time (micro)"
|
||||
]);
|
||||
|
||||
for (data, data_set_name) in get_codec_test_data_sets() {
|
||||
let mut results = vec![];
|
||||
let res = serialize_with_codec::<LinearInterpolFastFieldSerializer>(&data);
|
||||
let res = serialize_with_codec::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<MultiLinearInterpolFastFieldSerializer>(&data);
|
||||
results.push(res);
|
||||
let res = serialize_with_codec::<fastfield_codecs::bitpacked::BitpackedFastFieldSerializer>(
|
||||
&data,
|
||||
);
|
||||
#[cfg(feature = "unstable")]
|
||||
{
|
||||
let res = serialize_with_codec::<FORFastFieldSerializer, FORFastFieldReader>(&data);
|
||||
results.push(res);
|
||||
}
|
||||
let res = serialize_with_codec::<
|
||||
fastfield_codecs::bitpacked::BitpackedFastFieldSerializer,
|
||||
BitpackedFastFieldReader,
|
||||
>(&data);
|
||||
results.push(res);
|
||||
|
||||
// let best_estimation_codec = results
|
||||
//.iter()
|
||||
//.min_by(|res1, res2| res1.partial_cmp(&res2).unwrap())
|
||||
//.unwrap();
|
||||
let best_compression_ratio_codec = results
|
||||
.iter()
|
||||
.min_by(|res1, res2| res1.partial_cmp(res2).unwrap())
|
||||
@@ -33,7 +54,7 @@ fn main() {
|
||||
.unwrap();
|
||||
|
||||
table.add_row(Row::new(vec![Cell::new(data_set_name).style_spec("Bbb")]));
|
||||
for (is_applicable, est, comp, name) in results {
|
||||
for (is_applicable, est, comp, name, compression_duration, read_duration) in results {
|
||||
let (est_cell, ratio_cell) = if !is_applicable {
|
||||
("Codec Disabled".to_string(), "".to_string())
|
||||
} else {
|
||||
@@ -49,6 +70,8 @@ fn main() {
|
||||
Cell::new(name).style_spec("bFg"),
|
||||
Cell::new(&ratio_cell).style_spec(style),
|
||||
Cell::new(&est_cell).style_spec(""),
|
||||
Cell::new(&compression_duration.as_micros().to_string()),
|
||||
Cell::new(&read_duration.as_micros().to_string()),
|
||||
]));
|
||||
}
|
||||
}
|
||||
@@ -70,7 +93,6 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
current_cumulative
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
// let data = (1..=200000_u64).map(|num| num + num).collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing concave"));
|
||||
|
||||
let mut current_cumulative = 0;
|
||||
@@ -83,22 +105,79 @@ pub fn get_codec_test_data_sets() -> Vec<(Vec<u64>, &'static str)> {
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Monotonically increasing convex"));
|
||||
|
||||
let mut rng: StdRng = rand::SeedableRng::seed_from_u64(1);
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|num| num + rand::random::<u8>() as u64)
|
||||
.map(|num| num + rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Almost monotonically increasing"));
|
||||
|
||||
let data = (1000..=200_000_u64)
|
||||
.map(|_| rng.gen::<u8>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
data_and_names.push((data, "Random"));
|
||||
|
||||
let mut data = load_dataset("datasets/hdfs_logs_timestamps.txt");
|
||||
data_and_names.push((data.clone(), "HDFS logs timestamps"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "HDFS logs timestamps SORTED"));
|
||||
|
||||
let data = load_dataset("datasets/http_logs_timestamps.txt");
|
||||
data_and_names.push((data, "HTTP logs timestamps SORTED"));
|
||||
|
||||
let mut data = load_dataset("datasets/amazon_reviews_product_ids.txt");
|
||||
data_and_names.push((data.clone(), "Amazon review product ids"));
|
||||
|
||||
data.sort_unstable();
|
||||
data_and_names.push((data, "Amazon review product ids SORTED"));
|
||||
|
||||
let data = load_float_dataset("datasets/nooc_temperatures.txt");
|
||||
data_and_names.push((data, "Temperatures"));
|
||||
|
||||
data_and_names
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
pub fn load_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let l = line.unwrap();
|
||||
data.push(l.parse::<u64>().unwrap());
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn load_float_dataset(file_path: &str) -> Vec<u64> {
|
||||
println!("Load float dataset from `{}`", file_path);
|
||||
let file = File::open(file_path).expect("Error when opening file.");
|
||||
let lines = io::BufReader::new(file).lines();
|
||||
let mut data = Vec::new();
|
||||
for line in lines {
|
||||
let line_string = line.unwrap();
|
||||
let value = line_string.parse::<f64>().unwrap();
|
||||
data.push(f64_to_u64(value));
|
||||
}
|
||||
data
|
||||
}
|
||||
|
||||
pub fn serialize_with_codec<S: FastFieldCodecSerializer, R: FastFieldCodecReader>(
|
||||
data: &[u64],
|
||||
) -> (bool, f32, f32, &'static str) {
|
||||
) -> (bool, f32, f32, &'static str, Duration, Duration) {
|
||||
let is_applicable = S::is_applicable(&data, stats_from_vec(data));
|
||||
if !is_applicable {
|
||||
return (false, 0.0, 0.0, S::NAME);
|
||||
return (
|
||||
false,
|
||||
0.0,
|
||||
0.0,
|
||||
S::NAME,
|
||||
Duration::from_secs(0),
|
||||
Duration::from_secs(0),
|
||||
);
|
||||
}
|
||||
let estimation = S::estimate(&data, stats_from_vec(data));
|
||||
let start_time_compression = Instant::now();
|
||||
let estimation = S::estimate_compression_ratio(&data, stats_from_vec(data));
|
||||
let mut out = vec![];
|
||||
S::serialize(
|
||||
&mut out,
|
||||
@@ -108,9 +187,22 @@ pub fn serialize_with_codec<S: FastFieldCodecSerializer>(
|
||||
data.iter().cloned(),
|
||||
)
|
||||
.unwrap();
|
||||
|
||||
let elapsed_time_compression = start_time_compression.elapsed();
|
||||
let actual_compression = out.len() as f32 / (data.len() * 8) as f32;
|
||||
(true, estimation, actual_compression, S::NAME)
|
||||
let reader = R::open_from_bytes(&out).unwrap();
|
||||
let start_time_read = Instant::now();
|
||||
for doc in 0..data.len() {
|
||||
reader.get_u64(doc as u64, &out);
|
||||
}
|
||||
let elapsed_time_read = start_time_read.elapsed();
|
||||
(
|
||||
true,
|
||||
estimation,
|
||||
actual_compression,
|
||||
S::NAME,
|
||||
elapsed_time_compression,
|
||||
elapsed_time_read,
|
||||
)
|
||||
}
|
||||
|
||||
pub fn stats_from_vec(data: &[u64]) -> FastFieldStats {
|
||||
|
||||
@@ -155,14 +155,17 @@ impl FastFieldCodecReader for MultiLinearInterpolFastFieldReader {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, doc: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(doc, &self.footer.interpolations);
|
||||
let doc = doc - interpolation.start_pos;
|
||||
let calculated_value =
|
||||
get_calculated_value(interpolation.value_start_pos, doc, interpolation.slope);
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let interpolation = get_interpolation_function(idx, &self.footer.interpolations);
|
||||
let block_idx = idx - interpolation.start_pos;
|
||||
let calculated_value = get_calculated_value(
|
||||
interpolation.value_start_pos,
|
||||
block_idx,
|
||||
interpolation.slope,
|
||||
);
|
||||
let diff = interpolation
|
||||
.bit_unpacker
|
||||
.get(doc, &data[interpolation.data_start_offset as usize..]);
|
||||
.get(block_idx, &data[interpolation.data_start_offset as usize..]);
|
||||
(calculated_value + diff) - interpolation.positive_val_offset
|
||||
}
|
||||
|
||||
@@ -187,8 +190,13 @@ fn get_calculated_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
}
|
||||
|
||||
/// Same as LinearInterpolFastFieldSerializer, but working on chunks of CHUNK_SIZE elements.
|
||||
#[deprecated(
|
||||
note = "MultiLinearInterpol is replaced by PiecewiseLinear codec which fixes the slope and is \
|
||||
a little bit more optimized."
|
||||
)]
|
||||
pub struct MultiLinearInterpolFastFieldSerializer {}
|
||||
|
||||
#[allow(deprecated)]
|
||||
impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
const NAME: &'static str = "MultiLinearInterpol";
|
||||
const ID: u8 = 3;
|
||||
@@ -311,10 +319,13 @@ impl FastFieldCodecSerializer for MultiLinearInterpolFastFieldSerializer {
|
||||
}
|
||||
true
|
||||
}
|
||||
/// estimation for linear interpolation is hard because, you don't know
|
||||
/// Estimation for linear interpolation is hard because you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate(fastfield_accessor: &impl FastFieldDataAccess, stats: FastFieldStats) -> f32 {
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = CHUNK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
@@ -366,6 +377,7 @@ fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
#[allow(deprecated)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
365
fastfield_codecs/src/piecewise_linear.rs
Normal file
@@ -0,0 +1,365 @@
|
||||
//! The PiecewiseLinear codec uses a piecewise linear function per block of 512 values to predict
|
||||
//! fast field values. The difference from the real fast field values is then stored.
|
||||
//! For every block, the linear function can be expressed as
|
||||
//! `computed_value = slope * block_position + first_value + positive_offset`
|
||||
//! where:
|
||||
//! - `block_position` is the position inside of the block from 0 to 511
|
||||
//! - `first_value` is the first value on the block
|
||||
//! - `positive_offset` is computed such that we ensure the diff `real_value - computed_value` is
|
||||
//! always positive.
|
||||
//!
|
||||
//! 21 bytes are needed to store the block metadata; this adds an overhead of 21 * 8 / 512 = 0.33 bits
|
||||
//! per element.
|
||||
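To make the per-block scheme above concrete, here is a hedged, standalone sketch of the encoding step (the function name `encode_block` and the toy data are illustrative, not part of the codec): fit a line through the first and last value of a block, pick the `positive_offset` that keeps every stored difference non-negative, and keep only those differences for bit-packing.

```rust
fn encode_block(block: &[u64]) -> (f32, u64, Vec<u64>) {
    let first = block[0] as f64;
    let last = *block.last().unwrap() as f64;
    let slope = if block.len() > 1 {
        ((last - first) / (block.len() - 1) as f64) as f32
    } else {
        0.0
    };
    // Same prediction rule as described above: first value plus slope * position.
    let predict = |pos: usize| (block[0] as i64 + (pos as f32 * slope) as i64) as u64;
    // Offset chosen so that `value + positive_offset >= predicted_value` at every position.
    let positive_offset = block
        .iter()
        .enumerate()
        .map(|(pos, &val)| predict(pos).saturating_sub(val))
        .max()
        .unwrap_or(0);
    // Only these (small) differences get bit-packed; the rest is block metadata.
    let diffs = block
        .iter()
        .enumerate()
        .map(|(pos, &val)| (val + positive_offset) - predict(pos))
        .collect();
    (slope, positive_offset, diffs)
}

fn main() {
    let (slope, offset, diffs) = encode_block(&[100, 112, 119, 135]);
    println!("slope={}, positive_offset={}, diffs={:?}", slope, offset, diffs);
}
```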
|
||||
use std::io::{self, Read, Write};
|
||||
use std::ops::Sub;
|
||||
|
||||
use common::{BinarySerializable, DeserializeFrom};
|
||||
use tantivy_bitpacker::{compute_num_bits, BitPacker, BitUnpacker};
|
||||
|
||||
use crate::{FastFieldCodecReader, FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
|
||||
const BLOCK_SIZE: u64 = 512;
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct PiecewiseLinearFastFieldReader {
|
||||
min_value: u64,
|
||||
max_value: u64,
|
||||
block_readers: Vec<BlockReader>,
|
||||
}
|
||||
|
||||
/// Block that stores the metadata needed to predict values with a linear
|
||||
/// function `predicted_value = slope * position + first_value + positive_offset`
|
||||
/// where `positive_offset` is computed such that the stored differences
|
||||
/// are always non-negative.
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockMetadata {
|
||||
first_value: u64,
|
||||
positive_offset: u64,
|
||||
slope: f32,
|
||||
num_bits: u8,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default)]
|
||||
struct BlockReader {
|
||||
metadata: BlockMetadata,
|
||||
start_offset: u64,
|
||||
bit_unpacker: BitUnpacker,
|
||||
}
|
||||
|
||||
impl BlockReader {
|
||||
fn new(metadata: BlockMetadata, start_offset: u64) -> Self {
|
||||
Self {
|
||||
bit_unpacker: BitUnpacker::new(metadata.num_bits),
|
||||
metadata,
|
||||
start_offset,
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, block_pos: u64, data: &[u8]) -> u64 {
|
||||
let diff = self
|
||||
.bit_unpacker
|
||||
.get(block_pos, &data[self.start_offset as usize..]);
|
||||
let predicted_value =
|
||||
predict_value(self.metadata.first_value, block_pos, self.metadata.slope);
|
||||
(predicted_value + diff) - self.metadata.positive_offset
|
||||
}
|
||||
}
|
||||
|
||||
impl BinarySerializable for BlockMetadata {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
self.first_value.serialize(write)?;
|
||||
self.positive_offset.serialize(write)?;
|
||||
self.slope.serialize(write)?;
|
||||
self.num_bits.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let first_value = u64::deserialize(reader)?;
|
||||
let positive_offset = u64::deserialize(reader)?;
|
||||
let slope = f32::deserialize(reader)?;
|
||||
let num_bits = u8::deserialize(reader)?;
|
||||
Ok(Self {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct PiecewiseLinearFooter {
|
||||
pub num_vals: u64,
|
||||
pub min_value: u64,
|
||||
pub max_value: u64,
|
||||
block_metadatas: Vec<BlockMetadata>,
|
||||
}
|
||||
|
||||
impl BinarySerializable for PiecewiseLinearFooter {
|
||||
fn serialize<W: Write>(&self, write: &mut W) -> io::Result<()> {
|
||||
let mut out = vec![];
|
||||
self.num_vals.serialize(&mut out)?;
|
||||
self.min_value.serialize(&mut out)?;
|
||||
self.max_value.serialize(&mut out)?;
|
||||
self.block_metadatas.serialize(&mut out)?;
|
||||
write.write_all(&out)?;
|
||||
(out.len() as u32).serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
|
||||
let footer = Self {
|
||||
num_vals: u64::deserialize(reader)?,
|
||||
min_value: u64::deserialize(reader)?,
|
||||
max_value: u64::deserialize(reader)?,
|
||||
block_metadatas: Vec::<BlockMetadata>::deserialize(reader)?,
|
||||
};
|
||||
Ok(footer)
|
||||
}
|
||||
}
|
||||
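The footer above is length-suffixed: the payload is written first and its byte length is appended as a `u32`, which is how `open_from_bytes` below locates it from the end of the data. A rough sketch of that lookup, assuming a little-endian length encoding as a stand-in for `BinarySerializable` (the `split_footer` helper is illustrative only):

```rust
// Illustrative only: split a byte slice into (data, footer) by reading the
// footer length stored in the last four bytes (assumed little-endian here).
fn split_footer(bytes: &[u8]) -> (&[u8], &[u8]) {
    let len_bytes: [u8; 4] = bytes[bytes.len() - 4..].try_into().unwrap();
    let footer_len = u32::from_le_bytes(len_bytes) as usize;
    let footer_start = bytes.len() - 4 - footer_len;
    (&bytes[..footer_start], &bytes[footer_start..bytes.len() - 4])
}

fn main() {
    let mut buf = b"payload".to_vec();
    buf.extend_from_slice(b"footer");
    buf.extend_from_slice(&(b"footer".len() as u32).to_le_bytes());
    let (data, footer) = split_footer(&buf);
    assert_eq!(data, &b"payload"[..]);
    assert_eq!(footer, &b"footer"[..]);
}
```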
|
||||
impl FastFieldCodecReader for PiecewiseLinearFastFieldReader {
|
||||
/// Opens a fast field given a file.
|
||||
fn open_from_bytes(bytes: &[u8]) -> io::Result<Self> {
|
||||
let footer_len: u32 = (&bytes[bytes.len() - 4..]).deserialize()?;
|
||||
let (_, mut footer) = bytes.split_at(bytes.len() - (4 + footer_len) as usize);
|
||||
let footer = PiecewiseLinearFooter::deserialize(&mut footer)?;
|
||||
let mut block_readers = Vec::with_capacity(footer.block_metadatas.len());
|
||||
let mut current_data_offset = 0;
|
||||
for block_metadata in footer.block_metadatas.into_iter() {
|
||||
let num_bits = block_metadata.num_bits;
|
||||
block_readers.push(BlockReader::new(block_metadata, current_data_offset));
|
||||
current_data_offset += num_bits as u64 * BLOCK_SIZE / 8;
|
||||
}
|
||||
Ok(Self {
|
||||
min_value: footer.min_value,
|
||||
max_value: footer.max_value,
|
||||
block_readers,
|
||||
})
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn get_u64(&self, idx: u64, data: &[u8]) -> u64 {
|
||||
let block_idx = (idx / BLOCK_SIZE) as usize;
|
||||
let block_pos = idx - (block_idx as u64) * BLOCK_SIZE;
|
||||
let block_reader = &self.block_readers[block_idx];
|
||||
block_reader.get_u64(block_pos, data)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn min_value(&self) -> u64 {
|
||||
self.min_value
|
||||
}
|
||||
#[inline]
|
||||
fn max_value(&self) -> u64 {
|
||||
self.max_value
|
||||
}
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn predict_value(first_val: u64, pos: u64, slope: f32) -> u64 {
|
||||
(first_val as i64 + (pos as f32 * slope) as i64) as u64
|
||||
}
|
||||
|
||||
pub struct PiecewiseLinearFastFieldSerializer;
|
||||
|
||||
impl FastFieldCodecSerializer for PiecewiseLinearFastFieldSerializer {
|
||||
const NAME: &'static str = "PiecewiseLinear";
|
||||
const ID: u8 = 4;
|
||||
/// Serializes the fast field values with the piecewise linear codec.
|
||||
fn serialize(
|
||||
write: &mut impl Write,
|
||||
_: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
data_iter: impl Iterator<Item = u64>,
|
||||
_data_iter1: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
let mut data = data_iter.collect::<Vec<_>>();
|
||||
let mut bit_packer = BitPacker::new();
|
||||
let mut block_metadatas = Vec::new();
|
||||
for data_pos in (0..data.len() as u64).step_by(BLOCK_SIZE as usize) {
|
||||
let block_num_vals = BLOCK_SIZE.min(data.len() as u64 - data_pos) as usize;
|
||||
let block_values = &mut data[data_pos as usize..data_pos as usize + block_num_vals];
|
||||
let slope = if block_num_vals == 1 {
|
||||
0f32
|
||||
} else {
|
||||
((block_values[block_values.len() - 1] as f64 - block_values[0] as f64)
|
||||
/ (block_num_vals - 1) as f64) as f32
|
||||
};
|
||||
let first_value = block_values[0];
|
||||
let mut positive_offset = 0;
|
||||
let mut max_delta = 0;
|
||||
for (pos, &current_value) in block_values[1..].iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64 + 1, slope);
|
||||
if computed_value > current_value {
|
||||
positive_offset = positive_offset.max(computed_value - current_value);
|
||||
} else {
|
||||
max_delta = max_delta.max(current_value - computed_value);
|
||||
}
|
||||
}
|
||||
let num_bits = compute_num_bits(max_delta + positive_offset);
|
||||
for (pos, current_value) in block_values.iter().enumerate() {
|
||||
let computed_value = predict_value(first_value, pos as u64, slope);
|
||||
let diff = (current_value + positive_offset) - computed_value;
|
||||
bit_packer.write(diff, num_bits, write)?;
|
||||
}
|
||||
bit_packer.flush(write)?;
|
||||
block_metadatas.push(BlockMetadata {
|
||||
first_value,
|
||||
positive_offset,
|
||||
slope,
|
||||
num_bits,
|
||||
});
|
||||
}
|
||||
bit_packer.close(write)?;
|
||||
|
||||
let footer = PiecewiseLinearFooter {
|
||||
num_vals: stats.num_vals,
|
||||
min_value: stats.min_value,
|
||||
max_value: stats.max_value,
|
||||
block_metadatas,
|
||||
};
|
||||
footer.serialize(write)?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn is_applicable(
|
||||
_fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> bool {
|
||||
if stats.num_vals < 10 * BLOCK_SIZE {
|
||||
return false;
|
||||
}
|
||||
// On serialization the offset is added to the actual value.
|
||||
// We need to make sure this won't run into overflow calculation issues.
|
||||
// For this we take the maximum theoretical offset and add it to the max value.
|
||||
// If this doesn't overflow, the algorithm should be fine.
|
||||
let theorethical_maximum_offset = stats.max_value - stats.min_value;
|
||||
if stats
|
||||
.max_value
|
||||
.checked_add(theorethical_maximum_offset)
|
||||
.is_none()
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
}
|
||||
|
||||
/// Estimation for linear interpolation is hard because you don't know
|
||||
/// where the local maxima are for the deviation of the calculated value and
|
||||
/// the offset is also unknown.
|
||||
fn estimate_compression_ratio(
|
||||
fastfield_accessor: &impl FastFieldDataAccess,
|
||||
stats: FastFieldStats,
|
||||
) -> f32 {
|
||||
let first_val_in_first_block = fastfield_accessor.get_val(0);
|
||||
let last_elem_in_first_chunk = BLOCK_SIZE.min(stats.num_vals);
|
||||
let last_val_in_first_block =
|
||||
fastfield_accessor.get_val(last_elem_in_first_chunk as u64 - 1);
|
||||
let slope = ((last_val_in_first_block as f64 - first_val_in_first_block as f64)
|
||||
/ (stats.num_vals - 1) as f64) as f32;
|
||||
|
||||
// Let's sample at 0%, 5%, 10% .. 95%, but only within the first block.
|
||||
let sample_positions = (0..20)
|
||||
.map(|pos| (last_elem_in_first_chunk as f32 / 100.0 * pos as f32 * 5.0) as usize)
|
||||
.collect::<Vec<_>>();
|
||||
|
||||
let max_distance = sample_positions
|
||||
.iter()
|
||||
.map(|&pos| {
|
||||
let calculated_value = predict_value(first_val_in_first_block, pos as u64, slope);
|
||||
let actual_value = fastfield_accessor.get_val(pos as u64);
|
||||
distance(calculated_value, actual_value)
|
||||
})
|
||||
.max()
|
||||
.unwrap();
|
||||
|
||||
// Estimate one block and extrapolate the cost to all blocks.
|
||||
// the theory would be that we don't have the actual max_distance, but we are close within
|
||||
// 50% threshold.
|
||||
// It is multiplied by 2 because in a worst-case scenario the line would be as much above as
|
||||
// below, so the offset would equal max_distance.
|
||||
let relative_max_value = (max_distance as f32 * 1.5) * 2.0;
|
||||
|
||||
let num_bits = compute_num_bits(relative_max_value as u64) as u64 * stats.num_vals as u64
|
||||
// function metadata per block
|
||||
+ 21 * (stats.num_vals / BLOCK_SIZE);
|
||||
let num_bits_uncompressed = 64 * stats.num_vals;
|
||||
num_bits as f32 / num_bits_uncompressed as f32
|
||||
}
|
||||
}
|
||||
|
||||
fn distance<T: Sub<Output = T> + Ord>(x: T, y: T) -> T {
|
||||
if x < y {
|
||||
y - x
|
||||
} else {
|
||||
x - y
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::tests::get_codec_test_data_sets;
|
||||
|
||||
fn create_and_validate(data: &[u64], name: &str) -> (f32, f32) {
|
||||
crate::tests::create_and_validate::<
|
||||
PiecewiseLinearFastFieldSerializer,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>(data, name)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_compression() {
|
||||
let data = (10..=6_000_u64).collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) =
|
||||
create_and_validate(&data, "simple monotonically large");
|
||||
assert!(actual_compression < 0.2);
|
||||
assert!(estimate < 0.20);
|
||||
assert!(estimate > 0.15);
|
||||
assert!(actual_compression > 0.001);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_with_codec_data_sets() {
|
||||
let data_sets = get_codec_test_data_sets();
|
||||
for (mut data, name) in data_sets {
|
||||
create_and_validate(&data, name);
|
||||
data.reverse();
|
||||
create_and_validate(&data, name);
|
||||
}
|
||||
}
|
||||
#[test]
|
||||
fn test_simple() {
|
||||
let data = (10..=20_u64).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "simple monotonically");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn border_cases_1() {
|
||||
let data = (0..1024).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn border_case_2() {
|
||||
let data = (0..1025).collect::<Vec<_>>();
|
||||
create_and_validate(&data, "border case");
|
||||
}
|
||||
#[test]
|
||||
fn rand() {
|
||||
for _ in 0..10 {
|
||||
let mut data = (5_000..20_000)
|
||||
.map(|_| rand::random::<u32>() as u64)
|
||||
.collect::<Vec<_>>();
|
||||
let (estimate, actual_compression) = create_and_validate(&data, "random");
|
||||
dbg!(estimate);
|
||||
dbg!(actual_compression);
|
||||
|
||||
data.reverse();
|
||||
create_and_validate(&data, "random");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.3.0"
|
||||
version = "0.2.0"
|
||||
edition = "2018"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.18.0"
|
||||
version = "0.15.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
|
||||
@@ -18,7 +18,7 @@ use crate::Occur;
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '~', '!', '\\', '*', ' ',
|
||||
];
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#;
|
||||
const ESCAPED_SPECIAL_CHARS_PATTERN: &str = r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*| )"#;
|
||||
|
||||
/// Parses a field_name
|
||||
/// A field name must have at least one character and be followed by a colon.
|
||||
@@ -34,8 +34,7 @@ fn field_name<'a>() -> impl Parser<&'a str, Output = String> {
|
||||
take_while(|c| !SPECIAL_CHARS.contains(&c)),
|
||||
),
|
||||
'\\',
|
||||
satisfy(|_| true), /* if the next character is not a special char, the \ will be treated
|
||||
* as the \ character. */
|
||||
satisfy(|c| SPECIAL_CHARS.contains(&c)),
|
||||
))
|
||||
.skip(char(':'))
|
||||
.map(|s| ESCAPED_SPECIAL_CHARS_RE.replace_all(&s, "$1").to_string())
|
||||
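To illustrate what the `$1` replacement above does, here is a small standalone sketch that assumes the `regex` crate and reuses the escaped-special-chars pattern from this diff (illustrative only, not the grammar's parser):

```rust
use regex::Regex;

fn main() {
    // Pattern matching a backslash followed by one of the special characters.
    let re = Regex::new(r#"\\(\+|\^|`|:|\{|\}|"|\[|\]|\(|\)|\~|!|\\|\*|\s)"#).unwrap();
    // The escaped character is kept, the escaping backslash is dropped.
    assert_eq!(re.replace_all(r"my\ field\ name", "$1"), "my field name");
    assert_eq!(re.replace_all(r"\(1\+1\)", "$1"), "(1+1)");
    // A backslash before a non-special character is left untouched.
    assert_eq!(re.replace_all(r"my\field", "$1"), r"my\field");
}
```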
@@ -517,27 +516,15 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_field_name() {
|
||||
fn test_field_name() -> TestParseResult {
|
||||
assert_eq!(
|
||||
super::field_name().parse(".my.field.name:a"),
|
||||
Ok((".my.field.name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\ field:a"#),
|
||||
Ok(("my field".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"にんじん:a"#),
|
||||
Ok(("にんじん".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("my\\ field\\ name:a"),
|
||||
Ok(("my field name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"my\field:a"#),
|
||||
Ok((r#"my\field"#.to_string(), "a"))
|
||||
);
|
||||
assert!(super::field_name().parse("my field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field_name().parse("\\(1\\+1\\):2"),
|
||||
@@ -547,21 +534,14 @@ mod test {
|
||||
super::field_name().parse("my_field_name:a"),
|
||||
Ok(("my_field_name".to_string(), "a"))
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse("myfield.b:hello").unwrap(),
|
||||
("myfield.b".to_string(), "hello")
|
||||
);
|
||||
assert_eq!(
|
||||
super::field_name().parse(r#"myfield\.b:hello"#).unwrap(),
|
||||
(r#"myfield\.b"#.to_string(), "hello")
|
||||
);
|
||||
assert!(super::field_name().parse("my_field_name").is_err());
|
||||
assert!(super::field_name().parse(":a").is_err());
|
||||
assert!(super::field_name().parse("-my_field:a").is_err());
|
||||
assert_eq!(
|
||||
super::field_name().parse("_my_field:a"),
|
||||
Ok(("_my_field".to_string(), "a"))
|
||||
super::field_name().parse("_my_field:a")?,
|
||||
("_my_field".to_string(), "a")
|
||||
);
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -48,8 +48,8 @@ use std::collections::{HashMap, HashSet};
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::bucket::HistogramAggregation;
|
||||
pub use super::bucket::RangeAggregation;
|
||||
use super::bucket::{HistogramAggregation, TermsAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::VecWithNames;
|
||||
|
||||
@@ -100,27 +100,12 @@ pub(crate) struct BucketAggregationInternal {
|
||||
}
|
||||
|
||||
impl BucketAggregationInternal {
|
||||
pub(crate) fn as_histogram(&self) -> Option<&HistogramAggregation> {
|
||||
pub(crate) fn as_histogram(&self) -> &HistogramAggregation {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Histogram(histogram) => Some(histogram),
|
||||
_ => None,
|
||||
BucketAggregationType::Range(_) => panic!("unexpected aggregation"),
|
||||
BucketAggregationType::Histogram(histogram) => histogram,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
|
||||
match &self.bucket_agg {
|
||||
BucketAggregationType::Terms(terms) => Some(terms),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract all fields where the term dictionary is used in the tree.
|
||||
pub fn get_term_dict_field_names(aggs: &Aggregations) -> HashSet<String> {
|
||||
let mut term_dict_field_names = Default::default();
|
||||
for el in aggs.values() {
|
||||
el.get_term_dict_field_names(&mut term_dict_field_names)
|
||||
}
|
||||
term_dict_field_names
|
||||
}
|
||||
|
||||
/// Extract all fast field names used in the tree.
|
||||
@@ -145,12 +130,6 @@ pub enum Aggregation {
|
||||
}
|
||||
|
||||
impl Aggregation {
|
||||
fn get_term_dict_field_names(&self, term_field_names: &mut HashSet<String>) {
|
||||
if let Aggregation::Bucket(bucket) = self {
|
||||
bucket.get_term_dict_field_names(term_field_names)
|
||||
}
|
||||
}
|
||||
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
|
||||
@@ -183,12 +162,6 @@ pub struct BucketAggregation {
|
||||
}
|
||||
|
||||
impl BucketAggregation {
|
||||
fn get_term_dict_field_names(&self, term_dict_field_names: &mut HashSet<String>) {
|
||||
if let BucketAggregationType::Terms(terms) = &self.bucket_agg {
|
||||
term_dict_field_names.insert(terms.field.to_string());
|
||||
}
|
||||
term_dict_field_names.extend(get_term_dict_field_names(&self.sub_aggregation));
|
||||
}
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
self.bucket_agg.get_fast_field_names(fast_field_names);
|
||||
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
|
||||
@@ -204,15 +177,11 @@ pub enum BucketAggregationType {
|
||||
/// Put data into buckets of user-defined ranges.
|
||||
#[serde(rename = "histogram")]
|
||||
Histogram(HistogramAggregation),
|
||||
/// Put data into buckets of terms.
|
||||
#[serde(rename = "terms")]
|
||||
Terms(TermsAggregation),
|
||||
}
|
||||
|
||||
impl BucketAggregationType {
|
||||
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
|
||||
match self {
|
||||
BucketAggregationType::Terms(terms) => fast_field_names.insert(terms.field.to_string()),
|
||||
BucketAggregationType::Range(range) => fast_field_names.insert(range.field.to_string()),
|
||||
BucketAggregationType::Histogram(histogram) => {
|
||||
fast_field_names.insert(histogram.field.to_string())
|
||||
|
||||
@@ -1,16 +1,12 @@
|
||||
//! This enhances the request tree with access to fast fields and metadata.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation, TermsAggregation};
|
||||
use super::bucket::{HistogramAggregation, RangeAggregation};
|
||||
use super::metric::{AverageAggregation, StatsAggregation};
|
||||
use super::VecWithNames;
|
||||
use crate::fastfield::{
|
||||
type_and_cardinality, DynamicFastFieldReader, FastType, MultiValuedFastFieldReader,
|
||||
};
|
||||
use crate::fastfield::{type_and_cardinality, DynamicFastFieldReader, FastType};
|
||||
use crate::schema::{Cardinality, Type};
|
||||
use crate::{InvertedIndexReader, SegmentReader, TantivyError};
|
||||
use crate::{SegmentReader, TantivyError};
|
||||
|
||||
#[derive(Clone, Default)]
|
||||
pub(crate) struct AggregationsWithAccessor {
|
||||
@@ -31,32 +27,11 @@ impl AggregationsWithAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub(crate) enum FastFieldAccessor {
|
||||
Multi(MultiValuedFastFieldReader<u64>),
|
||||
Single(DynamicFastFieldReader<u64>),
|
||||
}
|
||||
impl FastFieldAccessor {
|
||||
pub fn as_single(&self) -> Option<&DynamicFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(_) => None,
|
||||
FastFieldAccessor::Single(reader) => Some(reader),
|
||||
}
|
||||
}
|
||||
pub fn as_multi(&self) -> Option<&MultiValuedFastFieldReader<u64>> {
|
||||
match self {
|
||||
FastFieldAccessor::Multi(reader) => Some(reader),
|
||||
FastFieldAccessor::Single(_) => None,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub struct BucketAggregationWithAccessor {
|
||||
/// In general there can be buckets without fast field access, e.g. buckets that are created
|
||||
/// based on search terms. So eventually this needs to be Option or moved.
|
||||
pub(crate) accessor: FastFieldAccessor,
|
||||
pub(crate) inverted_index: Option<Arc<InvertedIndexReader>>,
|
||||
pub(crate) accessor: DynamicFastFieldReader<u64>,
|
||||
pub(crate) field_type: Type,
|
||||
pub(crate) bucket_agg: BucketAggregationType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
@@ -68,25 +43,14 @@ impl BucketAggregationWithAccessor {
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
) -> crate::Result<BucketAggregationWithAccessor> {
|
||||
let mut inverted_index = None;
|
||||
let (accessor, field_type) = match &bucket {
|
||||
BucketAggregationType::Range(RangeAggregation {
|
||||
field: field_name,
|
||||
ranges: _,
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
}) => get_ff_reader_and_validate(reader, field_name)?,
|
||||
BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: field_name, ..
|
||||
}) => get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?,
|
||||
BucketAggregationType::Terms(TermsAggregation {
|
||||
field: field_name, ..
|
||||
}) => {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
inverted_index = Some(reader.inverted_index(field)?);
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::MultiValues)?
|
||||
}
|
||||
}) => get_ff_reader_and_validate(reader, field_name)?,
|
||||
};
|
||||
let sub_aggregation = sub_aggregation.clone();
|
||||
Ok(BucketAggregationWithAccessor {
|
||||
@@ -94,7 +58,6 @@ impl BucketAggregationWithAccessor {
|
||||
field_type,
|
||||
sub_aggregation: get_aggs_with_accessor_and_validate(&sub_aggregation, reader)?,
|
||||
bucket_agg: bucket.clone(),
|
||||
inverted_index,
|
||||
})
|
||||
}
|
||||
}
|
||||
@@ -115,14 +78,10 @@ impl MetricAggregationWithAccessor {
|
||||
match &metric {
|
||||
MetricAggregation::Average(AverageAggregation { field: field_name })
|
||||
| MetricAggregation::Stats(StatsAggregation { field: field_name }) => {
|
||||
let (accessor, field_type) =
|
||||
get_ff_reader_and_validate(reader, field_name, Cardinality::SingleValue)?;
|
||||
let (accessor, field_type) = get_ff_reader_and_validate(reader, field_name)?;
|
||||
|
||||
Ok(MetricAggregationWithAccessor {
|
||||
accessor: accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality")
|
||||
.clone(),
|
||||
accessor,
|
||||
field_type,
|
||||
metric: metric.clone(),
|
||||
})
|
||||
@@ -159,45 +118,32 @@ pub(crate) fn get_aggs_with_accessor_and_validate(
|
||||
))
|
||||
}
|
||||
|
||||
/// Get the fast field reader with the given cardinality.
|
||||
fn get_ff_reader_and_validate(
|
||||
reader: &SegmentReader,
|
||||
field_name: &str,
|
||||
cardinality: Cardinality,
|
||||
) -> crate::Result<(FastFieldAccessor, Type)> {
|
||||
) -> crate::Result<(DynamicFastFieldReader<u64>, Type)> {
|
||||
let field = reader
|
||||
.schema()
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| TantivyError::FieldNotFound(field_name.to_string()))?;
|
||||
let field_type = reader.schema().get_field_entry(field).field_type();
|
||||
|
||||
if let Some((ff_type, field_cardinality)) = type_and_cardinality(field_type) {
|
||||
if ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(
|
||||
"Unsupported field type date in aggregation".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
if cardinality != field_cardinality {
|
||||
if let Some((ff_type, cardinality)) = type_and_cardinality(field_type) {
|
||||
if cardinality == Cardinality::MultiValues || ff_type == FastType::Date {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Invalid field cardinality on field {} expected {:?}, but got {:?}",
|
||||
field_name, cardinality, field_cardinality
|
||||
"Invalid field type in aggregation {:?}, only Cardinality::SingleValue supported",
|
||||
field_type.value_type()
|
||||
)));
|
||||
}
|
||||
} else {
|
||||
return Err(TantivyError::InvalidArgument(format!(
|
||||
"Only fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
"Only single value fast fields of type f64, u64, i64 are supported, but got {:?} ",
|
||||
field_type.value_type()
|
||||
)));
|
||||
};
|
||||
|
||||
let ff_fields = reader.fast_fields();
|
||||
match cardinality {
|
||||
Cardinality::SingleValue => ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Single(field), field_type.value_type())),
|
||||
Cardinality::MultiValues => ff_fields
|
||||
.u64s_lenient(field)
|
||||
.map(|field| (FastFieldAccessor::Multi(field), field_type.value_type())),
|
||||
}
|
||||
ff_fields
|
||||
.u64_lenient(field)
|
||||
.map(|field| (field, field_type.value_type()))
|
||||
}
|
||||
|
||||
@@ -7,134 +7,86 @@
|
||||
use std::cmp::Ordering;
|
||||
use std::collections::HashMap;
|
||||
|
||||
use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{
|
||||
Aggregations, AggregationsInternal, BucketAggregationInternal, MetricAggregation,
|
||||
};
|
||||
use super::bucket::{intermediate_buckets_to_final_buckets, GetDocCount};
|
||||
use super::agg_req::{Aggregations, AggregationsInternal, BucketAggregationInternal};
|
||||
use super::bucket::intermediate_buckets_to_final_buckets;
|
||||
use super::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
IntermediateMetricResult, IntermediateRangeBucketEntry,
|
||||
};
|
||||
use super::metric::{SingleMetricResult, Stats};
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::TantivyError;
|
||||
use super::Key;
|
||||
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// The final aggregation result.
|
||||
pub struct AggregationResults(pub HashMap<String, AggregationResult>);
|
||||
|
||||
impl AggregationResults {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
if let Some(agg) = self.0.get(name) {
|
||||
agg.get_value_from_aggregation(name, agg_property)
|
||||
} else {
|
||||
// Validation is done during request parsing, so we can't reach this state.
|
||||
Err(TantivyError::InternalError(format!(
|
||||
"Can't find aggregation {:?} in sub_aggregations",
|
||||
name
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
/// Convert an intermediate result and its aggregation request to the final result.
|
||||
pub fn from_intermediate_and_req(
|
||||
results: IntermediateAggregationResults,
|
||||
agg: Aggregations,
|
||||
) -> crate::Result<Self> {
|
||||
) -> Self {
|
||||
AggregationResults::from_intermediate_and_req_internal(results, &(agg.into()))
|
||||
}
|
||||
|
||||
/// Convert an intermediate result and its aggregation request to the final result.
|
||||
///
|
||||
/// Internal function, CollectorAggregations is used instead of Aggregations, which is optimized
|
||||
/// for internal processing, by splitting metrics and buckets into separate groups.
|
||||
pub(crate) fn from_intermediate_and_req_internal(
|
||||
intermediate_results: IntermediateAggregationResults,
|
||||
/// for internal processing
|
||||
fn from_intermediate_and_req_internal(
|
||||
results: IntermediateAggregationResults,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
) -> Self {
|
||||
let mut result = HashMap::default();
|
||||
|
||||
// Important assumption:
|
||||
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
|
||||
// request
|
||||
let mut results: HashMap<String, AggregationResult> = HashMap::new();
|
||||
|
||||
if let Some(buckets) = intermediate_results.buckets {
|
||||
add_coverted_final_buckets_to_result(&mut results, buckets, &req.buckets)?
|
||||
if let Some(buckets) = results.buckets {
|
||||
result.extend(buckets.into_iter().zip(req.buckets.values()).map(
|
||||
|((key, bucket), req)| {
|
||||
(
|
||||
key,
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
|
||||
bucket, req,
|
||||
)),
|
||||
)
|
||||
},
|
||||
));
|
||||
} else {
|
||||
// When there are no buckets, we create empty buckets, so that the serialized json
|
||||
// format is constant
|
||||
add_empty_final_buckets_to_result(&mut results, &req.buckets)?
|
||||
};
|
||||
|
||||
if let Some(metrics) = intermediate_results.metrics {
|
||||
add_converted_final_metrics_to_result(&mut results, metrics);
|
||||
} else {
|
||||
// When there are no metrics, we create empty metric results, so that the serialized
|
||||
// json format is constant
|
||||
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
|
||||
result.extend(req.buckets.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(
|
||||
empty_bucket,
|
||||
req,
|
||||
)),
|
||||
)
|
||||
}));
|
||||
}
|
||||
Ok(Self(results))
|
||||
|
||||
if let Some(metrics) = results.metrics {
|
||||
result.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
} else {
|
||||
result.extend(req.metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
}
|
||||
Self(result)
|
||||
}
|
||||
}
|
||||
|
||||
fn add_converted_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
metrics: VecWithNames<IntermediateMetricResult>,
|
||||
) {
|
||||
results.extend(
|
||||
metrics
|
||||
.into_iter()
|
||||
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
|
||||
);
|
||||
}
|
||||
|
||||
fn add_empty_final_metrics_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_metrics: &VecWithNames<MetricAggregation>,
|
||||
) -> crate::Result<()> {
|
||||
results.extend(req_metrics.iter().map(|(key, req)| {
|
||||
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
|
||||
(
|
||||
key.to_string(),
|
||||
AggregationResult::MetricResult(empty_bucket.into()),
|
||||
)
|
||||
}));
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_empty_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
let requested_buckets = req_buckets.iter();
|
||||
for (key, req) in requested_buckets {
|
||||
let empty_bucket = AggregationResult::BucketResult(BucketResult::empty_from_req(req)?);
|
||||
results.insert(key.to_string(), empty_bucket);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn add_coverted_final_buckets_to_result(
|
||||
results: &mut HashMap<String, AggregationResult>,
|
||||
buckets: VecWithNames<IntermediateBucketResult>,
|
||||
req_buckets: &VecWithNames<BucketAggregationInternal>,
|
||||
) -> crate::Result<()> {
|
||||
assert_eq!(buckets.len(), req_buckets.len());
|
||||
|
||||
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
|
||||
for ((key, bucket), req) in buckets_with_request {
|
||||
let result =
|
||||
AggregationResult::BucketResult(BucketResult::from_intermediate_and_req(bucket, req)?);
|
||||
results.insert(key, result);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// An aggregation is either a bucket or a metric.
|
||||
@@ -145,23 +97,6 @@ pub enum AggregationResult {
|
||||
MetricResult(MetricResult),
|
||||
}
|
||||
|
||||
impl AggregationResult {
|
||||
pub(crate) fn get_value_from_aggregation(
|
||||
&self,
|
||||
_name: &str,
|
||||
agg_property: &str,
|
||||
) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
AggregationResult::BucketResult(_bucket) => Err(TantivyError::InternalError(
|
||||
"Tried to retrieve value from bucket aggregation. This is not supported and \
|
||||
should not happen during collection, but should be caught during validation"
|
||||
.to_string(),
|
||||
)),
|
||||
AggregationResult::MetricResult(metric) => metric.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
#[serde(untagged)]
|
||||
/// MetricResult
|
||||
@@ -172,14 +107,6 @@ pub enum MetricResult {
|
||||
Stats(Stats),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match self {
|
||||
MetricResult::Average(avg) => Ok(avg.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
}
|
||||
}
|
||||
}
|
||||
impl From<IntermediateMetricResult> for MetricResult {
|
||||
fn from(metric: IntermediateMetricResult) -> Self {
|
||||
match metric {
|
||||
@@ -213,64 +140,39 @@ pub enum BucketResult {
|
||||
/// See [HistogramAggregation](super::bucket::HistogramAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
},
|
||||
/// This is the term result
|
||||
Terms {
|
||||
/// The buckets.
|
||||
///
|
||||
/// See [TermsAggregation](super::bucket::TermsAggregation)
|
||||
buckets: Vec<BucketEntry>,
|
||||
/// The number of documents that didn't make it into the top N due to shard_size or size.
|
||||
sum_other_doc_count: u64,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
/// The upper bound error for the doc count of each term.
|
||||
doc_count_error_upper_bound: Option<u64>,
|
||||
},
|
||||
}
|
||||
|
||||
impl BucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationInternal) -> crate::Result<Self> {
|
||||
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
|
||||
BucketResult::from_intermediate_and_req(empty_bucket, req)
|
||||
}
|
||||
|
||||
fn from_intermediate_and_req(
|
||||
bucket_result: IntermediateBucketResult,
|
||||
req: &BucketAggregationInternal,
|
||||
) -> crate::Result<Self> {
|
||||
) -> Self {
|
||||
match bucket_result {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_res
|
||||
.buckets
|
||||
IntermediateBucketResult::Range(range_map) => {
|
||||
let mut buckets: Vec<RangeBucketEntry> = range_map
|
||||
.into_iter()
|
||||
.map(|(_, bucket)| {
|
||||
RangeBucketEntry::from_intermediate_and_req(bucket, &req.sub_aggregation)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
.collect_vec();
|
||||
|
||||
buckets.sort_by(|left, right| {
|
||||
// TODO use total_cmp next stable rust release
|
||||
left.from
|
||||
buckets.sort_by(|a, b| {
|
||||
a.from
|
||||
.unwrap_or(f64::MIN)
|
||||
.partial_cmp(&right.from.unwrap_or(f64::MIN))
|
||||
.partial_cmp(&b.from.unwrap_or(f64::MIN))
|
||||
.unwrap_or(Ordering::Equal)
|
||||
});
|
||||
Ok(BucketResult::Range { buckets })
|
||||
BucketResult::Range { buckets }
|
||||
}
|
||||
IntermediateBucketResult::Histogram { buckets } => {
|
||||
let buckets = intermediate_buckets_to_final_buckets(
|
||||
buckets,
|
||||
req.as_histogram()
|
||||
.expect("unexpected aggregation, expected histogram aggregation"),
|
||||
req.as_histogram(),
|
||||
&req.sub_aggregation,
|
||||
)?;
|
||||
);
|
||||
|
||||
Ok(BucketResult::Histogram { buckets })
|
||||
BucketResult::Histogram { buckets }
|
||||
}
|
||||
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
|
||||
req.as_term()
|
||||
.expect("unexpected aggregation, expected term aggregation"),
|
||||
&req.sub_aggregation,
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -308,7 +210,7 @@ pub struct BucketEntry {
|
||||
/// Number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
#[serde(flatten)]
|
||||
/// Sub-aggregations in this bucket.
|
||||
/// sub-aggregations in this bucket.
|
||||
pub sub_aggregation: AggregationResults,
|
||||
}
|
||||
|
||||
@@ -316,25 +218,15 @@ impl BucketEntry {
|
||||
pub(crate) fn from_intermediate_and_req(
|
||||
entry: IntermediateHistogramBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
Ok(BucketEntry {
|
||||
) -> Self {
|
||||
BucketEntry {
|
||||
key: Key::F64(entry.key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
impl GetDocCount for &BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
}
|
||||
}
|
||||
impl GetDocCount for BucketEntry {
|
||||
fn doc_count(&self) -> u64 {
|
||||
self.doc_count
|
||||
),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -389,16 +281,16 @@ impl RangeBucketEntry {
|
||||
fn from_intermediate_and_req(
|
||||
entry: IntermediateRangeBucketEntry,
|
||||
req: &AggregationsInternal,
|
||||
) -> crate::Result<Self> {
|
||||
Ok(RangeBucketEntry {
|
||||
) -> Self {
|
||||
RangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
req,
|
||||
)?,
|
||||
),
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -13,7 +13,9 @@ use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResults, IntermediateBucketResult, IntermediateHistogramBucketEntry,
|
||||
};
|
||||
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentHistogramBucketEntry,
|
||||
};
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
@@ -56,7 +58,7 @@ use crate::{DocId, TantivyError};
|
||||
/// "prices": {
|
||||
/// "histogram": {
|
||||
/// "field": "price",
|
||||
/// "interval": 10
|
||||
/// "interval": 10,
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
@@ -69,17 +71,16 @@ use crate::{DocId, TantivyError};
|
||||
pub struct HistogramAggregation {
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// The interval to chunk your data range. Each bucket spans a value range of [0..interval).
|
||||
/// The interval to chunk your data range. The buckets span ranges of [0..interval).
|
||||
/// Must be a positive value.
|
||||
pub interval: f64,
|
||||
/// The interval implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
|
||||
/// 1))`.
|
||||
///
|
||||
/// Offset makes it possible to shift this grid into
|
||||
/// `[offset + interval * k, offset + interval * (k + 1))`. Offset has to be in the range [0,
|
||||
/// interval).
|
||||
/// Offset makes it possible to shift this grid into `[offset + interval * k, offset + interval
|
||||
/// * (k + 1)) Offset has to be in the range [0, interval).
|
||||
///
|
||||
/// As an example, if there are two documents with value 9 and 12 and interval 10.0, they would
|
||||
/// As an example. If there are two documents with value 8 and 12 and interval 10.0, they would
|
||||
/// fall into the buckets with the key 0 and 10.
|
||||
/// With offset 5 and interval 10, they would both fall into the bucket with the key 5 and the
|
||||
/// range [5..15)
|
||||
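A hedged sketch of the bucketing rule described above; `bucket_key` is an illustrative helper, not the collector's actual fast-field code path:

```rust
// Values fall into the grid [offset + interval * k, offset + interval * (k + 1)).
fn bucket_key(value: f64, interval: f64, offset: f64) -> f64 {
    offset + interval * ((value - offset) / interval).floor()
}

fn main() {
    // Without an offset, 9 and 12 land in the buckets with keys 0 and 10.
    assert_eq!(bucket_key(9.0, 10.0, 0.0), 0.0);
    assert_eq!(bucket_key(12.0, 10.0, 0.0), 10.0);
    // With offset 5 both documents land in the bucket [5..15).
    assert_eq!(bucket_key(9.0, 10.0, 5.0), 5.0);
    assert_eq!(bucket_key(12.0, 10.0, 5.0), 5.0);
}
```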
@@ -92,22 +93,6 @@ pub struct HistogramAggregation {
|
||||
///
|
||||
/// hard_bounds only limits the buckets, to force a range set both extended_bounds and
|
||||
/// hard_bounds to the same range.
|
||||
///
|
||||
/// ## Example
|
||||
/// ```json
|
||||
/// {
|
||||
/// "prices": {
|
||||
/// "histogram": {
|
||||
/// "field": "price",
|
||||
/// "interval": 10,
|
||||
/// "hard_bounds": {
|
||||
/// "min": 0,
|
||||
/// "max": 100
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub hard_bounds: Option<HistogramBounds>,
|
||||
/// Can be set to extend your bounds. The range of the buckets is by default defined by the
|
||||
/// data range of the values of the documents. As the name suggests, this can only be used to
|
||||
@@ -174,27 +159,6 @@ impl HistogramBounds {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentHistogramBucketEntry {
|
||||
pub key: f64,
|
||||
pub doc_count: u64,
|
||||
}
|
||||
|
||||
impl SegmentHistogramBucketEntry {
|
||||
pub(crate) fn into_intermediate_bucket_entry(
|
||||
self,
|
||||
sub_aggregation: SegmentAggregationResultsCollector,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateHistogramBucketEntry> {
|
||||
Ok(IntermediateHistogramBucketEntry {
|
||||
key: self.key,
|
||||
doc_count: self.doc_count,
|
||||
sub_aggregation: sub_aggregation
|
||||
.into_intermediate_aggregations_result(agg_with_accessor)?,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
/// The collector puts values from the fast field into the correct buckets and does a conversion to
|
||||
/// the correct datatype.
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
@@ -210,10 +174,7 @@ pub struct SegmentHistogramCollector {
|
||||
}
|
||||
|
||||
impl SegmentHistogramCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
let mut buckets = Vec::with_capacity(
|
||||
self.buckets
|
||||
.iter()
|
||||
@@ -227,20 +188,13 @@ impl SegmentHistogramCollector {
|
||||
//
|
||||
// Empty buckets may be added later again in the final result, depending on the request.
|
||||
if let Some(sub_aggregations) = self.sub_aggregations {
|
||||
for bucket_res in self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.zip(sub_aggregations.into_iter())
|
||||
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
|
||||
.map(|(bucket, sub_aggregation)| {
|
||||
bucket.into_intermediate_bucket_entry(
|
||||
sub_aggregation,
|
||||
&agg_with_accessor.sub_aggregation,
|
||||
)
|
||||
})
|
||||
{
|
||||
buckets.push(bucket_res?);
|
||||
}
|
||||
buckets.extend(
|
||||
self.buckets
|
||||
.into_iter()
|
||||
.zip(sub_aggregations.into_iter())
|
||||
.filter(|(bucket, _sub_aggregation)| bucket.doc_count != 0)
|
||||
.map(|(bucket, sub_aggregation)| (bucket, sub_aggregation).into()),
|
||||
)
|
||||
} else {
|
||||
buckets.extend(
|
||||
self.buckets
|
||||
@@ -250,7 +204,7 @@ impl SegmentHistogramCollector {
|
||||
);
|
||||
};
|
||||
|
||||
Ok(IntermediateBucketResult::Histogram { buckets })
|
||||
IntermediateBucketResult::Histogram { buckets }
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
@@ -319,16 +273,12 @@ impl SegmentHistogramCollector {
|
||||
let get_bucket_num =
|
||||
|val| (get_bucket_num_f64(val, interval, offset) as i64 - first_bucket_num) as usize;
|
||||
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
for docs in iter.by_ref() {
|
||||
let val0 = self.f64_from_fastfield_u64(accessor.get(docs[0]));
|
||||
let val1 = self.f64_from_fastfield_u64(accessor.get(docs[1]));
|
||||
let val2 = self.f64_from_fastfield_u64(accessor.get(docs[2]));
|
||||
let val3 = self.f64_from_fastfield_u64(accessor.get(docs[3]));
|
||||
let val0 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[0]));
|
||||
let val1 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[1]));
|
||||
let val2 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[2]));
|
||||
let val3 = self.f64_from_fastfield_u64(bucket_with_accessor.accessor.get(docs[3]));
|
||||
|
||||
let bucket_pos0 = get_bucket_num(val0);
|
||||
let bucket_pos1 = get_bucket_num(val1);
|
||||
@@ -365,7 +315,8 @@ impl SegmentHistogramCollector {
|
||||
);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = f64_from_fastfield_u64(accessor.get(*doc), &self.field_type);
|
||||
let val =
|
||||
f64_from_fastfield_u64(bucket_with_accessor.accessor.get(*doc), &self.field_type);
|
||||
if !bounds.contains(val) {
|
||||
continue;
|
||||
}
|
||||
@@ -442,7 +393,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
) -> Vec<BucketEntry> {
|
||||
// Generate the full list of buckets without gaps.
|
||||
//
|
||||
// The bounds are the min max from the current buckets, optionally extended by
|
||||
@@ -485,7 +436,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
.map(|intermediate_bucket| {
|
||||
BucketEntry::from_intermediate_and_req(intermediate_bucket, sub_aggregation)
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
.collect_vec()
|
||||
}
|
||||
|
||||
// Convert to BucketEntry
|
||||
@@ -493,7 +444,7 @@ pub(crate) fn intermediate_buckets_to_final_buckets(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &AggregationsInternal,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
) -> Vec<BucketEntry> {
|
||||
if histogram_req.min_doc_count() == 0 {
|
||||
// With min_doc_count != 0, we may need to add buckets, so that there are no
|
||||
// gaps, since intermediate result does not contain empty buckets (filtered to
|
||||
@@ -505,7 +456,7 @@ pub(crate) fn intermediate_buckets_to_final_buckets(
|
||||
.into_iter()
|
||||
.filter(|bucket| bucket.doc_count >= histogram_req.min_doc_count())
|
||||
.map(|bucket| BucketEntry::from_intermediate_and_req(bucket, sub_aggregation))
|
||||
.collect::<crate::Result<Vec<_>>>()
|
||||
.collect_vec()
|
||||
}
|
||||
}
|
||||
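Roughly, the gap filling referred to here works as sketched below; `fill_gaps` is an illustrative standalone function, not the real implementation, which also honors extended/hard bounds and carries sub-aggregations:

```rust
// With min_doc_count == 0 the final result contains every bucket on the grid,
// even those the sparse intermediate result skipped.
fn fill_gaps(sparse: &[(f64, u64)], interval: f64) -> Vec<(f64, u64)> {
    let first = sparse.first().map(|(k, _)| *k).unwrap_or(0.0);
    let last = sparse.last().map(|(k, _)| *k).unwrap_or(0.0);
    let mut full = Vec::new();
    let mut key = first;
    while key <= last {
        let doc_count = sparse
            .iter()
            .find(|(k, _)| *k == key)
            .map(|(_, c)| *c)
            .unwrap_or(0);
        full.push((key, doc_count));
        key += interval;
    }
    full
}

fn main() {
    // Buckets 10.0 and 20.0 were empty in the segments and are restored here.
    assert_eq!(
        fill_gaps(&[(0.0, 2), (30.0, 1)], 10.0),
        vec![(0.0, 2), (10.0, 0), (20.0, 0), (30.0, 1)]
    );
}
```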
|
||||
@@ -679,9 +630,41 @@ mod tests {
|
||||
};
|
||||
use crate::aggregation::metric::{AverageAggregation, StatsAggregation};
|
||||
use crate::aggregation::tests::{
|
||||
exec_request, exec_request_with_query, get_test_index_2_segments,
|
||||
get_test_index_from_values, get_test_index_with_num_docs,
|
||||
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
|
||||
};
|
||||
use crate::aggregation::AggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::IndexRecordOption;
|
||||
use crate::{Index, Term};
|
||||
|
||||
fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
|
||||
exec_request_with_query(agg_req, index, None)
|
||||
}
|
||||
fn exec_request_with_query(
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = if let Some((field, term)) = query {
|
||||
let text_field = reader.searcher().schema().get_field(field).unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
searcher.search(&term_query, &collector)?
|
||||
} else {
|
||||
searcher.search(&AllQuery, &collector)?
|
||||
};
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_test_crooked_values() -> crate::Result<()> {
|
||||
@@ -1364,29 +1347,4 @@ mod tests {
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn histogram_invalid_request() -> crate::Result<()> {
|
||||
let index = get_test_index_2_segments(true)?;
|
||||
|
||||
let agg_req: Aggregations = vec![(
|
||||
"histogram".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
|
||||
field: "score_f64".to_string(),
|
||||
interval: 0.0,
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let agg_res = exec_request(agg_req, &index);
|
||||
|
||||
assert!(agg_res.is_err());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -9,132 +9,8 @@
|
||||
|
||||
mod histogram;
|
||||
mod range;
|
||||
mod term_agg;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub(crate) use histogram::SegmentHistogramCollector;
|
||||
pub use histogram::*;
|
||||
pub(crate) use range::SegmentRangeCollector;
|
||||
pub use range::*;
|
||||
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
|
||||
pub use term_agg::*;
|
||||
|
||||
/// Order for buckets in a bucket aggregation.
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub enum Order {
|
||||
/// Asc order
|
||||
#[serde(rename = "asc")]
|
||||
Asc,
|
||||
/// Desc order
|
||||
#[serde(rename = "desc")]
|
||||
Desc,
|
||||
}
|
||||
|
||||
impl Default for Order {
|
||||
fn default() -> Self {
|
||||
Order::Desc
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
/// Order property by which to apply the order
|
||||
pub enum OrderTarget {
|
||||
/// The key of the bucket
|
||||
Key,
|
||||
/// The doc count of the bucket
|
||||
Count,
|
||||
/// Order by the value of the sub-aggregation metric identified by the given `String`.
|
||||
    ///
    /// Only single value metrics are supported currently
    SubAggregation(String),
}

impl Default for OrderTarget {
    fn default() -> Self {
        OrderTarget::Count
    }
}
impl From<&str> for OrderTarget {
    fn from(val: &str) -> Self {
        match val {
            "_key" => OrderTarget::Key,
            "_count" => OrderTarget::Count,
            _ => OrderTarget::SubAggregation(val.to_string()),
        }
    }
}

impl ToString for OrderTarget {
    fn to_string(&self) -> String {
        match self {
            OrderTarget::Key => "_key".to_string(),
            OrderTarget::Count => "_count".to_string(),
            OrderTarget::SubAggregation(agg) => agg.to_string(),
        }
    }
}

/// Set the order. target is either "_count", "_key", or the name of
/// a metric sub_aggregation.
///
/// De/Serializes to elasticsearch compatible JSON.
///
/// Examples in JSON format:
/// { "_count": "asc" }
/// { "_key": "asc" }
/// { "average_price": "asc" }
#[derive(Clone, Default, Debug, PartialEq)]
pub struct CustomOrder {
    /// The target property by which to sort by
    pub target: OrderTarget,
    /// The order asc or desc
    pub order: Order,
}

impl Serialize for CustomOrder {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where S: Serializer {
        let map: HashMap<String, Order> =
            std::iter::once((self.target.to_string(), self.order)).collect();
        map.serialize(serializer)
    }
}

impl<'de> Deserialize<'de> for CustomOrder {
    fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
    where D: Deserializer<'de> {
        HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
            if let Some((key, value)) = map.into_iter().next() {
                Ok(CustomOrder {
                    target: key.as_str().into(),
                    order: value,
                })
            } else {
                Err(de::Error::custom(
                    "unexpected empty map in order".to_string(),
                ))
            }
        })
    }
}

#[test]
fn custom_order_serde_test() {
    let order = CustomOrder {
        target: OrderTarget::Key,
        order: Order::Desc,
    };

    let order_str = serde_json::to_string(&order).unwrap();
    assert_eq!(order_str, "{\"_key\":\"desc\"}");
    let order_deser = serde_json::from_str(&order_str).unwrap();

    assert_eq!(order, order_deser);

    let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
    assert!(order_deser.is_err());

    let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
    assert!(order_deser.is_err());
}
@@ -1,4 +1,3 @@
use std::fmt::Debug;
use std::ops::Range;

use serde::{Deserialize, Serialize};
@@ -6,10 +5,10 @@ use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
    AggregationsWithAccessor, BucketAggregationWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
    IntermediateBucketResult, IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
use crate::aggregation::intermediate_agg_result::IntermediateBucketResult;
use crate::aggregation::segment_agg_result::{
    SegmentAggregationResultsCollector, SegmentRangeBucketEntry,
};
use crate::aggregation::segment_agg_result::SegmentAggregationResultsCollector;
use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, Key};
use crate::fastfield::FastFieldReader;
use crate::schema::Type;
@@ -39,12 +38,12 @@ use crate::{DocId, TantivyError};
/// # Request JSON Format
/// ```json
/// {
///     "my_ranges": {
///         "range": {
///             "field": "score",
///             "ranges": [
///                 { "to": 3.0 },
///                 { "from": 3.0, "to": 7.0 },
///                 { "from": 7.0, "to": 20.0 },
///                 { "from": 7.0, "to": 20.0 }
///                 { "from": 20.0 }
///             ]
///         }
@@ -103,72 +102,22 @@ pub struct SegmentRangeCollector {
    field_type: Type,
}

#[derive(Clone, PartialEq)]
pub(crate) struct SegmentRangeBucketEntry {
    pub key: Key,
    pub doc_count: u64,
    pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
    /// The from range of the bucket. Equals f64::MIN when None.
    pub from: Option<f64>,
    /// The to range of the bucket. Equals f64::MAX when None. Open interval, `to` is not
    /// inclusive.
    pub to: Option<f64>,
}

impl Debug for SegmentRangeBucketEntry {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("SegmentRangeBucketEntry")
            .field("key", &self.key)
            .field("doc_count", &self.doc_count)
            .field("from", &self.from)
            .field("to", &self.to)
            .finish()
    }
}
impl SegmentRangeBucketEntry {
    pub(crate) fn into_intermediate_bucket_entry(
        self,
        agg_with_accessor: &AggregationsWithAccessor,
    ) -> crate::Result<IntermediateRangeBucketEntry> {
        let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
            sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
        } else {
            Default::default()
        };

        Ok(IntermediateRangeBucketEntry {
            key: self.key,
            doc_count: self.doc_count,
            sub_aggregation,
            from: self.from,
            to: self.to,
        })
    }
}
impl SegmentRangeCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
pub fn into_intermediate_bucket_result(self) -> IntermediateBucketResult {
|
||||
let field_type = self.field_type;
|
||||
|
||||
let buckets = self
|
||||
.buckets
|
||||
.into_iter()
|
||||
.map(move |range_bucket| {
|
||||
Ok((
|
||||
(
|
||||
range_to_string(&range_bucket.range, &field_type),
|
||||
range_bucket
|
||||
.bucket
|
||||
.into_intermediate_bucket_entry(&agg_with_accessor.sub_aggregation)?,
|
||||
))
|
||||
range_bucket.bucket.into(),
|
||||
)
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
.collect();
|
||||
|
||||
Ok(IntermediateBucketResult::Range(
|
||||
IntermediateRangeBucketResult { buckets },
|
||||
))
|
||||
IntermediateBucketResult::Range(buckets)
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(
|
||||
@@ -226,15 +175,11 @@ impl SegmentRangeCollector {
|
||||
force_flush: bool,
|
||||
) {
|
||||
let mut iter = doc.chunks_exact(4);
|
||||
let accessor = bucket_with_accessor
|
||||
.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinatility");
|
||||
for docs in iter.by_ref() {
|
||||
let val1 = accessor.get(docs[0]);
|
||||
let val2 = accessor.get(docs[1]);
|
||||
let val3 = accessor.get(docs[2]);
|
||||
let val4 = accessor.get(docs[3]);
|
||||
let val1 = bucket_with_accessor.accessor.get(docs[0]);
|
||||
let val2 = bucket_with_accessor.accessor.get(docs[1]);
|
||||
let val3 = bucket_with_accessor.accessor.get(docs[2]);
|
||||
let val4 = bucket_with_accessor.accessor.get(docs[3]);
|
||||
let bucket_pos1 = self.get_bucket_pos(val1);
|
||||
let bucket_pos2 = self.get_bucket_pos(val2);
|
||||
let bucket_pos3 = self.get_bucket_pos(val3);
|
||||
@@ -246,7 +191,7 @@ impl SegmentRangeCollector {
|
||||
self.increment_bucket(bucket_pos4, docs[3], &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
for doc in iter.remainder() {
|
||||
let val = accessor.get(*doc);
|
||||
let val = bucket_with_accessor.accessor.get(*doc);
|
||||
let bucket_pos = self.get_bucket_pos(val);
|
||||
self.increment_bucket(bucket_pos, *doc, &bucket_with_accessor.sub_aggregation);
|
||||
}
|
||||
@@ -401,8 +346,7 @@ mod tests {
|
||||
ranges,
|
||||
};
|
||||
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type)
|
||||
.expect("unexpected error")
|
||||
SegmentRangeCollector::from_req_and_validate(&req, &Default::default(), field_type).unwrap()
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -543,7 +487,11 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn range_binary_search_test_f64() {
|
||||
let ranges = vec![(10.0..100.0).into()];
|
||||
let ranges = vec![
|
||||
//(f64::MIN..10.0).into(),
|
||||
(10.0..100.0).into(),
|
||||
//(100.0..f64::MAX).into(),
|
||||
];
|
||||
|
||||
let collector = get_collector_from_ranges(ranges, Type::F64);
|
||||
let search = |val: u64| collector.get_bucket_pos(val);
|
||||
|
||||
File diff suppressed because it is too large
@@ -86,18 +86,17 @@ impl Collector for AggregationCollector {
        &self,
        segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
    ) -> crate::Result<Self::Fruit> {
        let res = merge_fruits(segment_fruits)?;
        AggregationResults::from_intermediate_and_req(res, self.agg.clone())
        merge_fruits(segment_fruits)
            .map(|res| AggregationResults::from_intermediate_and_req(res, self.agg.clone()))
    }
}

fn merge_fruits(
    mut segment_fruits: Vec<crate::Result<IntermediateAggregationResults>>,
    mut segment_fruits: Vec<IntermediateAggregationResults>,
) -> crate::Result<IntermediateAggregationResults> {
    if let Some(fruit) = segment_fruits.pop() {
        let mut fruit = fruit?;
    if let Some(mut fruit) = segment_fruits.pop() {
        for next_fruit in segment_fruits {
            fruit.merge_fruits(next_fruit?);
            fruit.merge_fruits(next_fruit);
        }
        Ok(fruit)
    } else {
@@ -107,7 +106,7 @@ fn merge_fruits(

/// AggregationSegmentCollector does the aggregation collection on a segment.
pub struct AggregationSegmentCollector {
    aggs_with_accessor: AggregationsWithAccessor,
    aggs: AggregationsWithAccessor,
    result: SegmentAggregationResultsCollector,
}

@@ -122,24 +121,22 @@ impl AggregationSegmentCollector {
        let result =
            SegmentAggregationResultsCollector::from_req_and_validate(&aggs_with_accessor)?;
        Ok(AggregationSegmentCollector {
            aggs_with_accessor,
            aggs: aggs_with_accessor,
            result,
        })
    }
}

impl SegmentCollector for AggregationSegmentCollector {
    type Fruit = crate::Result<IntermediateAggregationResults>;
    type Fruit = IntermediateAggregationResults;

    #[inline]
    fn collect(&mut self, doc: crate::DocId, _score: crate::Score) {
        self.result.collect(doc, &self.aggs_with_accessor);
        self.result.collect(doc, &self.aggs);
    }

    fn harvest(mut self) -> Self::Fruit {
        self.result
            .flush_staged_docs(&self.aggs_with_accessor, true);
        self.result
            .into_intermediate_aggregations_result(&self.aggs_with_accessor)
        self.result.flush_staged_docs(&self.aggs, true);
        self.result.into()
    }
}
@@ -9,27 +9,30 @@ use itertools::Itertools;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::agg_req::{AggregationsInternal, BucketAggregationType, MetricAggregation};
|
||||
use super::agg_result::BucketResult;
|
||||
use super::bucket::{
|
||||
cut_off_buckets, get_agg_name_and_property, GetDocCount, Order, OrderTarget,
|
||||
SegmentHistogramBucketEntry, TermsAggregation,
|
||||
};
|
||||
use super::metric::{IntermediateAverage, IntermediateStats};
|
||||
use super::segment_agg_result::SegmentMetricResultCollector;
|
||||
use super::segment_agg_result::{
|
||||
SegmentAggregationResultsCollector, SegmentBucketResultCollector, SegmentHistogramBucketEntry,
|
||||
SegmentMetricResultCollector, SegmentRangeBucketEntry,
|
||||
};
|
||||
use super::{Key, SerializedKey, VecWithNames};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
|
||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||
/// intermediate results.
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateAggregationResults {
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
|
||||
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
|
||||
}
|
||||
|
||||
impl From<SegmentAggregationResultsCollector> for IntermediateAggregationResults {
|
||||
fn from(tree: SegmentAggregationResultsCollector) -> Self {
|
||||
let metrics = tree.metrics.map(VecWithNames::from_other);
|
||||
let buckets = tree.buckets.map(VecWithNames::from_other);
|
||||
|
||||
Self { metrics, buckets }
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateAggregationResults {
|
||||
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
|
||||
let metrics = if req.metrics.is_empty() {
|
||||
@@ -159,21 +162,29 @@ impl IntermediateMetricResult {
|
||||
pub enum IntermediateBucketResult {
|
||||
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
|
||||
/// sub_aggregations.
|
||||
Range(IntermediateRangeBucketResult),
|
||||
Range(FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>),
|
||||
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
Histogram {
|
||||
/// The buckets
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
},
|
||||
/// Term aggregation
|
||||
Terms(IntermediateTermBucketResult),
|
||||
}
|
||||
|
||||
impl From<SegmentBucketResultCollector> for IntermediateBucketResult {
|
||||
fn from(collector: SegmentBucketResultCollector) -> Self {
|
||||
match collector {
|
||||
SegmentBucketResultCollector::Range(range) => range.into_intermediate_bucket_result(),
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result()
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl IntermediateBucketResult {
|
||||
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
|
||||
match req {
|
||||
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
|
||||
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
|
||||
BucketAggregationType::Histogram(_) => {
|
||||
IntermediateBucketResult::Histogram { buckets: vec![] }
|
||||
@@ -183,34 +194,24 @@ impl IntermediateBucketResult {
|
||||
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
|
||||
match (self, other) {
|
||||
(
|
||||
IntermediateBucketResult::Terms(term_res_left),
|
||||
IntermediateBucketResult::Terms(term_res_right),
|
||||
IntermediateBucketResult::Range(entries_left),
|
||||
IntermediateBucketResult::Range(entries_right),
|
||||
) => {
|
||||
merge_maps(&mut term_res_left.entries, term_res_right.entries);
|
||||
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
|
||||
term_res_left.doc_count_error_upper_bound +=
|
||||
term_res_right.doc_count_error_upper_bound;
|
||||
}
|
||||
|
||||
(
|
||||
IntermediateBucketResult::Range(range_res_left),
|
||||
IntermediateBucketResult::Range(range_res_right),
|
||||
) => {
|
||||
merge_maps(&mut range_res_left.buckets, range_res_right.buckets);
|
||||
merge_maps(entries_left, entries_right);
|
||||
}
|
||||
(
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_left,
|
||||
buckets: entries_left,
|
||||
..
|
||||
},
|
||||
IntermediateBucketResult::Histogram {
|
||||
buckets: buckets_right,
|
||||
buckets: entries_right,
|
||||
..
|
||||
},
|
||||
) => {
|
||||
let buckets = buckets_left
|
||||
let mut buckets = entries_left
|
||||
.drain(..)
|
||||
.merge_join_by(buckets_right.into_iter(), |left, right| {
|
||||
.merge_join_by(entries_right.into_iter(), |left, right| {
|
||||
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
|
||||
})
|
||||
.map(|either| match either {
|
||||
@@ -223,7 +224,7 @@ impl IntermediateBucketResult {
|
||||
})
|
||||
.collect();
|
||||
|
||||
*buckets_left = buckets;
|
||||
std::mem::swap(entries_left, &mut buckets);
|
||||
}
|
||||
(IntermediateBucketResult::Range(_), _) => {
|
||||
panic!("try merge on different types")
|
||||
@@ -231,118 +232,10 @@ impl IntermediateBucketResult {
|
||||
(IntermediateBucketResult::Histogram { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
(IntermediateBucketResult::Terms { .. }, _) => {
|
||||
panic!("try merge on different types")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Range aggregation including error counts
|
||||
pub struct IntermediateRangeBucketResult {
|
||||
pub(crate) buckets: FnvHashMap<SerializedKey, IntermediateRangeBucketEntry>,
|
||||
}
|
||||
|
||||
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
/// Term aggregation including error counts
|
||||
pub struct IntermediateTermBucketResult {
|
||||
pub(crate) entries: FnvHashMap<String, IntermediateTermBucketEntry>,
|
||||
pub(crate) sum_other_doc_count: u64,
|
||||
pub(crate) doc_count_error_upper_bound: u64,
|
||||
}
|
||||
|
||||
impl IntermediateTermBucketResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &AggregationsInternal,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
.entries
|
||||
.into_iter()
|
||||
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
|
||||
.map(|(key, entry)| {
|
||||
Ok(BucketEntry {
|
||||
key: Key::Str(key),
|
||||
doc_count: entry.doc_count,
|
||||
sub_aggregation: AggregationResults::from_intermediate_and_req_internal(
|
||||
entry.sub_aggregation,
|
||||
sub_aggregation_req,
|
||||
)?,
|
||||
})
|
||||
})
|
||||
.collect::<crate::Result<_>>()?;
|
||||
|
||||
let order = req.order.order;
|
||||
match req.order.target {
|
||||
OrderTarget::Key => {
|
||||
buckets.sort_by(|left, right| {
|
||||
if req.order.order == Order::Desc {
|
||||
left.key.partial_cmp(&right.key)
|
||||
} else {
|
||||
right.key.partial_cmp(&left.key)
|
||||
}
|
||||
.expect("expected type string, which is always sortable")
|
||||
});
|
||||
}
|
||||
OrderTarget::Count => {
|
||||
if req.order.order == Order::Desc {
|
||||
buckets.sort_unstable_by_key(|bucket| std::cmp::Reverse(bucket.doc_count()));
|
||||
} else {
|
||||
buckets.sort_unstable_by_key(|bucket| bucket.doc_count());
|
||||
}
|
||||
}
|
||||
OrderTarget::SubAggregation(name) => {
|
||||
let (agg_name, agg_property) = get_agg_name_and_property(&name);
|
||||
let mut buckets_with_val = buckets
|
||||
.into_iter()
|
||||
.map(|bucket| {
|
||||
let val = bucket
|
||||
.sub_aggregation
|
||||
.get_value_from_aggregation(agg_name, agg_property)?
|
||||
.unwrap_or(f64::NAN);
|
||||
Ok((bucket, val))
|
||||
})
|
||||
.collect::<crate::Result<Vec<_>>>()?;
|
||||
|
||||
buckets_with_val.sort_by(|(_, val1), (_, val2)| {
|
||||
// TODO use total_cmp in next rust stable release
|
||||
match &order {
|
||||
Order::Desc => val2.partial_cmp(val1).unwrap_or(std::cmp::Ordering::Equal),
|
||||
Order::Asc => val1.partial_cmp(val2).unwrap_or(std::cmp::Ordering::Equal),
|
||||
}
|
||||
});
|
||||
buckets = buckets_with_val
|
||||
.into_iter()
|
||||
.map(|(bucket, _val)| bucket)
|
||||
.collect_vec();
|
||||
}
|
||||
}
|
||||
|
||||
// We ignore _term_doc_count_before_cutoff here, because it increases the upperbound error
|
||||
// only for terms that didn't make it into the top N.
|
||||
//
|
||||
// This can be interesting, as a value of quality of the results, but not good to check the
|
||||
// actual error count for the returned terms.
|
||||
let (_term_doc_count_before_cutoff, sum_other_doc_count) =
|
||||
cut_off_buckets(&mut buckets, req.size as usize);
|
||||
|
||||
let doc_count_error_upper_bound = if req.show_term_doc_count_error {
|
||||
Some(self.doc_count_error_upper_bound)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
Ok(BucketResult::Terms {
|
||||
buckets,
|
||||
sum_other_doc_count: self.sum_other_doc_count + sum_other_doc_count,
|
||||
doc_count_error_upper_bound,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
trait MergeFruits {
|
||||
fn merge_fruits(&mut self, other: Self);
|
||||
}
|
||||
@@ -384,6 +277,26 @@ impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
|
||||
}
|
||||
}
|
||||
|
||||
impl
|
||||
From<(
|
||||
SegmentHistogramBucketEntry,
|
||||
SegmentAggregationResultsCollector,
|
||||
)> for IntermediateHistogramBucketEntry
|
||||
{
|
||||
fn from(
|
||||
entry: (
|
||||
SegmentHistogramBucketEntry,
|
||||
SegmentAggregationResultsCollector,
|
||||
),
|
||||
) -> Self {
|
||||
IntermediateHistogramBucketEntry {
|
||||
key: entry.0.key,
|
||||
doc_count: entry.0.doc_count,
|
||||
sub_aggregation: entry.1.into(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// This is the range entry for a bucket, which contains a key, count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
@@ -392,6 +305,7 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub key: Key,
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
pub(crate) values: Option<Vec<u64>>,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
@@ -402,20 +316,22 @@ pub struct IntermediateRangeBucketEntry {
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
/// This is the term entry for a bucket, which contains a count, and optionally
|
||||
/// sub_aggregations.
|
||||
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateTermBucketEntry {
|
||||
/// The number of documents in the bucket.
|
||||
pub doc_count: u64,
|
||||
/// The sub_aggregation in this bucket.
|
||||
pub sub_aggregation: IntermediateAggregationResults,
|
||||
}
|
||||
impl From<SegmentRangeBucketEntry> for IntermediateRangeBucketEntry {
|
||||
fn from(entry: SegmentRangeBucketEntry) -> Self {
|
||||
let sub_aggregation = if let Some(sub_aggregation) = entry.sub_aggregation {
|
||||
sub_aggregation.into()
|
||||
} else {
|
||||
Default::default()
|
||||
};
|
||||
|
||||
impl MergeFruits for IntermediateTermBucketEntry {
|
||||
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
|
||||
self.doc_count += other.doc_count;
|
||||
self.sub_aggregation.merge_fruits(other.sub_aggregation);
|
||||
IntermediateRangeBucketEntry {
|
||||
key: entry.key,
|
||||
doc_count: entry.doc_count,
|
||||
values: None,
|
||||
sub_aggregation,
|
||||
to: entry.to,
|
||||
from: entry.from,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -450,6 +366,7 @@ mod tests {
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
sub_aggregation: Default::default(),
|
||||
from: None,
|
||||
to: None,
|
||||
@@ -458,7 +375,7 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level2".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
IntermediateBucketResult::Range(buckets),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
@@ -477,6 +394,7 @@ mod tests {
|
||||
IntermediateRangeBucketEntry {
|
||||
key: Key::Str(key.to_string()),
|
||||
doc_count: *doc_count,
|
||||
values: None,
|
||||
from: None,
|
||||
to: None,
|
||||
sub_aggregation: get_sub_test_tree(&[(
|
||||
@@ -488,7 +406,7 @@ mod tests {
|
||||
}
|
||||
map.insert(
|
||||
"my_agg_level1".to_string(),
|
||||
IntermediateBucketResult::Range(IntermediateRangeBucketResult { buckets }),
|
||||
IntermediateBucketResult::Range(buckets),
|
||||
);
|
||||
IntermediateAggregationResults {
|
||||
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
|
||||
|
||||
@@ -19,7 +19,7 @@ use crate::DocId;
|
||||
/// "avg": {
|
||||
/// "field": "score",
|
||||
/// }
|
||||
/// }
|
||||
/// }
|
||||
/// ```
|
||||
pub struct AverageAggregation {
|
||||
/// The field name to compute the stats on.
|
||||
|
||||
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
|
||||
use crate::aggregation::f64_from_fastfield_u64;
|
||||
use crate::fastfield::{DynamicFastFieldReader, FastFieldReader};
|
||||
use crate::schema::Type;
|
||||
use crate::{DocId, TantivyError};
|
||||
use crate::DocId;
|
||||
|
||||
/// A multi-value metric aggregation that computes stats of numeric values that are
|
||||
/// extracted from the aggregated documents.
|
||||
@@ -53,23 +53,6 @@ pub struct Stats {
|
||||
pub avg: Option<f64>,
|
||||
}
|
||||
|
||||
impl Stats {
|
||||
pub(crate) fn get_value(&self, agg_property: &str) -> crate::Result<Option<f64>> {
|
||||
match agg_property {
|
||||
"count" => Ok(Some(self.count as f64)),
|
||||
"sum" => Ok(Some(self.sum)),
|
||||
"standard_deviation" => Ok(self.standard_deviation),
|
||||
"min" => Ok(self.min),
|
||||
"max" => Ok(self.max),
|
||||
"avg" => Ok(self.avg),
|
||||
_ => Err(TantivyError::InvalidArgument(format!(
|
||||
"unknown property {} on stats metric aggregation",
|
||||
agg_property
|
||||
))),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// IntermediateStats contains the mergeable version for stats.
|
||||
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||
pub struct IntermediateStats {
|
||||
|
||||
@@ -20,8 +20,7 @@
//!
//! #### Limitations
//!
//! Currently aggregations work only on single value fast fields of type u64, f64, i64 and
//! fast fields on text fields.
//! Currently aggregations work only on single value fast fields of type u64, f64 and i64.
//!
//! # JSON Format
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
@@ -38,7 +37,6 @@
//! - [Bucket](bucket)
//!     - [Histogram](bucket::HistogramAggregation)
//!     - [Range](bucket::RangeAggregation)
//!     - [Terms](bucket::TermsAggregation)
//! - [Metric](metric)
//!     - [Average](metric::AverageAggregation)
//!     - [Stats](metric::StatsAggregation)
@@ -149,8 +147,7 @@
//! IntermediateAggregationResults provides the
//! [merge_fruits](intermediate_agg_result::IntermediateAggregationResults::merge_fruits) method to
//! merge multiple results. The merged result can then be converted into
//! [agg_result::AggregationResults] via the
//! [agg_result::AggregationResults::from_intermediate_and_req] method.
//! [agg_result::AggregationResults] via the [Into] trait.

pub mod agg_req;
mod agg_req_with_accessor;
@@ -248,14 +245,6 @@ impl<T: Clone> VecWithNames<T> {
|
||||
fn is_empty(&self) -> bool {
|
||||
self.keys.is_empty()
|
||||
}
|
||||
fn len(&self) -> usize {
|
||||
self.keys.len()
|
||||
}
|
||||
fn get(&self, name: &str) -> Option<&T> {
|
||||
self.keys()
|
||||
.position(|key| key == name)
|
||||
.map(|pos| &self.values[pos])
|
||||
}
|
||||
}
|
||||
|
||||
/// The serialized key is used in a HashMap.
|
||||
@@ -322,16 +311,13 @@ mod tests {
|
||||
use super::bucket::RangeAggregation;
|
||||
use super::collector::AggregationCollector;
|
||||
use super::metric::AverageAggregation;
|
||||
use crate::aggregation::agg_req::{
|
||||
get_term_dict_field_names, BucketAggregationType, MetricAggregation,
|
||||
};
|
||||
use crate::aggregation::agg_req::{BucketAggregationType, MetricAggregation};
|
||||
use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::bucket::TermsAggregation;
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use crate::aggregation::segment_agg_result::DOC_BLOCK_SIZE;
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
|
||||
use crate::schema::{Cardinality, IndexRecordOption, Schema, TextFieldIndexing};
|
||||
use crate::{Index, Term};
|
||||
|
||||
fn get_avg_req(field_name: &str) -> Aggregation {
|
||||
@@ -350,80 +336,17 @@ mod tests {
|
||||
)
|
||||
}
|
||||
|
||||
pub fn exec_request(agg_req: Aggregations, index: &Index) -> crate::Result<Value> {
|
||||
exec_request_with_query(agg_req, index, None)
|
||||
}
|
||||
pub fn exec_request_with_query(
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let agg_res = if let Some((field, term)) = query {
|
||||
let text_field = reader.searcher().schema().get_field(field).unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, term),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
searcher.search(&term_query, &collector)?
|
||||
} else {
|
||||
searcher.search(&AllQuery, &collector)?
|
||||
};
|
||||
|
||||
// Test serialization/deserialization roundtrip
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
Ok(res)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_values(
|
||||
merge_segments: bool,
|
||||
values: &[f64],
|
||||
) -> crate::Result<Index> {
|
||||
// Every value gets its own segment
|
||||
let mut segment_and_values = vec![];
|
||||
for value in values {
|
||||
segment_and_values.push(vec![(*value, value.to_string())]);
|
||||
}
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_terms(
|
||||
merge_segments: bool,
|
||||
values: &[Vec<&str>],
|
||||
) -> crate::Result<Index> {
|
||||
// Every value gets its own segment
|
||||
let segment_and_values = values
|
||||
.iter()
|
||||
.map(|terms| {
|
||||
terms
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(i, term)| (i as f64, term.to_string()))
|
||||
.collect()
|
||||
})
|
||||
.collect::<Vec<_>>();
|
||||
get_test_index_from_values_and_terms(merge_segments, &segment_and_values)
|
||||
}
|
||||
|
||||
pub fn get_test_index_from_values_and_terms(
|
||||
merge_segments: bool,
|
||||
segment_and_values: &[Vec<(f64, String)>],
|
||||
) -> crate::Result<Index> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_fieldtype = crate::schema::TextOptions::default()
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype.clone());
|
||||
let text_field_id = schema_builder.add_text_field("text_id", text_fieldtype);
|
||||
let string_field_id = schema_builder.add_text_field("string_id", STRING | FAST);
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -436,20 +359,15 @@ mod tests {
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
for values in segment_and_values {
|
||||
for (i, term) in values {
|
||||
let i = *i;
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_id => term.to_string(),
|
||||
string_field_id => term.to_string(),
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
}
|
||||
for &i in values {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
score_field => i as u64,
|
||||
score_field_f64 => i as f64,
|
||||
score_field_i64 => i as i64,
|
||||
fraction_field => i as f64/100.0,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
}
|
||||
@@ -470,13 +388,15 @@ mod tests {
|
||||
merge_segments: bool,
|
||||
use_distributed_collector: bool,
|
||||
) -> crate::Result<()> {
|
||||
let mut values_and_terms = (0..80)
|
||||
.map(|val| vec![(val as f64, "terma".to_string())])
|
||||
.collect::<Vec<_>>();
|
||||
values_and_terms.last_mut().unwrap()[0].1 = "termb".to_string();
|
||||
let index = get_test_index_from_values_and_terms(merge_segments, &values_and_terms)?;
|
||||
let index = get_test_index_with_num_docs(merge_segments, 80)?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let text_field = reader.searcher().schema().get_field("text").unwrap();
|
||||
|
||||
let term_query = TermQuery::new(
|
||||
Term::from_field_text(text_field, "cool"),
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
assert_eq!(DOC_BLOCK_SIZE, 64);
|
||||
// In the tree we cache Documents of DOC_BLOCK_SIZE, before passing them down as one block.
|
||||
@@ -521,19 +441,6 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"term_agg_test":{
|
||||
"terms": {
|
||||
"field": "string_id"
|
||||
},
|
||||
"aggs": {
|
||||
"bucketsL2": {
|
||||
"histogram": {
|
||||
"field": "score",
|
||||
"interval": 70.0
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -546,15 +453,14 @@ mod tests {
|
||||
|
||||
let searcher = reader.searcher();
|
||||
AggregationResults::from_intermediate_and_req(
|
||||
searcher.search(&AllQuery, &collector).unwrap(),
|
||||
searcher.search(&term_query, &collector).unwrap(),
|
||||
agg_req,
|
||||
)
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
searcher.search(&AllQuery, &collector).unwrap()
|
||||
searcher.search(&term_query, &collector).unwrap()
|
||||
};
|
||||
|
||||
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
|
||||
@@ -584,46 +490,6 @@ mod tests {
|
||||
);
|
||||
assert_eq!(res["bucketsL1"]["buckets"][2]["doc_count"], 80 - 70);
|
||||
|
||||
assert_eq!(
|
||||
res["term_agg_test"],
|
||||
json!(
|
||||
{
|
||||
"buckets": [
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 70,
|
||||
"key": 0.0
|
||||
},
|
||||
{
|
||||
"doc_count": 9,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 79,
|
||||
"key": "terma"
|
||||
},
|
||||
{
|
||||
"bucketsL2": {
|
||||
"buckets": [
|
||||
{
|
||||
"doc_count": 1,
|
||||
"key": 70.0
|
||||
}
|
||||
]
|
||||
},
|
||||
"doc_count": 1,
|
||||
"key": "termb"
|
||||
}
|
||||
],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
)
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -641,10 +507,8 @@ mod tests {
|
||||
.set_indexing_options(
|
||||
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
|
||||
)
|
||||
.set_fast()
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
schema_builder.add_text_field("dummy_text", STRING);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -849,21 +713,10 @@ mod tests {
|
||||
IndexRecordOption::Basic,
|
||||
);
|
||||
|
||||
let sub_agg_req: Aggregations = vec![
|
||||
("average_in_range".to_string(), get_avg_req("score")),
|
||||
(
|
||||
"term_agg".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
),
|
||||
]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let sub_agg_req: Aggregations =
|
||||
vec![("average_in_range".to_string(), get_avg_req("score"))]
|
||||
.into_iter()
|
||||
.collect();
|
||||
let agg_req: Aggregations = if use_elastic_json_req {
|
||||
let elasticsearch_compatible_json_req = r#"
|
||||
{
|
||||
@@ -879,8 +732,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"rangei64": {
|
||||
@@ -895,8 +747,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
},
|
||||
"average": {
|
||||
@@ -914,8 +765,7 @@ mod tests {
|
||||
]
|
||||
},
|
||||
"aggs": {
|
||||
"average_in_range": { "avg": { "field": "score" } },
|
||||
"term_agg": { "terms": { "field": "text" } }
|
||||
"average_in_range": { "avg": { "field": "score" } }
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -974,9 +824,6 @@ mod tests {
|
||||
agg_req
|
||||
};
|
||||
|
||||
let field_names = get_term_dict_field_names(&agg_req);
|
||||
assert_eq!(field_names, vec!["text".to_string()].into_iter().collect());
|
||||
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
@@ -985,7 +832,7 @@ mod tests {
|
||||
// Test de/serialization roundtrip on intermediate_agg_result
|
||||
let res: IntermediateAggregationResults =
|
||||
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
|
||||
AggregationResults::from_intermediate_and_req(res, agg_req.clone()).unwrap()
|
||||
AggregationResults::from_intermediate_and_req(res, agg_req.clone())
|
||||
} else {
|
||||
let collector = AggregationCollector::from_aggs(agg_req.clone());
|
||||
|
||||
@@ -1117,10 +964,10 @@ mod tests {
|
||||
searcher.search(&AllQuery, &collector).unwrap_err()
|
||||
};
|
||||
|
||||
let agg_res = avg_on_field("dummy_text");
|
||||
let agg_res = avg_on_field("text");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Only fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
r#"InvalidArgument("Only single value fast fields of type f64, u64, i64 are supported, but got Str ")"#
|
||||
);
|
||||
|
||||
let agg_res = avg_on_field("not_exist_field");
|
||||
@@ -1132,7 +979,7 @@ mod tests {
|
||||
let agg_res = avg_on_field("scores_i64");
|
||||
assert_eq!(
|
||||
format!("{:?}", agg_res),
|
||||
r#"InvalidArgument("Invalid field cardinality on field scores_i64 expected SingleValue, but got MultiValues")"#
|
||||
r#"InvalidArgument("Invalid field type in aggregation I64, only Cardinality::SingleValue supported")"#
|
||||
);
|
||||
|
||||
Ok(())
|
||||
@@ -1141,12 +988,11 @@ mod tests {
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use rand::prelude::SliceRandom;
|
||||
use rand::{thread_rng, Rng};
|
||||
use test::{self, Bencher};
|
||||
|
||||
use super::*;
|
||||
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds, TermsAggregation};
|
||||
use crate::aggregation::bucket::{HistogramAggregation, HistogramBounds};
|
||||
use crate::aggregation::metric::StatsAggregation;
|
||||
use crate::query::AllQuery;
|
||||
|
||||
@@ -1158,10 +1004,6 @@ mod tests {
|
||||
)
|
||||
.set_stored();
|
||||
let text_field = schema_builder.add_text_field("text", text_fieldtype);
|
||||
let text_field_many_terms =
|
||||
schema_builder.add_text_field("text_many_terms", STRING | FAST);
|
||||
let text_field_few_terms =
|
||||
schema_builder.add_text_field("text_few_terms", STRING | FAST);
|
||||
let score_fieldtype =
|
||||
crate::schema::NumericOptions::default().set_fast(Cardinality::SingleValue);
|
||||
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
|
||||
@@ -1169,10 +1011,6 @@ mod tests {
|
||||
schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
|
||||
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
|
||||
let index = Index::create_from_tempdir(schema_builder.build())?;
|
||||
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
|
||||
let many_terms_data = (0..15_000)
|
||||
.map(|num| format!("author{}", num))
|
||||
.collect::<Vec<_>>();
|
||||
{
|
||||
let mut rng = thread_rng();
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
@@ -1181,8 +1019,6 @@ mod tests {
|
||||
let val: f64 = rng.gen_range(0.0..1_000_000.0);
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "cool",
|
||||
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
|
||||
score_field => val as u64,
|
||||
score_field_f64 => val as f64,
|
||||
score_field_i64 => val as i64,
|
||||
@@ -1334,64 +1170,6 @@ mod tests {
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_few(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_few_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_terms_many(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
let reader = index.reader().unwrap();
|
||||
|
||||
b.iter(|| {
|
||||
let agg_req: Aggregations = vec![(
|
||||
"my_texts".to_string(),
|
||||
Aggregation::Bucket(BucketAggregation {
|
||||
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
|
||||
field: "text_many_terms".to_string(),
|
||||
..Default::default()
|
||||
}),
|
||||
sub_aggregation: Default::default(),
|
||||
}),
|
||||
)]
|
||||
.into_iter()
|
||||
.collect();
|
||||
|
||||
let collector = AggregationCollector::from_aggs(agg_req);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let agg_res: AggregationResults =
|
||||
searcher.search(&AllQuery, &collector).unwrap().into();
|
||||
|
||||
agg_res
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_aggregation_range_only(b: &mut Bencher) {
|
||||
let index = get_test_index_bench(false).unwrap();
|
||||
|
||||
@@ -9,12 +9,11 @@ use super::agg_req::MetricAggregation;
|
||||
use super::agg_req_with_accessor::{
|
||||
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
|
||||
};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
use super::intermediate_agg_result::{IntermediateAggregationResults, IntermediateBucketResult};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector};
|
||||
use super::metric::{
|
||||
AverageAggregation, SegmentAverageCollector, SegmentStatsCollector, StatsAggregation,
|
||||
};
|
||||
use super::VecWithNames;
|
||||
use super::{Key, VecWithNames};
|
||||
use crate::aggregation::agg_req::BucketAggregationType;
|
||||
use crate::DocId;
|
||||
|
||||
@@ -29,17 +28,6 @@ pub(crate) struct SegmentAggregationResultsCollector {
|
||||
num_staged_docs: usize,
|
||||
}
|
||||
|
||||
impl Default for SegmentAggregationResultsCollector {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
metrics: Default::default(),
|
||||
buckets: Default::default(),
|
||||
staged_docs: [0; DOC_BLOCK_SIZE],
|
||||
num_staged_docs: Default::default(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Debug for SegmentAggregationResultsCollector {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentAggregationResultsCollector")
|
||||
@@ -52,25 +40,6 @@ impl Debug for SegmentAggregationResultsCollector {
|
||||
}
|
||||
|
||||
impl SegmentAggregationResultsCollector {
|
||||
pub fn into_intermediate_aggregations_result(
|
||||
self,
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
) -> crate::Result<IntermediateAggregationResults> {
|
||||
let buckets = if let Some(buckets) = self.buckets {
|
||||
let entries = buckets
|
||||
.into_iter()
|
||||
.zip(agg_with_accessor.buckets.values())
|
||||
.map(|((key, bucket), acc)| Ok((key, bucket.into_intermediate_bucket_result(acc)?)))
|
||||
.collect::<crate::Result<Vec<(String, _)>>>()?;
|
||||
Some(VecWithNames::from_entries(entries))
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let metrics = self.metrics.map(VecWithNames::from_other);
|
||||
|
||||
Ok(IntermediateAggregationResults { metrics, buckets })
|
||||
}
|
||||
|
||||
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
|
||||
let buckets = req
|
||||
.buckets
|
||||
@@ -128,9 +97,6 @@ impl SegmentAggregationResultsCollector {
|
||||
agg_with_accessor: &AggregationsWithAccessor,
|
||||
force_flush: bool,
|
||||
) {
|
||||
if self.num_staged_docs == 0 {
|
||||
return;
|
||||
}
|
||||
if let Some(metrics) = &mut self.metrics {
|
||||
for (collector, agg_with_accessor) in
|
||||
metrics.values_mut().zip(agg_with_accessor.metrics.values())
|
||||
@@ -196,40 +162,12 @@ impl SegmentMetricResultCollector {
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) enum SegmentBucketResultCollector {
|
||||
Range(SegmentRangeCollector),
|
||||
Histogram(Box<SegmentHistogramCollector>),
|
||||
Terms(Box<SegmentTermCollector>),
|
||||
Histogram(SegmentHistogramCollector),
|
||||
}
|
||||
|
||||
impl SegmentBucketResultCollector {
|
||||
pub fn into_intermediate_bucket_result(
|
||||
self,
|
||||
agg_with_accessor: &BucketAggregationWithAccessor,
|
||||
) -> crate::Result<IntermediateBucketResult> {
|
||||
match self {
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Range(range) => {
|
||||
range.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.into_intermediate_bucket_result(agg_with_accessor)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn from_req_and_validate(req: &BucketAggregationWithAccessor) -> crate::Result<Self> {
|
||||
match &req.bucket_agg {
|
||||
BucketAggregationType::Terms(terms_req) => Ok(Self::Terms(Box::new(
|
||||
SegmentTermCollector::from_req_and_validate(
|
||||
terms_req,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_multi()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
)?,
|
||||
))),
|
||||
BucketAggregationType::Range(range_req) => {
|
||||
Ok(Self::Range(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
@@ -237,16 +175,14 @@ impl SegmentBucketResultCollector {
|
||||
req.field_type,
|
||||
)?))
|
||||
}
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(Box::new(
|
||||
BucketAggregationType::Histogram(histogram) => Ok(Self::Histogram(
|
||||
SegmentHistogramCollector::from_req_and_validate(
|
||||
histogram,
|
||||
&req.sub_aggregation,
|
||||
req.field_type,
|
||||
req.accessor
|
||||
.as_single()
|
||||
.expect("unexpected fast field cardinality"),
|
||||
&req.accessor,
|
||||
)?,
|
||||
))),
|
||||
)),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -264,9 +200,34 @@ impl SegmentBucketResultCollector {
|
||||
SegmentBucketResultCollector::Histogram(histogram) => {
|
||||
histogram.collect_block(doc, bucket_with_accessor, force_flush)
|
||||
}
|
||||
SegmentBucketResultCollector::Terms(terms) => {
|
||||
terms.collect_block(doc, bucket_with_accessor, force_flush)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, PartialEq)]
|
||||
pub(crate) struct SegmentHistogramBucketEntry {
|
||||
pub key: f64,
|
||||
pub doc_count: u64,
|
||||
}
|
||||
|
||||
#[derive(Clone, PartialEq)]
|
||||
pub(crate) struct SegmentRangeBucketEntry {
|
||||
pub key: Key,
|
||||
pub doc_count: u64,
|
||||
pub sub_aggregation: Option<SegmentAggregationResultsCollector>,
|
||||
/// The from range of the bucket. Equals f64::MIN when None.
|
||||
pub from: Option<f64>,
|
||||
/// The to range of the bucket. Equals f64::MAX when None.
|
||||
pub to: Option<f64>,
|
||||
}
|
||||
|
||||
impl Debug for SegmentRangeBucketEntry {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.debug_struct("SegmentRangeBucketEntry")
|
||||
.field("key", &self.key)
|
||||
.field("doc_count", &self.doc_count)
|
||||
.field("from", &self.from)
|
||||
.field("to", &self.to)
|
||||
.finish()
|
||||
}
|
||||
}
|
||||
|
||||
@@ -273,18 +273,18 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut writer = index.writer_with_num_threads(1, 4_000_000)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(
|
||||
doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1986, Month::March, 9)?.with_hms(0, 0, 0)?)),
|
||||
)?;
|
||||
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.add_document(doc!(date_field=>DateTime::new_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
|
||||
writer.commit()?;
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let all_query = AllQuery;
|
||||
let week_histogram_collector = HistogramCollector::new(
|
||||
date_field,
|
||||
DateTime::from_primitive(
|
||||
DateTime::new_primitive(
|
||||
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
|
||||
),
|
||||
3600 * 24 * 365, // it is just for a unit test... sorry leap years.
|
||||
|
||||
@@ -92,7 +92,7 @@ mod histogram_collector;
pub use histogram_collector::HistogramCollector;

mod multi_collector;
pub use self::multi_collector::{FruitHandle, MultiCollector, MultiFruit};
pub use self::multi_collector::MultiCollector;

mod top_collector;
@@ -5,7 +5,6 @@ use super::{Collector, SegmentCollector};
use crate::collector::Fruit;
use crate::{DocId, Score, SegmentOrdinal, SegmentReader, TantivyError};

/// MultiFruit keeps Fruits from every nested Collector
pub struct MultiFruit {
    sub_fruits: Vec<Option<Box<dyn Fruit>>>,
}
@@ -80,17 +79,12 @@ impl<TSegmentCollector: SegmentCollector> BoxableSegmentCollector
    }
}

/// FruitHandle stores reference to the corresponding collector inside MultiCollector
pub struct FruitHandle<TFruit: Fruit> {
    pos: usize,
    _phantom: PhantomData<TFruit>,
}

impl<TFruit: Fruit> FruitHandle<TFruit> {
    /// Extract a typed fruit off a multifruit.
    ///
    /// This function involves downcasting and can panic if the multifruit was
    /// created using faulty code.
    pub fn extract(self, fruits: &mut MultiFruit) -> TFruit {
        let boxed_fruit = fruits.sub_fruits[self.pos].take().expect("");
        *boxed_fruit
@@ -26,11 +26,11 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::from_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::from_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::from_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Name of the Wind", price => 30_200u64, date => DateTime::new_utc(OffsetDateTime::parse("1898-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Muadib", price => 29_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2020-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of Anne Frank", price => 18_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-20T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "A Dairy Cow", price => 21_240u64, date => DateTime::new_utc(OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.add_document(doc!(title => "The Diary of a Young Girl", price => 20_120u64, date => DateTime::new_utc(OffsetDateTime::parse("2018-04-09T00:00:00+00:00", &Rfc3339).unwrap())))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
@@ -55,7 +55,7 @@ pub fn test_filter_collector() -> crate::Result<()> {
|
||||
assert_eq!(filtered_top_docs.len(), 0);
|
||||
|
||||
fn date_filter(value: DateTime) -> bool {
|
||||
(value.into_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
(value.to_utc() - OffsetDateTime::parse("2019-04-09T00:00:00+00:00", &Rfc3339).unwrap())
|
||||
.whole_weeks()
|
||||
> 0
|
||||
}
|
||||
|
||||
@@ -898,7 +898,7 @@ mod tests {
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let pr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
let pr_birthday = DateTime::new_utc(OffsetDateTime::parse(
|
||||
"1898-04-09T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
@@ -906,7 +906,7 @@ mod tests {
|
||||
name => "Paul Robeson",
|
||||
birthday => pr_birthday,
|
||||
))?;
|
||||
let mr_birthday = DateTime::from_utc(OffsetDateTime::parse(
|
||||
let mr_birthday = DateTime::new_utc(OffsetDateTime::parse(
|
||||
"1947-11-08T00:00:00+00:00",
|
||||
&Rfc3339,
|
||||
)?);
|
||||
|
||||
@@ -1,7 +1,6 @@
|
||||
use crossbeam::channel;
|
||||
use rayon::{ThreadPool, ThreadPoolBuilder};
|
||||
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Search executor whether search request are single thread or multithread.
|
||||
///
|
||||
/// We don't expose Rayon thread pool directly here for several reasons.
|
||||
@@ -48,19 +47,16 @@ impl Executor {
|
||||
match self {
|
||||
Executor::SingleThread => args.map(f).collect::<crate::Result<_>>(),
|
||||
Executor::ThreadPool(pool) => {
|
||||
let args: Vec<A> = args.collect();
|
||||
let num_fruits = args.len();
|
||||
let args_with_indices: Vec<(usize, A)> = args.enumerate().collect();
|
||||
let num_fruits = args_with_indices.len();
|
||||
let fruit_receiver = {
|
||||
let (fruit_sender, fruit_receiver) = crossbeam_channel::unbounded();
|
||||
let (fruit_sender, fruit_receiver) = channel::unbounded();
|
||||
pool.scope(|scope| {
|
||||
for (idx, arg) in args.into_iter().enumerate() {
|
||||
// We name references for f and fruit_sender_ref because we do not
|
||||
// want these two to be moved into the closure.
|
||||
let f_ref = &f;
|
||||
let fruit_sender_ref = &fruit_sender;
|
||||
scope.spawn(move |_| {
|
||||
let fruit = f_ref(arg);
|
||||
if let Err(err) = fruit_sender_ref.send((idx, fruit)) {
|
||||
for arg_with_idx in args_with_indices {
|
||||
scope.spawn(|_| {
|
||||
let (idx, arg) = arg_with_idx;
|
||||
let fruit = f(arg);
|
||||
if let Err(err) = fruit_sender.send((idx, fruit)) {
|
||||
error!(
|
||||
"Failed to send search task. It probably means all search \
|
||||
threads have panicked. {:?}",
|
||||
@@ -75,19 +71,18 @@ impl Executor {
|
||||
// This is important as it makes it possible for the fruit_receiver iteration to
|
||||
// terminate.
|
||||
};
|
||||
let mut result_placeholders: Vec<Option<R>> =
|
||||
std::iter::repeat_with(|| None).take(num_fruits).collect();
|
||||
// This is lame, but safe.
|
||||
let mut results_with_position = Vec::with_capacity(num_fruits);
|
||||
for (pos, fruit_res) in fruit_receiver {
|
||||
let fruit = fruit_res?;
|
||||
result_placeholders[pos] = Some(fruit);
|
||||
results_with_position.push((pos, fruit));
|
||||
}
|
||||
let results: Vec<R> = result_placeholders.into_iter().flatten().collect();
|
||||
if results.len() != num_fruits {
|
||||
return Err(TantivyError::InternalError(
|
||||
"One of the mapped execution failed.".to_string(),
|
||||
));
|
||||
}
|
||||
Ok(results)
|
||||
results_with_position.sort_by_key(|(pos, _)| *pos);
|
||||
assert_eq!(results_with_position.len(), num_fruits);
|
||||
Ok(results_with_position
|
||||
.into_iter()
|
||||
.map(|(_, fruit)| fruit)
|
||||
.collect::<Vec<_>>())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
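Both sides of this hunk implement the same idea: fan the arguments out on a Rayon scope, ship each result back over a crossbeam channel together with its input index, and restore the input order at the end (either through an indexed placeholder vector or by sorting on the index). A minimal sketch of that pattern, with a hypothetical `ordered_parallel_map` helper (error handling omitted, pool size arbitrary):

```rust
use rayon::ThreadPoolBuilder;

// Hypothetical helper sketching the pattern above: run `f` over `args` on a
// thread pool, tag every result with its input index, and sort the results
// back into input order before returning.
fn ordered_parallel_map<A, R, F>(args: Vec<A>, f: F) -> Vec<R>
where
    A: Send,
    R: Send,
    F: Fn(A) -> R + Sync,
{
    let pool = ThreadPoolBuilder::new().num_threads(4).build().unwrap();
    let (tx, rx) = crossbeam::channel::unbounded();
    pool.scope(|scope| {
        for (idx, arg) in args.into_iter().enumerate() {
            let tx = tx.clone();
            let f_ref = &f; // borrow so the closure does not take ownership of `f`
            scope.spawn(move |_| {
                // Every task sends (input index, result); senders are dropped
                // when the tasks finish, which lets the receiver loop terminate.
                tx.send((idx, f_ref(arg))).expect("receiver is still alive");
            });
        }
    });
    drop(tx);
    let mut indexed: Vec<(usize, R)> = rx.into_iter().collect();
    indexed.sort_by_key(|(idx, _)| *idx);
    indexed.into_iter().map(|(_, r)| r).collect()
}

fn main() {
    let squares = ordered_parallel_map(vec![1u64, 2, 3, 4], |x| x * x);
    assert_eq!(squares, vec![1, 4, 9, 16]);
}
```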
@@ -74,7 +74,6 @@ fn load_metas(
|
||||
pub struct IndexBuilder {
|
||||
schema: Option<Schema>,
|
||||
index_settings: IndexSettings,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
}
|
||||
impl Default for IndexBuilder {
|
||||
fn default() -> Self {
|
||||
@@ -87,7 +86,6 @@ impl IndexBuilder {
|
||||
Self {
|
||||
schema: None,
|
||||
index_settings: IndexSettings::default(),
|
||||
tokenizer_manager: TokenizerManager::default(),
|
||||
}
|
||||
}
|
||||
|
||||
@@ -105,12 +103,6 @@ impl IndexBuilder {
|
||||
self
|
||||
}
|
||||
|
||||
/// Set the tokenizers.
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
|
||||
self.tokenizer_manager = tokenizers;
|
||||
self
|
||||
}
|
||||
|
||||
/// Creates a new index using the `RAMDirectory`.
|
||||
///
|
||||
/// The index will be allocated in anonymous memory.
|
||||
@@ -162,8 +154,7 @@ impl IndexBuilder {
|
||||
if !Index::exists(&*dir)? {
|
||||
return self.create(dir);
|
||||
}
|
||||
let mut index = Index::open(dir)?;
|
||||
index.set_tokenizers(self.tokenizer_manager.clone());
|
||||
let index = Index::open(dir)?;
|
||||
if index.schema() == self.get_expect_schema()? {
|
||||
Ok(index)
|
||||
} else {
|
||||
@@ -185,8 +176,7 @@ impl IndexBuilder {
|
||||
)?;
|
||||
let mut metas = IndexMeta::with_schema(self.get_expect_schema()?);
|
||||
metas.index_settings = self.index_settings;
|
||||
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
index.set_tokenizers(self.tokenizer_manager);
|
||||
let index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
|
||||
Ok(index)
|
||||
}
|
||||
}
|
||||
@@ -314,11 +304,6 @@ impl Index {
|
||||
}
|
||||
}
|
||||
|
||||
/// Setter for the tokenizer manager.
|
||||
pub fn set_tokenizers(&mut self, tokenizers: TokenizerManager) {
|
||||
self.tokenizers = tokenizers;
|
||||
}
|
||||
|
||||
/// Accessor for the tokenizer manager.
|
||||
pub fn tokenizers(&self) -> &TokenizerManager {
|
||||
&self.tokenizers
|
||||
@@ -329,31 +314,20 @@ impl Index {
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let field_type = field_entry.field_type();
|
||||
let tokenizer_manager: &TokenizerManager = self.tokenizers();
|
||||
let indexing_options_opt = match field_type {
|
||||
FieldType::JsonObject(options) => options.get_text_indexing_options(),
|
||||
FieldType::Str(options) => options.get_indexing_options(),
|
||||
_ => {
|
||||
return Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
)))
|
||||
}
|
||||
let tokenizer_name_opt: Option<TextAnalyzer> = match field_type {
|
||||
FieldType::Str(text_options) => text_options
|
||||
.get_indexing_options()
|
||||
.map(|text_indexing_options| text_indexing_options.tokenizer().to_string())
|
||||
.and_then(|tokenizer_name| tokenizer_manager.get(&tokenizer_name)),
|
||||
_ => None,
|
||||
};
|
||||
let indexing_options = indexing_options_opt.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No indexing options set for field {:?}",
|
||||
field_entry
|
||||
))
|
||||
})?;
|
||||
|
||||
tokenizer_manager
|
||||
.get(indexing_options.tokenizer())
|
||||
.ok_or_else(|| {
|
||||
TantivyError::InvalidArgument(format!(
|
||||
"No Tokenizer found for field {:?}",
|
||||
field_entry
|
||||
))
|
||||
})
|
||||
match tokenizer_name_opt {
|
||||
Some(tokenizer) => Ok(tokenizer),
|
||||
None => Err(TantivyError::SchemaError(format!(
|
||||
"{:?} is not a text field.",
|
||||
field_entry.name()
|
||||
))),
|
||||
}
|
||||
}
|
||||
|
||||
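Both variants of `tokenizer_for_field` above reduce to the same lookup: fetch the field entry, check that it is an indexed text field, read the configured tokenizer name, and resolve it through the `TokenizerManager`. A sketch of that lookup as a hypothetical free function (names are illustrative, not tantivy's API):

```rust
use tantivy::schema::{FieldType, Schema, TEXT};

// Illustrative helper: return the tokenizer name configured for a text field,
// or None if the field is missing, not a text field, or not indexed.
fn configured_tokenizer_name(schema: &Schema, field_name: &str) -> Option<String> {
    let field = schema.get_field(field_name)?;
    match schema.get_field_entry(field).field_type() {
        FieldType::Str(options) => options
            .get_indexing_options()
            .map(|indexing| indexing.tokenizer().to_string()),
        _ => None,
    }
}

fn main() {
    let mut builder = Schema::builder();
    builder.add_text_field("body", TEXT);
    let schema = builder.build();
    // TEXT configures the "default" tokenizer.
    assert_eq!(configured_tokenizer_name(&schema, "body").as_deref(), Some("default"));
}
```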
/// Create a default `IndexReader` for the given index.
|
||||
@@ -583,8 +557,7 @@ impl fmt::Debug for Index {
|
||||
mod tests {
|
||||
use crate::directory::{RamDirectory, WatchCallback};
|
||||
use crate::schema::{Field, Schema, INDEXED, TEXT};
|
||||
use crate::tokenizer::TokenizerManager;
|
||||
use crate::{Directory, Index, IndexBuilder, IndexReader, IndexSettings, ReloadPolicy};
|
||||
use crate::{Directory, Index, IndexReader, IndexSettings, ReloadPolicy};
|
||||
|
||||
#[test]
|
||||
fn test_indexer_for_field() {
|
||||
@@ -600,21 +573,6 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_set_tokenizer_manager() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_u64_field("num_likes", INDEXED);
|
||||
schema_builder.add_text_field("body", TEXT);
|
||||
let schema = schema_builder.build();
|
||||
let index = IndexBuilder::new()
|
||||
// set empty tokenizer manager
|
||||
.tokenizers(TokenizerManager::new())
|
||||
.schema(schema)
|
||||
.create_in_ram()
|
||||
.unwrap();
|
||||
assert!(index.tokenizers().get("raw").is_none());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_index_exists() {
|
||||
let directory: Box<dyn Directory> = Box::new(RamDirectory::create());
|
||||
@@ -744,7 +702,7 @@ mod tests {
|
||||
.try_into()?;
|
||||
assert_eq!(reader.searcher().num_docs(), 0);
|
||||
writer.add_document(doc!(field=>1u64))?;
|
||||
let (sender, receiver) = crossbeam_channel::unbounded();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _handle = index.directory_mut().watch(WatchCallback::new(move || {
|
||||
let _ = sender.send(());
|
||||
}));
|
||||
@@ -779,7 +737,7 @@ mod tests {
|
||||
reader: &IndexReader,
|
||||
) -> crate::Result<()> {
|
||||
let mut reader_index = reader.index();
|
||||
let (sender, receiver) = crossbeam_channel::unbounded();
|
||||
let (sender, receiver) = crossbeam::channel::unbounded();
|
||||
let _watch_handle = reader_index
|
||||
.directory_mut()
|
||||
.watch(WatchCallback::new(move || {
|
||||
|
||||
@@ -239,7 +239,7 @@ impl InnerSegmentMeta {
///
/// Contains settings which are applied on the whole
/// index, like presorting documents.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
#[derive(Clone, Debug, Default, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
@@ -248,26 +248,7 @@ pub struct IndexSettings {
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
#[serde(default = "default_docstore_blocksize")]
/// The size of each block that will be compressed and written to disk
pub docstore_blocksize: usize,
}

/// Must be a function to be compatible with serde defaults
fn default_docstore_blocksize() -> usize {
16_384
}

impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
}
}
}

/// Settings to presort the documents in an index
///
/// Presorting documents can greatly improve performance
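The `default_docstore_blocksize` function exists only because `#[serde(default = "...")]` takes a path to a function rather than a literal; the manual `Default` impl then reuses the same function so the two stay in sync. A stripped-down sketch of the same pattern on a hypothetical standalone struct (assumes the `serde` derive feature and `serde_json`):

```rust
use serde::{Deserialize, Serialize};

fn default_blocksize() -> usize {
    16_384
}

#[derive(Serialize, Deserialize, Debug, PartialEq)]
struct DocStoreSettings {
    // Older metadata written without this field still deserializes.
    #[serde(default = "default_blocksize")]
    blocksize: usize,
}

impl Default for DocStoreSettings {
    fn default() -> Self {
        Self { blocksize: default_blocksize() }
    }
}

fn main() {
    let settings: DocStoreSettings = serde_json::from_str("{}").unwrap();
    assert_eq!(settings, DocStoreSettings::default());
}
```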
@@ -420,7 +401,7 @@ mod tests {
|
||||
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
|
||||
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4"},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false}}],"opstamp":0}"#
|
||||
);
|
||||
|
||||
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
|
||||
|
||||
@@ -35,7 +35,7 @@ const ZERO_ARRAY: [u8; 8] = [0u8; 8];
|
||||
#[cfg(test)]
|
||||
fn create_uuid() -> Uuid {
|
||||
let new_auto_inc_id = (*AUTO_INC_COUNTER).fetch_add(1, atomic::Ordering::SeqCst);
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY)
|
||||
Uuid::from_fields(new_auto_inc_id as u32, 0, 0, &ZERO_ARRAY).unwrap()
|
||||
}
|
||||
|
||||
#[cfg(not(test))]
|
||||
@@ -57,7 +57,7 @@ impl SegmentId {
|
||||
/// Picking the first 8 chars is ok to identify
|
||||
/// segments in a display message (e.g. a5c4dfcb).
|
||||
pub fn short_uuid_string(&self) -> String {
|
||||
(&self.0.as_simple().to_string()[..8]).to_string()
|
||||
(&self.0.to_simple_ref().to_string()[..8]).to_string()
|
||||
}
|
||||
|
||||
/// Returns a segment uuid string.
|
||||
@@ -65,7 +65,7 @@ impl SegmentId {
|
||||
/// It consists of 32 lowercase hexadecimal chars
/// (e.g. a5c4dfcbdfe645089129e308e26d5523)
|
||||
pub fn uuid_string(&self) -> String {
|
||||
self.0.as_simple().to_string()
|
||||
self.0.to_simple_ref().to_string()
|
||||
}
|
||||
|
||||
/// Build a `SegmentId` string from the full uuid string.
|
||||
|
||||
@@ -169,7 +169,7 @@ impl SegmentReader {
|
||||
|
||||
let fast_fields_data = segment.open_read(SegmentComponent::FastFields)?;
|
||||
let fast_fields_composite = CompositeFile::open(&fast_fields_data)?;
|
||||
let fast_fields_readers =
|
||||
let fast_field_readers =
|
||||
Arc::new(FastFieldReaders::new(schema.clone(), fast_fields_composite));
|
||||
let fieldnorm_data = segment.open_read(SegmentComponent::FieldNorms)?;
|
||||
let fieldnorm_readers = FieldNormReaders::open(fieldnorm_data)?;
|
||||
@@ -196,7 +196,7 @@ impl SegmentReader {
|
||||
max_doc,
|
||||
termdict_composite,
|
||||
postings_composite,
|
||||
fast_fields_readers,
|
||||
fast_fields_readers: fast_field_readers,
|
||||
fieldnorm_readers,
|
||||
segment_id: segment.id(),
|
||||
delete_opstamp: segment.meta().delete_opstamp(),
|
||||
|
||||
@@ -110,7 +110,7 @@ mod tests {
|
||||
let tmp_file = tmp_dir.path().join("watched.txt");
|
||||
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(100);
|
||||
|
||||
let watcher = FileWatcher::new(&tmp_file);
|
||||
@@ -153,7 +153,7 @@ mod tests {
|
||||
let tmp_file = tmp_dir.path().join("watched.txt");
|
||||
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(100);
|
||||
|
||||
let watcher = FileWatcher::new(&tmp_file);
|
||||
|
||||
@@ -181,7 +181,7 @@ fn test_directory_delete(directory: &dyn Directory) -> crate::Result<()> {
|
||||
|
||||
fn test_watch(directory: &dyn Directory) {
|
||||
let counter: Arc<AtomicUsize> = Default::default();
|
||||
let (tx, rx) = crossbeam_channel::unbounded();
|
||||
let (tx, rx) = crossbeam::channel::unbounded();
|
||||
let timeout = Duration::from_millis(500);
|
||||
|
||||
let handle = directory
|
||||
|
||||
@@ -97,10 +97,6 @@ pub enum TantivyError {
|
||||
/// Index incompatible with current version of Tantivy.
|
||||
#[error("{0:?}")]
|
||||
IncompatibleIndex(Incompatibility),
|
||||
/// An internal error occurred. These are internal states that should not be reached,
/// e.g. a datastructure that is incorrectly initialized.
#[error("Internal error: '{0}'")]
|
||||
InternalError(String),
|
||||
}
|
||||
|
||||
#[cfg(feature = "quickwit")]
|
||||
|
||||
@@ -188,14 +188,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_iter_deser_on_fly(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_access(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_access(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&[0, 1, 1000, 10000], 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
@@ -206,14 +206,14 @@ mod bench {
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_iter_deser_on_fly_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| alive_bitset.iter_alive().collect::<Vec<_>>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_alive_bitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
fn bench_deletebitset_access_1_8_alive(bench: &mut Bencher) {
|
||||
let alive_bitset = AliveBitSet::for_test_from_deleted_docs(&get_alive(), 1_000_000);
|
||||
|
||||
bench.iter(|| {
|
||||
|
||||
@@ -167,7 +167,7 @@ impl FastValue for DateTime {
|
||||
}
|
||||
|
||||
fn to_u64(&self) -> u64 {
|
||||
self.into_unix_timestamp().to_u64()
|
||||
self.to_unix_timestamp().to_u64()
|
||||
}
|
||||
|
||||
fn fast_field_cardinality(field_type: &FieldType) -> Option<Cardinality> {
|
||||
@@ -178,7 +178,7 @@ impl FastValue for DateTime {
|
||||
}
|
||||
|
||||
fn as_u64(&self) -> u64 {
|
||||
self.into_unix_timestamp().as_u64()
|
||||
self.to_unix_timestamp().as_u64()
|
||||
}
|
||||
|
||||
fn to_type() -> Type {
|
||||
@@ -196,31 +196,10 @@ fn value_to_u64(value: &Value) -> u64 {
|
||||
}
|
||||
}
|
||||
|
||||
/// The fast field type
|
||||
pub enum FastFieldType {
|
||||
/// Numeric type, e.g. f64.
|
||||
Numeric,
|
||||
/// Fast field stores string ids.
|
||||
String,
|
||||
/// Fast field stores string ids for facets.
|
||||
Facet,
|
||||
}
|
||||
|
||||
impl FastFieldType {
|
||||
fn is_storing_term_ids(&self) -> bool {
|
||||
matches!(self, FastFieldType::String | FastFieldType::Facet)
|
||||
}
|
||||
|
||||
fn is_facet(&self) -> bool {
|
||||
matches!(self, FastFieldType::Facet)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::ops::Range;
|
||||
use std::path::Path;
|
||||
|
||||
use common::HasLen;
|
||||
@@ -232,7 +211,7 @@ mod tests {
|
||||
use super::*;
|
||||
use crate::directory::{CompositeFile, Directory, RamDirectory, WritePtr};
|
||||
use crate::merge_policy::NoMergePolicy;
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST, STRING, TEXT};
|
||||
use crate::schema::{Document, Field, NumericOptions, Schema, FAST};
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::{Index, SegmentId, SegmentReader};
|
||||
|
||||
@@ -254,7 +233,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
pub fn test_fastfield_i64_u64() {
|
||||
let datetime = DateTime::from_utc(OffsetDateTime::UNIX_EPOCH);
|
||||
let datetime = DateTime::new_utc(OffsetDateTime::UNIX_EPOCH);
|
||||
assert_eq!(i64::from_u64(datetime.to_u64()), 0i64);
|
||||
}
|
||||
|
||||
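The `test_fastfield_i64_u64` assertion above relies on an order-preserving mapping between `i64` and `u64`, which fast fields use for signed values and, via the unix timestamp, for dates. A sketch of such a mapping, assuming the usual sign-bit flip (the helper names are illustrative, not tantivy's API):

```rust
// Map i64 to u64 so that the natural u64 ordering matches the i64 ordering.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    // Round trip and order preservation, mirroring the assertion in the test.
    assert_eq!(u64_to_i64(i64_to_u64(0)), 0);
    assert!(i64_to_u64(-1) < i64_to_u64(0));
    assert!(i64_to_u64(i64::MIN) < i64_to_u64(i64::MAX));
}
```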
@@ -413,8 +392,7 @@ mod tests {
|
||||
serializer.close().unwrap();
|
||||
}
|
||||
let file = directory.open_read(path).unwrap();
|
||||
// assert_eq!(file.len(), 17710 as usize); //bitpacked size
|
||||
assert_eq!(file.len(), 10175_usize); // linear interpol size
|
||||
assert_eq!(file.len(), 12471_usize); // Piecewise linear codec size
|
||||
{
|
||||
let fast_fields_composite = CompositeFile::open(&file)?;
|
||||
let data = fast_fields_composite.open_read(i64_field).unwrap();
|
||||
@@ -511,7 +489,7 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests().unwrap();
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer
|
||||
.add_document(doc!(date_field =>DateTime::from_utc(OffsetDateTime::now_utc())))?;
|
||||
.add_document(doc!(date_field =>DateTime::new_utc(OffsetDateTime::now_utc())))?;
|
||||
index_writer.commit()?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
@@ -531,206 +509,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_default_datetime() {
|
||||
assert_eq!(0, DateTime::make_zero().into_unix_timestamp());
|
||||
}
|
||||
|
||||
fn get_vals_for_docs(ff: &MultiValuedFastFieldReader<u64>, docs: Range<u32>) -> Vec<u64> {
|
||||
let mut all = vec![];
|
||||
|
||||
for doc in docs {
|
||||
let mut out = vec![];
|
||||
ff.get_vals(doc, &mut out);
|
||||
all.extend(out);
|
||||
}
|
||||
all
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_text_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", TEXT | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB AAAAA", // term_ord 1,2
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA BBBBB", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..5),
|
||||
vec![1, 0, 0, 0, 1, 2]
|
||||
);
|
||||
|
||||
let mut out = vec![];
|
||||
text_fast_field.get_vals(3, &mut out);
|
||||
assert_eq!(out, vec![0, 1]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
// default tokenizer applies lower case
|
||||
assert_eq!(bytes, "aaaaa".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC AAAAA", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..3), vec![0, 1, 0]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..8),
|
||||
vec![1, 0, 0, 0, 1, 3 /* next segment */, 0, 2, 0]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_string_fastfield() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let text_field = schema_builder.add_text_field("text", STRING | FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
|
||||
{
|
||||
// first segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "BBBBB", // term_ord 1
|
||||
))?;
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "zumberthree", // term_ord 2, after merge term_ord 3
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 1);
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..6), vec![1, 0, 0, 2]);
|
||||
|
||||
let inverted_index = segment_reader.inverted_index(text_field)?;
|
||||
assert_eq!(inverted_index.terms().num_terms(), 3);
|
||||
let mut bytes = vec![];
|
||||
assert!(inverted_index.terms().ord_to_term(0, &mut bytes)?);
|
||||
assert_eq!(bytes, "AAAAA".as_bytes());
|
||||
}
|
||||
|
||||
{
|
||||
// second segment
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "AAAAA", // term_ord 0
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "CCCCC", // term_ord 1, after merge 2
|
||||
))?;
|
||||
|
||||
index_writer.add_document(doc!())?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
assert_eq!(searcher.segment_readers().len(), 2);
|
||||
let segment_reader = searcher.segment_reader(1);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(get_vals_for_docs(&text_fast_field, 0..2), vec![0, 1]);
|
||||
}
|
||||
// Merging the segments
|
||||
{
|
||||
let segment_ids = index.searchable_segment_ids()?;
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
index_writer.merge(&segment_ids).wait()?;
|
||||
index_writer.wait_merging_threads()?;
|
||||
}
|
||||
|
||||
let reader = index.reader()?;
|
||||
let searcher = reader.searcher();
|
||||
let segment_reader = searcher.segment_reader(0);
|
||||
let fast_fields = segment_reader.fast_fields();
|
||||
let text_fast_field = fast_fields.u64s(text_field).unwrap();
|
||||
|
||||
assert_eq!(
|
||||
get_vals_for_docs(&text_fast_field, 0..9),
|
||||
vec![1, 0, 0, 3 /* next segment */, 0, 2]
|
||||
);
|
||||
|
||||
Ok(())
|
||||
assert_eq!(0, DateTime::make_zero().to_unix_timestamp());
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -768,23 +547,23 @@ mod tests {
|
||||
let dates_fast_field = fast_fields.dates(multi_date_field).unwrap();
|
||||
let mut dates = vec![];
|
||||
{
|
||||
assert_eq!(date_fast_field.get(0u32).into_unix_timestamp(), 1i64);
|
||||
assert_eq!(date_fast_field.get(0u32).to_unix_timestamp(), 1i64);
|
||||
dates_fast_field.get_vals(0u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_unix_timestamp(), 2i64);
|
||||
assert_eq!(dates[1].into_unix_timestamp(), 3i64);
|
||||
assert_eq!(dates[0].to_unix_timestamp(), 2i64);
|
||||
assert_eq!(dates[1].to_unix_timestamp(), 3i64);
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(1u32).into_unix_timestamp(), 4i64);
|
||||
assert_eq!(date_fast_field.get(1u32).to_unix_timestamp(), 4i64);
|
||||
dates_fast_field.get_vals(1u32, &mut dates);
|
||||
assert!(dates.is_empty());
|
||||
}
|
||||
{
|
||||
assert_eq!(date_fast_field.get(2u32).into_unix_timestamp(), 0i64);
|
||||
assert_eq!(date_fast_field.get(2u32).to_unix_timestamp(), 0i64);
|
||||
dates_fast_field.get_vals(2u32, &mut dates);
|
||||
assert_eq!(dates.len(), 2);
|
||||
assert_eq!(dates[0].into_unix_timestamp(), 5i64);
|
||||
assert_eq!(dates[1].into_unix_timestamp(), 6i64);
|
||||
assert_eq!(dates[0].to_unix_timestamp(), 5i64);
|
||||
assert_eq!(dates[1].to_unix_timestamp(), 6i64);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -71,24 +71,24 @@ mod tests {
|
||||
let mut index_writer = index.writer_for_tests()?;
|
||||
let first_time_stamp = OffsetDateTime::now_utc();
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
date_field => DateTime::from_utc(first_time_stamp),
|
||||
date_field => DateTime::new_utc(first_time_stamp),
|
||||
date_field => DateTime::new_utc(first_time_stamp),
|
||||
time_i=>1i64))?;
|
||||
index_writer.add_document(doc!(time_i => 0i64))?;
|
||||
// add one second
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(1)),
|
||||
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(1)),
|
||||
time_i => 2i64))?;
|
||||
// add another second
|
||||
let two_secs_ahead = first_time_stamp + Duration::seconds(2);
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::from_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
date_field => DateTime::new_utc(two_secs_ahead),
|
||||
time_i => 3i64))?;
|
||||
// add three seconds
|
||||
index_writer.add_document(doc!(
|
||||
date_field => DateTime::from_utc(first_time_stamp + Duration::seconds(3)),
|
||||
date_field => DateTime::new_utc(first_time_stamp + Duration::seconds(3)),
|
||||
time_i => 4i64))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
@@ -113,7 +113,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(first_time_stamp),
|
||||
DateTime::new_utc(first_time_stamp),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -140,7 +140,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.unwrap(),
|
||||
DateTime::from_utc(two_secs_ahead)
|
||||
DateTime::new_utc(two_secs_ahead)
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
@@ -181,7 +181,7 @@ mod tests {
|
||||
.expect("cannot find value")
|
||||
.as_date()
|
||||
.expect("value not of Date type"),
|
||||
DateTime::from_utc(first_time_stamp + Duration::seconds(offset_sec)),
|
||||
DateTime::new_utc(first_time_stamp + Duration::seconds(offset_sec)),
|
||||
);
|
||||
assert_eq!(
|
||||
retrieved_doc
|
||||
|
||||
@@ -27,28 +27,22 @@ impl<Item: FastValue> MultiValuedFastFieldReader<Item> {
}
}

/// Returns `[start, end)`, such that the values associated
/// to the given document are `start..end`.
/// Returns `(start, stop)`, such that the values associated
/// to the given document are `start..stop`.
#[inline]
fn range(&self, doc: DocId) -> Range<u64> {
let start = self.idx_reader.get(doc);
let end = self.idx_reader.get(doc + 1);
start..end
}

/// Returns the array of values associated to the given `doc`.
#[inline]
fn get_vals_for_range(&self, range: Range<u64>, vals: &mut Vec<Item>) {
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
let stop = self.idx_reader.get(doc + 1);
start..stop
}

/// Returns the array of values associated to the given `doc`.
#[inline]
pub fn get_vals(&self, doc: DocId, vals: &mut Vec<Item>) {
let range = self.range(doc);
self.get_vals_for_range(range, vals);
let len = (range.end - range.start) as usize;
vals.resize(len, Item::make_zero());
self.vals_reader.get_range(range.start, &mut vals[..]);
}

/// Returns the minimum value for this fast field.

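The reader above stores all values of a field in one flat array plus an index array of per-document start offsets, so `idx_reader.get(doc)..idx_reader.get(doc + 1)` delimits the values belonging to `doc`. A plain-`Vec` sketch of that layout (hypothetical struct, not tantivy's types):

```rust
// Flat layout used by multi-valued fast fields: `idx` holds one start offset
// per document plus a trailing end offset, `vals` holds all values in doc order.
struct MultiValued {
    idx: Vec<u64>,
    vals: Vec<u64>,
}

impl MultiValued {
    fn get_vals(&self, doc: u32, out: &mut Vec<u64>) {
        let start = self.idx[doc as usize] as usize;
        let end = self.idx[doc as usize + 1] as usize;
        out.clear();
        out.extend_from_slice(&self.vals[start..end]);
    }
}

fn main() {
    // doc 0 -> [10, 11], doc 1 -> [], doc 2 -> [12]
    let ff = MultiValued { idx: vec![0, 2, 2, 3], vals: vec![10, 11, 12] };
    let mut out = Vec::new();
    ff.get_vals(1, &mut out);
    assert!(out.is_empty());
    ff.get_vals(0, &mut out);
    assert_eq!(out, vec![10, 11]);
}
```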
@@ -4,7 +4,7 @@ use fnv::FnvHashMap;
|
||||
use tantivy_bitpacker::minmax;
|
||||
|
||||
use crate::fastfield::serializer::BitpackedFastFieldSerializerLegacy;
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer, FastFieldType};
|
||||
use crate::fastfield::{value_to_u64, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
use crate::schema::{Document, Field};
|
||||
@@ -38,17 +38,17 @@ pub struct MultiValuedFastFieldWriter {
|
||||
field: Field,
|
||||
vals: Vec<UnorderedTermId>,
|
||||
doc_index: Vec<u64>,
|
||||
fast_field_type: FastFieldType,
|
||||
is_facet: bool,
|
||||
}
|
||||
|
||||
impl MultiValuedFastFieldWriter {
|
||||
/// Creates a new `MultiValuedFastFieldWriter`
|
||||
pub(crate) fn new(field: Field, fast_field_type: FastFieldType) -> Self {
|
||||
/// Creates a new `IntFastFieldWriter`
|
||||
pub(crate) fn new(field: Field, is_facet: bool) -> Self {
|
||||
MultiValuedFastFieldWriter {
|
||||
field,
|
||||
vals: Vec::new(),
|
||||
doc_index: Vec::new(),
|
||||
fast_field_type,
|
||||
is_facet,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -77,13 +77,12 @@ impl MultiValuedFastFieldWriter {
|
||||
/// all of the matching field values present in the document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
self.next_doc();
|
||||
// facets/texts are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if self.fast_field_type.is_storing_term_ids() {
|
||||
return;
|
||||
}
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field == self.field {
|
||||
self.add_val(value_to_u64(field_value.value()));
|
||||
// facets are indexed in the `SegmentWriter` as we encode their unordered id.
|
||||
if !self.is_facet {
|
||||
for field_value in doc.field_values() {
|
||||
if field_value.field == self.field {
|
||||
self.add_val(value_to_u64(field_value.value()));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -159,15 +158,15 @@ impl MultiValuedFastFieldWriter {
|
||||
{
|
||||
// writing the values themselves.
|
||||
let mut value_serializer: BitpackedFastFieldSerializerLegacy<'_, _>;
|
||||
if let Some(mapping) = mapping_opt {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
match mapping_opt {
|
||||
Some(mapping) => {
|
||||
value_serializer = serializer.new_u64_fast_field_with_idx(
|
||||
self.field,
|
||||
0u64,
|
||||
mapping.len() as u64,
|
||||
1,
|
||||
)?;
|
||||
|
||||
if self.fast_field_type.is_facet() {
|
||||
let mut doc_vals: Vec<u64> = Vec::with_capacity(100);
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
doc_vals.clear();
|
||||
@@ -180,27 +179,19 @@ impl MultiValuedFastFieldWriter {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
}
|
||||
None => {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
let remapped_vals = vals
|
||||
.iter()
|
||||
.map(|val| *mapping.get(val).expect("Missing term ordinal"));
|
||||
for val in remapped_vals {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
let val_min_max = minmax(self.vals.iter().cloned());
|
||||
let (val_min, val_max) = val_min_max.unwrap_or((0u64, 0u64));
|
||||
value_serializer =
|
||||
serializer.new_u64_fast_field_with_idx(self.field, val_min, val_max, 1)?;
|
||||
for vals in self.get_ordered_values(doc_id_map) {
|
||||
// sort values in case of remapped doc_ids?
|
||||
for &val in vals {
|
||||
value_serializer.add_val(val)?;
|
||||
}
|
||||
}
|
||||
}
|
||||
value_serializer.close_field()?;
|
||||
}
|
||||
|
||||
@@ -6,12 +6,17 @@ use common::BinarySerializable;
|
||||
use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldReader as BitpackedReader, BitpackedFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::linearinterpol::{
|
||||
LinearInterpolFastFieldReader, LinearInterpolFastFieldSerializer,
|
||||
};
|
||||
#[allow(deprecated)]
|
||||
use fastfield_codecs::multilinearinterpol::{
|
||||
MultiLinearInterpolFastFieldReader, MultiLinearInterpolFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::piecewise_linear::{
|
||||
PiecewiseLinearFastFieldReader, PiecewiseLinearFastFieldSerializer,
|
||||
};
|
||||
use fastfield_codecs::{FastFieldCodecReader, FastFieldCodecSerializer};
|
||||
|
||||
use super::FastValue;
|
||||
@@ -71,6 +76,8 @@ pub enum DynamicFastFieldReader<Item: FastValue> {
|
||||
LinearInterpol(FastFieldReaderCodecWrapper<Item, LinearInterpolFastFieldReader>),
|
||||
/// Blockwise linear interpolated values + bitpacked
|
||||
MultiLinearInterpol(FastFieldReaderCodecWrapper<Item, MultiLinearInterpolFastFieldReader>),
|
||||
/// Piecewise linear interpolated values + bitpacked
|
||||
PiecewiseLinear(FastFieldReaderCodecWrapper<Item, PiecewiseLinearFastFieldReader>),
|
||||
}
|
||||
|
||||
impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
@@ -86,12 +93,14 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
BitpackedReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
LinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::LinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
LinearInterpolFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
#[allow(deprecated)]
|
||||
MultiLinearInterpolFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::MultiLinearInterpol(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
@@ -100,6 +109,12 @@ impl<Item: FastValue> DynamicFastFieldReader<Item> {
|
||||
bytes
|
||||
)?)
|
||||
}
|
||||
PiecewiseLinearFastFieldSerializer::ID => {
|
||||
DynamicFastFieldReader::PiecewiseLinear(FastFieldReaderCodecWrapper::<
|
||||
Item,
|
||||
PiecewiseLinearFastFieldReader,
|
||||
>::open_from_bytes(bytes)?)
|
||||
}
|
||||
_ => {
|
||||
panic!(
|
||||
"unknown fastfield id {:?}. Data corrupted or using old tantivy version.",
|
||||
@@ -118,6 +133,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get(doc),
|
||||
Self::LinearInterpol(reader) => reader.get(doc),
|
||||
Self::MultiLinearInterpol(reader) => reader.get(doc),
|
||||
Self::PiecewiseLinear(reader) => reader.get(doc),
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
@@ -126,6 +142,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.get_range(start, output),
|
||||
Self::LinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::MultiLinearInterpol(reader) => reader.get_range(start, output),
|
||||
Self::PiecewiseLinear(reader) => reader.get_range(start, output),
|
||||
}
|
||||
}
|
||||
fn min_value(&self) -> Item {
|
||||
@@ -133,6 +150,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.min_value(),
|
||||
Self::LinearInterpol(reader) => reader.min_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.min_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.min_value(),
|
||||
}
|
||||
}
|
||||
fn max_value(&self) -> Item {
|
||||
@@ -140,6 +158,7 @@ impl<Item: FastValue> FastFieldReader<Item> for DynamicFastFieldReader<Item> {
|
||||
Self::Bitpacked(reader) => reader.max_value(),
|
||||
Self::LinearInterpol(reader) => reader.max_value(),
|
||||
Self::MultiLinearInterpol(reader) => reader.max_value(),
|
||||
Self::PiecewiseLinear(reader) => reader.max_value(),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -176,9 +195,12 @@ impl<Item: FastValue, C: FastFieldCodecReader> FastFieldReaderCodecWrapper<Item,
|
||||
_phantom: PhantomData,
|
||||
})
|
||||
}
|
||||
#[inline]
|
||||
pub(crate) fn get_u64(&self, doc: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(doc, self.bytes.as_slice()))
|
||||
|
||||
/// Get the u64 value at index `idx`.
/// `idx` can be either a `DocId` or an index used for
|
||||
/// `multivalued` fast field. See [`get_range`] for more details.
|
||||
pub(crate) fn get_u64(&self, idx: u64) -> Item {
|
||||
Item::from_u64(self.reader.get_u64(idx, self.bytes.as_slice()))
|
||||
}
|
||||
|
||||
/// Internally, `multivalued` fast fields also use single-value fast fields.
|
||||
@@ -39,9 +39,6 @@ pub(crate) fn type_and_cardinality(field_type: &FieldType) -> Option<(FastType,
|
||||
.get_fastfield_cardinality()
|
||||
.map(|cardinality| (FastType::Date, cardinality)),
|
||||
FieldType::Facet(_) => Some((FastType::U64, Cardinality::MultiValues)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some((FastType::U64, Cardinality::MultiValues))
|
||||
}
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -4,9 +4,9 @@ use common::{BinarySerializable, CountingWriter};
|
||||
pub use fastfield_codecs::bitpacked::{
|
||||
BitpackedFastFieldSerializer, BitpackedFastFieldSerializerLegacy,
|
||||
};
|
||||
use fastfield_codecs::linearinterpol::LinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::multilinearinterpol::MultiLinearInterpolFastFieldSerializer;
|
||||
use fastfield_codecs::piecewise_linear::PiecewiseLinearFastFieldSerializer;
|
||||
pub use fastfield_codecs::{FastFieldCodecSerializer, FastFieldDataAccess, FastFieldStats};
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::directory::{CompositeWrite, WritePtr};
|
||||
use crate::schema::Field;
|
||||
@@ -35,18 +35,31 @@ pub struct CompositeFastFieldSerializer {
|
||||
composite_write: CompositeWrite<WritePtr>,
|
||||
}
|
||||
|
||||
// use this, when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
#[derive(Debug)]
|
||||
pub struct CodecEstimationResult<'a> {
|
||||
pub ratio: f32,
|
||||
pub name: &'a str,
|
||||
pub id: u8,
|
||||
}
|
||||
|
||||
// TODO: use this when this is merged and stabilized explicit_generic_args_with_impl_trait
|
||||
// https://github.com/rust-lang/rust/pull/86176
|
||||
fn codec_estimation<T: FastFieldCodecSerializer, A: FastFieldDataAccess>(
|
||||
stats: FastFieldStats,
|
||||
fastfield_accessor: &A,
|
||||
estimations: &mut Vec<(f32, &str, u8)>,
|
||||
) {
|
||||
) -> CodecEstimationResult {
|
||||
if !T::is_applicable(fastfield_accessor, stats.clone()) {
|
||||
return;
|
||||
return CodecEstimationResult {
|
||||
ratio: f32::MAX,
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
};
|
||||
}
|
||||
CodecEstimationResult {
|
||||
ratio: T::estimate_compression_ratio(fastfield_accessor, stats),
|
||||
name: T::NAME,
|
||||
id: T::ID,
|
||||
}
|
||||
let (ratio, name, id) = (T::estimate(fastfield_accessor, stats), T::NAME, T::ID);
|
||||
estimations.push((ratio, name, id));
|
||||
}
|
||||
|
||||
impl CompositeFastFieldSerializer {
|
||||
@@ -59,7 +72,7 @@ impl CompositeFastFieldSerializer {
|
||||
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field(
|
||||
pub fn new_u64_fast_field_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
@@ -67,7 +80,7 @@ impl CompositeFastFieldSerializer {
|
||||
data_iter_1: impl Iterator<Item = u64>,
|
||||
data_iter_2: impl Iterator<Item = u64>,
|
||||
) -> io::Result<()> {
|
||||
self.create_auto_detect_u64_fast_field_with_idx(
|
||||
self.new_u64_fast_field_with_idx_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -78,7 +91,7 @@ impl CompositeFastFieldSerializer {
|
||||
}
|
||||
/// Serialize data into a new u64 fast field. The best compression codec will be chosen
|
||||
/// automatically.
|
||||
pub fn create_auto_detect_u64_fast_field_with_idx(
|
||||
pub fn new_u64_fast_field_with_idx_with_best_codec(
|
||||
&mut self,
|
||||
field: Field,
|
||||
stats: FastFieldStats,
|
||||
@@ -88,42 +101,29 @@ impl CompositeFastFieldSerializer {
|
||||
idx: usize,
|
||||
) -> io::Result<()> {
|
||||
let field_write = self.composite_write.for_field_with_idx(field, idx);
|
||||
|
||||
let mut estimations = vec![];
|
||||
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<LinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
codec_estimation::<MultiLinearInterpolFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
&mut estimations,
|
||||
);
|
||||
if let Some(broken_estimation) = estimations.iter().find(|estimation| estimation.0.is_nan())
|
||||
{
|
||||
warn!(
|
||||
"broken estimation for fast field codec {}",
|
||||
broken_estimation.1
|
||||
);
|
||||
}
|
||||
// removing nan values for codecs with broken calculations, and max values which disables
|
||||
// codecs
|
||||
estimations.retain(|estimation| !estimation.0.is_nan() && estimation.0 != f32::MAX);
|
||||
estimations.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap());
|
||||
let (_ratio, name, id) = estimations[0];
|
||||
let estimations = vec![
|
||||
codec_estimation::<BitpackedFastFieldSerializer, _>(stats.clone(), &fastfield_accessor),
|
||||
codec_estimation::<PiecewiseLinearFastFieldSerializer, _>(
|
||||
stats.clone(),
|
||||
&fastfield_accessor,
|
||||
),
|
||||
];
|
||||
let best_codec_result = estimations
|
||||
.iter()
|
||||
.sorted_by(|result_a, result_b| {
|
||||
result_a
|
||||
.ratio
|
||||
.partial_cmp(&result_b.ratio)
|
||||
.expect("Ratio cannot be nan.")
|
||||
})
|
||||
.next()
|
||||
.expect("A codec must be present.");
|
||||
debug!(
|
||||
"choosing fast field codec {} for field_id {:?}",
|
||||
name, field
|
||||
); // todo print actual field name
|
||||
id.serialize(field_write)?;
|
||||
match name {
|
||||
"Choosing fast field codec {} for field_id {:?} among {:?}",
|
||||
best_codec_result.name, field, estimations,
|
||||
);
|
||||
best_codec_result.id.serialize(field_write)?;
|
||||
match best_codec_result.name {
|
||||
BitpackedFastFieldSerializer::NAME => {
|
||||
BitpackedFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
@@ -133,17 +133,8 @@ impl CompositeFastFieldSerializer {
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
LinearInterpolFastFieldSerializer::NAME => {
|
||||
LinearInterpolFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
data_iter_1,
|
||||
data_iter_2,
|
||||
)?;
|
||||
}
|
||||
MultiLinearInterpolFastFieldSerializer::NAME => {
|
||||
MultiLinearInterpolFastFieldSerializer::serialize(
|
||||
PiecewiseLinearFastFieldSerializer::NAME => {
|
||||
PiecewiseLinearFastFieldSerializer::serialize(
|
||||
field_write,
|
||||
&fastfield_accessor,
|
||||
stats,
|
||||
@@ -152,7 +143,7 @@ impl CompositeFastFieldSerializer {
|
||||
)?;
|
||||
}
|
||||
_ => {
|
||||
panic!("unknown fastfield serializer {}", name)
|
||||
panic!("unknown fastfield serializer {}", best_codec_result.name)
|
||||
}
|
||||
};
|
||||
field_write.flush()?;
|
||||
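Both sides of this hunk follow the same recipe: ask each candidate codec for an estimated compression ratio, discard or rank the non-applicable ones, pick the smallest ratio, write that codec's id, and dispatch serialization on its name. A compact sketch of the selection step (hypothetical types; in the real code the estimations come from the codec serializers):

```rust
#[derive(Debug)]
struct CodecEstimation {
    ratio: f32,
    name: &'static str,
    id: u8,
}

// Pick the codec with the smallest estimated compression ratio. Non-applicable
// codecs report f32::MAX and therefore never win against an applicable one.
fn pick_best_codec(estimations: &[CodecEstimation]) -> &CodecEstimation {
    estimations
        .iter()
        .min_by(|a, b| a.ratio.partial_cmp(&b.ratio).expect("ratio must not be NaN"))
        .expect("at least one codec estimation")
}

fn main() {
    let estimations = [
        CodecEstimation { ratio: 0.52, name: "Bitpacked", id: 1 },
        CodecEstimation { ratio: 0.31, name: "PiecewiseLinear", id: 4 },
    ];
    let best = pick_best_codec(&estimations);
    assert_eq!((best.name, best.id), ("PiecewiseLinear", 4));
}
```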
@@ -216,3 +207,45 @@ impl<'a, W: Write> FastBytesFieldSerializer<'a, W> {
|
||||
self.write.flush()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::path::Path;
|
||||
|
||||
use common::BinarySerializable;
|
||||
use fastfield_codecs::FastFieldStats;
|
||||
use itertools::Itertools;
|
||||
|
||||
use super::CompositeFastFieldSerializer;
|
||||
use crate::directory::{RamDirectory, WritePtr};
|
||||
use crate::schema::Field;
|
||||
use crate::Directory;
|
||||
|
||||
#[test]
|
||||
fn new_u64_fast_field_with_best_codec() -> crate::Result<()> {
|
||||
let directory: RamDirectory = RamDirectory::create();
|
||||
let path = Path::new("test");
|
||||
let write: WritePtr = directory.open_write(path)?;
|
||||
let mut serializer = CompositeFastFieldSerializer::from_write(write)?;
|
||||
let vals = (0..10000u64).into_iter().collect_vec();
|
||||
let stats = FastFieldStats {
|
||||
min_value: 0,
|
||||
max_value: 9999,
|
||||
num_vals: vals.len() as u64,
|
||||
};
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
Field::from_field_id(0),
|
||||
stats,
|
||||
vals.clone(),
|
||||
vals.clone().into_iter(),
|
||||
vals.into_iter(),
|
||||
)?;
|
||||
serializer.close()?;
|
||||
// get the codecs id
|
||||
let mut bytes = directory.open_read(path)?.read_bytes()?;
|
||||
let codec_id = u8::deserialize(&mut bytes)?;
|
||||
// Codec id = 4 is piecewise linear.
|
||||
assert_eq!(codec_id, 4);
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -7,7 +7,7 @@ use tantivy_bitpacker::BlockedBitpacker;
|
||||
|
||||
use super::multivalued::MultiValuedFastFieldWriter;
|
||||
use super::serializer::FastFieldStats;
|
||||
use super::{FastFieldDataAccess, FastFieldType};
|
||||
use super::FastFieldDataAccess;
|
||||
use crate::fastfield::{BytesFastFieldWriter, CompositeFastFieldSerializer};
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::UnorderedTermId;
|
||||
@@ -16,7 +16,6 @@ use crate::termdict::TermOrdinal;
|
||||
|
||||
/// The `FastFieldsWriter` groups all of the fast field writers.
|
||||
pub struct FastFieldsWriter {
|
||||
term_id_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
single_value_writers: Vec<IntFastFieldWriter>,
|
||||
multi_values_writers: Vec<MultiValuedFastFieldWriter>,
|
||||
bytes_value_writers: Vec<BytesFastFieldWriter>,
|
||||
@@ -34,7 +33,6 @@ impl FastFieldsWriter {
|
||||
/// Create all `FastFieldWriter` required by the schema.
|
||||
pub fn from_schema(schema: &Schema) -> FastFieldsWriter {
|
||||
let mut single_value_writers = Vec::new();
|
||||
let mut term_id_writers = Vec::new();
|
||||
let mut multi_values_writers = Vec::new();
|
||||
let mut bytes_value_writers = Vec::new();
|
||||
|
||||
@@ -52,22 +50,15 @@ impl FastFieldsWriter {
|
||||
single_value_writers.push(fast_field_writer);
|
||||
}
|
||||
Some(Cardinality::MultiValues) => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Numeric);
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, false);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
None => {}
|
||||
}
|
||||
}
|
||||
FieldType::Facet(_) => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::Facet);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Str(_) if field_entry.is_fast() => {
|
||||
let fast_field_writer =
|
||||
MultiValuedFastFieldWriter::new(field, FastFieldType::String);
|
||||
term_id_writers.push(fast_field_writer);
|
||||
let fast_field_writer = MultiValuedFastFieldWriter::new(field, true);
|
||||
multi_values_writers.push(fast_field_writer);
|
||||
}
|
||||
FieldType::Bytes(bytes_option) => {
|
||||
if bytes_option.is_fast() {
|
||||
@@ -79,7 +70,6 @@ impl FastFieldsWriter {
|
||||
}
|
||||
}
|
||||
FastFieldsWriter {
|
||||
term_id_writers,
|
||||
single_value_writers,
|
||||
multi_values_writers,
|
||||
bytes_value_writers,
|
||||
@@ -88,15 +78,10 @@ impl FastFieldsWriter {
|
||||
|
||||
/// The memory used (including children)
pub fn mem_usage(&self) -> usize {
|
||||
self.term_id_writers
|
||||
self.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.single_value_writers
|
||||
.iter()
|
||||
.map(|w| w.mem_usage())
|
||||
.sum::<usize>()
|
||||
+ self
|
||||
.multi_values_writers
|
||||
.iter()
|
||||
@@ -109,14 +94,6 @@ impl FastFieldsWriter {
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer(&self, field: Field) -> Option<&MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_field_writer(&self, field: Field) -> Option<&IntFastFieldWriter> {
|
||||
// TODO optimize
|
||||
@@ -133,17 +110,6 @@ impl FastFieldsWriter {
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Get the `FastFieldWriter` associated to a field.
|
||||
pub fn get_term_id_writer_mut(
|
||||
&mut self,
|
||||
field: Field,
|
||||
) -> Option<&mut MultiValuedFastFieldWriter> {
|
||||
// TODO optimize
|
||||
self.term_id_writers
|
||||
.iter_mut()
|
||||
.find(|field_writer| field_writer.field() == field)
|
||||
}
|
||||
|
||||
/// Returns the fast field multi-value writer for the given field.
|
||||
///
|
||||
/// Returns None if the field does not exist, or is not
|
||||
@@ -171,9 +137,6 @@ impl FastFieldsWriter {
|
||||
|
||||
/// Indexes all of the fastfields of a new document.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
for field_writer in &mut self.term_id_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
for field_writer in &mut self.single_value_writers {
|
||||
field_writer.add_document(doc);
|
||||
}
|
||||
@@ -193,10 +156,6 @@ impl FastFieldsWriter {
|
||||
mapping: &HashMap<Field, FnvHashMap<UnorderedTermId, TermOrdinal>>,
|
||||
doc_id_map: Option<&DocIdMapping>,
|
||||
) -> io::Result<()> {
|
||||
for field_writer in &self.term_id_writers {
|
||||
let field = field_writer.field();
|
||||
field_writer.serialize(serializer, mapping.get(&field), doc_id_map)?;
|
||||
}
|
||||
for field_writer in &self.single_value_writers {
|
||||
field_writer.serialize(serializer, doc_id_map)?;
|
||||
}
|
||||
@@ -285,10 +244,6 @@ impl IntFastFieldWriter {
|
||||
self.val_count += 1;
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
///
|
||||
///
|
||||
/// Extract the value associated to the fast field for
|
||||
/// this document.
|
||||
///
|
||||
@@ -299,17 +254,18 @@ impl IntFastFieldWriter {
|
||||
/// instead.
|
||||
/// If the document has more than one value for the given field,
|
||||
/// only the first one is taken in account.
|
||||
///
|
||||
/// Values on text fast fields are skipped.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
fn extract_val(&self, doc: &Document) -> u64 {
|
||||
match doc.get_first(self.field) {
|
||||
Some(v) => {
|
||||
self.add_val(super::value_to_u64(v));
|
||||
}
|
||||
None => {
|
||||
self.add_val(self.val_if_missing);
|
||||
}
|
||||
};
|
||||
Some(v) => super::value_to_u64(v),
|
||||
None => self.val_if_missing,
|
||||
}
|
||||
}
|
||||
|
||||
/// Extract the fast field value from the document
|
||||
/// (or use the default value) and records it.
|
||||
pub fn add_document(&mut self, doc: &Document) {
|
||||
let val = self.extract_val(doc);
|
||||
self.add_val(val);
|
||||
}
|
||||
|
||||
/// get iterator over the data
|
||||
@@ -328,7 +284,6 @@ impl IntFastFieldWriter {
|
||||
} else {
|
||||
(self.val_min, self.val_max)
|
||||
};
|
||||
|
||||
let fastfield_accessor = WriterFastFieldAccessProvider {
|
||||
doc_id_map,
|
||||
vals: &self.vals,
|
||||
@@ -343,7 +298,7 @@ impl IntFastFieldWriter {
|
||||
let iter = doc_id_map
|
||||
.iter_old_doc_ids()
|
||||
.map(|doc_id| self.vals.get(doc_id as usize));
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -351,7 +306,7 @@ impl IntFastFieldWriter {
|
||||
iter,
|
||||
)?;
|
||||
} else {
|
||||
serializer.create_auto_detect_u64_fast_field(
|
||||
serializer.new_u64_fast_field_with_best_codec(
|
||||
self.field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
|
||||
@@ -116,14 +116,14 @@ pub fn demux(
|
||||
) -> crate::Result<Vec<Index>> {
|
||||
let mut indices = vec![];
|
||||
for (target_segment_ord, output_directory) in output_directories.into_iter().enumerate() {
|
||||
let alive_bitset = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
let delete_bitsets = get_alive_bitsets(demux_mapping, target_segment_ord as u32)
|
||||
.into_iter()
|
||||
.map(Some)
|
||||
.collect_vec();
|
||||
let index = merge_filtered_segments(
|
||||
segments,
|
||||
target_settings.clone(),
|
||||
alive_bitset,
|
||||
delete_bitsets,
|
||||
output_directory,
|
||||
)?;
|
||||
indices.push(index);
|
||||
@@ -141,7 +141,7 @@ mod tests {
|
||||
use crate::{DocAddress, Term};
|
||||
|
||||
#[test]
|
||||
fn test_demux_map_to_alive_bitset() {
|
||||
fn test_demux_map_to_deletebitset() {
|
||||
let max_value = 2;
|
||||
let mut demux_mapping = DemuxMapping::default();
|
||||
// segment ordinal 0 mapping
|
||||
|
||||
@@ -4,6 +4,7 @@ use std::thread;
|
||||
use std::thread::JoinHandle;
|
||||
|
||||
use common::BitSet;
|
||||
use crossbeam::channel;
|
||||
use smallvec::smallvec;
|
||||
|
||||
use super::operation::{AddOperation, UserOperation};
|
||||
@@ -288,7 +289,7 @@ impl IndexWriter {
|
||||
return Err(TantivyError::InvalidArgument(err_msg));
|
||||
}
|
||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
||||
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
|
||||
let delete_queue = DeleteQueue::new();
|
||||
|
||||
@@ -325,7 +326,7 @@ impl IndexWriter {
|
||||
}
|
||||
|
||||
fn drop_sender(&mut self) {
|
||||
let (sender, _receiver) = crossbeam_channel::bounded(1);
|
||||
let (sender, _receiver) = channel::bounded(1);
|
||||
self.operation_sender = sender;
|
||||
}
|
||||
|
||||
@@ -531,7 +532,7 @@ impl IndexWriter {
|
||||
/// Returns the former segment_ready channel.
|
||||
fn recreate_document_channel(&mut self) {
|
||||
let (document_sender, document_receiver): (AddBatchSender, AddBatchReceiver) =
|
||||
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
|
||||
self.operation_sender = document_sender;
|
||||
self.index_writer_status = IndexWriterStatus::from(document_receiver);
|
||||
}
|
||||
|
||||
@@ -92,7 +92,7 @@ impl Drop for IndexWriterBomb {
|
||||
mod tests {
|
||||
use std::mem;
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
use crossbeam::channel;
|
||||
|
||||
use super::IndexWriterStatus;
|
||||
|
||||
|
||||
@@ -4,7 +4,7 @@ use murmurhash32::murmurhash2;
|
||||
use crate::fastfield::FastValue;
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
|
||||
use crate::schema::{Field, Type};
|
||||
use crate::schema::Type;
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
@@ -149,11 +149,10 @@ fn index_json_value<'a>(
|
||||
json_term_writer.term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
TextOrDateTime::DateTime(dt) => {
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(dt));
|
||||
json_term_writer.set_fast_value(DateTime::new_utc(dt));
|
||||
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
|
||||
}
|
||||
},
|
||||
@@ -199,77 +198,12 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime {
|
||||
}
|
||||
}
|
||||
|
||||
// Tries to infer a JSON type from a string
pub(crate) fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,
phrase: &str,
) -> Option<Term> {
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let dt_utc = dt.to_offset(UtcOffset::UTC);
return Some(set_fastvalue_and_get_term(
json_term_writer,
DateTime::from_utc(dt_utc),
));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}
None
}
// helper function to generate a Term from a json fastvalue
|
||||
pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
value: T,
|
||||
) -> Term {
|
||||
json_term_writer.set_fast_value(value);
|
||||
json_term_writer.term().clone()
|
||||
}
|
||||
|
||||
// helper function to generate a list of terms with their positions from a textual json value
|
||||
pub(crate) fn set_string_and_get_terms(
|
||||
json_term_writer: &mut JsonTermWriter,
|
||||
value: &str,
|
||||
text_analyzer: &TextAnalyzer,
|
||||
) -> Vec<(usize, Term)> {
|
||||
let mut positions_and_terms = Vec::<(usize, Term)>::new();
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
let term_num_bytes = json_term_writer.term_buffer.as_slice().len();
|
||||
let mut token_stream = text_analyzer.token_stream(value);
|
||||
token_stream.process(&mut |token| {
|
||||
json_term_writer.term_buffer.truncate(term_num_bytes);
|
||||
json_term_writer
|
||||
.term_buffer
|
||||
.append_bytes(token.text.as_bytes());
|
||||
positions_and_terms.push((token.position, json_term_writer.term().clone()));
|
||||
});
|
||||
positions_and_terms
|
||||
}
|
||||
|
||||
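Taken together, the two helpers above cover the numeric and the textual case for a single JSON path. A minimal sketch, assuming a `json_field`, a `text_analyzer` and the `JsonTermWriter::from_field_and_json_path` constructor defined just below (all setup outside this hunk is assumed):

let mut term_buffer = Term::new();
let mut json_term_writer =
    JsonTermWriter::from_field_and_json_path(json_field, "attributes.color", &mut term_buffer);
// "42" parses as a number, so it is encoded as a fast-value term.
assert!(convert_to_fast_value_and_get_term(&mut json_term_writer, "42").is_some());
// Free text is run through the analyzer: one (position, term) pair per token.
let positions_and_terms =
    set_string_and_get_terms(&mut json_term_writer, "dark red", &text_analyzer);
assert_eq!(positions_and_terms.len(), 2);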
pub struct JsonTermWriter<'a> {
|
||||
term_buffer: &'a mut Term,
|
||||
path_stack: Vec<usize>,
|
||||
}
|
||||
|
||||
impl<'a> JsonTermWriter<'a> {
|
||||
pub fn from_field_and_json_path(
|
||||
field: Field,
|
||||
json_path: &str,
|
||||
term_buffer: &'a mut Term,
|
||||
) -> Self {
|
||||
term_buffer.set_field(Type::Json, field);
|
||||
let mut json_term_writer = Self::wrap(term_buffer);
|
||||
for segment in json_path.split('.') {
|
||||
json_term_writer.push_path_segment(segment);
|
||||
}
|
||||
json_term_writer
|
||||
}
|
||||
|
||||
pub fn wrap(term_buffer: &'a mut Term) -> Self {
|
||||
term_buffer.clear_with_type(Type::Json);
|
||||
let mut path_stack = Vec::with_capacity(10);
|
||||
|
||||
@@ -170,8 +170,8 @@ impl IndexMerger {
|
||||
index_settings: IndexSettings,
|
||||
segments: &[Segment],
|
||||
) -> crate::Result<IndexMerger> {
|
||||
let alive_bitset = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
|
||||
let delete_bitsets = segments.iter().map(|_| None).collect_vec();
|
||||
Self::open_with_custom_alive_set(schema, index_settings, segments, delete_bitsets)
|
||||
}
|
||||
|
||||
// Create merge with a custom delete set.
|
||||
@@ -180,7 +180,7 @@ impl IndexMerger {
|
||||
// corresponds to the segment index.
|
||||
//
|
||||
// If `None` is provided for custom alive set, the regular alive set will be used.
|
||||
// If a alive_bitset is provided, the union between the provided and regular
|
||||
// If a delete_bitsets is provided, the union between the provided and regular
|
||||
// alive set will be used.
|
||||
//
|
||||
// This can be used to merge but also apply an additional filter.
|
||||
@@ -283,12 +283,12 @@ impl IndexMerger {
|
||||
for (field, field_entry) in self.schema.fields() {
|
||||
let field_type = field_entry.field_type();
|
||||
match field_type {
|
||||
FieldType::Facet(_) | FieldType::Str(_) if field_type.is_fast() => {
|
||||
FieldType::Facet(_) => {
|
||||
let term_ordinal_mapping = term_ord_mappings.remove(&field).expect(
|
||||
"Logic Error in Tantivy (Please report). Facet field should have required \
|
||||
a`term_ordinal_mapping`.",
|
||||
);
|
||||
self.write_term_id_fast_field(
|
||||
self.write_hierarchical_facet_field(
|
||||
field,
|
||||
&term_ordinal_mapping,
|
||||
fast_field_serializer,
|
||||
@@ -312,8 +312,8 @@ impl IndexMerger {
|
||||
self.write_bytes_fast_field(field, fast_field_serializer, doc_id_mapping)?;
|
||||
}
|
||||
}
|
||||
_ => {
|
||||
// We don't handle json fast field for the moment
|
||||
FieldType::Str(_) | FieldType::JsonObject(_) => {
|
||||
// We don't handle json / string fast field for the moment
|
||||
// They can be implemented using what is done
|
||||
// for facets in the future
|
||||
}
|
||||
@@ -384,7 +384,7 @@ impl IndexMerger {
|
||||
let fast_field_reader = &fast_field_readers[*reader_ordinal as usize];
|
||||
fast_field_reader.get(*doc_id)
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -551,7 +551,7 @@ impl IndexMerger {
|
||||
}
|
||||
offsets.push(offset);
|
||||
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field(
|
||||
fast_field_serializer.new_u64_fast_field_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
&offsets[..],
|
||||
@@ -590,14 +590,14 @@ impl IndexMerger {
|
||||
)
|
||||
}
|
||||
|
||||
fn write_term_id_fast_field(
|
||||
fn write_hierarchical_facet_field(
|
||||
&self,
|
||||
field: Field,
|
||||
term_ordinal_mappings: &TermOrdinalMapping,
|
||||
fast_field_serializer: &mut CompositeFastFieldSerializer,
|
||||
doc_id_mapping: &SegmentDocIdMapping,
|
||||
) -> crate::Result<()> {
|
||||
debug_time!("write-term-id-fast-field");
|
||||
debug_time!("write-hierarchical-facet-field");
|
||||
|
||||
// Multifastfield consists of 2 fastfields.
|
||||
// The first serves as an index into the second one and is stricly increasing.
|
||||
@@ -771,7 +771,7 @@ impl IndexMerger {
|
||||
ff_reader.get_vals(*doc_id, &mut vals);
|
||||
vals.into_iter()
|
||||
});
|
||||
fast_field_serializer.create_auto_detect_u64_fast_field_with_idx(
|
||||
fast_field_serializer.new_u64_fast_field_with_idx_with_best_codec(
|
||||
field,
|
||||
stats,
|
||||
fastfield_accessor,
|
||||
@@ -848,9 +848,6 @@ impl IndexMerger {
|
||||
|
||||
let mut term_ord_mapping_opt = match field_type {
|
||||
FieldType::Facet(_) => Some(TermOrdinalMapping::new(max_term_ords)),
|
||||
FieldType::Str(options) if options.is_fast() => {
|
||||
Some(TermOrdinalMapping::new(max_term_ords))
|
||||
}
|
||||
_ => None,
|
||||
};
|
||||
|
||||
@@ -1177,7 +1174,7 @@ mod tests {
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "af b",
|
||||
score_field => 3u64,
|
||||
date_field => DateTime::from_utc(curr_time),
|
||||
date_field => DateTime::new_utc(curr_time),
|
||||
bytes_score_field => 3u32.to_be_bytes().as_ref()
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
@@ -1194,7 +1191,7 @@ mod tests {
|
||||
// writing the segment
|
||||
index_writer.add_document(doc!(
|
||||
text_field => "af b",
|
||||
date_field => DateTime::from_utc(curr_time),
|
||||
date_field => DateTime::new_utc(curr_time),
|
||||
score_field => 11u64,
|
||||
bytes_score_field => 11u32.to_be_bytes().as_ref()
|
||||
))?;
|
||||
@@ -1252,7 +1249,7 @@ mod tests {
|
||||
assert_eq!(
|
||||
get_doc_ids(vec![Term::from_field_date(
|
||||
date_field,
|
||||
DateTime::from_utc(curr_time)
|
||||
DateTime::new_utc(curr_time)
|
||||
)])?,
|
||||
vec![DocAddress::new(0, 0), DocAddress::new(0, 3)]
|
||||
);
|
||||
|
||||
@@ -21,13 +21,11 @@ pub mod segment_updater;
|
||||
mod segment_writer;
|
||||
mod stamper;
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
use crossbeam::channel;
|
||||
use smallvec::SmallVec;
|
||||
|
||||
pub use self::index_writer::IndexWriter;
|
||||
pub(crate) use self::json_term_writer::{
|
||||
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
|
||||
};
|
||||
pub(crate) use self::json_term_writer::JsonTermWriter;
|
||||
pub use self::log_merge_policy::LogMergePolicy;
|
||||
pub use self::merge_operation::MergeOperation;
|
||||
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};
|
||||
|
||||
@@ -39,10 +39,9 @@ impl SegmentSerializer {
|
||||
|
||||
let postings_serializer = InvertedIndexSerializer::open(&mut segment)?;
|
||||
let compressor = segment.index().settings().docstore_compression;
|
||||
let blocksize = segment.index().settings().docstore_blocksize;
|
||||
Ok(SegmentSerializer {
|
||||
segment,
|
||||
store_writer: StoreWriter::new(store_write, compressor, blocksize),
|
||||
store_writer: StoreWriter::new(store_write, compressor),
|
||||
fast_field_serializer,
|
||||
fieldnorms_serializer: Some(fieldnorms_serializer),
|
||||
postings_serializer,
|
||||
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::borrow::BorrowMut;
|
||||
use std::collections::HashSet;
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
use std::ops::Deref;
|
||||
use std::path::PathBuf;
|
||||
@@ -26,7 +27,7 @@ use crate::indexer::{
|
||||
SegmentSerializer,
|
||||
};
|
||||
use crate::schema::Schema;
|
||||
use crate::{FutureResult, Opstamp};
|
||||
use crate::{FutureResult, Opstamp, TantivyError};
|
||||
|
||||
const NUM_MERGE_THREADS: usize = 4;
|
||||
|
||||
@@ -72,12 +73,10 @@ fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()>
|
||||
let mut buffer = serde_json::to_vec_pretty(metas)?;
|
||||
// Just adding a new line at the end of the buffer.
|
||||
writeln!(&mut buffer)?;
|
||||
fail_point!("save_metas", |msg| Err(crate::TantivyError::from(
|
||||
std::io::Error::new(
|
||||
std::io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
)
|
||||
)));
|
||||
fail_point!("save_metas", |msg| Err(TantivyError::from(io::Error::new(
|
||||
io::ErrorKind::Other,
|
||||
msg.unwrap_or_else(|| "Undefined".to_string())
|
||||
))));
|
||||
directory.sync_directory()?;
|
||||
directory.atomic_write(&META_FILEPATH, &buffer[..])?;
|
||||
debug!("Saved metas {:?}", serde_json::to_string_pretty(&metas));
|
||||
|
||||
@@ -188,7 +188,7 @@ impl SegmentWriter {
|
||||
});
|
||||
if let Some(unordered_term_id) = unordered_term_id_opt {
|
||||
self.fast_field_writers
|
||||
.get_term_id_writer_mut(field)
|
||||
.get_multivalue_writer_mut(field)
|
||||
.expect("writer for facet missing")
|
||||
.add_val(unordered_term_id);
|
||||
}
|
||||
@@ -221,7 +221,6 @@ impl SegmentWriter {
|
||||
}
|
||||
|
||||
let mut indexing_position = IndexingPosition::default();
|
||||
|
||||
for mut token_stream in token_streams {
|
||||
assert_eq!(term_buffer.as_slice().len(), 5);
|
||||
postings_writer.index_text(
|
||||
@@ -230,13 +229,10 @@ impl SegmentWriter {
|
||||
term_buffer,
|
||||
ctx,
|
||||
&mut indexing_position,
|
||||
self.fast_field_writers.get_term_id_writer_mut(field),
|
||||
);
|
||||
}
|
||||
if field_entry.has_fieldnorms() {
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
}
|
||||
self.fieldnorms_writer
|
||||
.record(doc_id, field, indexing_position.num_tokens);
|
||||
}
|
||||
FieldType::U64(_) => {
|
||||
for value in values {
|
||||
@@ -372,10 +368,9 @@ fn remap_and_write(
|
||||
.segment_mut()
|
||||
.open_write(SegmentComponent::Store)?;
|
||||
let compressor = serializer.segment().index().settings().docstore_compression;
|
||||
let block_size = serializer.segment().index().settings().docstore_blocksize;
|
||||
let old_store_writer = std::mem::replace(
|
||||
&mut serializer.store_writer,
|
||||
StoreWriter::new(store_write, compressor, block_size),
|
||||
StoreWriter::new(store_write, compressor),
|
||||
);
|
||||
old_store_writer.close()?;
|
||||
let store_read = StoreReader::open(
|
||||
@@ -528,7 +523,7 @@ mod tests {
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.pop_path_segment();
|
||||
json_term_writer.push_path_segment("date");
|
||||
json_term_writer.set_fast_value(DateTime::from_utc(
|
||||
json_term_writer.set_fast_value(DateTime::new_utc(
|
||||
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
|
||||
));
|
||||
assert!(term_stream.advance());
|
||||
|
||||
24 src/lib.rs
@@ -158,7 +158,7 @@ impl DateTime {
    ///
    /// The given date/time is converted to UTC and the actual
    /// time zone is discarded.
    pub const fn from_utc(dt: OffsetDateTime) -> Self {
    pub const fn new_utc(dt: OffsetDateTime) -> Self {
        Self::from_unix_timestamp(dt.unix_timestamp())
    }

@@ -166,19 +166,19 @@ impl DateTime {
    ///
    /// Implicitly assumes that the given date/time is in UTC!
    /// Otherwise the original value must only be reobtained with
    /// [`Self::into_primitive()`].
    pub const fn from_primitive(dt: PrimitiveDateTime) -> Self {
        Self::from_utc(dt.assume_utc())
    /// [`to_primitive()`].
    pub const fn new_primitive(dt: PrimitiveDateTime) -> Self {
        Self::new_utc(dt.assume_utc())
    }

    /// Convert to UNIX timestamp
    pub const fn into_unix_timestamp(self) -> i64 {
    pub const fn to_unix_timestamp(self) -> i64 {
        let Self { unix_timestamp } = self;
        unix_timestamp
    }

    /// Convert to UTC `OffsetDateTime`
    pub fn into_utc(self) -> OffsetDateTime {
    pub fn to_utc(self) -> OffsetDateTime {
        let Self { unix_timestamp } = self;
        let utc_datetime =
            OffsetDateTime::from_unix_timestamp(unix_timestamp).expect("valid UNIX timestamp");
@@ -187,16 +187,16 @@ impl DateTime {
    }

    /// Convert to `OffsetDateTime` with the given time zone
    pub fn into_offset(self, offset: UtcOffset) -> OffsetDateTime {
        self.into_utc().to_offset(offset)
    pub fn to_offset(self, offset: UtcOffset) -> OffsetDateTime {
        self.to_utc().to_offset(offset)
    }

    /// Convert to `PrimitiveDateTime` without any time zone
    ///
    /// The value should have been constructed with [`Self::from_primitive()`].
    /// The value should have been constructed with [`from_primitive()`].
    /// Otherwise the time zone is implicitly assumed to be UTC.
    pub fn into_primitive(self) -> PrimitiveDateTime {
        let utc_datetime = self.into_utc();
    pub fn to_primitive(self) -> PrimitiveDateTime {
        let utc_datetime = self.to_utc();
        // Discard the UTC time zone offset
        debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
        PrimitiveDateTime::new(utc_datetime.date(), utc_datetime.time())
@@ -205,7 +205,7 @@ impl DateTime {
    }

impl fmt::Debug for DateTime {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
        let utc_rfc3339 = self.to_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
        f.write_str(&utc_rfc3339)
    }
}

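A short sketch of how the renamed conversion helpers compose, using the `new_*`/`to_*` spellings from the hunk above; the `tantivy::time` re-export supplies the `time` crate types, everything else is assumed setup:

use tantivy::time::{Date, Month, OffsetDateTime, PrimitiveDateTime, Time};
use tantivy::DateTime;

// Any offset is discarded at construction time: the value is stored as UTC seconds.
let now = DateTime::new_utc(OffsetDateTime::now_utc());
let _secs: i64 = now.to_unix_timestamp();

// A PrimitiveDateTime is implicitly treated as UTC and round-trips unchanged.
let date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
let time = Time::from_hms(13, 20, 0).unwrap();
let dt = DateTime::new_primitive(PrimitiveDateTime::new(date, time));
assert_eq!(dt.to_primitive(), PrimitiveDateTime::new(date, time));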
@@ -1,6 +1,5 @@
|
||||
use std::io;
|
||||
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::postings_writer::SpecializedPostingsWriter;
|
||||
use crate::postings::recorder::{BufferLender, NothingRecorder, Recorder};
|
||||
@@ -43,7 +42,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
_fast_field_writer: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
self.str_posting_writer.index_text(
|
||||
doc_id,
|
||||
@@ -51,7 +49,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
|
||||
term_buffer,
|
||||
ctx,
|
||||
indexing_position,
|
||||
None,
|
||||
);
|
||||
}
|
||||
|
||||
|
||||
@@ -6,7 +6,6 @@ use std::ops::Range;
|
||||
use fnv::FnvHashMap;
|
||||
|
||||
use super::stacker::Addr;
|
||||
use crate::fastfield::MultiValuedFastFieldWriter;
|
||||
use crate::fieldnorm::FieldNormReaders;
|
||||
use crate::indexer::doc_id_mapping::DocIdMapping;
|
||||
use crate::postings::recorder::{BufferLender, Recorder};
|
||||
@@ -146,7 +145,6 @@ pub(crate) trait PostingsWriter {
|
||||
term_buffer: &mut Term,
|
||||
ctx: &mut IndexingContext,
|
||||
indexing_position: &mut IndexingPosition,
|
||||
mut term_id_fast_field_writer_opt: Option<&mut MultiValuedFastFieldWriter>,
|
||||
) {
|
||||
let end_of_path_idx = term_buffer.as_slice().len();
|
||||
let mut num_tokens = 0;
|
||||
@@ -166,14 +164,9 @@ pub(crate) trait PostingsWriter {
|
||||
term_buffer.append_bytes(token.text.as_bytes());
|
||||
let start_position = indexing_position.end_position + token.position as u32;
|
||||
end_position = start_position + token.position_length as u32;
|
||||
let unordered_term_id = self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
if let Some(term_id_fast_field_writer) = term_id_fast_field_writer_opt.as_mut() {
|
||||
term_id_fast_field_writer.add_val(unordered_term_id);
|
||||
}
|
||||
|
||||
self.subscribe(doc_id, start_position, term_buffer, ctx);
|
||||
num_tokens += 1;
|
||||
});
|
||||
|
||||
indexing_position.end_position = end_position + POSITION_GAP;
|
||||
indexing_position.num_tokens += num_tokens;
|
||||
term_buffer.truncate(end_of_path_idx);
|
||||
|
||||
@@ -247,7 +247,7 @@ impl MoreLikeThis {
|
||||
let unix_timestamp = value
|
||||
.as_date()
|
||||
.ok_or_else(|| TantivyError::InvalidArgument("invalid value".to_string()))?
|
||||
.into_unix_timestamp();
|
||||
.to_unix_timestamp();
|
||||
if !self.is_noise_word(unix_timestamp.to_string()) {
|
||||
let term = Term::from_field_i64(field, unix_timestamp);
|
||||
*term_frequencies.entry(term).or_insert(0) += 1;
|
||||
|
||||
@@ -184,66 +184,6 @@ fn intersection_with_slop(left: &mut [u32], right: &[u32], slop: u32) -> usize {
    count
}

fn intersection_count_with_slop(left: &[u32], right: &[u32], slop: u32) -> usize {
    let mut left_index = 0;
    let mut right_index = 0;
    let mut count = 0;
    let left_len = left.len();
    let right_len = right.len();
    while left_index < left_len && right_index < right_len {
        let left_val = left[left_index];
        let right_val = right[right_index];
        let right_slop = if right_val >= slop {
            right_val - slop
        } else {
            0
        };

        if left_val < right_slop {
            left_index += 1;
        } else if right_slop <= left_val && left_val <= right_val {
            while left_index + 1 < left_len {
                let next_left_val = left[left_index + 1];
                if next_left_val > right_val {
                    break;
                }
                left_index += 1;
            }
            count += 1;
            left_index += 1;
            right_index += 1;
        } else if left_val > right_val {
            right_index += 1;
        }
    }
    count
}

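A few hand-checked cases for the counting helper above: a left position matches a right position when it falls inside the window [right - slop, right], and each right position contributes at most one match.

assert_eq!(intersection_count_with_slop(&[2], &[2], 1), 1); // exact match
assert_eq!(intersection_count_with_slop(&[1], &[2], 1), 1); // one position apart, within slop
assert_eq!(intersection_count_with_slop(&[1], &[3], 1), 0); // two apart, outside slop 1
assert_eq!(intersection_count_with_slop(&[1, 2], &[2], 1), 1); // one count per right position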
fn intersection_exists_with_slop(left: &[u32], right: &[u32], slop: u32) -> bool {
|
||||
let mut left_index = 0;
|
||||
let mut right_index = 0;
|
||||
let left_len = left.len();
|
||||
let right_len = right.len();
|
||||
while left_index < left_len && right_index < right_len {
|
||||
let left_val = left[left_index];
|
||||
let right_val = right[right_index];
|
||||
let right_slop = if right_val >= slop {
|
||||
right_val - slop
|
||||
} else {
|
||||
0
|
||||
};
|
||||
|
||||
if left_val < right_slop {
|
||||
left_index += 1;
|
||||
} else if right_slop <= left_val && left_val <= right_val {
|
||||
return true;
|
||||
} else if left_val > right_val {
|
||||
right_index += 1;
|
||||
}
|
||||
}
|
||||
false
|
||||
}
|
||||
|
||||
impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
pub fn new(
|
||||
term_postings: Vec<(usize, TPostings)>,
|
||||
@@ -297,25 +237,11 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
|
||||
fn phrase_exists(&mut self) -> bool {
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
if self.has_slop() {
|
||||
return intersection_exists_with_slop(
|
||||
&self.left[..intersection_len],
|
||||
&self.right[..],
|
||||
self.slop,
|
||||
);
|
||||
}
|
||||
intersection_exists(&self.left[..intersection_len], &self.right[..])
|
||||
}
|
||||
|
||||
fn compute_phrase_count(&mut self) -> u32 {
|
||||
let intersection_len = self.compute_phrase_match();
|
||||
if self.has_slop() {
|
||||
return intersection_count_with_slop(
|
||||
&self.left[..intersection_len],
|
||||
&self.right[..],
|
||||
self.slop,
|
||||
) as u32;
|
||||
}
|
||||
intersection_count(&self.left[..intersection_len], &self.right[..]) as u32
|
||||
}
|
||||
|
||||
@@ -326,7 +252,12 @@ impl<TPostings: Postings> PhraseScorer<TPostings> {
|
||||
.positions(&mut self.left);
|
||||
}
|
||||
let mut intersection_len = self.left.len();
|
||||
for i in 1..self.num_terms - 1 {
|
||||
let end_term = if self.has_slop() {
|
||||
self.num_terms
|
||||
} else {
|
||||
self.num_terms - 1
|
||||
};
|
||||
for i in 1..end_term {
|
||||
{
|
||||
self.intersection_docset
|
||||
.docset_mut_specialized(i)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{BTreeSet, HashMap};
|
||||
use std::num::{ParseFloatError, ParseIntError};
|
||||
use std::ops::Bound;
|
||||
use std::str::FromStr;
|
||||
@@ -7,9 +7,7 @@ use tantivy_query_grammar::{UserInputAst, UserInputBound, UserInputLeaf, UserInp
|
||||
|
||||
use super::logical_ast::*;
|
||||
use crate::core::Index;
|
||||
use crate::indexer::{
|
||||
convert_to_fast_value_and_get_term, set_string_and_get_terms, JsonTermWriter,
|
||||
};
|
||||
use crate::indexer::JsonTermWriter;
|
||||
use crate::query::{
|
||||
AllQuery, BooleanQuery, BoostQuery, EmptyQuery, Occur, PhraseQuery, Query, RangeQuery,
|
||||
TermQuery,
|
||||
@@ -18,7 +16,7 @@ use crate::schema::{
|
||||
Facet, FacetParseError, Field, FieldType, IndexRecordOption, Schema, Term, Type,
|
||||
};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::OffsetDateTime;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
|
||||
use crate::{DateTime, Score};
|
||||
|
||||
@@ -26,13 +24,13 @@ use crate::{DateTime, Score};
|
||||
#[derive(Debug, PartialEq, Eq, Error)]
|
||||
pub enum QueryParserError {
|
||||
/// Error in the query syntax
|
||||
#[error("Syntax Error: {0}")]
|
||||
SyntaxError(String),
|
||||
#[error("Syntax Error")]
|
||||
SyntaxError,
|
||||
/// This query is unsupported.
|
||||
#[error("Unsupported query: {0}")]
|
||||
UnsupportedQuery(String),
|
||||
/// The query references a field that is not in the schema
|
||||
#[error("Field does not exists: '{0}'")]
|
||||
#[error("Field does not exists: '{0:?}'")]
|
||||
FieldDoesNotExist(String),
|
||||
/// The query contains a term for a `u64` or `i64`-field, but the value
|
||||
/// is neither.
|
||||
@@ -55,11 +53,11 @@ pub enum QueryParserError {
|
||||
NoDefaultFieldDeclared,
|
||||
/// The field searched for is not declared
|
||||
/// as indexed in the schema.
|
||||
#[error("The field '{0}' is not declared as indexed")]
|
||||
#[error("The field '{0:?}' is not declared as indexed")]
|
||||
FieldNotIndexed(String),
|
||||
/// A phrase query was requested for a field that does not
|
||||
/// have any positions indexed.
|
||||
#[error("The field '{0}' does not have positions indexed")]
|
||||
#[error("The field '{0:?}' does not have positions indexed")]
|
||||
FieldDoesNotHavePositionsIndexed(String),
|
||||
/// The tokenizer for the given field is unknown
|
||||
/// The two argument strings are the name of the field, the name of the tokenizer
|
||||
@@ -171,7 +169,7 @@ pub struct QueryParser {
|
||||
conjunction_by_default: bool,
|
||||
tokenizer_manager: TokenizerManager,
|
||||
boost: HashMap<Field, Score>,
|
||||
field_names: HashMap<String, Field>,
|
||||
field_names: BTreeSet<String>,
|
||||
}
|
||||
|
||||
fn all_negative(ast: &LogicalAst) -> bool {
|
||||
@@ -184,31 +182,6 @@ fn all_negative(ast: &LogicalAst) -> bool {
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the position (in byte offsets) of the unescaped '.' in the `field_path`.
//
// This function operates directly on bytes (as opposed to codepoint), relying
// on a encoding property of utf-8 for its correctness.
fn locate_splitting_dots(field_path: &str) -> Vec<usize> {
    let mut splitting_dots_pos = Vec::new();
    let mut escape_state = false;
    for (pos, b) in field_path.bytes().enumerate() {
        if escape_state {
            escape_state = false;
            continue;
        }
        match b {
            b'\\' => {
                escape_state = true;
            }
            b'.' => {
                splitting_dots_pos.push(pos);
            }
            _ => {}
        }
    }
    splitting_dots_pos
}

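The escaping rule is easiest to read off the values asserted in test_locate_splitting_dots further down: an unescaped dot splits, a dot preceded by a backslash does not.

assert_eq!(locate_splitting_dots("a.b.c"), vec![1, 3]);   // both dots split
assert_eq!(locate_splitting_dots(r#"a\.b.c"#), vec![4]);  // the escaped dot is skipped
assert_eq!(locate_splitting_dots(r#"a\..b.c"#), vec![3, 5]);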
impl QueryParser {
|
||||
/// Creates a `QueryParser`, given
|
||||
/// * schema - index Schema
|
||||
@@ -220,7 +193,7 @@ impl QueryParser {
|
||||
) -> QueryParser {
|
||||
let field_names = schema
|
||||
.fields()
|
||||
.map(|(field, field_entry)| (field_entry.name().to_string(), field))
|
||||
.map(|(_, field_entry)| field_entry.name().to_string())
|
||||
.collect();
|
||||
QueryParser {
|
||||
schema,
|
||||
@@ -234,18 +207,25 @@ impl QueryParser {

    // Splits a full_path as written in a query, into a field name and a
    // json path.
    pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> Option<(Field, &'a str)> {
        if let Some(field) = self.field_names.get(full_path) {
            return Some((*field, ""));
    pub(crate) fn split_full_path<'a>(&self, full_path: &'a str) -> (&'a str, &'a str) {
        if full_path.is_empty() {
            return ("", "");
        }
        let mut splitting_period_pos: Vec<usize> = locate_splitting_dots(full_path);
        while let Some(pos) = splitting_period_pos.pop() {
            let (prefix, suffix) = full_path.split_at(pos);
            if let Some(field) = self.field_names.get(prefix) {
                return Some((*field, &suffix[1..]));
        if self.field_names.contains(full_path) {
            return (full_path, "");
        }
        let mut result = ("", full_path);
        let mut cursor = 0;
        while let Some(pos) = full_path[cursor..].find('.') {
            cursor += pos;
            let prefix = &full_path[..cursor];
            let suffix = &full_path[cursor + 1..];
            if self.field_names.contains(prefix) {
                result = (prefix, suffix);
            }
            cursor += 1;
        }
        None
        result
    }

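Both variants implement the longest-field-name-wins rule described in the doc comment further down. For the tuple-returning version above, the expected values mirror test_split_full_path at the bottom of this file (schema declaring `second`, `first`, `first.toto`, `first.toto.titi`, `third.a.b.c`):

assert_eq!(query_parser.split_full_path("first.toto"), ("first.toto", ""));
assert_eq!(query_parser.split_full_path("first.titi"), ("first", "titi"));
// No declared field matches a prefix, so the whole string stays in the json-path part.
assert_eq!(query_parser.split_full_path("hello.toto"), ("", "hello.toto"));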
/// Creates a `QueryParser`, given
|
||||
@@ -293,11 +273,17 @@ impl QueryParser {
|
||||
|
||||
/// Parse the user query into an AST.
|
||||
fn parse_query_to_logical_ast(&self, query: &str) -> Result<LogicalAst, QueryParserError> {
|
||||
let user_input_ast = tantivy_query_grammar::parse_query(query)
|
||||
.map_err(|_| QueryParserError::SyntaxError(query.to_string()))?;
|
||||
let user_input_ast =
|
||||
tantivy_query_grammar::parse_query(query).map_err(|_| QueryParserError::SyntaxError)?;
|
||||
self.compute_logical_ast(user_input_ast)
|
||||
}
|
||||
|
||||
fn resolve_field_name(&self, field_name: &str) -> Result<Field, QueryParserError> {
|
||||
self.schema
|
||||
.get_field(field_name)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(String::from(field_name)))
|
||||
}
|
||||
|
||||
fn compute_logical_ast(
|
||||
&self,
|
||||
user_input_ast: UserInputAst,
|
||||
@@ -348,7 +334,7 @@ impl QueryParser {
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
Ok(Term::from_field_date(field, DateTime::from_utc(dt)))
|
||||
Ok(Term::from_field_date(field, DateTime::new_utc(dt)))
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
let option = str_options.get_indexing_options().ok_or_else(|| {
|
||||
@@ -404,12 +390,6 @@ impl QueryParser {
|
||||
if !field_type.is_indexed() {
|
||||
return Err(QueryParserError::FieldNotIndexed(field_name.to_string()));
|
||||
}
|
||||
if field_type.value_type() != Type::Json && !json_path.is_empty() {
|
||||
let field_name = self.schema.get_field_name(field);
|
||||
return Err(QueryParserError::FieldDoesNotExist(format!(
|
||||
"{field_name}.{json_path}"
|
||||
)));
|
||||
}
|
||||
match *field_type {
|
||||
FieldType::U64(_) => {
|
||||
let val: u64 = u64::from_str(phrase)?;
|
||||
@@ -428,7 +408,7 @@ impl QueryParser {
|
||||
}
|
||||
FieldType::Date(_) => {
|
||||
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
|
||||
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
|
||||
let dt_term = Term::from_field_date(field, DateTime::new_utc(dt));
|
||||
Ok(vec![LogicalLiteral::Term(dt_term)])
|
||||
}
|
||||
FieldType::Str(ref str_options) => {
|
||||
@@ -551,56 +531,37 @@ impl QueryParser {
|
||||
})
|
||||
}
|
||||
|
||||
/// Given a literal, returns the list of terms that should be searched.
|
||||
///
|
||||
/// The terms are identified by a triplet:
|
||||
/// - tantivy field
|
||||
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
|
||||
/// object by naturally extending the json field name with a "." separated field_path
|
||||
/// - field_phrase: the phrase that is being searched.
|
||||
///
|
||||
/// The literal identifies the targetted field by a so-called *full field path*,
|
||||
/// specified before the ":". (e.g. identity.username:fulmicoton).
|
||||
///
|
||||
/// The way we split the full field path into (field_name, field_path) can be ambiguous,
|
||||
/// because field_names can contain "." themselves.
|
||||
// For instance if a field is named `one.two` and another one is named `one`,
|
||||
    /// should `one.two:three` target `one.two` with field path `` or `one` with
    /// the field path `two`.
    ///
    /// In this case, tantivy just picks the solution with the longest field name.
|
||||
///
|
||||
/// Quirk: As a hack for quickwit, we do not split over a dot that appear escaped '\.'.
|
||||
fn compute_path_triplets_for_literal<'a>(
|
||||
fn compute_path_triplet_for_literal<'a>(
|
||||
&self,
|
||||
literal: &'a UserInputLiteral,
|
||||
) -> Result<Vec<(Field, &'a str, &'a str)>, QueryParserError> {
|
||||
let full_path = if let Some(full_path) = &literal.field_name {
|
||||
full_path
|
||||
} else {
|
||||
// The user did not specify any path...
|
||||
// We simply target default fields.
|
||||
if self.default_fields.is_empty() {
|
||||
return Err(QueryParserError::NoDefaultFieldDeclared);
|
||||
match &literal.field_name {
|
||||
Some(ref full_path) => {
|
||||
// We need to add terms associated to json default fields.
|
||||
let (field_name, path) = self.split_full_path(full_path);
|
||||
if let Ok(field) = self.resolve_field_name(field_name) {
|
||||
return Ok(vec![(field, path, literal.phrase.as_str())]);
|
||||
}
|
||||
let triplets: Vec<(Field, &str, &str)> = self
|
||||
.default_indexed_json_fields()
|
||||
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
|
||||
.collect();
|
||||
if triplets.is_empty() {
|
||||
return Err(QueryParserError::FieldDoesNotExist(field_name.to_string()));
|
||||
}
|
||||
Ok(triplets)
|
||||
}
|
||||
None => {
|
||||
if self.default_fields.is_empty() {
|
||||
return Err(QueryParserError::NoDefaultFieldDeclared);
|
||||
}
|
||||
Ok(self
|
||||
.default_fields
|
||||
.iter()
|
||||
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
|
||||
.collect::<Vec<(Field, &str, &str)>>())
|
||||
}
|
||||
return Ok(self
|
||||
.default_fields
|
||||
.iter()
|
||||
.map(|default_field| (*default_field, "", literal.phrase.as_str()))
|
||||
.collect::<Vec<(Field, &str, &str)>>());
|
||||
};
|
||||
if let Some((field, path)) = self.split_full_path(full_path) {
|
||||
return Ok(vec![(field, path, literal.phrase.as_str())]);
|
||||
}
|
||||
// We need to add terms associated to json default fields.
|
||||
let triplets: Vec<(Field, &str, &str)> = self
|
||||
.default_indexed_json_fields()
|
||||
.map(|json_field| (json_field, full_path.as_str(), literal.phrase.as_str()))
|
||||
.collect();
|
||||
if triplets.is_empty() {
|
||||
return Err(QueryParserError::FieldDoesNotExist(full_path.to_string()));
|
||||
}
|
||||
Ok(triplets)
|
||||
}
|
||||
|
||||
fn compute_logical_ast_from_leaf(
|
||||
@@ -610,7 +571,7 @@ impl QueryParser {
|
||||
match leaf {
|
||||
UserInputLeaf::Literal(literal) => {
|
||||
let term_phrases: Vec<(Field, &str, &str)> =
|
||||
self.compute_path_triplets_for_literal(&literal)?;
|
||||
self.compute_path_triplet_for_literal(&literal)?;
|
||||
let mut asts: Vec<LogicalAst> = Vec::new();
|
||||
for (field, json_path, phrase) in term_phrases {
|
||||
for ast in self.compute_logical_ast_for_leaf(field, json_path, phrase)? {
|
||||
@@ -637,9 +598,8 @@ impl QueryParser {
|
||||
"Range query need to target a specific field.".to_string(),
|
||||
)
|
||||
})?;
|
||||
let (field, json_path) = self
|
||||
.split_full_path(&full_path)
|
||||
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone()))?;
|
||||
let (field_name, json_path) = self.split_full_path(&full_path);
|
||||
let field = self.resolve_field_name(field_name)?;
|
||||
let field_entry = self.schema.get_field_entry(field);
|
||||
let value_type = field_entry.field_type().value_type();
|
||||
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
|
||||
@@ -700,6 +660,30 @@ fn generate_literals_for_str(
|
||||
Ok(Some(LogicalLiteral::Phrase(terms)))
|
||||
}
|
||||
|
||||
enum NumValue {
|
||||
U64(u64),
|
||||
I64(i64),
|
||||
F64(f64),
|
||||
DateTime(OffsetDateTime),
|
||||
}
|
||||
|
||||
fn infer_type_num(phrase: &str) -> Option<NumValue> {
|
||||
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
||||
return Some(NumValue::DateTime(dt_utc));
|
||||
}
|
||||
if let Ok(u64_val) = str::parse::<u64>(phrase) {
|
||||
return Some(NumValue::U64(u64_val));
|
||||
}
|
||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||
return Some(NumValue::I64(i64_val));
|
||||
}
|
||||
if let Ok(f64_val) = str::parse::<f64>(phrase) {
|
||||
return Some(NumValue::F64(f64_val));
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
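The parse order above is date first, then unsigned, signed and float; the first parse that succeeds wins. A hand-checked illustration (the RFC 3339 literal is the one already used in the tests of this change set):

assert!(matches!(infer_type_num("1985-04-12T23:20:50.52Z"), Some(NumValue::DateTime(_))));
assert!(matches!(infer_type_num("75"), Some(NumValue::U64(75))));
assert!(matches!(infer_type_num("-2"), Some(NumValue::I64(-2))));
assert!(matches!(infer_type_num("red"), None));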
fn generate_literals_for_json_object(
|
||||
field_name: &str,
|
||||
field: Field,
|
||||
@@ -710,13 +694,38 @@ fn generate_literals_for_json_object(
|
||||
) -> Result<Vec<LogicalLiteral>, QueryParserError> {
|
||||
let mut logical_literals = Vec::new();
|
||||
let mut term = Term::new();
|
||||
let mut json_term_writer =
|
||||
JsonTermWriter::from_field_and_json_path(field, json_path, &mut term);
|
||||
if let Some(term) = convert_to_fast_value_and_get_term(&mut json_term_writer, phrase) {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
term.set_field(Type::Json, field);
|
||||
let mut json_term_writer = JsonTermWriter::wrap(&mut term);
|
||||
for segment in json_path.split('.') {
|
||||
json_term_writer.push_path_segment(segment);
|
||||
}
|
||||
let terms = set_string_and_get_terms(&mut json_term_writer, phrase, text_analyzer);
|
||||
if let Some(num_value) = infer_type_num(phrase) {
|
||||
match num_value {
|
||||
NumValue::U64(u64_val) => {
|
||||
json_term_writer.set_fast_value(u64_val);
|
||||
}
|
||||
NumValue::I64(i64_val) => {
|
||||
json_term_writer.set_fast_value(i64_val);
|
||||
}
|
||||
NumValue::F64(f64_val) => {
|
||||
json_term_writer.set_fast_value(f64_val);
|
||||
}
|
||||
NumValue::DateTime(dt_val) => {
|
||||
json_term_writer.set_fast_value(DateTime::new_utc(dt_val));
|
||||
}
|
||||
}
|
||||
logical_literals.push(LogicalLiteral::Term(json_term_writer.term().clone()));
|
||||
}
|
||||
json_term_writer.close_path_and_set_type(Type::Str);
|
||||
drop(json_term_writer);
|
||||
let term_num_bytes = term.as_slice().len();
|
||||
let mut token_stream = text_analyzer.token_stream(phrase);
|
||||
let mut terms: Vec<(usize, Term)> = Vec::new();
|
||||
token_stream.process(&mut |token| {
|
||||
term.truncate(term_num_bytes);
|
||||
term.append_bytes(token.text.as_bytes());
|
||||
terms.push((token.position, term.clone()));
|
||||
});
|
||||
if terms.len() <= 1 {
|
||||
for (_, term) in terms {
|
||||
logical_literals.push(LogicalLiteral::Term(term));
|
||||
@@ -1211,11 +1220,9 @@ mod test {
|
||||
#[test]
|
||||
pub fn test_query_parser_field_does_not_exist() {
|
||||
let query_parser = make_query_parser();
|
||||
assert_eq!(
|
||||
query_parser
|
||||
.parse_query("boujou:\"18446744073709551615\"")
|
||||
.unwrap_err(),
|
||||
QueryParserError::FieldDoesNotExist("boujou".to_string())
|
||||
assert_matches!(
|
||||
query_parser.parse_query("boujou:\"18446744073709551615\""),
|
||||
Err(QueryParserError::FieldDoesNotExist(_))
|
||||
);
|
||||
}
|
||||
|
||||
@@ -1390,56 +1397,29 @@ mod test {
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_escaped_field() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field(r#"a\.b"#, STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
let query = query_parser.parse_query(r#"a\.b:hello"#).unwrap();
|
||||
assert_eq!(
|
||||
format!("{:?}", query),
|
||||
"TermQuery(Term(type=Str, field=0, \"hello\"))"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_full_path() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
schema_builder.add_text_field("second", STRING);
|
||||
schema_builder.add_text_field("first", STRING);
|
||||
schema_builder.add_text_field("first.toto", STRING);
|
||||
schema_builder.add_text_field("first.toto.titi", STRING);
|
||||
schema_builder.add_text_field("third.a.b.c", STRING);
|
||||
let schema = schema_builder.build();
|
||||
let query_parser =
|
||||
QueryParser::new(schema.clone(), Vec::new(), TokenizerManager::default());
|
||||
let query_parser = QueryParser::new(schema, Vec::new(), TokenizerManager::default());
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto"),
|
||||
Some((schema.get_field("first.toto").unwrap(), ""))
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto.bubu"),
|
||||
Some((schema.get_field("first.toto").unwrap(), "bubu"))
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.toto.titi"),
|
||||
Some((schema.get_field("first.toto.titi").unwrap(), ""))
|
||||
("first.toto", "")
|
||||
);
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("first.titi"),
|
||||
Some((schema.get_field("first").unwrap(), "titi"))
|
||||
("first", "titi")
|
||||
);
|
||||
assert_eq!(query_parser.split_full_path("third"), None);
|
||||
assert_eq!(query_parser.split_full_path("hello.toto"), None);
|
||||
assert_eq!(query_parser.split_full_path(""), None);
|
||||
assert_eq!(query_parser.split_full_path("firsty"), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_locate_splitting_dots() {
|
||||
assert_eq!(&super::locate_splitting_dots("a.b.c"), &[1, 3]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\.b.c"#), &[4]);
|
||||
assert_eq!(&super::locate_splitting_dots(r#"a\..b.c"#), &[3, 5]);
|
||||
assert_eq!(query_parser.split_full_path("third"), ("", "third"));
|
||||
assert_eq!(
|
||||
query_parser.split_full_path("hello.toto"),
|
||||
("", "hello.toto")
|
||||
);
|
||||
assert_eq!(query_parser.split_full_path(""), ("", ""));
|
||||
assert_eq!(query_parser.split_full_path("firsty"), ("", "firsty"));
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,7 +2,7 @@ use std::ops::{Deref, DerefMut};
|
||||
use std::sync::atomic::{AtomicUsize, Ordering};
|
||||
use std::sync::Arc;
|
||||
|
||||
use crossbeam_channel::{unbounded, Receiver, RecvError, Sender};
|
||||
use crossbeam::channel::{unbounded, Receiver, RecvError, Sender};
|
||||
|
||||
pub struct GenerationItem<T> {
|
||||
generation: usize,
|
||||
@@ -197,7 +197,7 @@ mod tests {
|
||||
|
||||
use std::{iter, mem};
|
||||
|
||||
use crossbeam_channel as channel;
|
||||
use crossbeam::channel;
|
||||
|
||||
use super::{Pool, Queue};
|
||||
|
||||
|
||||
@@ -147,7 +147,7 @@ impl WarmingStateInner {
|
||||
/// Every [GC_INTERVAL] attempt to GC, with panics caught and logged using
|
||||
/// [std::panic::catch_unwind].
|
||||
fn gc_loop(inner: Weak<Mutex<WarmingStateInner>>) {
|
||||
for _ in crossbeam_channel::tick(GC_INTERVAL) {
|
||||
for _ in crossbeam::channel::tick(GC_INTERVAL) {
|
||||
if let Some(inner) = inner.upgrade() {
|
||||
// rely on deterministic gc in tests
|
||||
#[cfg(not(test))]
|
||||
|
||||
@@ -213,8 +213,6 @@ impl BinarySerializable for Document {
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
|
||||
use common::BinarySerializable;
|
||||
|
||||
use crate::schema::*;
|
||||
|
||||
#[test]
|
||||
@@ -225,22 +223,4 @@ mod tests {
|
||||
doc.add_text(text_field, "My title");
|
||||
assert_eq!(doc.field_values().len(), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_doc_serialization_issue() {
|
||||
let mut doc = Document::default();
|
||||
doc.add_json_object(
|
||||
Field::from_field_id(0),
|
||||
serde_json::json!({"key": 2u64})
|
||||
.as_object()
|
||||
.unwrap()
|
||||
.clone(),
|
||||
);
|
||||
doc.add_text(Field::from_field_id(1), "hello");
|
||||
assert_eq!(doc.field_values().len(), 2);
|
||||
let mut payload: Vec<u8> = Vec::new();
|
||||
doc.serialize(&mut payload).unwrap();
|
||||
assert_eq!(payload.len(), 26);
|
||||
Document::deserialize(&mut &payload[..]).unwrap();
|
||||
}
|
||||
}
|
||||
|
||||
@@ -93,7 +93,13 @@ impl FieldEntry {
|
||||
|
||||
/// Returns true if the field is a int (signed or unsigned) fast field
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.field_type.is_fast()
|
||||
match self.field_type {
|
||||
FieldType::U64(ref options)
|
||||
| FieldType::I64(ref options)
|
||||
| FieldType::Date(ref options)
|
||||
| FieldType::F64(ref options) => options.is_fast(),
|
||||
_ => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns true if the field is stored
|
||||
@@ -138,8 +144,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
"stored": false
|
||||
}
|
||||
}"#;
|
||||
let field_value_json = serde_json::to_string_pretty(&field_value).unwrap();
|
||||
|
||||
@@ -185,20 +185,6 @@ impl FieldType {
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is fast.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
match *self {
|
||||
FieldType::Bytes(ref bytes_options) => bytes_options.is_fast(),
|
||||
FieldType::Str(ref text_options) => text_options.is_fast(),
|
||||
FieldType::U64(ref int_options)
|
||||
| FieldType::I64(ref int_options)
|
||||
| FieldType::F64(ref int_options)
|
||||
| FieldType::Date(ref int_options) => int_options.get_fastfield_cardinality().is_some(),
|
||||
FieldType::Facet(_) => true,
|
||||
FieldType::JsonObject(_) => false,
|
||||
}
|
||||
}
|
||||
|
||||
/// returns true if the field is normed (see [fieldnorms](crate::fieldnorm)).
|
||||
pub fn has_fieldnorms(&self) -> bool {
|
||||
match *self {
|
||||
@@ -268,7 +254,7 @@ impl FieldType {
|
||||
expected: "rfc3339 format",
|
||||
json: JsonValue::String(field_text),
|
||||
})?;
|
||||
Ok(DateTime::from_utc(dt_with_fixed_tz).into())
|
||||
Ok(DateTime::new_utc(dt_with_fixed_tz).into())
|
||||
}
|
||||
FieldType::Str(_) => Ok(Value::Str(field_text)),
|
||||
FieldType::U64(_) | FieldType::I64(_) | FieldType::F64(_) => {
|
||||
@@ -388,7 +374,7 @@ mod tests {
|
||||
let naive_date = Date::from_calendar_date(1982, Month::September, 17).unwrap();
|
||||
let naive_time = Time::from_hms(13, 20, 0).unwrap();
|
||||
let date_time = PrimitiveDateTime::new(naive_date, naive_time);
|
||||
doc.add_date(date_field, DateTime::from_primitive(date_time));
|
||||
doc.add_date(date_field, DateTime::new_primitive(date_time));
|
||||
let doc_json = schema.to_json(&doc);
|
||||
assert_eq!(doc_json, r#"{"date":["1982-09-17T13:20:00Z"]}"#);
|
||||
}
|
||||
|
||||
@@ -417,7 +417,6 @@ mod tests {
|
||||
use std::collections::BTreeMap;
|
||||
|
||||
use matches::{assert_matches, matches};
|
||||
use pretty_assertions::assert_eq;
|
||||
use serde_json;
|
||||
|
||||
use crate::schema::field_type::ValueParsingError;
|
||||
@@ -470,8 +469,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
"stored": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -483,8 +481,7 @@ mod tests {
|
||||
"fieldnorms": false,
|
||||
"tokenizer": "raw"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
"stored": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -787,8 +784,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
"stored": false
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -820,8 +816,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "raw"
|
||||
},
|
||||
"stored": true,
|
||||
"fast": false
|
||||
"stored": true
|
||||
}
|
||||
},
|
||||
{
|
||||
@@ -843,8 +838,7 @@ mod tests {
|
||||
"fieldnorms": true,
|
||||
"tokenizer": "default"
|
||||
},
|
||||
"stored": false,
|
||||
"fast": false
|
||||
"stored": false
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::ops::BitOr;
|
||||
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::flags::FastFlag;
|
||||
use crate::schema::flags::{SchemaFlagList, StoredFlag};
|
||||
use crate::schema::IndexRecordOption;
|
||||
|
||||
@@ -15,8 +14,6 @@ pub struct TextOptions {
|
||||
indexing: Option<TextFieldIndexing>,
|
||||
#[serde(default)]
|
||||
stored: bool,
|
||||
#[serde(default)]
|
||||
fast: bool,
|
||||
}
|
||||
|
||||
impl TextOptions {
|
||||
@@ -30,30 +27,6 @@ impl TextOptions {
|
||||
self.stored
|
||||
}
|
||||
|
||||
/// Returns true iff the value is a fast field.
|
||||
pub fn is_fast(&self) -> bool {
|
||||
self.fast
|
||||
}
|
||||
|
||||
/// Set the field as a fast field.
|
||||
///
|
||||
/// Fast fields are designed for random access.
|
||||
/// Access time are similar to a random lookup in an array.
|
||||
/// Text fast fields will have the term ids stored in the fast field.
|
||||
/// The fast field will be a multivalued fast field.
|
||||
///
|
||||
/// The effective cardinality depends on the tokenizer. When creating fast fields on text
|
||||
/// fields it is recommended to use the "raw" tokenizer, since it will store the original text
|
||||
/// unchanged. The "default" tokenizer will store the terms as lower case and this will be
|
||||
/// reflected in the dictionary.
|
||||
///
|
||||
/// The original text can be retrieved via `ord_to_term` from the dictionary.
|
||||
#[must_use]
|
||||
pub fn set_fast(mut self) -> TextOptions {
|
||||
self.fast = true;
|
||||
self
|
||||
}
|
||||
|
||||
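A minimal sketch of declaring such a field, using only the setters shown in this file; `schema_builder` is assumed, and per the doc comment above the "raw" tokenizer is the usual companion so the original strings stay recoverable via `ord_to_term`:

let opts = TextOptions::default().set_stored().set_fast();
let _category_field = schema_builder.add_text_field("category", opts);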
/// Sets the field as stored
|
||||
#[must_use]
|
||||
pub fn set_stored(mut self) -> TextOptions {
|
||||
@@ -72,13 +45,9 @@ impl TextOptions {
|
||||
#[derive(Clone, PartialEq, Debug, Eq, Serialize, Deserialize)]
|
||||
struct TokenizerName(Cow<'static, str>);
|
||||
|
||||
const DEFAULT_TOKENIZER_NAME: &str = "default";
|
||||
|
||||
const NO_TOKENIZER_NAME: &str = "raw";
|
||||
|
||||
impl Default for TokenizerName {
|
||||
fn default() -> Self {
|
||||
TokenizerName::from_static(DEFAULT_TOKENIZER_NAME)
|
||||
TokenizerName::from_static("default")
|
||||
}
|
||||
}
|
||||
|
||||
@@ -172,23 +141,21 @@ impl TextFieldIndexing {
|
||||
/// The field will be untokenized and indexed.
|
||||
pub const STRING: TextOptions = TextOptions {
|
||||
indexing: Some(TextFieldIndexing {
|
||||
tokenizer: TokenizerName::from_static(NO_TOKENIZER_NAME),
|
||||
tokenizer: TokenizerName::from_static("raw"),
|
||||
fieldnorms: true,
|
||||
record: IndexRecordOption::Basic,
|
||||
}),
|
||||
stored: false,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
/// The field will be tokenized and indexed.
|
||||
pub const TEXT: TextOptions = TextOptions {
|
||||
indexing: Some(TextFieldIndexing {
|
||||
tokenizer: TokenizerName::from_static(DEFAULT_TOKENIZER_NAME),
|
||||
tokenizer: TokenizerName::from_static("default"),
|
||||
fieldnorms: true,
|
||||
record: IndexRecordOption::WithFreqsAndPositions,
|
||||
}),
|
||||
stored: false,
|
||||
fast: false,
|
||||
};
|
||||
|
||||
impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
@@ -199,7 +166,6 @@ impl<T: Into<TextOptions>> BitOr<T> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: self.indexing.or(other.indexing),
|
||||
stored: self.stored | other.stored,
|
||||
fast: self.fast | other.fast,
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -215,17 +181,6 @@ impl From<StoredFlag> for TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: true,
|
||||
fast: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl From<FastFlag> for TextOptions {
|
||||
fn from(_: FastFlag) -> TextOptions {
|
||||
TextOptions {
|
||||
indexing: None,
|
||||
stored: false,
|
||||
fast: true,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -43,7 +43,7 @@ impl Serialize for Value {
|
||||
Value::U64(u) => serializer.serialize_u64(u),
|
||||
Value::I64(u) => serializer.serialize_i64(u),
|
||||
Value::F64(u) => serializer.serialize_f64(u),
|
||||
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.into_utc(), serializer),
|
||||
Value::Date(ref date) => time::serde::rfc3339::serialize(&date.to_utc(), serializer),
|
||||
Value::Facet(ref facet) => facet.serialize(serializer),
|
||||
Value::Bytes(ref bytes) => serializer.serialize_bytes(bytes),
|
||||
Value::JsonObject(ref obj) => obj.serialize(serializer),
|
||||
@@ -388,16 +388,8 @@ mod binary_serialize {
|
||||
}
|
||||
}
|
||||
JSON_OBJ_CODE => {
|
||||
// As explained in
|
||||
// https://docs.serde.rs/serde_json/fn.from_reader.html
|
||||
//
|
||||
// `T::from_reader(..)` expects EOF after reading the object,
|
||||
// which is not what we want here.
|
||||
//
|
||||
// For this reason we need to create our own `Deserializer`.
|
||||
let mut de = serde_json::Deserializer::from_reader(reader);
|
||||
let json_map = <serde_json::Map::<String, serde_json::Value> as serde::Deserialize>::deserialize(&mut de)?;
|
||||
Ok(Value::JsonObject(json_map))
|
||||
let map = serde_json::from_reader(reader)?;
|
||||
Ok(Value::JsonObject(map))
|
||||
}
|
||||
_ => Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
@@ -417,12 +409,12 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_serialize_date() {
|
||||
let value = Value::from(DateTime::from_utc(
|
||||
let value = Value::from(DateTime::new_utc(
|
||||
OffsetDateTime::parse("1996-12-20T00:39:57+00:00", &Rfc3339).unwrap(),
|
||||
));
|
||||
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
|
||||
assert_eq!(serialized_value_json, r#""1996-12-20T00:39:57Z""#);
|
||||
let value = Value::from(DateTime::from_utc(
|
||||
let value = Value::from(DateTime::new_utc(
|
||||
OffsetDateTime::parse("1996-12-20T00:39:57-01:00", &Rfc3339).unwrap(),
|
||||
));
|
||||
let serialized_value_json = serde_json::to_string_pretty(&value).unwrap();
|
||||
|
||||
@@ -1,50 +0,0 @@
|
||||
use std::io;
|
||||
|
||||
use zstd::bulk::{compress_to_buffer, decompress_to_buffer};
|
||||
use zstd::DEFAULT_COMPRESSION_LEVEL;
|
||||
|
||||
#[inline]
|
||||
pub fn compress(uncompressed: &[u8], compressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
let count_size = std::mem::size_of::<u32>();
|
||||
let max_size = zstd::zstd_safe::compress_bound(uncompressed.len()) + count_size;
|
||||
|
||||
compressed.clear();
|
||||
compressed.resize(max_size, 0);
|
||||
|
||||
let compressed_size = compress_to_buffer(
|
||||
uncompressed,
|
||||
&mut compressed[count_size..],
|
||||
DEFAULT_COMPRESSION_LEVEL,
|
||||
)?;
|
||||
|
||||
compressed[0..count_size].copy_from_slice(&(uncompressed.len() as u32).to_le_bytes());
|
||||
compressed.resize(compressed_size + count_size, 0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[inline]
|
||||
pub fn decompress(compressed: &[u8], decompressed: &mut Vec<u8>) -> io::Result<()> {
|
||||
let count_size = std::mem::size_of::<u32>();
|
||||
let uncompressed_size = u32::from_le_bytes(
|
||||
compressed
|
||||
.get(..count_size)
|
||||
.ok_or(io::ErrorKind::InvalidData)?
|
||||
.try_into()
|
||||
.unwrap(),
|
||||
) as usize;
|
||||
|
||||
decompressed.clear();
|
||||
decompressed.resize(uncompressed_size, 0);
|
||||
|
||||
let decompressed_size = decompress_to_buffer(&compressed[count_size..], decompressed)?;
|
||||
|
||||
if decompressed_size != uncompressed_size {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::InvalidData,
|
||||
"doc store block not completely decompressed, data corruption".to_string(),
|
||||
));
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
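The removed codec frames every block with a 4-byte little-endian prefix holding the uncompressed length, which `decompress` later uses to size its output buffer. A round-trip sketch over the two functions above (buffers only, no feature-flag plumbing):

let payload = b"tantivy doc store block";
let mut compressed = Vec::new();
compress(payload, &mut compressed).unwrap();
// First four bytes: the uncompressed length, little-endian.
assert_eq!(&compressed[..4], &(payload.len() as u32).to_le_bytes());
let mut decompressed = Vec::new();
decompress(&compressed, &mut decompressed).unwrap();
assert_eq!(&decompressed[..], &payload[..]);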
@@ -26,9 +26,6 @@ pub enum Compressor {
|
||||
#[serde(rename = "snappy")]
|
||||
/// Use the snap compressor
|
||||
Snappy,
|
||||
#[serde(rename = "zstd")]
|
||||
/// Use the zstd compressor
|
||||
Zstd,
|
||||
}
|
||||
|
||||
impl Default for Compressor {
|
||||
@@ -39,8 +36,6 @@ impl Default for Compressor {
|
||||
Compressor::Brotli
|
||||
} else if cfg!(feature = "snappy-compression") {
|
||||
Compressor::Snappy
|
||||
} else if cfg!(feature = "zstd-compression") {
|
||||
Compressor::Zstd
|
||||
} else {
|
||||
Compressor::None
|
||||
}
|
||||
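Callers can also bypass the feature-flag default and pick a compressor explicitly; a sketch assuming the `IndexSettings`/`docstore_compression` names from the indexer hunks earlier and the usual `Index::builder()` entry point (the `schema` value is assumed):

let settings = IndexSettings {
    docstore_compression: Compressor::Lz4,
    ..Default::default()
};
let index = Index::builder()
    .schema(schema)
    .settings(settings)
    .create_in_ram()?;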
@@ -54,7 +49,6 @@ impl Compressor {
|
||||
1 => Compressor::Lz4,
|
||||
2 => Compressor::Brotli,
|
||||
3 => Compressor::Snappy,
|
||||
4 => Compressor::Zstd,
|
||||
_ => panic!("unknown compressor id {:?}", id),
|
||||
}
|
||||
}
|
||||
@@ -64,7 +58,6 @@ impl Compressor {
|
||||
Self::Lz4 => 1,
|
||||
Self::Brotli => 2,
|
||||
Self::Snappy => 3,
|
||||
Self::Zstd => 4,
|
||||
}
|
||||
}
|
||||
#[inline]
|
||||
@@ -105,16 +98,6 @@ impl Compressor {
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::compress(uncompressed, compressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -160,16 +143,6 @@ impl Compressor {
|
||||
panic!("snappy-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
Self::Zstd => {
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
{
|
||||
super::compression_zstd_block::decompress(compressed, decompressed)
|
||||
}
|
||||
#[cfg(not(feature = "zstd-compression"))]
|
||||
{
|
||||
panic!("zstd-compression feature flag not activated");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -50,9 +50,6 @@ mod compression_brotli;
|
||||
#[cfg(feature = "snappy-compression")]
|
||||
mod compression_snap;
|
||||
|
||||
#[cfg(feature = "zstd-compression")]
|
||||
mod compression_zstd_block;
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
|
||||
@@ -72,13 +69,10 @@ pub mod tests {
|
||||
sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt \
|
||||
mollit anim id est laborum.";
|
||||
|
||||
const BLOCK_SIZE: usize = 16_384;
|
||||
|
||||
pub fn write_lorem_ipsum_store(
|
||||
writer: WritePtr,
|
||||
num_docs: usize,
|
||||
compressor: Compressor,
|
||||
blocksize: usize,
|
||||
) -> Schema {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let field_body = schema_builder.add_text_field("body", TextOptions::default().set_stored());
|
||||
@@ -86,7 +80,7 @@ pub mod tests {
schema_builder.add_text_field("title", TextOptions::default().set_stored());
let schema = schema_builder.build();
{
let mut store_writer = StoreWriter::new(writer, compressor, blocksize);
let mut store_writer = StoreWriter::new(writer, compressor);
for i in 0..num_docs {
let mut doc = Document::default();
doc.add_field_value(field_body, LOREM.to_string());
@@ -109,7 +103,7 @@ pub mod tests {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4, BLOCK_SIZE);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, Compressor::Lz4);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
@@ -145,11 +139,11 @@ pub mod tests {
Ok(())
}

fn test_store(compressor: Compressor, blocksize: usize) -> crate::Result<()> {
fn test_store(compressor: Compressor) -> crate::Result<()> {
let path = Path::new("store");
let directory = RamDirectory::create();
let store_wrt = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor, blocksize);
let schema = write_lorem_ipsum_store(store_wrt, NUM_DOCS, compressor);
let field_title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;
@@ -175,28 +169,22 @@ pub mod tests {

#[test]
fn test_store_noop() -> crate::Result<()> {
test_store(Compressor::None, BLOCK_SIZE)
test_store(Compressor::None)
}
#[cfg(feature = "lz4-compression")]
#[test]
fn test_store_lz4_block() -> crate::Result<()> {
test_store(Compressor::Lz4, BLOCK_SIZE)
test_store(Compressor::Lz4)
}
#[cfg(feature = "snappy-compression")]
#[test]
fn test_store_snap() -> crate::Result<()> {
test_store(Compressor::Snappy, BLOCK_SIZE)
test_store(Compressor::Snappy)
}
#[cfg(feature = "brotli-compression")]
#[test]
fn test_store_brotli() -> crate::Result<()> {
test_store(Compressor::Brotli, BLOCK_SIZE)
}

#[cfg(feature = "zstd-compression")]
#[test]
fn test_store_zstd() -> crate::Result<()> {
test_store(Compressor::Zstd, BLOCK_SIZE)
test_store(Compressor::Brotli)
}

#[test]
@@ -360,7 +348,6 @@ mod bench {
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
16_384,
);
directory.delete(path).unwrap();
});
@@ -374,7 +361,6 @@ mod bench {
directory.open_write(path).unwrap(),
1_000,
Compressor::default(),
16_384,
);
let store_file = directory.open_read(path).unwrap();
let store = StoreReader::open(store_file).unwrap();

@@ -304,8 +304,6 @@ mod tests {
use crate::store::tests::write_lorem_ipsum_store;
use crate::Directory;

const BLOCK_SIZE: usize = 16_384;

fn get_text_field<'a>(doc: &'a Document, field: &'a Field) -> Option<&'a str> {
doc.get_first(*field).and_then(|f| f.as_text())
}
@@ -315,7 +313,7 @@ mod tests {
let directory = RamDirectory::create();
let path = Path::new("store");
let writer = directory.open_write(path)?;
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default(), BLOCK_SIZE);
let schema = write_lorem_ipsum_store(writer, 500, Compressor::default());
let title = schema.get_field("title").unwrap();
let store_file = directory.open_read(path)?;
let store = StoreReader::open(store_file)?;

@@ -11,6 +11,8 @@ use crate::schema::Document;
use crate::store::index::Checkpoint;
use crate::DocId;

const BLOCK_SIZE: usize = 16_384;

/// Write tantivy's [`Store`](./index.html)
///
/// Contrary to the other components of `tantivy`,
@@ -20,7 +22,6 @@ use crate::DocId;
/// The skip list index on the other hand, is built in memory.
pub struct StoreWriter {
compressor: Compressor,
block_size: usize,
doc: DocId,
first_doc_in_block: DocId,
offset_index_writer: SkipIndexBuilder,
@@ -34,10 +35,9 @@ impl StoreWriter {
///
/// The store writer writes blocks on disk as
/// documents are added.
pub fn new(writer: WritePtr, compressor: Compressor, block_size: usize) -> StoreWriter {
pub fn new(writer: WritePtr, compressor: Compressor) -> StoreWriter {
StoreWriter {
compressor,
block_size,
doc: 0,
first_doc_in_block: 0,
offset_index_writer: SkipIndexBuilder::new(),
@@ -65,7 +65,7 @@ impl StoreWriter {
VInt(doc_num_bytes as u64).serialize(&mut self.current_block)?;
self.current_block.write_all(serialized_document)?;
self.doc += 1;
if self.current_block.len() > self.block_size {
if self.current_block.len() > BLOCK_SIZE {
self.write_and_compress_block()?;
}
Ok(())
@@ -86,7 +86,7 @@ impl StoreWriter {
self.current_block
.write_all(&self.intermediary_buffer[..])?;
self.doc += 1;
if self.current_block.len() > self.block_size {
if self.current_block.len() > BLOCK_SIZE {
self.write_and_compress_block()?;
}
Ok(())

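The two hunks above capture the behavioural difference: serialized documents are appended to an in-memory `current_block`, and the block is compressed and flushed once it grows past a threshold (the `block_size` field on the 0.18 side, the `BLOCK_SIZE` constant on this branch). A small self-contained sketch of that size-triggered flushing, using illustrative types rather than tantivy's own:

```rust
// Illustrative sketch of size-triggered block flushing; not tantivy's actual types.
struct BlockBuffer {
    block_size: usize,
    current_block: Vec<u8>,
    flushed_blocks: Vec<Vec<u8>>,
}

impl BlockBuffer {
    fn new(block_size: usize) -> Self {
        BlockBuffer {
            block_size,
            current_block: Vec::new(),
            flushed_blocks: Vec::new(),
        }
    }

    /// Append one serialized document and flush once the block exceeds `block_size`.
    fn add_document(&mut self, serialized_document: &[u8]) {
        self.current_block.extend_from_slice(serialized_document);
        if self.current_block.len() > self.block_size {
            // In the real StoreWriter this is where the block would be compressed,
            // written out, and a checkpoint added to the skip index.
            self.flushed_blocks.push(std::mem::take(&mut self.current_block));
        }
    }
}

fn main() {
    let mut buffer = BlockBuffer::new(16_384);
    for _ in 0..1_000 {
        buffer.add_document(b"some serialized document bytes");
    }
    println!("flushed {} blocks", buffer.flushed_blocks.len());
}
```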
@@ -28,6 +28,7 @@ use fst_termdict as termdict;
mod sstable_termdict;
#[cfg(feature = "quickwit")]
use sstable_termdict as termdict;
use tantivy_fst::automaton::AlwaysMatch;

#[cfg(test)]
mod tests;
@@ -35,4 +36,24 @@ mod tests;
/// Position of the term in the sorted list of terms.
pub type TermOrdinal = u64;

pub use self::termdict::{TermDictionary, TermDictionaryBuilder, TermMerger, TermStreamer};
/// The term dictionary contains all of the terms in
/// the `tantivy` index in a sorted manner.
pub type TermDictionary = self::termdict::TermDictionary;

/// Builder for the new term dictionary.
///
/// Inserting must be done in the order of the `keys`.
pub type TermDictionaryBuilder<W> = self::termdict::TermDictionaryBuilder<W>;

/// Given a list of sorted term streams,
/// returns an iterator over sorted unique terms.
///
/// The item yielded is actually a pair with
/// - the term
/// - a slice with the ordinals of the segments containing
/// the term.
pub type TermMerger<'a> = self::termdict::TermMerger<'a>;

/// `TermStreamer` acts as a cursor over a range of terms of a segment.
/// Terms are guaranteed to be sorted.
pub type TermStreamer<'a, A = AlwaysMatch> = self::termdict::TermStreamer<'a, A>;

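The `TermMerger` doc comment above describes a k-way merge over per-segment term streams. To make that contract concrete, here is a small self-contained sketch that produces the same kind of (term, segment ordinals) pairs; it only illustrates the idea and does not use tantivy's actual `TermMerger` API, which streams terms instead of collecting them into a map:

```rust
use std::collections::BTreeMap;

// Merge several sorted term lists into one sorted list of unique terms,
// each paired with the ordinals of the segments that contain it.
// Conceptual sketch only.
fn merge_terms(segments: &[Vec<&str>]) -> Vec<(String, Vec<usize>)> {
    let mut merged: BTreeMap<String, Vec<usize>> = BTreeMap::new();
    for (segment_ord, terms) in segments.iter().enumerate() {
        for term in terms {
            merged.entry((*term).to_string()).or_default().push(segment_ord);
        }
    }
    merged.into_iter().collect()
}

fn main() {
    let segments = vec![
        vec!["apple", "banana"],
        vec!["banana", "cherry"],
    ];
    for (term, ordinals) in merge_terms(&segments) {
        // "banana" is reported once, with ordinals [0, 1].
        println!("{term}: {ordinals:?}");
    }
}
```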
@@ -145,12 +145,6 @@ where
}

pub fn write_key(&mut self, key: &[u8]) {
// If this is the first key in the block, we use it to
// shorten the last term in the last block.
if self.first_ordinal_of_the_block == self.num_terms {
self.index_builder
.shorten_last_block_key_given_next_key(key);
}
let keep_len = common_prefix_len(&self.previous_key, key);
let add_len = key.len() - keep_len;
let increasing_keys = add_len > 0 && (self.previous_key.len() == keep_len)
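For context, `write_key` above is the SSTable writer's prefix compression: each key is encoded as the length of the prefix it shares with the previous key (`keep_len`) plus the new suffix (`add_len` bytes). A self-contained sketch of that split, with a local stand-in for `common_prefix_len`:

```rust
// Length of the longest common prefix of two byte strings
// (what `common_prefix_len` computes in the sstable writer).
fn common_prefix_len(left: &[u8], right: &[u8]) -> usize {
    left.iter().zip(right.iter()).take_while(|(a, b)| a == b).count()
}

fn main() {
    let previous_key = b"network";
    let key = b"networks are fun";
    let keep_len = common_prefix_len(previous_key, key);
    let add_len = key.len() - keep_len;
    // The writer only needs to store `keep_len`, `add_len` and the new suffix.
    assert_eq!(keep_len, 7);
    assert_eq!(add_len, 9);
    assert_eq!(&key[keep_len..], &b"s are fun"[..]);
    println!("keep {keep_len} bytes, append {add_len} bytes");
}
```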
@@ -279,12 +273,11 @@ mod test {
33u8, 18u8, 19u8, // keep 1 push 1 | 20
17u8, 20u8, 0u8, 0u8, 0u8, 0u8, // no more blocks
// index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 115, 108, 97, 115, 116, 95, 107,
101, 121, 95, 111, 114, 95, 103, 114, 101, 97, 116, 101, 114, 130, 17, 20, 106, 98,
108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106, 98, 121, 116, 101, 95, 114, 97,
110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0, 99, 101, 110, 100, 11, 109,
102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110, 97, 108, 0, 15, 0, 0, 0, 0,
0, 0, 0, // offset for the index
161, 102, 98, 108, 111, 99, 107, 115, 129, 162, 104, 108, 97, 115, 116, 95, 107,
101, 121, 130, 17, 20, 106, 98, 108, 111, 99, 107, 95, 97, 100, 100, 114, 162, 106,
98, 121, 116, 101, 95, 114, 97, 110, 103, 101, 162, 101, 115, 116, 97, 114, 116, 0,
99, 101, 110, 100, 11, 109, 102, 105, 114, 115, 116, 95, 111, 114, 100, 105, 110,
97, 108, 0, 15, 0, 0, 0, 0, 0, 0, 0, // offset for the index
3u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8, 0u8 // num terms
]
);

@@ -4,7 +4,6 @@ use std::ops::Range;
use serde::{Deserialize, Serialize};

use crate::error::DataCorruption;
use crate::termdict::sstable_termdict::sstable::common_prefix_len;

#[derive(Default, Debug, Serialize, Deserialize)]
pub struct SSTableIndex {
@@ -20,7 +19,7 @@ impl SSTableIndex {
pub fn search(&self, key: &[u8]) -> Option<BlockAddr> {
self.blocks
.iter()
.find(|block| &block.last_key_or_greater[..] >= key)
.find(|block| &block.last_key[..] >= key)
.map(|block| block.block_addr.clone())
}
}
@@ -33,10 +32,7 @@ pub struct BlockAddr {

#[derive(Debug, Serialize, Deserialize)]
struct BlockMeta {
/// Any byte string that is lexicographically greater or equal to
/// the last key in the block,
/// and yet strictly smaller than the first key in the next block.
pub last_key_or_greater: Vec<u8>,
pub last_key: Vec<u8>,
pub block_addr: BlockAddr,
}

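Taken together, the `search` hunk and the `BlockMeta` doc comment above state the invariant: a key can only live in the first block whose stored boundary (`last_key_or_greater` on the 0.18 side) compares greater than or equal to it. A self-contained sketch of that lookup over a toy index, with illustrative types rather than the actual `SSTableIndex`:

```rust
use std::ops::Range;

// Illustrative stand-in for a block entry: the (possibly shortened) upper
// boundary key and the block's byte range in the file.
struct Block {
    last_key_or_greater: Vec<u8>,
    byte_range: Range<usize>,
}

// A key belongs to the first block whose boundary is >= the key,
// mirroring the linear scan in `SSTableIndex::search`.
fn search(blocks: &[Block], key: &[u8]) -> Option<Range<usize>> {
    blocks
        .iter()
        .find(|block| block.last_key_or_greater.as_slice() >= key)
        .map(|block| block.byte_range.clone())
}

fn main() {
    // The first block really ends at "banana-split"; "bananb" is a shorter
    // boundary that is still >= that key and < the next block's first key.
    let blocks = vec![
        Block { last_key_or_greater: b"bananb".to_vec(), byte_range: 0..100 },
        Block { last_key_or_greater: b"pear".to_vec(), byte_range: 100..180 },
    ];
    assert_eq!(search(&blocks, b"apple"), Some(0..100));
    assert_eq!(search(&blocks, b"cherry"), Some(100..180));
    assert_eq!(search(&blocks, b"zucchini"), None);
}
```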
@@ -45,39 +41,10 @@ pub struct SSTableIndexBuilder {
index: SSTableIndex,
}

/// Given that `left < right`,
/// mutates `left` into a shorter byte string `left'` that
/// satisfies `left <= left' < right`.
fn find_shorter_str_in_between(left: &mut Vec<u8>, right: &[u8]) {
assert!(&left[..] < right);
let common_len = common_prefix_len(&left, right);
if left.len() == common_len {
return;
}
// It is possible to do one character shorter in some cases,
// but it is not worth the extra complexity
for pos in (common_len + 1)..left.len() {
if left[pos] != u8::MAX {
left[pos] += 1;
left.truncate(pos + 1);
return;
}
}
}
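A concrete pass through the loop above, written as a test one could drop into this module (it assumes `find_shorter_str_in_between` is in scope, which in the actual source is private to the module):

```rust
#[test]
fn demo_find_shorter_str_in_between() {
    // left = "abcdef", right = "abd": the common prefix is "ab" (2 bytes).
    let mut left = b"abcdef".to_vec();
    find_shorter_str_in_between(&mut left, b"abd");
    // The loop starts at pos = common_len + 1 = 3, bumps b'd' to b'e' and
    // truncates, so "abcdef" <= "abce" < "abd" holds and the stored key
    // shrinks from 6 bytes to 4.
    assert_eq!(left, b"abce".to_vec());
}
```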

impl SSTableIndexBuilder {
/// In order to make the index as light as possible, we
/// try to find a shorter alternative to the last key of the last block
/// that is still smaller than the next key.
pub(crate) fn shorten_last_block_key_given_next_key(&mut self, next_key: &[u8]) {
if let Some(last_block) = self.index.blocks.last_mut() {
find_shorter_str_in_between(&mut last_block.last_key_or_greater, next_key);
}
}

pub fn add_block(&mut self, last_key: &[u8], byte_range: Range<usize>, first_ordinal: u64) {
self.index.blocks.push(BlockMeta {
last_key_or_greater: last_key.to_vec(),
last_key: last_key.to_vec(),
block_addr: BlockAddr {
byte_range,
first_ordinal,
@@ -130,35 +97,4 @@ mod tests {
"Data corruption: SSTable index is corrupted."
);
}

#[track_caller]
fn test_find_shorter_str_in_between_aux(left: &[u8], right: &[u8]) {
let mut left_buf = left.to_vec();
super::find_shorter_str_in_between(&mut left_buf, right);
assert!(left_buf.len() <= left.len());
assert!(left <= &left_buf);
assert!(&left_buf[..] < &right);
}

#[test]
fn test_find_shorter_str_in_between() {
test_find_shorter_str_in_between_aux(b"", b"hello");
test_find_shorter_str_in_between_aux(b"abc", b"abcd");
test_find_shorter_str_in_between_aux(b"abcd", b"abd");
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[1]);
test_find_shorter_str_in_between_aux(&[0, 0, 0], &[0, 0, 1]);
test_find_shorter_str_in_between_aux(&[0, 0, 255, 255, 255, 0u8], &[0, 1]);
}

use proptest::prelude::*;

proptest! {
#![proptest_config(ProptestConfig::with_cases(100))]
#[test]
fn test_proptest_find_shorter_str(left in any::<Vec<u8>>(), right in any::<Vec<u8>>()) {
if left < right {
test_find_shorter_str_in_between_aux(&left, &right);
}
}
}
}

@@ -25,13 +25,6 @@ pub struct TokenizerManager {
}

impl TokenizerManager {
/// Creates an empty tokenizer manager.
pub fn new() -> Self {
Self {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
}
}

/// Registers a new tokenizer associated with a given name.
pub fn register<T>(&self, tokenizer_name: &str, tokenizer: T)
where TextAnalyzer: From<T> {
@@ -59,7 +52,9 @@ impl Default for TokenizerManager {
/// - en_stem
/// - ja
fn default() -> TokenizerManager {
let manager = TokenizerManager::new();
let manager = TokenizerManager {
tokenizers: Arc::new(RwLock::new(HashMap::new())),
};
manager.register("raw", RawTokenizer);
manager.register(
"default",