Compare commits


3 Commits

Author SHA1 Message Date
Pascal Seitz
722b6c5205 bump version 2023-10-25 20:41:07 +08:00
Pascal Seitz
0f2211ca44 increase min memory to 15MB for indexing
With tantivy 0.20, the minimum memory consumption per SegmentWriter increased to
12MB. 7MB of that is used by the different fast field collector types (they could be
created lazily). Increase the minimum memory from 3MB to 15MB.

Change memory variable naming from arena to budget.

closes #2156
2023-10-25 20:37:47 +08:00
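
For context, a minimal sketch of where this budget lands in the tantivy API, assuming an in-RAM index with an illustrative `body` field (names and values here are illustrative, not taken from the diff):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 15MB is the new minimum budget per indexing thread / SegmentWriter;
    // smaller budgets are rejected instead of producing tiny segments.
    let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
    writer.add_document(doc!(body => "hello world"))?;
    writer.commit()?;
    Ok(())
}
```
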
PSeitz
21aabf961c Fix range query (#2226)
Fix the range query end check in `advance`
Rename variables to reduce ambiguity
Add tests

Fixes #2225
2023-10-25 20:37:36 +08:00
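
A minimal sketch of the kind of query affected, assuming a u64 fast field with fewer values than documents (field name and bounds are illustrative, not taken from the fix):

```rust
use tantivy::collector::Count;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let score = schema_builder.add_u64_field("score", INDEXED | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
    writer.add_document(doc!(score => 3u64))?;
    writer.add_document(doc!())?; // no value for `score`: fewer values than docs
    writer.add_document(doc!(score => 42u64))?;
    writer.commit()?;

    // Upper-bound handling in `advance` is what the end-check fix addresses.
    let query = QueryParser::for_index(&index, vec![]).parse_query("score:[1 TO 10]")?;
    let hits = index.reader()?.searcher().search(&query, &Count)?;
    assert_eq!(hits, 1); // only the document with score = 3 matches
    Ok(())
}
```
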
273 changed files with 5138 additions and 14516 deletions

View File

@@ -3,6 +3,8 @@ name: Coverage
on: on:
push: push:
branches: [main] branches: [main]
pull_request:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow. # Ensures that we cancel running jobs for the same PR / same workflow.
concurrency: concurrency:
@@ -13,13 +15,13 @@ jobs:
coverage: coverage:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Install Rust - name: Install Rust
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2 - uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov - uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage - name: Generate code coverage
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov - name: Upload coverage to Codecov
uses: codecov/codecov-action@v3 uses: codecov/codecov-action@v3
continue-on-error: true continue-on-error: true

View File

@@ -19,7 +19,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Install stable - name: Install stable
uses: actions-rs/toolchain@v1 uses: actions-rs/toolchain@v1
with: with:

View File

@@ -20,7 +20,7 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Install nightly - name: Install nightly
uses: actions-rs/toolchain@v1 uses: actions-rs/toolchain@v1
@@ -39,13 +39,6 @@ jobs:
- name: Check Formatting - name: Check Formatting
run: cargo +nightly fmt --all -- --check run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
- uses: actions-rs/clippy-check@v1 - uses: actions-rs/clippy-check@v1
with: with:
@@ -67,7 +60,7 @@ jobs:
name: test-${{ matrix.features.label}} name: test-${{ matrix.features.label}}
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v3
- name: Install stable - name: Install stable
uses: actions-rs/toolchain@v1 uses: actions-rs/toolchain@v1

View File

@@ -1,71 +1,3 @@
Tantivy 0.22
================================
Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
#### Bugfixes
- Fix null byte handling in JSON paths (null bytes in JSON keys caused a panic during indexing) [#2345](https://github.com/quickwit-oss/tantivy/pull/2345)(@PSeitz)
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
- Avoid single-document indices by increasing the min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
- Fix bug occurring when merging JSON objects indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
- Fix range query end check (fields with fewer than one value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)
#### Breaking API Changes
- Rename ReloadPolicy::OnCommit to OnCommitWithDelay [#2235](https://github.com/quickwit-oss/tantivy/pull/2235)(@giovannicuccu)
- Move exports from the root into modules [#2220](https://github.com/quickwit-oss/tantivy/pull/2220)(@PSeitz)
- Accept field name instead of `Field` in FilterCollector [#2196](https://github.com/quickwit-oss/tantivy/pull/2196)(@PSeitz)
- remove deprecated IntOptions and DateTime [#2353](https://github.com/quickwit-oss/tantivy/pull/2353)(@PSeitz)
#### Features/Improvements
- Tantivy documents as a trait: Index data directly without converting to tantivy types first [#2071](https://github.com/quickwit-oss/tantivy/pull/2071)(@ChillFish8)
- encode some part of posting list as -1 instead of direct values (smaller inverted indices) [#2185](https://github.com/quickwit-oss/tantivy/pull/2185)(@trinity-1686a)
- **Aggregation** (see the terms-aggregation sketch after this changelog excerpt)
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
- Support IP addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)
- Add ExistsQuery to check documents that have a value [#2160](https://github.com/quickwit-oss/tantivy/pull/2160)(@imotov)
- Expose TopDocs::order_by_u64_field again [#2282](https://github.com/quickwit-oss/tantivy/pull/2282)(@ditsuke)
- **Memory/Performance**
- Faster TopN: replace BinaryHeap with TopNComputer [#2186](https://github.com/quickwit-oss/tantivy/pull/2186)(@PSeitz)
- reduce number of allocations during indexing [#2257](https://github.com/quickwit-oss/tantivy/pull/2257)(@PSeitz)
- Less Memory while indexing: docid deltas while indexing [#2249](https://github.com/quickwit-oss/tantivy/pull/2249)(@PSeitz)
- Faster indexing: use term hashmap in fastfield [#2243](https://github.com/quickwit-oss/tantivy/pull/2243)(@PSeitz)
- term hashmap remove copy in is_empty, unused unordered_id [#2229](https://github.com/quickwit-oss/tantivy/pull/2229)(@PSeitz)
- add method to fetch block of first values in columnar [#2330](https://github.com/quickwit-oss/tantivy/pull/2330)(@PSeitz)
- Faster aggregations: add fast path for full columns in fetch_block [#2328](https://github.com/quickwit-oss/tantivy/pull/2328)(@PSeitz)
- Faster sstable loading: use fst for sstable index [#2268](https://github.com/quickwit-oss/tantivy/pull/2268)(@trinity-1686a)
- **QueryParser**
- allow newline where we allow space in query parser [#2302](https://github.com/quickwit-oss/tantivy/pull/2302)(@trinity-1686a)
- allow some mixing of occur and bool in strict query parser [#2323](https://github.com/quickwit-oss/tantivy/pull/2323)(@trinity-1686a)
- handle * inside term in lenient query parser [#2228](https://github.com/quickwit-oss/tantivy/pull/2228)(@trinity-1686a)
- add support for exists query syntax in query parser [#2170](https://github.com/quickwit-oss/tantivy/pull/2170)(@trinity-1686a)
- Add shared search executor [#2312](https://github.com/quickwit-oss/tantivy/pull/2312)(@MochiXu)
- Truncate keys to u16::MAX in term hashmap [#2299](https://github.com/quickwit-oss/tantivy/pull/2299)(@PSeitz)
- report if a term matched when warming up posting list [#2309](https://github.com/quickwit-oss/tantivy/pull/2309)(@trinity-1686a)
- Support json fields in FuzzyTermQuery [#2173](https://github.com/quickwit-oss/tantivy/pull/2173)(@PingXia-at)
- Read list of fields encoded in term dictionary for JSON fields [#2184](https://github.com/quickwit-oss/tantivy/pull/2184)(@PSeitz)
- add collect_block to BoxableSegmentCollector [#2331](https://github.com/quickwit-oss/tantivy/pull/2331)(@PSeitz)
- expose collect_block buffer size [#2326](https://github.com/quickwit-oss/tantivy/pull/2326)(@PSeitz)
- Forward regex parser errors [#2288](https://github.com/quickwit-oss/tantivy/pull/2288)(@adamreichold)
- Make FacetCounts defaultable and cloneable. [#2322](https://github.com/quickwit-oss/tantivy/pull/2322)(@adamreichold)
- Derive Debug for SchemaBuilder [#2254](https://github.com/quickwit-oss/tantivy/pull/2254)(@GodTamIt)
- add missing inlines to tantivy options [#2245](https://github.com/quickwit-oss/tantivy/pull/2245)(@PSeitz)
Tantivy 0.21.1
================================
#### Bugfixes
- Range queries on fast fields with fewer values than documents had an invalid end condition, leading to missing results. [#2226](https://github.com/quickwit-oss/tantivy/issues/2226)(@appaquet @PSeitz)
- Increase the minimum memory budget from 3MB to 15MB to avoid single doc segments (API fix). [#2176](https://github.com/quickwit-oss/tantivy/issues/2176)(@PSeitz)
Tantivy 0.21 Tantivy 0.21
================================ ================================
#### Bugfixes #### Bugfixes
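
The aggregation entries in the 0.22 section above share the Elasticsearch-style JSON request shape also used by the aggregation benchmarks further down in this diff. A minimal sketch of a terms aggregation, assuming an illustrative `severity` field declared as `STRING | FAST`:

```rust
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST, STRING};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let severity = schema_builder.add_text_field("severity", STRING | FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
    for value in ["INFO", "ERROR", "INFO"] {
        writer.add_document(doc!(severity => value))?;
    }
    writer.commit()?;

    // Bucket documents by the value of the `severity` fast field.
    let agg_req: Aggregations = serde_json::from_value(json!({
        "by_severity": { "terms": { "field": "severity" } }
    }))
    .unwrap();
    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
    let result = index.reader()?.searcher().search(&AllQuery, &collector)?;
    println!("{}", serde_json::to_string_pretty(&result).unwrap());
    Ok(())
}
```
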

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy" name = "tantivy"
version = "0.23.0" version = "0.21.1"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]
@@ -11,85 +11,78 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md" readme = "README.md"
keywords = ["search", "information", "retrieval"] keywords = ["search", "information", "retrieval"]
edition = "2021" edition = "2021"
rust-version = "1.63" rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"] exclude = ["benches/*.json", "benches/*.txt"]
[dependencies] [dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged oneshot = "0.1.5"
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" } base64 = "0.21.0"
base64 = "0.22.0"
byteorder = "1.4.3" byteorder = "1.4.3"
crc32fast = "1.3.2" crc32fast = "1.3.2"
once_cell = "1.10.0" once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = [ regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
"std",
"unicode",
] }
aho-corasick = "1.0" aho-corasick = "1.0"
tantivy-fst = "0.5" tantivy-fst = "0.4.0"
memmap2 = { version = "0.9.0", optional = true } memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true } lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false } zstd = { version = "0.12", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true } tempfile = { version = "3.3.0", optional = true }
log = "0.4.16" log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79" serde_json = "1.0.79"
fs4 = { version = "0.8.0", optional = true } num_cpus = "1.13.1"
fs4 = { version = "0.6.3", optional = true }
levenshtein_automata = "0.2.1" levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] } uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4" crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0" rust-stemmers = "1.2.0"
downcast-rs = "1.2.0" downcast-rs = "1.2.0"
bitpacking = { version = "0.9.2", default-features = false, features = [ bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
"bitpacker4x", census = "0.4.0"
] }
census = "0.4.2"
rustc-hash = "1.1.0" rustc-hash = "1.1.0"
thiserror = "1.0.30" thiserror = "1.0.30"
htmlescape = "0.3.1" htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true } fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] } time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0" smallvec = "1.8.0"
rayon = "1.5.2" rayon = "1.5.2"
lru = "0.12.0" lru = "0.11.0"
fastdivide = "0.4.0" fastdivide = "0.4.0"
itertools = "0.13.0" itertools = "0.11.0"
measure_time = "0.8.2" measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0" arc-swap = "1.5.0"
columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" } columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true } sstable = { version= "0.2", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" } stacker = { version= "0.2", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" } query-grammar = { version= "0.21.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" } tantivy-bitpacker = { version= "0.5", path="./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" } common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" } tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] } sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true } futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies] [target.'cfg(windows)'.dependencies]
winapi = "0.3.9" winapi = "0.3.9"
[dev-dependencies] [dev-dependencies]
binggan = "0.8.0"
rand = "0.8.5" rand = "0.8.5"
maplit = "1.0.2" maplit = "1.0.2"
matches = "0.1.9" matches = "0.1.9"
pretty_assertions = "1.2.1" pretty_assertions = "1.2.1"
proptest = "1.0.0" proptest = "1.0.0"
test-log = "0.2.10" test-log = "0.2.10"
env_logger = "0.10.0"
futures = "0.3.21" futures = "0.3.21"
paste = "1.0.11" paste = "1.0.11"
more-asserts = "0.3.1" more-asserts = "0.3.1"
rand_distr = "0.4.3" rand_distr = "0.4.3"
time = { version = "0.3.10", features = ["serde-well-known", "macros"] }
postcard = { version = "1.0.4", features = [
"use-std",
], default-features = false }
[target.'cfg(not(windows))'.dev-dependencies] [target.'cfg(not(windows))'.dev-dependencies]
criterion = { version = "0.5", default-features = false } criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail] [dev-dependencies.fail]
version = "0.5.0" version = "0.5.0"
@@ -118,26 +111,12 @@ lz4-compression = ["lz4_flex"]
zstd-compression = ["zstd"] zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"] failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches. unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"] quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
[workspace] [workspace]
members = [ members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
"query-grammar",
"bitpacker",
"common",
"ownedbytes",
"stacker",
"sstable",
"tokenizer-api",
"columnar",
]
# Following the "fail" crate best practises, we isolate # Following the "fail" crate best practises, we isolate
# tests that define specific behavior in fail check points # tests that define specific behavior in fail check points
@@ -158,7 +137,3 @@ harness = false
[[bench]] [[bench]]
name = "index-bench" name = "index-bench"
harness = false harness = false
[[bench]]
name = "agg_bench"
harness = false

View File

@@ -5,18 +5,19 @@
[![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT) [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
[![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy) [![Crates.io](https://img.shields.io/crates/v/tantivy.svg)](https://crates.io/crates/tantivy)
<img src="https://tantivy-search.github.io/logo/tantivy-logo.png" alt="Tantivy, the fastest full-text search engine library written in Rust" height="250"> ![Tantivy](https://tantivy-search.github.io/logo/tantivy-logo.png)
## Fast full-text search engine library written in Rust **Tantivy** is a **full-text search engine library** written in Rust.
**If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our distributed search engine built on top of Tantivy.** It is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not
an off-the-shelf search engine server, but rather a crate that can be used
Tantivy is closer to [Apache Lucene](https://lucene.apache.org/) than to [Elasticsearch](https://www.elastic.co/products/elasticsearch) or [Apache Solr](https://lucene.apache.org/solr/) in the sense it is not to build such a search engine.
an off-the-shelf search engine server, but rather a crate that can be used to build such a search engine.
Tantivy is, in fact, strongly inspired by Lucene's design. Tantivy is, in fact, strongly inspired by Lucene's design.
## Benchmark If you are looking for an alternative to Elasticsearch or Apache Solr, check out [Quickwit](https://github.com/quickwit-oss/quickwit), our search engine built on top of Tantivy.
# Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
performance for different types of queries/collections. performance for different types of queries/collections.
@@ -27,7 +28,7 @@ Your mileage WILL vary depending on the nature of queries and their load.
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game). Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
## Features # Features
- Full-text search - Full-text search
- Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder)) - Configurable tokenizer (stemming available for 17 Latin languages) with third party support for Chinese ([tantivy-jieba](https://crates.io/crates/tantivy-jieba) and [cang-jie](https://crates.io/crates/cang-jie)), Japanese ([lindera](https://github.com/lindera-morphology/lindera-tantivy), [Vaporetto](https://crates.io/crates/vaporetto_tantivy), and [tantivy-tokenizer-tiny-segmenter](https://crates.io/crates/tantivy-tokenizer-tiny-segmenter)) and Korean ([lindera](https://github.com/lindera-morphology/lindera-tantivy) + [lindera-ko-dic-builder](https://github.com/lindera-morphology/lindera-ko-dic-builder))
@@ -53,11 +54,11 @@ Details about the benchmark can be found at this [repository](https://github.com
- Searcher Warmer API - Searcher Warmer API
- Cheesy logo with a horse - Cheesy logo with a horse
### Non-features ## Non-features
Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/). Distributed search is out of the scope of Tantivy, but if you are looking for this feature, check out [Quickwit](https://github.com/quickwit-oss/quickwit/).
## Getting started # Getting started
Tantivy works on stable Rust and supports Linux, macOS, and Windows. Tantivy works on stable Rust and supports Linux, macOS, and Windows.
@@ -67,7 +68,7 @@ index documents, and search via the CLI or a small server with a REST API.
It walks you through getting a Wikipedia search engine up and running in a few minutes. It walks you through getting a Wikipedia search engine up and running in a few minutes.
- [Reference doc for the last released version](https://docs.rs/tantivy/) - [Reference doc for the last released version](https://docs.rs/tantivy/)
## How can I support this project? # How can I support this project?
There are many ways to support this project. There are many ways to support this project.
@@ -78,16 +79,16 @@ There are many ways to support this project.
- Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE)) - Contribute code (you can join [our Discord server](https://discord.gg/MT27AG5EVE))
- Talk about Tantivy around you - Talk about Tantivy around you
## Contributing code # Contributing code
We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR. We use the GitHub Pull Request workflow: reference a GitHub ticket and/or include a comprehensive commit message when opening a PR.
Feel free to update CHANGELOG.md with your contribution. Feel free to update CHANGELOG.md with your contribution.
### Tokenizer ## Tokenizer
When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate. When implementing a tokenizer for tantivy depend on the `tantivy-tokenizer-api` crate.
### Clone and build locally ## Clone and build locally
Tantivy compiles on stable Rust. Tantivy compiles on stable Rust.
To check out and run tests, you can simply run: To check out and run tests, you can simply run:
@@ -98,7 +99,7 @@ cd tantivy
cargo test cargo test
``` ```
## Companies Using Tantivy # Companies Using Tantivy
<p align="left"> <p align="left">
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp; <img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp;
@@ -110,7 +111,7 @@ cargo test
<img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" /> <img align="center" src="doc/assets/images/element-dark-theme.png#gh-dark-mode-only" alt="Element.io" height="25" width="auto" />
</p> </p>
## FAQ # FAQ
### Can I use Tantivy in other languages? ### Can I use Tantivy in other languages?

View File

@@ -1,413 +0,0 @@
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use tantivy::{doc, Index, Term};
#[global_allocator]
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// Mini macro to register a function via its name
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| $func(index))
};
}
fn main() {
let inputs = vec![
("full", get_test_index_bench(Cardinality::Full).unwrap()),
(
"dense",
get_test_index_bench(Cardinality::OptionalDense).unwrap(),
),
(
"sparse",
get_test_index_bench(Cardinality::OptionalSparse).unwrap(),
),
(
"multivalue",
get_test_index_bench(Cardinality::Multivalued).unwrap(),
),
];
bench_agg(InputGroup::new_with_inputs(inputs));
}
fn bench_agg(mut group: InputGroup<Index>) {
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
register!(group, average_u64);
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
register!(group, range_agg_with_term_agg_many);
register!(group, histogram);
register!(group, histogram_hard_bounds);
register!(group, histogram_with_avg_sub_agg);
register!(group, avg_and_range_with_avg_sub_agg);
group.run();
}
fn exec_term_with_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let collector = get_collector(agg_req);
let searcher = reader.searcher();
black_box(searcher.search(&term_query, &collector).unwrap());
}
fn average_u64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64(index: &Index) {
let agg_req = json!({
"average": { "avg": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn average_f64_u64(index: &Index) {
let agg_req = json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
});
exec_term_with_agg(index, agg_req)
}
fn stats_f64(index: &Index) {
let agg_req = json!({
"average_f64": { "stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
});
execute_agg(index, agg_req);
}
fn terms_many_with_top_hits(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"top_hits": { "top_hits":
{
"sort": [
{ "score": "desc" }
],
"size": 2,
"doc_value_fields": ["score_f64"]
}
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn execute_agg(index: &Index, agg_req: serde_json::Value) {
let agg_req: Aggregations = serde_json::from_value(agg_req).unwrap();
let collector = get_collector(agg_req);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
black_box(searcher.search(&AllQuery, &collector).unwrap());
}
fn range_agg(index: &Index) {
let agg_req = json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
});
execute_agg(index, agg_req);
}
fn range_agg_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_few(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_few_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn range_agg_with_term_agg_many(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"my_texts": { "terms": { "field": "text_many_terms" } },
}
},
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
});
execute_agg(index, agg_req);
}
fn histogram_hard_bounds(index: &Index) {
let agg_req = json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
});
execute_agg(index, agg_req);
}
fn histogram_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
});
execute_agg(index, agg_req);
}
fn avg_and_range_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
});
execute_agg(index, agg_req);
}
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
OptionalDense = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
OptionalSparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> tantivy::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = tantivy::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = tantivy::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{num}"))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::OptionalDense {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::OptionalSparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::OptionalSparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}

View File

@@ -1,98 +1,14 @@
use criterion::{criterion_group, criterion_main, BatchSize, Bencher, Criterion, Throughput}; use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT}; use pprof::criterion::{Output, PProfProfiler};
use tantivy::{tokenizer, Index, IndexWriter}; use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json"); const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json"); const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json"); const WIKI: &str = include_str!("wiki.json");
fn benchmark( fn get_lines(input: &str) -> Vec<&str> {
b: &mut Bencher, input.trim().split('\n').collect()
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
is_dynamic: bool,
) {
if is_dynamic {
benchmark_dynamic_json(b, input, schema, commit, parse_json)
} else {
_benchmark(b, input, schema, commit, parse_json, |schema, doc_json| {
TantivyDocument::parse_json(schema, doc_json).unwrap()
})
}
}
fn get_index(schema: tantivy::schema::Schema) -> Index {
let mut index = Index::create_in_ram(schema.clone());
let ff_tokenizer_manager = tokenizer::TokenizerManager::default();
ff_tokenizer_manager.register(
"raw",
tokenizer::TextAnalyzer::builder(tokenizer::RawTokenizer::default())
.filter(tokenizer::RemoveLongFilter::limit(255))
.build(),
);
index.set_fast_field_tokenizers(ff_tokenizer_manager.clone());
index
}
fn _benchmark(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
include_json_parsing: bool,
create_doc: impl Fn(&tantivy::schema::Schema, &str) -> TantivyDocument,
) {
if include_json_parsing {
let lines: Vec<&str> = input.trim().split('\n').collect();
b.iter(|| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = create_doc(&schema, doc_json);
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
})
} else {
let docs: Vec<_> = input
.trim()
.split('\n')
.map(|doc_json| create_doc(&schema, doc_json))
.collect();
b.iter_batched(
|| docs.clone(),
|docs| {
let index = get_index(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc in docs {
index_writer.add_document(doc).unwrap();
}
if commit {
index_writer.commit().unwrap();
}
},
BatchSize::SmallInput,
)
}
}
fn benchmark_dynamic_json(
b: &mut Bencher,
input: &str,
schema: tantivy::schema::Schema,
commit: bool,
parse_json: bool,
) {
let json_field = schema.get_field("json").unwrap();
_benchmark(b, input, schema, commit, parse_json, |_schema, doc_json| {
let json_val: serde_json::Value = serde_json::from_str(doc_json).unwrap();
tantivy::doc!(json_field=>json_val)
})
} }
pub fn hdfs_index_benchmark(c: &mut Criterion) { pub fn hdfs_index_benchmark(c: &mut Criterion) {
@@ -103,14 +19,7 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
schema_builder.add_text_field("severity", STRING); schema_builder.add_text_field("severity", STRING);
schema_builder.build() schema_builder.build()
}; };
let schema_only_fast = { let schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", FAST);
schema_builder.add_text_field("body", FAST);
schema_builder.add_text_field("severity", FAST);
schema_builder.build()
};
let _schema_with_store = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new(); let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_u64_field("timestamp", INDEXED | STORED); schema_builder.add_u64_field("timestamp", INDEXED | STORED);
schema_builder.add_text_field("body", TEXT | STORED); schema_builder.add_text_field("body", TEXT | STORED);
@@ -119,40 +28,74 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
}; };
let dynamic_schema = { let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new(); let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST); schema_builder.add_json_field("json", TEXT);
schema_builder.build() schema_builder.build()
}; };
let mut group = c.benchmark_group("index-hdfs"); let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64)); group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20); group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
let benches = [ let lines = get_lines(HDFS_LOGS);
("only-indexed-".to_string(), schema, false), b.iter(|| {
//("stored-".to_string(), _schema_with_store, false), let index = Index::create_in_ram(schema.clone());
("only-fast-".to_string(), schema_only_fast, false), let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
("dynamic-".to_string(), dynamic_schema, true), for doc_json in &lines {
]; let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
for (prefix, schema, is_dynamic) in benches {
for commit in [false, true] {
let suffix = if commit { "with-commit" } else { "no-commit" };
{
let parse_json = false;
// for parse_json in [false, true] {
let suffix = if parse_json {
format!("{suffix}-with-json-parsing")
} else {
suffix.to_string()
};
let bench_name = format!("{prefix}{suffix}");
group.bench_function(bench_name, |b| {
benchmark(b, HDFS_LOGS, schema.clone(), commit, parse_json, is_dynamic)
});
} }
} })
} });
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
} }
pub fn gh_index_benchmark(c: &mut Criterion) { pub fn gh_index_benchmark(c: &mut Criterion) {
@@ -161,24 +104,38 @@ pub fn gh_index_benchmark(c: &mut Criterion) {
schema_builder.add_json_field("json", TEXT | FAST); schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build() schema_builder.build()
}; };
let dynamic_schema_fast = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh"); let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64)); group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| { group.bench_function("index-gh-no-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema.clone(), false, false) let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
}); });
group.bench_function("index-gh-fast", |b| { group.bench_function("index-gh-with-commit", |b| {
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), false, false) let lines = get_lines(GH_LOGS);
}); b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
group.bench_function("index-gh-fast-with-commit", |b| { let index = Index::create_in_ram(dynamic_schema.clone());
benchmark_dynamic_json(b, GH_LOGS, dynamic_schema_fast.clone(), true, false) let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
}); });
} }
@@ -193,10 +150,33 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
group.throughput(Throughput::Bytes(WIKI.len() as u64)); group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| { group.bench_function("index-wiki-no-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), false, false) let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
}); });
group.bench_function("index-wiki-with-commit", |b| { group.bench_function("index-wiki-with-commit", |b| {
benchmark_dynamic_json(b, WIKI, dynamic_schema.clone(), true, false) let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
}); });
} }
@@ -207,12 +187,12 @@ criterion_group! {
} }
criterion_group! { criterion_group! {
name = gh_benches; name = gh_benches;
config = Criterion::default(); config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark targets = gh_index_benchmark
} }
criterion_group! { criterion_group! {
name = wiki_benches; name = wiki_benches;
config = Criterion::default(); config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark targets = wiki_index_benchmark
} }
criterion_main!(benches, gh_benches, wiki_benches); criterion_main!(benches, gh_benches, wiki_benches);

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-bitpacker" name = "tantivy-bitpacker"
version = "0.6.0" version = "0.5.0"
edition = "2021" edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
@@ -15,7 +15,7 @@ homepage = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies] [dependencies]
bitpacking = { version = "0.9.2", default-features = false, features = ["bitpacker1x"] } bitpacking = {version="0.8", default-features=false, features = ["bitpacker1x"]}
[dev-dependencies] [dev-dependencies]
rand = "0.8" rand = "0.8"

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::io; use std::io;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
@@ -366,7 +367,7 @@ mod test {
let mut output: Vec<u32> = Vec::new(); let mut output: Vec<u32> = Vec::new();
for len in [0, 1, 2, 32, 33, 34, 64] { for len in [0, 1, 2, 32, 33, 34, 64] {
for start_idx in 0u32..32u32 { for start_idx in 0u32..32u32 {
output.resize(len, 0); output.resize(len as usize, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output); bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len { for i in 0..len {
let expected = (start_idx + i as u32) & mask; let expected = (start_idx + i as u32) & mask;

View File

@@ -1,10 +1,6 @@
# configuration file for git-cliff{ pattern = "foo", replace = "bar"} # configuration file for git-cliff{ pattern = "foo", replace = "bar"}
# see https://github.com/orhun/git-cliff#configuration-file # see https://github.com/orhun/git-cliff#configuration-file
[remote.github]
owner = "quickwit-oss"
repo = "tantivy"
[changelog] [changelog]
# changelog header # changelog header
header = """ header = """
@@ -12,43 +8,15 @@ header = """
# template for the changelog body # template for the changelog body
# https://tera.netlify.app/docs/#introduction # https://tera.netlify.app/docs/#introduction
body = """ body = """
## What's Changed {% if version %}\
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
{%- if version %} in {{ version }}{%- endif -%} ==================
{% else %}\
## [unreleased]
{% endif %}\
{% for commit in commits %} {% for commit in commits %}
{% if commit.github.pr_title -%} - {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
{%- set commit_message = commit.github.pr_title -%} {% endfor %}
{%- else -%}
{%- set commit_message = commit.message -%}
{%- endif -%}
- {{ commit_message | split(pat="\n") | first | trim }}\
{% if commit.github.pr_number %} \
[#{{ commit.github.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.github.pr_number }}){% if commit.github.username %}(@{{ commit.github.username }}){%- endif -%} \
{%- endif %}
{%- endfor -%}
{% if github.contributors | filter(attribute="is_first_time", value=true) | length != 0 %}
{% raw %}\n{% endraw -%}
## New Contributors
{%- endif %}\
{% for contributor in github.contributors | filter(attribute="is_first_time", value=true) %}
* @{{ contributor.username }} made their first contribution
{%- if contributor.pr_number %} in \
[#{{ contributor.pr_number }}]({{ self::remote_url() }}/pull/{{ contributor.pr_number }}) \
{%- endif %}
{%- endfor -%}
{% if version %}
{% if previous.version %}
**Full Changelog**: {{ self::remote_url() }}/compare/{{ previous.version }}...{{ version }}
{% endif %}
{% else -%}
{% raw %}\n{% endraw %}
{% endif %}
{%- macro remote_url() -%}
https://github.com/{{ remote.github.owner }}/{{ remote.github.repo }}
{%- endmacro -%}
""" """
# remove the leading and trailing whitespace from the template # remove the leading and trailing whitespace from the template
trim = true trim = true
@@ -57,24 +25,53 @@ footer = """
""" """
postprocessors = [ postprocessors = [
{ pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
{ pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
{ pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
{ pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
{ pattern = 'Naveen Aiathurai', replace = "naveenann"}, # replace with github user
{ pattern = '', replace = ""}, # replace with github user
] ]
[git] [git]
# parse the commits based on https://www.conventionalcommits.org # parse the commits based on https://www.conventionalcommits.org
# This is required or commit.message contains the whole commit message and not just the title # This is required or commit.message contains the whole commit message and not just the title
conventional_commits = false conventional_commits = true
# filter out the commits that are not conventional # filter out the commits that are not conventional
filter_unconventional = true filter_unconventional = false
# process each line of a commit as an individual commit # process each line of a commit as an individual commit
split_commits = false split_commits = false
# regex for preprocessing the commit messages # regex for preprocessing the commit messages
commit_preprocessors = [ commit_preprocessors = [
{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = ""}, { pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
] ]
#link_parsers = [ #link_parsers = [
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"}, #{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
#] #]
# regex for parsing and grouping commits # regex for parsing and grouping commits
commit_parsers = [
{ message = "^feat", group = "Features"},
{ message = "^fix", group = "Bug Fixes"},
{ message = "^doc", group = "Documentation"},
{ message = "^perf", group = "Performance"},
{ message = "^refactor", group = "Refactor"},
{ message = "^style", group = "Styling"},
{ message = "^test", group = "Testing"},
{ message = "^chore\\(release\\): prepare for", skip = true},
{ message = "(?i)clippy", skip = true},
{ message = "(?i)dependabot", skip = true},
{ message = "(?i)fmt", skip = true},
{ message = "(?i)bump", skip = true},
{ message = "(?i)readme", skip = true},
{ message = "(?i)comment", skip = true},
{ message = "(?i)spelling", skip = true},
{ message = "^chore", group = "Miscellaneous Tasks"},
{ body = ".*security", group = "Security"},
{ message = ".*", group = "Other", default_scope = "other"},
]
# protect breaking changes from being skipped due to matching a skipping commit_parser # protect breaking changes from being skipped due to matching a skipping commit_parser
protect_breaking_commits = false protect_breaking_commits = false
# filter out the commits that are not matched by commit parsers # filter out the commits that are not matched by commit parsers

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-columnar" name = "tantivy-columnar"
version = "0.3.0" version = "0.2.0"
edition = "2021" edition = "2021"
license = "MIT" license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy" homepage = "https://github.com/quickwit-oss/tantivy"
@@ -9,15 +9,15 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"] categories = ["database-implementations", "data-structures", "compression"]
[dependencies] [dependencies]
itertools = "0.13.0" itertools = "0.11.0"
fnv = "1.0.7"
fastdivide = "0.4.0" fastdivide = "0.4.0"
stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"} stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.3", path = "../sstable", package = "tantivy-sstable" } sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.7", path = "../common", package = "tantivy-common" } common = { version= "0.6", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.6", path = "../bitpacker/" } tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
serde = "1.0.152" serde = "1.0.152"
downcast-rs = "1.2.0"
[dev-dependencies] [dev-dependencies]
proptest = "1" proptest = "1"

View File

@@ -1,155 +0,0 @@
#![feature(test)]
extern crate test;
use std::sync::Arc;
use rand::prelude::*;
use tantivy_columnar::column_values::{serialize_and_load_u64_based_column_values, CodecType};
use tantivy_columnar::*;
use test::{black_box, Bencher};
struct Columns {
pub optional: Column,
pub full: Column,
pub multi: Column,
}
fn get_test_columns() -> Columns {
let data = generate_permutation();
let mut dataframe_writer = ColumnarWriter::default();
for (idx, val) in data.iter().enumerate() {
dataframe_writer.record_numerical(idx as u32, "full_values", NumericalValue::U64(*val));
if idx % 2 == 0 {
dataframe_writer.record_numerical(
idx as u32,
"optional_values",
NumericalValue::U64(*val),
);
}
dataframe_writer.record_numerical(idx as u32, "multi_values", NumericalValue::U64(*val));
dataframe_writer.record_numerical(idx as u32, "multi_values", NumericalValue::U64(*val));
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(data.len() as u32, None, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("optional_values").unwrap();
assert_eq!(cols.len(), 1);
let optional = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(optional.index.get_cardinality(), Cardinality::Optional);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("full_values").unwrap();
assert_eq!(cols.len(), 1);
let column_full = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(column_full.index.get_cardinality(), Cardinality::Full);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("multi_values").unwrap();
assert_eq!(cols.len(), 1);
let multi = cols[0].open_u64_lenient().unwrap().unwrap();
assert_eq!(multi.index.get_cardinality(), Cardinality::Multivalued);
Columns {
optional,
full: column_full,
multi,
}
}
const NUM_VALUES: u64 = 100_000;
fn generate_permutation() -> Vec<u64> {
let mut permutation: Vec<u64> = (0u64..NUM_VALUES).collect();
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
permutation
}
pub fn serialize_and_load(column: &[u64], codec_type: CodecType) -> Arc<dyn ColumnValues<u64>> {
serialize_and_load_u64_based_column_values(&column, &[codec_type])
}
fn run_bench_on_column_full_scan(b: &mut Bencher, column: Column) {
let num_iter = black_box(NUM_VALUES);
b.iter(|| {
let mut sum = 0u64;
for i in 0..num_iter as u32 {
let val = column.first(i);
sum += val.unwrap_or(0);
}
sum
});
}
fn run_bench_on_column_block_fetch(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
column.first_vals(&fetch_docids, &mut block);
block[0]
});
}
fn run_bench_on_column_block_single_calls(b: &mut Bencher, column: Column) {
let mut block: Vec<Option<u64>> = vec![None; 64];
let fetch_docids = (0..64).collect::<Vec<_>>();
b.iter(move || {
for i in 0..fetch_docids.len() {
block[i] = column.first(fetch_docids[i]);
}
block[0]
});
}
/// Column first method
#[bench]
fn bench_get_first_on_full_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_optional_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_full_scan(b, column);
}
#[bench]
fn bench_get_first_on_multi_column_full_scan(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_full_scan(b, column);
}
/// Block fetch column accessor
#[bench]
fn bench_get_block_first_on_optional_column(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_fetch(b, column);
}
#[bench]
fn bench_get_block_first_on_optional_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().optional;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_multi_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().multi;
run_bench_on_column_block_single_calls(b, column);
}
#[bench]
fn bench_get_block_first_on_full_column_single_calls(b: &mut Bencher) {
let column = get_test_columns().full;
run_bench_on_column_block_single_calls(b, column);
}

View File

@@ -16,6 +16,14 @@ fn generate_permutation() -> Vec<u64> {
permutation permutation
} }
fn generate_random() -> Vec<u64> {
let mut permutation: Vec<u64> = (0u64..100_000u64)
.map(|el| el + random::<u16>() as u64)
.collect();
permutation.shuffle(&mut StdRng::from_seed([1u8; 32]));
permutation
}
// Warning: this generates the same permutation at each call // Warning: this generates the same permutation at each call
fn generate_permutation_gcd() -> Vec<u64> { fn generate_permutation_gcd() -> Vec<u64> {
let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect(); let mut permutation: Vec<u64> = (1u64..100_000u64).map(|el| el * 1000).collect();

View File

@@ -8,6 +8,7 @@ license = "MIT"
columnar = {path="../", package="tantivy-columnar"} columnar = {path="../", package="tantivy-columnar"}
serde_json = "1" serde_json = "1"
serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"} serde_json_borrow = {git="https://github.com/PSeitz/serde_json_borrow/"}
serde = "1"
[workspace] [workspace]
members = [] members = []

View File

@@ -14,32 +14,20 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
ColumnBlockAccessor<T> ColumnBlockAccessor<T>
{ {
#[inline] #[inline]
pub fn fetch_block<'a>(&'a mut self, docs: &'a [u32], accessor: &Column<T>) { pub fn fetch_block(&mut self, docs: &[u32], accessor: &Column<T>) {
if accessor.index.get_cardinality().is_full() { self.docid_cache.clear();
self.val_cache.resize(docs.len(), T::default()); self.row_id_cache.clear();
accessor.values.get_vals(docs, &mut self.val_cache); accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache);
} else { self.val_cache.resize(self.row_id_cache.len(), T::default());
self.docid_cache.clear(); accessor
self.row_id_cache.clear(); .values
accessor.row_ids_for_docs(docs, &mut self.docid_cache, &mut self.row_id_cache); .get_vals(&self.row_id_cache, &mut self.val_cache);
self.val_cache.resize(self.row_id_cache.len(), T::default());
accessor
.values
.get_vals(&self.row_id_cache, &mut self.val_cache);
}
} }
#[inline] #[inline]
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) { pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
self.fetch_block(docs, accessor); self.fetch_block(docs, accessor);
// no missing values // We can compare docid_cache with docs to find missing docs
if accessor.index.get_cardinality().is_full() { if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
return;
}
// We can compare docid_cache length with docs to find missing docs
// For multi value columns we can't rely on the length and always need to scan
if accessor.index.get_cardinality().is_multivalue() || docs.len() != self.docid_cache.len()
{
self.missing_docids_cache.clear(); self.missing_docids_cache.clear();
find_missing_docs(docs, &self.docid_cache, |doc| { find_missing_docs(docs, &self.docid_cache, |doc| {
self.missing_docids_cache.push(doc); self.missing_docids_cache.push(doc);
@@ -56,25 +44,11 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
} }
#[inline] #[inline]
/// Returns an iterator over the docids and values pub fn iter_docid_vals(&self) -> impl Iterator<Item = (DocId, T)> + '_ {
/// The passed in `docs` slice needs to be the same slice that was passed to `fetch_block` or self.docid_cache
/// `fetch_block_with_missing`. .iter()
/// .cloned()
/// The docs slice is used if the column is full (each doc has exactly one value), otherwise the .zip(self.val_cache.iter().cloned())
/// internal docid vec is used for the iterator, which e.g. may contain duplicate docs.
pub fn iter_docid_vals<'a>(
&'a self,
docs: &'a [u32],
accessor: &Column<T>,
) -> impl Iterator<Item = (DocId, T)> + '_ {
if accessor.index.get_cardinality().is_full() {
docs.iter().cloned().zip(self.val_cache.iter().cloned())
} else {
self.docid_cache
.iter()
.cloned()
.zip(self.val_cache.iter().cloned())
}
} }
} }
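The hunk above adds a fast path to `fetch_block` for full columns (every doc has exactly one value), skipping the doc-id to row-id translation that optional and multivalued columns still need. A minimal, self-contained sketch of that idea, using made-up toy types rather than the crate's real `ColumnBlockAccessor`/`Column` API:

// Toy types (not the crate's API): a "full" column stores exactly one value per
// doc, so a block fetch can index values directly by doc id, while an "optional"
// column must first translate doc ids to row ids.
enum ToyIndex {
    Full,
    Optional(Vec<u32>), // sorted doc ids that actually carry a value
}

struct ToyColumn {
    index: ToyIndex,
    values: Vec<u64>, // one entry per row, not per doc
}

impl ToyColumn {
    fn fetch_block(&self, docs: &[u32], out: &mut Vec<u64>) {
        out.clear();
        match &self.index {
            // Fast path: doc id == row id, no translation needed.
            ToyIndex::Full => out.extend(docs.iter().map(|&doc| self.values[doc as usize])),
            // Slow path: look up the row id of each doc that has a value.
            ToyIndex::Optional(docs_with_values) => {
                for &doc in docs {
                    if let Ok(row) = docs_with_values.binary_search(&doc) {
                        out.push(self.values[row]);
                    }
                }
            }
        }
    }
}

fn main() {
    let full = ToyColumn { index: ToyIndex::Full, values: vec![10, 20, 30, 40] };
    let mut block = Vec::new();
    full.fetch_block(&[0, 2, 3], &mut block);
    assert_eq!(block, vec![10, 30, 40]);

    // Docs 1 and 3 carry values 100 and 300; docs 0 and 2 have none.
    let optional = ToyColumn { index: ToyIndex::Optional(vec![1, 3]), values: vec![100, 300] };
    optional.fetch_block(&[0, 1, 2, 3], &mut block);
    assert_eq!(block, vec![100, 300]);
}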

View File

@@ -3,17 +3,17 @@ mod serialize;
use std::fmt::{self, Debug}; use std::fmt::{self, Debug};
use std::io::Write; use std::io::Write;
use std::ops::{Range, RangeInclusive}; use std::ops::{Deref, Range, RangeInclusive};
use std::sync::Arc; use std::sync::Arc;
use common::BinarySerializable; use common::BinarySerializable;
pub use dictionary_encoded::{BytesColumn, StrColumn}; pub use dictionary_encoded::{BytesColumn, StrColumn};
pub use serialize::{ pub use serialize::{
open_column_bytes, open_column_str, open_column_u128, open_column_u128_as_compact_u64, open_column_bytes, open_column_str, open_column_u128, open_column_u64,
open_column_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64, serialize_column_mappable_to_u128, serialize_column_mappable_to_u64,
}; };
use crate::column_index::{ColumnIndex, Set}; use crate::column_index::ColumnIndex;
use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal; use crate::column_values::monotonic_mapping::StrictlyMonotonicMappingToInternal;
use crate::column_values::{monotonic_map_column, ColumnValues}; use crate::column_values::{monotonic_map_column, ColumnValues};
use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId}; use crate::{Cardinality, DocId, EmptyColumnValues, MonotonicallyMappableToU64, RowId};
@@ -83,36 +83,10 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
self.values.max_value() self.values.max_value()
} }
#[inline]
pub fn first(&self, row_id: RowId) -> Option<T> { pub fn first(&self, row_id: RowId) -> Option<T> {
self.values_for_doc(row_id).next() self.values_for_doc(row_id).next()
} }
/// Load the first value for each docid in the provided slice.
#[inline]
pub fn first_vals(&self, docids: &[DocId], output: &mut [Option<T>]) {
match &self.index {
ColumnIndex::Empty { .. } => {}
ColumnIndex::Full => self.values.get_vals_opt(docids, output),
ColumnIndex::Optional(optional_index) => {
for (i, docid) in docids.iter().enumerate() {
output[i] = optional_index
.rank_if_exists(*docid)
.map(|rowid| self.values.get_val(rowid));
}
}
ColumnIndex::Multivalued(multivalued_index) => {
for (i, docid) in docids.iter().enumerate() {
let range = multivalued_index.range(*docid);
let is_empty = range.start == range.end;
if !is_empty {
output[i] = Some(self.values.get_val(range.start));
}
}
}
}
}
/// Translates a block of docids to row_ids. /// Translates a block of docids to row_ids.
/// ///
/// returns the row_ids and the matching docids on the same index /// returns the row_ids and the matching docids on the same index
@@ -131,8 +105,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ { pub fn values_for_doc(&self, doc_id: DocId) -> impl Iterator<Item = T> + '_ {
self.index self.value_row_ids(doc_id)
.value_row_ids(doc_id)
.map(|value_row_id: RowId| self.values.get_val(value_row_id)) .map(|value_row_id: RowId| self.values.get_val(value_row_id))
} }
@@ -174,6 +147,14 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
} }
} }
impl<T> Deref for Column<T> {
type Target = ColumnIndex;
fn deref(&self) -> &Self::Target {
&self.index
}
}
impl BinarySerializable for Cardinality { impl BinarySerializable for Cardinality {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> { fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
self.to_code().serialize(writer) self.to_code().serialize(writer)
@@ -195,7 +176,6 @@ struct FirstValueWithDefault<T: Copy> {
impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T> impl<T: PartialOrd + Debug + Send + Sync + Copy + 'static> ColumnValues<T>
for FirstValueWithDefault<T> for FirstValueWithDefault<T>
{ {
#[inline(always)]
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u32) -> T {
self.column.first(idx).unwrap_or(self.default_value) self.column.first(idx).unwrap_or(self.default_value)
} }
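`first_vals` above fills one `Option<T>` slot per requested doc id, with a different strategy per `ColumnIndex` variant. A hedged, self-contained sketch of the same per-cardinality dispatch using invented toy types (not the crate's API):

// Toy sketch of the `first_vals` contract: fill one output slot per doc id
// with that doc's first value, if any.
enum Index {
    Full,
    Optional(Vec<u32>),           // sorted doc ids that carry a value
    Multivalued(Vec<(u32, u32)>), // per doc: (start_row, end_row) into `values`
}

struct Col {
    index: Index,
    values: Vec<u64>,
}

impl Col {
    fn first_vals(&self, docids: &[u32], output: &mut [Option<u64>]) {
        assert_eq!(docids.len(), output.len());
        for (slot, &doc) in output.iter_mut().zip(docids) {
            *slot = match &self.index {
                // Every doc has exactly one value: doc id == row id.
                Index::Full => Some(self.values[doc as usize]),
                // At most one value: the position of the doc (if present) is its row id.
                Index::Optional(present) => present
                    .binary_search(&doc)
                    .ok()
                    .map(|row| self.values[row]),
                // Zero or more values: take the first row of a non-empty range.
                Index::Multivalued(ranges) => {
                    let (start, end) = ranges[doc as usize];
                    (start != end).then(|| self.values[start as usize])
                }
            };
        }
    }
}

fn main() {
    let col = Col {
        index: Index::Multivalued(vec![(0, 2), (2, 2), (2, 3)]),
        values: vec![7, 8, 9],
    };
    let mut out = vec![None; 3];
    col.first_vals(&[0, 1, 2], &mut out);
    assert_eq!(out, vec![Some(7), None, Some(9)]);
}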

View File

@@ -76,26 +76,6 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
}) })
} }
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> { pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4); let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap()); let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());

View File

@@ -140,7 +140,7 @@ mod tests {
#[test] #[test]
fn test_merge_column_index_optional_shuffle() { fn test_merge_column_index_optional_shuffle() {
let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into(); let optional_index: ColumnIndex = OptionalIndex::for_test(2, &[0]).into();
let column_indexes = [optional_index, ColumnIndex::Full]; let column_indexes = vec![optional_index, ColumnIndex::Full];
let row_addrs = vec![ let row_addrs = vec![
RowAddr { RowAddr {
segment_ord: 0u32, segment_ord: 0u32,

View File

@@ -111,7 +111,10 @@ fn stack_multivalued_indexes<'a>(
let mut last_row_id = 0; let mut last_row_id = 0;
let mut current_it = multivalued_indexes.next(); let mut current_it = multivalued_indexes.next();
Box::new(std::iter::from_fn(move || loop { Box::new(std::iter::from_fn(move || loop {
if let Some(row_id) = current_it.as_mut()?.next() { let Some(multivalued_index) = current_it.as_mut() else {
return None;
};
if let Some(row_id) = multivalued_index.next() {
last_row_id = offset + row_id; last_row_id = offset + row_id;
return Some(last_row_id); return Some(last_row_id);
} }

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge; mod merge;
mod multivalued_index; mod multivalued_index;
mod optional_index; mod optional_index;
@@ -42,10 +37,14 @@ impl From<MultiValueIndex> for ColumnIndex {
} }
impl ColumnIndex { impl ColumnIndex {
/// Returns the cardinality of the column index. #[inline]
/// pub fn is_multivalue(&self) -> bool {
/// By convention, if the column contains no docs, we consider that it is matches!(self, ColumnIndex::Multivalued(_))
/// full. }
// Returns the cardinality of the column index.
//
// By convention, if the column contains no docs, we consider that it is
// full.
#[inline] #[inline]
pub fn get_cardinality(&self) -> Cardinality { pub fn get_cardinality(&self) -> Cardinality {
match self { match self {
@@ -122,18 +121,18 @@ impl ColumnIndex {
} }
} }
pub fn docid_range_to_rowids(&self, doc_id_range: Range<DocId>) -> Range<RowId> { pub fn docid_range_to_rowids(&self, doc_id: Range<DocId>) -> Range<RowId> {
match self { match self {
ColumnIndex::Empty { .. } => 0..0, ColumnIndex::Empty { .. } => 0..0,
ColumnIndex::Full => doc_id_range, ColumnIndex::Full => doc_id,
ColumnIndex::Optional(optional_index) => { ColumnIndex::Optional(optional_index) => {
let row_start = optional_index.rank(doc_id_range.start); let row_start = optional_index.rank(doc_id.start);
let row_end = optional_index.rank(doc_id_range.end); let row_end = optional_index.rank(doc_id.end);
row_start..row_end row_start..row_end
} }
ColumnIndex::Multivalued(multivalued_index) => { ColumnIndex::Multivalued(multivalued_index) => {
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1; let end_docid = doc_id.end.min(multivalued_index.num_docs() - 1) + 1;
let start_docid = doc_id_range.start.min(end_docid); let start_docid = doc_id.start.min(end_docid);
let row_start = multivalued_index.start_index_column.get_val(start_docid); let row_start = multivalued_index.start_index_column.get_val(start_docid);
let row_end = multivalued_index.start_index_column.get_val(end_docid); let row_end = multivalued_index.start_index_column.get_val(end_docid);
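For the optional case above, `docid_range_to_rowids` reduces to two `rank` calls: the number of value-carrying docs strictly before each bound of the doc-id range. A tiny worked example with a naive stand-in for `rank` (the real implementation is block-based, see the next file):

// Naive stand-in for `rank`: count the value-carrying docs whose id is
// strictly below `doc`.
fn rank(docs_with_values: &[u32], doc: u32) -> u32 {
    docs_with_values.iter().filter(|&&d| d < doc).count() as u32
}

fn main() {
    let docs_with_values = [2u32, 5, 9]; // rows 0, 1, 2
    let doc_range = 3u32..10u32;
    // Optional case of `docid_range_to_rowids`: rank(start)..rank(end).
    let row_range =
        rank(&docs_with_values, doc_range.start)..rank(&docs_with_values, doc_range.end);
    assert_eq!(row_range, 1..3); // rows 1 and 2, i.e. the values of docs 5 and 9
}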

View File

@@ -21,6 +21,8 @@ const DENSE_BLOCK_THRESHOLD: u32 =
const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1; const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1;
const BLOCK_SIZE: RowId = 1 << 16;
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
struct BlockMeta { struct BlockMeta {
non_null_rows_before_block: u32, non_null_rows_before_block: u32,
@@ -107,8 +109,8 @@ struct RowAddr {
#[inline(always)] #[inline(always)]
fn row_addr_from_row_id(row_id: RowId) -> RowAddr { fn row_addr_from_row_id(row_id: RowId) -> RowAddr {
RowAddr { RowAddr {
block_id: (row_id / ELEMENTS_PER_BLOCK) as u16, block_id: (row_id / BLOCK_SIZE) as u16,
in_block_row_id: (row_id % ELEMENTS_PER_BLOCK) as u16, in_block_row_id: (row_id % BLOCK_SIZE) as u16,
} }
} }
@@ -183,13 +185,8 @@ impl Set<RowId> for OptionalIndex {
} }
} }
/// Any value doc_id is allowed.
/// In particular, doc_id = num_rows.
#[inline] #[inline]
fn rank(&self, doc_id: DocId) -> RowId { fn rank(&self, doc_id: DocId) -> RowId {
if doc_id >= self.num_docs() {
return self.num_non_nulls();
}
let RowAddr { let RowAddr {
block_id, block_id,
in_block_row_id, in_block_row_id,
@@ -203,15 +200,13 @@ impl Set<RowId> for OptionalIndex {
block_meta.non_null_rows_before_block + block_offset_row_id block_meta.non_null_rows_before_block + block_offset_row_id
} }
/// Any value doc_id is allowed.
/// In particular, doc_id = num_rows.
#[inline] #[inline]
fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> { fn rank_if_exists(&self, doc_id: DocId) -> Option<RowId> {
let RowAddr { let RowAddr {
block_id, block_id,
in_block_row_id, in_block_row_id,
} = row_addr_from_row_id(doc_id); } = row_addr_from_row_id(doc_id);
let block_meta = *self.block_metas.get(block_id as usize)?; let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta); let block = self.block(block_meta);
let block_offset_row_id = match block { let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank_if_exists(in_block_row_id), Block::Dense(dense_block) => dense_block.rank_if_exists(in_block_row_id),
@@ -496,7 +491,7 @@ fn deserialize_optional_index_block_metadatas(
non_null_rows_before_block += num_non_null_rows; non_null_rows_before_block += num_non_null_rows;
} }
block_metas.resize( block_metas.resize(
((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize, ((num_rows + BLOCK_SIZE - 1) / BLOCK_SIZE) as usize,
BlockMeta { BlockMeta {
non_null_rows_before_block, non_null_rows_before_block,
start_byte_offset, start_byte_offset,
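The `RowAddr` decomposition used throughout this file is plain division and remainder by the block size; one side of the hunk names the constant `ELEMENTS_PER_BLOCK`, the other `BLOCK_SIZE`, both equal to u16::MAX + 1 = 65_536. A quick arithmetic check:

// A row id splits into a block id and an in-block id by division and remainder.
const ELEMENTS_PER_BLOCK: u32 = u16::MAX as u32 + 1;

fn main() {
    let row_id: u32 = 200_000;
    let block_id = (row_id / ELEMENTS_PER_BLOCK) as u16; // 200_000 / 65_536 = 3
    let in_block_row_id = (row_id % ELEMENTS_PER_BLOCK) as u16; // 200_000 - 3 * 65_536 = 3_392
    assert_eq!((block_id, in_block_row_id), (3, 3_392));
}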

View File

@@ -39,8 +39,7 @@ pub trait Set<T> {
/// ///
/// # Panics /// # Panics
/// ///
/// May panic if rank is greater or equal to the number of /// May panic if rank is greater than the number of elements in the Set.
/// elements in the Set.
fn select(&self, rank: T) -> T; fn select(&self, rank: T) -> T;
/// Creates a brand new select cursor. /// Creates a brand new select cursor.

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::io::{self, Write}; use std::io::{self, Write};
use common::BinarySerializable; use common::BinarySerializable;

View File

@@ -1,31 +1,8 @@
use proptest::prelude::*; use proptest::prelude::{any, prop, *};
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
use super::*; use super::*;
use crate::{ColumnarReader, ColumnarWriter, DynamicColumnHandle};
#[test]
fn test_optional_index_bug_2293() {
// tests for panic in docid_range_to_rowids for docid == num_docs
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK - 1);
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK);
test_optional_index_with_num_docs(ELEMENTS_PER_BLOCK + 1);
}
fn test_optional_index_with_num_docs(num_docs: u32) {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(100, "score", 80i64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_docs, None, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
assert_eq!(cols.len(), 1);
let col = cols[0].open().unwrap();
col.column_index().docid_range_to_rowids(0..num_docs);
}
#[test] #[test]
fn test_dense_block_threshold() { fn test_dense_block_threshold() {
@@ -58,7 +35,7 @@ proptest! {
#[test] #[test]
fn test_with_random_sets_simple() { fn test_with_random_sets_simple() {
let vals = 10..ELEMENTS_PER_BLOCK * 2; let vals = 10..BLOCK_SIZE * 2;
let mut out: Vec<u8> = Vec::new(); let mut out: Vec<u8> = Vec::new();
serialize_optional_index(&vals, 100, &mut out).unwrap(); serialize_optional_index(&vals, 100, &mut out).unwrap();
let null_index = open_optional_index(OwnedBytes::new(out)).unwrap(); let null_index = open_optional_index(OwnedBytes::new(out)).unwrap();
@@ -194,7 +171,7 @@ fn test_optional_index_rank() {
test_optional_index_rank_aux(&[0u32, 1u32]); test_optional_index_rank_aux(&[0u32, 1u32]);
let mut block = Vec::new(); let mut block = Vec::new();
block.push(3u32); block.push(3u32);
block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
test_optional_index_rank_aux(&block); test_optional_index_rank_aux(&block);
} }
@@ -208,8 +185,8 @@ fn test_optional_index_iter_empty_one() {
fn test_optional_index_iter_dense_block() { fn test_optional_index_iter_dense_block() {
let mut block = Vec::new(); let mut block = Vec::new();
block.push(3u32); block.push(3u32);
block.extend((0..ELEMENTS_PER_BLOCK).map(|i| i + ELEMENTS_PER_BLOCK + 1)); block.extend((0..BLOCK_SIZE).map(|i| i + BLOCK_SIZE + 1));
test_optional_index_iter_aux(&block, 3 * ELEMENTS_PER_BLOCK); test_optional_index_iter_aux(&block, 3 * BLOCK_SIZE);
} }
#[test] #[test]
@@ -238,12 +215,12 @@ mod bench {
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES) let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio)) .map(|_| rng.gen_bool(fill_ratio))
.enumerate() .enumerate()
.filter(|(_pos, val)| *val) .filter(|(pos, val)| *val)
.map(|(pos, _)| pos as RowId) .map(|(pos, _)| pos as RowId)
.collect(); .collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap(); serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap() codec
} }
fn random_range_iterator( fn random_range_iterator(
@@ -265,7 +242,7 @@ mod bench {
} }
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> { fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0; let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32; let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1; let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation) random_range_iterator(0, num_values, step_size, deviation)

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
} }
} }
/// Serialize a column index.
pub fn serialize_column_index( pub fn serialize_column_index(
column_index: SerializableColumnIndex, column_index: SerializableColumnIndex,
output: &mut impl Write, output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes) Ok(column_index_num_bytes)
} }
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> { pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() { if bytes.is_empty() {
return Err(io::Error::new( return Err(io::Error::new(

View File

@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) merge_row_order: &'a MergeRowOrder, pub(crate) merge_row_order: &'a MergeRowOrder,
} }
impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> { impl<'a, T: Copy + PartialOrd + Debug> Iterable<T> for MergedColumnValues<'a, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> { fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order { match self.merge_row_order {
MergeRowOrder::Stack(_) => Box::new( MergeRowOrder::Stack(_) => Box::new(

View File

@@ -10,7 +10,6 @@ use std::fmt::Debug;
use std::ops::{Range, RangeInclusive}; use std::ops::{Range, RangeInclusive};
use std::sync::Arc; use std::sync::Arc;
use downcast_rs::DowncastSync;
pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn}; pub use monotonic_mapping::{MonotonicallyMappableToU64, StrictlyMonotonicFn};
pub use monotonic_mapping_u128::MonotonicallyMappableToU128; pub use monotonic_mapping_u128::MonotonicallyMappableToU128;
@@ -26,10 +25,7 @@ mod monotonic_column;
pub(crate) use merge::MergedColumnValues; pub(crate) use merge::MergedColumnValues;
pub use stats::ColumnStats; pub use stats::ColumnStats;
pub use u128_based::{ pub use u128_based::{open_u128_mapped, serialize_column_values_u128};
open_u128_as_compact_u64, open_u128_mapped, serialize_column_values_u128,
CompactSpaceU64Accessor,
};
pub use u64_based::{ pub use u64_based::{
load_u64_based_column_values, serialize_and_load_u64_based_column_values, load_u64_based_column_values, serialize_and_load_u64_based_column_values,
serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES, serialize_u64_based_column_values, CodecType, ALL_U64_CODEC_TYPES,
@@ -45,7 +41,7 @@ use crate::RowId;
/// ///
/// Any methods with a default and specialized implementation need to be called in the /// Any methods with a default and specialized implementation need to be called in the
/// wrappers that implement the trait: Arc and MonotonicMappingColumn /// wrappers that implement the trait: Arc and MonotonicMappingColumn
pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync { pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync {
/// Return the value associated with the given idx. /// Return the value associated with the given idx.
/// ///
/// This accessor should return as fast as possible. /// This accessor should return as fast as possible.
@@ -72,40 +68,11 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
out_x4[3] = self.get_val(idx_x4[3]); out_x4[3] = self.get_val(idx_x4[3]);
} }
let out_and_idx_chunks = output let step_size = 4;
.chunks_exact_mut(4) let cutoff = indexes.len() - indexes.len() % step_size;
.into_remainder()
.iter_mut()
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
*out = self.get_val(*idx);
}
}
/// Allows to push down multiple fetch calls, to avoid dynamic dispatch overhead. for idx in cutoff..indexes.len() {
/// The slightly weird `Option<T>` in output allows pushdown to full columns. output[idx] = self.get_val(indexes[idx]);
///
/// idx and output should have the same length
///
/// # Panics
///
/// May panic if `idx` is greater than the column length.
fn get_vals_opt(&self, indexes: &[u32], output: &mut [Option<T>]) {
assert!(indexes.len() == output.len());
let out_and_idx_chunks = output.chunks_exact_mut(4).zip(indexes.chunks_exact(4));
for (out_x4, idx_x4) in out_and_idx_chunks {
out_x4[0] = Some(self.get_val(idx_x4[0]));
out_x4[1] = Some(self.get_val(idx_x4[1]));
out_x4[2] = Some(self.get_val(idx_x4[2]));
out_x4[3] = Some(self.get_val(idx_x4[3]));
}
let out_and_idx_chunks = output
.chunks_exact_mut(4)
.into_remainder()
.iter_mut()
.zip(indexes.chunks_exact(4).remainder());
for (out, idx) in out_and_idx_chunks {
*out = Some(self.get_val(*idx));
} }
} }
@@ -134,7 +101,7 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
row_id_hits: &mut Vec<RowId>, row_id_hits: &mut Vec<RowId>,
) { ) {
let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals()); let row_id_range = row_id_range.start..row_id_range.end.min(self.num_vals());
for idx in row_id_range { for idx in row_id_range.start..row_id_range.end {
let val = self.get_val(idx); let val = self.get_val(idx);
if value_range.contains(&val) { if value_range.contains(&val) {
row_id_hits.push(idx); row_id_hits.push(idx);
@@ -172,7 +139,6 @@ pub trait ColumnValues<T: PartialOrd = u64>: Send + Sync + DowncastSync {
Box::new((0..self.num_vals()).map(|idx| self.get_val(idx))) Box::new((0..self.num_vals()).map(|idx| self.get_val(idx)))
} }
} }
downcast_rs::impl_downcast!(sync ColumnValues<T> where T: PartialOrd);
/// Empty column of values. /// Empty column of values.
pub struct EmptyColumnValues; pub struct EmptyColumnValues;
@@ -195,17 +161,12 @@ impl<T: PartialOrd + Default> ColumnValues<T> for EmptyColumnValues {
} }
} }
impl<T: Copy + PartialOrd + Debug + 'static> ColumnValues<T> for Arc<dyn ColumnValues<T>> { impl<T: Copy + PartialOrd + Debug> ColumnValues<T> for Arc<dyn ColumnValues<T>> {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> T { fn get_val(&self, idx: u32) -> T {
self.as_ref().get_val(idx) self.as_ref().get_val(idx)
} }
#[inline(always)]
fn get_vals_opt(&self, indexes: &[u32], output: &mut [Option<T>]) {
self.as_ref().get_vals_opt(indexes, output)
}
#[inline(always)] #[inline(always)]
fn min_value(&self) -> T { fn min_value(&self) -> T {
self.as_ref().min_value() self.as_ref().min_value()
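The batched `get_vals`/`get_vals_opt` implementations above walk the index slice in chunks of four and then fill the `len % 4` tail from the iterators' remainders. A standalone sketch of that pattern over a plain slice (not the trait object used by the crate):

// Process indexes four at a time, then handle the trailing partial chunk.
fn get_vals_opt(values: &[u64], indexes: &[u32], output: &mut [Option<u64>]) {
    assert_eq!(indexes.len(), output.len());
    for (out_x4, idx_x4) in output.chunks_exact_mut(4).zip(indexes.chunks_exact(4)) {
        out_x4[0] = Some(values[idx_x4[0] as usize]);
        out_x4[1] = Some(values[idx_x4[1] as usize]);
        out_x4[2] = Some(values[idx_x4[2] as usize]);
        out_x4[3] = Some(values[idx_x4[3] as usize]);
    }
    // `chunks_exact_mut` never yields the trailing partial chunk; it is exposed
    // through `into_remainder`, matched here with `chunks_exact(..).remainder()`.
    let tail = output
        .chunks_exact_mut(4)
        .into_remainder()
        .iter_mut()
        .zip(indexes.chunks_exact(4).remainder());
    for (out, &idx) in tail {
        *out = Some(values[idx as usize]);
    }
}

fn main() {
    let values: Vec<u64> = (0..10u64).map(|v| v * 100).collect();
    let indexes = [1u32, 3, 5, 7, 9];
    let mut output = vec![None; indexes.len()];
    get_vals_opt(&values, &indexes, &mut output);
    assert_eq!(output, vec![Some(100), Some(300), Some(500), Some(700), Some(900)]);
}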

View File

@@ -31,10 +31,10 @@ pub fn monotonic_map_column<C, T, Input, Output>(
monotonic_mapping: T, monotonic_mapping: T,
) -> impl ColumnValues<Output> ) -> impl ColumnValues<Output>
where where
C: ColumnValues<Input> + 'static, C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync + 'static, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Debug + Send + Sync + Clone + 'static, Input: PartialOrd + Debug + Send + Sync + Clone,
Output: PartialOrd + Debug + Send + Sync + Clone + 'static, Output: PartialOrd + Debug + Send + Sync + Clone,
{ {
MonotonicMappingColumn { MonotonicMappingColumn {
from_column, from_column,
@@ -45,10 +45,10 @@ where
impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input> impl<C, T, Input, Output> ColumnValues<Output> for MonotonicMappingColumn<C, T, Input>
where where
C: ColumnValues<Input> + 'static, C: ColumnValues<Input>,
T: StrictlyMonotonicFn<Input, Output> + Send + Sync + 'static, T: StrictlyMonotonicFn<Input, Output> + Send + Sync,
Input: PartialOrd + Send + Debug + Sync + Clone + 'static, Input: PartialOrd + Send + Debug + Sync + Clone,
Output: PartialOrd + Send + Debug + Sync + Clone + 'static, Output: PartialOrd + Send + Debug + Sync + Clone,
{ {
#[inline(always)] #[inline(always)]
fn get_val(&self, idx: u32) -> Output { fn get_val(&self, idx: u32) -> Output {
@@ -107,7 +107,7 @@ mod tests {
#[test] #[test]
fn test_monotonic_mapping_iter() { fn test_monotonic_mapping_iter() {
let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect(); let vals: Vec<u64> = (0..100u64).map(|el| el * 10).collect();
let col = VecColumn::from(vals); let col = VecColumn::from(&vals);
let mapped = monotonic_map_column( let mapped = monotonic_map_column(
col, col,
StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()), StrictlyMonotonicMappingInverter::from(StrictlyMonotonicMappingToInternal::<i64>::new()),

View File

@@ -22,7 +22,7 @@ mod build_compact_space;
use build_compact_space::get_compact_space; use build_compact_space::get_compact_space;
use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128}; use common::{BinarySerializable, CountingWriter, OwnedBytes, VInt, VIntU128};
use tantivy_bitpacker::{BitPacker, BitUnpacker}; use tantivy_bitpacker::{self, BitPacker, BitUnpacker};
use crate::column_values::ColumnValues; use crate::column_values::ColumnValues;
use crate::RowId; use crate::RowId;
@@ -148,7 +148,7 @@ impl CompactSpace {
.binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start) .binary_search_by_key(&compact, |range_mapping| range_mapping.compact_start)
// Correctness: Overflow. The first range starts at compact space 0, the error from // Correctness: Overflow. The first range starts at compact space 0, the error from
// binary search can never be 0 // binary search can never be 0
.unwrap_or_else(|e| e - 1); .map_or_else(|e| e - 1, |v| v);
let range_mapping = &self.ranges_mapping[pos]; let range_mapping = &self.ranges_mapping[pos];
let diff = compact - range_mapping.compact_start; let diff = compact - range_mapping.compact_start;
@@ -292,63 +292,6 @@ impl BinarySerializable for IPCodecParams {
} }
} }
/// Exposes the compact space compressed values as u64.
///
/// This allows faster access to the values, as u64 is faster to work with than u128.
/// It also allows to handle u128 values like u64, via the `open_u64_lenient` as a uniform
/// access interface.
///
/// When converting from the internal u64 to u128 `compact_to_u128` can be used.
pub struct CompactSpaceU64Accessor(CompactSpaceDecompressor);
impl CompactSpaceU64Accessor {
pub(crate) fn open(data: OwnedBytes) -> io::Result<CompactSpaceU64Accessor> {
let decompressor = CompactSpaceU64Accessor(CompactSpaceDecompressor::open(data)?);
Ok(decompressor)
}
/// Convert a compact space value to u128
pub fn compact_to_u128(&self, compact: u32) -> u128 {
self.0.compact_to_u128(compact)
}
}
impl ColumnValues<u64> for CompactSpaceU64Accessor {
#[inline]
fn get_val(&self, doc: u32) -> u64 {
let compact = self.0.get_compact(doc);
compact as u64
}
fn min_value(&self) -> u64 {
self.0.u128_to_compact(self.0.min_value()).unwrap() as u64
}
fn max_value(&self) -> u64 {
self.0.u128_to_compact(self.0.max_value()).unwrap() as u64
}
fn num_vals(&self) -> u32 {
self.0.params.num_vals
}
#[inline]
fn iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.0.iter_compact().map(|el| el as u64))
}
#[inline]
fn get_row_ids_for_value_range(
&self,
value_range: RangeInclusive<u64>,
position_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let value_range = self.0.compact_to_u128(*value_range.start() as u32)
..=self.0.compact_to_u128(*value_range.end() as u32);
self.0
.get_row_ids_for_value_range(value_range, position_range, positions)
}
}
impl ColumnValues<u128> for CompactSpaceDecompressor { impl ColumnValues<u128> for CompactSpaceDecompressor {
#[inline] #[inline]
fn get_val(&self, doc: u32) -> u128 { fn get_val(&self, doc: u32) -> u128 {
@@ -459,14 +402,9 @@ impl CompactSpaceDecompressor {
.map(|compact| self.compact_to_u128(compact)) .map(|compact| self.compact_to_u128(compact))
} }
#[inline]
pub fn get_compact(&self, idx: u32) -> u32 {
self.params.bit_unpacker.get(idx, &self.data) as u32
}
#[inline] #[inline]
pub fn get(&self, idx: u32) -> u128 { pub fn get(&self, idx: u32) -> u128 {
let compact = self.get_compact(idx); let compact = self.params.bit_unpacker.get(idx, &self.data) as u32;
self.compact_to_u128(compact) self.compact_to_u128(compact)
} }
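`compact_to_u128` in this file resolves a compact value by binary-searching the sorted `ranges_mapping` on `compact_start` and taking `e - 1` on a miss (safe because the first range starts at compact 0). A toy model of that lookup with invented numbers and a simplified `RangeMapping`:

// Each range knows where it starts in the compact space, so a compact id
// resolves with one binary search plus an offset.
struct RangeMapping {
    compact_start: u32, // first compact id covered by this range
    value_start: u128,  // value that `compact_start` maps back to
}

fn compact_to_value(ranges: &[RangeMapping], compact: u32) -> u128 {
    // On a miss, the error position points just past the containing range;
    // the first range starts at compact 0, so `e - 1` cannot underflow.
    let pos = ranges
        .binary_search_by_key(&compact, |r| r.compact_start)
        .unwrap_or_else(|e| e - 1);
    ranges[pos].value_start + (compact - ranges[pos].compact_start) as u128
}

fn main() {
    let ranges = [
        RangeMapping { compact_start: 0, value_start: 1_000 },
        RangeMapping { compact_start: 10, value_start: 5_000_000 },
    ];
    assert_eq!(compact_to_value(&ranges, 3), 1_003);      // 1_000 + 3
    assert_eq!(compact_to_value(&ranges, 12), 5_000_002); // 5_000_000 + 2
}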

View File

@@ -6,9 +6,7 @@ use std::sync::Arc;
mod compact_space; mod compact_space;
use common::{BinarySerializable, OwnedBytes, VInt}; use common::{BinarySerializable, OwnedBytes, VInt};
pub use compact_space::{ use compact_space::{CompactSpaceCompressor, CompactSpaceDecompressor};
CompactSpaceCompressor, CompactSpaceDecompressor, CompactSpaceU64Accessor,
};
use crate::column_values::monotonic_map_column; use crate::column_values::monotonic_map_column;
use crate::column_values::monotonic_mapping::{ use crate::column_values::monotonic_mapping::{
@@ -110,23 +108,6 @@ pub fn open_u128_mapped<T: MonotonicallyMappableToU128 + Debug>(
StrictlyMonotonicMappingToInternal::<T>::new().into(); StrictlyMonotonicMappingToInternal::<T>::new().into();
Ok(Arc::new(monotonic_map_column(reader, inverted))) Ok(Arc::new(monotonic_map_column(reader, inverted)))
} }
/// Returns the u64 representation of the u128 data.
/// The internal representation of the data as u64 is useful for faster processing.
///
/// In order to convert to u128 back cast to `CompactSpaceU64Accessor` and call
/// `compact_to_u128`.
///
/// # Notice
/// In case there are new codecs added, check for usages of `CompactSpaceDecompressorU64` and
/// also handle the new codecs.
pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn ColumnValues<u64>>> {
let header = U128Header::deserialize(&mut bytes)?;
assert_eq!(header.codec_type, U128FastFieldCodecType::CompactSpace);
let reader = CompactSpaceU64Accessor::open(bytes)?;
Ok(Arc::new(reader))
}
#[cfg(test)] #[cfg(test)]
pub mod tests { pub mod tests {
use super::*; use super::*;

View File

@@ -63,6 +63,7 @@ impl ColumnValues for BitpackedReader {
fn get_val(&self, doc: u32) -> u64 { fn get_val(&self, doc: u32) -> u64 {
self.stats.min_value + self.stats.gcd.get() * self.bit_unpacker.get(doc, &self.data) self.stats.min_value + self.stats.gcd.get() * self.bit_unpacker.get(doc, &self.data)
} }
#[inline] #[inline]
fn min_value(&self) -> u64 { fn min_value(&self) -> u64 {
self.stats.min_value self.stats.min_value

View File

@@ -63,10 +63,7 @@ impl BlockwiseLinearEstimator {
if self.block.is_empty() { if self.block.is_empty() {
return; return;
} }
let column = VecColumn::from(std::mem::take(&mut self.block)); let line = Line::train(&VecColumn::from(&self.block));
let line = Line::train(&column);
self.block = column.into();
let mut max_value = 0u64; let mut max_value = 0u64;
for (i, buffer_val) in self.block.iter().enumerate() { for (i, buffer_val) in self.block.iter().enumerate() {
let interpolated_val = line.eval(i as u32); let interpolated_val = line.eval(i as u32);
@@ -128,7 +125,7 @@ impl ColumnCodecEstimator for BlockwiseLinearEstimator {
*buffer_val = gcd_divider.divide(*buffer_val - stats.min_value); *buffer_val = gcd_divider.divide(*buffer_val - stats.min_value);
} }
let line = Line::train(&VecColumn::from(buffer.to_vec())); let line = Line::train(&VecColumn::from(&buffer));
assert!(!buffer.is_empty()); assert!(!buffer.is_empty());

View File

@@ -184,7 +184,7 @@ mod tests {
} }
fn test_eval_max_err(ys: &[u64]) -> Option<u64> { fn test_eval_max_err(ys: &[u64]) -> Option<u64> {
let line = Line::train(&VecColumn::from(ys.to_vec())); let line = Line::train(&VecColumn::from(&ys));
ys.iter() ys.iter()
.enumerate() .enumerate()
.map(|(x, y)| y.wrapping_sub(line.eval(x as u32))) .map(|(x, y)| y.wrapping_sub(line.eval(x as u32)))

View File

@@ -173,9 +173,7 @@ impl LinearCodecEstimator {
fn collect_before_line_estimation(&mut self, value: u64) { fn collect_before_line_estimation(&mut self, value: u64) {
self.block.push(value); self.block.push(value);
if self.block.len() == LINE_ESTIMATION_BLOCK_LEN { if self.block.len() == LINE_ESTIMATION_BLOCK_LEN {
let column = VecColumn::from(std::mem::take(&mut self.block)); let line = Line::train(&VecColumn::from(&self.block));
let line = Line::train(&column);
self.block = column.into();
let block = std::mem::take(&mut self.block); let block = std::mem::take(&mut self.block);
for val in block { for val in block {
self.collect_after_line_estimation(&line, val); self.collect_after_line_estimation(&line, val);

View File

@@ -1,4 +1,5 @@
use proptest::prelude::*; use proptest::prelude::*;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest}; use proptest::{prop_oneof, proptest};
#[test] #[test]

View File

@@ -4,14 +4,14 @@ use tantivy_bitpacker::minmax;
use crate::ColumnValues; use crate::ColumnValues;
/// VecColumn provides `Column` over a `Vec<T>`. /// VecColumn provides `Column` over a slice.
pub struct VecColumn<T = u64> { pub struct VecColumn<'a, T = u64> {
pub(crate) values: Vec<T>, pub(crate) values: &'a [T],
pub(crate) min_value: T, pub(crate) min_value: T,
pub(crate) max_value: T, pub(crate) max_value: T,
} }
impl<T: Copy + PartialOrd + Send + Sync + Debug + 'static> ColumnValues<T> for VecColumn<T> { impl<'a, T: Copy + PartialOrd + Send + Sync + Debug> ColumnValues<T> for VecColumn<'a, T> {
fn get_val(&self, position: u32) -> T { fn get_val(&self, position: u32) -> T {
self.values[position as usize] self.values[position as usize]
} }
@@ -37,8 +37,11 @@ impl<T: Copy + PartialOrd + Send + Sync + Debug + 'static> ColumnValues<T> for V
} }
} }
impl<T: Copy + PartialOrd + Default> From<Vec<T>> for VecColumn<T> { impl<'a, T: Copy + PartialOrd + Default, V> From<&'a V> for VecColumn<'a, T>
fn from(values: Vec<T>) -> Self { where V: AsRef<[T]> + ?Sized
{
fn from(values: &'a V) -> Self {
let values = values.as_ref();
let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default(); let (min_value, max_value) = minmax(values.iter().copied()).unwrap_or_default();
Self { Self {
values, values,
@@ -47,8 +50,3 @@ impl<T: Copy + PartialOrd + Default> From<Vec<T>> for VecColumn<T> {
} }
} }
} }
impl From<VecColumn> for Vec<u64> {
fn from(column: VecColumn) -> Self {
column.values
}
}

View File

@@ -1,3 +1,7 @@
use std::collections::BTreeMap;
use itertools::Itertools;
use super::*; use super::*;
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId}; use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};

View File

@@ -269,8 +269,7 @@ impl StrOrBytesColumnWriter {
dictionaries: &mut [DictionaryBuilder], dictionaries: &mut [DictionaryBuilder],
arena: &mut MemoryArena, arena: &mut MemoryArena,
) { ) {
let unordered_id = let unordered_id = dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes);
dictionaries[self.dictionary_id as usize].get_or_allocate_id(bytes, arena);
self.column_writer.record(doc, unordered_id, arena); self.column_writer.record(doc, unordered_id, arena);
} }

View File

@@ -13,7 +13,9 @@ pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena}; use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::SerializableColumnIndex; use crate::column_index::SerializableColumnIndex;
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64}; use crate::column_values::{
ColumnValues, MonotonicallyMappableToU128, MonotonicallyMappableToU64, VecColumn,
};
use crate::columnar::column_type::ColumnType; use crate::columnar::column_type::ColumnType;
use crate::columnar::writer::column_writers::{ use crate::columnar::writer::column_writers::{
ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter, ColumnWriter, NumericalColumnWriter, StrOrBytesColumnWriter,
@@ -59,6 +61,22 @@ pub struct ColumnarWriter {
buffers: SpareBuffers, buffers: SpareBuffers,
} }
#[inline]
fn mutate_or_create_column<V, TMutator>(
arena_hash_map: &mut ArenaHashMap,
column_name: &str,
updater: TMutator,
) where
V: Copy + 'static,
TMutator: FnMut(Option<V>) -> V,
{
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
arena_hash_map.mutate_or_create(column_name.as_bytes(), updater);
}
impl ColumnarWriter { impl ColumnarWriter {
pub fn mem_usage(&self) -> usize { pub fn mem_usage(&self) -> usize {
self.arena.mem_usage() self.arena.mem_usage()
@@ -159,8 +177,9 @@ impl ColumnarWriter {
}, },
&mut self.dictionaries, &mut self.dictionaries,
); );
hash_map.mutate_or_create( mutate_or_create_column(
column_name.as_bytes(), hash_map,
column_name,
|column_opt: Option<StrOrBytesColumnWriter>| { |column_opt: Option<StrOrBytesColumnWriter>| {
let mut column_writer = if let Some(column_writer) = column_opt { let mut column_writer = if let Some(column_writer) = column_opt {
column_writer column_writer
@@ -175,21 +194,24 @@ impl ColumnarWriter {
); );
} }
ColumnType::Bool => { ColumnType::Bool => {
self.bool_field_hash_map.mutate_or_create( mutate_or_create_column(
column_name.as_bytes(), &mut self.bool_field_hash_map,
column_name,
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(), |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
); );
} }
ColumnType::DateTime => { ColumnType::DateTime => {
self.datetime_field_hash_map.mutate_or_create( mutate_or_create_column(
column_name.as_bytes(), &mut self.datetime_field_hash_map,
column_name,
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(), |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
); );
} }
ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => { ColumnType::I64 | ColumnType::F64 | ColumnType::U64 => {
let numerical_type = column_type.numerical_type().unwrap(); let numerical_type = column_type.numerical_type().unwrap();
self.numerical_field_hash_map.mutate_or_create( mutate_or_create_column(
column_name.as_bytes(), &mut self.numerical_field_hash_map,
column_name,
|column_opt: Option<NumericalColumnWriter>| { |column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.force_numerical_type(numerical_type); column.force_numerical_type(numerical_type);
@@ -197,8 +219,9 @@ impl ColumnarWriter {
}, },
); );
} }
ColumnType::IpAddr => self.ip_addr_field_hash_map.mutate_or_create( ColumnType::IpAddr => mutate_or_create_column(
column_name.as_bytes(), &mut self.ip_addr_field_hash_map,
column_name,
|column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(), |column_opt: Option<ColumnWriter>| column_opt.unwrap_or_default(),
), ),
} }
@@ -211,8 +234,9 @@ impl ColumnarWriter {
numerical_value: T, numerical_value: T,
) { ) {
let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena); let (hash_map, arena) = (&mut self.numerical_field_hash_map, &mut self.arena);
hash_map.mutate_or_create( mutate_or_create_column(
column_name.as_bytes(), hash_map,
column_name,
|column_opt: Option<NumericalColumnWriter>| { |column_opt: Option<NumericalColumnWriter>| {
let mut column: NumericalColumnWriter = column_opt.unwrap_or_default(); let mut column: NumericalColumnWriter = column_opt.unwrap_or_default();
column.record_numerical_value(doc, numerical_value.into(), arena); column.record_numerical_value(doc, numerical_value.into(), arena);
@@ -222,6 +246,10 @@ impl ColumnarWriter {
} }
pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) { pub fn record_ip_addr(&mut self, doc: RowId, column_name: &str, ip_addr: Ipv6Addr) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena); let (hash_map, arena) = (&mut self.ip_addr_field_hash_map, &mut self.arena);
hash_map.mutate_or_create( hash_map.mutate_or_create(
column_name.as_bytes(), column_name.as_bytes(),
@@ -235,30 +263,24 @@ impl ColumnarWriter {
pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) { pub fn record_bool(&mut self, doc: RowId, column_name: &str, val: bool) {
let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena); let (hash_map, arena) = (&mut self.bool_field_hash_map, &mut self.arena);
hash_map.mutate_or_create( mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
column_name.as_bytes(), let mut column: ColumnWriter = column_opt.unwrap_or_default();
|column_opt: Option<ColumnWriter>| { column.record(doc, val, arena);
let mut column: ColumnWriter = column_opt.unwrap_or_default(); column
column.record(doc, val, arena); });
column
},
);
} }
pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) { pub fn record_datetime(&mut self, doc: RowId, column_name: &str, datetime: common::DateTime) {
let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena); let (hash_map, arena) = (&mut self.datetime_field_hash_map, &mut self.arena);
hash_map.mutate_or_create( mutate_or_create_column(hash_map, column_name, |column_opt: Option<ColumnWriter>| {
column_name.as_bytes(), let mut column: ColumnWriter = column_opt.unwrap_or_default();
|column_opt: Option<ColumnWriter>| { column.record(
let mut column: ColumnWriter = column_opt.unwrap_or_default(); doc,
column.record( NumericalValue::I64(datetime.into_timestamp_nanos()),
doc, arena,
NumericalValue::I64(datetime.into_timestamp_nanos()), );
arena, column
); });
column
},
);
} }
pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) { pub fn record_str(&mut self, doc: RowId, column_name: &str, value: &str) {
@@ -283,6 +305,10 @@ impl ColumnarWriter {
} }
pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) { pub fn record_bytes(&mut self, doc: RowId, column_name: &str, value: &[u8]) {
assert!(
!column_name.as_bytes().contains(&0u8),
"key may not contain the 0 byte"
);
let (hash_map, arena, dictionaries) = ( let (hash_map, arena, dictionaries) = (
&mut self.bytes_field_hash_map, &mut self.bytes_field_hash_map,
&mut self.arena, &mut self.arena,
@@ -312,7 +338,7 @@ impl ColumnarWriter {
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map .numerical_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| { .map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter = let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr); self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into(); let column_type = numerical_column_writer.numerical_type().into();
@@ -322,27 +348,27 @@ impl ColumnarWriter {
columns.extend( columns.extend(
self.bytes_field_hash_map self.bytes_field_hash_map
.iter() .iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)), .map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
); );
columns.extend( columns.extend(
self.str_field_hash_map self.str_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::Str, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
); );
columns.extend( columns.extend(
self.bool_field_hash_map self.bool_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::Bool, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
); );
columns.extend( columns.extend(
self.ip_addr_field_hash_map self.ip_addr_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::IpAddr, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
); );
columns.extend( columns.extend(
self.datetime_field_hash_map self.datetime_field_hash_map
.iter() .iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)), .map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
); );
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type)); columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
@@ -411,7 +437,6 @@ impl ColumnarWriter {
&mut symbol_byte_buffer, &mut symbol_byte_buffer,
), ),
buffers, buffers,
&self.arena,
&mut column_serializer, &mut column_serializer,
)?; )?;
column_serializer.finalize()?; column_serializer.finalize()?;
@@ -465,7 +490,6 @@ impl ColumnarWriter {
// Serialize [Dictionary, Column, dictionary num bytes U32::LE] // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
// Column: [Column Index, Column Values, column index num bytes U32::LE] // Column: [Column Index, Column Values, column index num bytes U32::LE]
#[allow(clippy::too_many_arguments)]
fn serialize_bytes_or_str_column( fn serialize_bytes_or_str_column(
cardinality: Cardinality, cardinality: Cardinality,
num_docs: RowId, num_docs: RowId,
@@ -473,7 +497,6 @@ fn serialize_bytes_or_str_column(
dictionary_builder: &DictionaryBuilder, dictionary_builder: &DictionaryBuilder,
operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>, operation_it: impl Iterator<Item = ColumnOperation<UnorderedId>>,
buffers: &mut SpareBuffers, buffers: &mut SpareBuffers,
arena: &MemoryArena,
wrt: impl io::Write, wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()> {
let SpareBuffers { let SpareBuffers {
@@ -482,8 +505,7 @@ fn serialize_bytes_or_str_column(
.. ..
} = buffers; } = buffers;
let mut counting_writer = CountingWriter::wrap(wrt); let mut counting_writer = CountingWriter::wrap(wrt);
let term_id_mapping: TermIdMapping = let term_id_mapping: TermIdMapping = dictionary_builder.serialize(&mut counting_writer)?;
dictionary_builder.serialize(arena, &mut counting_writer)?;
let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32; let dictionary_num_bytes: u32 = counting_writer.written_bytes() as u32;
let mut wrt = counting_writer.finish(); let mut wrt = counting_writer.finish();
let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| { let operation_iterator = operation_it.map(|symbol: ColumnOperation<UnorderedId>| {
@@ -619,7 +641,10 @@ fn send_to_serialize_column_mappable_to_u128<
value_index_builders: &mut PreallocatedIndexBuilders, value_index_builders: &mut PreallocatedIndexBuilders,
values: &mut Vec<T>, values: &mut Vec<T>,
mut wrt: impl io::Write, mut wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()>
where
for<'a> VecColumn<'a, T>: ColumnValues<T>,
{
values.clear(); values.clear();
// TODO: split index and values // TODO: split index and values
let serializable_column_index = match cardinality { let serializable_column_index = match cardinality {
@@ -672,7 +697,10 @@ fn send_to_serialize_column_mappable_to_u64(
value_index_builders: &mut PreallocatedIndexBuilders, value_index_builders: &mut PreallocatedIndexBuilders,
values: &mut Vec<u64>, values: &mut Vec<u64>,
mut wrt: impl io::Write, mut wrt: impl io::Write,
) -> io::Result<()> { ) -> io::Result<()>
where
for<'a> VecColumn<'a, u64>: ColumnValues<u64>,
{
values.clear(); values.clear();
let serializable_column_index = match cardinality { let serializable_column_index = match cardinality {
Cardinality::Full => { Cardinality::Full => {

View File

@@ -18,12 +18,7 @@ pub struct ColumnarSerializer<W: io::Write> {
/// code. /// code.
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) { fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
buffer.clear(); buffer.clear();
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path. buffer.extend_from_slice(key);
if key.contains(&0u8) {
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
} else {
buffer.extend_from_slice(key);
}
buffer.push(0u8); buffer.push(0u8);
buffer.push(column_type.to_code()); buffer.push(column_type.to_code());
} }
@@ -101,13 +96,14 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::columnar::column_type::ColumnType;
#[test] #[test]
fn test_prepare_key_bytes() { fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec(); let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Str, &mut buffer); prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12); assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root0child"); assert_eq!(&buffer[..10], b"root\0child");
assert_eq!(buffer[10], 0u8); assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Str.to_code()); assert_eq!(buffer[11], ColumnType::Str.to_code());
} }
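The `prepare_key` variant shown in this hunk rewrites embedded 0 bytes to the character `'0'`, since a 0 byte is reserved as the end-of-path marker, then appends that marker and the column type code. A self-contained copy of that logic (the type code value used in the example is arbitrary):

// Rewrite 0 bytes, then append the path terminator and the column type code.
fn prepare_key(key: &[u8], column_type_code: u8, buffer: &mut Vec<u8>) {
    buffer.clear();
    if key.contains(&0u8) {
        buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
    } else {
        buffer.extend_from_slice(key);
    }
    buffer.push(0u8); // end-of-path marker
    buffer.push(column_type_code);
}

fn main() {
    let mut buffer = Vec::new();
    prepare_key(b"root\0child", 1u8, &mut buffer);
    assert_eq!(&buffer[..10], b"root0child"); // the embedded 0 byte was rewritten
    assert_eq!(buffer[10], 0u8);
    assert_eq!(buffer[11], 1u8);
}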

View File

@@ -1,7 +1,7 @@
use std::io; use std::io;
use fnv::FnvHashMap;
use sstable::SSTable; use sstable::SSTable;
use stacker::{MemoryArena, SharedArenaHashMap};
pub(crate) struct TermIdMapping { pub(crate) struct TermIdMapping {
unordered_to_ord: Vec<OrderedId>, unordered_to_ord: Vec<OrderedId>,
@@ -31,38 +31,29 @@ pub struct OrderedId(pub u32);
/// mapping. /// mapping.
#[derive(Default)] #[derive(Default)]
pub(crate) struct DictionaryBuilder { pub(crate) struct DictionaryBuilder {
dict: SharedArenaHashMap, dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
} }
impl DictionaryBuilder { impl DictionaryBuilder {
/// Get or allocate an unordered id. /// Get or allocate an unordered id.
/// (This ID is simply an auto-incremented id.) /// (This ID is simply an auto-incremented id.)
pub fn get_or_allocate_id(&mut self, term: &[u8], arena: &mut MemoryArena) -> UnorderedId { pub fn get_or_allocate_id(&mut self, term: &[u8]) -> UnorderedId {
let next_id = self.dict.len() as u32; if let Some(term_id) = self.dict.get(term) {
let unordered_id = self return *term_id;
.dict }
.mutate_or_create(term, arena, |unordered_id: Option<u32>| { let new_id = UnorderedId(self.dict.len() as u32);
if let Some(unordered_id) = unordered_id { self.dict.insert(term.to_vec(), new_id);
unordered_id self.memory_consumption += term.len();
} else { self.memory_consumption += 40; // Term Metadata + HashMap overhead
next_id new_id
}
});
UnorderedId(unordered_id)
} }
/// Serialize the dictionary into an fst, and returns the /// Serialize the dictionary into an fst, and returns the
/// `UnorderedId -> TermOrdinal` map. /// `UnorderedId -> TermOrdinal` map.
pub fn serialize<'a, W: io::Write + 'a>( pub fn serialize<'a, W: io::Write + 'a>(&self, wrt: &mut W) -> io::Result<TermIdMapping> {
&self, let mut terms: Vec<(&[u8], UnorderedId)> =
arena: &MemoryArena, self.dict.iter().map(|(k, v)| (k.as_slice(), *v)).collect();
wrt: &mut W,
) -> io::Result<TermIdMapping> {
let mut terms: Vec<(&[u8], UnorderedId)> = self
.dict
.iter(arena)
.map(|(k, v)| (k, arena.read(v)))
.collect();
terms.sort_unstable_by_key(|(key, _)| *key); terms.sort_unstable_by_key(|(key, _)| *key);
// TODO Remove the allocation. // TODO Remove the allocation.
let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()]; let mut unordered_to_ord: Vec<OrderedId> = vec![OrderedId(0u32); terms.len()];
@@ -77,7 +68,7 @@ impl DictionaryBuilder {
} }
pub(crate) fn mem_usage(&self) -> usize { pub(crate) fn mem_usage(&self) -> usize {
self.dict.mem_usage() self.memory_consumption
} }
} }
@@ -87,13 +78,12 @@ mod tests {
#[test] #[test]
fn test_dictionary_builder() { fn test_dictionary_builder() {
let mut arena = MemoryArena::default();
let mut dictionary_builder = DictionaryBuilder::default(); let mut dictionary_builder = DictionaryBuilder::default();
let hello_uid = dictionary_builder.get_or_allocate_id(b"hello", &mut arena); let hello_uid = dictionary_builder.get_or_allocate_id(b"hello");
let happy_uid = dictionary_builder.get_or_allocate_id(b"happy", &mut arena); let happy_uid = dictionary_builder.get_or_allocate_id(b"happy");
let tax_uid = dictionary_builder.get_or_allocate_id(b"tax", &mut arena); let tax_uid = dictionary_builder.get_or_allocate_id(b"tax");
let mut buffer = Vec::new(); let mut buffer = Vec::new();
let id_mapping = dictionary_builder.serialize(&arena, &mut buffer).unwrap(); let id_mapping = dictionary_builder.serialize(&mut buffer).unwrap();
assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1)); assert_eq!(id_mapping.to_ord(hello_uid), OrderedId(1));
assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0)); assert_eq!(id_mapping.to_ord(happy_uid), OrderedId(0));
assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2)); assert_eq!(id_mapping.to_ord(tax_uid), OrderedId(2));

View File

@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn}; use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn}; use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType; use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType}; use crate::{Cardinality, ColumnIndex, NumericalType};
#[derive(Clone)] #[derive(Clone)]
pub enum DynamicColumn { pub enum DynamicColumn {
@@ -247,12 +247,7 @@ impl DynamicColumnHandle {
} }
/// Returns the `u64` fast field reader associated with `fields` of types /// Returns the `u64` fast field reader associated with `fields` of types
/// Str, u64, i64, f64, bool, ip, or datetime. /// Str, u64, i64, f64, bool, or datetime.
///
/// Notice that for IpAddr, the fastfield reader will return the u64 representation of the
/// IpAddr.
/// In order to convert to u128 back cast to `CompactSpaceU64Accessor` and call
/// `compact_to_u128`.
/// ///
/// If not, the fastfield reader will return the u64-value associated with the original /// If not, the fastfield reader will return the u64-value associated with the original
/// FastValue. /// FastValue.
@@ -263,10 +258,7 @@ impl DynamicColumnHandle {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?; let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column)) Ok(Some(column.term_ord_column))
} }
ColumnType::IpAddr => { ColumnType::IpAddr => Ok(None),
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
Ok(Some(column))
}
ColumnType::Bool ColumnType::Bool
| ColumnType::I64 | ColumnType::I64
| ColumnType::U64 | ColumnType::U64

View File

@@ -1,22 +1,3 @@
//! # Tantivy-Columnar
//!
//! `tantivy-columnar` provides columnar storage for tantivy.
//! The crate allows for efficient read operations on specific columns rather than entire records.
//!
//! ## Overview
//!
//! - **columnar**: Reading, writing, and merging multiple columns:
//! - **[ColumnarWriter]**: Makes it possible to create a new columnar.
//! - **[ColumnarReader]**: The ColumnarReader makes it possible to access a set of columns
//! associated with field names.
//! - **[merge_columnar]**: Contains the functionality to merge multiple ColumnarReaders or
//! segments into a single one.
//!
//! - **column**: A single column, which contains
//! - [column_index]: Resolves the rows for a document id. Manages the cardinality of the
//! column.
//! - [column_values]: Stores the values of a column in a dense format.
#![cfg_attr(all(feature = "unstable", test), feature(test))] #![cfg_attr(all(feature = "unstable", test), feature(test))]
#[cfg(test)] #[cfg(test)]
@@ -31,7 +12,7 @@ use std::io;
mod block_accessor; mod block_accessor;
mod column; mod column;
pub mod column_index; mod column_index;
pub mod column_values; pub mod column_values;
mod columnar; mod columnar;
mod dictionary; mod dictionary;
@@ -113,9 +94,6 @@ impl Cardinality {
pub fn is_multivalue(&self) -> bool { pub fn is_multivalue(&self) -> bool {
matches!(self, Cardinality::Multivalued) matches!(self, Cardinality::Multivalued)
} }
pub fn is_full(&self) -> bool {
matches!(self, Cardinality::Full)
}
pub(crate) fn to_code(self) -> u8 { pub(crate) fn to_code(self) -> u8 {
self as u8 self as u8
} }

View File

@@ -26,7 +26,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1); assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap(); let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73); assert_eq!(cols[0].num_bytes(), 87);
} }
#[test] #[test]
@@ -40,7 +40,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1); assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap(); let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1); assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 73); assert_eq!(cols[0].num_bytes(), 87);
} }
#[test] #[test]
@@ -330,9 +330,9 @@ fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
// A random column value // A random column value
fn column_value_strategy() -> impl Strategy<Value = ColumnValue> { fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
prop_oneof![ prop_oneof![
10 => string_strategy().prop_map(ColumnValue::Str), 10 => string_strategy().prop_map(|s| ColumnValue::Str(s)),
1 => bytes_strategy().prop_map(ColumnValue::Bytes), 1 => bytes_strategy().prop_map(|b| ColumnValue::Bytes(b)),
40 => num_strategy().prop_map(ColumnValue::Numerical), 40 => num_strategy().prop_map(|n| ColumnValue::Numerical(n)),
1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new( 1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new(
127, 127,
0, 0,
@@ -343,7 +343,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
0, 0,
ip_addr_byte ip_addr_byte
))), ))),
1 => any::<bool>().prop_map(ColumnValue::Bool), 1 => any::<bool>().prop_map(|b| ColumnValue::Bool(b)),
1 => (0_679_723_993i64..1_679_723_995i64) 1 => (0_679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) }) .prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
] ]
@@ -419,8 +419,8 @@ fn build_columnar_with_mapping(
columnar_writer columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer) .serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap(); .unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
ColumnarReader::open(buffer).unwrap() columnar_reader
} }
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader { fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
@@ -746,7 +746,7 @@ proptest! {
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into(); let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap(); crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect(); let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
@@ -772,7 +772,7 @@ fn test_columnar_merging_empty_columnar() {
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect(); columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }
@@ -809,7 +809,7 @@ fn test_columnar_merging_number_columns() {
.unwrap(); .unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap(); let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect(); columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]); let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar); assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
} }

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-common" name = "tantivy-common"
version = "0.7.0" version = "0.6.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"] authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT" license = "MIT"
edition = "2021" edition = "2021"
@@ -14,7 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies] [dependencies]
byteorder = "1.4.3" byteorder = "1.4.3"
ownedbytes = { version= "0.7", path="../ownedbytes" } ownedbytes = { version= "0.6", path="../ownedbytes" }
async-trait = "0.1" async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] } time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] } serde = { version = "1.0.136", features = ["derive"] }

View File

@@ -1,5 +1,6 @@
use std::convert::TryInto;
use std::io::Write; use std::io::Write;
use std::{fmt, io}; use std::{fmt, io, u64};
use ownedbytes::OwnedBytes; use ownedbytes::OwnedBytes;

View File

@@ -1,12 +1,11 @@
#![allow(deprecated)]
use std::fmt; use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339; use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset}; use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only /// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision. /// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive( #[derive(
@@ -25,6 +24,9 @@ pub enum DateTimePrecision {
Nanoseconds, Nanoseconds,
} }
#[deprecated(since = "0.20.0", note = "Use `DateTimePrecision` instead")]
pub type DatePrecision = DateTimePrecision;
/// A date/time value with nanosecond precision. /// A date/time value with nanosecond precision.
/// ///
/// This timestamp does not carry any explicit time zone information. /// This timestamp does not carry any explicit time zone information.
@@ -35,7 +37,7 @@ pub enum DateTimePrecision {
/// All constructors and conversions are provided as explicit /// All constructors and conversions are provided as explicit
/// functions and not by implementing any `From`/`Into` traits /// functions and not by implementing any `From`/`Into` traits
/// to prevent unintended usage. /// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] #[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DateTime { pub struct DateTime {
// Timestamp in nanoseconds. // Timestamp in nanoseconds.
pub(crate) timestamp_nanos: i64, pub(crate) timestamp_nanos: i64,
@@ -162,15 +164,3 @@ impl fmt::Debug for DateTime {
f.write_str(&utc_rfc3339) f.write_str(&utc_rfc3339)
} }
} }
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}
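Note: a hedged round-trip sketch for the `BinarySerializable` impl above, assuming the `tantivy_common` crate name and the `DateTime`/`BinarySerializable` re-exports shown later in this diff; the impl stores microsecond precision, so finer detail would be truncated.

    use tantivy_common::{BinarySerializable, DateTime};

    fn main() -> std::io::Result<()> {
        let dt = DateTime::from_timestamp_micros(1_666_666_666_000_000);
        let mut buf: Vec<u8> = Vec::new();
        dt.serialize(&mut buf)?;
        // Deserializing restores the same microsecond timestamp.
        let restored = DateTime::deserialize(&mut &buf[..])?;
        assert_eq!(dt, restored);
        Ok(())
    }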

View File

@@ -1,144 +0,0 @@
use crate::replace_in_place;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
/// Separates the json path and the value in
/// a JSON term binary representation.
pub const JSON_END_OF_PATH: u8 = 0u8;
pub const JSON_END_OF_PATH_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_END_OF_PATH]) };
/// `JsonPathWriter` builds flattened JSON paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}
impl JsonPathWriter {
pub fn with_expand_dots(expand_dots: bool) -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots,
}
}
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}
/// When expand_dots is enabled, a JSON object like
/// `{"k8s.node.id": 5}` is processed as if it were
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other hand, enabling that feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}
/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if self.indices.len() > 1 {
self.path.push(JSON_PATH_SEGMENT_SEP as char);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single-byte UTF-8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}
/// Appends the end-of-path marker to the JSON path.
#[inline]
pub fn set_end(&mut self) {
self.path.push_str(JSON_END_OF_PATH_STR);
}
/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}
/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}
/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}
impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();
writer.set_expand_dots(false);
writer.push("root");
assert_eq!(writer.as_str(), "root");
writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");
writer.pop();
assert_eq!(writer.as_str(), "root");
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");
writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
#[test]
fn test_json_path_expand_dots_enabled_pop_segment() {
let mut json_writer = JsonPathWriter::with_expand_dots(true);
json_writer.push("hello");
assert_eq!(json_writer.as_str(), "hello");
json_writer.push("color.hue");
assert_eq!(json_writer.as_str(), "hello\x01color\x01hue");
json_writer.pop();
assert_eq!(json_writer.as_str(), "hello");
}
}
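Note: a small usage sketch of the writer above, showing dot expansion and the end-of-path byte; it assumes the `tantivy_common::JsonPathWriter` re-export visible in the common crate's lib.rs hunk later in this diff.

    use tantivy_common::JsonPathWriter;

    fn main() {
        let mut writer = JsonPathWriter::with_expand_dots(true);
        writer.push("k8s.node");
        writer.push("id");
        // Dots are expanded into the 0x01 path-segment separator.
        assert_eq!(writer.as_str(), "k8s\u{1}node\u{1}id");
        // `set_end` appends the 0x00 end-of-path marker used in JSON term encoding.
        writer.set_end();
        assert_eq!(writer.as_str(), "k8s\u{1}node\u{1}id\u{0}");
    }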

View File

@@ -9,15 +9,15 @@ mod byte_count;
mod datetime; mod datetime;
pub mod file_slice; pub mod file_slice;
mod group_by; mod group_by;
pub mod json_path_writer;
mod serialize; mod serialize;
mod vint; mod vint;
mod writer; mod writer;
pub use bitset::*; pub use bitset::*;
pub use byte_count::ByteCount; pub use byte_count::ByteCount;
#[allow(deprecated)]
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision}; pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended; pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref}; pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize}; pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{ pub use vint::{
@@ -116,7 +116,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
/// ///
/// This function assumes that the needle is rarely contained in the byte string /// This function assumes that the needle is rarely contained in the byte string
/// and offers a fast path if the needle is not present. /// and offers a fast path if the needle is not present.
#[inline]
pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) { pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
if !bytes.contains(&needle) { if !bytes.contains(&needle) {
return; return;
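Note: a brief usage sketch for `replace_in_place`, assuming the `pub` function above is re-exported at the crate root as `tantivy_common::replace_in_place`.

    use tantivy_common::replace_in_place;

    fn main() {
        let mut bytes = b"a.b.c".to_vec();
        // Replace every '.' with the 0x01 path-segment separator, in place.
        replace_in_place(b'.', 1u8, &mut bytes);
        assert_eq!(bytes, b"a\x01b\x01c".to_vec());
    }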

View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write}; use std::io::{Read, Write};
use std::{fmt, io}; use std::{fmt, io};
@@ -250,47 +249,11 @@ impl BinarySerializable for String {
} }
} }
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
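Note: a small round-trip sketch for the `Cow<'_, str>` impl above, assuming `BinarySerializable` is imported from `tantivy_common`.

    use std::borrow::Cow;
    use tantivy_common::BinarySerializable;

    fn main() -> std::io::Result<()> {
        let value: Cow<str> = Cow::Borrowed("hello");
        let mut buf: Vec<u8> = Vec::new();
        // Serialized as a VInt length prefix followed by the UTF-8 bytes.
        value.serialize(&mut buf)?;
        let restored = <Cow<str>>::deserialize(&mut &buf[..])?;
        assert_eq!(restored, "hello");
        Ok(())
    }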
#[cfg(test)] #[cfg(test)]
pub mod test { pub mod test {
use super::*; use super::{VInt, *};
use crate::serialize::BinarySerializable;
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() { pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new(); let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap(); O::default().serialize(&mut buffer).unwrap();

View File

@@ -151,7 +151,7 @@ pub fn read_u32_vint_no_advance(data: &[u8]) -> (u32, usize) {
(result, vlen) (result, vlen)
} }
/// Write a `u32` as a vint payload. /// Write a `u32` as a vint payload.
pub fn write_u32_vint<W: io::Write + ?Sized>(val: u32, writer: &mut W) -> io::Result<()> { pub fn write_u32_vint<W: io::Write>(val: u32, writer: &mut W) -> io::Result<()> {
let mut buf = [0u8; 8]; let mut buf = [0u8; 8];
let data = serialize_vint_u32(val, &mut buf); let data = serialize_vint_u32(val, &mut buf);
writer.write_all(data) writer.write_all(data)
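Note: a quick sketch of the writer side (the `read_u32_vint_no_advance` shown above decodes the payload); it assumes `write_u32_vint` is among the `pub use vint::{...}` re-exports of `tantivy_common` shown earlier in this diff.

    use tantivy_common::write_u32_vint;

    fn main() -> std::io::Result<()> {
        let mut buf: Vec<u8> = Vec::new();
        // Writes the variable-length encoding of the value into any `io::Write`.
        write_u32_vint(300u32, &mut buf)?;
        assert!(!buf.is_empty());
        Ok(())
    }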

View File

@@ -12,7 +12,7 @@ use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector; use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery; use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST}; use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Create Schema // # Create Schema
@@ -132,10 +132,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>(); let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let mut num_indexed = 0; let mut num_indexed = 0;
for value in stream { for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?; let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
num_indexed += 1; num_indexed += 1;
if num_indexed > 4 { if num_indexed > 4 {

View File

@@ -15,18 +15,17 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// Normally you would use `MMapDirectory` instead to persist data on disk. // Let's create a temporary directory for the
// https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html // sake of this example
// But for this example, we will use a temporary directory `TempDir`.
let index_path = TempDir::new()?; let index_path = TempDir::new()?;
// # Defining the schema // # Defining the schema
// //
// The Tantivy index requires a schema. // The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index, // The schema declares which fields are in the index,
// and for each field, its type and "the way it should // and for each field, its type and "the way it should
// be indexed". // be indexed".
@@ -76,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`. // Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase // Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty. // throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents! // Let's index our documents!
// We first need a handle on the title and the body field. // We first need a handle on the title and the body field.
@@ -88,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap(); let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default(); let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text( old_man_doc.add_text(
body, body,
@@ -165,7 +164,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit. // will reload the index automatically after each commit.
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
// We now need to acquire a searcher. // We now need to acquire a searcher.
@@ -218,8 +217,8 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain // the document returned will only contain
// a title. // a title.
for (_score, doc_address) in top_docs { for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
// We can also get an explanation to understand // We can also get an explanation to understand

View File

@@ -11,10 +11,9 @@ use columnar::Column;
// --- // ---
// Importing tantivy... // Importing tantivy...
use tantivy::collector::{Collector, SegmentCollector}; use tantivy::collector::{Collector, SegmentCollector};
use tantivy::index::SegmentReader;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT}; use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score}; use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)] #[derive(Default)]
struct Stats { struct Stats {
@@ -143,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example. // this example.
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
product_name => "Super Broom 2000", product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \ product_description => "While it is ok for short distance travel, this broom \

View File

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer; use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
// //
// Here we use a buffer of 50MB per thread. Using a bigger // Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput. // memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \ body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs { for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
Ok(()) Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, Schema, Value, INDEXED, STORED, STRING}; use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
let opts = DateOptions::from(INDEXED) let opts = DateOptions::from(INDEXED)
.set_stored() .set_stored()
.set_fast() .set_fast()
.set_precision(tantivy::schema::DateTimePrecision::Seconds); .set_precision(tantivy::DateTimePrecision::Seconds);
// Add `occurred_at` date field type // Add `occurred_at` date field type
let occurred_at = schema_builder.add_date_field("occurred_at", opts); let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED); let event_type = schema_builder.add_text_field("event", STRING | STORED);
@@ -22,18 +22,16 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format // The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"occurred_at": "2022-06-22T12:53:50.53Z", "occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request" "event": "pull-request"
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"occurred_at": "2022-06-22T13:00:00.22Z", "occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment" "event": "comment"
@@ -60,15 +58,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?; let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1); assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs { for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
assert!(retrieved_doc assert!(matches!(
.get_first(occurred_at) retrieved_doc.get_first(occurred_at),
.unwrap() Some(Value::Date(_))
.as_value() ));
.as_datetime()
.is_some(),);
assert_eq!( assert_eq!(
retrieved_doc.to_json(&schema), schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"# r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
); );
} }

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::TermQuery; use tantivy::query::TermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter}; use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document // A simple helper function to fetch a single document
// given its id from our index. // given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn( fn extract_doc_given_isbn(
reader: &IndexReader, reader: &IndexReader,
isbn_term: &Term, isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> { ) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher(); let searcher = reader.searcher();
// This is the simplest query you can think of. // This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example. // Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default(); let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea"); old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!( index_writer.add_document(doc!(
isbn => "978-0099908401", isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled // Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap(); let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
frankenstein_doc_misspelled.to_json(&schema), schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
); );
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo! // No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap(); let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!( assert_eq!(
frankenstein_new_doc.to_json(&schema), schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#, r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
); );

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector; use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery}; use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example // Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?; let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to // For convenience, tantivy also comes with a macro to
// reduce the boilerplate above. // reduce the boilerplate above.

View File

@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery; use tantivy::query::BooleanQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader}; use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?; let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "Fried egg", title => "Fried egg",
@@ -51,7 +51,7 @@ fn main() -> tantivy::Result<()> {
let reader = index.reader()?; let reader = index.reader()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
{ {
let facets = [ let facets = vec![
Facet::from("/ingredient/egg"), Facet::from("/ingredient/egg"),
Facet::from("/ingredient/oil"), Facet::from("/ingredient/oil"),
Facet::from("/ingredient/garlic"), Facet::from("/ingredient/garlic"),
@@ -91,11 +91,13 @@ fn main() -> tantivy::Result<()> {
.iter() .iter()
.map(|(_, doc_id)| { .map(|(_, doc_id)| {
searcher searcher
.doc::<TantivyDocument>(*doc_id) .doc(*doc_id)
.unwrap() .unwrap()
.get_first(title) .get_first(title)
.and_then(|v| v.as_str().map(|el| el.to_string()))
.unwrap() .unwrap()
.as_text()
.unwrap()
.to_owned()
}) })
.collect(); .collect();
assert_eq!(titles, vec!["Fried egg", "Egg rolls"]); assert_eq!(titles, vec!["Fried egg", "Egg rolls"]);

View File

@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery; use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`. // Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase // Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty. // throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents! // Let's index our documents!
// We first need a handle on the title and the body field. // We first need a handle on the title and the body field.
@@ -123,7 +123,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit. // will reload the index automatically after each commit.
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
// We now need to acquire a searcher. // We now need to acquire a searcher.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 3); assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3); assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
// Note that the score is not lower for the fuzzy hit. // Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563 // There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
// score 1.0 doc {"title":["The Diary of Muadib"]} // score 1.0 doc {"title":["The Diary of Muadib"]}
// //
// score 1.0 doc {"title":["The Diary of a Young Girl"]} // score 1.0 doc {"title":["The Diary of a Young Girl"]}

View File

@@ -61,7 +61,7 @@ fn main() -> tantivy::Result<()> {
debris of the winters flooding; and sycamores with mottled, white, recumbent \ debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool" limbs and branches that arch over the pool"
))?; ))?;
println!("add doc {i} from thread 1 - opstamp {opstamp}"); println!("add doc {} from thread 1 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(20)); thread::sleep(Duration::from_millis(20));
} }
Result::<(), TantivyError>::Ok(()) Result::<(), TantivyError>::Ok(())
@@ -82,7 +82,7 @@ fn main() -> tantivy::Result<()> {
body => "Some great book description..." body => "Some great book description..."
))? ))?
}; };
println!("add doc {i} from thread 2 - opstamp {opstamp}"); println!("add doc {} from thread 2 - opstamp {}", i, opstamp);
thread::sleep(Duration::from_millis(10)); thread::sleep(Duration::from_millis(10));
} }
Result::<(), TantivyError>::Ok(()) Result::<(), TantivyError>::Ok(())

View File

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#; }"#;
// We can parse our document // We can parse our document
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?; let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued field are allowed, they are // Multi-valued field are allowed, they are
// expressed in JSON by an array. // expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"], "title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818 "year": 1818
}"#; }"#;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?; let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory. // Note that the schema is saved in your index directory.
// //

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count; use tantivy::collector::Count;
use tantivy::query::RangeQuery; use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED}; use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result}; use tantivy::{doc, Index, Result};
fn main() -> Result<()> { fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field // For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let reader = index.reader()?; let reader = index.reader()?;
{ {
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 { for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?; index_writer.add_document(doc!(year_field => year))?;
} }

View File

@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING}; use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -22,22 +22,20 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// ### IPv4 // ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as // Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it // `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6. // internally as IPv6.
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "192.168.0.33", "ip": "192.168.0.33",
"event_type": "login" "event_type": "login"
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "192.168.0.80", "ip": "192.168.0.80",
"event_type": "checkout" "event_type": "checkout"
@@ -46,8 +44,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
// ### IPv6 // ### IPv6
// Adding a document that contains an IPv6 address. // Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334", "ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout" "event_type": "checkout"

View File

@@ -7,11 +7,10 @@
// the list of documents containing a term, getting // the list of documents containing a term, getting
// its term frequency, and accessing its positions. // its term frequency, and accessing its positions.
use tantivy::postings::Postings;
// --- // ---
// Importing tantivy... // Importing tantivy...
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, TERMINATED}; use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the // We first create a schema for the sake of the
@@ -25,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?; let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?; index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?; index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?; index_writer.add_document(doc!(title => "The modern Promotheus"))?;

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs}; use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT}; use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument}; use tantivy::Index;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// # Defining the schema // # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"timestamp": "2022-02-22T23:20:50.53Z", "timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click", "event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#, }"#,
)?; )?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json( let doc = schema.parse_document(
&schema,
r#"{ r#"{
"timestamp": "2022-02-22T23:20:51.53Z", "timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click", "event_type": "click",

View File

@@ -1,7 +1,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result}; use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> Result<()> { fn main() -> Result<()> {
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!( index_writer.add_document(doc!(
title => "The Old Man and the Sea", title => "The Old Man and the Sea",
@@ -51,7 +51,7 @@ fn main() -> Result<()> {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -67,12 +67,8 @@ fn main() -> Result<()> {
let mut titles = top_docs let mut titles = top_docs
.into_iter() .into_iter()
.map(|(_score, doc_address)| { .map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?; let doc = searcher.doc(doc_address)?;
let title = doc let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
Ok(title) Ok(title)
}) })
.collect::<Result<Vec<_>>>()?; .collect::<Result<Vec<_>>>()?;

View File

@@ -13,7 +13,7 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery; use tantivy::query::TermQuery;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer}; use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy}; use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir; use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> { fn pre_tokenize_text(text: &str) -> Vec<Token> {
@@ -38,7 +38,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?; let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields // We can create a document manually, by setting the fields
// one by one in a Document object. // one by one in a Document object.
@@ -83,7 +83,7 @@ fn main() -> tantivy::Result<()> {
}] }]
}"#; }"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?; let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?; index_writer.add_document(short_man_doc)?;
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
let reader = index let reader = index
.reader_builder() .reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay) .reload_policy(ReloadPolicy::OnCommit)
.try_into()?; .try_into()?;
let searcher = reader.searcher(); let searcher = reader.searcher();
@@ -115,8 +115,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text // Note that the tokens are not stored along with the original text
// in the document store // in the document store
for (_score, doc_address) in top_docs { for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema)); println!("Document: {}", schema.to_json(&retrieved_doc));
} }
// In contrary to the previous query, when we search for the "man" term we // In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,8 +10,7 @@
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::snippet::{Snippet, SnippetGenerator}; use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tempfile::TempDir; use tempfile::TempDir;
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
@@ -28,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents // # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?; let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example. // we'll only need one doc for this example.
index_writer.add_document(doc!( index_writer.add_document(doc!(
@@ -55,10 +54,13 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?; let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?; let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc); let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:"); println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap()); println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("snippet: {}", snippet.to_html()); println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet)); println!("custom highlighting: {}", highlight(snippet));
} }

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::*; use tantivy::schema::*;
use tantivy::tokenizer::*; use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter}; use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> { fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search` // this example assumes you understand the content in `basic_search`
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer); index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?; let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap(); let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap(); let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?; let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs { for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?; let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:"); println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema)); println!("{}", schema.to_json(&retrieved_doc));
} }
Ok(()) Ok(())

View File

@@ -3,12 +3,11 @@ use std::collections::{HashMap, HashSet};
use std::sync::{Arc, RwLock, Weak}; use std::sync::{Arc, RwLock, Weak};
use tantivy::collector::TopDocs; use tantivy::collector::TopDocs;
use tantivy::index::SegmentId;
use tantivy::query::QueryParser; use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT}; use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{ use tantivy::{
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
SegmentReader, Warmer, Warmer,
}; };
// This example shows how warmers can be used to // This example shows how warmers can be used to
@@ -144,7 +143,7 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222; const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?; let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?; writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?; writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?; writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;

View File

@@ -1,7 +1,7 @@
[package] [package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"] authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes" name = "ownedbytes"
version = "0.7.0" version = "0.6.0"
edition = "2021" edition = "2021"
description = "Expose data as static slice" description = "Expose data as static slice"
license = "MIT" license = "MIT"

View File

@@ -1,3 +1,4 @@
use std::convert::TryInto;
use std::ops::{Deref, Range}; use std::ops::{Deref, Range};
use std::sync::Arc; use std::sync::Arc;
use std::{fmt, io}; use std::{fmt, io};

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "tantivy-query-grammar" name = "tantivy-query-grammar"
version = "0.22.0" version = "0.21.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"] authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT" license = "MIT"
categories = ["database-implementations", "data-structures"] categories = ["database-implementations", "data-structures"]

View File

@@ -81,8 +81,8 @@ where
T: InputTakeAtPosition + Clone, T: InputTakeAtPosition + Clone,
<T as InputTakeAtPosition>::Item: AsChar + Clone, <T as InputTakeAtPosition>::Item: AsChar + Clone,
{ {
opt_i(nom::character::complete::multispace0)(input) opt_i(nom::character::complete::space0)(input)
.map(|(left, (spaces, errors))| (left, (spaces.expect("multispace0 can't fail"), errors))) .map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
} }
pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>> pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
@@ -90,7 +90,7 @@ where
T: InputTakeAtPosition + Clone + InputLength, T: InputTakeAtPosition + Clone + InputLength,
<T as InputTakeAtPosition>::Item: AsChar + Clone, <T as InputTakeAtPosition>::Item: AsChar + Clone,
{ {
opt_i(nom::character::complete::multispace1)(input).map(|(left, (spaces, mut errors))| { opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
if spaces.is_none() { if spaces.is_none() {
errors.push(LenientErrorInternal { errors.push(LenientErrorInternal {
pos: left.input_len(), pos: left.input_len(),
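The hunk above swaps nom's space0/space1 helpers for multispace0/multispace1. A small behavioural sketch (assuming a recent nom 7.x and the `complete` variants used in this file): the multispace parsers also consume newlines, carriage returns and tabs, which is what later lets a query such as `a\nAND b` parse the same way as `a AND b`.

use nom::character::complete::{multispace0, space0};
use nom::IResult;

fn whitespace_demo() {
    let input = " \n AND b";
    // `space0` only matches spaces and tabs, so it stops at the newline.
    let with_space0: IResult<&str, &str> = space0(input);
    assert_eq!(with_space0, Ok(("\n AND b", " ")));
    // `multispace0` also consumes '\n' and '\r', so the keyword is reached.
    let with_multispace0: IResult<&str, &str> = multispace0(input);
    assert_eq!(with_multispace0, Ok(("AND b", " \n ")));
}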

View File

@@ -3,11 +3,11 @@ use std::iter::once;
use nom::branch::alt; use nom::branch::alt;
use nom::bytes::complete::tag; use nom::bytes::complete::tag;
use nom::character::complete::{ use nom::character::complete::{
anychar, char, digit1, multispace0, multispace1, none_of, one_of, satisfy, u32, anychar, char, digit1, none_of, one_of, satisfy, space0, space1, u32,
}; };
use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify}; use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify};
use nom::error::{Error, ErrorKind}; use nom::error::{Error, ErrorKind};
use nom::multi::{many0, many1, separated_list0}; use nom::multi::{many0, many1, separated_list0, separated_list1};
use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple}; use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple};
use nom::IResult; use nom::IResult;
@@ -24,7 +24,7 @@ const SPECIAL_CHARS: &[char] = &[
/// consume a field name followed by a colon. Return the field name with escape sequences /// consume a field name followed by a colon. Return the field name with escape sequences
/// already interpreted /// already interpreted
fn field_name(inp: &str) -> IResult<&str, String> { fn field_name(i: &str) -> IResult<&str, String> {
let simple_char = none_of(SPECIAL_CHARS); let simple_char = none_of(SPECIAL_CHARS);
let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-'); let first_char = verify(none_of(SPECIAL_CHARS), |c| *c != '-');
let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS)); let escape_sequence = || preceded(char('\\'), one_of(SPECIAL_CHARS));
@@ -38,12 +38,12 @@ fn field_name(inp: &str) -> IResult<&str, String> {
char(':'), char(':'),
), ),
|(first_char, next)| once(first_char).chain(next).collect(), |(first_char, next)| once(first_char).chain(next).collect(),
)(inp) )(i)
} }
/// Consume a word outside of any context. /// Consume a word outside of any context.
// TODO should support escape sequences // TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> { fn word(i: &str) -> IResult<&str, &str> {
map_res( map_res(
recognize(tuple(( recognize(tuple((
satisfy(|c| { satisfy(|c| {
@@ -55,45 +55,45 @@ fn word(inp: &str) -> IResult<&str, &str> {
})), })),
))), ))),
|s| match s { |s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)), "OR" | "AND" | "NOT" | "IN" => Err(Error::new(i, ErrorKind::Tag)),
_ => Ok(s), _ => Ok(s),
}, },
)(inp) )(i)
} }
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ { fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| { |i| {
opt_i_err( opt_i_err(
preceded( preceded(
multispace0, space0,
recognize(many1(satisfy(|c| { recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c) !c.is_whitespace() && !delimiter.contains(c)
}))), }))),
), ),
"expected word", "expected word",
)(inp) )(i)
} }
} }
/// Consume a word inside a Range context. More values are allowed as they are /// Consume a word inside a Range context. More values are allowed as they are
/// not ambiguous in this context. /// not ambiguous in this context.
fn relaxed_word(inp: &str) -> IResult<&str, &str> { fn relaxed_word(i: &str) -> IResult<&str, &str> {
recognize(tuple(( recognize(tuple((
satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)), satisfy(|c| !c.is_whitespace() && !['`', '{', '}', '"', '[', ']', '(', ')'].contains(&c)),
many0(satisfy(|c: char| { many0(satisfy(|c: char| {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c) !c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})), })),
)))(inp) )))(i)
} }
fn negative_number(inp: &str) -> IResult<&str, &str> { fn negative_number(i: &str) -> IResult<&str, &str> {
recognize(preceded( recognize(preceded(
char('-'), char('-'),
tuple((digit1, opt(tuple((char('.'), digit1))))), tuple((digit1, opt(tuple((char('.'), digit1))))),
))(inp) ))(i)
} }
fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> { fn simple_term(i: &str) -> IResult<&str, (Delimiter, String)> {
let escaped_string = |delimiter| { let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char. // we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter); let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -123,13 +123,13 @@ fn simple_term(inp: &str) -> IResult<&str, (Delimiter, String)> {
simple_quotes, simple_quotes,
double_quotes, double_quotes,
text_no_delimiter, text_no_delimiter,
))(inp) ))(i)
} }
fn simple_term_infallible( fn simple_term_infallible(
delimiter: &str, delimiter: &str,
) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ { ) -> impl Fn(&str) -> JResult<&str, Option<(Delimiter, String)>> + '_ {
|inp| { |i| {
let escaped_string = |delimiter| { let escaped_string = |delimiter| {
// we need this because none_of can't accept an owned array of char. // we need this because none_of can't accept an owned array of char.
let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter); let not_delimiter = verify(anychar, move |parsed| *parsed != delimiter);
@@ -162,11 +162,11 @@ fn simple_term_infallible(
map(word_infallible(delimiter), |(text, errors)| { map(word_infallible(delimiter), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors) (text.map(|text| (Delimiter::None, text.to_string())), errors)
}), }),
)(inp) )(i)
} }
} }
fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> { fn term_or_phrase(i: &str) -> IResult<&str, UserInputLeaf> {
map( map(
tuple((simple_term, fallible(slop_or_prefix_val))), tuple((simple_term, fallible(slop_or_prefix_val))),
|((delimiter, phrase), (slop, prefix))| { |((delimiter, phrase), (slop, prefix))| {
@@ -179,13 +179,13 @@ fn term_or_phrase(inp: &str) -> IResult<&str, UserInputLeaf> {
} }
.into() .into()
}, },
)(inp) )(i)
} }
fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>> { fn term_or_phrase_infallible(i: &str) -> JResult<&str, Option<UserInputLeaf>> {
map( map(
// ~* for slop/prefix, ) inside group or ast tree, ^ if boost // ~* for slop/prefix, ) inside group or ast tree, ^ if boost
tuple_infallible((simple_term_infallible(")^"), slop_or_prefix_val)), tuple_infallible((simple_term_infallible("*)^"), slop_or_prefix_val)),
|((delimiter_phrase, (slop, prefix)), errors)| { |((delimiter_phrase, (slop, prefix)), errors)| {
let leaf = if let Some((delimiter, phrase)) = delimiter_phrase { let leaf = if let Some((delimiter, phrase)) = delimiter_phrase {
Some( Some(
@@ -214,93 +214,103 @@ fn term_or_phrase_infallible(inp: &str) -> JResult<&str, Option<UserInputLeaf>>
}; };
(leaf, errors) (leaf, errors)
}, },
)(inp) )(i)
} }
fn term_group(inp: &str) -> IResult<&str, UserInputAst> { fn term_group(i: &str) -> IResult<&str, UserInputAst> {
let occur_symbol = alt((
value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')),
));
map( map(
tuple(( tuple((
terminated(field_name, multispace0), terminated(field_name, space0),
delimited(tuple((char('('), multispace0)), ast, char(')')), delimited(
tuple((char('('), space0)),
separated_list0(space1, tuple((opt(occur_symbol), term_or_phrase))),
char(')'),
),
)), )),
|(field_name, mut ast)| { |(field_name, terms)| {
ast.set_default_field(field_name); UserInputAst::Clause(
ast terms
.into_iter()
.map(|(occur, leaf)| (occur, leaf.set_field(Some(field_name.clone())).into()))
.collect(),
)
}, },
)(inp) )(i)
} }
// this is a precondition for term_group_infallible. Without it, term_group_infallible can fail // this is a precondition for term_group_infallible. Without it, term_group_infallible can fail
// with a panic. It does not consume its input. // with a panic. It does not consume its input.
fn term_group_precond(inp: &str) -> IResult<&str, (), ()> { fn term_group_precond(i: &str) -> IResult<&str, (), ()> {
value( value(
(), (),
peek(tuple(( peek(tuple((
field_name, field_name,
multispace0, space0,
char('('), // when we are here, we know it can't be anything but a term group char('('), // when we are here, we know it can't be anything but a term group
))), ))),
)(inp) )(i)
.map_err(|e| e.map(|_| ())) .map_err(|e| e.map(|_| ()))
} }
fn term_group_infallible(inp: &str) -> JResult<&str, UserInputAst> { fn term_group_infallible(i: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _, _)) = let (mut i, (field_name, _, _, _)) =
tuple((field_name, multispace0, char('('), multispace0))(inp).expect("precondition failed"); tuple((field_name, space0, char('('), space0))(i).expect("precondition failed");
let res = delimited_infallible( let mut terms = Vec::new();
nothing, let mut errs = Vec::new();
map(ast_infallible, |(mut ast, errors)| {
ast.set_default_field(field_name.to_string()); let mut first_round = true;
(ast, errors) loop {
}), let mut space_error = if first_round {
opt_i_err(char(')'), "expected ')'"), first_round = false;
)(inp); Vec::new()
res } else {
let (rest, (_, err)) = space1_infallible(i)?;
i = rest;
err
};
if i.is_empty() {
errs.push(LenientErrorInternal {
pos: i.len(),
message: "missing )".to_string(),
});
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
if let Some(i) = i.strip_prefix(')') {
break Ok((i, (UserInputAst::Clause(terms), errs)));
}
// only append missing space error if we did not reach the end of group
errs.append(&mut space_error);
// here we make the assumption that term_or_phrase_infallible always consumes something if the
// first byte is not `)` or ' '. If it did not, we would end up looping.
let (rest, ((occur, leaf), mut err)) =
tuple_infallible((occur_symbol, term_or_phrase_infallible))(i)?;
errs.append(&mut err);
if let Some(leaf) = leaf {
terms.push((occur, leaf.set_field(Some(field_name.clone())).into()));
}
i = rest;
}
} }
fn exists(inp: &str) -> IResult<&str, UserInputLeaf> { fn literal(i: &str) -> IResult<&str, UserInputAst> {
value(
UserInputLeaf::Exists {
field: String::new(),
},
tuple((multispace0, char('*'))),
)(inp)
}
fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
value(
(),
peek(tuple((
field_name,
multispace0,
char('*'), // when we are here, we know it can't be anything but a exists
))),
)(inp)
.map_err(|e| e.map(|_| ()))
}
fn exists_infallible(inp: &str) -> JResult<&str, UserInputAst> {
let (inp, (field_name, _, _)) =
tuple((field_name, multispace0, char('*')))(inp).expect("precondition failed");
let exists = UserInputLeaf::Exists { field: field_name }.into();
Ok((inp, (exists, Vec::new())))
}
fn literal(inp: &str) -> IResult<&str, UserInputAst> {
// * alone is already parsed by our caller, so if `exists` succeeds, we can be confident
// something (a field name) got parsed before
alt(( alt((
map( map(
tuple((opt(field_name), alt((range, set, exists, term_or_phrase)))), tuple((opt(field_name), alt((range, set, term_or_phrase)))),
|(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(), |(field_name, leaf): (Option<String>, UserInputLeaf)| leaf.set_field(field_name).into(),
), ),
term_group, term_group,
))(inp) ))(i)
} }
fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> { fn literal_no_group_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map( map(
tuple_infallible(( tuple_infallible((
opt_i(field_name), opt_i(field_name),
@@ -308,7 +318,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
alt_infallible( alt_infallible(
( (
( (
value((), tuple((tag("IN"), multispace0, char('[')))), value((), tuple((tag("IN"), space0, char('[')))),
map(set_infallible, |(set, errs)| (Some(set), errs)), map(set_infallible, |(set, errs)| (Some(set), errs)),
), ),
( (
@@ -327,7 +337,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none() && field_name.is_none()
{ {
errors.push(LenientErrorInternal { errors.push(LenientErrorInternal {
pos: inp.len(), pos: i.len(),
message: "parsed possible invalid field as term".to_string(), message: "parsed possible invalid field as term".to_string(),
}); });
} }
@@ -336,7 +346,7 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
&& field_name.is_none() && field_name.is_none()
{ {
errors.push(LenientErrorInternal { errors.push(LenientErrorInternal {
pos: inp.len(), pos: i.len(),
message: "parsed keyword NOT as term. It should be quoted".to_string(), message: "parsed keyword NOT as term. It should be quoted".to_string(),
}); });
} }
@@ -345,40 +355,34 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
errors, errors,
) )
}, },
)(inp) )(i)
} }
fn literal_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> { fn literal_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible( alt_infallible(
( ((
( term_group_precond,
term_group_precond, map(term_group_infallible, |(group, errs)| (Some(group), errs)),
map(term_group_infallible, |(group, errs)| (Some(group), errs)), ),),
),
(
exists_precond,
map(exists_infallible, |(exists, errs)| (Some(exists), errs)),
),
),
literal_no_group_infallible, literal_no_group_infallible,
)(inp) )(i)
} }
fn slop_or_prefix_val(inp: &str) -> JResult<&str, (u32, bool)> { fn slop_or_prefix_val(i: &str) -> JResult<&str, (u32, bool)> {
map( map(
opt_i(alt(( opt_i(alt((
value((0, true), char('*')), value((0, true), char('*')),
map(preceded(char('~'), u32), |slop| (slop, false)), map(preceded(char('~'), u32), |slop| (slop, false)),
))), ))),
|(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err), |(slop_or_prefix_opt, err)| (slop_or_prefix_opt.unwrap_or_default(), err),
)(inp) )(i)
} }
/// Function that parses a range out of a Stream /// Function that parses a range out of a Stream
/// Supports ranges like: /// Supports ranges like:
/// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10 /// [5 TO 10], {5 TO 10}, [* TO 10], [10 TO *], {10 TO *], >5, <=10
/// [a TO *], [a TO c], [abc TO bcd} /// [a TO *], [a TO c], [abc TO bcd}
fn range(inp: &str) -> IResult<&str, UserInputLeaf> { fn range(i: &str) -> IResult<&str, UserInputLeaf> {
let range_term_val = || { let range_term_val = || {
map( map(
alt((negative_number, relaxed_word, tag("*"))), alt((negative_number, relaxed_word, tag("*"))),
@@ -389,8 +393,8 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
// check for unbounded range in the form of <5, <=10, >5, >=5 // check for unbounded range in the form of <5, <=10, >5, >=5
let elastic_unbounded_range = map( let elastic_unbounded_range = map(
tuple(( tuple((
preceded(multispace0, alt((tag(">="), tag("<="), tag("<"), tag(">")))), preceded(space0, alt((tag(">="), tag("<="), tag("<"), tag(">")))),
preceded(multispace0, range_term_val()), preceded(space0, range_term_val()),
)), )),
|(comparison_sign, bound)| match comparison_sign { |(comparison_sign, bound)| match comparison_sign {
">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded), ">=" => (UserInputBound::Inclusive(bound), UserInputBound::Unbounded),
@@ -403,7 +407,7 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
); );
let lower_bound = map( let lower_bound = map(
separated_pair(one_of("{["), multispace0, range_term_val()), separated_pair(one_of("{["), space0, range_term_val()),
|(boundary_char, lower_bound)| { |(boundary_char, lower_bound)| {
if lower_bound == "*" { if lower_bound == "*" {
UserInputBound::Unbounded UserInputBound::Unbounded
@@ -416,7 +420,7 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
); );
let upper_bound = map( let upper_bound = map(
separated_pair(range_term_val(), multispace0, one_of("}]")), separated_pair(range_term_val(), space0, one_of("}]")),
|(upper_bound, boundary_char)| { |(upper_bound, boundary_char)| {
if upper_bound == "*" { if upper_bound == "*" {
UserInputBound::Unbounded UserInputBound::Unbounded
@@ -428,11 +432,8 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
}, },
); );
let lower_to_upper = separated_pair( let lower_to_upper =
lower_bound, separated_pair(lower_bound, tuple((space1, tag("TO"), space1)), upper_bound);
tuple((multispace1, tag("TO"), multispace1)),
upper_bound,
);
map( map(
alt((elastic_unbounded_range, lower_to_upper)), alt((elastic_unbounded_range, lower_to_upper)),
@@ -441,10 +442,10 @@ fn range(inp: &str) -> IResult<&str, UserInputLeaf> {
lower, lower,
upper, upper,
}, },
)(inp) )(i)
} }
fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> { fn range_infallible(i: &str) -> JResult<&str, UserInputLeaf> {
let lower_to_upper = map( let lower_to_upper = map(
tuple_infallible(( tuple_infallible((
opt_i(anychar), opt_i(anychar),
@@ -452,16 +453,13 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
word_infallible("]}"), word_infallible("]}"),
space1_infallible, space1_infallible,
opt_i_err( opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))), terminated(tag("TO"), alt((value((), space1), value((), eof)))),
"missing keyword TO", "missing keyword TO",
), ),
word_infallible("]}"), word_infallible("]}"),
opt_i_err(one_of("]}"), "missing range delimiter"), opt_i_err(one_of("]}"), "missing range delimiter"),
)), )),
|( |((lower_bound_kind, _space0, lower, _space1, to, upper, upper_bound_kind), errs)| {
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) { let lower_bound = match (lower_bound_kind, lower) {
(_, Some("*")) => UserInputBound::Unbounded, (_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded, (_, None) => UserInputBound::Unbounded,
@@ -555,16 +553,16 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
errors, errors,
) )
}, },
)(inp) )(i)
} }
fn set(inp: &str) -> IResult<&str, UserInputLeaf> { fn set(i: &str) -> IResult<&str, UserInputLeaf> {
map( map(
preceded( preceded(
tuple((multispace0, tag("IN"), multispace1)), tuple((space0, tag("IN"), space1)),
delimited( delimited(
tuple((char('['), multispace0)), tuple((char('['), space0)),
separated_list0(multispace1, map(simple_term, |(_, term)| term)), separated_list0(space1, map(simple_term, |(_, term)| term)),
char(']'), char(']'),
), ),
), ),
@@ -572,10 +570,10 @@ fn set(inp: &str) -> IResult<&str, UserInputLeaf> {
field: None, field: None,
elements, elements,
}, },
)(inp) )(i)
} }
fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> { fn set_infallible(mut i: &str) -> JResult<&str, UserInputLeaf> {
// `IN [` has already been parsed when we enter, we only need to parse simple terms until we // `IN [` has already been parsed when we enter, we only need to parse simple terms until we
// find a `]` // find a `]`
let mut elements = Vec::new(); let mut elements = Vec::new();
@@ -586,41 +584,41 @@ fn set_infallible(mut inp: &str) -> JResult<&str, UserInputLeaf> {
first_round = false; first_round = false;
Vec::new() Vec::new()
} else { } else {
let (rest, (_, err)) = space1_infallible(inp)?; let (rest, (_, err)) = space1_infallible(i)?;
inp = rest; i = rest;
err err
}; };
if inp.is_empty() { if i.is_empty() {
// TODO push error about missing ] // TODO push error about missing ]
// //
errs.push(LenientErrorInternal { errs.push(LenientErrorInternal {
pos: inp.len(), pos: i.len(),
message: "missing ]".to_string(), message: "missing ]".to_string(),
}); });
let res = UserInputLeaf::Set { let res = UserInputLeaf::Set {
field: None, field: None,
elements, elements,
}; };
return Ok((inp, (res, errs))); return Ok((i, (res, errs)));
} }
if let Some(inp) = inp.strip_prefix(']') { if let Some(i) = i.strip_prefix(']') {
let res = UserInputLeaf::Set { let res = UserInputLeaf::Set {
field: None, field: None,
elements, elements,
}; };
return Ok((inp, (res, errs))); return Ok((i, (res, errs)));
} }
errs.append(&mut space_error); errs.append(&mut space_error);
// TODO // TODO
// here we make the assumption that simple_term_infallible always consumes something if the // here we make the assumption that simple_term_infallible always consumes something if the
// first byte is not `]` or ' '. If it did not, we would end up looping. // first byte is not `]` or ' '. If it did not, we would end up looping.
let (rest, (delim_term, mut err)) = simple_term_infallible("]")(inp)?; let (rest, (delim_term, mut err)) = simple_term_infallible("]")(i)?;
errs.append(&mut err); errs.append(&mut err);
if let Some((_, term)) = delim_term { if let Some((_, term)) = delim_term {
elements.push(term); elements.push(term);
} }
inp = rest; i = rest;
} }
} }
@@ -628,16 +626,16 @@ fn negate(expr: UserInputAst) -> UserInputAst {
expr.unary(Occur::MustNot) expr.unary(Occur::MustNot)
} }
fn leaf(inp: &str) -> IResult<&str, UserInputAst> { fn leaf(i: &str) -> IResult<&str, UserInputAst> {
alt(( alt((
delimited(char('('), ast, char(')')), delimited(char('('), ast, char(')')),
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)), map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate), map(preceded(tuple((tag("NOT"), space1)), leaf), negate),
literal, literal,
))(inp) ))(i)
} }
fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> { fn leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
alt_infallible( alt_infallible(
( (
( (
@@ -667,23 +665,23 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
), ),
), ),
literal_infallible, literal_infallible,
)(inp) )(i)
} }
fn positive_float_number(inp: &str) -> IResult<&str, f64> { fn positive_float_number(i: &str) -> IResult<&str, f64> {
map( map(
recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))), recognize(tuple((digit1, opt(tuple((char('.'), digit1)))))),
// TODO this is actually dangerous if the number is not representable as an f64 // TODO this is actually dangerous if the number is not representable as an f64
// (too big for instance) // (too big for instance)
|float_str: &str| float_str.parse::<f64>().unwrap(), |float_str: &str| float_str.parse::<f64>().unwrap(),
)(inp) )(i)
} }
fn boost(inp: &str) -> JResult<&str, Option<f64>> { fn boost(i: &str) -> JResult<&str, Option<f64>> {
opt_i(preceded(char('^'), positive_float_number))(inp) opt_i(preceded(char('^'), positive_float_number))(i)
} }
fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> { fn boosted_leaf(i: &str) -> IResult<&str, UserInputAst> {
map( map(
tuple((leaf, fallible(boost))), tuple((leaf, fallible(boost))),
|(leaf, boost_opt)| match boost_opt { |(leaf, boost_opt)| match boost_opt {
@@ -692,10 +690,10 @@ fn boosted_leaf(inp: &str) -> IResult<&str, UserInputAst> {
} }
_ => leaf, _ => leaf,
}, },
)(inp) )(i)
} }
fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> { fn boosted_leaf_infallible(i: &str) -> JResult<&str, Option<UserInputAst>> {
map( map(
tuple_infallible((leaf_infallible, boost)), tuple_infallible((leaf_infallible, boost)),
|((leaf, boost_opt), error)| match boost_opt { |((leaf, boost_opt), error)| match boost_opt {
@@ -705,30 +703,30 @@ fn boosted_leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
), ),
_ => (leaf, error), _ => (leaf, error),
}, },
)(inp) )(i)
} }
fn occur_symbol(inp: &str) -> JResult<&str, Option<Occur>> { fn occur_symbol(i: &str) -> JResult<&str, Option<Occur>> {
opt_i(alt(( opt_i(alt((
value(Occur::MustNot, char('-')), value(Occur::MustNot, char('-')),
value(Occur::Must, char('+')), value(Occur::Must, char('+')),
)))(inp) )))(i)
} }
fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> { fn occur_leaf(i: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(inp) tuple((fallible(occur_symbol), boosted_leaf))(i)
} }
#[allow(clippy::type_complexity)] #[allow(clippy::type_complexity)]
fn operand_occur_leaf_infallible( fn operand_occur_leaf_infallible(
inp: &str, i: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> { ) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
// TODO maybe this should support multiple chained AND/OR, and "fuse" them? // TODO maybe this should support multiple chained AND/OR, and "fuse" them?
tuple_infallible(( tuple_infallible((
delimited_infallible(nothing, opt_i(binary_operand), space0_infallible), delimited_infallible(nothing, opt_i(binary_operand), space0_infallible),
occur_symbol, occur_symbol,
boosted_leaf_infallible, boosted_leaf_infallible,
))(inp) ))(i)
} }
#[derive(Clone, Copy, Debug, PartialEq, Eq)] #[derive(Clone, Copy, Debug, PartialEq, Eq)]
@@ -737,31 +735,35 @@ enum BinaryOperand {
And, And,
} }
fn binary_operand(inp: &str) -> IResult<&str, BinaryOperand> { fn binary_operand(i: &str) -> IResult<&str, BinaryOperand> {
alt(( alt((
value(BinaryOperand::And, tag("AND ")), value(BinaryOperand::And, tag("AND ")),
value(BinaryOperand::Or, tag("OR ")), value(BinaryOperand::Or, tag("OR ")),
))(inp) ))(i)
} }
fn aggregate_binary_expressions( fn aggregate_binary_expressions(
left: (Option<Occur>, UserInputAst), left: UserInputAst,
others: Vec<(Option<BinaryOperand>, Option<Occur>, UserInputAst)>, others: Vec<(BinaryOperand, UserInputAst)>,
) -> Result<UserInputAst, LenientErrorInternal> { ) -> UserInputAst {
let mut leafs = Vec::with_capacity(others.len() + 1); let mut dnf: Vec<Vec<UserInputAst>> = vec![vec![left]];
leafs.push((None, left.0, Some(left.1))); for (operator, operand_ast) in others {
leafs.extend( match operator {
others BinaryOperand::And => {
.into_iter() if let Some(last) = dnf.last_mut() {
.map(|(operand, occur, ast)| (operand, occur, Some(ast))), last.push(operand_ast);
); }
// the parameters we pass should statically guarantee we can't get errors }
// (no prefix BinaryOperand is provided) BinaryOperand::Or => {
let (res, mut errors) = aggregate_infallible_expressions(leafs); dnf.push(vec![operand_ast]);
if errors.is_empty() { }
Ok(res) }
}
if dnf.len() == 1 {
UserInputAst::and(dnf.into_iter().next().unwrap()) //< safe
} else { } else {
Err(errors.swap_remove(0)) let conjunctions = dnf.into_iter().map(UserInputAst::and).collect();
UserInputAst::or(conjunctions)
} }
} }
@@ -777,10 +779,30 @@ fn aggregate_infallible_expressions(
return (UserInputAst::empty_query(), err); return (UserInputAst::empty_query(), err);
} }
let use_operand = leafs.iter().any(|(operand, _, _)| operand.is_some());
let all_operand = leafs
.iter()
.skip(1)
.all(|(operand, _, _)| operand.is_some());
let early_operand = leafs let early_operand = leafs
.iter() .iter()
.take(1) .take(1)
.all(|(operand, _, _)| operand.is_some()); .all(|(operand, _, _)| operand.is_some());
let use_occur = leafs.iter().any(|(_, occur, _)| occur.is_some());
if use_operand && use_occur {
err.push(LenientErrorInternal {
pos: 0,
message: "Use of mixed occur and boolean operator".to_string(),
});
}
if use_operand && !all_operand {
err.push(LenientErrorInternal {
pos: 0,
message: "Missing boolean operator".to_string(),
});
}
if early_operand { if early_operand {
err.push(LenientErrorInternal { err.push(LenientErrorInternal {
@@ -807,15 +829,7 @@ fn aggregate_infallible_expressions(
Some(BinaryOperand::And) => Some(Occur::Must), Some(BinaryOperand::And) => Some(Occur::Must),
_ => Some(Occur::Should), _ => Some(Occur::Should),
}; };
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) { clauses.push(vec![(occur.or(default_op), ast.clone())]);
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
clauses.push(vec![(
Some(Occur::Should),
ast.clone().unary(Occur::MustNot),
)])
} else {
clauses.push(vec![(occur.or(default_op), ast.clone())]);
}
} }
None => { None => {
let default_op = match next_operator { let default_op = match next_operator {
@@ -823,15 +837,7 @@ fn aggregate_infallible_expressions(
Some(BinaryOperand::Or) => Some(Occur::Should), Some(BinaryOperand::Or) => Some(Occur::Should),
None => None, None => None,
}; };
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) { clauses.push(vec![(occur.or(default_op), ast.clone())])
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
clauses.push(vec![(
Some(Occur::Should),
ast.clone().unary(Occur::MustNot),
)])
} else {
clauses.push(vec![(occur.or(default_op), ast.clone())])
}
} }
} }
} }
@@ -848,12 +854,7 @@ fn aggregate_infallible_expressions(
} }
} }
Some(BinaryOperand::Or) => { Some(BinaryOperand::Or) => {
if last_occur == Some(Occur::MustNot) { clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
} else {
clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
}
} }
None => clauses.push(vec![(last_occur, last_ast)]), None => clauses.push(vec![(last_occur, last_ast)]),
} }
@@ -879,32 +880,38 @@ fn aggregate_infallible_expressions(
} }
} }
fn operand_leaf(inp: &str) -> IResult<&str, (Option<BinaryOperand>, Option<Occur>, UserInputAst)> { fn operand_leaf(i: &str) -> IResult<&str, (BinaryOperand, UserInputAst)> {
map( tuple((
tuple(( terminated(binary_operand, space0),
terminated(opt(binary_operand), multispace0), terminated(boosted_leaf, space0),
terminated(occur_leaf, multispace0), ))(i)
)),
|(operand, (occur, ast))| (operand, occur, ast),
)(inp)
} }
fn ast(inp: &str) -> IResult<&str, UserInputAst> { fn ast(i: &str) -> IResult<&str, UserInputAst> {
let boolean_expr = map_res( let boolean_expr = map(
separated_pair(occur_leaf, multispace1, many1(operand_leaf)), separated_pair(boosted_leaf, space1, many1(operand_leaf)),
|(left, right)| aggregate_binary_expressions(left, right), |(left, right)| aggregate_binary_expressions(left, right),
); );
let single_leaf = map(occur_leaf, |(occur, ast)| { let whitespace_separated_leaves = map(separated_list1(space1, occur_leaf), |subqueries| {
if occur == Some(Occur::MustNot) { if subqueries.len() == 1 {
ast.unary(Occur::MustNot) let (occur_opt, ast) = subqueries.into_iter().next().unwrap();
match occur_opt.unwrap_or(Occur::Should) {
Occur::Must | Occur::Should => ast,
Occur::MustNot => UserInputAst::Clause(vec![(Some(Occur::MustNot), ast)]),
}
} else { } else {
ast UserInputAst::Clause(subqueries.into_iter().collect())
} }
}); });
delimited(multispace0, alt((boolean_expr, single_leaf)), multispace0)(inp)
delimited(
space0,
alt((boolean_expr, whitespace_separated_leaves)),
space0,
)(i)
} }
fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> { fn ast_infallible(i: &str) -> JResult<&str, UserInputAst> {
// ast() parses either `term AND term OR term` or `+term term -term` // ast() parses either `term AND term OR term` or `+term term -term`
// both are locally ambiguous, and as we allow errors, it's hard to permit backtracking. // both are locally ambiguous, and as we allow errors, it's hard to permit backtracking.
// Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant. // Instead, we allow a mix of both syntaxes, trying to make sense of what a user meant.
@@ -921,13 +928,13 @@ fn ast_infallible(inp: &str) -> JResult<&str, UserInputAst> {
}, },
); );
delimited_infallible(space0_infallible, expression, space0_infallible)(inp) delimited_infallible(space0_infallible, expression, space0_infallible)(i)
} }
pub fn parse_to_ast(inp: &str) -> IResult<&str, UserInputAst> { pub fn parse_to_ast(i: &str) -> IResult<&str, UserInputAst> {
map(delimited(multispace0, opt(ast), eof), |opt_ast| { map(delimited(space0, opt(ast), eof), |opt_ast| {
rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query)) rewrite_ast(opt_ast.unwrap_or_else(UserInputAst::empty_query))
})(inp) })(i)
} }
pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>) { pub fn parse_to_ast_lenient(query_str: &str) -> (UserInputAst, Vec<LenientError>) {
@@ -1069,9 +1076,6 @@ mod test {
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'"); test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded"); test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("mr james bo?d", "(*mr *james *bo?d)");
test_parse_query_to_ast_helper("mr james bo*", "(*mr *james *bo*)");
test_parse_query_to_ast_helper("mr james b*d", "(*mr *james *b*d)");
} }
#[test] #[test]
@@ -1101,43 +1105,24 @@ mod test {
#[test] #[test]
fn test_parse_query_to_ast_binary_op() { fn test_parse_query_to_ast_binary_op() {
test_parse_query_to_ast_helper("a AND b", "(+a +b)"); test_parse_query_to_ast_helper("a AND b", "(+a +b)");
test_parse_query_to_ast_helper("a\nAND b", "(+a +b)");
test_parse_query_to_ast_helper("a OR b", "(?a ?b)"); test_parse_query_to_ast_helper("a OR b", "(?a ?b)");
test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))"); test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))");
test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)"); test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)");
test_parse_query_to_ast_helper("a OR b aaa", "(?a ?b *aaa)"); test_is_parse_err("a OR b aaa", "(?a ?b *aaa)");
test_parse_query_to_ast_helper("a AND b aaa", "(?(+a +b) *aaa)"); test_is_parse_err("a AND b aaa", "(?(+a +b) *aaa)");
test_parse_query_to_ast_helper("aaa a OR b ", "(*aaa ?a ?b)"); test_is_parse_err("aaa a OR b ", "(*aaa ?a ?b)");
test_parse_query_to_ast_helper("aaa ccc a OR b ", "(*aaa *ccc ?a ?b)"); test_is_parse_err("aaa ccc a OR b ", "(*aaa *ccc ?a ?b)");
test_parse_query_to_ast_helper("aaa a AND b ", "(*aaa ?(+a +b))"); test_is_parse_err("aaa a AND b ", "(*aaa ?(+a +b))");
test_parse_query_to_ast_helper("aaa ccc a AND b ", "(*aaa *ccc ?(+a +b))"); test_is_parse_err("aaa ccc a AND b ", "(*aaa *ccc ?(+a +b))");
} }
#[test] #[test]
fn test_parse_mixed_bool_occur() { fn test_parse_mixed_bool_occur() {
test_parse_query_to_ast_helper("+a OR +b", "(+a +b)"); test_is_parse_err("a OR b +aaa", "(?a ?b +aaa)");
test_is_parse_err("a AND b -aaa", "(?(+a +b) -aaa)");
test_parse_query_to_ast_helper("a AND -b", "(+a -b)"); test_is_parse_err("+a OR +b aaa", "(+a +b *aaa)");
test_parse_query_to_ast_helper("-a AND b", "(-a +b)"); test_is_parse_err("-a AND -b aaa", "(?(-a -b) *aaa)");
test_parse_query_to_ast_helper("a AND NOT b", "(+a +(-b))"); test_is_parse_err("-aaa +ccc -a OR b ", "(-aaa +ccc -a ?b)");
test_parse_query_to_ast_helper("NOT a AND b", "(+(-a) +b)");
test_parse_query_to_ast_helper("a AND NOT b AND c", "(+a +(-b) +c)");
test_parse_query_to_ast_helper("a AND -b AND c", "(+a -b +c)");
test_parse_query_to_ast_helper("a OR -b", "(?a ?(-b))");
test_parse_query_to_ast_helper("-a OR b", "(?(-a) ?b)");
test_parse_query_to_ast_helper("a OR NOT b", "(?a ?(-b))");
test_parse_query_to_ast_helper("NOT a OR b", "(?(-a) ?b)");
test_parse_query_to_ast_helper("a OR NOT b OR c", "(?a ?(-b) ?c)");
test_parse_query_to_ast_helper("a OR -b OR c", "(?a ?(-b) ?c)");
test_parse_query_to_ast_helper("a OR b +aaa", "(?a ?b +aaa)");
test_parse_query_to_ast_helper("a AND b -aaa", "(?(+a +b) -aaa)");
test_parse_query_to_ast_helper("+a OR +b aaa", "(+a +b *aaa)");
test_parse_query_to_ast_helper("-a AND -b aaa", "(?(-a -b) *aaa)");
test_parse_query_to_ast_helper("-aaa +ccc -a OR b ", "(-aaa +ccc ?(-a) ?b)");
} }
#[test] #[test]
@@ -1427,18 +1412,8 @@ mod test {
#[test] #[test]
fn test_parse_query_term_group() { fn test_parse_query_term_group() {
test_parse_query_to_ast_helper(r#"field:(abc)"#, r#""field":abc"#); test_parse_query_to_ast_helper(r#"field:(abc)"#, r#"(*"field":abc)"#);
test_parse_query_to_ast_helper(r#"field:(+a -"b c")"#, r#"(+"field":a -"field":"b c")"#); test_parse_query_to_ast_helper(r#"field:(+a -"b c")"#, r#"(+"field":a -"field":"b c")"#);
test_parse_query_to_ast_helper(r#"field:(a AND "b c")"#, r#"(+"field":a +"field":"b c")"#);
test_parse_query_to_ast_helper(r#"field:(a OR "b c")"#, r#"(?"field":a ?"field":"b c")"#);
test_parse_query_to_ast_helper(
r#"field:(a OR (b AND c))"#,
r#"(?"field":a ?(+"field":b +"field":c))"#,
);
test_parse_query_to_ast_helper(
r#"field:(a [b TO c])"#,
r#"(*"field":a *"field":["b" TO "c"])"#,
);
test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#); test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#);
} }
@@ -1563,17 +1538,6 @@ mod test {
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*"); test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
} }
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
}
#[test] #[test]
fn test_not_queries_are_consistent() { fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)"); test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
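A rough usage sketch of the lenient entry point defined above, written as if it lived in this test module (the test name is invented): a well-formed query should come back with an empty error list, while an unclosed group still yields a best-effort AST plus a positioned error.

#[test]
fn lenient_parser_sketch() {
    // A valid boolean query: no lenient errors, and the AST debug-renders
    // with the same `(+... +...)` notation used by the expectations above.
    let (ast, errors) = parse_to_ast_lenient("abc AND def");
    assert!(errors.is_empty());
    assert_eq!(format!("{ast:?}"), "(+abc +def)");

    // An unclosed term group is not a hard failure: we still get an AST,
    // together with an error describing the missing ')'.
    let (_ast, errors) = parse_to_ast_lenient("title:(hello world");
    assert!(!errors.is_empty());
}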

View File

@@ -16,9 +16,6 @@ pub enum UserInputLeaf {
field: Option<String>, field: Option<String>,
elements: Vec<String>, elements: Vec<String>,
}, },
Exists {
field: String,
},
} }
impl UserInputLeaf { impl UserInputLeaf {
@@ -39,29 +36,6 @@ impl UserInputLeaf {
upper, upper,
}, },
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements }, UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
}
}
pub(crate) fn set_default_field(&mut self, default_field: String) {
match self {
UserInputLeaf::Literal(ref mut literal) if literal.field_name.is_none() => {
literal.field_name = Some(default_field)
}
UserInputLeaf::All => {
*self = UserInputLeaf::Exists {
field: default_field,
}
}
UserInputLeaf::Range { ref mut field, .. } if field.is_none() => {
*field = Some(default_field)
}
UserInputLeaf::Set { ref mut field, .. } if field.is_none() => {
*field = Some(default_field)
}
_ => (), // field was already set, do nothing
} }
} }
} }
@@ -100,9 +74,6 @@ impl Debug for UserInputLeaf {
write!(formatter, "]") write!(formatter, "]")
} }
UserInputLeaf::All => write!(formatter, "*"), UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
}
} }
} }
} }
@@ -225,16 +196,6 @@ impl UserInputAst {
pub fn or(asts: Vec<UserInputAst>) -> UserInputAst { pub fn or(asts: Vec<UserInputAst>) -> UserInputAst {
UserInputAst::compose(Occur::Should, asts) UserInputAst::compose(Occur::Should, asts)
} }
pub(crate) fn set_default_field(&mut self, field: String) {
match self {
UserInputAst::Clause(clauses) => clauses
.iter_mut()
.for_each(|(_, ast)| ast.set_default_field(field.clone())),
UserInputAst::Leaf(leaf) => leaf.set_default_field(field),
UserInputAst::Boost(ref mut ast, _) => ast.set_default_field(field),
}
}
} }
impl From<UserInputLiteral> for UserInputLeaf { impl From<UserInputLiteral> for UserInputLeaf {
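A tiny sketch of the Exists leaf present on the left-hand side of this file, assuming UserInputLeaf is re-exported from the crate root like the other AST types (the function name is invented): constructing one directly and checking its Debug rendering, which is the `"field":*` notation the parser tests compare against.

use tantivy_query_grammar::UserInputLeaf;

fn exists_leaf_sketch() {
    let leaf = UserInputLeaf::Exists {
        field: "year".to_string(),
    };
    // The Debug impl above prints the quoted field name followed by `:*`.
    assert_eq!(format!("{leaf:?}"), "\"year\":*");
}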

View File

@@ -0,0 +1,550 @@
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use test::{self, Bencher};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
Optional = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
Sparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_fieldtype = crate::schema::TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
let score_field = schema_builder.add_u64_field("score", score_fieldtype.clone());
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::Optional {
index_writer.add_document(doc!())?;
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
text_field_many_terms => "cool",
text_field_few_terms => "cool",
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::Sparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
}
Ok(index)
}
use paste::paste;
#[macro_export]
macro_rules! bench_all_cardinalities {
( $x:ident ) => {
paste! {
#[bench]
fn $x(b: &mut Bencher) {
[<$x _card>](b, Cardinality::Full)
}
#[bench]
fn [<$x _opt>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Optional)
}
#[bench]
fn [<$x _multi>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Multivalued)
}
#[bench]
fn [<$x _sparse>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Sparse)
}
}
};
}
bench_all_cardinalities!(bench_aggregation_average_u64);
fn bench_aggregation_average_u64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": { "avg": { "field": "score", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_stats_f64);
fn bench_aggregation_stats_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "stats": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_f64);
fn bench_aggregation_average_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_few);
fn bench_aggregation_terms_few_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_with_sub_agg);
fn bench_aggregation_terms_many_with_sub_agg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many2);
fn bench_aggregation_terms_many2_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_order_by_term);
fn bench_aggregation_terms_many_order_by_term_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_only);
fn bench_aggregation_range_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_range_with_avg);
fn bench_aggregation_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
// hard_bounds has a different algorithm, because it actually limits the collection range
//
bench_all_cardinalities!(bench_aggregation_histogram_only_hard_bounds);
fn bench_aggregation_histogram_only_hard_bounds_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_with_avg);
fn bench_aggregation_histogram_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_histogram_only);
fn bench_aggregation_histogram_only_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_avg_and_range_with_avg);
fn bench_aggregation_avg_and_range_with_avg_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
let text_field = reader.searcher().schema().get_field("text").unwrap();
b.iter(|| {
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&term_query, &collector).unwrap()
});
}
}
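For readability, here is what a single bench_all_cardinalities! invocation above expands to, hand-expanded following the paste! rules shown and using bench_aggregation_average_u64 as the example (this is illustrative, not generated output):

#[bench]
fn bench_aggregation_average_u64(b: &mut Bencher) {
    bench_aggregation_average_u64_card(b, Cardinality::Full)
}
#[bench]
fn bench_aggregation_average_u64_opt(b: &mut Bencher) {
    bench_aggregation_average_u64_card(b, Cardinality::Optional)
}
#[bench]
fn bench_aggregation_average_u64_multi(b: &mut Bencher) {
    bench_aggregation_average_u64_card(b, Cardinality::Multivalued)
}
#[bench]
fn bench_aggregation_average_u64_sparse(b: &mut Bencher) {
    bench_aggregation_average_u64_card(b, Cardinality::Sparse)
}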

View File

@@ -73,19 +73,18 @@ impl AggregationLimits {
/// Create a new ResourceLimitGuard, that will release the memory when dropped. /// Create a new ResourceLimitGuard, that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard { pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard { ResourceLimitGuard {
// The counter which is shared between the aggregations for one request. /// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption), memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes /// The memory_limit in bytes
memory_limit: self.memory_limit, memory_limit: self.memory_limit,
allocated_with_the_guard: 0, allocated_with_the_guard: 0,
} }
} }
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> { pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
let prev_value = self self.memory_consumption
.memory_consumption .fetch_add(num_bytes, Ordering::Relaxed);
.fetch_add(add_num_bytes, Ordering::Relaxed); validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(()) Ok(())
} }
@@ -95,11 +94,11 @@ impl AggregationLimits {
} }
fn validate_memory_consumption( fn validate_memory_consumption(
memory_consumption: u64, memory_consumption: &AtomicU64,
memory_limit: ByteCount, memory_limit: ByteCount,
) -> Result<(), AggregationError> { ) -> Result<(), AggregationError> {
// Load the estimated memory consumed by the aggregations // Load the estimated memory consumed by the aggregations
let memory_consumed: ByteCount = memory_consumption.into(); let memory_consumed: ByteCount = memory_consumption.load(Ordering::Relaxed).into();
if memory_consumed > memory_limit { if memory_consumed > memory_limit {
return Err(AggregationError::MemoryExceeded { return Err(AggregationError::MemoryExceeded {
limit: memory_limit, limit: memory_limit,
@@ -119,11 +118,10 @@ pub struct ResourceLimitGuard {
} }
impl ResourceLimitGuard { impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> { pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
let prev_value = self self.memory_consumption
.memory_consumption .fetch_add(num_bytes, Ordering::Relaxed);
.fetch_add(add_num_bytes, Ordering::Relaxed); validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(()) Ok(())
} }
} }
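
Both versions of `add_memory_consumed` in the hunks above follow the same accounting pattern: a per-request byte counter shared behind an `Arc<AtomicU64>` is bumped with `fetch_add` and checked against the configured limit, and the guard returns its share on `Drop`. They differ only in whether the check uses the value the call itself observed (`prev_value + add_num_bytes`) or re-loads the shared counter afterwards. A minimal self-contained sketch of that pattern (illustrative names, not the actual tantivy types):

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct BudgetGuard {
    // Counter shared between all consumers of one request.
    used: Arc<AtomicU64>,
    limit: u64,
    // Bytes accounted by this guard, released again on drop.
    allocated: u64,
}

impl BudgetGuard {
    fn add_memory_consumed(&mut self, add_num_bytes: u64) -> Result<(), String> {
        let prev = self.used.fetch_add(add_num_bytes, Ordering::Relaxed);
        self.allocated += add_num_bytes;
        // Check against the value this call observed (`prev + add_num_bytes`)
        // rather than re-loading the shared counter.
        if prev + add_num_bytes > self.limit {
            return Err(format!("aggregation memory limit of {} bytes exceeded", self.limit));
        }
        Ok(())
    }
}

impl Drop for BudgetGuard {
    fn drop(&mut self) {
        // Give the bytes accounted by this guard back to the shared budget.
        self.used.fetch_sub(self.allocated, Ordering::Relaxed);
    }
}

The `Drop` impl is what lets callers simply drop the guard to hand their share of the budget back.
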
@@ -136,142 +134,3 @@ impl Drop for ResourceLimitGuard {
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed); .fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
} }
} }
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there is no doc with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}
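
The removed tests above go through the crate-internal `exec_request_with_query` helper. The same kind of Elasticsearch-compatible request can also be run through the public collector API that appears later in this diff (`AggregationCollector::from_aggs`, `searcher.search(&AllQuery, ...)`). A rough sketch, assuming an index that has the `date` and `text2` fast fields used by the test fixture:

use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::Index;

fn run_date_histogram(index: &Index) -> tantivy::Result<serde_json::Value> {
    // Same shape as the removed test: a terms bucket over `text2` with a
    // bounded date_histogram sub-aggregation on `date`.
    let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
        "by_text": {
            "terms": { "field": "text2", "min_doc_count": 0 },
            "aggs": {
                "per_day": {
                    "date_histogram": {
                        "field": "date",
                        "fixed_interval": "1d",
                        "extended_bounds": {
                            "min": "2015-01-01T00:00:00Z",
                            "max": "2015-01-10T00:00:00Z"
                        }
                    }
                }
            }
        }
    }))
    .expect("valid aggregation request");
    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
    let searcher = index.reader()?.searcher();
    let agg_res: AggregationResults = searcher.search(&AllQuery, &collector)?;
    Ok(serde_json::to_value(agg_res).expect("serializable aggregation result"))
}
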

View File

@@ -35,7 +35,7 @@ use super::bucket::{
}; };
use super::metric::{ use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
}; };
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user /// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -93,12 +93,7 @@ impl Aggregation {
} }
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) { fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
fast_field_names.extend( fast_field_names.insert(self.agg.get_fast_field_name().to_string());
self.agg
.get_fast_field_names()
.iter()
.map(|s| s.to_string()),
);
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation)); fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
} }
} }
@@ -152,27 +147,23 @@ pub enum AggregationVariants {
/// Computes the sum of the extracted values. /// Computes the sum of the extracted values.
#[serde(rename = "percentiles")] #[serde(rename = "percentiles")]
Percentiles(PercentilesAggregationReq), Percentiles(PercentilesAggregationReq),
/// Finds the top k values matching some order
#[serde(rename = "top_hits")]
TopHits(TopHitsAggregation),
} }
impl AggregationVariants { impl AggregationVariants {
/// Returns the name of the fields used by the aggregation. /// Returns the name of the field used by the aggregation.
pub fn get_fast_field_names(&self) -> Vec<&str> { pub fn get_fast_field_name(&self) -> &str {
match self { match self {
AggregationVariants::Terms(terms) => vec![terms.field.as_str()], AggregationVariants::Terms(terms) => terms.field.as_str(),
AggregationVariants::Range(range) => vec![range.field.as_str()], AggregationVariants::Range(range) => range.field.as_str(),
AggregationVariants::Histogram(histogram) => vec![histogram.field.as_str()], AggregationVariants::Histogram(histogram) => histogram.field.as_str(),
AggregationVariants::DateHistogram(histogram) => vec![histogram.field.as_str()], AggregationVariants::DateHistogram(histogram) => histogram.field.as_str(),
AggregationVariants::Average(avg) => vec![avg.field_name()], AggregationVariants::Average(avg) => avg.field_name(),
AggregationVariants::Count(count) => vec![count.field_name()], AggregationVariants::Count(count) => count.field_name(),
AggregationVariants::Max(max) => vec![max.field_name()], AggregationVariants::Max(max) => max.field_name(),
AggregationVariants::Min(min) => vec![min.field_name()], AggregationVariants::Min(min) => min.field_name(),
AggregationVariants::Stats(stats) => vec![stats.field_name()], AggregationVariants::Stats(stats) => stats.field_name(),
AggregationVariants::Sum(sum) => vec![sum.field_name()], AggregationVariants::Sum(sum) => sum.field_name(),
AggregationVariants::Percentiles(per) => vec![per.field_name()], AggregationVariants::Percentiles(per) => per.field_name(),
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
} }
} }
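
One side of this hunk widens `get_fast_field_name() -> &str` into `get_fast_field_names() -> Vec<&str>` so that an aggregation touching several columns (such as `top_hits`) can report all of them; in either case the request tree folds the names of every aggregation and its sub-aggregations into a single `HashSet`. A toy sketch of that folding step, with simplified stand-in types:

use std::collections::HashSet;

// Simplified stand-ins for the request types in this file.
struct Agg {
    // What `get_fast_field_names()` reports for this aggregation alone.
    fast_field_names: Vec<String>,
    sub_aggregations: Vec<Agg>,
}

// Fold the fast field names of an aggregation and all of its
// sub-aggregations into one set.
fn collect_fast_field_names(agg: &Agg, out: &mut HashSet<String>) {
    out.extend(agg.fast_field_names.iter().cloned());
    for sub in &agg.sub_aggregations {
        collect_fast_field_names(sub, out);
    }
}
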

View File

@@ -1,9 +1,6 @@
//! This will enhance the request tree with access to the fastfield and metadata. //! This will enhance the request tree with access to the fastfield and metadata.
use std::collections::HashMap; use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
use std::io;
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
use super::agg_limits::ResourceLimitGuard; use super::agg_limits::ResourceLimitGuard;
use super::agg_req::{Aggregation, AggregationVariants, Aggregations}; use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
@@ -17,8 +14,7 @@ use super::metric::{
use super::segment_agg_result::AggregationLimits; use super::segment_agg_result::AggregationLimits;
use super::VecWithNames; use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key}; use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::index::SegmentReader; use crate::SegmentReader;
use crate::SegmentOrdinal;
#[derive(Default)] #[derive(Default)]
pub(crate) struct AggregationsWithAccessor { pub(crate) struct AggregationsWithAccessor {
@@ -36,7 +32,6 @@ impl AggregationsWithAccessor {
} }
pub struct AggregationWithAccessor { pub struct AggregationWithAccessor {
pub(crate) segment_ordinal: SegmentOrdinal,
/// In general there can be buckets without fast field access, e.g. buckets that are created /// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. That is not that case currently, but eventually this needs to be /// based on search terms. That is not that case currently, but eventually this needs to be
/// Option or moved. /// Option or moved.
@@ -49,16 +44,10 @@ pub struct AggregationWithAccessor {
pub(crate) limits: ResourceLimitGuard, pub(crate) limits: ResourceLimitGuard,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>, pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// Used for missing term aggregation, which checks all columns for existence. /// Used for missing term aggregation, which checks all columns for existence.
/// And also for `top_hits` aggregation, which may sort on multiple fields.
/// By convention the missing aggregation is chosen, when this property is set /// By convention the missing aggregation is chosen, when this property is set
/// (instead bein set in `agg`). /// (instead bein set in `agg`).
/// If this needs to used by other aggregations, we need to refactor this. /// If this needs to used by other aggregations, we need to refactor this.
// NOTE: we can make all other aggregations use this instead of the `accessor` and `field_type` pub(crate) accessors: Vec<Column<u64>>,
// (making them obsolete) But will it have a performance impact?
pub(crate) accessors: Vec<(Column<u64>, ColumnType)>,
/// Map field names to all associated column accessors.
/// This field is used for `docvalue_fields`, which is currently only supported for `top_hits`.
pub(crate) value_accessors: HashMap<String, Vec<DynamicColumn>>,
pub(crate) agg: Aggregation, pub(crate) agg: Aggregation,
} }
@@ -68,55 +57,19 @@ impl AggregationWithAccessor {
agg: &Aggregation, agg: &Aggregation,
sub_aggregation: &Aggregations, sub_aggregation: &Aggregations,
reader: &SegmentReader, reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: AggregationLimits, limits: AggregationLimits,
) -> crate::Result<Vec<AggregationWithAccessor>> { ) -> crate::Result<Vec<AggregationWithAccessor>> {
let mut agg = agg.clone(); let add_agg_with_accessor = |accessor: Column<u64>,
let add_agg_with_accessor = |agg: &Aggregation,
accessor: Column<u64>,
column_type: ColumnType, column_type: ColumnType,
aggs: &mut Vec<AggregationWithAccessor>| aggs: &mut Vec<AggregationWithAccessor>|
-> crate::Result<()> { -> crate::Result<()> {
let res = AggregationWithAccessor { let res = AggregationWithAccessor {
segment_ordinal,
accessor, accessor,
accessors: Default::default(), accessors: Vec::new(),
value_accessors: Default::default(),
field_type: column_type, field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate( sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation, sub_aggregation,
reader, reader,
segment_ordinal,
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
};
aggs.push(res);
Ok(())
};
let add_agg_with_accessors = |agg: &Aggregation,
accessors: Vec<(Column<u64>, ColumnType)>,
aggs: &mut Vec<AggregationWithAccessor>,
value_accessors: HashMap<String, Vec<DynamicColumn>>|
-> crate::Result<()> {
let (accessor, field_type) = accessors.first().expect("at least one accessor");
let res = AggregationWithAccessor {
segment_ordinal,
// TODO: We should do away with the `accessor` field altogether
accessor: accessor.clone(),
value_accessors,
field_type: *field_type,
accessors,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
segment_ordinal,
&limits, &limits,
)?, )?,
agg: agg.clone(), agg: agg.clone(),
@@ -131,36 +84,31 @@ impl AggregationWithAccessor {
let mut res: Vec<AggregationWithAccessor> = Vec::new(); let mut res: Vec<AggregationWithAccessor> = Vec::new();
use AggregationVariants::*; use AggregationVariants::*;
match &agg.agg {
match agg.agg {
Range(RangeAggregation { Range(RangeAggregation {
field: ref field_name, field: field_name, ..
..
}) => { }) => {
let (accessor, column_type) = let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?; get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?; add_agg_with_accessor(accessor, column_type, &mut res)?;
} }
Histogram(HistogramAggregation { Histogram(HistogramAggregation {
field: ref field_name, field: field_name, ..
..
}) => { }) => {
let (accessor, column_type) = let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?; get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?; add_agg_with_accessor(accessor, column_type, &mut res)?;
} }
DateHistogram(DateHistogramAggregationReq { DateHistogram(DateHistogramAggregationReq {
field: ref field_name, field: field_name, ..
..
}) => { }) => {
let (accessor, column_type) = let (accessor, column_type) =
// Only DateTime is supported for DateHistogram get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?; add_agg_with_accessor(accessor, column_type, &mut res)?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
} }
Terms(TermsAggregation { Terms(TermsAggregation {
field: ref field_name, field: field_name,
ref missing, missing,
.. ..
}) => { }) => {
let str_dict_column = reader.fast_fields().str(field_name)?; let str_dict_column = reader.fast_fields().str(field_name)?;
@@ -169,10 +117,10 @@ impl AggregationWithAccessor {
ColumnType::U64, ColumnType::U64,
ColumnType::F64, ColumnType::F64,
ColumnType::Str, ColumnType::Str,
ColumnType::DateTime,
ColumnType::Bool,
ColumnType::IpAddr,
// ColumnType::Bytes Unsupported // ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
]; ];
// In case the column is empty we want the shim column to match the missing type // In case the column is empty we want the shim column to match the missing type
@@ -197,27 +145,29 @@ impl AggregationWithAccessor {
.map(|m| matches!(m, Key::Str(_))) .map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false); .unwrap_or(false);
// Actually we could convert the text to a number and have the fast path, if it is let use_special_missing_agg = missing_and_more_than_one_col || text_on_non_text_col;
// provided in Rfc3339 format. But this use case is probably common
// enough to justify the effort.
let text_on_date_col = column_and_types.len() == 1
&& column_and_types[0].1 == ColumnType::DateTime
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg =
missing_and_more_than_one_col || text_on_non_text_col || text_on_date_col;
if use_special_missing_agg { if use_special_missing_agg {
let column_and_types = let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?; get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;
let accessors = column_and_types let accessors: Vec<Column> =
.iter() column_and_types.iter().map(|(a, _)| a.clone()).collect();
.map(|c_t| (c_t.0.clone(), c_t.1)) let agg_wit_acc = AggregationWithAccessor {
.collect(); missing_value_for_accessor: None,
add_agg_with_accessors(&agg, accessors, &mut res, Default::default())?; accessor: accessors[0].clone(),
accessors,
field_type: ColumnType::U64,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg_wit_acc);
} }
for (accessor, column_type) in column_and_types { for (accessor, column_type) in column_and_types {
@@ -227,25 +177,21 @@ impl AggregationWithAccessor {
missing.clone() missing.clone()
}; };
let missing_value_for_accessor = if let Some(missing) = let missing_value_for_accessor =
missing_value_term_agg.as_ref() if let Some(missing) = missing_value_term_agg.as_ref() {
{ get_missing_val(column_type, missing, agg.agg.get_fast_field_name())?
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])? } else {
} else { None
None };
};
let agg = AggregationWithAccessor { let agg = AggregationWithAccessor {
segment_ordinal,
missing_value_for_accessor, missing_value_for_accessor,
accessor, accessor,
accessors: Default::default(), accessors: Vec::new(),
value_accessors: Default::default(),
field_type: column_type, field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate( sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation, sub_aggregation,
reader, reader,
segment_ordinal,
&limits, &limits,
)?, )?,
agg: agg.clone(), agg: agg.clone(),
@@ -257,63 +203,34 @@ impl AggregationWithAccessor {
} }
} }
Average(AverageAggregation { Average(AverageAggregation {
field: ref field_name, field: field_name, ..
..
}) })
| Count(CountAggregation { | Count(CountAggregation {
field: ref field_name, field: field_name, ..
..
}) })
| Max(MaxAggregation { | Max(MaxAggregation {
field: ref field_name, field: field_name, ..
..
}) })
| Min(MinAggregation { | Min(MinAggregation {
field: ref field_name, field: field_name, ..
..
}) })
| Stats(StatsAggregation { | Stats(StatsAggregation {
field: ref field_name, field: field_name, ..
..
}) })
| Sum(SumAggregation { | Sum(SumAggregation {
field: ref field_name, field: field_name, ..
..
}) => { }) => {
let (accessor, column_type) = let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?; get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?; add_agg_with_accessor(accessor, column_type, &mut res)?;
} }
Percentiles(ref percentiles) => { Percentiles(percentiles) => {
let (accessor, column_type) = get_ff_reader( let (accessor, column_type) = get_ff_reader(
reader, reader,
percentiles.field_name(), percentiles.field_name(),
Some(get_numeric_or_date_column_types()), Some(get_numeric_or_date_column_types()),
)?; )?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?; add_agg_with_accessor(accessor, column_type, &mut res)?;
}
TopHits(ref mut top_hits) => {
top_hits.validate_and_resolve_field_names(reader.fast_fields().columnar())?;
let accessors: Vec<(Column<u64>, ColumnType)> = top_hits
.field_names()
.iter()
.map(|field| {
get_ff_reader(reader, field, Some(get_numeric_or_date_column_types()))
})
.collect::<crate::Result<_>>()?;
let value_accessors = top_hits
.value_field_names()
.iter()
.map(|field_name| {
Ok((
field_name.to_string(),
get_dynamic_columns(reader, field_name)?,
))
})
.collect::<crate::Result<_>>()?;
add_agg_with_accessors(&agg, accessors, &mut res, value_accessors)?;
} }
}; };
@@ -335,8 +252,8 @@ fn get_missing_val(
} }
_ => { _ => {
return Err(crate::TantivyError::InvalidArgument(format!( return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {missing:?} for field {field_name} is not supported for column \ "Missing value {:?} for field {} is not supported for column type {:?}",
type {column_type:?}" missing, field_name, column_type
))); )));
} }
}; };
@@ -355,7 +272,6 @@ fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
pub(crate) fn get_aggs_with_segment_accessor_and_validate( pub(crate) fn get_aggs_with_segment_accessor_and_validate(
aggs: &Aggregations, aggs: &Aggregations,
reader: &SegmentReader, reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits, limits: &AggregationLimits,
) -> crate::Result<AggregationsWithAccessor> { ) -> crate::Result<AggregationsWithAccessor> {
let mut aggss = Vec::new(); let mut aggss = Vec::new();
@@ -364,7 +280,6 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
agg, agg,
agg.sub_aggregation(), agg.sub_aggregation(),
reader, reader,
segment_ordinal,
limits.clone(), limits.clone(),
)?; )?;
for agg in aggs { for agg in aggs {
@@ -394,19 +309,6 @@ fn get_ff_reader(
Ok(ff_field_with_type) Ok(ff_field_with_type)
} }
fn get_dynamic_columns(
reader: &SegmentReader,
field_name: &str,
) -> crate::Result<Vec<columnar::DynamicColumn>> {
let ff_fields = reader.fast_fields().dynamic_column_handles(field_name)?;
let cols = ff_fields
.iter()
.map(|h| h.open())
.collect::<io::Result<_>>()?;
assert!(!ff_fields.is_empty(), "field {field_name} not found");
Ok(cols)
}
/// Get all fast field reader or empty as default. /// Get all fast field reader or empty as default.
/// ///
/// Is guaranteed to return at least one column. /// Is guaranteed to return at least one column.
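
Much of this file deals with the `missing` parameter of the terms aggregation: when a document has no value for the target field, or the field is backed by several column types (as with JSON fields), a dedicated missing-term collector is built over all columns. For reference, a request exercising it could look like the following sketch; the field name and missing key are illustrative, not taken from the tests:

use tantivy::aggregation::agg_req::Aggregations;

let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "colors": {
        "terms": {
            "field": "json.color",
            // Documents without a value for `json.color` are counted under
            // this key instead of being dropped from the buckets.
            "missing": "no_color"
        }
    }
}))
.expect("valid terms aggregation");
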

View File

@@ -8,7 +8,7 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount; use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult}; use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats};
use super::{AggregationError, Key}; use super::{AggregationError, Key};
use crate::TantivyError; use crate::TantivyError;
@@ -90,10 +90,8 @@ pub enum MetricResult {
Stats(Stats), Stats(Stats),
/// Sum metric result. /// Sum metric result.
Sum(SingleMetricResult), Sum(SingleMetricResult),
/// Percentiles metric result. /// Sum metric result.
Percentiles(PercentilesMetricResult), Percentiles(PercentilesMetricResult),
/// Top hits metric result
TopHits(TopHitsMetricResult),
} }
impl MetricResult { impl MetricResult {
@@ -108,9 +106,6 @@ impl MetricResult {
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError( MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()), AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
)), )),
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
)),
} }
} }
} }

View File

@@ -4,13 +4,12 @@ use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE; use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector; use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::AggregationLimits; use crate::aggregation::segment_agg_result::AggregationLimits;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms}; use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector; use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term}; use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation { fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({ serde_json::from_value(json!({
@@ -67,22 +66,6 @@ fn test_aggregation_flushing(
} }
} }
}, },
"top_hits_test":{
"terms": {
"field": "string_id"
},
"aggs": {
"bucketsL2": {
"top_hits": {
"size": 2,
"sort": [
{ "score": "asc" }
],
"docvalue_fields": ["score"]
}
}
}
},
"histogram_test":{ "histogram_test":{
"histogram": { "histogram": {
"field": "score", "field": "score",
@@ -125,16 +108,6 @@ fn test_aggregation_flushing(
let searcher = reader.searcher(); let searcher = reader.searcher();
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap(); let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
// Test postcard roundtrip serialization
let intermediate_agg_result_bytes = postcard::to_allocvec(&intermediate_agg_result).expect(
"Postcard Serialization failed, flatten etc. is not supported in the intermediate \
result",
);
let intermediate_agg_result: IntermediateAggregationResults =
postcard::from_bytes(&intermediate_agg_result_bytes)
.expect("Post deserialization failed");
intermediate_agg_result intermediate_agg_result
.into_final_result(agg_req, &Default::default()) .into_final_result(agg_req, &Default::default())
.unwrap() .unwrap()
@@ -613,10 +586,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color": "red"})))
.unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"color": "red"}))) .add_document(doc!(json => json!({"color": "red"})))
.unwrap(); .unwrap();
@@ -644,74 +614,12 @@ fn test_aggregation_on_json_object() {
&serde_json::json!({ &serde_json::json!({
"jsonagg": { "jsonagg": {
"buckets": [ "buckets": [
{"doc_count": 2, "key": "red"},
{"doc_count": 1, "key": "blue"}, {"doc_count": 1, "key": "blue"},
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
})
);
}
#[test]
fn test_aggregation_on_nested_json_object() {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json.blub", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "red", "color": {"nested":"red"} })))
.unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} })))
.unwrap();
index_writer
.add_document(doc!(json => json!({"color.dot": "blue", "color": {"nested":"blue"} })))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg1": {
"terms": {
"field": "json\\.blub.color\\.dot",
}
},
"jsonagg2": {
"terms": {
"field": "json\\.blub.color.nested",
}
}
}))
.unwrap();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
assert_eq!(
&aggregation_res_json,
&serde_json::json!({
"jsonagg1": {
"buckets": [
{"doc_count": 2, "key": "blue"},
{"doc_count": 1, "key": "red"}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
},
"jsonagg2": {
"buckets": [
{"doc_count": 2, "key": "blue"},
{"doc_count": 1, "key": "red"} {"doc_count": 1, "key": "red"}
], ],
"doc_count_error_upper_bound": 0, "doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0 "sum_other_doc_count": 0
} }
}) })
); );
} }
@@ -722,7 +630,7 @@ fn test_aggregation_on_json_object_empty_columns() {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Empty column when accessing color // => Empty column when accessing color
index_writer index_writer
.add_document(doc!(json => json!({"price": 10.0}))) .add_document(doc!(json => json!({"price": 10.0})))
@@ -840,41 +748,32 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0, "mixed_price": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with all values text // => Segment with all values text
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0}))) .add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with all boolen // => Segment with all boolen
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"}))) .add_document(doc!(json => json!({"mixed_type": true})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
// => Segment with mixed values // => Segment with mixed values
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": "red", "mixed_price": 1.0}))) .add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap(); .unwrap();
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5, "mixed_price": -20.5}))) .add_document(doc!(json => json!({"mixed_type": true})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
.unwrap(); .unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -888,7 +787,7 @@ fn test_aggregation_on_json_object_mixed_types() {
"order": { "min_price": "desc" } "order": { "min_price": "desc" }
}, },
"aggs": { "aggs": {
"min_price": { "min": { "field": "json.mixed_price" } } "min_price": { "min": { "field": "json.mixed_type" } }
} }
}, },
"rangeagg": { "rangeagg": {
@@ -912,7 +811,6 @@ fn test_aggregation_on_json_object_mixed_types() {
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap(); let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap(); let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
use pretty_assertions::assert_eq;
assert_eq!( assert_eq!(
&aggregation_res_json, &aggregation_res_json,
&serde_json::json!({ &serde_json::json!({
@@ -927,10 +825,10 @@ fn test_aggregation_on_json_object_mixed_types() {
"termagg": { "termagg": {
"buckets": [ "buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } }, { "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } }, { "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } }, // TODO bool is also not yet handled in aggregation
{ "doc_count": 1, "key": "blue", "min_price": { "value": null } },
{ "doc_count": 1, "key": "red", "min_price": { "value": null } },
], ],
"sum_other_doc_count": 0 "sum_other_doc_count": 0
} }

View File

@@ -1,7 +1,7 @@
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::{HistogramAggregation, HistogramBounds}; use super::{HistogramAggregation, HistogramBounds};
use crate::aggregation::*; use crate::aggregation::AggregationError;
/// DateHistogramAggregation is similar to `HistogramAggregation`, but it can only be used with date /// DateHistogramAggregation is similar to `HistogramAggregation`, but it can only be used with date
/// type. /// type.
@@ -132,7 +132,6 @@ impl DateHistogramAggregationReq {
hard_bounds: self.hard_bounds, hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds, extended_bounds: self.extended_bounds,
keyed: self.keyed, keyed: self.keyed,
is_normalized_to_ns: false,
}) })
} }
@@ -244,15 +243,15 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
} }
#[cfg(test)] #[cfg(test)]
pub mod tests { mod tests {
use pretty_assertions::assert_eq; use pretty_assertions::assert_eq;
use super::*; use super::*;
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request; use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter, TantivyDocument}; use crate::Index;
#[test] #[test]
fn test_parse_into_millisecs() { fn test_parse_into_millisecs() {
@@ -307,9 +306,7 @@ pub mod tests {
) -> crate::Result<Index> { ) -> crate::Result<Index> {
let mut schema_builder = Schema::builder(); let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST); schema_builder.add_date_field("date", FAST);
schema_builder.add_json_field("mixed", FAST); schema_builder.add_text_field("text", FAST);
schema_builder.add_text_field("text", FAST | STRING);
schema_builder.add_text_field("text2", FAST | STRING);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone()); let index = Index::create_in_ram(schema.clone());
{ {
@@ -317,7 +314,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy)); index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs { for values in segment_and_docs {
for doc_str in values { for doc_str in values {
let doc = TantivyDocument::parse_json(&schema, doc_str)?; let doc = schema.parse_document(doc_str)?;
index_writer.add_document(doc)?; index_writer.add_document(doc)?;
} }
// writing the segment // writing the segment
@@ -329,7 +326,7 @@ pub mod tests {
.searchable_segment_ids() .searchable_segment_ids()
.expect("Searchable segments failed."); .expect("Searchable segments failed.");
if segment_ids.len() > 1 { if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?; let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?; index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?; index_writer.wait_merging_threads()?;
} }
@@ -352,10 +349,8 @@ pub mod tests {
let docs = vec![ let docs = vec![
vec![r#"{ "date": "2015-01-01T12:10:30Z", "text": "aaa" }"#], vec![r#"{ "date": "2015-01-01T12:10:30Z", "text": "aaa" }"#],
vec![r#"{ "date": "2015-01-01T11:11:30Z", "text": "bbb" }"#], vec![r#"{ "date": "2015-01-01T11:11:30Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-01T11:11:30Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb" }"#], vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-06T00:00:00Z", "text": "ccc" }"#], vec![r#"{ "date": "2015-01-06T00:00:00Z", "text": "ccc" }"#],
vec![r#"{ "date": "2015-01-06T00:00:00Z", "text": "ccc" }"#],
]; ];
let index = get_test_index_from_docs(merge_segments, &docs).unwrap(); let index = get_test_index_from_docs(merge_segments, &docs).unwrap();
@@ -384,7 +379,7 @@ pub mod tests {
{ {
"key_as_string" : "2015-01-01T00:00:00Z", "key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0, "key" : 1420070400000.0,
"doc_count" : 6 "doc_count" : 4
} }
] ]
} }
@@ -422,15 +417,15 @@ pub mod tests {
{ {
"key_as_string" : "2015-01-01T00:00:00Z", "key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0, "key" : 1420070400000.0,
"doc_count" : 6, "doc_count" : 4,
"texts": { "texts": {
"buckets": [ "buckets": [
{ {
"doc_count": 3, "doc_count": 2,
"key": "bbb" "key": "bbb"
}, },
{ {
"doc_count": 2, "doc_count": 1,
"key": "ccc" "key": "ccc"
}, },
{ {
@@ -469,7 +464,7 @@ pub mod tests {
"sales_over_time": { "sales_over_time": {
"buckets": [ "buckets": [
{ {
"doc_count": 3, "doc_count": 2,
"key": 1420070400000.0, "key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z" "key_as_string": "2015-01-01T00:00:00Z"
}, },
@@ -494,7 +489,7 @@ pub mod tests {
"key_as_string": "2015-01-05T00:00:00Z" "key_as_string": "2015-01-05T00:00:00Z"
}, },
{ {
"doc_count": 2, "doc_count": 1,
"key": 1420502400000.0, "key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z" "key_as_string": "2015-01-06T00:00:00Z"
} }
@@ -535,7 +530,7 @@ pub mod tests {
"key_as_string": "2014-12-31T00:00:00Z" "key_as_string": "2014-12-31T00:00:00Z"
}, },
{ {
"doc_count": 3, "doc_count": 2,
"key": 1420070400000.0, "key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z" "key_as_string": "2015-01-01T00:00:00Z"
}, },
@@ -560,7 +555,7 @@ pub mod tests {
"key_as_string": "2015-01-05T00:00:00Z" "key_as_string": "2015-01-05T00:00:00Z"
}, },
{ {
"doc_count": 2, "doc_count": 1,
"key": 1420502400000.0, "key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z" "key_as_string": "2015-01-06T00:00:00Z"
}, },

View File

@@ -1,5 +1,8 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::fmt::Display;
use columnar::ColumnType;
use itertools::Itertools;
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use tantivy_bitpacker::minmax; use tantivy_bitpacker::minmax;
@@ -15,9 +18,9 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateHistogramBucketEntry, IntermediateHistogramBucketEntry,
}; };
use crate::aggregation::segment_agg_result::{ use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector, build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
}; };
use crate::aggregation::*; use crate::aggregation::{f64_from_fastfield_u64, format_date};
use crate::TantivyError; use crate::TantivyError;
/// Histogram is a bucket aggregation, where buckets are created dynamically for given `interval`. /// Histogram is a bucket aggregation, where buckets are created dynamically for given `interval`.
@@ -70,7 +73,6 @@ pub struct HistogramAggregation {
pub field: String, pub field: String,
/// The interval to chunk your data range. Each bucket spans a value range of [0..interval). /// The interval to chunk your data range. Each bucket spans a value range of [0..interval).
/// Must be a positive value. /// Must be a positive value.
#[serde(deserialize_with = "deserialize_f64")]
pub interval: f64, pub interval: f64,
/// Intervals implicitly defines an absolute grid of buckets `[interval * k, interval * (k + /// Intervals implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
/// 1))`. /// 1))`.
@@ -83,7 +85,6 @@ pub struct HistogramAggregation {
/// fall into the buckets with the key 0 and 10. /// fall into the buckets with the key 0 and 10.
/// With offset 5 and interval 10, they would both fall into the bucket with they key 5 and the /// With offset 5 and interval 10, they would both fall into the bucket with they key 5 and the
/// range [5..15) /// range [5..15)
#[serde(default, deserialize_with = "deserialize_option_f64")]
pub offset: Option<f64>, pub offset: Option<f64>,
/// The minimum number of documents in a bucket to be returned. Defaults to 0. /// The minimum number of documents in a bucket to be returned. Defaults to 0.
pub min_doc_count: Option<u64>, pub min_doc_count: Option<u64>,
@@ -121,14 +122,11 @@ pub struct HistogramAggregation {
/// Whether to return the buckets as a hash map /// Whether to return the buckets as a hash map
#[serde(default)] #[serde(default)]
pub keyed: bool, pub keyed: bool,
/// Whether the values are normalized to ns for date time values. Defaults to false.
#[serde(default)]
pub is_normalized_to_ns: bool,
} }
impl HistogramAggregation { impl HistogramAggregation {
pub(crate) fn normalize_date_time(&mut self) { pub(crate) fn normalize(&mut self, column_type: ColumnType) {
if !self.is_normalized_to_ns { if column_type.is_date_time() {
// values are provided in ms, but the fastfield is in nano seconds // values are provided in ms, but the fastfield is in nano seconds
self.interval *= 1_000_000.0; self.interval *= 1_000_000.0;
self.offset = self.offset.map(|off| off * 1_000_000.0); self.offset = self.offset.map(|off| off * 1_000_000.0);
@@ -140,7 +138,6 @@ impl HistogramAggregation {
min: bounds.min * 1_000_000.0, min: bounds.min * 1_000_000.0,
max: bounds.max * 1_000_000.0, max: bounds.max * 1_000_000.0,
}); });
self.is_normalized_to_ns = true;
} }
} }
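
Both variants of the normalization helper scale the user-supplied histogram parameters from milliseconds to nanoseconds, because the request (like Elasticsearch) speaks milliseconds while the date fast field stores nanoseconds; the `is_normalized_to_ns` flag simply prevents scaling twice. In numbers:

// An interval of one day reaches the histogram as 86_400_000 ms (either set
// directly or derived from `fixed_interval: "1d"`), but it has to bucket
// nanosecond timestamps, so it is scaled exactly once:
let interval_ms = 86_400_000.0_f64;
let interval_ns = interval_ms * 1_000_000.0;
assert_eq!(interval_ns, 86_400_000_000_000.0); // 8.64e13 ns per bucket
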
@@ -307,10 +304,7 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
.column_block_accessor .column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor); .fetch_block(docs, &bucket_agg_accessor.accessor);
for (doc, val) in bucket_agg_accessor for (doc, val) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let val = self.f64_from_fastfield_u64(val); let val = self.f64_from_fastfield_u64(val);
let bucket_pos = get_bucket_pos(val); let bucket_pos = get_bucket_pos(val);
@@ -331,11 +325,9 @@ impl SegmentAggregationCollector for SegmentHistogramCollector {
} }
let mem_delta = self.get_memory_consumption() - mem_pre; let mem_delta = self.get_memory_consumption() - mem_pre;
if mem_delta > 0 { bucket_agg_accessor
bucket_agg_accessor .limits
.limits .add_memory_consumed(mem_delta as u64)?;
.add_memory_consumed(mem_delta as u64)?;
}
Ok(()) Ok(())
} }
@@ -378,7 +370,7 @@ impl SegmentHistogramCollector {
Ok(IntermediateBucketResult::Histogram { Ok(IntermediateBucketResult::Histogram {
buckets, buckets,
is_date_agg: self.column_type == ColumnType::DateTime, column_type: Some(self.column_type),
}) })
} }
@@ -389,9 +381,7 @@ impl SegmentHistogramCollector {
accessor_idx: usize, accessor_idx: usize,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
req.validate()?; req.validate()?;
if field_type == ColumnType::DateTime { req.normalize(field_type);
req.normalize_date_time();
}
let sub_aggregation_blueprint = if sub_aggregation.is_empty() { let sub_aggregation_blueprint = if sub_aggregation.is_empty() {
None None
@@ -449,7 +439,6 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// memory check upfront // memory check upfront
let (_, first_bucket_num, last_bucket_num) = let (_, first_bucket_num, last_bucket_num) =
generate_bucket_pos_with_opt_minmax(histogram_req, min_max); generate_bucket_pos_with_opt_minmax(histogram_req, min_max);
// It's based on user input, so we need to account for overflows // It's based on user input, so we need to account for overflows
let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64) let added_buckets = ((last_bucket_num.saturating_sub(first_bucket_num)).max(0) as u64)
.saturating_sub(buckets.len() as u64); .saturating_sub(buckets.len() as u64);
@@ -493,7 +482,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
// Convert to BucketEntry // Convert to BucketEntry
pub(crate) fn intermediate_histogram_buckets_to_final_buckets( pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
buckets: Vec<IntermediateHistogramBucketEntry>, buckets: Vec<IntermediateHistogramBucketEntry>,
is_date_agg: bool, column_type: Option<ColumnType>,
histogram_req: &HistogramAggregation, histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations, sub_aggregation: &Aggregations,
limits: &AggregationLimits, limits: &AggregationLimits,
@@ -502,8 +491,8 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// The request used in the the call to final is not yet be normalized. // The request used in the the call to final is not yet be normalized.
// Normalization is changing the precision from milliseconds to nanoseconds. // Normalization is changing the precision from milliseconds to nanoseconds.
let mut histogram_req = histogram_req.clone(); let mut histogram_req = histogram_req.clone();
if is_date_agg { if let Some(column_type) = column_type {
histogram_req.normalize_date_time(); histogram_req.normalize(column_type);
} }
let mut buckets = if histogram_req.min_doc_count() == 0 { let mut buckets = if histogram_req.min_doc_count() == 0 {
// With min_doc_count != 0, we may need to add buckets, so that there are no // With min_doc_count != 0, we may need to add buckets, so that there are no
@@ -527,7 +516,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
// If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339 // If we have a date type on the histogram buckets, we add the `key_as_string` field as rfc339
// and normalize from nanoseconds to milliseconds // and normalize from nanoseconds to milliseconds
if is_date_agg { if column_type == Some(ColumnType::DateTime) {
for bucket in buckets.iter_mut() { for bucket in buckets.iter_mut() {
if let crate::aggregation::Key::F64(ref mut val) = bucket.key { if let crate::aggregation::Key::F64(ref mut val) = bucket.key {
let key_as_string = format_date(*val as i64)?; let key_as_string = format_date(*val as i64)?;
@@ -599,12 +588,11 @@ mod tests {
use serde_json::Value; use serde_json::Value;
use super::*; use super::*;
use crate::aggregation::agg_result::AggregationResults; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{ use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit, exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs, get_test_index_2_segments, get_test_index_from_values, get_test_index_with_num_docs,
}; };
use crate::query::AllQuery;
#[test] #[test]
fn histogram_test_crooked_values() -> crate::Result<()> { fn histogram_test_crooked_values() -> crate::Result<()> {
@@ -1356,35 +1344,6 @@ mod tests {
}) })
); );
Ok(())
}
#[test]
fn test_aggregation_histogram_empty_index() -> crate::Result<()> {
// test index without segments
let values = vec![];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = serde_json::from_value(json!({
"myhisto": {
"histogram": {
"field": "score",
"interval": 10.0
},
}
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
// Make sure the result structure is correct
assert_eq!(res["myhisto"]["buckets"].as_array().unwrap().len(), 0);
Ok(()) Ok(())
} }
} }

View File

@@ -28,7 +28,6 @@ mod term_agg;
mod term_missing_agg; mod term_missing_agg;
use std::collections::HashMap; use std::collections::HashMap;
use std::fmt;
pub use histogram::*; pub use histogram::*;
pub use range::*; pub use range::*;
@@ -73,12 +72,12 @@ impl From<&str> for OrderTarget {
} }
} }
impl fmt::Display for OrderTarget { impl ToString for OrderTarget {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn to_string(&self) -> String {
match self { match self {
OrderTarget::Key => f.write_str("_key"), OrderTarget::Key => "_key".to_string(),
OrderTarget::Count => f.write_str("_count"), OrderTarget::Count => "_count".to_string(),
OrderTarget::SubAggregation(agg) => agg.fmt(f), OrderTarget::SubAggregation(agg) => agg.to_string(),
} }
} }
} }
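
One side of this hunk implements `ToString` for `OrderTarget` directly, the other implements `std::fmt::Display`. Implementing `Display` is generally preferable because the standard library's blanket `impl<T: Display> ToString for T` then supplies `to_string()` for free; a minimal sketch with a trimmed-down enum:

use std::fmt;

enum OrderTarget {
    Key,
    Count,
}

// Implementing `Display` instead of `ToString` is the idiomatic choice:
// the blanket `impl<T: Display> ToString for T` provides `to_string()`
// automatically, and the type also works with `format!`.
impl fmt::Display for OrderTarget {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            OrderTarget::Key => f.write_str("_key"),
            OrderTarget::Count => f.write_str("_count"),
        }
    }
}

fn main() {
    assert_eq!(OrderTarget::Key.to_string(), "_key");
    assert_eq!(format!("order by {}", OrderTarget::Count), "order by _count");
}
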

View File

@@ -1,6 +1,7 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::ops::Range; use std::ops::Range;
use columnar::{ColumnType, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -13,7 +14,9 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{ use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector, build_segment_agg_collector, SegmentAggregationCollector,
}; };
use crate::aggregation::*; use crate::aggregation::{
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
};
use crate::TantivyError; use crate::TantivyError;
/// Provide user-defined buckets to aggregate on. /// Provide user-defined buckets to aggregate on.
@@ -69,19 +72,11 @@ pub struct RangeAggregationRange {
pub key: Option<String>, pub key: Option<String>,
/// The from range value, which is inclusive in the range. /// The from range value, which is inclusive in the range.
/// `None` equals to an open ended interval. /// `None` equals to an open ended interval.
#[serde( #[serde(skip_serializing_if = "Option::is_none", default)]
skip_serializing_if = "Option::is_none",
default,
deserialize_with = "deserialize_option_f64"
)]
pub from: Option<f64>, pub from: Option<f64>,
/// The to range value, which is not inclusive in the range. /// The to range value, which is not inclusive in the range.
/// `None` equals to an open ended interval. /// `None` equals to an open ended interval.
#[serde( #[serde(skip_serializing_if = "Option::is_none", default)]
skip_serializing_if = "Option::is_none",
default,
deserialize_with = "deserialize_option_f64"
)]
pub to: Option<f64>, pub to: Option<f64>,
} }
@@ -235,10 +230,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
.column_block_accessor .column_block_accessor
.fetch_block(docs, &bucket_agg_accessor.accessor); .fetch_block(docs, &bucket_agg_accessor.accessor);
for (doc, val) in bucket_agg_accessor for (doc, val) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let bucket_pos = self.get_bucket_pos(val); let bucket_pos = self.get_bucket_pos(val);
let bucket = &mut self.buckets[bucket_pos]; let bucket = &mut self.buckets[bucket_pos];
@@ -449,6 +441,7 @@ pub(crate) fn range_to_key(range: &Range<u64>, field_type: &ColumnType) -> crate
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use columnar::MonotonicallyMappableToU64;
use serde_json::Value; use serde_json::Value;
use super::*; use super::*;
@@ -457,6 +450,7 @@ mod tests {
exec_request, exec_request_with_query, get_test_index_2_segments, exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs, get_test_index_with_num_docs,
}; };
use crate::aggregation::AggregationLimits;
pub fn get_collector_from_ranges( pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>, ranges: Vec<RangeAggregationRange>,

View File

@@ -1,10 +1,6 @@
use std::fmt::Debug; use std::fmt::Debug;
use std::net::Ipv6Addr;
use columnar::column_values::CompactSpaceU64Accessor; use columnar::{BytesColumn, ColumnType, StrColumn};
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
};
use rustc_hash::FxHashMap; use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
@@ -20,7 +16,7 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{ use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector, build_segment_agg_collector, SegmentAggregationCollector,
}; };
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key}; use crate::aggregation::{f64_from_fastfield_u64, Key};
use crate::error::DataCorruption; use crate::error::DataCorruption;
use crate::TantivyError; use crate::TantivyError;
@@ -103,14 +99,23 @@ pub struct TermsAggregation {
#[serde(skip_serializing_if = "Option::is_none", default)] #[serde(skip_serializing_if = "Option::is_none", default)]
pub size: Option<u32>, pub size: Option<u32>,
/// To get more accurate results, we fetch more than `size` from each segment. /// Unused by tantivy.
///
/// Since tantivy doesn't know shards, this parameter is merely there to be used by consumers
/// of tantivy. shard_size is the number of terms returned by each shard.
/// The default value in elasticsearch is size * 1.5 + 10.
///
/// Should never be smaller than size.
#[serde(skip_serializing_if = "Option::is_none", default)]
#[serde(alias = "shard_size")]
pub split_size: Option<u32>,
/// The get more accurate results, we fetch more than `size` from each segment.
/// ///
/// Increasing this value is will increase the cost for more accuracy. /// Increasing this value is will increase the cost for more accuracy.
/// ///
/// Defaults to 10 * size. /// Defaults to 10 * size.
#[serde(skip_serializing_if = "Option::is_none", default)] #[serde(skip_serializing_if = "Option::is_none", default)]
#[serde(alias = "shard_size")]
#[serde(alias = "split_size")]
pub segment_size: Option<u32>, pub segment_size: Option<u32>,
/// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will /// If you set the `show_term_doc_count_error` parameter to true, the terms aggregation will
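
Both versions of `TermsAggregation` expose a per-segment fetch size (`segment_size`, with `shard_size`/`split_size` accepted either as serde aliases or as a separate pass-through field): it caps how many terms each segment reports before the merge step, defaults to `10 * size`, and should not be smaller than `size`. A request tuning both knobs might look like this sketch (the field name is illustrative):

use tantivy::aggregation::agg_req::Aggregations;

let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "top_terms": {
        "terms": {
            "field": "category",
            // Number of buckets returned after merging all segments.
            "size": 10,
            // Terms fetched per segment before merging; larger values trade
            // speed for accuracy. Defaults to 10 * size when omitted.
            "segment_size": 100
        }
    }
}))
.expect("valid terms aggregation");
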
@@ -251,7 +256,7 @@ pub struct SegmentTermCollector {
term_buckets: TermBuckets, term_buckets: TermBuckets,
req: TermsAggregationInternal, req: TermsAggregationInternal,
blueprint: Option<Box<dyn SegmentAggregationCollector>>, blueprint: Option<Box<dyn SegmentAggregationCollector>>,
column_type: ColumnType, field_type: ColumnType,
accessor_idx: usize, accessor_idx: usize,
} }
@@ -310,10 +315,7 @@ impl SegmentAggregationCollector for SegmentTermCollector {
} }
// has subagg // has subagg
if let Some(blueprint) = self.blueprint.as_ref() { if let Some(blueprint) = self.blueprint.as_ref() {
for (doc, term_id) in bucket_agg_accessor for (doc, term_id) in bucket_agg_accessor.column_block_accessor.iter_docid_vals() {
.column_block_accessor
.iter_docid_vals(docs, &bucket_agg_accessor.accessor)
{
let sub_aggregations = self let sub_aggregations = self
.term_buckets .term_buckets
.sub_aggs .sub_aggs
@@ -324,11 +326,9 @@ impl SegmentAggregationCollector for SegmentTermCollector {
} }
let mem_delta = self.get_memory_consumption() - mem_pre; let mem_delta = self.get_memory_consumption() - mem_pre;
if mem_delta > 0 { bucket_agg_accessor
bucket_agg_accessor .limits
.limits .add_memory_consumed(mem_delta as u64)?;
.add_memory_consumed(mem_delta as u64)?;
}
Ok(()) Ok(())
} }
@@ -355,9 +355,10 @@ impl SegmentTermCollector {
field_type: ColumnType, field_type: ColumnType,
accessor_idx: usize, accessor_idx: usize,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
if field_type == ColumnType::Bytes { if field_type == ColumnType::Bytes || field_type == ColumnType::Bool {
return Err(TantivyError::InvalidArgument(format!( return Err(TantivyError::InvalidArgument(format!(
"terms aggregation is not supported for column type {field_type:?}" "terms aggregation is not supported for column type {:?}",
field_type
))); )));
} }
let term_buckets = TermBuckets::default(); let term_buckets = TermBuckets::default();
@@ -388,7 +389,7 @@ impl SegmentTermCollector {
req: TermsAggregationInternal::from_req(req), req: TermsAggregationInternal::from_req(req),
term_buckets, term_buckets,
blueprint, blueprint,
column_type: field_type, field_type,
accessor_idx, accessor_idx,
}) })
} }
@@ -465,7 +466,7 @@ impl SegmentTermCollector {
Ok(intermediate_entry) Ok(intermediate_entry)
}; };
if self.column_type == ColumnType::Str { if self.field_type == ColumnType::Str {
let term_dict = agg_with_accessor let term_dict = agg_with_accessor
.str_dict_column .str_dict_column
.as_ref() .as_ref()
@@ -530,55 +531,21 @@ impl SegmentTermCollector {
}); });
} }
} }
} else if self.column_type == ColumnType::DateTime {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = i64::from_u64(val);
let date = format_date(val)?;
dict.insert(IntermediateKey::Str(date), intermediate_entry);
}
} else if self.column_type == ColumnType::Bool {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = bool::from_u64(val);
dict.insert(IntermediateKey::Bool(val), intermediate_entry);
}
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = agg_with_accessor
.accessor
.values
.clone()
.downcast_arc::<CompactSpaceU64Accessor>()
.map_err(|_| {
TantivyError::AggregationError(
crate::aggregation::AggregationError::InternalError(
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
.to_string(),
),
)
})?;
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
let val = Ipv6Addr::from_u128(val);
dict.insert(IntermediateKey::IpAddr(val), intermediate_entry);
}
} else { } else {
for (val, doc_count) in entries { for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?; let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = f64_from_fastfield_u64(val, &self.column_type); let val = f64_from_fastfield_u64(val, &self.field_type);
dict.insert(IntermediateKey::F64(val), intermediate_entry); dict.insert(IntermediateKey::F64(val), intermediate_entry);
} }
}; };
Ok(IntermediateBucketResult::Terms { Ok(IntermediateBucketResult::Terms(
buckets: IntermediateTermBucketResult { IntermediateTermBucketResult {
entries: dict, entries: dict,
sum_other_doc_count, sum_other_doc_count,
doc_count_error_upper_bound: term_doc_count_before_cutoff, doc_count_error_upper_bound: term_doc_count_before_cutoff,
}, },
}) ))
} }
} }
@@ -616,12 +583,6 @@ pub(crate) fn cut_off_buckets<T: GetDocCount + Debug>(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use std::net::IpAddr;
use std::str::FromStr;
use common::DateTime;
use time::{Date, Month};
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{ use crate::aggregation::tests::{
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit, exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
@@ -629,8 +590,8 @@ mod tests {
}; };
use crate::aggregation::AggregationLimits; use crate::aggregation::AggregationLimits;
use crate::indexer::NoMergePolicy; use crate::indexer::NoMergePolicy;
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING}; use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn terms_aggregation_test_single_segment() -> crate::Result<()> { fn terms_aggregation_test_single_segment() -> crate::Result<()> {
@@ -1211,9 +1172,9 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma"); assert_eq!(res["my_texts"]["buckets"][0]["key"], "terma");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4); assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb"); assert_eq!(res["my_texts"]["buckets"][1]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc"); assert_eq!(res["my_texts"]["buckets"][2]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0); assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0); assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0); assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
@@ -1394,7 +1355,7 @@ mod tests {
#[test] #[test]
fn terms_aggregation_different_tokenizer_on_ff_test() -> crate::Result<()> { fn terms_aggregation_different_tokenizer_on_ff_test() -> crate::Result<()> {
let terms = vec!["Hello Hello", "Hallo Hallo", "Hallo Hallo"]; let terms = vec!["Hello Hello", "Hallo Hallo"];
let index = get_test_index_from_terms(true, &[terms])?; let index = get_test_index_from_terms(true, &[terms])?;
@@ -1412,7 +1373,7 @@ mod tests {
println!("{}", serde_json::to_string_pretty(&res).unwrap()); println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo"); assert_eq!(res["my_texts"]["buckets"][0]["key"], "Hallo Hallo");
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 2); assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 1);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello"); assert_eq!(res["my_texts"]["buckets"][1]["key"], "Hello Hello");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1); assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 1);
@@ -1502,7 +1463,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -1852,151 +1813,4 @@ mod tests {
Ok(()) Ok(())
} }
#[test]
fn terms_aggregation_date() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_date_missing() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1982, Month::September, 17)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!(date_field=>DateTime::from_primitive(Date::from_calendar_date(1983, Month::September, 27)?.with_hms(0, 0, 0)?)))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_date": {
"terms": {
"field": "date_field",
"missing": "1982-09-17T00:00:00Z"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// date_field field
assert_eq!(res["my_date"]["buckets"][0]["key"], "1982-09-17T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["my_date"]["buckets"][1]["key"], "1983-09-27T00:00:00Z");
assert_eq!(res["my_date"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_date"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_bool() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_bool_field("bool_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
writer.add_document(doc!(field=>true))?;
writer.add_document(doc!(field=>false))?;
writer.add_document(doc!(field=>true))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_bool": {
"terms": {
"field": "bool_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(res["my_bool"]["buckets"][0]["key"], 1.0);
assert_eq!(res["my_bool"]["buckets"][0]["key_as_string"], "true");
assert_eq!(res["my_bool"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_bool"]["buckets"][1]["key"], 0.0);
assert_eq!(res["my_bool"]["buckets"][1]["key_as_string"], "false");
assert_eq!(res["my_bool"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_bool"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_ip_addr() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut writer = index.writer_with_num_threads(1, 15_000_000)?;
// IpV6 loopback
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
// IpV4
writer.add_document(
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
)?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_bool": {
"terms": {
"field": "ip_field"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// print as json
// println!("{}", serde_json::to_string_pretty(&res).unwrap());
assert_eq!(res["my_bool"]["buckets"][0]["key"], "::1");
assert_eq!(res["my_bool"]["buckets"][0]["doc_count"], 2);
assert_eq!(res["my_bool"]["buckets"][1]["key"], "127.0.0.1");
assert_eq!(res["my_bool"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_bool"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
} }
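
The sizing knobs documented earlier in this file (`size`, plus `segment_size` with its `shard_size`/`split_size` aliases on the newer side of the diff) and `show_term_doc_count_error` are all set directly in the JSON request. A minimal sketch following the serde_json::from_value pattern used by the tests above; the aggregation name "genres" and field name "genre" are only placeholders:

    use serde_json::json;
    use tantivy::aggregation::agg_req::Aggregations;

    fn build_terms_request() -> serde_json::Result<Aggregations> {
        // `size`: number of term buckets kept in the final result.
        // `segment_size`: how many terms each segment reports before the final
        // cut-off; raising it trades speed for accuracy (defaults to 10 * size).
        // `show_term_doc_count_error`: include doc_count_error_upper_bound.
        serde_json::from_value(json!({
            "genres": {
                "terms": {
                    "field": "genre",
                    "size": 10,
                    "segment_size": 100,
                    "show_term_doc_count_error": true
                }
            }
        }))
    }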

View File

@@ -73,13 +73,11 @@ impl SegmentAggregationCollector for TermMissingAgg {
entries.insert(missing.into(), missing_entry); entries.insert(missing.into(), missing_entry);
let bucket = IntermediateBucketResult::Terms { let bucket = IntermediateBucketResult::Terms(IntermediateTermBucketResult {
buckets: IntermediateTermBucketResult { entries,
entries, sum_other_doc_count: 0,
sum_other_doc_count: 0, doc_count_error_upper_bound: 0,
doc_count_error_upper_bound: 0, });
},
};
results.push(name, IntermediateAggregationResult::Bucket(bucket))?; results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
@@ -92,10 +90,7 @@ impl SegmentAggregationCollector for TermMissingAgg {
agg_with_accessor: &mut AggregationsWithAccessor, agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> { ) -> crate::Result<()> {
let agg = &mut agg_with_accessor.aggs.values[self.accessor_idx]; let agg = &mut agg_with_accessor.aggs.values[self.accessor_idx];
let has_value = agg let has_value = agg.accessors.iter().any(|acc| acc.index.has_value(doc));
.accessors
.iter()
.any(|(acc, _)| acc.index.has_value(doc));
if !has_value { if !has_value {
self.missing_count += 1; self.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() { if let Some(sub_agg) = self.sub_agg.as_mut() {
@@ -122,7 +117,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query; use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> { fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
@@ -131,7 +126,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0}))) .add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
@@ -191,7 +186,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?; index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
@@ -236,7 +231,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -283,7 +278,7 @@ mod tests {
let score = schema_builder.add_f64_field("score", FAST); let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?; index_writer.add_document(doc!(score => 5.0))?;
@@ -328,7 +323,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -390,7 +385,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -432,7 +427,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric // => Segment with all values numeric
index_writer index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0}))) .add_document(doc!(json => json!({"mixed_type": 10.0})))

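The missing-value handling exercised by the tests above is driven by the `missing` key of the terms request: documents without a value in the targeted (possibly JSON) fast field are counted under that bucket instead of being dropped. A minimal sketch; the aggregation name is a placeholder, the field path `json.mixed_type` mirrors the tests, and the missing value 0.0 is only illustrative:

    use serde_json::json;
    use tantivy::aggregation::agg_req::Aggregations;

    fn build_terms_with_missing() -> serde_json::Result<Aggregations> {
        // Documents lacking `json.mixed_type` land in the bucket keyed 0.0.
        serde_json::from_value(json!({
            "mixed_type_terms": {
                "terms": {
                    "field": "json.mixed_type",
                    "missing": 0.0
                }
            }
        }))
    }
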
View File

@@ -8,8 +8,7 @@ use super::segment_agg_result::{
}; };
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate; use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector}; use crate::collector::{Collector, SegmentCollector};
use crate::index::SegmentReader; use crate::{DocId, SegmentReader, TantivyError};
use crate::{DocId, SegmentOrdinal, TantivyError};
/// The default max bucket count, before the aggregation fails. /// The default max bucket count, before the aggregation fails.
pub const DEFAULT_BUCKET_LIMIT: u32 = 65000; pub const DEFAULT_BUCKET_LIMIT: u32 = 65000;
@@ -65,15 +64,10 @@ impl Collector for DistributedAggregationCollector {
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: crate::SegmentOrdinal, _segment_local_id: crate::SegmentOrdinal,
reader: &crate::SegmentReader, reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> { ) -> crate::Result<Self::Child> {
AggregationSegmentCollector::from_agg_req_and_reader( AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader, &self.limits)
&self.agg,
reader,
segment_local_id,
&self.limits,
)
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
@@ -95,15 +89,10 @@ impl Collector for AggregationCollector {
fn for_segment( fn for_segment(
&self, &self,
segment_local_id: crate::SegmentOrdinal, _segment_local_id: crate::SegmentOrdinal,
reader: &crate::SegmentReader, reader: &crate::SegmentReader,
) -> crate::Result<Self::Child> { ) -> crate::Result<Self::Child> {
AggregationSegmentCollector::from_agg_req_and_reader( AggregationSegmentCollector::from_agg_req_and_reader(&self.agg, reader, &self.limits)
&self.agg,
reader,
segment_local_id,
&self.limits,
)
} }
fn requires_scoring(&self) -> bool { fn requires_scoring(&self) -> bool {
@@ -146,11 +135,10 @@ impl AggregationSegmentCollector {
pub fn from_agg_req_and_reader( pub fn from_agg_req_and_reader(
agg: &Aggregations, agg: &Aggregations,
reader: &SegmentReader, reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits, limits: &AggregationLimits,
) -> crate::Result<Self> { ) -> crate::Result<Self> {
let mut aggs_with_accessor = let mut aggs_with_accessor =
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?; get_aggs_with_segment_accessor_and_validate(agg, reader, limits)?;
let result = let result =
BufAggregationCollector::new(build_segment_agg_collector(&mut aggs_with_accessor)?); BufAggregationCollector::new(build_segment_agg_collector(&mut aggs_with_accessor)?);
Ok(AggregationSegmentCollector { Ok(AggregationSegmentCollector {

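Both collectors above are driven the same way from the search side; only their fruit differs (final results vs. mergeable intermediate results). A rough sketch of the non-distributed path, assuming the `AggregationCollector::from_aggs(Aggregations, AggregationLimits)` constructor (not shown in this diff) and default limits:

    use tantivy::aggregation::agg_req::Aggregations;
    use tantivy::aggregation::agg_result::AggregationResults;
    use tantivy::aggregation::{AggregationCollector, AggregationLimits};
    use tantivy::query::AllQuery;
    use tantivy::{Index, TantivyError};

    fn run_aggregation(index: &Index, agg: Aggregations) -> Result<AggregationResults, TantivyError> {
        // One AggregationSegmentCollector is built per segment via `for_segment`
        // above; the collector then merges the per-segment fruits for us.
        let collector = AggregationCollector::from_aggs(agg, AggregationLimits::default());
        let searcher = index.reader()?.searcher();
        searcher.search(&AllQuery, &collector)
    }
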
View File

@@ -5,7 +5,6 @@
use std::cmp::Ordering; use std::cmp::Ordering;
use std::collections::hash_map::Entry; use std::collections::hash_map::Entry;
use std::hash::Hash; use std::hash::Hash;
use std::net::Ipv6Addr;
use columnar::ColumnType; use columnar::ColumnType;
use itertools::Itertools; use itertools::Itertools;
@@ -20,7 +19,7 @@ use super::bucket::{
}; };
use super::metric::{ use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats, IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer, IntermediateSum, PercentilesCollector,
}; };
use super::segment_agg_result::AggregationLimits; use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey}; use super::{format_date, AggregationError, Key, SerializedKey};
@@ -42,10 +41,6 @@ pub struct IntermediateAggregationResults {
/// This might seem redundant with `Key`, but the point is to have a different /// This might seem redundant with `Key`, but the point is to have a different
/// Serialize implementation. /// Serialize implementation.
pub enum IntermediateKey { pub enum IntermediateKey {
/// Ip Addr key
IpAddr(Ipv6Addr),
/// Bool key
Bool(bool),
/// String key /// String key
Str(String), Str(String),
/// `f64` key /// `f64` key
@@ -63,16 +58,7 @@ impl From<IntermediateKey> for Key {
fn from(value: IntermediateKey) -> Self { fn from(value: IntermediateKey) -> Self {
match value { match value {
IntermediateKey::Str(s) => Self::Str(s), IntermediateKey::Str(s) => Self::Str(s),
IntermediateKey::IpAddr(s) => {
// Prefer to use the IPv4 representation if possible
if let Some(ip) = s.to_ipv4_mapped() {
Self::Str(ip.to_string())
} else {
Self::Str(s.to_string())
}
}
IntermediateKey::F64(f) => Self::F64(f), IntermediateKey::F64(f) => Self::F64(f),
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
} }
} }
} }
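
The "prefer IPv4" branch in the conversion above leans on the standard library: IPv4 addresses are stored in the ip fast field as IPv4-mapped IPv6 addresses (see `into_ipv6_addr` in the tests earlier in this diff), and `to_ipv4_mapped` recovers the dotted form for display while genuine IPv6 addresses keep their own formatting. A small illustration using only std types:

    use std::net::Ipv6Addr;
    use std::str::FromStr;

    fn ip_key_string(addr: Ipv6Addr) -> String {
        // IPv4-mapped addresses (::ffff:a.b.c.d) are shown in dotted form,
        // everything else in IPv6 form, which matches the bucket keys in the tests.
        match addr.to_ipv4_mapped() {
            Some(v4) => v4.to_string(),
            None => addr.to_string(),
        }
    }

    fn main() {
        assert_eq!(ip_key_string(Ipv6Addr::from_str("::ffff:127.0.0.1").unwrap()), "127.0.0.1");
        assert_eq!(ip_key_string(Ipv6Addr::from_str("::1").unwrap()), "::1");
    }
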
@@ -85,8 +71,6 @@ impl std::hash::Hash for IntermediateKey {
match self { match self {
IntermediateKey::Str(text) => text.hash(state), IntermediateKey::Str(text) => text.hash(state),
IntermediateKey::F64(val) => val.to_bits().hash(state), IntermediateKey::F64(val) => val.to_bits().hash(state),
IntermediateKey::Bool(val) => val.hash(state),
IntermediateKey::IpAddr(val) => val.hash(state),
} }
} }
} }
@@ -182,22 +166,16 @@ impl IntermediateAggregationResults {
pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult { pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult {
use AggregationVariants::*; use AggregationVariants::*;
match req.agg { match req.agg {
Terms(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Terms { Terms(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Terms(
buckets: Default::default(), Default::default(),
}), )),
Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range( Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
Default::default(), Default::default(),
)), )),
Histogram(_) => { Histogram(_) | DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram { IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(), buckets: Vec::new(),
is_date_agg: false, column_type: None,
})
}
DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
is_date_agg: true,
}) })
} }
Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average( Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
@@ -221,9 +199,6 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Percentiles(_) => IntermediateAggregationResult::Metric( Percentiles(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Percentiles(PercentilesCollector::default()), IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
), ),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
),
} }
} }
@@ -284,8 +259,6 @@ pub enum IntermediateMetricResult {
Stats(IntermediateStats), Stats(IntermediateStats),
/// Intermediate sum result. /// Intermediate sum result.
Sum(IntermediateSum), Sum(IntermediateSum),
/// Intermediate top_hits result
TopHits(TopHitsTopNComputer),
} }
impl IntermediateMetricResult { impl IntermediateMetricResult {
@@ -313,13 +286,9 @@ impl IntermediateMetricResult {
percentiles percentiles
.into_final_result(req.agg.as_percentile().expect("unexpected metric type")), .into_final_result(req.agg.as_percentile().expect("unexpected metric type")),
), ),
IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result())
}
} }
} }
// TODO: this is our top-of-the-chain fruit merge mech
fn merge_fruits(&mut self, other: IntermediateMetricResult) -> crate::Result<()> { fn merge_fruits(&mut self, other: IntermediateMetricResult) -> crate::Result<()> {
match (self, other) { match (self, other) {
( (
@@ -355,9 +324,6 @@ impl IntermediateMetricResult {
) => { ) => {
left.merge_fruits(right)?; left.merge_fruits(right)?;
} }
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
left.merge_fruits(right)?;
}
_ => { _ => {
panic!("incompatible fruit types in tree or missing merge_fruits handler"); panic!("incompatible fruit types in tree or missing merge_fruits handler");
} }
@@ -377,16 +343,13 @@ pub enum IntermediateBucketResult {
/// This is the histogram entry for a bucket, which contains a key, count, and optionally /// This is the histogram entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations. /// sub_aggregations.
Histogram { Histogram {
/// Whether the column_type of the underlying `Column` is DateTime /// The column_type of the underlying `Column`
is_date_agg: bool, column_type: Option<ColumnType>,
/// The histogram buckets /// The buckets
buckets: Vec<IntermediateHistogramBucketEntry>, buckets: Vec<IntermediateHistogramBucketEntry>,
}, },
/// Term aggregation /// Term aggregation
Terms { Terms(IntermediateTermBucketResult),
/// The term buckets
buckets: IntermediateTermBucketResult,
},
} }
impl IntermediateBucketResult { impl IntermediateBucketResult {
@@ -436,7 +399,7 @@ impl IntermediateBucketResult {
Ok(BucketResult::Range { buckets }) Ok(BucketResult::Range { buckets })
} }
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
is_date_agg, column_type,
buckets, buckets,
} => { } => {
let histogram_req = &req let histogram_req = &req
@@ -445,7 +408,7 @@ impl IntermediateBucketResult {
.expect("unexpected aggregation, expected histogram aggregation"); .expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets( let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets, buckets,
is_date_agg, column_type,
histogram_req, histogram_req,
req.sub_aggregation(), req.sub_aggregation(),
limits, limits,
@@ -463,7 +426,7 @@ impl IntermediateBucketResult {
}; };
Ok(BucketResult::Histogram { buckets }) Ok(BucketResult::Histogram { buckets })
} }
IntermediateBucketResult::Terms { buckets: terms } => terms.into_final_result( IntermediateBucketResult::Terms(terms) => terms.into_final_result(
req.agg req.agg
.as_term() .as_term()
.expect("unexpected aggregation, expected term aggregation"), .expect("unexpected aggregation, expected term aggregation"),
@@ -476,12 +439,8 @@ impl IntermediateBucketResult {
fn merge_fruits(&mut self, other: IntermediateBucketResult) -> crate::Result<()> { fn merge_fruits(&mut self, other: IntermediateBucketResult) -> crate::Result<()> {
match (self, other) { match (self, other) {
( (
IntermediateBucketResult::Terms { IntermediateBucketResult::Terms(term_res_left),
buckets: term_res_left, IntermediateBucketResult::Terms(term_res_right),
},
IntermediateBucketResult::Terms {
buckets: term_res_right,
},
) => { ) => {
merge_maps(&mut term_res_left.entries, term_res_right.entries)?; merge_maps(&mut term_res_left.entries, term_res_right.entries)?;
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count; term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
@@ -498,11 +457,11 @@ impl IntermediateBucketResult {
( (
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
buckets: buckets_left, buckets: buckets_left,
is_date_agg: _, ..
}, },
IntermediateBucketResult::Histogram { IntermediateBucketResult::Histogram {
buckets: buckets_right, buckets: buckets_right,
is_date_agg: _, ..
}, },
) => { ) => {
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> = let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =
@@ -565,15 +524,8 @@ impl IntermediateTermBucketResult {
.into_iter() .into_iter()
.filter(|bucket| bucket.1.doc_count as u64 >= req.min_doc_count) .filter(|bucket| bucket.1.doc_count as u64 >= req.min_doc_count)
.map(|(key, entry)| { .map(|(key, entry)| {
let key_as_string = match key {
IntermediateKey::Bool(key) => {
let val = if key { "true" } else { "false" };
Some(val.to_string())
}
_ => None,
};
Ok(BucketEntry { Ok(BucketEntry {
key_as_string, key_as_string: None,
key: key.into(), key: key.into(),
doc_count: entry.doc_count as u64, doc_count: entry.doc_count as u64,
sub_aggregation: entry sub_aggregation: entry

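The `merge_fruits` plumbing above is what makes intermediate results shippable: each node (or segment) produces an `IntermediateAggregationResults`, and the partial results are folded together before being turned into final buckets. A rough sketch of that flow; the `DistributedAggregationCollector::from_aggs` constructor and the top-level `merge_fruits` signature are assumptions based on this diff rather than verified API:

    use tantivy::aggregation::agg_req::Aggregations;
    use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults;
    use tantivy::aggregation::{AggregationLimits, DistributedAggregationCollector};
    use tantivy::query::AllQuery;
    use tantivy::{Index, TantivyError};

    // Phase 1 (per node): collect mergeable, intermediate results.
    fn collect_partial(index: &Index, agg: Aggregations) -> Result<IntermediateAggregationResults, TantivyError> {
        let collector = DistributedAggregationCollector::from_aggs(agg, AggregationLimits::default());
        let searcher = index.reader()?.searcher();
        searcher.search(&AllQuery, &collector)
    }

    // Phase 2 (coordinator): fold the partial results together; converting the
    // merged result into final buckets happens afterwards and is not shown here.
    fn merge_partials(
        mut merged: IntermediateAggregationResults,
        others: Vec<IntermediateAggregationResults>,
    ) -> Result<IntermediateAggregationResults, TantivyError> {
        for other in others {
            merged.merge_fruits(other)?;
        }
        Ok(merged)
    }
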
View File

@@ -2,8 +2,7 @@ use std::fmt::Debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::{IntermediateStats, SegmentStatsCollector};
use crate::aggregation::*;
/// A single-value metric aggregation that computes the average of numeric values that are /// A single-value metric aggregation that computes the average of numeric values that are
/// extracted from the aggregated documents. /// extracted from the aggregated documents.
@@ -25,7 +24,7 @@ pub struct AverageAggregation {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde(default, deserialize_with = "deserialize_option_f64")] #[serde(default)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }
@@ -66,71 +65,3 @@ impl IntermediateAverage {
self.stats.finalize().avg self.stats.finalize().avg
} }
} }
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn deserialization_with_missing_test1() {
let json = r#"{
"field": "score",
"missing": "10.0"
}"#;
let avg: AverageAggregation = serde_json::from_str(json).unwrap();
assert_eq!(avg.field, "score");
assert_eq!(avg.missing, Some(10.0));
// no dot
let json = r#"{
"field": "score",
"missing": "10"
}"#;
let avg: AverageAggregation = serde_json::from_str(json).unwrap();
assert_eq!(avg.field, "score");
assert_eq!(avg.missing, Some(10.0));
// from value
let avg: AverageAggregation = serde_json::from_value(json!({
"field": "score_f64",
"missing": 10u64,
}))
.unwrap();
assert_eq!(avg.missing, Some(10.0));
// from value
let avg: AverageAggregation = serde_json::from_value(json!({
"field": "score_f64",
"missing": 10u32,
}))
.unwrap();
assert_eq!(avg.missing, Some(10.0));
let avg: AverageAggregation = serde_json::from_value(json!({
"field": "score_f64",
"missing": 10i8,
}))
.unwrap();
assert_eq!(avg.missing, Some(10.0));
}
#[test]
fn deserialization_with_missing_test_fail() {
let json = r#"{
"field": "score",
"missing": "a"
}"#;
let avg: Result<AverageAggregation, _> = serde_json::from_str(json);
assert!(avg.is_err());
assert!(avg
.unwrap_err()
.to_string()
.contains("Failed to parse f64 from string: \"a\""));
// Disallow NaN
let json = r#"{
"field": "score",
"missing": "NaN"
}"#;
let avg: Result<AverageAggregation, _> = serde_json::from_str(json);
assert!(avg.is_err());
assert!(avg.unwrap_err().to_string().contains("NaN"));
}
}

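The `missing` option documented above has the same shape on the other metric aggregations touched by this diff (count, min, max, stats, percentiles). A minimal sketch of deserializing it, using the `my_numbers` field name from the doc comment; note that the string form "10.0" is only accepted on the side of the diff that has the `deserialize_option_f64` helper, so a plain number is used here:

    use serde_json::json;
    use tantivy::aggregation::metric::AverageAggregation;

    fn build_avg_with_missing() -> serde_json::Result<AverageAggregation> {
        // Documents without a value in `my_numbers` contribute 10.0 to the average.
        serde_json::from_value(json!({
            "field": "my_numbers",
            "missing": 10.0
        }))
    }
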
View File

@@ -2,8 +2,7 @@ use std::fmt::Debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::{IntermediateStats, SegmentStatsCollector};
use crate::aggregation::*;
/// A single-value metric aggregation that counts the number of values that are /// A single-value metric aggregation that counts the number of values that are
/// extracted from the aggregated documents. /// extracted from the aggregated documents.
@@ -25,7 +24,7 @@ pub struct CountAggregation {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde(default, deserialize_with = "deserialize_option_f64")] #[serde(default)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }

View File

@@ -2,8 +2,7 @@ use std::fmt::Debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::{IntermediateStats, SegmentStatsCollector};
use crate::aggregation::*;
/// A single-value metric aggregation that computes the maximum of numeric values that are /// A single-value metric aggregation that computes the maximum of numeric values that are
/// extracted from the aggregated documents. /// extracted from the aggregated documents.
@@ -25,7 +24,7 @@ pub struct MaxAggregation {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde(default, deserialize_with = "deserialize_option_f64")] #[serde(default)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }
@@ -72,7 +71,7 @@ mod tests {
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query; use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST}; use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn test_max_agg_with_missing() -> crate::Result<()> { fn test_max_agg_with_missing() -> crate::Result<()> {
@@ -80,7 +79,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();

View File

@@ -2,8 +2,7 @@ use std::fmt::Debug;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::{IntermediateStats, SegmentStatsCollector};
use crate::aggregation::*;
/// A single-value metric aggregation that computes the minimum of numeric values that are /// A single-value metric aggregation that computes the minimum of numeric values that are
/// extracted from the aggregated documents. /// extracted from the aggregated documents.
@@ -25,7 +24,7 @@ pub struct MinAggregation {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde(default, deserialize_with = "deserialize_option_f64")] #[serde(default)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }

View File

@@ -23,10 +23,6 @@ mod min;
mod percentiles; mod percentiles;
mod stats; mod stats;
mod sum; mod sum;
mod top_hits;
use std::collections::HashMap;
pub use average::*; pub use average::*;
pub use count::*; pub use count::*;
pub use max::*; pub use max::*;
@@ -36,9 +32,6 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
pub use stats::*; pub use stats::*;
pub use sum::*; pub use sum::*;
pub use top_hits::*;
use crate::schema::OwnedValue;
/// Single-metric aggregations use this common result structure. /// Single-metric aggregations use this common result structure.
/// ///
@@ -88,28 +81,6 @@ pub struct PercentilesMetricResult {
pub values: PercentileValues, pub values: PercentileValues,
} }
/// The top_hits metric results entry
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TopHitsVecEntry {
/// The sort values of the document, depending on the sort criteria in the request.
pub sort: Vec<Option<u64>>,
/// Search results, for queries that include field retrieval requests
/// (`docvalue_fields`).
#[serde(rename = "docvalue_fields")]
#[serde(skip_serializing_if = "HashMap::is_empty")]
pub doc_value_fields: HashMap<String, OwnedValue>,
}
/// The top_hits metric aggregation returns a list of top hits by sort criteria.
///
/// The main reason for wrapping it in `hits` is to match elasticsearch output structure.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct TopHitsMetricResult {
/// The result of the top_hits metric.
pub hits: Vec<TopHitsVecEntry>,
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::aggregation::agg_req::Aggregations; use crate::aggregation::agg_req::Aggregations;
@@ -117,7 +88,7 @@ mod tests {
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::query::AllQuery; use crate::query::AllQuery;
use crate::schema::{NumericOptions, Schema}; use crate::schema::{NumericOptions, Schema};
use crate::{Index, IndexWriter}; use crate::Index;
#[test] #[test]
fn test_metric_aggregations() { fn test_metric_aggregations() {
@@ -125,7 +96,7 @@ mod tests {
let field_options = NumericOptions::default().set_fast(); let field_options = NumericOptions::default().set_fast();
let field = schema_builder.add_f64_field("price", field_options); let field = schema_builder.add_f64_field("price", field_options);
let index = Index::create_in_ram(schema_builder.build()); let index = Index::create_in_ram(schema_builder.build());
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
for i in 0..3 { for i in 0..3 {
index_writer index_writer

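For orientation, the JSON shape implied by `TopHitsVecEntry` and `TopHitsMetricResult` above looks roughly as follows; the sort value and the retrieved field are purely illustrative:

    use serde_json::json;

    fn example_top_hits_result() -> serde_json::Value {
        // One entry per hit: `sort` carries the requested sort values (or null),
        // and `docvalue_fields` only appears when field retrieval was requested.
        json!({
            "hits": [
                {
                    "sort": [1698249600000000u64],
                    "docvalue_fields": { "title": "some stored value" }
                }
            ]
        })
    }
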
View File

@@ -1,5 +1,6 @@
use std::fmt::Debug; use std::fmt::Debug;
use columnar::ColumnType;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;
@@ -10,7 +11,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
}; };
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64, AggregationError};
use crate::{DocId, TantivyError}; use crate::{DocId, TantivyError};
/// # Percentiles /// # Percentiles
@@ -83,11 +84,7 @@ pub struct PercentilesAggregationReq {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde( #[serde(skip_serializing_if = "Option::is_none", default)]
skip_serializing_if = "Option::is_none",
default,
deserialize_with = "deserialize_option_f64"
)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }
fn default_percentiles() -> &'static [f64] { fn default_percentiles() -> &'static [f64] {
@@ -136,6 +133,7 @@ pub(crate) struct SegmentPercentilesCollector {
field_type: ColumnType, field_type: ColumnType,
pub(crate) percentiles: PercentilesCollector, pub(crate) percentiles: PercentilesCollector,
pub(crate) accessor_idx: usize, pub(crate) accessor_idx: usize,
val_cache: Vec<u64>,
missing: Option<u64>, missing: Option<u64>,
} }
@@ -245,6 +243,7 @@ impl SegmentPercentilesCollector {
field_type, field_type,
percentiles: PercentilesCollector::new(), percentiles: PercentilesCollector::new(),
accessor_idx, accessor_idx,
val_cache: Default::default(),
missing, missing,
}) })
} }

View File

@@ -1,3 +1,4 @@
use columnar::ColumnType;
use serde::{Deserialize, Serialize}; use serde::{Deserialize, Serialize};
use super::*; use super::*;
@@ -8,7 +9,7 @@ use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult, IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
}; };
use crate::aggregation::segment_agg_result::SegmentAggregationCollector; use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*; use crate::aggregation::{f64_from_fastfield_u64, f64_to_fastfield_u64};
use crate::{DocId, TantivyError}; use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that /// A multi-value metric aggregation that computes a collection of statistics on numeric values that
@@ -32,7 +33,7 @@ pub struct StatsAggregation {
/// By default they will be ignored but it is also possible to treat them as if they had a /// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format: /// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" } /// { "field": "my_numbers", "missing": "10.0" }
#[serde(default, deserialize_with = "deserialize_option_f64")] #[serde(default)]
pub missing: Option<f64>, pub missing: Option<f64>,
} }
@@ -299,7 +300,7 @@ mod tests {
use crate::aggregation::AggregationCollector; use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery}; use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST}; use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term}; use crate::{Index, Term};
#[test] #[test]
fn test_aggregation_stats_empty_index() -> crate::Result<()> { fn test_aggregation_stats_empty_index() -> crate::Result<()> {
@@ -493,7 +494,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -540,7 +541,7 @@ mod tests {
let json = schema_builder.add_json_field("json", FAST); let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build(); let schema = schema_builder.build();
let index = Index::create_in_ram(schema); let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap(); let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with empty json // => Segment with empty json
index_writer.add_document(doc!()).unwrap(); index_writer.add_document(doc!()).unwrap();
index_writer.commit().unwrap(); index_writer.commit().unwrap();
@@ -579,30 +580,6 @@ mod tests {
}) })
); );
// From string
let agg_req: Aggregations = serde_json::from_value(json!({
"my_stats": {
"stats": {
"field": "json.partially_empty",
"missing": "0.0"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
assert_eq!(
res["my_stats"],
json!({
"avg": 2.5,
"count": 4,
"max": 10.0,
"min": 0.0,
"sum": 10.0
})
);
Ok(()) Ok(())
} }

Some files were not shown because too many files have changed in this diff