Compare commits


1 Commit

Author: Pascal Seitz
SHA1: 806a1e1b1e
Message: clarify tokenizer docs
Date: 2023-04-03 22:59:38 +08:00
290 changed files with 8523 additions and 22118 deletions

View File

@@ -3,23 +3,20 @@ name: Coverage
on:
push:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
pull_request:
branches: [main]
jobs:
coverage:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install Rust
run: rustup toolchain install nightly-2023-09-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2023-09-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -8,18 +8,13 @@ env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
test:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1
with:

View File

@@ -9,18 +9,13 @@ on:
env:
CARGO_TERM_COLOR: always
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
check:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install nightly
uses: actions-rs/toolchain@v1
@@ -39,13 +34,6 @@ jobs:
- name: Check Formatting
run: cargo +nightly fmt --all -- --check
- name: Check Stable Compilation
run: cargo build --all-features
- name: Check Bench Compilation
run: cargo +nightly bench --no-run --profile=dev --all-features
- uses: actions-rs/clippy-check@v1
with:
@@ -60,14 +48,14 @@ jobs:
strategy:
matrix:
features: [
{ label: "all", flags: "mmap,stopwords,lz4-compression,zstd-compression,failpoints" },
{ label: "all", flags: "mmap,stopwords,brotli-compression,lz4-compression,snappy-compression,zstd-compression,failpoints" },
{ label: "quickwit", flags: "mmap,quickwit,failpoints" }
]
name: test-${{ matrix.features.label}}
steps:
- uses: actions/checkout@v4
- uses: actions/checkout@v3
- name: Install stable
uses: actions-rs/toolchain@v1

View File

@@ -254,7 +254,7 @@ The token positions of all of the terms are then stored in a separate file with
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
we advance the position reader by the number of term frequencies of the current document.
## [fieldnorm/](src/fieldnorm): Here is my doc, how many tokens in this field?
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires knowing the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.
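The one-byte fieldnorm described above is a small lossy codec: token counts up to 40 round-trip exactly, while larger counts fall into coarser buckets. A minimal sketch of such an encoding, with bucket boundaries chosen purely for illustration (tantivy's actual lookup table is not reproduced here):

```rust
// Illustrative one-byte fieldnorm codec: token counts up to 40 are stored
// exactly, larger counts land in geometrically growing buckets. The ~25%
// bucket growth is an assumption for illustration, not tantivy's table.
fn encode_fieldnorm(num_tokens: u32) -> u8 {
    if num_tokens <= 40 {
        return num_tokens as u8;
    }
    let mut code = 40u8;
    let mut upper = 40u64;
    while (num_tokens as u64) > upper && code < u8::MAX {
        upper += (upper / 4).max(1); // bucket width grows by ~25%
        code += 1;
    }
    code
}

fn decode_fieldnorm(code: u8) -> u32 {
    if code <= 40 {
        return code as u32;
    }
    let mut upper = 40u64;
    for _ in 40..code {
        upper += (upper / 4).max(1);
    }
    upper as u32
}

fn main() {
    assert_eq!(decode_fieldnorm(encode_fieldnorm(17)), 17); // exact below the cutoff
    let approx = decode_fieldnorm(encode_fieldnorm(1_000)); // lossy above it
    println!("1000 tokens stored as ~{approx}");
}
```

Above the cutoff, decoding returns the bucket's upper bound, which is precise enough for BM25's length normalization.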

View File

@@ -1,125 +1,3 @@
Tantivy 0.21.1
================================
#### Bugfixes
- Range queries on fast fields with less values on that field than documents had an invalid end condition, leading to missing results. [#2226](https://github.com/quickwit-oss/tantivy/issues/2226)(@appaquet @PSeitz)
- Increase the minimum memory budget from 3MB to 15MB to avoid single doc segments (API fix). [#2176](https://github.com/quickwit-oss/tantivy/issues/2176)(@PSeitz)
Tantivy 0.21
================================
#### Bugfixes
- Fix track fast field memory consumption, which led to higher memory consumption than the budget allowed during indexing [#2148](https://github.com/quickwit-oss/tantivy/issues/2148)[#2147](https://github.com/quickwit-oss/tantivy/issues/2147)(@PSeitz)
- Fix a regression from 0.20 where sort index by date wasn't working anymore [#2124](https://github.com/quickwit-oss/tantivy/issues/2124)(@PSeitz)
- Fix getting the root facet on the `FacetCollector`. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086)(@adamreichold)
- Align numerical type priority order of columnar and query. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088)(@fmassot)
#### Breaking Changes
- Remove support for Brotli and Snappy compression [#2123](https://github.com/quickwit-oss/tantivy/issues/2123)(@adamreichold)
#### Features/Improvements
- Implement lenient query parser [#2129](https://github.com/quickwit-oss/tantivy/pull/2129)(@trinity-1686a)
- order_by_u64_field and order_by_fast_field allow sorting in ascending and descending order [#2111](https://github.com/quickwit-oss/tantivy/issues/2111)(@naveenann)
- Allow dynamic filters in text analyzer builder [#2110](https://github.com/quickwit-oss/tantivy/issues/2110)(@fulmicoton @fmassot)
- **Aggregation**
- Add missing parameter for term aggregation [#2149](https://github.com/quickwit-oss/tantivy/issues/2149)[#2103](https://github.com/quickwit-oss/tantivy/issues/2103)(@PSeitz)
- Add missing parameter for percentiles [#2157](https://github.com/quickwit-oss/tantivy/issues/2157)(@PSeitz)
- Add missing parameter for stats,min,max,count,sum,avg [#2151](https://github.com/quickwit-oss/tantivy/issues/2151)(@PSeitz)
- Improve aggregation deserialization error message [#2150](https://github.com/quickwit-oss/tantivy/issues/2150)(@PSeitz)
- Add validation for type Bytes to term_agg [#2077](https://github.com/quickwit-oss/tantivy/issues/2077)(@PSeitz)
- Alternative mixed field collection [#2135](https://github.com/quickwit-oss/tantivy/issues/2135)(@PSeitz)
- Add missing query_terms impl for TermSetQuery. [#2120](https://github.com/quickwit-oss/tantivy/issues/2120)(@adamreichold)
- Minor improvements to OwnedBytes [#2134](https://github.com/quickwit-oss/tantivy/issues/2134)(@adamreichold)
- Remove allocations in split compound words [#2080](https://github.com/quickwit-oss/tantivy/issues/2080)(@PSeitz)
- Ngram tokenizer now returns an error with invalid arguments [#2102](https://github.com/quickwit-oss/tantivy/issues/2102)(@fmassot)
- Make TextAnalyzerBuilder public [#2097](https://github.com/quickwit-oss/tantivy/issues/2097)(@adamreichold)
- Return an error when tokenizer is not found while indexing [#2093](https://github.com/quickwit-oss/tantivy/issues/2093)(@naveenann)
- Delayed column opening during merge [#2132](https://github.com/quickwit-oss/tantivy/issues/2132)(@PSeitz)
Tantivy 0.20.2
================================
- Align numerical type priority order on the search side. [#2088](https://github.com/quickwit-oss/tantivy/issues/2088) (@fmassot)
- Fix is_child_of function not considering the root facet. [#2086](https://github.com/quickwit-oss/tantivy/issues/2086) (@adamreichhold)
Tantivy 0.20.1
================================
- Fix building on windows with mmap [#2070](https://github.com/quickwit-oss/tantivy/issues/2070) (@ChillFish8)
Tantivy 0.20
================================
#### Bugfixes
- Fix phrase queries with slop (slop supports now transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
- Aggregation
- Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
- Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)
#### Features/Improvements
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
- Unified access for fast fields over different cardinalities.
- Unified storage for typed and untyped fields.
- Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
- Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
- Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
- **Aggregation**
- Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
- Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
- Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
- Reduce agg memory consumption: Replace generic aggregation collector (which has a high memory requirement per instance) in aggregation tree with optimized versions behind a trait.
- Split term collection count and sub_agg (Faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
- Schemaless aggregations: In combination with stacker tantivy supports now schemaless aggregations via the JSON type.
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- tokenizer-api: reduce Tokenizer allocation overhead [#2062](https://github.com/quickwit-oss/tantivy/issues/2062) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
- Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
- Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
- Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
- PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
- Docs
- Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
- Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
- Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
- Fix BooleanQuery document [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
- Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)
Tantivy 0.19
================================
#### Bugfixes

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.21.0"
version = "0.19.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,7 +12,6 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.5"
@@ -21,49 +20,48 @@ byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "1.0"
aho-corasick = "0.7"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
tempfile = { version = "3.3.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
num_cpus = "1.13.1"
fs4 = { version = "0.7.0", optional = true }
fs4 = { version = "0.6.3", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
bitpacking = { git = "https://github.com/quickwit-oss/bitpacking", rev = "f730b75", default-features = false, features = ["bitpacker4x"] }
bitpacking = { version = "0.8.4", default-features = false, features = ["bitpacker4x"] }
census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
fail = "0.5.0"
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
lru = "0.10.0"
fastdivide = "0.4.0"
itertools = "0.11.0"
itertools = "0.10.3"
measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
columnar = { version= "0.2", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.2", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.2", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.21.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.5", path="./bitpacker" }
common = { version= "0.6", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version= "0.2", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.19.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -74,16 +72,12 @@ maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
criterion = "0.4"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
[target.'cfg(not(windows))'.dev-dependencies]
criterion = "0.5"
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
[dev-dependencies.fail]
version = "0.5.0"
@@ -94,11 +88,6 @@ opt-level = 3
debug = false
debug-assertions = false
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false
[profile.test]
debug-assertions = true
overflow-checks = true
@@ -108,18 +97,15 @@ default = ["mmap", "stopwords", "lz4-compression"]
mmap = ["fs4", "tempfile", "memmap2"]
stopwords = []
brotli-compression = ["brotli"]
lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
failpoints = ["fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
# Uses 64bit ahash.
compare_hash_only = ["stacker/compare_hash_only"]
quickwit = ["sstable"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
@@ -134,7 +120,7 @@ members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sst
[[test]]
name = "failpoints"
path = "tests/failpoints/mod.rs"
required-features = ["failpoints"]
required-features = ["fail/failpoints"]
[[bench]]
name = "analyzer"
@@ -143,3 +129,4 @@ harness = false
[[bench]]
name = "index-bench"
harness = false
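The `compare_hash_only` feature documented in the features section above trades exact term comparison for a 64-bit hash comparison during indexing. A rough sketch of that trade-off, using std's hasher and a plain `HashMap` as stand-ins rather than tantivy-stacker's actual data structure:

```rust
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};

// Illustrative only: a term-id table keyed by a 64-bit hash instead of the
// term bytes themselves. Lookups avoid storing and comparing full strings,
// but two terms hashing to the same u64 would silently share one id.
struct HashOnlyTermTable {
    ids: HashMap<u64, u32>,
}

impl HashOnlyTermTable {
    fn new() -> Self {
        Self { ids: HashMap::new() }
    }

    fn term_id(&mut self, term: &str) -> u32 {
        let mut hasher = DefaultHasher::new();
        term.hash(&mut hasher);
        let key = hasher.finish();
        let next_id = self.ids.len() as u32;
        *self.ids.entry(key).or_insert(next_id)
    }
}

fn main() {
    let mut table = HashOnlyTermTable::new();
    let a = table.term_id("quick");
    let b = table.term_id("quick");
    let c = table.term_id("brown");
    assert_eq!(a, b); // same term, same id
    assert_ne!(a, c); // different term, different id (barring a collision)
}
```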

View File

@@ -1,5 +1,5 @@
test:
@echo "Run test only... No examples."
echo "Run test only... No examples."
cargo test --tests --lib
fmt:

View File

@@ -26,8 +26,6 @@ Your mileage WILL vary depending on the nature of queries and their load.
<img src="doc/assets/images/searchbenchmark.png">
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
# Features
- Full-text search
@@ -44,7 +42,7 @@ Details about the benchmark can be found at this [repository](https://github.com
- Single valued and multivalued u64, i64, and f64 fast fields (equivalent of doc values in Lucene)
- `&[u8]` fast fields
- Text, i64, u64, f64, dates, ip, bool, and hierarchical facet fields
- Compressed document store (LZ4, Zstd, None)
- Compressed document store (LZ4, Zstd, None, Brotli, Snap)
- Range queries
- Faceted search
- Configurable indexing (optional term frequency and position indexing)
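The feature list above maps onto a fairly small API surface. A minimal indexing sketch, reusing the same calls that appear in the index benchmark further down in this diff (`SchemaBuilder`, `Index::create_in_ram`, `writer_with_num_threads`, `add_document`, `commit`); exact signatures differ slightly between the two tantivy versions being compared here:

```rust
use tantivy::schema::{SchemaBuilder, STORED, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Schema with one indexed-and-stored text field and one indexed-only field.
    let mut schema_builder = SchemaBuilder::new();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let body = schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // RAM index with a single indexing thread and a ~100 MB budget,
    // mirroring the benchmark setup shown later in this diff.
    let index = Index::create_in_ram(schema);
    let mut index_writer = index.writer_with_num_threads(1, 100_000_000)?;
    index_writer.add_document(doc!(
        title => "Of Mice and Men",
        body => "A few miles south of Soledad, the Salinas River drops in ..."
    ))?;
    index_writer.commit()?;
    Ok(())
}
```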

View File

@@ -1,21 +0,0 @@
# Release a new Tantivy Version
## Steps
1. Identify new packages in workspace since last release
2. Identify changed packages in workspace since last release
3. Bump version in `Cargo.toml` and their dependents for all changed packages
4. Update version of root `Cargo.toml`
5. Publish version starting with leaf nodes
6. Set git tag with new version
In conjunction with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
Set new packages to version 0.0.0
Replace prev-tag-name
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```
no-tag or it will create tags for all the subpackages

appveyor.yml (new file, 23 lines)
View File

@@ -0,0 +1,23 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust
os: Visual Studio 2015
environment:
matrix:
- channel: stable
target: x86_64-pc-windows-msvc
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
- rustc -vV
- cargo -vV
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -1,13 +1,11 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::{
LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
use tantivy::tokenizer::TokenizerManager;
const ALICE_TXT: &str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let mut tokenizer = tokenizer_manager.get("default").unwrap();
let tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
@@ -18,26 +16,7 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.dynamic()
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser)
.build();
c.bench_function("dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
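The builder chain removed above (`TextAnalyzer::builder(...).filter_dynamic(...)`) is the newer way to assemble an analyzer. A hedged sketch of the more common static builder plus registration under a name, so a schema can reference it; the filter set is arbitrary, and `register`/`get` are assumed to behave as the standard `TokenizerManager` API:

```rust
use tantivy::tokenizer::{
    LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};

fn main() {
    // Same building blocks as in the benchmark above: a simple tokenizer,
    // a token-length cap, and lowercasing.
    let analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
        .filter(RemoveLongFilter::limit(40))
        .filter(LowerCaser)
        .build();

    // Register it under a name so a schema's text options can refer to it.
    let manager = TokenizerManager::default();
    manager.register("custom_en", analyzer);
    assert!(manager.get("custom_en").is_some());
}
```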

File diff suppressed because one or more lines are too long

View File

@@ -1,15 +1,10 @@
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use criterion::{criterion_group, criterion_main, Criterion};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{TantivyDocument, FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json");
const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json");
fn get_lines(input: &str) -> Vec<&str> {
input.trim().split('\n').collect()
}
const NUM_REPEATS: usize = 2;
pub fn hdfs_index_benchmark(c: &mut Criterion) {
let schema = {
@@ -33,152 +28,85 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
};
let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let doc = TantivyDocument::parse_json(&schema, doc_json).unwrap();
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
});
}
pub fn gh_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| {
let lines = get_lines(GH_LOGS);
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-gh-with-commit", |b| {
let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
pub fn wiki_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-wiki");
group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer: IndexWriter = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-wiki-with-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer: IndexWriter =
index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
}
index_writer.commit().unwrap();
})
@@ -187,17 +115,7 @@ pub fn wiki_index_benchmark(c: &mut Criterion) {
criterion_group! {
name = benches;
config = Criterion::default();
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = hdfs_index_benchmark
}
criterion_group! {
name = gh_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark
}
criterion_group! {
name = wiki_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark
}
criterion_main!(benches, gh_benches, wiki_benches);
criterion_main!(benches);

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.5.0"
version = "0.3.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -367,7 +367,7 @@ mod test {
let mut output: Vec<u32> = Vec::new();
for len in [0, 1, 2, 32, 33, 34, 64] {
for start_idx in 0u32..32u32 {
output.resize(len, 0);
output.resize(len as usize, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len {
let expected = (start_idx + i as u32) & mask;

View File

@@ -64,8 +64,10 @@ fn mem_usage<T>(items: &Vec<T>) -> usize {
impl BlockedBitpacker {
pub fn new() -> Self {
let mut compressed_blocks = vec![];
compressed_blocks.resize(8, 0);
Self {
compressed_blocks: vec![0; 8],
compressed_blocks,
buffer: vec![],
offset_and_bits: vec![],
}

View File

@@ -1,5 +1,5 @@
//! SIMD filtering of a vector as described in the following blog post.
//! <https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512>
//! https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512
use std::arch::x86_64::{
__m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater,
_mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1,

View File

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;
#[cfg(target_arch = "x86_64")]
#[cfg(any(target_arch = "x86_64"))]
mod avx2;
mod scalar;

View File

@@ -1,90 +0,0 @@
# configuration file for git-cliff{ pattern = "foo", replace = "bar"}
# see https://github.com/orhun/git-cliff#configuration-file
[changelog]
# changelog header
header = """
"""
# template for the changelog body
# https://tera.netlify.app/docs/#introduction
body = """
{% if version %}\
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
==================
{% else %}\
## [unreleased]
{% endif %}\
{% for commit in commits %}
- {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
{% endfor %}
"""
# remove the leading and trailing whitespace from the template
trim = true
# changelog footer
footer = """
"""
postprocessors = [
{ pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
{ pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
{ pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
{ pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
{ pattern = 'Naveen Aiathurai', replace = "naveenann"}, # replace with github user
{ pattern = '', replace = ""}, # replace with github user
]
[git]
# parse the commits based on https://www.conventionalcommits.org
# This is required or commit.message contains the whole commit message and not just the title
conventional_commits = true
# filter out the commits that are not conventional
filter_unconventional = false
# process each line of a commit as an individual commit
split_commits = false
# regex for preprocessing the commit messages
commit_preprocessors = [
{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
]
#link_parsers = [
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
#]
# regex for parsing and grouping commits
commit_parsers = [
{ message = "^feat", group = "Features"},
{ message = "^fix", group = "Bug Fixes"},
{ message = "^doc", group = "Documentation"},
{ message = "^perf", group = "Performance"},
{ message = "^refactor", group = "Refactor"},
{ message = "^style", group = "Styling"},
{ message = "^test", group = "Testing"},
{ message = "^chore\\(release\\): prepare for", skip = true},
{ message = "(?i)clippy", skip = true},
{ message = "(?i)dependabot", skip = true},
{ message = "(?i)fmt", skip = true},
{ message = "(?i)bump", skip = true},
{ message = "(?i)readme", skip = true},
{ message = "(?i)comment", skip = true},
{ message = "(?i)spelling", skip = true},
{ message = "^chore", group = "Miscellaneous Tasks"},
{ body = ".*security", group = "Security"},
{ message = ".*", group = "Other", default_scope = "other"},
]
# protect breaking changes from being skipped due to matching a skipping commit_parser
protect_breaking_commits = false
# filter out the commits that are not matched by commit parsers
filter_commits = false
# glob pattern for matching git tags
tag_pattern = "v[0-9]*"
# regex for skipping tags
skip_tags = "v0.1.0-beta.1"
# regex for ignoring tags
ignore_tags = ""
# sort the tags topologically
topo_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "newest"
# limit the number of commits included in the changelog.
# limit_commits = 42

View File

@@ -1,28 +1,28 @@
[package]
name = "tantivy-columnar"
version = "0.2.0"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.11.0"
itertools = "0.10.5"
log = "0.4.17"
fnv = "1.0.7"
fastdivide = "0.4.0"
rand = { version = "0.8.5", optional = true }
measure_time = { version = "0.8.2", optional = true }
prettytable-rs = { version = "0.10.0", optional = true }
stacker = { version= "0.2", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.2", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.6", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.5", path = "../bitpacker/" }
stacker = { path = "../stacker", package="tantivy-stacker"}
sstable = { path = "../sstable", package = "tantivy-sstable" }
common = { path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
serde = "1.0.152"
[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
rand = "0.8.5"
[features]
unstable = []

View File

@@ -1,12 +1,9 @@
use std::cmp::Ordering;
use crate::{Column, DocId, RowId};
#[derive(Debug, Default, Clone)]
pub struct ColumnBlockAccessor<T> {
val_cache: Vec<T>,
docid_cache: Vec<DocId>,
missing_docids_cache: Vec<DocId>,
row_id_cache: Vec<RowId>,
}
@@ -23,20 +20,6 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
.values
.get_vals(&self.row_id_cache, &mut self.val_cache);
}
#[inline]
pub fn fetch_block_with_missing(&mut self, docs: &[u32], accessor: &Column<T>, missing: T) {
self.fetch_block(docs, accessor);
// We can compare docid_cache with docs to find missing docs
if docs.len() != self.docid_cache.len() || accessor.index.is_multivalue() {
self.missing_docids_cache.clear();
find_missing_docs(docs, &self.docid_cache, |doc| {
self.missing_docids_cache.push(doc);
self.val_cache.push(missing);
});
self.docid_cache
.extend_from_slice(&self.missing_docids_cache);
}
}
#[inline]
pub fn iter_vals(&self) -> impl Iterator<Item = T> + '_ {
@@ -51,82 +34,3 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
.zip(self.val_cache.iter().cloned())
}
}
/// Given two sorted lists of docids `docs` and `hits`, hits is a subset of `docs`.
/// Return all docs that are not in `hits`.
fn find_missing_docs<F>(docs: &[u32], hits: &[u32], mut callback: F)
where F: FnMut(u32) {
let mut docs_iter = docs.iter();
let mut hits_iter = hits.iter();
let mut doc = docs_iter.next();
let mut hit = hits_iter.next();
while let (Some(&current_doc), Some(&current_hit)) = (doc, hit) {
match current_doc.cmp(&current_hit) {
Ordering::Less => {
callback(current_doc);
doc = docs_iter.next();
}
Ordering::Equal => {
doc = docs_iter.next();
hit = hits_iter.next();
}
Ordering::Greater => {
hit = hits_iter.next();
}
}
}
while let Some(&current_doc) = doc {
callback(current_doc);
doc = docs_iter.next();
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_find_missing_docs() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5, 6, 7, 8, 9, 10];
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 3, 5, 7, 9]);
}
#[test]
fn test_find_missing_docs_empty() {
let docs: Vec<u32> = Vec::new();
let hits: Vec<u32> = vec![2, 4, 6, 8, 10];
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![]);
}
#[test]
fn test_find_missing_docs_all_missing() {
let docs: Vec<u32> = vec![1, 2, 3, 4, 5];
let hits: Vec<u32> = Vec::new();
let mut missing_docs: Vec<u32> = Vec::new();
find_missing_docs(&docs, &hits, |missing_doc| {
missing_docs.push(missing_doc);
});
assert_eq!(missing_docs, vec![1, 2, 3, 4, 5]);
}
}

View File

@@ -30,13 +30,6 @@ impl fmt::Debug for BytesColumn {
}
impl BytesColumn {
pub fn empty(num_docs: u32) -> BytesColumn {
BytesColumn {
dictionary: Arc::new(Dictionary::empty()),
term_ord_column: Column::build_empty_column(num_docs),
}
}
/// Fills the given `output` buffer with the term associated to the ordinal `ord`.
///
/// Returns `false` if the term does not exist (e.g. `term_ord` is greater or equal to the
@@ -84,7 +77,7 @@ impl From<StrColumn> for BytesColumn {
}
impl StrColumn {
pub fn wrap(bytes_column: BytesColumn) -> StrColumn {
pub(crate) fn wrap(bytes_column: BytesColumn) -> StrColumn {
StrColumn(bytes_column)
}

View File

@@ -130,7 +130,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.select_batch_in_place(selected_docid_range.start, doc_ids);
}
/// Fills the output vector with the (possibly multiple values that are associated_with
/// Fils the output vector with the (possibly multiple values that are associated_with
/// `row_id`.
///
/// This method clears the `output` vector.

View File

@@ -1,73 +1,19 @@
mod shuffled;
mod stacked;
use common::ReadOnlyBitSet;
use shuffled::merge_column_index_shuffled;
use stacked::merge_column_index_stacked;
use crate::column_index::SerializableColumnIndex;
use crate::{Cardinality, ColumnIndex, MergeRowOrder};
fn detect_cardinality_single_column_index(
column_index: &ColumnIndex,
alive_bitset_opt: &Option<ReadOnlyBitSet>,
) -> Cardinality {
let Some(alive_bitset) = alive_bitset_opt else {
return column_index.get_cardinality();
};
let cardinality_before_deletes = column_index.get_cardinality();
if cardinality_before_deletes == Cardinality::Full {
// The columnar cardinality can only become more restrictive in the presence of deletes
// (where cardinality sorted from the more restrictive to the least restrictive are Full,
// Optional, Multivalued)
//
// If we are already "Full", we are guaranteed to stay "Full" after deletes.
return Cardinality::Full;
}
let mut cardinality_so_far = Cardinality::Full;
for doc_id in alive_bitset.iter() {
let num_values = column_index.value_row_ids(doc_id).len();
let row_cardinality = match num_values {
0 => Cardinality::Optional,
1 => Cardinality::Full,
_ => Cardinality::Multivalued,
};
cardinality_so_far = cardinality_so_far.max(row_cardinality);
if cardinality_so_far >= cardinality_before_deletes {
// There won't be any improvement in the cardinality.
// We can early exit.
return cardinality_before_deletes;
}
}
cardinality_so_far
}
fn detect_cardinality(
column_indexes: &[ColumnIndex],
merge_row_order: &MergeRowOrder,
) -> Cardinality {
match merge_row_order {
MergeRowOrder::Stack(_) => column_indexes
.iter()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full),
MergeRowOrder::Shuffled(shuffle_merge_order) => {
let mut merged_cardinality = Cardinality::Full;
for (column_index, alive_bitset_opt) in column_indexes
.iter()
.zip(shuffle_merge_order.alive_bitsets.iter())
{
let cardinality: Cardinality =
detect_cardinality_single_column_index(column_index, alive_bitset_opt);
if cardinality == Cardinality::Multivalued {
return cardinality;
}
merged_cardinality = merged_cardinality.max(cardinality);
}
merged_cardinality
}
}
// For simplification, we never have cardinality go down due to deletes.
fn detect_cardinality(columns: &[ColumnIndex]) -> Cardinality {
columns
.iter()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full)
}
pub fn merge_column_index<'a>(
@@ -76,7 +22,7 @@ pub fn merge_column_index<'a>(
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
let cardinality_after_merge = detect_cardinality(columns);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
@@ -98,54 +44,34 @@ mod tests {
use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
};
use crate::{Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder};
#[test]
fn test_detect_cardinality() {
assert_eq!(
detect_cardinality(&[], &StackMergeOrder::stack_for_test(&[]).into()),
Cardinality::Full
);
assert_eq!(detect_cardinality(&[]), Cardinality::Full);
let optional_index: ColumnIndex = OptionalIndex::for_test(1, &[]).into();
let multivalued_index: ColumnIndex = MultiValueIndex::for_test(&[0, 1]).into();
assert_eq!(
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
detect_cardinality(&[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }]),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Full],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[optional_index.clone(), ColumnIndex::Full]),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(
&[
multivalued_index.clone(),
ColumnIndex::Empty { num_docs: 0 }
],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
detect_cardinality(&[
multivalued_index.clone(),
ColumnIndex::Empty { num_docs: 0 }
]),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(
&[multivalued_index.clone(), optional_index.clone()],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[multivalued_index.clone(), optional_index.clone()]),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(
&[optional_index, multivalued_index],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
detect_cardinality(&[optional_index, multivalued_index]),
Cardinality::Multivalued
);
}
@@ -168,9 +94,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -201,9 +126,8 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

View File

@@ -157,13 +157,7 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
non_null_row_ids,
num_rows,
} = serializable_index
else {
panic!()
};
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
assert_eq!(num_rows, 2);
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
assert_eq!(&non_null_rows, &[1]);

View File

@@ -1,8 +1,3 @@
//! # `column_index`
//!
//! `column_index` provides rank and select operations to associate positions when not all
//! documents have exactly one element.
mod merge;
mod multivalued_index;
mod optional_index;
@@ -42,14 +37,10 @@ impl From<MultiValueIndex> for ColumnIndex {
}
impl ColumnIndex {
#[inline]
pub fn is_multivalue(&self) -> bool {
matches!(self, ColumnIndex::Multivalued(_))
}
/// Returns the cardinality of the column index.
///
/// By convention, if the column contains no docs, we consider that it is
/// full.
// Returns the cardinality of the column index.
//
// By convention, if the column contains no docs, we consider that it is
// full.
#[inline]
pub fn get_cardinality(&self) -> Cardinality {
match self {
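The module documentation removed above describes `column_index` in terms of rank and select over the rows that actually carry a value. A toy illustration of those two operations over a sorted list of non-null row ids (the real `OptionalIndex` uses compact block encodings instead of a plain slice):

```rust
// Sketch of the rank/select idea behind an optional column index: the index
// stores the sorted row ids that actually hold a value. "rank" maps a doc id
// to its position among the non-null rows, "select" goes the other way.
fn rank(non_null_rows: &[u32], doc: u32) -> u32 {
    non_null_rows.partition_point(|&row| row < doc) as u32
}

fn select(non_null_rows: &[u32], ord: u32) -> Option<u32> {
    non_null_rows.get(ord as usize).copied()
}

fn main() {
    let non_null_rows = [2u32, 5, 9];
    assert_eq!(rank(&non_null_rows, 5), 1); // one value stored before doc 5
    assert_eq!(select(&non_null_rows, 2), Some(9)); // third value lives in row 9
}
```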

View File

@@ -215,12 +215,12 @@ mod bench {
let vals: Vec<RowId> = (0..TOTAL_NUM_VALUES)
.map(|_| rng.gen_bool(fill_ratio))
.enumerate()
.filter(|(_pos, val)| *val)
.filter(|(pos, val)| *val)
.map(|(pos, _)| pos as RowId)
.collect();
serialize_optional_index(&&vals[..], TOTAL_NUM_VALUES, &mut out).unwrap();
open_optional_index(OwnedBytes::new(out)).unwrap()
let codec = open_optional_index(OwnedBytes::new(out)).unwrap();
codec
}
fn random_range_iterator(
@@ -242,7 +242,7 @@ mod bench {
}
fn n_percent_step_iterator(percent: f32, num_values: u32) -> impl Iterator<Item = u32> {
let ratio = percent / 100.0;
let ratio = percent as f32 / 100.0;
let step_size = (1f32 / ratio) as u32;
let deviation = step_size - 1;
random_range_iterator(0, num_values, step_size, deviation)

View File

@@ -30,7 +30,6 @@ impl<'a> SerializableColumnIndex<'a> {
}
}
/// Serialize a column index.
pub fn serialize_column_index(
column_index: SerializableColumnIndex,
output: &mut impl Write,
@@ -52,7 +51,6 @@ pub fn serialize_column_index(
Ok(column_index_num_bytes)
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(

View File

@@ -2,7 +2,7 @@
//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Columnar storage of data for tantivy [`Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128

View File

@@ -139,12 +139,12 @@ impl MonotonicallyMappableToU64 for i64 {
impl MonotonicallyMappableToU64 for DateTime {
#[inline(always)]
fn to_u64(self) -> u64 {
common::i64_to_u64(self.into_timestamp_nanos())
common::i64_to_u64(self.into_timestamp_micros())
}
#[inline(always)]
fn from_u64(val: u64) -> Self {
DateTime::from_timestamp_nanos(common::u64_to_i64(val))
DateTime::from_timestamp_micros(common::u64_to_i64(val))
}
}
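`MonotonicallyMappableToU64` leans on an order-preserving i64-to-u64 conversion (`common::i64_to_u64`). The usual trick is to flip the sign bit; a sketch of that mapping, shown here as an assumption rather than a copy of tantivy-common's helper:

```rust
// Order-preserving i64 <-> u64 mapping: flipping the sign bit maps
// i64::MIN..=i64::MAX onto 0..=u64::MAX while preserving ordering.
fn i64_to_u64(val: i64) -> u64 {
    (val as u64) ^ (1u64 << 63)
}

fn u64_to_i64(val: u64) -> i64 {
    (val ^ (1u64 << 63)) as i64
}

fn main() {
    let values = [i64::MIN, -1, 0, 1, i64::MAX];
    let mapped: Vec<u64> = values.iter().map(|&v| i64_to_u64(v)).collect();
    // The mapped values are sorted because the mapping is monotonic.
    assert!(mapped.windows(2).all(|w| w[0] <= w[1]));
    // And it round-trips.
    assert!(values.iter().all(|&v| u64_to_i64(i64_to_u64(v)) == v));
}
```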

View File

@@ -38,6 +38,6 @@ impl Ord for BlankRange {
}
impl PartialOrd for BlankRange {
fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
Some(self.cmp(other))
Some(self.blank_size().cmp(&other.blank_size()))
}
}

View File

@@ -83,8 +83,7 @@ impl ColumnValues for BitpackedReader {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;

View File

@@ -27,7 +27,7 @@ pub struct StatsCollector {
// This is the same as computing the difference between the values and the first value.
//
// This way, we can compress i64-converted-to-u64 (e.g. timestamp that were supplied in
// seconds, only to be converted in nanoseconds).
// seconds, only to be converted in microseconds).
increment_gcd_opt: Option<(NonZeroU64, DividerU64)>,
first_value_opt: Option<u64>,
}

View File

@@ -34,7 +34,7 @@ impl fmt::Display for ColumnType {
ColumnType::IpAddr => "ip",
ColumnType::DateTime => "datetime",
};
write!(f, "{short_str}")
write!(f, "{}", short_str)
}
}
@@ -54,9 +54,6 @@ impl ColumnType {
pub fn to_code(self) -> u8 {
self as u8
}
pub fn is_date_time(&self) -> bool {
self == &ColumnType::DateTime
}
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)

View File

@@ -1,7 +1,7 @@
use std::io::{self, Write};
use common::{BitSet, CountingWriter, ReadOnlyBitSet};
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
use sstable::{SSTable, TermOrdinal};
use super::term_merger::TermMerger;
use crate::column::serialize_column_mappable_to_u64;
@@ -52,23 +52,18 @@ impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
impl<'a> RemappedTermOrdinalsValues<'a> {
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
let iter = self
.bytes_columns
.iter()
.enumerate()
.flat_map(|(seg_ord, bytes_column_opt)| {
let bytes_column = bytes_column_opt.as_ref()?;
Some((seg_ord, bytes_column))
})
.flat_map(move |(seg_ord, bytes_column)| {
let term_ord_after_merge_mapping =
self.term_ord_mapping.get_segment(seg_ord as u32);
let iter = self.bytes_columns.iter().flatten().enumerate().flat_map(
move |(seg_ord_with_column, bytes_column)| {
let term_ord_after_merge_mapping = self
.term_ord_mapping
.get_segment(seg_ord_with_column as u32);
bytes_column
.ords()
.values
.iter()
.map(move |term_ord| term_ord_after_merge_mapping[term_ord as usize])
});
},
);
Box::new(iter)
}
@@ -126,15 +121,10 @@ fn serialize_merged_dict(
let mut term_ord_mapping = TermOrdinalMapping::default();
let mut field_term_streams = Vec::new();
for column_opt in bytes_columns.iter() {
if let Some(column) = column_opt {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
field_term_streams.push(terms);
} else {
term_ord_mapping.add_segment(0);
field_term_streams.push(Streamer::empty());
}
for column in bytes_columns.iter().flatten() {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms = column.dictionary.stream()?;
field_term_streams.push(terms);
}
let mut merged_terms = TermMerger::new(field_term_streams);

View File

@@ -11,17 +11,6 @@ pub struct StackMergeOrder {
}
impl StackMergeOrder {
#[cfg(test)]
pub fn stack_for_test(num_rows_per_columnar: &[u32]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(num_rows_per_columnar.len());
let mut cumulated_row_id = 0;
for &num_rows in num_rows_per_columnar {
cumulated_row_id += num_rows;
cumulated_row_ids.push(cumulated_row_id);
}
StackMergeOrder { cumulated_row_ids }
}
pub fn stack(columnars: &[&ColumnarReader]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
let mut cumulated_row_id = 0;
@@ -52,8 +41,8 @@ pub enum MergeRowOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_readers has n_rows_i rows, then
/// in the resulting columnar,
/// rows [r0..n_row_0) contains the rows of `columnar_readers[0]`, in order
/// rows [n_row_0..n_row_0 + n_row_1) contains the rows of `columnar_readers[1]`, in order.
/// rows [r0..n_row_0) contains the rows of columnar_readers[0], in order
/// rows [n_row_0..n_row_0 + n_row_1) contains the rows of columnar_readers[1], in order.
/// ..
/// No documents are deleted.
Stack(StackMergeOrder),
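
Editor's note: a worked example of the stacking described in the doc-comment above, using the same cumulation as `stack_for_test` (the `global` closure below is illustrative, not part of the API):

```rust
fn main() {
    let num_rows_per_columnar = [3u32, 2, 4];
    // Same cumulation as stack_for_test: cumulated_row_ids[i] is one past the
    // last global row id belonging to columnar i.
    let mut cumulated_row_ids = Vec::new();
    let mut cumulated = 0u32;
    for &num_rows in &num_rows_per_columnar {
        cumulated += num_rows;
        cumulated_row_ids.push(cumulated);
    }
    assert_eq!(cumulated_row_ids, vec![3, 5, 9]);

    // Global row id of local row `row_id` in columnar `seg`.
    let global = |seg: usize, row_id: u32| -> u32 {
        let offset = if seg == 0 { 0 } else { cumulated_row_ids[seg - 1] };
        offset + row_id
    };
    assert_eq!(global(0, 2), 2); // last row of the first columnar
    assert_eq!(global(1, 0), 3); // first row of the second columnar
    assert_eq!(global(2, 3), 8); // last row overall (9 rows total)
}
```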

View File

@@ -2,12 +2,11 @@ mod merge_dict_column;
mod merge_mapping;
mod term_merger;
use std::collections::{BTreeMap, HashSet};
use std::collections::{BTreeMap, HashMap, HashSet};
use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer;
@@ -18,8 +17,7 @@ use crate::columnar::writer::CompatibleNumericalTypes;
use crate::columnar::ColumnarReader;
use crate::dynamic_column::DynamicColumn;
use crate::{
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, DynamicColumnHandle, NumericalType,
NumericalValue,
BytesColumn, Column, ColumnIndex, ColumnType, ColumnValues, NumericalType, NumericalValue,
};
/// Column types are grouped into different categories.
@@ -29,16 +27,14 @@ use crate::{
/// In practice, only Numerical columns are coerced into one type today.
///
/// See also [README.md].
///
/// The ordering has to match the ordering of the variants in [ColumnType].
#[derive(Copy, Clone, Eq, PartialOrd, Ord, PartialEq, Hash, Debug)]
#[derive(Copy, Clone, Eq, PartialEq, Hash, Debug)]
pub(crate) enum ColumnTypeCategory {
Numerical,
Bytes,
Str,
Bool,
IpAddr,
Str,
Numerical,
DateTime,
Bytes,
IpAddr,
}
impl From<ColumnType> for ColumnTypeCategory {
@@ -86,22 +82,10 @@ pub fn merge_columnar(
.iter()
.map(|reader| reader.num_rows())
.collect::<Vec<u32>>();
let columns_to_merge =
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
for res in columns_to_merge {
let ((column_name, _column_type_category), grouped_columns) = res;
let grouped_columns = grouped_columns.open(&merge_row_order)?;
if grouped_columns.is_empty() {
continue;
}
let column_type = grouped_columns.column_type_after_merge();
let mut columns = grouped_columns.columns;
coerce_columns(column_type, &mut columns)?;
let columns_to_merge = group_columns_for_merge(columnar_readers, required_columns)?;
for ((column_name, column_type), columns) in columns_to_merge {
let mut column_serializer =
serializer.start_serialize_column(column_name.as_bytes(), column_type);
serializer.serialize_column(column_name.as_bytes(), column_type);
merge_column(
column_type,
&num_rows_per_columnar,
@@ -109,9 +93,7 @@ pub fn merge_columnar(
&merge_row_order,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
serializer.finalize(merge_row_order.num_rows())?;
Ok(())
}
@@ -225,12 +207,40 @@ fn merge_column(
struct GroupedColumns {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumn>>,
column_category: ColumnTypeCategory,
}
impl GroupedColumns {
/// Checks whether the column group can be skipped during serialization.
fn is_empty(&self) -> bool {
self.required_column_type.is_none() && self.columns.iter().all(Option::is_none)
fn for_category(column_category: ColumnTypeCategory, num_columnars: usize) -> Self {
GroupedColumns {
required_column_type: None,
columns: vec![None; num_columnars],
column_category,
}
}
/// Set the dynamic column for a given columnar.
fn set_column(&mut self, columnar_id: usize, column: DynamicColumn) {
self.columns[columnar_id] = Some(column);
}
/// Force the existence of a column, as well as its type.
fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
if let Some(existing_required_type) = self.required_column_type {
if existing_required_type == required_type {
// This was just a duplicate in the `required_columns`.
// Nothing to do.
return Ok(());
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Required column conflicts with another required column of the same type \
category.",
));
}
}
self.required_column_type = Some(required_type);
Ok(())
}
/// Returns the column type after merge.
@@ -252,76 +262,11 @@ impl GroupedColumns {
}
// At the moment, only the Numerical column type category has more than one possible
// column type.
assert!(self
.columns
.iter()
.flatten()
.all(|el| ColumnTypeCategory::from(el.column_type()) == ColumnTypeCategory::Numerical));
assert_eq!(self.column_category, ColumnTypeCategory::Numerical);
merged_numerical_columns_type(self.columns.iter().flatten()).into()
}
}
struct GroupedColumnsHandle {
required_column_type: Option<ColumnType>,
columns: Vec<Option<DynamicColumnHandle>>,
}
impl GroupedColumnsHandle {
fn new(num_columnars: usize) -> Self {
GroupedColumnsHandle {
required_column_type: None,
columns: vec![None; num_columnars],
}
}
fn open(self, merge_row_order: &MergeRowOrder) -> io::Result<GroupedColumns> {
let mut columns: Vec<Option<DynamicColumn>> = Vec::new();
for (columnar_id, column) in self.columns.iter().enumerate() {
if let Some(column) = column {
let column = column.open()?;
// We skip columns that end up with 0 documents.
// That way, we make sure they don't end up influencing the merge type or
// creating empty columns.
if is_empty_after_merge(merge_row_order, &column, columnar_id) {
columns.push(None);
} else {
columns.push(Some(column));
}
} else {
columns.push(None);
}
}
Ok(GroupedColumns {
required_column_type: self.required_column_type,
columns,
})
}
/// Set the dynamic column for a given columnar.
fn set_column(&mut self, columnar_id: usize, column: DynamicColumnHandle) {
self.columns[columnar_id] = Some(column);
}
/// Force the existence of a column, as well as its type.
fn require_type(&mut self, required_type: ColumnType) -> io::Result<()> {
if let Some(existing_required_type) = self.required_column_type {
if existing_required_type == required_type {
// This was just a duplicate in the `required_columns`.
// Nothing to do.
return Ok(());
} else {
return Err(io::Error::new(
io::ErrorKind::InvalidInput,
"Required column conflicts with another required column of the same type \
category.",
));
}
}
self.required_column_type = Some(required_type);
Ok(())
}
}
/// Returns the type of the merged numerical column.
///
/// This function picks the first numerical type out of i64, u64, f64 (order matters
@@ -342,92 +287,48 @@ fn merged_numerical_columns_type<'a>(
compatible_numerical_types.to_numerical_type()
}
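
Editor's note: the doc-comment above describes picking the first of i64, u64, f64 that is compatible with all values, which the tests in the next file exercise. A minimal sketch of that rule, assuming "compatible" means losslessly representable; the real logic lives in `CompatibleNumericalTypes`:

```rust
#[derive(Debug, PartialEq)]
enum NumericalType { I64, U64, F64 }

#[derive(Clone, Copy)]
enum NumericalValue { I64(i64), U64(u64), F64(f64) }

fn merged_type(values: &[NumericalValue]) -> NumericalType {
    // i64 first: every value must be losslessly representable as i64.
    let fits_i64 = values.iter().all(|v| match *v {
        NumericalValue::I64(_) => true,
        NumericalValue::U64(u) => i64::try_from(u).is_ok(),
        NumericalValue::F64(_) => false,
    });
    if fits_i64 {
        return NumericalType::I64;
    }
    // Then u64: only non-negative integers qualify.
    let fits_u64 = values.iter().all(|v| match *v {
        NumericalValue::U64(_) => true,
        NumericalValue::I64(i) => i >= 0,
        NumericalValue::F64(_) => false,
    });
    if fits_u64 {
        return NumericalType::U64;
    }
    // Otherwise fall back to f64.
    NumericalType::F64
}

fn main() {
    // Mirrors test_column_coercion_to_u64 / test_column_coercion_to_i64 below.
    assert_eq!(
        merged_type(&[NumericalValue::I64(1), NumericalValue::U64(u64::MAX)]),
        NumericalType::U64
    );
    assert_eq!(
        merged_type(&[NumericalValue::I64(-1), NumericalValue::U64(2)]),
        NumericalType::I64
    );
    assert_eq!(merged_type(&[NumericalValue::F64(1.5)]), NumericalType::F64);
}
```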
fn is_empty_after_merge(
merge_row_order: &MergeRowOrder,
column: &DynamicColumn,
columnar_ord: usize,
) -> bool {
if column.num_values() == 0u32 {
// It was empty before the merge.
return true;
}
match merge_row_order {
MergeRowOrder::Stack(_) => {
// If we are stacking the columnar, no rows are being deleted.
false
}
MergeRowOrder::Shuffled(shuffled) => {
if let Some(alive_bitset) = &shuffled.alive_bitsets[columnar_ord] {
let column_index = column.column_index();
match column_index {
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_rows() {
if alive_bitset.contains(doc) {
return false;
}
}
true
}
ColumnIndex::Multivalued(multivalued_index) => {
for (doc_id, (start_index, end_index)) in multivalued_index
.start_index_column
.iter()
.tuple_windows()
.enumerate()
{
let doc_id = doc_id as u32;
if start_index == end_index {
// There are no values in this document
continue;
}
// The document contains values and is present in the alive bitset.
// The column is therefore not empty.
if alive_bitset.contains(doc_id) {
return false;
}
}
true
}
}
} else {
// No document is being deleted.
// The shuffle is applying a permutation.
false
}
}
}
}
/// Iterates over the columns of the columnar readers, grouped by column name.
/// Key functionality is that `open` of the columns is done lazily per group.
fn group_columns_for_merge<'a>(
columnar_readers: &'a [&'a ColumnarReader],
required_columns: &'a [(String, ColumnType)],
_merge_row_order: &'a MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
#[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
required_columns: &[(String, ColumnType)],
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {
// Each column name may have multiple column types associated with it.
// For merging, we group by column type category, since columns of the same category can be merged.
let mut columns_grouped: HashMap<(String, ColumnTypeCategory), GroupedColumns> = HashMap::new();
for &(ref column_name, column_type) in required_columns {
columns
columns_grouped
.entry((column_name.clone(), column_type.into()))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.or_insert_with(|| {
GroupedColumns::for_category(column_type.into(), columnar_readers.len())
})
.require_type(column_type)?;
}
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.iter_columns()?;
let column_name_and_handle = columnar_reader.list_columns()?;
for (column_name, handle) in column_name_and_handle {
let column_category: ColumnTypeCategory = handle.column_type().into();
columns
let column = handle.open()?;
columns_grouped
.entry((column_name, column_category))
.or_insert_with(|| GroupedColumnsHandle::new(columnar_readers.len()))
.set_column(columnar_id, handle);
.or_insert_with(|| {
GroupedColumns::for_category(column_category, columnar_readers.len())
})
.set_column(columnar_id, column);
}
}
Ok(columns)
let mut merge_columns: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
Default::default();
for ((column_name, _), mut grouped_columns) in columns_grouped {
let column_type = grouped_columns.column_type_after_merge();
coerce_columns(column_type, &mut grouped_columns.columns)?;
merge_columns.insert((column_name, column_type), grouped_columns.columns);
}
Ok(merge_columns)
}
fn coerce_columns(

View File

@@ -1,5 +1,3 @@
use std::collections::BTreeMap;
use itertools::Itertools;
use super::*;
@@ -27,73 +25,70 @@ fn test_column_coercion_to_u64() {
let columnar1 = make_columnar("numbers", &[1i64]);
// u64 type
let columnar2 = make_columnar("numbers", &[u64::MAX]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_column_no_coercion_if_all_the_same() {
let columnar1 = make_columnar("numbers", &[1u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_column_coercion_to_i64() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
}
//#[test]
// fn test_impossible_coercion_returns_an_error() {
// let columnar1 = make_columnar("numbers", &[u64::MAX]);
// let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
// let group_error = group_columns_for_merge_iter(
//&[&columnar1],
//&[("numbers".to_string(), ColumnType::I64)],
//&merge_order,
//)
//.unwrap_err();
// assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
//}
#[test]
fn test_impossible_coercion_returns_an_error() {
let columnar1 = make_columnar("numbers", &[u64::MAX]);
let group_error =
group_columns_for_merge(&[&columnar1], &[("numbers".to_string(), ColumnType::I64)])
.map(|_| ())
.unwrap_err();
assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
}
#[test]
fn test_group_columns_with_required_column() {
let columnar1 = make_columnar("numbers", &[1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_group_columns_required_column_with_no_existing_columns() {
let columnar1 = make_columnar("numbers", &[2u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<_, _> = group_columns_for_merge(
columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
&[("required_col".to_string(), ColumnType::Str)],
)
.unwrap();
assert_eq!(column_map.len(), 2);
let columns = &column_map
.get(&("required_col".to_string(), ColumnTypeCategory::Str))
.unwrap()
.columns;
let columns = column_map
.get(&("required_col".to_string(), ColumnType::Str))
.unwrap();
assert_eq!(columns.len(), 2);
assert!(columns[0].is_none());
assert!(columns[1].is_none());
@@ -103,42 +98,35 @@ fn test_group_columns_required_column_with_no_existing_columns() {
fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_rule() {
let columnar1 = make_columnar("numbers", &[2i64]);
let columnar2 = make_columnar("numbers", &[2i64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
columnars,
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
#[test]
fn test_missing_column() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers2", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
assert_eq!(column_map.len(), 2);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
{
let columns = &column_map
.get(&("numbers".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers".to_string(), ColumnType::I64))
.unwrap();
assert!(columns[0].is_some());
assert!(columns[1].is_none());
}
{
let columns = &column_map
.get(&("numbers2".to_string(), ColumnTypeCategory::Numerical))
.unwrap()
.columns;
let columns = column_map
.get(&("numbers2".to_string(), ColumnType::U64))
.unwrap();
assert!(columns[0].is_none());
assert!(columns[1].is_some());
}
@@ -236,9 +224,7 @@ fn test_merge_columnar_numbers() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::F64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.first(0u32), Some(-1f64));
assert_eq!(vals.first(1u32), None);
@@ -264,9 +250,7 @@ fn test_merge_columnar_texts() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
@@ -313,9 +297,7 @@ fn test_merge_columnar_byte() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -369,9 +351,7 @@ fn test_merge_columnar_byte_with_missing() {
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -423,9 +403,7 @@ fn test_merge_columnar_different_types() {
// numeric column
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::I64(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
@@ -435,9 +413,7 @@ fn test_merge_columnar_different_types() {
// text column
let dynamic_column = cols[1].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
let mut out = String::new();

View File

@@ -102,41 +102,30 @@ impl ColumnarReader {
pub fn num_rows(&self) -> RowId {
self.num_rows
}
// Iterate over the columns in a sorted way
pub fn iter_columns(
&self,
) -> io::Result<impl Iterator<Item = (String, DynamicColumnHandle)> + '_> {
let mut stream = self.column_dictionary.stream()?;
Ok(std::iter::from_fn(move || {
if stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
// TODO Error Handling. The API gets quite ugly when returning the error here, so
// instead we could just check the first N columns upfront.
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))
.unwrap();
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
Some((column_name, column_handle))
} else {
None
}
}))
}
// TODO Add unit tests
pub fn list_columns(&self) -> io::Result<Vec<(String, DynamicColumnHandle)>> {
Ok(self.iter_columns()?.collect())
let mut stream = self.column_dictionary.stream()?;
let mut results = Vec::new();
while stream.advance() {
let key_bytes: &[u8] = stream.key();
let column_code: u8 = key_bytes.last().cloned().unwrap();
let column_type: ColumnType = ColumnType::try_from_code(column_code)
.map_err(|_| io_invalid_data(format!("Unknown column code `{column_code}`")))?;
let range = stream.value().clone();
let column_name =
// The last two bytes are respectively the 0u8 separator and the column_type.
String::from_utf8_lossy(&key_bytes[..key_bytes.len() - 2]).to_string();
let file_slice = self
.column_data
.slice(range.start as usize..range.end as usize);
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
};
results.push((column_name, column_handle));
}
Ok(results)
}
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
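
Editor's note: the iteration above relies on the column-dictionary key layout spelled out in the comment: the column name bytes, a `0u8` separator, then the column-type code, with the sstable value holding the byte range of the column data. A self-contained sketch of that encoding and decoding (illustrative only; the crate's actual `prepare_key` lives elsewhere, and the type code `3u8` below is arbitrary):

```rust
fn prepare_key(column_name: &[u8], column_type_code: u8, buffer: &mut Vec<u8>) {
    buffer.clear();
    buffer.extend_from_slice(column_name);
    buffer.push(0u8); // separator between the column name and the type code
    buffer.push(column_type_code);
}

fn parse_key(key: &[u8]) -> (String, u8) {
    let column_code = *key.last().unwrap();
    // The last two bytes are the 0u8 separator and the column-type code.
    let column_name = String::from_utf8_lossy(&key[..key.len() - 2]).to_string();
    (column_name, column_code)
}

fn main() {
    let mut buffer = Vec::new();
    prepare_key(b"price", 3u8, &mut buffer);
    let (name, code) = parse_key(&buffer);
    assert_eq!(name, "price");
    assert_eq!(code, 3u8);
}
```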

View File

@@ -79,6 +79,7 @@ fn mutate_or_create_column<V, TMutator>(
impl ColumnarWriter {
pub fn mem_usage(&self) -> usize {
// TODO add dictionary builders.
self.arena.mem_usage()
+ self.numerical_field_hash_map.mem_usage()
+ self.bool_field_hash_map.mem_usage()
@@ -86,11 +87,6 @@ impl ColumnarWriter {
+ self.str_field_hash_map.mem_usage()
+ self.ip_addr_field_hash_map.mem_usage()
+ self.datetime_field_hash_map.mem_usage()
+ self
.dictionaries
.iter()
.map(|dict| dict.mem_usage())
.sum::<usize>()
}
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
@@ -102,15 +98,9 @@ impl ColumnarWriter {
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
.or_else(|| {
self.datetime_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
})
else {
return Vec::new();
let Some(numerical_col_writer) =
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();
@@ -276,7 +266,7 @@ impl ColumnarWriter {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_nanos()),
NumericalValue::I64(datetime.into_timestamp_micros()),
arena,
);
column
@@ -338,7 +328,7 @@ impl ColumnarWriter {
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
.iter()
.map(|(column_name, addr)| {
.map(|(column_name, addr, _)| {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let column_type = numerical_column_writer.numerical_type().into();
@@ -348,27 +338,27 @@ impl ColumnarWriter {
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
.map(|(term, addr, _)| (term, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Str, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Str, addr)),
);
columns.extend(
self.bool_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::Bool, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::Bool, addr)),
);
columns.extend(
self.ip_addr_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::IpAddr, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::IpAddr, addr)),
);
columns.extend(
self.datetime_field_hash_map
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
.map(|(column_name, addr, _)| (column_name, ColumnType::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
@@ -380,7 +370,7 @@ impl ColumnarWriter {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
serialize_bool_column(
cardinality,
num_docs,
@@ -392,13 +382,12 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::IpAddr => {
let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, ColumnType::IpAddr);
serializer.serialize_column(column_name, ColumnType::IpAddr);
serialize_ip_addr_column(
cardinality,
num_docs,
@@ -410,7 +399,6 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::Bytes | ColumnType::Str => {
let str_or_bytes_column_writer: StrOrBytesColumnWriter =
@@ -425,7 +413,7 @@ impl ColumnarWriter {
.column_writer
.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
serialize_bytes_or_str_column(
cardinality,
num_docs,
@@ -439,14 +427,13 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::F64 | ColumnType::I64 | ColumnType::U64 => {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let cardinality = numerical_column_writer.cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, column_type);
serializer.serialize_column(column_name, column_type);
let numerical_type = column_type.numerical_type().unwrap();
serialize_numerical_column(
cardinality,
@@ -460,13 +447,12 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::DateTime => {
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.start_serialize_column(column_name, ColumnType::DateTime);
serializer.serialize_column(column_name, ColumnType::DateTime);
serialize_numerical_column(
cardinality,
num_docs,
@@ -479,7 +465,6 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
};
}

View File

@@ -34,12 +34,11 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
/// Creates a ColumnSerializer.
pub fn start_serialize_column<'a>(
pub fn serialize_column<'a>(
&'a mut self,
column_name: &[u8],
column_type: ColumnType,
) -> ColumnSerializer<'a, W> {
) -> impl io::Write + 'a {
let start_offset = self.wrt.written_bytes();
prepare_key(column_name, column_type, &mut self.prepare_key_buffer);
ColumnSerializer {
@@ -61,21 +60,20 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
pub struct ColumnSerializer<'a, W: io::Write> {
struct ColumnSerializer<'a, W: io::Write> {
columnar_serializer: &'a mut ColumnarSerializer<W>,
start_offset: u64,
}
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
pub fn finalize(self) -> io::Result<()> {
impl<'a, W: io::Write> Drop for ColumnSerializer<'a, W> {
fn drop(&mut self) {
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
let byte_range = self.start_offset..end_offset;
self.columnar_serializer.sstable_range.insert(
self.columnar_serializer.sstable_range.insert_cannot_fail(
&self.columnar_serializer.prepare_key_buffer[..],
&byte_range,
)?;
);
self.columnar_serializer.prepare_key_buffer.clear();
Ok(())
}
}
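
Editor's note: the pattern being reshaped in this hunk: remember the writer's offset when a column starts, and record `start..end` under the prepared key once the column is finished. A stripped-down sketch of that bookkeeping, with a plain `Vec`-backed writer standing in for the counting writer and range sstable:

```rust
use std::collections::BTreeMap;
use std::ops::Range;

struct ToyColumnarSerializer {
    wrt: Vec<u8>,                          // stands in for the counting writer
    ranges: BTreeMap<Vec<u8>, Range<u64>>, // stands in for the range sstable
}

impl ToyColumnarSerializer {
    fn serialize_column(&mut self, key: Vec<u8>, column_bytes: &[u8]) {
        let start = self.wrt.len() as u64;
        self.wrt.extend_from_slice(column_bytes);
        let end = self.wrt.len() as u64;
        // Recording start..end under the key is what lets a reader slice the
        // column data back out of the concatenated file later.
        self.ranges.insert(key, start..end);
    }
}

fn main() {
    let mut serializer = ToyColumnarSerializer {
        wrt: Vec::new(),
        ranges: BTreeMap::new(),
    };
    serializer.serialize_column(b"price".to_vec(), &[1, 2, 3]);
    serializer.serialize_column(b"title".to_vec(), &[4, 5]);
    assert_eq!(serializer.ranges[b"price".as_slice()], 0..3);
    assert_eq!(serializer.ranges[b"title".as_slice()], 3..5);
}
```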

View File

@@ -32,7 +32,6 @@ pub struct OrderedId(pub u32);
#[derive(Default)]
pub(crate) struct DictionaryBuilder {
dict: FnvHashMap<Vec<u8>, UnorderedId>,
memory_consumption: usize,
}
impl DictionaryBuilder {
@@ -44,8 +43,6 @@ impl DictionaryBuilder {
}
let new_id = UnorderedId(self.dict.len() as u32);
self.dict.insert(term.to_vec(), new_id);
self.memory_consumption += term.len();
self.memory_consumption += 40; // Term Metadata + HashMap overhead
new_id
}
@@ -66,10 +63,6 @@ impl DictionaryBuilder {
sstable_builder.finish()?;
Ok(TermIdMapping { unordered_to_ord })
}
pub(crate) fn mem_usage(&self) -> usize {
self.memory_consumption
}
}
#[cfg(test)]

View File

@@ -26,14 +26,14 @@ impl fmt::Debug for DynamicColumn {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "[{} {} |", self.get_cardinality(), self.column_type())?;
match self {
DynamicColumn::Bool(col) => write!(f, " {col:?}")?,
DynamicColumn::I64(col) => write!(f, " {col:?}")?,
DynamicColumn::U64(col) => write!(f, " {col:?}")?,
DynamicColumn::F64(col) => write!(f, "{col:?}")?,
DynamicColumn::IpAddr(col) => write!(f, "{col:?}")?,
DynamicColumn::DateTime(col) => write!(f, "{col:?}")?,
DynamicColumn::Bytes(col) => write!(f, "{col:?}")?,
DynamicColumn::Str(col) => write!(f, "{col:?}")?,
DynamicColumn::Bool(col) => write!(f, " {:?}", col)?,
DynamicColumn::I64(col) => write!(f, " {:?}", col)?,
DynamicColumn::U64(col) => write!(f, " {:?}", col)?,
DynamicColumn::F64(col) => write!(f, "{:?}", col)?,
DynamicColumn::IpAddr(col) => write!(f, "{:?}", col)?,
DynamicColumn::DateTime(col) => write!(f, "{:?}", col)?,
DynamicColumn::Bytes(col) => write!(f, "{:?}", col)?,
DynamicColumn::Str(col) => write!(f, "{:?}", col)?,
}
write!(f, "]")
}
@@ -228,7 +228,7 @@ static_dynamic_conversions!(StrColumn, Str);
static_dynamic_conversions!(BytesColumn, Bytes);
static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
#[derive(Clone, Debug)]
#[derive(Clone)]
pub struct DynamicColumnHandle {
pub(crate) file_slice: FileSlice,
pub(crate) column_type: ColumnType,
@@ -247,7 +247,7 @@ impl DynamicColumnHandle {
}
/// Returns the `u64` fast field reader associated with `fields` of types
/// Str, u64, i64, f64, bool, or datetime.
/// Str, u64, i64, f64, or datetime.
///
/// If not, the fastfield reader will return the u64 value associated with the original
/// FastValue.
@@ -258,12 +258,9 @@ impl DynamicColumnHandle {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
Ok(Some(column.term_ord_column))
}
ColumnType::Bool => Ok(None),
ColumnType::IpAddr => Ok(None),
ColumnType::Bool
| ColumnType::I64
| ColumnType::U64
| ColumnType::F64
| ColumnType::DateTime => {
ColumnType::I64 | ColumnType::U64 | ColumnType::F64 | ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
Ok(Some(column))
}

View File

@@ -12,7 +12,7 @@ use std::io;
mod block_accessor;
mod column;
pub mod column_index;
mod column_index;
pub mod column_values;
mod columnar;
mod dictionary;
@@ -39,7 +39,7 @@ pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32;
pub type DocId = u32;
#[derive(Clone, Copy, Debug)]
#[derive(Clone, Copy)]
pub struct RowAddr {
pub segment_ord: u32,
pub row_id: RowId,

View File

@@ -4,15 +4,13 @@ use std::net::Ipv6Addr;
use common::DateTime;
use proptest::prelude::*;
use proptest::sample::subsequence;
use crate::column_values::MonotonicallyMappableToU128;
use crate::columnar::{ColumnType, ColumnTypeCategory};
use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
use crate::value::{Coerce, NumericalValue};
use crate::{
BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowAddr, RowId,
ShuffleMergeOrder, StackMergeOrder,
BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowId, StackMergeOrder,
};
#[test]
@@ -26,7 +24,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 99);
assert_eq!(cols[0].num_bytes(), 89);
}
#[test]
@@ -40,7 +38,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 99);
assert_eq!(cols[0].num_bytes(), 89);
}
#[test]
@@ -57,9 +55,7 @@ fn test_dataframe_writer_bool() {
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -81,9 +77,7 @@ fn test_dataframe_writer_u64_multivalued() {
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
};
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
@@ -105,9 +99,7 @@ fn test_dataframe_writer_ip_addr() {
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
@@ -140,9 +132,7 @@ fn test_dataframe_writer_numerical() {
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else {
panic!();
};
let DynamicColumn::I64(column_i64) = column else { panic!(); };
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
assert_eq!(column_i64.first(0), None);
assert_eq!(column_i64.first(1), Some(12i64));
@@ -206,9 +196,7 @@ fn test_dictionary_encoded_str() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
@@ -240,9 +228,7 @@ fn test_dictionary_encoded_bytes() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
panic!();
};
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
@@ -274,15 +260,12 @@ fn test_dictionary_encoded_bytes() {
fn num_strategy() -> impl Strategy<Value = NumericalValue> {
prop_oneof![
3 => Just(NumericalValue::U64(0u64)),
3 => Just(NumericalValue::U64(u64::MAX)),
3 => Just(NumericalValue::I64(0i64)),
3 => Just(NumericalValue::I64(i64::MIN)),
3 => Just(NumericalValue::I64(i64::MAX)),
3 => Just(NumericalValue::F64(1.2f64)),
1 => any::<f64>().prop_map(NumericalValue::from),
1 => any::<u64>().prop_map(NumericalValue::from),
1 => any::<i64>().prop_map(NumericalValue::from),
Just(NumericalValue::U64(0u64)),
Just(NumericalValue::U64(u64::MAX)),
Just(NumericalValue::I64(0i64)),
Just(NumericalValue::I64(i64::MIN)),
Just(NumericalValue::I64(i64::MAX)),
Just(NumericalValue::F64(1.2f64)),
]
}
@@ -296,12 +279,6 @@ enum ColumnValue {
DateTime(DateTime),
}
impl<T: Into<NumericalValue>> From<T> for ColumnValue {
fn from(val: T) -> ColumnValue {
ColumnValue::Numerical(val.into())
}
}
impl ColumnValue {
pub(crate) fn column_type_category(&self) -> ColumnTypeCategory {
match self {
@@ -330,9 +307,9 @@ fn bytes_strategy() -> impl Strategy<Value = &'static [u8]> {
// A random column value
fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
prop_oneof![
10 => string_strategy().prop_map(ColumnValue::Str),
1 => bytes_strategy().prop_map(ColumnValue::Bytes),
40 => num_strategy().prop_map(ColumnValue::Numerical),
10 => string_strategy().prop_map(|s| ColumnValue::Str(s)),
1 => bytes_strategy().prop_map(|b| ColumnValue::Bytes(b)),
40 => num_strategy().prop_map(|n| ColumnValue::Numerical(n)),
1 => (1u16..3u16).prop_map(|ip_addr_byte| ColumnValue::IpAddr(Ipv6Addr::new(
127,
0,
@@ -343,7 +320,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
0,
ip_addr_byte
))),
1 => any::<bool>().prop_map(ColumnValue::Bool),
1 => any::<bool>().prop_map(|b| ColumnValue::Bool(b)),
1 => (0_679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
]
@@ -351,22 +328,12 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
// A document contains up to 4 values.
fn doc_strategy() -> impl Strategy<Value = Vec<(&'static str, ColumnValue)>> {
proptest::collection::vec((column_name_strategy(), column_value_strategy()), 0..=4)
}
fn num_docs_strategy() -> impl Strategy<Value = usize> {
prop_oneof!(
// We focus heavily on the 0..2 case as we assume it is sufficient to cover all edge cases.
0usize..=3usize,
// We leave 50% of the effort exploring more defensively.
3usize..=12usize
)
proptest::collection::vec((column_name_strategy(), column_value_strategy()), 0..4)
}
// A columnar contains up to 2 docs.
fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, ColumnValue)>>> {
num_docs_strategy()
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
proptest::collection::vec(doc_strategy(), 0..=2)
}
fn columnar_docs_and_mapping_strategy(
@@ -380,11 +347,6 @@ fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
Just((0u32..n as RowId).collect()).prop_shuffle()
}
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
let vals: Vec<usize> = (0..n).collect();
subsequence(vals, 0..=n).prop_shuffle()
}
fn build_columnar_with_mapping(
docs: &[Vec<(&'static str, ColumnValue)>],
old_to_new_row_ids_opt: Option<&[RowId]>,
@@ -419,23 +381,15 @@ fn build_columnar_with_mapping(
columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
let columnar_reader = ColumnarReader::open(buffer).unwrap();
columnar_reader
}
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
build_columnar_with_mapping(docs, None)
}
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
assert_columnar_eq(left, right, false);
}
fn assert_columnar_eq(
left: &ColumnarReader,
right: &ColumnarReader,
lenient_on_numerical_value: bool,
) {
fn assert_columnar_eq(left: &ColumnarReader, right: &ColumnarReader) {
assert_eq!(left.num_rows(), right.num_rows());
let left_columns = left.list_columns().unwrap();
let right_columns = right.list_columns().unwrap();
@@ -444,7 +398,7 @@ fn assert_columnar_eq(
assert_eq!(left_columns[i].0, right_columns[i].0);
let left_column = left_columns[i].1.open().unwrap();
let right_column = right_columns[i].1.open().unwrap();
assert_dyn_column_eq(&left_column, &right_column, lenient_on_numerical_value);
assert_dyn_column_eq(&left_column, &right_column);
}
}
@@ -488,11 +442,11 @@ fn assert_bytes_column_eq(left: &BytesColumn, right: &BytesColumn) {
assert!(!right_terms.advance());
}
fn assert_dyn_column_eq(
left_dyn_column: &DynamicColumn,
right_dyn_column: &DynamicColumn,
lenient_on_numerical_value: bool,
) {
fn assert_dyn_column_eq(left_dyn_column: &DynamicColumn, right_dyn_column: &DynamicColumn) {
assert_eq!(
&left_dyn_column.column_type(),
&right_dyn_column.column_type()
);
assert_eq!(
&left_dyn_column.get_cardinality(),
&right_dyn_column.get_cardinality()
@@ -522,19 +476,8 @@ fn assert_dyn_column_eq(
(DynamicColumn::Str(left_col), DynamicColumn::Str(right_col)) => {
assert_bytes_column_eq(left_col, right_col);
}
(left, right) => {
if lenient_on_numerical_value {
assert_eq!(
ColumnTypeCategory::from(left.column_type()),
ColumnTypeCategory::from(right.column_type())
);
} else {
panic!(
"Column type are not the same: {:?} vs {:?}",
left.column_type(),
right.column_type()
);
}
_ => {
unreachable!()
}
}
}
@@ -545,36 +488,28 @@ trait AssertEqualToColumnValue {
impl AssertEqualToColumnValue for bool {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Bool(val) = column_value else {
panic!()
};
let ColumnValue::Bool(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl AssertEqualToColumnValue for Ipv6Addr {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::IpAddr(val) = column_value else {
panic!()
};
let ColumnValue::IpAddr(val) = column_value else { panic!() };
assert_eq!(self, val);
}
}
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Numerical(num) = column_value else {
panic!()
};
let ColumnValue::Numerical(num) = column_value else { panic!() };
assert_eq!(self, &T::coerce(*num));
}
}
impl AssertEqualToColumnValue for DateTime {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::DateTime(dt) = column_value else {
panic!()
};
let ColumnValue::DateTime(dt) = column_value else { panic!() };
assert_eq!(self, dt);
}
}
@@ -746,9 +681,9 @@ proptest! {
let stack_merge_order = StackMergeOrder::stack(&columnar_readers_arr[..]).into();
crate::merge_columnar(&columnar_readers_arr[..], &[], stack_merge_order, &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().flatten().cloned().collect();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
}
}
@@ -772,9 +707,9 @@ fn test_columnar_merging_empty_columnar() {
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
}
#[test]
@@ -809,137 +744,10 @@ fn test_columnar_merging_number_columns() {
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().flatten().cloned().collect();
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
}
// TODO add non trivial remap and merge
// TODO test required_columns
// TODO document edge case: required_columns incompatible with values.
fn columnar_docs_and_remap(
) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
let row_addrs: Vec<RowAddr> = columnars_docs
.iter()
.enumerate()
.flat_map(|(segment_ord, columnar_docs)| {
(0u32..columnar_docs.len() as u32).map(move |row_id| RowAddr {
segment_ord: segment_ord as u32,
row_id,
})
})
.collect();
permutation_and_subset_strategy(row_addrs.len()).prop_map(move |shuffled_subset| {
let shuffled_row_addr_subset: Vec<RowAddr> =
shuffled_subset.iter().map(|ord| row_addrs[*ord]).collect();
(columnars_docs.clone(), shuffled_row_addr_subset)
})
},
)
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
}
}
#[test]
fn test_columnar_merge_empty() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 0];
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, vec![]);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 0);
assert_eq!(merged_columnar.num_columns(), 0);
}
#[test]
fn test_columnar_merge_single_str_column() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 1];
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&segment_num_rows,
vec![RowAddr {
segment_ord: 1u32,
row_id: 0u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
}
#[test]
fn test_delete_decrease_cardinality() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[
vec![
("c", ColumnValue::from(0i64)),
("c", ColumnValue::from(0i64)),
],
vec![("c", ColumnValue::from(0i64))],
][..];
// c is multivalued here
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&[0, 2],
vec![RowAddr {
segment_ord: 1u32,
row_id: 1u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
let cols = merged_columnar.read_columns("c").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].column_type(), ColumnType::I64);
assert_eq!(cols[0].open().unwrap().get_cardinality(), Cardinality::Full);
}

View File

@@ -109,7 +109,7 @@ impl Coerce for f64 {
impl Coerce for DateTime {
fn coerce(value: NumericalValue) -> Self {
let timestamp_micros = i64::coerce(value);
DateTime::from_timestamp_nanos(timestamp_micros)
DateTime::from_timestamp_micros(timestamp_micros)
}
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.6.0"
version = "0.5.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
@@ -14,7 +14,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.6", path="../ownedbytes" }
ownedbytes = { version= "0.5", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

View File

@@ -1,39 +0,0 @@
#![feature(test)]
extern crate test;
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
}

View File

@@ -37,7 +37,7 @@ impl ByteCount {
for (suffix, threshold) in SUFFIX_AND_THRESHOLD.iter().rev() {
if self.get_bytes() >= *threshold {
let unit_num = self.get_bytes() as f64 / *threshold as f64;
return format!("{unit_num:.2} {suffix}");
return format!("{:.2} {}", unit_num, suffix);
}
}
format!("{:.2} B", self.get_bytes())
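
Editor's note: a toy version of the suffix-threshold formatting above, with assumed 1000-based thresholds (the real `SUFFIX_AND_THRESHOLD` table is defined elsewhere in the file):

```rust
fn human_readable(bytes: u64) -> String {
    // (suffix, threshold) pairs, scanned from the largest unit down.
    const SUFFIX_AND_THRESHOLD: [(&str, u64); 3] =
        [("KB", 1_000), ("MB", 1_000_000), ("GB", 1_000_000_000)];
    for (suffix, threshold) in SUFFIX_AND_THRESHOLD.iter().rev() {
        if bytes >= *threshold {
            let unit_num = bytes as f64 / *threshold as f64;
            return format!("{unit_num:.2} {suffix}");
        }
    }
    format!("{:.2} B", bytes as f64)
}

fn main() {
    assert_eq!(human_readable(512), "512.00 B");
    assert_eq!(human_readable(1_500), "1.50 KB");
    assert_eq!(human_readable(3_000_000), "3.00 MB");
}
```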

View File

@@ -1,36 +1,25 @@
#![allow(deprecated)]
use std::fmt;
use std::io::{Read, Write};
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
use crate::BinarySerializable;
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
/// DateTime Precision
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DateTimePrecision {
/// Second precision.
pub enum DatePrecision {
/// Seconds precision
#[default]
Seconds,
/// Millisecond precision.
/// Milli-seconds precision.
Milliseconds,
/// Microsecond precision.
/// Micro-seconds precision.
Microseconds,
/// Nanosecond precision.
Nanoseconds,
}
#[deprecated(since = "0.20.0", note = "Use `DateTimePrecision` instead")]
pub type DatePrecision = DateTimePrecision;
/// A date/time value with nanosecond precision.
/// A date/time value with microsecond precision.
///
/// This timestamp does not carry any explicit time zone information.
/// Users are responsible for applying the provided conversion
@@ -42,46 +31,39 @@ pub type DatePrecision = DateTimePrecision;
/// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DateTime {
// Timestamp in nanoseconds.
pub(crate) timestamp_nanos: i64,
// Timestamp in microseconds.
pub(crate) timestamp_micros: i64,
}
impl DateTime {
/// Minimum possible `DateTime` value.
pub const MIN: DateTime = DateTime {
timestamp_nanos: i64::MIN,
timestamp_micros: i64::MIN,
};
/// Maximum possible `DateTime` value.
pub const MAX: DateTime = DateTime {
timestamp_nanos: i64::MAX,
timestamp_micros: i64::MAX,
};
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
Self {
timestamp_nanos: seconds * 1_000_000_000,
timestamp_micros: seconds * 1_000_000,
}
}
/// Create new from UNIX timestamp in milliseconds
pub const fn from_timestamp_millis(milliseconds: i64) -> Self {
Self {
timestamp_nanos: milliseconds * 1_000_000,
timestamp_micros: milliseconds * 1_000,
}
}
/// Create new from UNIX timestamp in microseconds.
pub const fn from_timestamp_micros(microseconds: i64) -> Self {
Self {
timestamp_nanos: microseconds * 1_000,
}
}
/// Create new from UNIX timestamp in nanoseconds.
pub const fn from_timestamp_nanos(nanoseconds: i64) -> Self {
Self {
timestamp_nanos: nanoseconds,
timestamp_micros: microseconds,
}
}
@@ -89,9 +71,9 @@ impl DateTime {
///
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_nanos = dt.unix_timestamp_nanos() as i64;
Self { timestamp_nanos }
pub const fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_micros = dt.unix_timestamp() * 1_000_000 + dt.microsecond() as i64;
Self { timestamp_micros }
}
/// Create new from `PrimitiveDateTime`
@@ -105,27 +87,23 @@ impl DateTime {
/// Convert to UNIX timestamp in seconds.
pub const fn into_timestamp_secs(self) -> i64 {
self.timestamp_nanos / 1_000_000_000
self.timestamp_micros / 1_000_000
}
/// Convert to UNIX timestamp in milliseconds.
pub const fn into_timestamp_millis(self) -> i64 {
self.timestamp_nanos / 1_000_000
self.timestamp_micros / 1_000
}
/// Convert to UNIX timestamp in microseconds.
pub const fn into_timestamp_micros(self) -> i64 {
self.timestamp_nanos / 1_000
}
/// Convert to UNIX timestamp in nanoseconds.
pub const fn into_timestamp_nanos(self) -> i64 {
self.timestamp_nanos
self.timestamp_micros
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(self.timestamp_nanos as i128)
let timestamp_nanos = self.timestamp_micros as i128 * 1000;
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(timestamp_nanos)
.expect("valid UNIX timestamp");
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
utc_datetime
@@ -148,34 +126,21 @@ impl DateTime {
}
/// Truncates the microseconds value to the corresponding precision.
pub fn truncate(self, precision: DateTimePrecision) -> Self {
pub fn truncate(self, precision: DatePrecision) -> Self {
let truncated_timestamp_micros = match precision {
DateTimePrecision::Seconds => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
DateTimePrecision::Milliseconds => (self.timestamp_nanos / 1_000_000) * 1_000_000,
DateTimePrecision::Microseconds => (self.timestamp_nanos / 1_000) * 1_000,
DateTimePrecision::Nanoseconds => self.timestamp_nanos,
DatePrecision::Seconds => (self.timestamp_micros / 1_000_000) * 1_000_000,
DatePrecision::Milliseconds => (self.timestamp_micros / 1_000) * 1_000,
DatePrecision::Microseconds => self.timestamp_micros,
};
Self {
timestamp_nanos: truncated_timestamp_micros,
timestamp_micros: truncated_timestamp_micros,
}
}
}
impl fmt::Debug for DateTime {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
f.write_str(&utc_rfc3339)
}
}
impl BinarySerializable for DateTime {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> std::io::Result<()> {
let timestamp_micros = self.into_timestamp_micros();
<i64 as BinarySerializable>::serialize(&timestamp_micros, writer)
}
fn deserialize<R: Read>(reader: &mut R) -> std::io::Result<Self> {
let timestamp_micros = <i64 as BinarySerializable>::deserialize(reader)?;
Ok(Self::from_timestamp_micros(timestamp_micros))
}
}
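
Editor's note: a worked sketch of the truncation arithmetic shown above: integer division followed by multiplication drops the sub-unit part of a microsecond timestamp. `truncate_micros` is an illustrative helper mirroring `DateTime::truncate`, not the actual method:

```rust
#[derive(Clone, Copy)]
enum DatePrecision { Seconds, Milliseconds, Microseconds }

fn truncate_micros(timestamp_micros: i64, precision: DatePrecision) -> i64 {
    match precision {
        // Integer division drops the sub-unit part; multiplying restores the scale.
        DatePrecision::Seconds => (timestamp_micros / 1_000_000) * 1_000_000,
        DatePrecision::Milliseconds => (timestamp_micros / 1_000) * 1_000,
        DatePrecision::Microseconds => timestamp_micros,
    }
}

fn main() {
    // 2001-09-09T01:46:40.123456Z expressed as microseconds since the epoch.
    let ts = 1_000_000_000_123_456i64;
    assert_eq!(truncate_micros(ts, DatePrecision::Seconds), 1_000_000_000_000_000);
    assert_eq!(truncate_micros(ts, DatePrecision::Milliseconds), 1_000_000_000_123_000);
    assert_eq!(truncate_micros(ts, DatePrecision::Microseconds), ts);
}
```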

View File

@@ -0,0 +1,63 @@
use std::io::{self, Read, Write};
use crate::BinarySerializable;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum DictionaryKind {
Fst = 1,
SSTable = 2,
}
#[derive(Debug, Clone, PartialEq)]
pub struct DictionaryFooter {
pub kind: DictionaryKind,
pub version: u32,
}
impl DictionaryFooter {
pub fn verify_equal(&self, other: &DictionaryFooter) -> io::Result<()> {
if self.kind != other.kind {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Invalid dictionary type, expected {:?}, found {:?}",
self.kind, other.kind
),
));
}
if self.version != other.version {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unsuported dictionary version, expected {}, found {}",
self.version, other.version
),
));
}
Ok(())
}
}
impl BinarySerializable for DictionaryFooter {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.version.serialize(writer)?;
(self.kind as u32).serialize(writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let version = u32::deserialize(reader)?;
let kind = u32::deserialize(reader)?;
let kind = match kind {
1 => DictionaryKind::Fst,
2 => DictionaryKind::SSTable,
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("invalid dictionary kind: {kind}"),
))
}
};
Ok(DictionaryFooter { kind, version })
}
}
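A short round-trip sketch for the footer, assuming `BinarySerializable` is imported as above and using an arbitrary version number (`Vec<u8>` implements `Write`, `&[u8]` implements `Read`):

fn footer_roundtrip() -> std::io::Result<()> {
    let footer = DictionaryFooter { kind: DictionaryKind::Fst, version: 1 };
    // Serialize into an in-memory buffer.
    let mut buf: Vec<u8> = Vec::new();
    footer.serialize(&mut buf)?;
    // Deserialize back and check that both sides agree on kind and version.
    let decoded = DictionaryFooter::deserialize(&mut &buf[..])?;
    footer.verify_equal(&decoded)
}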

View File

@@ -1,4 +1,3 @@
use std::fs::File;
use std::ops::{Deref, Range, RangeBounds};
use std::sync::Arc;
use std::{fmt, io};
@@ -33,62 +32,6 @@ pub trait FileHandle: 'static + Send + Sync + HasLen + fmt::Debug {
}
}
#[derive(Debug)]
/// A File with its length included.
pub struct WrapFile {
file: File,
len: usize,
}
impl WrapFile {
/// Creates a new WrapFile and stores its length.
pub fn new(file: File) -> io::Result<Self> {
let len = file.metadata()?.len() as usize;
Ok(WrapFile { file, len })
}
}
#[async_trait]
impl FileHandle for WrapFile {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
let file_len = self.len();
// Calculate the actual range to read, ensuring it stays within file boundaries
let start = range.start;
let end = range.end.min(file_len);
// Ensure the start is before the end of the range
if start >= end {
return Err(io::Error::new(io::ErrorKind::InvalidInput, "Invalid range"));
}
let mut buffer = vec![0; end - start];
#[cfg(unix)]
{
use std::os::unix::prelude::FileExt;
self.file.read_exact_at(&mut buffer, start as u64)?;
}
#[cfg(not(unix))]
{
use std::io::{Read, Seek};
let mut file = self.file.try_clone()?; // Clone the file to read from it separately
// Seek to the start position in the file
file.seek(io::SeekFrom::Start(start as u64))?;
// Read the data into the buffer
file.read_exact(&mut buffer)?;
}
Ok(OwnedBytes::new(buffer))
}
// todo implement async
}
impl HasLen for WrapFile {
fn len(&self) -> usize {
self.len
}
}
#[async_trait]
impl FileHandle for &'static [u8] {
fn read_bytes(&self, range: Range<usize>) -> io::Result<OwnedBytes> {
@@ -124,30 +67,6 @@ impl fmt::Debug for FileSlice {
}
}
impl FileSlice {
pub fn stream_file_chunks(&self) -> impl Iterator<Item = io::Result<OwnedBytes>> + '_ {
let len = self.range.end;
let mut start = self.range.start;
std::iter::from_fn(move || {
/// Returns chunks of 1MB of data from the FileHandle.
const CHUNK_SIZE: usize = 1024 * 1024; // 1MB
if start < len {
let end = (start + CHUNK_SIZE).min(len);
let range = start..end;
let chunk = self.data.read_bytes(range);
start += CHUNK_SIZE;
match chunk {
Ok(chunk) => Some(Ok(chunk)),
Err(e) => Some(Err(e)),
}
} else {
None
}
})
}
}
/// Takes a range, a `RangeBounds` object, and returns
/// a `Range` that corresponds to the relative application of the
/// `RangeBounds` object to the original `Range`.

View File

@@ -27,15 +27,15 @@ pub trait GroupByIteratorExtended: Iterator {
where
Self: Sized,
F: FnMut(&Self::Item) -> K,
K: PartialEq + Clone,
Self::Item: Clone,
K: PartialEq + Copy,
Self::Item: Copy,
{
GroupByIterator::new(self, key)
}
}
impl<I: Iterator> GroupByIteratorExtended for I {}
pub struct GroupByIterator<I, F, K: Clone>
pub struct GroupByIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -50,7 +50,7 @@ where
inner: Rc<RefCell<GroupByShared<I, F, K>>>,
}
struct GroupByShared<I, F, K: Clone>
struct GroupByShared<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -63,7 +63,7 @@ impl<I, F, K> GroupByIterator<I, F, K>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
fn new(inner: I, group_by_fn: F) -> Self {
let inner = GroupByShared {
@@ -80,28 +80,28 @@ where
impl<I, F, K> Iterator for GroupByIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
K: Clone,
K: Copy,
{
type Item = (K, GroupIterator<I, F, K>);
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
let value = inner.iter.peek()?.clone();
let value = *inner.iter.peek()?;
let key = (inner.group_by_fn)(&value);
let inner = self.inner.clone();
let group_iter = GroupIterator {
inner,
group_key: key.clone(),
group_key: key,
};
Some((key, group_iter))
}
}
pub struct GroupIterator<I, F, K: Clone>
pub struct GroupIterator<I, F, K: Copy>
where
I: Iterator,
F: FnMut(&I::Item) -> K,
@@ -110,10 +110,10 @@ where
group_key: K,
}
impl<I, F, K: PartialEq + Clone> Iterator for GroupIterator<I, F, K>
impl<I, F, K: PartialEq + Copy> Iterator for GroupIterator<I, F, K>
where
I: Iterator,
I::Item: Clone,
I::Item: Copy,
F: FnMut(&I::Item) -> K,
{
type Item = I::Item;
@@ -121,7 +121,7 @@ where
fn next(&mut self) -> Option<Self::Item> {
let mut inner = self.inner.borrow_mut();
// peek if next value is in group
let peek_val = inner.iter.peek()?.clone();
let peek_val = *inner.iter.peek()?;
if (inner.group_by_fn)(&peek_val) == self.group_key {
inner.iter.next()
} else {

View File

@@ -1,112 +0,0 @@
use crate::replace_in_place;
/// Separates the different segments of a json path.
pub const JSON_PATH_SEGMENT_SEP: u8 = 1u8;
pub const JSON_PATH_SEGMENT_SEP_STR: &str =
unsafe { std::str::from_utf8_unchecked(&[JSON_PATH_SEGMENT_SEP]) };
/// Create a new JsonPathWriter, that creates flattened json paths for tantivy.
#[derive(Clone, Debug, Default)]
pub struct JsonPathWriter {
path: String,
indices: Vec<usize>,
expand_dots: bool,
}
impl JsonPathWriter {
pub fn new() -> Self {
JsonPathWriter {
path: String::new(),
indices: Vec::new(),
expand_dots: false,
}
}
/// When expand_dots is enabled, a json object like
/// `{"k8s.node.id": 5}` is processed as if it were
/// `{"k8s": {"node": {"id": 5}}}`.
/// This option has the merit of allowing users to
/// write queries like `k8s.node.id:5`.
/// On the other hand, enabling this feature can lead to
/// ambiguity.
#[inline]
pub fn set_expand_dots(&mut self, expand_dots: bool) {
self.expand_dots = expand_dots;
}
/// Push a new segment to the path.
#[inline]
pub fn push(&mut self, segment: &str) {
let len_path = self.path.len();
self.indices.push(len_path);
if !self.path.is_empty() {
self.path.push_str(JSON_PATH_SEGMENT_SEP_STR);
}
self.path.push_str(segment);
if self.expand_dots {
// This might include the separation byte, which is ok because it is not a dot.
let appended_segment = &mut self.path[len_path..];
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single-byte utf-8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe {
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment.as_bytes_mut())
};
}
}
/// Remove the last segment. Does nothing if the path is empty.
#[inline]
pub fn pop(&mut self) {
if let Some(last_idx) = self.indices.pop() {
self.path.truncate(last_idx);
}
}
/// Clear the path.
#[inline]
pub fn clear(&mut self) {
self.path.clear();
self.indices.clear();
}
/// Get the current path.
#[inline]
pub fn as_str(&self) -> &str {
&self.path
}
}
impl From<JsonPathWriter> for String {
#[inline]
fn from(value: JsonPathWriter) -> Self {
value.path
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn json_path_writer_test() {
let mut writer = JsonPathWriter::new();
writer.push("root");
assert_eq!(writer.as_str(), "root");
writer.push("child");
assert_eq!(writer.as_str(), "root\u{1}child");
writer.pop();
assert_eq!(writer.as_str(), "root");
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s.node.id");
writer.set_expand_dots(true);
writer.pop();
writer.push("k8s.node.id");
assert_eq!(writer.as_str(), "root\u{1}k8s\u{1}node\u{1}id");
}
}

View File

@@ -7,23 +7,22 @@ pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod byte_count;
mod datetime;
mod dictionary_footer;
pub mod file_slice;
mod group_by;
mod json_path_writer;
mod serialize;
mod vint;
mod writer;
pub use bitset::*;
pub use byte_count::ByteCount;
#[allow(deprecated)]
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision};
pub use datetime::{DatePrecision, DateTime};
pub use dictionary_footer::*;
pub use group_by::GroupByIteratorExtended;
pub use json_path_writer::JsonPathWriter;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
serialize_vint_u32, write_u32_vint, VInt, VIntU128,
};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};
@@ -118,7 +117,6 @@ pub fn u64_to_f64(val: u64) -> f64 {
///
/// This function assumes that the needle is rarely contained in the bytes string
/// and offers a fast path if the needle is not present.
#[inline]
pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
if !bytes.contains(&needle) {
return;

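For illustration, a tiny usage sketch of this helper with arbitrary values, assuming it replaces every occurrence of the needle in place (which is how JsonPathWriter uses it above):

let mut bytes = *b"k8s.node.id";
replace_in_place(b'.', b'-', &mut bytes);
assert_eq!(&bytes, b"k8s-node-id");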
View File

@@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::io::{Read, Write};
use std::{fmt, io};
@@ -250,43 +249,6 @@ impl BinarySerializable for String {
}
}
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
.read_to_string(&mut result)?;
Ok(Cow::Owned(result))
}
}
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
for it in self.iter() {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))
}
}
#[cfg(test)]
pub mod test {

View File

@@ -1,6 +1,8 @@
use std::io;
use std::io::{Read, Write};
use byteorder::{ByteOrder, LittleEndian};
use super::BinarySerializable;
/// Serializes a u128 number as a variable-length int.
@@ -17,6 +19,26 @@ pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
}
}
/// Deserializes a u128 number
///
/// Returns the number and the slice after the vint
pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
let mut result = 0u128;
let mut shift = 0u64;
for i in 0..19 {
let b = data[i];
result |= u128::from(b % 128u8) << shift;
if b >= STOP_BIT {
return Ok((result, &data[i + 1..]));
}
shift += 7;
}
Err(io::Error::new(
io::ErrorKind::InvalidData,
"Failed to deserialize u128 vint",
))
}
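A quick round-trip sketch for the two u128 helpers; note how the deserializer hands back the unread remainder of the slice:

let mut buf = Vec::new();
serialize_vint_u128(300, &mut buf);
buf.extend_from_slice(b"tail");
let (value, rest) = deserialize_vint_u128(&buf).unwrap();
assert_eq!(value, 300);
assert_eq!(rest, &b"tail"[..]);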
/// Wrapper over a `u128` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VIntU128(pub u128);
@@ -58,13 +80,17 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
#[inline]
pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;
const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;
const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
@@ -73,29 +99,25 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
let (res, num_bytes) = if val < START_2 {
(val | STOP_BIT, 1)
} else if val < START_3 {
(
let (res, num_bytes) = match val {
0..=STOP_1 => (val | STOP_BIT, 1),
START_2..=STOP_2 => (
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
)
} else if val < START_4 {
(
),
START_3..=STOP_3 => (
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
)
} else if val < START_5 {
(
),
START_4..=STOP_4 => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
)
} else {
(
),
_ => (
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
@@ -103,9 +125,9 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
)
),
};
*buf = res.to_le_bytes();
LittleEndian::write_u64(&mut buf[..], res);
&buf[0..num_bytes]
}
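The match arms map one-to-one onto the number of bytes written, e.g.:

let mut buf = [0u8; 8];
assert_eq!(serialize_vint_u32(5, &mut buf).len(), 1); // below 2^7
assert_eq!(serialize_vint_u32(300, &mut buf).len(), 2); // below 2^14
assert_eq!(serialize_vint_u32(u32::MAX, &mut buf).len(), 5); // 2^28 and above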
@@ -223,6 +245,7 @@ impl BinarySerializable for VInt {
mod tests {
use super::{serialize_vint_u32, BinarySerializable, VInt};
use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128};
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
@@ -261,7 +284,27 @@ mod tests {
let mut buffer2 = [0u8; 8];
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let res2 = serialize_vint_u32(val, &mut buffer2);
assert_eq!(&buffer[..len_vint], res2, "array wrong for {val}");
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
}
fn aux_test_vint_u128(val: u128) {
let mut data = vec![];
serialize_vint_u128(val, &mut data);
let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
assert_eq!(val, deser_val);
let mut out = vec![];
VIntU128(val).serialize(&mut out).unwrap();
let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap();
assert_eq!(val, deser_val.0);
}
#[test]
fn test_vint_u128() {
aux_test_vint_u128(0);
aux_test_vint_u128(1);
aux_test_vint_u128(u128::MAX / 3);
aux_test_vint_u128(u128::MAX);
}
#[test]

View File

@@ -7,12 +7,17 @@
// ---
use serde_json::{Deserializer, Value};
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
RangeAggregation,
};
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::bucket::RangeAggregationRange;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Create Schema
@@ -132,10 +137,10 @@ fn main() -> tantivy::Result<()> {
let stream = Deserializer::from_str(data).into_iter::<Value>();
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let mut num_indexed = 0;
for value in stream {
let doc = TantivyDocument::parse_json(&schema, &serde_json::to_string(&value.unwrap())?)?;
let doc = schema.parse_document(&serde_json::to_string(&value.unwrap())?)?;
index_writer.add_document(doc)?;
num_indexed += 1;
if num_indexed > 4 {
@@ -189,9 +194,56 @@ fn main() -> tantivy::Result<()> {
let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res2: Value = serde_json::to_value(agg_res)?;
// ### Request Rust API
//
// This is exactly the same request as above, but via the rust structures.
//
let agg_req: Aggregations = vec![(
"group_by_stock".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "stock".to_string(),
ranges: vec![
RangeAggregationRange {
key: Some("few".into()),
from: None,
to: Some(1f64),
},
RangeAggregationRange {
key: Some("some".into()),
from: Some(1f64),
to: Some(10f64),
},
RangeAggregationRange {
key: Some("many".into()),
from: Some(10f64),
to: None,
},
],
..Default::default()
}),
sub_aggregation: vec![(
"average_price".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("price".to_string()),
)),
)]
.into_iter()
.collect(),
})),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
// We use the `AllQuery` which will pass all documents to the AggregationCollector.
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::to_value(agg_res)?;
let res1: Value = serde_json::to_value(agg_res)?;
// ### Aggregation Result
//
@@ -209,7 +261,8 @@ fn main() -> tantivy::Result<()> {
}
"#;
let expected_json: Value = serde_json::from_str(expected_res)?;
assert_eq!(expected_json, res);
assert_eq!(expected_json, res1);
assert_eq!(expected_json, res2);
// ### Request 2
//

View File

@@ -15,7 +15,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -75,7 +75,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -87,7 +87,7 @@ fn main() -> tantivy::Result<()> {
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
old_man_doc.add_text(
body,
@@ -164,7 +164,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher.
@@ -217,23 +217,9 @@ fn main() -> tantivy::Result<()> {
// the document returned will only contain
// a title.
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
// We can also get an explanation to understand
// how a found document got its score.
let query = query_parser.parse_query("title:sea^20 body:whale^70")?;
let (_score, doc_address) = searcher
.search(&query, &TopDocs::with_limit(1))?
.into_iter()
.next()
.unwrap();
let explanation = query.explain(&searcher, doc_address)?;
println!("{}", explanation.to_pretty_json());
Ok(())
}

View File

@@ -13,7 +13,7 @@ use columnar::Column;
use tantivy::collector::{Collector, SegmentCollector};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, TEXT};
use tantivy::{doc, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, Index, Score, SegmentReader};
#[derive(Default)]
struct Stats {
@@ -142,7 +142,7 @@ fn main() -> tantivy::Result<()> {
// this example.
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
product_name => "Super Broom 2000",
product_description => "While it is ok for short distance travel, this broom \

View File

@@ -6,7 +6,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::NgramTokenizer;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());
.register("ngram3", NgramTokenizer::new(3, 3, false));
// To insert document we need an index writer.
// There must be only one writer at a time.
@@ -62,7 +62,7 @@ fn main() -> tantivy::Result<()> {
//
// Here we use a buffer of 50MB per thread. Using a bigger
// memory arena for the indexer can increase its throughput.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and \
@@ -103,8 +103,8 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (_, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -4,8 +4,8 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{DateOptions, Document, OwnedValue, Schema, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::schema::{DateOptions, Schema, Value, INDEXED, STORED, STRING};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast()
.set_precision(tantivy::DateTimePrecision::Seconds);
.set_precision(tantivy::DatePrecision::Seconds);
// Add `occurred_at` date field type
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);
@@ -22,18 +22,16 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// The dates are passed as string in the RFC3339 format
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T12:53:50.53Z",
"event": "pull-request"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"occurred_at": "2022-06-22T13:00:00.22Z",
"event": "comment"
@@ -60,13 +58,13 @@ fn main() -> tantivy::Result<()> {
let count_docs = searcher.search(&*query, &TopDocs::with_limit(4))?;
assert_eq!(count_docs.len(), 1);
for (_score, doc_address) in count_docs {
let retrieved_doc = searcher.doc::<TantivyDocument>(doc_address)?;
let retrieved_doc = searcher.doc(doc_address)?;
assert!(matches!(
retrieved_doc.get_first(occurred_at),
Some(OwnedValue::Date(_))
Some(Value::Date(_))
));
assert_eq!(
retrieved_doc.to_json(&schema),
schema.to_json(&retrieved_doc),
r#"{"event":["comment"],"occurred_at":["2022-06-22T13:00:00.22Z"]}"#
);
}

View File

@@ -11,7 +11,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexReader, IndexWriter};
use tantivy::{doc, Index, IndexReader};
// A simple helper function to fetch a single document
// given its id from our index.
@@ -19,7 +19,7 @@ use tantivy::{doc, Index, IndexReader, IndexWriter};
fn extract_doc_given_isbn(
reader: &IndexReader,
isbn_term: &Term,
) -> tantivy::Result<Option<TantivyDocument>> {
) -> tantivy::Result<Option<Document>> {
let searcher = reader.searcher();
// This is the simplest query you can think of.
@@ -69,10 +69,10 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's add a couple of documents, for the sake of the example.
let mut old_man_doc = TantivyDocument::default();
let mut old_man_doc = Document::default();
old_man_doc.add_text(title, "The Old Man and the Sea");
index_writer.add_document(doc!(
isbn => "978-0099908401",
@@ -94,7 +94,7 @@ fn main() -> tantivy::Result<()> {
// Oops our frankenstein doc seems misspelled
let frankenstein_doc_misspelled = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_doc_misspelled.to_json(&schema),
schema.to_json(&frankenstein_doc_misspelled),
r#"{"isbn":["978-9176370711"],"title":["Frankentein"]}"#,
);
@@ -136,7 +136,7 @@ fn main() -> tantivy::Result<()> {
// No more typo!
let frankenstein_new_doc = extract_doc_given_isbn(&reader, &frankenstein_isbn)?.unwrap();
assert_eq!(
frankenstein_new_doc.to_json(&schema),
schema.to_json(&frankenstein_new_doc),
r#"{"isbn":["978-9176370711"],"title":["Frankenstein"]}"#,
);

View File

@@ -17,7 +17,7 @@
use tantivy::collector::FacetCollector;
use tantivy::query::{AllQuery, TermQuery};
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// Let's create a temporary directory for the sake of this example
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
// For convenience, tantivy also comes with a macro to
// reduce the boilerplate above.

View File

@@ -12,7 +12,7 @@ use std::collections::HashSet;
use tantivy::collector::TopDocs;
use tantivy::query::BooleanQuery;
use tantivy::schema::*;
use tantivy::{doc, DocId, Index, IndexWriter, Score, SegmentReader};
use tantivy::{doc, DocId, Index, Score, SegmentReader};
fn main() -> tantivy::Result<()> {
let mut schema_builder = Schema::builder();
@@ -23,7 +23,7 @@ fn main() -> tantivy::Result<()> {
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer(30_000_000)?;
let mut index_writer = index.writer(30_000_000)?;
index_writer.add_document(doc!(
title => "Fried egg",
@@ -91,10 +91,11 @@ fn main() -> tantivy::Result<()> {
.iter()
.map(|(_, doc_id)| {
searcher
.doc::<TantivyDocument>(*doc_id)
.doc(*doc_id)
.unwrap()
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.as_text()
.unwrap()
.to_owned()
})

View File

@@ -14,7 +14,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::FuzzyTermQuery;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -66,7 +66,7 @@ fn main() -> tantivy::Result<()> {
// Here we give tantivy a budget of `50MB`.
// Using a bigger memory_arena for the indexer may increase
// throughput, but 50 MB is already plenty.
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// Let's index our documents!
// We first need a handle on the title and the body field.
@@ -123,7 +123,7 @@ fn main() -> tantivy::Result<()> {
// will reload the index automatically after each commit.
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
// We now need to acquire a searcher.
@@ -151,10 +151,10 @@ fn main() -> tantivy::Result<()> {
assert_eq!(count, 3);
assert_eq!(top_docs.len(), 3);
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
// Note that the score is not lower for the fuzzy hit.
// There's an issue open for that: https://github.com/quickwit-oss/tantivy/issues/563
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("score {score:?} doc {}", retrieved_doc.to_json(&schema));
println!("score {score:?} doc {}", schema.to_json(&retrieved_doc));
// score 1.0 doc {"title":["The Diary of Muadib"]}
//
// score 1.0 doc {"title":["The Diary of a Young Girl"]}

View File

@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()?
};
println!("committed with opstamp {opstamp}");
println!("committed with opstamp {}", opstamp);
thread::sleep(Duration::from_millis(500));
}

View File

@@ -21,7 +21,7 @@ fn main() -> tantivy::Result<()> {
}"#;
// We can parse our document
let _mice_and_men_doc = TantivyDocument::parse_json(&schema, mice_and_men_doc_json)?;
let _mice_and_men_doc = schema.parse_document(mice_and_men_doc_json)?;
// Multi-valued fields are allowed; they are
// expressed in JSON by an array.
@@ -30,7 +30,7 @@ fn main() -> tantivy::Result<()> {
"title": ["Frankenstein", "The Modern Prometheus"],
"year": 1818
}"#;
let _frankenstein_doc = TantivyDocument::parse_json(&schema, frankenstein_json)?;
let _frankenstein_doc = schema.parse_document(frankenstein_json)?;
// Note that the schema is saved in your index directory.
//

View File

@@ -5,7 +5,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, Result};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -17,7 +17,7 @@ fn main() -> Result<()> {
let index = Index::create_in_ram(schema);
let reader = index.reader()?;
{
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 6_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 6_000_000)?;
for year in 1950u64..2019u64 {
index_writer.add_document(doc!(year_field => year))?;
}

View File

@@ -6,7 +6,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, INDEXED, STORED, STRING};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -22,22 +22,20 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// ### IPv4
// Adding documents that contain an IPv4 address. Notice that the IP addresses are passed as
// `String`. Since the field is of type ip, we parse the IP address from the string and store it
// internally as IPv6.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.33",
"event_type": "login"
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "192.168.0.80",
"event_type": "checkout"
@@ -46,8 +44,7 @@ fn main() -> tantivy::Result<()> {
index_writer.add_document(doc)?;
// ### IPv6
// Adding a document that contains an IPv6 address.
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"ip": "2001:0db8:85a3:0000:0000:8a2e:0370:7334",
"event_type": "checkout"

View File

@@ -10,7 +10,7 @@
// ---
// Importing tantivy...
use tantivy::schema::*;
use tantivy::{doc, DocSet, Index, IndexWriter, Postings, TERMINATED};
use tantivy::{doc, DocSet, Index, Postings, TERMINATED};
fn main() -> tantivy::Result<()> {
// We first create a schema for the sake of the
@@ -24,7 +24,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
// Doc 0: TermFreq 2: [0, 4]
// Doc 2: TermFreq 1: [0]
// ```
println!("Doc {doc_id}: TermFreq {term_freq}: {positions:?}");
println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
doc_id = segment_postings.advance();
}
}
@@ -125,7 +125,7 @@ fn main() -> tantivy::Result<()> {
// Once again, these docs MAY contain deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {docs:?}");
println!("Docs {:?}", docs);
block_segment_postings.advance();
}
}

View File

@@ -7,7 +7,7 @@
use tantivy::collector::{Count, TopDocs};
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, STORED, STRING, TEXT};
use tantivy::{Index, IndexWriter, TantivyDocument};
use tantivy::Index;
fn main() -> tantivy::Result<()> {
// # Defining the schema
@@ -20,9 +20,8 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_ram(schema.clone());
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let doc = TantivyDocument::parse_json(
&schema,
let mut index_writer = index.writer(50_000_000)?;
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:50.53Z",
"event_type": "click",
@@ -34,8 +33,7 @@ fn main() -> tantivy::Result<()> {
}"#,
)?;
index_writer.add_document(doc)?;
let doc = TantivyDocument::parse_json(
&schema,
let doc = schema.parse_document(
r#"{
"timestamp": "2022-02-22T23:20:51.53Z",
"event_type": "click",

View File

@@ -1,83 +0,0 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, IndexWriter, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
let index_path = TempDir::new()?;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
eighty-four days now without taking a fish.",
))?;
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
// Multivalued fields just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
))?;
index_writer.commit()?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.try_into()?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// This will match documents containing the phrase "in the"
// followed by some word starting with "su",
// i.e. it will match "in the sunlight" and "in the success",
// but not "in the Gulf Stream".
let query = query_parser.parse_query("\"in the su\"*")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let title = doc
.get_first(title)
.and_then(|v| v.as_str())
.unwrap()
.to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;
titles.sort_unstable();
assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);
Ok(())
}

View File

@@ -13,12 +13,11 @@ use tantivy::collector::{Count, TopDocs};
use tantivy::query::TermQuery;
use tantivy::schema::*;
use tantivy::tokenizer::{PreTokenizedString, SimpleTokenizer, Token, TokenStream, Tokenizer};
use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut tokenizer = SimpleTokenizer::default();
let mut token_stream = tokenizer.token_stream(text);
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());
@@ -38,7 +37,7 @@ fn main() -> tantivy::Result<()> {
let index = Index::create_in_dir(&index_path, schema.clone())?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// We can create a document manually, by setting the fields
// one by one in a Document object.
@@ -83,7 +82,7 @@ fn main() -> tantivy::Result<()> {
}]
}"#;
let short_man_doc = TantivyDocument::parse_json(&schema, short_man_json)?;
let short_man_doc = schema.parse_document(short_man_json)?;
index_writer.add_document(short_man_doc)?;
@@ -94,7 +93,7 @@ fn main() -> tantivy::Result<()> {
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommitWithDelay)
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let searcher = reader.searcher();
@@ -115,8 +114,8 @@ fn main() -> tantivy::Result<()> {
// Note that the tokens are not stored along with the original text
// in the document store
for (_score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("Document: {}", schema.to_json(&retrieved_doc));
}
// In contrary to the previous query, when we search for the "man" term we

View File

@@ -10,8 +10,7 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::snippet::{Snippet, SnippetGenerator};
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index, Snippet, SnippetGenerator};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
@@ -28,7 +27,7 @@ fn main() -> tantivy::Result<()> {
// # Indexing documents
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
// we'll only need one doc for this example.
index_writer.add_document(doc!(
@@ -55,10 +54,13 @@ fn main() -> tantivy::Result<()> {
let snippet_generator = SnippetGenerator::create(&searcher, &*query, body)?;
for (score, doc_address) in top_docs {
let doc = searcher.doc::<TantivyDocument>(doc_address)?;
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {score}:");
println!("title: {}", doc.get_first(title).unwrap().as_str().unwrap());
println!("Document score {}:", score);
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()
);
println!("snippet: {}", snippet.to_html());
println!("custom highlighting: {}", highlight(snippet));
}

View File

@@ -15,7 +15,7 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::tokenizer::*;
use tantivy::{doc, Index, IndexWriter};
use tantivy::{doc, Index};
fn main() -> tantivy::Result<()> {
// this example assumes you understand the content in `basic_search`
@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
@@ -60,7 +60,7 @@ fn main() -> tantivy::Result<()> {
index.tokenizers().register("stoppy", tokenizer);
let mut index_writer: IndexWriter = index.writer(50_000_000)?;
let mut index_writer = index.writer(50_000_000)?;
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
@@ -105,9 +105,9 @@ fn main() -> tantivy::Result<()> {
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
for (score, doc_address) in top_docs {
let retrieved_doc: TantivyDocument = searcher.doc(doc_address)?;
println!("\n==\nDocument score {score}:");
println!("{}", retrieved_doc.to_json(&schema));
let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {}:", score);
println!("{}", schema.to_json(&retrieved_doc));
}
Ok(())

View File

@@ -6,14 +6,12 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexWriter, Opstamp, Searcher, SearcherGeneration, SegmentId,
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
};
// This example shows how warmers can be used to
// load values from external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
// load values from an external source using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
@@ -25,11 +23,9 @@ pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
type SegmentKey = (SegmentId, Option<Opstamp>);
struct DynamicPriceColumn {
field: String,
price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
@@ -50,6 +46,7 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
@@ -58,40 +55,37 @@ impl Warmer for DynamicPriceColumn {
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let prices: Vec<Price> = (0..segment.max_doc())
.map(|doc| {
if !segment.is_deleted(doc) {
prices.next().unwrap()
} else {
0
}
})
.collect();
let key = (segment.segment_id(), segment.delete_opstamp());
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(prices));
.insert(key, Arc::new(price_vals));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
let live_keys: HashSet<SegmentKey> = live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
.collect();
self.price_cache
.write()
.unwrap()
.retain(|key, _| live_keys.contains(key));
}
}
@@ -106,17 +100,17 @@ pub struct ExternalPriceTable {
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
self.prices.write().unwrap().insert(product_id, price);
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices = self.prices.read().unwrap();
let prices_read = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
@@ -143,14 +137,17 @@ fn main() -> tantivy::Result<()> {
const SNEAKERS: ProductId = 23222;
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_with_num_threads(1, 15_000_000)?;
let mut writer = index.writer_with_num_threads(1, 10_000_000)?;
writer.add_document(doc!(product_id=>OLIVE_OIL, text=>"cooking olive oil from greece"))?;
writer.add_document(doc!(product_id=>GLOVES, text=>"kitchen gloves, perfect for cooking"))?;
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
let reader = index.reader_builder().warmers(warmers).try_into()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
reader.reload()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;

View File

@@ -1,7 +1,7 @@
[package]
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
name = "ownedbytes"
version = "0.6.0"
version = "0.5.0"
edition = "2021"
description = "Expose data as static slice"
license = "MIT"

View File

@@ -1,7 +1,7 @@
use std::convert::TryInto;
use std::ops::Deref;
use std::ops::{Deref, Range};
use std::sync::Arc;
use std::{fmt, io};
use std::{fmt, io, mem};
pub use stable_deref_trait::StableDeref;
@@ -26,8 +26,8 @@ impl OwnedBytes {
data_holder: T,
) -> OwnedBytes {
let box_stable_deref = Arc::new(data_holder);
let bytes: &[u8] = box_stable_deref.deref();
let data = unsafe { &*(bytes as *const [u8]) };
let bytes: &[u8] = box_stable_deref.as_ref();
let data = unsafe { mem::transmute::<_, &'static [u8]>(bytes.deref()) };
OwnedBytes {
data,
box_stable_deref,
@@ -37,7 +37,7 @@ impl OwnedBytes {
/// creates a fileslice that is just a view over a slice of the data.
#[must_use]
#[inline]
pub fn slice(&self, range: impl std::slice::SliceIndex<[u8], Output = [u8]>) -> Self {
pub fn slice(&self, range: Range<usize>) -> Self {
OwnedBytes {
data: &self.data[range],
box_stable_deref: self.box_stable_deref.clone(),
@@ -57,12 +57,6 @@ impl OwnedBytes {
self.data.len()
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.data.is_empty()
}
/// Splits the OwnedBytes into two OwnedBytes `(left, right)`.
///
/// Left will hold `split_len` bytes.
@@ -74,14 +68,13 @@ impl OwnedBytes {
#[inline]
#[must_use]
pub fn split(self, split_len: usize) -> (OwnedBytes, OwnedBytes) {
let (left_data, right_data) = self.data.split_at(split_len);
let right_box_stable_deref = self.box_stable_deref.clone();
let left = OwnedBytes {
data: left_data,
data: &self.data[..split_len],
box_stable_deref: self.box_stable_deref,
};
let right = OwnedBytes {
data: right_data,
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
(left, right)
@@ -106,45 +99,45 @@ impl OwnedBytes {
///
/// `self` is truncated to `split_len`, left with the remaining bytes.
pub fn split_off(&mut self, split_len: usize) -> OwnedBytes {
let (left, right) = self.data.split_at(split_len);
let right_box_stable_deref = self.box_stable_deref.clone();
let right_piece = OwnedBytes {
data: right,
data: &self.data[split_len..],
box_stable_deref: right_box_stable_deref,
};
self.data = left;
self.data = &self.data[..split_len];
right_piece
}
/// Returns true iff this `OwnedBytes` is empty.
#[inline]
pub fn is_empty(&self) -> bool {
self.as_slice().is_empty()
}
/// Drops the left most `advance_len` bytes.
#[inline]
pub fn advance(&mut self, advance_len: usize) -> &[u8] {
let (data, rest) = self.data.split_at(advance_len);
self.data = rest;
data
pub fn advance(&mut self, advance_len: usize) {
self.data = &self.data[advance_len..]
}
/// Reads an `u8` from the `OwnedBytes` and advance by one byte.
#[inline]
pub fn read_u8(&mut self) -> u8 {
self.advance(1)[0]
}
assert!(!self.is_empty());
#[inline]
fn read_n<const N: usize>(&mut self) -> [u8; N] {
self.advance(N).try_into().unwrap()
}
/// Reads an `u32` encoded as little-endian from the `OwnedBytes` and advance by 4 bytes.
#[inline]
pub fn read_u32(&mut self) -> u32 {
u32::from_le_bytes(self.read_n())
let byte = self.as_slice()[0];
self.advance(1);
byte
}
/// Reads an `u64` encoded as little-endian from the `OwnedBytes` and advance by 8 bytes.
#[inline]
pub fn read_u64(&mut self) -> u64 {
u64::from_le_bytes(self.read_n())
assert!(self.len() > 7);
let octlet: [u8; 8] = self.as_slice()[..8].try_into().unwrap();
self.advance(8);
u64::from_le_bytes(octlet)
}
}
@@ -157,7 +150,7 @@ impl fmt::Debug for OwnedBytes {
} else {
self.as_slice()
};
write!(f, "OwnedBytes({bytes_truncated:?}, len={})", self.len())
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
}
}
@@ -198,33 +191,32 @@ impl Deref for OwnedBytes {
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
impl io::Read for OwnedBytes {
#[inline]
fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
let data_len = self.data.len();
let buf_len = buf.len();
if data_len >= buf_len {
let data = self.advance(buf_len);
buf.copy_from_slice(data);
Ok(buf_len)
} else {
buf[..data_len].copy_from_slice(self.data);
self.data = &[];
Ok(data_len)
}
let read_len = {
let data = self.as_slice();
if data.len() >= buf.len() {
let buf_len = buf.len();
buf.copy_from_slice(&data[..buf_len]);
buf.len()
} else {
let data_len = data.len();
buf[..data_len].copy_from_slice(data);
data_len
}
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
fn read_to_end(&mut self, buf: &mut Vec<u8>) -> io::Result<usize> {
buf.extend(self.data);
let read_len = self.data.len();
self.data = &[];
let read_len = {
let data = self.as_slice();
buf.extend(data);
data.len()
};
self.advance(read_len);
Ok(read_len)
}
#[inline]
@@ -240,6 +232,13 @@ impl io::Read for OwnedBytes {
}
}
impl AsRef<[u8]> for OwnedBytes {
#[inline]
fn as_ref(&self) -> &[u8] {
self.as_slice()
}
}
#[cfg(test)]
mod tests {
use std::io::{self, Read};
@@ -250,12 +249,12 @@ mod tests {
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{short_bytes:?}"),
format!("{:?}", short_bytes),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{long_bytes:?}"),
format!("{:?}", long_bytes),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.21.0"
version = "0.19.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,4 +12,6 @@ keywords = ["search", "information", "retrieval"]
edition = "2021"
[dependencies]
nom = "7"
combine = {version="4", default-features=false, features=[] }
once_cell = "1.7.2"
regex ={ version = "1.5.4", default-features = false, features = ["std", "unicode"] }

View File

@@ -1,353 +0,0 @@
//! nom combinators for infallible operations
use std::convert::Infallible;
use nom::{AsChar, IResult, InputLength, InputTakeAtPosition};
pub(crate) type ErrorList = Vec<LenientErrorInternal>;
pub(crate) type JResult<I, O> = IResult<I, (O, ErrorList), Infallible>;
/// An error, with an end-of-string based offset
#[derive(Debug)]
pub(crate) struct LenientErrorInternal {
pub pos: usize,
pub message: String,
}
/// A recoverable error and the position it happened at
#[derive(Debug, PartialEq)]
pub struct LenientError {
pub pos: usize,
pub message: String,
}
impl LenientError {
pub(crate) fn from_internal(internal: LenientErrorInternal, str_len: usize) -> LenientError {
LenientError {
pos: str_len - internal.pos,
message: internal.message,
}
}
}
fn unwrap_infallible<T>(res: Result<T, nom::Err<Infallible>>) -> T {
match res {
Ok(val) => val,
Err(_) => unreachable!(),
}
}
// when rfcs#1733 gets stabilized, this can make things clearer
// trait InfallibleParser<I, O> = nom::Parser<I, (O, ErrorList), std::convert::Infallible>;
/// A variant of the classical `opt` parser, except it returns an infallible error type.
///
/// It's less generic than the original to ease type resolution in the rest of the code.
pub(crate) fn opt_i<I: Clone, O, F>(mut f: F) -> impl FnMut(I) -> JResult<I, Option<O>>
where F: nom::Parser<I, O, nom::error::Error<I>> {
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => Ok((i, (None, Vec::new()))),
}
}
}
pub(crate) fn opt_i_err<'a, I: Clone + InputLength, O, F>(
mut f: F,
message: impl ToString + 'a,
) -> impl FnMut(I) -> JResult<I, Option<O>> + 'a
where
F: nom::Parser<I, O, nom::error::Error<I>> + 'a,
{
move |input: I| {
let i = input.clone();
match f.parse(input) {
Ok((i, o)) => Ok((i, (Some(o), Vec::new()))),
Err(_) => {
let errs = vec![LenientErrorInternal {
pos: i.input_len(),
message: message.to_string(),
}];
Ok((i, (None, errs)))
}
}
}
}
pub(crate) fn space0_infallible<T>(input: T) -> JResult<T, T>
where
T: InputTakeAtPosition + Clone,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space0)(input)
.map(|(left, (spaces, errors))| (left, (spaces.expect("space0 can't fail"), errors)))
}
pub(crate) fn space1_infallible<T>(input: T) -> JResult<T, Option<T>>
where
T: InputTakeAtPosition + Clone + InputLength,
<T as InputTakeAtPosition>::Item: AsChar + Clone,
{
opt_i(nom::character::complete::space1)(input).map(|(left, (spaces, mut errors))| {
if spaces.is_none() {
errors.push(LenientErrorInternal {
pos: left.input_len(),
message: "missing space".to_string(),
})
}
(left, (spaces, errors))
})
}
pub(crate) fn fallible<I, O, E: nom::error::ParseError<I>, F>(
mut f: F,
) -> impl FnMut(I) -> IResult<I, O, E>
where F: nom::Parser<I, (O, ErrorList), Infallible> {
use nom::Err;
move |input: I| match f.parse(input) {
Ok((input, (output, _err))) => Ok((input, output)),
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
}
}
pub(crate) fn delimited_infallible<I, O1, O2, O3, F, G, H>(
mut first: F,
mut second: G,
mut third: H,
) -> impl FnMut(I) -> JResult<I, O2>
where
F: nom::Parser<I, (O1, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
H: nom::Parser<I, (O3, ErrorList), Infallible>,
{
move |input: I| {
let (input, (_, mut err)) = first.parse(input)?;
let (input, (o2, mut err2)) = second.parse(input)?;
err.append(&mut err2);
let (input, (_, mut err3)) = third.parse(input)?;
err.append(&mut err3);
Ok((input, (o2, err)))
}
}
// Parse nothing. Just a lazy way to not implement terminated/preceded and use delimited instead
pub(crate) fn nothing(i: &str) -> JResult<&str, ()> {
Ok((i, ((), Vec::new())))
}
pub(crate) trait TupleInfallible<I, O> {
/// Parses the input and returns a tuple of results of each parser.
fn parse(&mut self, input: I) -> JResult<I, O>;
}
impl<Input, Output, F: nom::Parser<Input, (Output, ErrorList), Infallible>>
TupleInfallible<Input, (Output,)> for (F,)
{
fn parse(&mut self, input: Input) -> JResult<Input, (Output,)> {
self.0.parse(input).map(|(i, (o, e))| (i, ((o,), e)))
}
}
// these macros are heavily copied from nom, with some minor adaptations for our type
macro_rules! tuple_trait(
($name1:ident $ty1:ident, $name2: ident $ty2:ident, $($name:ident $ty:ident),*) => (
tuple_trait!(__impl $name1 $ty1, $name2 $ty2; $($name $ty),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident, $($name2:ident $ty2:ident),*) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait!(__impl $($name $ty),+ , $name1 $ty1; $($name2 $ty2),*);
);
(__impl $($name:ident $ty: ident),+; $name1:ident $ty1:ident) => (
tuple_trait_impl!($($name $ty),+);
tuple_trait_impl!($($name $ty),+, $name1 $ty1);
);
);
macro_rules! tuple_trait_impl(
($($name:ident $ty: ident),+) => (
impl<
Input: Clone, $($ty),+ ,
$($name: nom::Parser<Input, ($ty, ErrorList), Infallible>),+
> TupleInfallible<Input, ( $($ty),+ )> for ( $($name),+ ) {
fn parse(&mut self, input: Input) -> JResult<Input, ( $($ty),+ )> {
let mut error_list = Vec::new();
tuple_trait_inner!(0, self, input, (), error_list, $($name)+)
}
}
);
);
macro_rules! tuple_trait_inner(
($it:tt, $self:expr, $input:expr, (), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
succ!($it, tuple_trait_inner!($self, i, ( o ), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident $($id:ident)+) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
succ!($it, tuple_trait_inner!($self, i, ($($parsed)* , o), $error_list, $($id)+))
});
($it:tt, $self:expr, $input:expr, ($($parsed:tt)*), $error_list:expr, $head:ident) => ({
let (i, (o, mut err)) = $self.$it.parse($input.clone())?;
$error_list.append(&mut err);
Ok((i, (($($parsed)* , o), $error_list)))
});
);
macro_rules! succ (
(0, $submac:ident ! ($($rest:tt)*)) => ($submac!(1, $($rest)*));
(1, $submac:ident ! ($($rest:tt)*)) => ($submac!(2, $($rest)*));
(2, $submac:ident ! ($($rest:tt)*)) => ($submac!(3, $($rest)*));
(3, $submac:ident ! ($($rest:tt)*)) => ($submac!(4, $($rest)*));
(4, $submac:ident ! ($($rest:tt)*)) => ($submac!(5, $($rest)*));
(5, $submac:ident ! ($($rest:tt)*)) => ($submac!(6, $($rest)*));
(6, $submac:ident ! ($($rest:tt)*)) => ($submac!(7, $($rest)*));
(7, $submac:ident ! ($($rest:tt)*)) => ($submac!(8, $($rest)*));
(8, $submac:ident ! ($($rest:tt)*)) => ($submac!(9, $($rest)*));
(9, $submac:ident ! ($($rest:tt)*)) => ($submac!(10, $($rest)*));
(10, $submac:ident ! ($($rest:tt)*)) => ($submac!(11, $($rest)*));
(11, $submac:ident ! ($($rest:tt)*)) => ($submac!(12, $($rest)*));
(12, $submac:ident ! ($($rest:tt)*)) => ($submac!(13, $($rest)*));
(13, $submac:ident ! ($($rest:tt)*)) => ($submac!(14, $($rest)*));
(14, $submac:ident ! ($($rest:tt)*)) => ($submac!(15, $($rest)*));
(15, $submac:ident ! ($($rest:tt)*)) => ($submac!(16, $($rest)*));
(16, $submac:ident ! ($($rest:tt)*)) => ($submac!(17, $($rest)*));
(17, $submac:ident ! ($($rest:tt)*)) => ($submac!(18, $($rest)*));
(18, $submac:ident ! ($($rest:tt)*)) => ($submac!(19, $($rest)*));
(19, $submac:ident ! ($($rest:tt)*)) => ($submac!(20, $($rest)*));
(20, $submac:ident ! ($($rest:tt)*)) => ($submac!(21, $($rest)*));
);
tuple_trait!(FnA A, FnB B, FnC C, FnD D, FnE E, FnF F, FnG G, FnH H, FnI I, FnJ J, FnK K, FnL L,
FnM M, FnN N, FnO O, FnP P, FnQ Q, FnR R, FnS S, FnT T, FnU U);
// Special case: implement `TupleInfallible` for `()`, the unit type.
// This can come up in macros which accept a variable number of arguments.
// Literally, `()` is an empty tuple, so it should simply parse nothing.
impl<I> TupleInfallible<I, ()> for () {
fn parse(&mut self, input: I) -> JResult<I, ()> {
Ok((input, ((), Vec::new())))
}
}
pub(crate) fn tuple_infallible<I, O, List: TupleInfallible<I, O>>(
mut l: List,
) -> impl FnMut(I) -> JResult<I, O> {
move |i: I| l.parse(i)
}
pub(crate) fn separated_list_infallible<I, O, O2, F, G>(
mut sep: G,
mut f: F,
) -> impl FnMut(I) -> JResult<I, Vec<O>>
where
I: Clone + InputLength,
F: nom::Parser<I, (O, ErrorList), Infallible>,
G: nom::Parser<I, (O2, ErrorList), Infallible>,
{
move |i: I| {
let mut res: Vec<O> = Vec::new();
let mut errors: ErrorList = Vec::new();
let (mut i, (o, mut err)) = unwrap_infallible(f.parse(i.clone()));
errors.append(&mut err);
res.push(o);
loop {
let (i_sep_parsed, (_, mut err_sep)) = unwrap_infallible(sep.parse(i.clone()));
let len_before = i_sep_parsed.input_len();
let (i_elem_parsed, (o, mut err_elem)) =
unwrap_infallible(f.parse(i_sep_parsed.clone()));
// infinite loop check: the parser must always consume
// if we consumed nothing here, don't produce an element.
if i_elem_parsed.input_len() == len_before {
return Ok((i, (res, errors)));
}
res.push(o);
errors.append(&mut err_sep);
errors.append(&mut err_elem);
i = i_elem_parsed;
}
}
}
pub(crate) trait Alt<I, O> {
/// Tests each parser in the tuple and returns the result of the first one that succeeds
fn choice(&mut self, input: I) -> Option<JResult<I, O>>;
}
macro_rules! alt_trait(
($first_cond:ident $first:ident, $($id_cond:ident $id: ident),+) => (
alt_trait!(__impl $first_cond $first; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
alt_trait_impl!($($current_cond $current),*);
alt_trait!(__impl $($current_cond $current,)* $head_cond $head; $($id_cond $id),+);
);
(__impl $($current_cond:ident $current:ident),*; $head_cond:ident $head:ident) => (
alt_trait_impl!($($current_cond $current),*);
alt_trait_impl!($($current_cond $current,)* $head_cond $head);
);
);
macro_rules! alt_trait_impl(
($($id_cond:ident $id:ident),+) => (
impl<
Input: Clone, Output,
$(
// () are to make things easier on me, but I'm not entirely sure whether we can do better
// with rule E0207
$id_cond: nom::Parser<Input, (), ()>,
$id: nom::Parser<Input, (Output, ErrorList), Infallible>
),+
> Alt<Input, Output> for ( $(($id_cond, $id),)+ ) {
fn choice(&mut self, input: Input) -> Option<JResult<Input, Output>> {
match self.0.0.parse(input.clone()) {
Err(_) => alt_trait_inner!(1, self, input, $($id_cond $id),+),
Ok((input_left, _)) => Some(self.0.1.parse(input_left)),
}
}
}
);
);
macro_rules! alt_trait_inner(
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident, $($id_cond:ident $id:ident),+) => (
match $self.$it.0.parse($input.clone()) {
Err(_) => succ!($it, alt_trait_inner!($self, $input, $($id_cond $id),+)),
Ok((input_left, _)) => Some($self.$it.1.parse(input_left)),
}
);
($it:tt, $self:expr, $input:expr, $head_cond:ident $head:ident) => (
None
);
);
alt_trait!(A1 A, B1 B, C1 C, D1 D, E1 E, F1 F, G1 G, H1 H, I1 I, J1 J, K1 K,
L1 L, M1 M, N1 N, O1 O, P1 P, Q1 Q, R1 R, S1 S, T1 T, U1 U);
/// An alt()-like combinator. For each branch, it first tries a fallible parser, which either
/// commits to this branch or tells us to check the next branch, and then executes the infallible
/// parser which follows.
///
/// In case no branch matches, the default (infallible) parser is executed.
pub(crate) fn alt_infallible<I: Clone, O, F, List: Alt<I, O>>(
mut l: List,
mut default: F,
) -> impl FnMut(I) -> JResult<I, O>
where
F: nom::Parser<I, (O, ErrorList), Infallible>,
{
move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i))
}
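// Hedged usage sketch (not part of the diff): each branch pairs a fallible
// "condition" parser (committing to its branch) with an infallible parser that
// produces the value; `default` runs when no condition matches. `nothing` is the
// crate-private no-op parser defined above; the sign labels are illustrative.
#[cfg(test)]
#[test]
fn _alt_infallible_sketch() {
    use nom::character::complete::char;
    use nom::combinator::{map, value};

    let mut sign = alt_infallible(
        (
            (value((), char::<_, ()>('+')), map(nothing, |(_, errs)| ("plus", errs))),
            (value((), char::<_, ()>('-')), map(nothing, |(_, errs)| ("minus", errs))),
        ),
        map(nothing, |(_, errs)| ("none", errs)),
    );

    let (rest, (which, errors)) = sign("+foo").expect("infallible");
    assert_eq!((rest, which), ("foo", "plus"));
    assert!(errors.is_empty());

    let (rest, (which, _errors)) = sign("foo").expect("infallible");
    assert_eq!((rest, which), ("foo", "none"));
}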

View File

@@ -1,26 +1,17 @@
#![allow(clippy::derive_partial_eq_without_eq)]
mod infallible;
mod occur;
mod query_grammar;
mod user_input_ast;
use combine::parser::Parser;
pub use crate::infallible::LenientError;
pub use crate::occur::Occur;
use crate::query_grammar::{parse_to_ast, parse_to_ast_lenient};
pub use crate::user_input_ast::{
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
};
use crate::query_grammar::parse_to_ast;
pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
pub struct Error;
/// Parse a query
pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
let (_remaining, user_input_ast) = parse_to_ast(query).map_err(|_| Error)?;
let (user_input_ast, _remaining) = parse_to_ast().parse(query).map_err(|_| Error)?;
Ok(user_input_ast)
}
/// Parse a query, trying to recover from syntax errors, and giving hints toward fixing errors.
pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
parse_to_ast_lenient(query)
}
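// Hedged usage sketch (not part of the diff): the strict parser returns a Result,
// while the lenient parser always yields an AST plus hints describing what it had
// to repair. The query strings are illustrative.
fn _query_parsing_sketch() {
    assert!(parse_query("title:hello").is_ok());

    let (ast, hints) = parse_query_lenient("title:hello AND ");
    println!("ast = {ast:?}, {} hint(s)", hints.len());
}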

File diff suppressed because it is too large

View File

@@ -3,7 +3,7 @@ use std::fmt::{Debug, Formatter};
use crate::Occur;
#[derive(PartialEq, Clone)]
#[derive(PartialEq)]
pub enum UserInputLeaf {
Literal(UserInputLiteral),
All,
@@ -16,38 +16,10 @@ pub enum UserInputLeaf {
field: Option<String>,
elements: Vec<String>,
},
Exists {
field: String,
},
}
impl UserInputLeaf {
pub(crate) fn set_field(self, field: Option<String>) -> Self {
match self {
UserInputLeaf::Literal(mut literal) => {
literal.field_name = field;
UserInputLeaf::Literal(literal)
}
UserInputLeaf::All => UserInputLeaf::All,
UserInputLeaf::Range {
field: _,
lower,
upper,
} => UserInputLeaf::Range {
field,
lower,
upper,
},
UserInputLeaf::Set { field: _, elements } => UserInputLeaf::Set { field, elements },
UserInputLeaf::Exists { field: _ } => UserInputLeaf::Exists {
field: field.expect("Exist query without a field isn't allowed"),
},
}
}
}
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> {
match self {
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
UserInputLeaf::Range {
@@ -56,8 +28,7 @@ impl Debug for UserInputLeaf {
ref upper,
} => {
if let Some(ref field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
write!(formatter, "\"{}\":", field)?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
@@ -66,73 +37,43 @@ impl Debug for UserInputLeaf {
}
UserInputLeaf::Set { field, elements } => {
if let Some(ref field) = field {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\": ")?;
write!(formatter, "\"{}\": ", field)?;
}
write!(formatter, "IN [")?;
for (i, text) in elements.iter().enumerate() {
for (i, element) in elements.iter().enumerate() {
if i != 0 {
write!(formatter, " ")?;
}
// TODO properly escape element
write!(formatter, "\"{text}\"")?;
write!(formatter, "\"{}\"", element)?;
}
write!(formatter, "]")
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
}
}
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum Delimiter {
SingleQuotes,
DoubleQuotes,
None,
}
#[derive(PartialEq, Clone)]
#[derive(PartialEq)]
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub delimiter: Delimiter,
pub slop: u32,
pub prefix: bool,
}
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
if let Some(ref field) = self.field_name {
// TODO properly escape field (in case of \")
write!(formatter, "\"{field}\":")?;
}
match self.delimiter {
Delimiter::SingleQuotes => {
// TODO properly escape element (in case of \')
write!(formatter, "'{}'", self.phrase)?;
}
Delimiter::DoubleQuotes => {
// TODO properly escape element (in case of \")
write!(formatter, "\"{}\"", self.phrase)?;
}
Delimiter::None => {
// TODO properly escape element
write!(formatter, "{}", self.phrase)?;
}
write!(formatter, "\"{}\":", field)?;
}
write!(formatter, "\"{}\"", self.phrase)?;
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
} else if self.prefix {
write!(formatter, "*")?;
}
Ok(())
}
}
#[derive(PartialEq, Debug, Clone)]
#[derive(PartialEq)]
pub enum UserInputBound {
Inclusive(String),
Exclusive(String),
@@ -142,18 +83,16 @@ pub enum UserInputBound {
impl UserInputBound {
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
// TODO properly escape word if required
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{word}\""),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{word}\""),
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
}
}
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
// TODO properly escape word if required
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{word}\"]"),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{word}\"}}"),
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
}
}
@@ -167,7 +106,6 @@ impl UserInputBound {
}
}
#[derive(PartialEq, Clone)]
pub enum UserInputAst {
Clause(Vec<(Option<Occur>, UserInputAst)>),
Leaf(Box<UserInputLeaf>),
@@ -225,9 +163,9 @@ fn print_occur_ast(
formatter: &mut fmt::Formatter,
) -> fmt::Result {
if let Some(occur) = occur_opt {
write!(formatter, "{occur}{ast:?}")?;
write!(formatter, "{}{:?}", occur, ast)?;
} else {
write!(formatter, "*{ast:?}")?;
write!(formatter, "*{:?}", ast)?;
}
Ok(())
}
@@ -237,7 +175,6 @@ impl fmt::Debug for UserInputAst {
match *self {
UserInputAst::Clause(ref subqueries) => {
if subqueries.is_empty() {
// TODO this will break ast reserialization, is writing "( )" enough?
write!(formatter, "<emptyclause>")?;
} else {
write!(formatter, "(")?;
@@ -250,8 +187,8 @@ impl fmt::Debug for UserInputAst {
}
Ok(())
}
UserInputAst::Leaf(ref subquery) => write!(formatter, "{subquery:?}"),
UserInputAst::Boost(ref leaf, boost) => write!(formatter, "({leaf:?})^{boost}"),
UserInputAst::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
UserInputAst::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
}
}
}

View File

@@ -1,36 +1,19 @@
#[cfg(all(test, feature = "unstable"))]
mod bench {
use columnar::Cardinality;
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use rand::{thread_rng, Rng};
use test::{self, Bencher};
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
Optional = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 in 20 documents has a value
Sparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
use super::*;
use crate::aggregation::bucket::{
CustomOrder, HistogramAggregation, HistogramBounds, Order, OrderTarget, TermsAggregation,
};
use crate::aggregation::metric::StatsAggregation;
use crate::query::AllQuery;
use crate::schema::{Schema, TextFieldIndexing, FAST, STRING};
use crate::Index;
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -40,7 +23,6 @@ mod bench {
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
@@ -48,16 +30,13 @@ mod bench {
let score_field_f64 = schema_builder.add_f64_field("score_f64", score_fieldtype.clone());
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = ["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::Optional {
@@ -65,8 +44,6 @@ mod bench {
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
@@ -75,39 +52,22 @@ mod bench {
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => 1.0,
score_field_f64 => 1.0,
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let _val_max = 1_000_000.0;
for _ in 0..doc_with_value {
for _ in 0..1_000_000 {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => val,
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::Sparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
@@ -135,12 +95,6 @@ mod bench {
fn [<$x _multi>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Multivalued)
}
#[bench]
fn [<$x _sparse>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Sparse)
}
}
};
}
@@ -158,10 +112,14 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": { "avg": { "field": "score", } }
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -183,10 +141,14 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "stats": { "field": "score_f64", } }
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score_f64".to_string(),
))),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -208,10 +170,14 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64", } }
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -220,31 +186,6 @@ mod bench {
});
}
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
@@ -258,11 +199,22 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
}))
.unwrap();
let agg_req_1: Aggregations = vec![
(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
),
(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
),
]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -278,10 +230,21 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
}))
.unwrap();
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_few_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req);
@@ -297,42 +260,30 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: sub_agg_req,
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req);
@@ -348,10 +299,21 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
}))
.unwrap();
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req);
@@ -367,10 +329,25 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
}))
.unwrap();
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
target: OrderTarget::Key,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req);
@@ -386,17 +363,29 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..30000f64).into(),
(30000f64..40000f64).into(),
(40000f64..50000f64).into(),
(50000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -412,25 +401,38 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..30000f64).into(),
(30000f64..40000f64).into(),
(40000f64..50000f64).into(),
(50000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req,
}
},
}))
.unwrap();
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -451,10 +453,26 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64,
hard_bounds: Some(HistogramBounds {
min: 1000.0,
max: 300_000.0,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
@@ -469,15 +487,31 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: sub_agg_req,
}
}
}))
.unwrap();
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -493,15 +527,22 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -523,23 +564,43 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
}))
.unwrap();
let sub_agg_req_1: Aggregations = vec![(
"average_in_range".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![
(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
),
(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req_1,
}
.into(),
),
),
]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);

View File

@@ -1,11 +1,12 @@
use std::collections::HashMap;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::atomic::AtomicU64;
use std::sync::Arc;
use common::ByteCount;
use super::collector::DEFAULT_MEMORY_LIMIT;
use super::{AggregationError, DEFAULT_BUCKET_LIMIT};
use crate::TantivyError;
/// An estimate for memory consumption. Non-recursive.
pub trait MemoryConsumption {
@@ -14,8 +15,8 @@ pub trait MemoryConsumption {
impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
fn memory_consumption(&self) -> usize {
let capacity = self.capacity();
(std::mem::size_of::<K>() + std::mem::size_of::<V>() + 1) * capacity
let num_items = self.capacity();
(std::mem::size_of::<K>() + std::mem::size_of::<V>()) * num_items
}
}
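// Hedged sketch (not part of the diff): the estimate is purely capacity-based, so
// a map that has only reserved capacity already counts toward the memory budget.
#[cfg(test)]
#[test]
fn _memory_consumption_is_capacity_based() {
    let map: HashMap<u64, u64> = HashMap::with_capacity(100);
    assert!(map.memory_consumption() >= 100 * (8 + 8));
}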
@@ -60,8 +61,6 @@ impl AggregationLimits {
/// *bucket_limit*
/// Limits the maximum number of buckets returned from an aggregation request.
/// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
///
/// Note: The returned instance contains an Arc-shared counter to track memory consumption.
pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
Self {
memory_consumption: Default::default(),
@@ -69,207 +68,28 @@ impl AggregationLimits {
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
}
}
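// Hedged usage sketch (not part of the diff): the numbers are illustrative, and the
// collector constructor is the one used by the benchmark helper `get_collector`
// earlier in this diff (`AggregationCollector::from_aggs(agg_req, limits)`).
fn _limits_usage_sketch(agg_req: crate::aggregation::agg_req::Aggregations) {
    let limits = AggregationLimits::new(
        Some(500 * 1024 * 1024), // ~500 MB memory budget for this request
        Some(10_000),            // cap the number of buckets in the response
    );
    let _collector = crate::aggregation::AggregationCollector::from_aggs(agg_req, limits);
}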
/// Creates a new ResourceLimitGuard that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard {
// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes
memory_limit: self.memory_limit,
allocated_with_the_guard: 0,
pub(crate) fn validate_memory_consumption(&self) -> crate::Result<()> {
if self.get_memory_consumed() > self.memory_limit {
return Err(TantivyError::AggregationError(
AggregationError::MemoryExceeded {
limit: self.memory_limit,
current: self.get_memory_consumed(),
},
));
}
}
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
Ok(())
}
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) {
self.memory_consumption
.fetch_add(num_bytes, std::sync::atomic::Ordering::Relaxed);
}
/// Returns the estimated memory consumed by the aggregations
pub fn get_memory_consumed(&self) -> ByteCount {
self.memory_consumption
.load(std::sync::atomic::Ordering::Relaxed)
.into()
}
pub(crate) fn get_bucket_limit(&self) -> u32 {
self.bucket_limit
}
}
fn validate_memory_consumption(
memory_consumption: &AtomicU64,
memory_limit: ByteCount,
) -> Result<(), AggregationError> {
// Load the estimated memory consumed by the aggregations
let memory_consumed: ByteCount = memory_consumption.load(Ordering::Relaxed).into();
if memory_consumed > memory_limit {
return Err(AggregationError::MemoryExceeded {
limit: memory_limit,
current: memory_consumed,
});
}
Ok(())
}
pub struct ResourceLimitGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
memory_limit: ByteCount,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}
impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
Ok(())
}
}
impl Drop for ResourceLimitGuard {
/// Removes the memory consumption tracked by this _instance_ of AggregationLimits.
/// This is used to clear the segment-specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_merge() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb", "text2": "bbb" }"#],
vec![r#"{ "text": "aaa", "text2": "bbb" }"#],
];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 1, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 1,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
// https://github.com/quickwit-oss/quickwit/issues/3837
#[test]
fn test_agg_limits_with_empty_data() {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::bucket::tests::get_test_index_from_docs;
let docs = vec![vec![r#"{ "text": "aaa", "text2": "bbb" }"#]];
let index = get_test_index_from_docs(false, &docs).unwrap();
{
// Empty result since there is no doc with dates
let elasticsearch_compatible_json = json!(
{
"1": {
"terms": {"field": "text2", "min_doc_count": 0},
"aggs": {
"2":{
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": "2015-01-01T00:00:00Z",
"max": "2015-01-10T00:00:00Z"
}
}
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request_with_query(agg_req, &index, Some(("text", "bbb"))).unwrap();
let expected_res = json!({
"1": {
"buckets": [
{
"2": {
"buckets": [
{ "doc_count": 0, "key": 1420070400000.0, "key_as_string": "2015-01-01T00:00:00Z" },
{ "doc_count": 0, "key": 1420156800000.0, "key_as_string": "2015-01-02T00:00:00Z" },
{ "doc_count": 0, "key": 1420243200000.0, "key_as_string": "2015-01-03T00:00:00Z" },
{ "doc_count": 0, "key": 1420329600000.0, "key_as_string": "2015-01-04T00:00:00Z" },
{ "doc_count": 0, "key": 1420416000000.0, "key_as_string": "2015-01-05T00:00:00Z" },
{ "doc_count": 0, "key": 1420502400000.0, "key_as_string": "2015-01-06T00:00:00Z" },
{ "doc_count": 0, "key": 1420588800000.0, "key_as_string": "2015-01-07T00:00:00Z" },
{ "doc_count": 0, "key": 1420675200000.0, "key_as_string": "2015-01-08T00:00:00Z" },
{ "doc_count": 0, "key": 1420761600000.0, "key_as_string": "2015-01-09T00:00:00Z" },
{ "doc_count": 0, "key": 1420848000000.0, "key_as_string": "2015-01-10T00:00:00Z" }
]
},
"doc_count": 0,
"key": "bbb"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
});
assert_eq!(res, expected_res);
}
}
}

View File

@@ -9,7 +9,25 @@
//! # Example
//!
//! ```
//! use tantivy::aggregation::agg_req::Aggregations;
//! use tantivy::aggregation::bucket::RangeAggregation;
//! use tantivy::aggregation::agg_req::BucketAggregationType;
//! use tantivy::aggregation::agg_req::{Aggregation, Aggregations};
//! use tantivy::aggregation::agg_req::BucketAggregation;
//! let agg_req1: Aggregations = vec![
//! (
//! "range".to_string(),
//! Aggregation::Bucket(Box::new(BucketAggregation {
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
//! field: "score".to_string(),
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
//! keyed: false,
//! }),
//! sub_aggregation: Default::default(),
//! })),
//! ),
//! ]
//! .into_iter()
//! .collect();
//!
//! let elasticsearch_compatible_json_req = r#"
//! {
@@ -23,78 +41,89 @@
//! }
//! }
//! }"#;
//! let _agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! let agg_req2: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! assert_eq!(agg_req1, agg_req2);
//! ```
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
pub use super::bucket::RangeAggregation;
use super::bucket::{DateHistogramAggregationReq, HistogramAggregation, TermsAggregation};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation,
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
};
use super::VecWithNames;
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
/// defined names. It is also used in bucket aggregations to define sub-aggregations.
/// defined names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
///
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;
/// Aggregation request.
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(try_from = "AggregationForDeserialization")]
pub struct Aggregation {
/// The aggregation variant, which can be either a bucket or a metric.
#[serde(flatten)]
pub agg: AggregationVariants,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
#[serde(rename = "aggs")]
#[serde(skip_serializing_if = "Aggregations::is_empty")]
pub sub_aggregation: Aggregations,
/// Like Aggregations, but optimized to work with the aggregation result
#[derive(Clone, Debug)]
pub(crate) struct AggregationsInternal {
pub(crate) metrics: VecWithNames<MetricAggregation>,
pub(crate) buckets: VecWithNames<BucketAggregationInternal>,
}
/// In order to display a proper error message, we cannot rely on flattening
/// the JSON enum. Instead we introduce an intermediary struct to separate
/// the aggregation from the sub-aggregation.
#[derive(Deserialize)]
struct AggregationForDeserialization {
#[serde(flatten)]
pub aggs_remaining_json: serde_json::Value,
#[serde(rename = "aggs")]
#[serde(default)]
pub sub_aggregation: Aggregations,
}
impl TryFrom<AggregationForDeserialization> for Aggregation {
type Error = serde_json::Error;
fn try_from(value: AggregationForDeserialization) -> serde_json::Result<Self> {
let AggregationForDeserialization {
aggs_remaining_json,
sub_aggregation,
} = value;
let agg: AggregationVariants = serde_json::from_value(aggs_remaining_json)?;
Ok(Aggregation {
agg,
sub_aggregation,
})
impl From<Aggregations> for AggregationsInternal {
fn from(aggs: Aggregations) -> Self {
let mut metrics = vec![];
let mut buckets = vec![];
for (key, agg) in aggs {
match agg {
Aggregation::Bucket(bucket) => buckets.push((
key,
BucketAggregationInternal {
bucket_agg: bucket.bucket_agg,
sub_aggregation: bucket.sub_aggregation.into(),
},
)),
Aggregation::Metric(metric) => metrics.push((key, metric)),
}
}
Self {
metrics: VecWithNames::from_entries(metrics),
buckets: VecWithNames::from_entries(buckets),
}
}
}
impl Aggregation {
pub(crate) fn sub_aggregation(&self) -> &Aggregations {
&self.sub_aggregation
}
#[derive(Clone, Debug)]
// Like BucketAggregation, but optimized to work with the result
pub(crate) struct BucketAggregationInternal {
/// Bucket aggregation strategy to group documents.
pub bucket_agg: BucketAggregationType,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
pub sub_aggregation: AggregationsInternal,
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
fast_field_names.insert(self.agg.get_fast_field_name().to_string());
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
impl BucketAggregationInternal {
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
match &self.bucket_agg {
BucketAggregationType::Range(range) => Some(range),
_ => None,
}
}
pub(crate) fn as_histogram(&self) -> crate::Result<Option<HistogramAggregation>> {
match &self.bucket_agg {
BucketAggregationType::Histogram(histogram) => Ok(Some(histogram.clone())),
BucketAggregationType::DateHistogram(histogram) => {
Ok(Some(histogram.to_histogram_req()?))
}
_ => Ok(None),
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self.bucket_agg {
BucketAggregationType::Terms(terms) => Some(terms),
_ => None,
}
}
}
@@ -107,24 +136,97 @@ pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
fast_field_names
}
/// Aggregation request of [`BucketAggregation`] or [`MetricAggregation`].
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// All aggregation types.
pub enum AggregationVariants {
// Bucket aggregation types
#[serde(untagged)]
pub enum Aggregation {
/// Bucket aggregation, see [`BucketAggregation`] for details.
Bucket(Box<BucketAggregation>),
/// Metric aggregation, see [`MetricAggregation`] for details.
Metric(MetricAggregation),
}
impl Aggregation {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
Aggregation::Metric(metric) => {
fast_field_names.insert(metric.get_fast_field_name().to_string());
}
}
}
}
/// BucketAggregations create buckets of documents. Each bucket is associated with a rule which
/// determines whether or not a document falls into it. In other words, the buckets
/// effectively define document sets. Buckets are not necessarily disjoint, therefore a document can
/// fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
/// compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
/// metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
/// the buckets created by their "parent" bucket aggregation. There are different bucket
/// aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
/// define a fixed number of buckets, and others dynamically create the buckets during the
/// aggregation process.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketAggregation {
/// Bucket aggregation strategy to group documents.
#[serde(flatten)]
pub bucket_agg: BucketAggregationType,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
#[serde(rename = "aggs")]
#[serde(default)]
#[serde(skip_serializing_if = "Aggregations::is_empty")]
pub sub_aggregation: Aggregations,
}
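// Hedged sketch (not part of the diff): a terms bucket with an avg sub-aggregation,
// expressed as the elasticsearch-compatible JSON that this module deserializes via
// serde. The field names ("text_many_terms", "score_f64") follow the benchmarks
// earlier in this diff.
fn _bucket_with_sub_agg_sketch() -> Aggregations {
    serde_json::from_value(serde_json::json!({
        "by_term": {
            "terms": { "field": "text_many_terms" },
            "aggs": {
                "avg_score": { "avg": { "field": "score_f64" } }
            }
        }
    }))
    .expect("valid aggregation request")
}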
impl BucketAggregation {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
let fast_field_name = self.bucket_agg.get_fast_field_name();
fast_field_names.insert(fast_field_name.to_string());
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
}
}
/// The bucket aggregation types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum BucketAggregationType {
/// Put data into buckets of user-defined ranges.
#[serde(rename = "range")]
Range(RangeAggregation),
/// Put data into a histogram.
/// Put data into buckets of user-defined ranges.
#[serde(rename = "histogram")]
Histogram(HistogramAggregation),
/// Put data into a date histogram.
/// Put data into buckets of user-defined ranges.
#[serde(rename = "date_histogram")]
DateHistogram(DateHistogramAggregationReq),
/// Put data into buckets of terms.
#[serde(rename = "terms")]
Terms(TermsAggregation),
}
// Metric aggregation types
impl BucketAggregationType {
fn get_fast_field_name(&self) -> &str {
match self {
BucketAggregationType::Terms(terms) => terms.field.as_str(),
BucketAggregationType::Range(range) => range.field.as_str(),
BucketAggregationType::Histogram(histogram) => histogram.field.as_str(),
BucketAggregationType::DateHistogram(histogram) => histogram.field.as_str(),
}
}
}
/// The aggregations in this family compute metrics based on values extracted
/// from the documents that are being aggregated. Values are extracted from the fast field of
/// the document.
/// Some aggregations output a single numeric metric (e.g. Average) and are called
/// single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
/// called multi-value numeric metrics aggregation.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum MetricAggregation {
/// Computes the average of the extracted values.
#[serde(rename = "avg")]
Average(AverageAggregation),
@@ -144,108 +246,25 @@ pub enum AggregationVariants {
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
/// Computes the percentiles of the extracted values.
#[serde(rename = "percentiles")]
Percentiles(PercentilesAggregationReq),
}
impl AggregationVariants {
/// Returns the name of the field used by the aggregation.
pub fn get_fast_field_name(&self) -> &str {
impl MetricAggregation {
fn get_fast_field_name(&self) -> &str {
match self {
AggregationVariants::Terms(terms) => terms.field.as_str(),
AggregationVariants::Range(range) => range.field.as_str(),
AggregationVariants::Histogram(histogram) => histogram.field.as_str(),
AggregationVariants::DateHistogram(histogram) => histogram.field.as_str(),
AggregationVariants::Average(avg) => avg.field_name(),
AggregationVariants::Count(count) => count.field_name(),
AggregationVariants::Max(max) => max.field_name(),
AggregationVariants::Min(min) => min.field_name(),
AggregationVariants::Stats(stats) => stats.field_name(),
AggregationVariants::Sum(sum) => sum.field_name(),
AggregationVariants::Percentiles(per) => per.field_name(),
}
}
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
match &self {
AggregationVariants::Range(range) => Some(range),
_ => None,
}
}
pub(crate) fn as_histogram(&self) -> crate::Result<Option<HistogramAggregation>> {
match &self {
AggregationVariants::Histogram(histogram) => Ok(Some(histogram.clone())),
AggregationVariants::DateHistogram(histogram) => {
Ok(Some(histogram.to_histogram_req()?))
}
_ => Ok(None),
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self {
AggregationVariants::Terms(terms) => Some(terms),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {
AggregationVariants::Percentiles(percentile_req) => Some(percentile_req),
_ => None,
MetricAggregation::Average(avg) => avg.field_name(),
MetricAggregation::Count(count) => count.field_name(),
MetricAggregation::Max(max) => max.field_name(),
MetricAggregation::Min(min) => min.field_name(),
MetricAggregation::Stats(stats) => stats.field_name(),
MetricAggregation::Sum(sum) => sum.field_name(),
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn deser_json_test() {
let agg_req_json = r#"{
"price_avg": { "avg": { "field": "price" } },
"price_count": { "value_count": { "field": "price" } },
"price_max": { "max": { "field": "price" } },
"price_min": { "min": { "field": "price" } },
"price_stats": { "stats": { "field": "price" } },
"price_sum": { "sum": { "field": "price" } }
}"#;
let _agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
}
#[test]
fn deser_json_test_bucket() {
let agg_req_json = r#"
{
"termagg": {
"terms": {
"field": "json.mixed_type",
"order": { "min_price": "desc" }
},
"aggs": {
"min_price": { "min": { "field": "json.mixed_type" } }
}
},
"rangeagg": {
"range": {
"field": "json.mixed_type",
"ranges": [
{ "to": 3.0 },
{ "from": 19.0, "to": 20.0 },
{ "from": 20.0 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "json.mixed_type" } }
}
}
} "#;
let _agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
}
#[test]
fn test_metric_aggregations_deser() {
let agg_req_json = r#"{
@@ -259,27 +278,46 @@ mod tests {
let agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
assert!(
matches!(&agg_req.get("price_avg").unwrap().agg, AggregationVariants::Average(avg) if avg.field == "price")
matches!(agg_req.get("price_avg").unwrap(), Aggregation::Metric(MetricAggregation::Average(avg)) if avg.field == "price")
);
assert!(
matches!(&agg_req.get("price_count").unwrap().agg, AggregationVariants::Count(count) if count.field == "price")
matches!(agg_req.get("price_count").unwrap(), Aggregation::Metric(MetricAggregation::Count(count)) if count.field == "price")
);
assert!(
matches!(&agg_req.get("price_max").unwrap().agg, AggregationVariants::Max(max) if max.field == "price")
matches!(agg_req.get("price_max").unwrap(), Aggregation::Metric(MetricAggregation::Max(max)) if max.field == "price")
);
assert!(
matches!(&agg_req.get("price_min").unwrap().agg, AggregationVariants::Min(min) if min.field == "price")
matches!(agg_req.get("price_min").unwrap(), Aggregation::Metric(MetricAggregation::Min(min)) if min.field == "price")
);
assert!(
matches!(&agg_req.get("price_stats").unwrap().agg, AggregationVariants::Stats(stats) if stats.field == "price")
matches!(agg_req.get("price_stats").unwrap(), Aggregation::Metric(MetricAggregation::Stats(stats)) if stats.field == "price")
);
assert!(
matches!(&agg_req.get("price_sum").unwrap().agg, AggregationVariants::Sum(sum) if sum.field == "price")
matches!(agg_req.get("price_sum").unwrap(), Aggregation::Metric(MetricAggregation::Sum(sum)) if sum.field == "price")
);
}
#[test]
fn serialize_to_json_test() {
let agg_req1: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
],
keyed: true,
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let elasticsearch_compatible_json_req = r#"{
"range": {
"range": {
@@ -304,56 +342,57 @@ mod tests {
}
}
}"#;
let agg_req1: Aggregations =
{ serde_json::from_str(elasticsearch_compatible_json_req).unwrap() };
let agg_req2: String = serde_json::to_string_pretty(&agg_req1).unwrap();
assert_eq!(agg_req2, elasticsearch_compatible_json_req);
}
#[test]
fn test_get_fast_field_names() {
let range_agg: Aggregation = {
serde_json::from_value(json!({
"range": {
"field": "score",
"ranges": [
{ "to": 3.0 },
{ "from": 3.0, "to": 7.0 },
{ "from": 7.0, "to": 20.0 },
{ "from": 20.0 }
],
}
}))
.unwrap()
};
let agg_req1: Aggregations = {
serde_json::from_value(json!({
"range1": range_agg,
"range2":{
"range": {
"field": "score2",
"ranges": [
{ "to": 3.0 },
{ "from": 3.0, "to": 7.0 },
{ "from": 7.0, "to": 20.0 },
{ "from": 20.0 }
let agg_req2: Aggregations = vec![
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score2".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
],
},
"aggs": {
"metric": {
"avg": {
"field": "field123"
}
}
}
}
}))
.unwrap()
};
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"metric".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("field123".to_string()),
)),
),
]
.into_iter()
.collect();
let agg_req1: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
],
..Default::default()
}),
sub_aggregation: agg_req2,
})),
)]
.into_iter()
.collect();
assert_eq!(
get_fast_field_names(&agg_req1),

View File

@@ -2,8 +2,7 @@
use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
use super::agg_limits::ResourceLimitGuard;
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
@@ -13,263 +12,38 @@ use super::metric::{
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::SegmentReader;
#[derive(Default)]
#[derive(Clone, Default)]
pub(crate) struct AggregationsWithAccessor {
pub aggs: VecWithNames<AggregationWithAccessor>,
pub metrics: VecWithNames<MetricAggregationWithAccessor>,
pub buckets: VecWithNames<BucketAggregationWithAccessor>,
}
impl AggregationsWithAccessor {
fn from_data(aggs: VecWithNames<AggregationWithAccessor>) -> Self {
Self { aggs }
fn from_data(
metrics: VecWithNames<MetricAggregationWithAccessor>,
buckets: VecWithNames<BucketAggregationWithAccessor>,
) -> Self {
Self { metrics, buckets }
}
pub fn is_empty(&self) -> bool {
self.aggs.is_empty()
self.metrics.is_empty() && self.buckets.is_empty()
}
}
pub struct AggregationWithAccessor {
#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
/// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. That is not the case currently, but eventually this needs to be
/// Option or moved.
/// based on search terms. So eventually this needs to be Option or moved.
pub(crate) accessor: Column<u64>,
/// The u64 value to insert for the missing-value use case.
pub(crate) missing_value_for_accessor: Option<u64>,
pub(crate) str_dict_column: Option<StrColumn>,
pub(crate) field_type: ColumnType,
pub(crate) bucket_agg: BucketAggregationType,
pub(crate) sub_aggregation: AggregationsWithAccessor,
pub(crate) limits: ResourceLimitGuard,
pub(crate) limits: AggregationLimits,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// Used for missing term aggregation, which checks all columns for existence.
/// By convention the missing aggregation is chosen when this property is set
/// (instead of being set in `agg`).
/// If this needs to be used by other aggregations, we need to refactor this.
pub(crate) accessors: Vec<Column<u64>>,
pub(crate) agg: Aggregation,
}
impl AggregationWithAccessor {
/// May return multiple accessors if the aggregation is e.g. on mixed field types.
fn try_from_agg(
agg: &Aggregation,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
limits: AggregationLimits,
) -> crate::Result<Vec<AggregationWithAccessor>> {
let add_agg_with_accessor = |accessor: Column<u64>,
column_type: ColumnType,
aggs: &mut Vec<AggregationWithAccessor>|
-> crate::Result<()> {
let res = AggregationWithAccessor {
accessor,
accessors: Vec::new(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
};
aggs.push(res);
Ok(())
};
let mut res: Vec<AggregationWithAccessor> = Vec::new();
use AggregationVariants::*;
match &agg.agg {
Range(RangeAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Histogram(HistogramAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
DateHistogram(DateHistogramAggregationReq {
field: field_name, ..
}) => {
let (accessor, column_type) =
// Only DateTime is supported for DateHistogram
get_ff_reader(reader, field_name, Some(&[ColumnType::DateTime]))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Terms(TermsAggregation {
field: field_name,
missing,
..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
ColumnType::DateTime,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
];
// In case the column is empty we want the shim column to match the missing type
let fallback_type = missing
.as_ref()
.map(|missing| match missing {
Key::Str(_) => ColumnType::Str,
Key::F64(_) => ColumnType::F64,
})
.unwrap_or(ColumnType::U64);
let column_and_types = get_all_ff_reader_or_empty(
reader,
field_name,
Some(&allowed_column_types),
fallback_type,
)?;
let missing_and_more_than_one_col = column_and_types.len() > 1 && missing.is_some();
let text_on_non_text_col = column_and_types.len() == 1
&& column_and_types[0].1.numerical_type().is_some()
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
// Actually we could convert the text to a number and have the fast path, if it is
// provided in Rfc3339 format. But this use case is probably not common
// enough to justify the effort.
let text_on_date_col = column_and_types.len() == 1
&& column_and_types[0].1 == ColumnType::DateTime
&& missing
.as_ref()
.map(|m| matches!(m, Key::Str(_)))
.unwrap_or(false);
let use_special_missing_agg =
missing_and_more_than_one_col || text_on_non_text_col || text_on_date_col;
if use_special_missing_agg {
let column_and_types =
get_all_ff_reader_or_empty(reader, field_name, None, fallback_type)?;
let accessors: Vec<Column> =
column_and_types.iter().map(|(a, _)| a.clone()).collect();
let agg_wit_acc = AggregationWithAccessor {
missing_value_for_accessor: None,
accessor: accessors[0].clone(),
accessors,
field_type: ColumnType::U64,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg_wit_acc);
}
for (accessor, column_type) in column_and_types {
let missing_value_term_agg = if use_special_missing_agg {
None
} else {
missing.clone()
};
let missing_value_for_accessor =
if let Some(missing) = missing_value_term_agg.as_ref() {
get_missing_val(column_type, missing, agg.agg.get_fast_field_name())?
} else {
None
};
let agg = AggregationWithAccessor {
missing_value_for_accessor,
accessor,
accessors: Vec::new(),
field_type: column_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
column_block_accessor: Default::default(),
};
res.push(agg);
}
}
Average(AverageAggregation {
field: field_name, ..
})
| Count(CountAggregation {
field: field_name, ..
})
| Max(MaxAggregation {
field: field_name, ..
})
| Min(MinAggregation {
field: field_name, ..
})
| Stats(StatsAggregation {
field: field_name, ..
})
| Sum(SumAggregation {
field: field_name, ..
}) => {
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
Percentiles(percentiles) => {
let (accessor, column_type) = get_ff_reader(
reader,
percentiles.field_name(),
Some(get_numeric_or_date_column_types()),
)?;
add_agg_with_accessor(accessor, column_type, &mut res)?;
}
};
Ok(res)
}
}
fn get_missing_val(
column_type: ColumnType,
missing: &Key,
field_name: &str,
) -> crate::Result<Option<u64>> {
let missing_val = match missing {
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
// Allow fallback to number on text fields
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::F64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val, &column_type)
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {:?} for field {} is not supported for column type {:?}",
missing, field_name, column_type
)));
}
};
Ok(missing_val)
}
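For orientation, the missing-value handling above is driven by requests like the following minimal sketch (`Aggregations` is the request map from `agg_req`; the field name and missing value are placeholders, not taken from this change):

```rust
// Illustrative request only: a terms aggregation with a `missing` value.
// Depending on the column types found for the field, the surrounding code
// either maps "N/A" to a u64 sentinel for the fast path
// (missing_value_for_accessor) or falls back to the dedicated
// missing-term aggregation.
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "by_category": {
        "terms": { "field": "category", "missing": "N/A" }
    }
}))
.unwrap();
```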
fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
@@ -281,30 +55,131 @@ fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
]
}
pub(crate) fn get_aggs_with_segment_accessor_and_validate(
impl BucketAggregationWithAccessor {
fn try_from_bucket(
bucket: &BucketAggregationType,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
limits: AggregationLimits,
) -> crate::Result<BucketAggregationWithAccessor> {
let mut str_dict_column = None;
let (accessor, field_type) = match &bucket {
BucketAggregationType::Range(RangeAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::Histogram(HistogramAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::DateHistogram(DateHistogramAggregationReq {
field: field_name,
..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
str_dict_column = reader.fast_fields().str(field_name)?;
get_ff_reader_and_validate(reader, field_name, None)?
}
};
let sub_aggregation = sub_aggregation.clone();
Ok(BucketAggregationWithAccessor {
accessor,
field_type,
sub_aggregation: get_aggs_with_accessor_and_validate(
&sub_aggregation,
reader,
&limits.clone(),
)?,
bucket_agg: bucket.clone(),
str_dict_column,
limits,
column_block_accessor: Default::default(),
})
}
}
/// Contains the metric request and the fast field accessor.
#[derive(Clone)]
pub struct MetricAggregationWithAccessor {
pub metric: MetricAggregation,
pub field_type: ColumnType,
pub accessor: Column<u64>,
pub column_block_accessor: ColumnBlockAccessor<u64>,
}
impl MetricAggregationWithAccessor {
fn try_from_metric(
metric: &MetricAggregation,
reader: &SegmentReader,
) -> crate::Result<MetricAggregationWithAccessor> {
match &metric {
MetricAggregation::Average(AverageAggregation { field: field_name })
| MetricAggregation::Count(CountAggregation { field: field_name })
| MetricAggregation::Max(MaxAggregation { field: field_name })
| MetricAggregation::Min(MinAggregation { field: field_name })
| MetricAggregation::Stats(StatsAggregation { field: field_name })
| MetricAggregation::Sum(SumAggregation { field: field_name }) => {
let (accessor, field_type) = get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?;
Ok(MetricAggregationWithAccessor {
accessor,
field_type,
metric: metric.clone(),
column_block_accessor: Default::default(),
})
}
}
}
}
pub(crate) fn get_aggs_with_accessor_and_validate(
aggs: &Aggregations,
reader: &SegmentReader,
limits: &AggregationLimits,
) -> crate::Result<AggregationsWithAccessor> {
let mut aggss = Vec::new();
let mut metrics = vec![];
let mut buckets = vec![];
for (key, agg) in aggs.iter() {
let aggs = AggregationWithAccessor::try_from_agg(
agg,
agg.sub_aggregation(),
reader,
limits.clone(),
)?;
for agg in aggs {
aggss.push((key.to_string(), agg));
match agg {
Aggregation::Bucket(bucket) => buckets.push((
key.to_string(),
BucketAggregationWithAccessor::try_from_bucket(
&bucket.bucket_agg,
&bucket.sub_aggregation,
reader,
limits.clone(),
)?,
)),
Aggregation::Metric(metric) => metrics.push((
key.to_string(),
MetricAggregationWithAccessor::try_from_metric(metric, reader)?,
)),
}
}
Ok(AggregationsWithAccessor::from_data(
VecWithNames::from_entries(aggss),
VecWithNames::from_entries(metrics),
VecWithNames::from_entries(buckets),
))
}
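A rough usage sketch of this entry point, assuming the `get_aggs_with_accessor_and_validate` spelling and that `agg_req: Aggregations` and `segment_reader: SegmentReader` are already in scope:

```rust
// Hypothetical call site: build per-segment fast-field accessors for a parsed
// aggregation request, validating field types and enforcing memory limits.
let limits = AggregationLimits::default();
let aggs_with_accessor =
    get_aggs_with_accessor_and_validate(&agg_req, &segment_reader, &limits)?;
```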
/// Get fast field reader or empty as default.
fn get_ff_reader(
/// Get fast field reader with given cardinality.
fn get_ff_reader_and_validate(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
@@ -320,21 +195,3 @@ fn get_ff_reader(
});
Ok(ff_field_with_type)
}
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
fn get_all_ff_reader_or_empty(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
fallback_type: ColumnType,
) -> crate::Result<Vec<(columnar::Column<u64>, ColumnType)>> {
let ff_fields = reader.fast_fields();
let mut ff_field_with_type =
ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
if ff_field_with_type.is_empty() {
ff_field_with_type.push((Column::build_empty_column(reader.num_docs()), fallback_type));
}
Ok(ff_field_with_type)
}
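A hedged sketch of the guarantee documented above, with `reader: &SegmentReader` in scope and a placeholder field name:

```rust
// Even when the field has no column in this segment, the caller receives a
// single empty shim column tagged with the fallback type, so downstream code
// never needs a separate "no columns" branch.
let columns = get_all_ff_reader_or_empty(reader, "json.color", None, ColumnType::U64)?;
assert!(!columns.is_empty());
```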
View File
@@ -7,9 +7,12 @@
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::agg_req::BucketAggregationInternal;
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats};
use super::{AggregationError, Key};
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
use super::metric::{SingleMetricResult, Stats};
use super::segment_agg_result::AggregationLimits;
use super::Key;
use crate::TantivyError;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
@@ -34,7 +37,8 @@ impl AggregationResults {
} else {
// Validation is done during request parsing, so we can't reach this state.
Err(TantivyError::InternalError(format!(
"Can't find aggregation {name:?} in sub-aggregations"
"Can't find aggregation {:?} in sub-aggregations",
name
)))
}
}
@@ -90,8 +94,6 @@ pub enum MetricResult {
Stats(Stats),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
Percentiles(PercentilesMetricResult),
}
impl MetricResult {
@@ -103,9 +105,30 @@ impl MetricResult {
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
)),
}
}
}
impl From<IntermediateMetricResult> for MetricResult {
fn from(metric: IntermediateMetricResult) -> Self {
match metric {
IntermediateMetricResult::Average(intermediate_avg) => {
MetricResult::Average(intermediate_avg.finalize().into())
}
IntermediateMetricResult::Count(intermediate_count) => {
MetricResult::Count(intermediate_count.finalize().into())
}
IntermediateMetricResult::Max(intermediate_max) => {
MetricResult::Max(intermediate_max.finalize().into())
}
IntermediateMetricResult::Min(intermediate_min) => {
MetricResult::Min(intermediate_min.finalize().into())
}
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
}
}
}
@@ -160,6 +183,14 @@ impl BucketResult {
} => buckets.iter().map(|bucket| bucket.get_bucket_count()).sum(),
}
}
pub(crate) fn empty_from_req(
req: &BucketAggregationInternal,
limits: &AggregationLimits,
) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
empty_bucket.into_final_bucket_result(req, limits)
}
}
/// This is the wrapper of buckets entries, which can be vector or hashmap
View File
@@ -1,23 +1,25 @@
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::bucket::{RangeAggregation, TermsAggregation};
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::metric::AverageAggregation;
use crate::aggregation::segment_agg_result::AggregationLimits;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, IndexWriter, Term};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
serde_json::from_value(json!({
"avg": {
"field": field_name,
}
}))
.unwrap()
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name(field_name.to_string()),
))
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
@@ -109,7 +111,7 @@ fn test_aggregation_flushing(
let searcher = reader.searcher();
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
intermediate_agg_result
.into_final_result(agg_req, &Default::default())
.into_final_bucket_result(agg_req, &Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req);
@@ -196,74 +198,6 @@ fn test_aggregation_flushing_variants() {
test_aggregation_flushing(true, true).unwrap();
}
#[test]
fn test_aggregation_level1_simple() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
let reader = index.reader()?;
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let range_agg = |field_name: &str| -> Aggregation {
serde_json::from_value(json!({
"range": {
"field": field_name,
"ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ]
}
}))
.unwrap()
};
let agg_req_1: Aggregations = vec![
("average".to_string(), get_avg_req("score")),
("range".to_string(), range_agg("score")),
]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(
res["range"]["buckets"],
json!(
[
{
"key": "*-3",
"doc_count": 1,
"to": 3.0
},
{
"key": "3-7",
"doc_count": 2,
"from": 3.0,
"to": 7.0
},
{
"key": "7-20",
"doc_count": 3,
"from": 7.0,
"to": 20.0
},
{
"key": "20-*",
"doc_count": 1,
"from": 20.0
}
])
);
Ok(())
}
#[test]
fn test_aggregation_level1() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
@@ -276,23 +210,43 @@ fn test_aggregation_level1() -> crate::Result<()> {
IndexRecordOption::Basic,
);
let range_agg = |field_name: &str| -> Aggregation {
serde_json::from_value(json!({
"range": {
"field": field_name,
"ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ]
}
}))
.unwrap()
};
let agg_req_1: Aggregations = vec![
("average_i64".to_string(), get_avg_req("score_i64")),
("average_f64".to_string(), get_avg_req("score_f64")),
("average".to_string(), get_avg_req("score")),
("range".to_string(), range_agg("score")),
("rangef64".to_string(), range_agg("score_f64")),
("rangei64".to_string(), range_agg("score_i64")),
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"rangef64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"rangei64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_i64".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
]
.into_iter()
.collect();
@@ -341,6 +295,7 @@ fn test_aggregation_level1() -> crate::Result<()> {
fn test_aggregation_level2(
merge_segments: bool,
use_distributed_collector: bool,
use_elastic_json_req: bool,
) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
@@ -357,7 +312,23 @@ fn test_aggregation_level2(
IndexRecordOption::Basic,
);
let elasticsearch_compatible_json_req = r#"
let sub_agg_req: Aggregations = vec![
("average_in_range".to_string(), get_avg_req("score")),
(
"term_agg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
]
.into_iter()
.collect();
let agg_req: Aggregations = if use_elastic_json_req {
let elasticsearch_compatible_json_req = r#"
{
"rangef64": {
"range": {
@@ -412,7 +383,61 @@ fn test_aggregation_level2(
}
}
"#;
let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
let value: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
value
} else {
let agg_req: Aggregations = vec![
("average".to_string(), get_avg_req("score")),
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req.clone(),
})),
),
(
"rangef64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req.clone(),
})),
),
(
"rangei64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_i64".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req,
})),
),
]
.into_iter()
.collect();
agg_req
};
let agg_res: AggregationResults = if use_distributed_collector {
let collector =
@@ -420,7 +445,10 @@ fn test_aggregation_level2(
let searcher = reader.searcher();
let res = searcher.search(&term_query, &collector).unwrap();
res.into_final_result(agg_req.clone(), &Default::default())
// Test de/serialization roundtrip on intermediate_agg_result
let res: IntermediateAggregationResults =
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
res.into_final_bucket_result(agg_req.clone(), &Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req.clone());
@@ -490,22 +518,42 @@ fn test_aggregation_level2(
#[test]
fn test_aggregation_level2_multi_segments() -> crate::Result<()> {
test_aggregation_level2(false, false)
test_aggregation_level2(false, false, false)
}
#[test]
fn test_aggregation_level2_single_segment() -> crate::Result<()> {
test_aggregation_level2(true, false)
test_aggregation_level2(true, false, false)
}
#[test]
fn test_aggregation_level2_multi_segments_distributed_collector() -> crate::Result<()> {
test_aggregation_level2(false, true)
test_aggregation_level2(false, true, false)
}
#[test]
fn test_aggregation_level2_single_segment_distributed_collector() -> crate::Result<()> {
test_aggregation_level2(true, true)
test_aggregation_level2(true, true, false)
}
#[test]
fn test_aggregation_level2_multi_segments_use_json() -> crate::Result<()> {
test_aggregation_level2(false, false, true)
}
#[test]
fn test_aggregation_level2_single_segment_use_json() -> crate::Result<()> {
test_aggregation_level2(true, false, true)
}
#[test]
fn test_aggregation_level2_multi_segments_distributed_collector_use_json() -> crate::Result<()> {
test_aggregation_level2(false, true, true)
}
#[test]
fn test_aggregation_level2_single_segment_distributed_collector_use_json() -> crate::Result<()> {
test_aggregation_level2(true, true, true)
}
#[test]
@@ -515,14 +563,14 @@ fn test_aggregation_invalid_requests() -> crate::Result<()> {
let reader = index.reader()?;
let avg_on_field = |field_name: &str| {
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": {
"avg": {
"field": field_name,
},
}
}))
.unwrap();
let agg_req_1: Aggregations = vec![(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name(field_name.to_string()),
)),
)]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
@@ -533,36 +581,10 @@ fn test_aggregation_invalid_requests() -> crate::Result<()> {
let agg_res = avg_on_field("dummy_text").unwrap_err();
assert_eq!(
format!("{agg_res:?}"),
format!("{:?}", agg_res),
r#"InvalidArgument("Field \"dummy_text\" is not configured as fast field")"#
);
let agg_req_1: Result<Aggregations, serde_json::Error> = serde_json::from_value(json!({
"average": {
"avg": {
"fieldd": "a",
},
}
}));
assert_eq!(agg_req_1.is_err(), true);
assert_eq!(agg_req_1.unwrap_err().to_string(), "missing field `field`");
let agg_req_1: Result<Aggregations, serde_json::Error> = serde_json::from_value(json!({
"average": {
"doesnotmatchanyagg": {
"field": "a",
},
}
}));
assert_eq!(agg_req_1.is_err(), true);
// TODO: This should list valid values
assert!(agg_req_1
.unwrap_err()
.to_string()
.contains("unknown variant `doesnotmatchanyagg`, expected one of"));
// TODO: This should return an error
// let agg_res = avg_on_field("not_exist_field").unwrap_err();
// assert_eq!(
@@ -586,7 +608,7 @@ fn test_aggregation_on_json_object() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"color": "red"})))
.unwrap();
@@ -596,16 +618,18 @@ fn test_aggregation_on_json_object() {
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg": {
"terms": {
"field": "json.color",
}
}
}))
.unwrap();
let agg: Aggregations = vec![(
"jsonagg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "json.color".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
@@ -630,7 +654,7 @@ fn test_aggregation_on_json_object_empty_columns() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Empty column when accessing color
index_writer
.add_document(doc!(json => json!({"price": 10.0})))
@@ -663,15 +687,18 @@ fn test_aggregation_on_json_object_empty_columns() {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg": {
"terms": {
"field": "json.color",
}
}
}))
.unwrap();
let agg: Aggregations = vec![(
"jsonagg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "json.color".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
@@ -748,7 +775,7 @@ fn test_aggregation_on_json_object_mixed_types() {
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let mut index_writer = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
@@ -826,9 +853,10 @@ fn test_aggregation_on_json_object_mixed_types() {
"buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
// TODO red is missing since there is no multi aggregation within one
// segment for multiple types
// TODO bool is also not yet handled in aggregation
{ "doc_count": 1, "key": "blue", "min_price": { "value": null } },
{ "doc_count": 1, "key": "red", "min_price": { "value": null } },
{ "doc_count": 1, "key": "blue", "min_price": { "value": null } }
],
"sum_other_doc_count": 0
}
View File
@@ -37,10 +37,10 @@ pub struct DateHistogramAggregationReq {
interval: Option<String>,
#[doc(hidden)]
/// Only for validation
calendar_interval: Option<String>,
date_interval: Option<String>,
/// The field to aggregate on.
pub field: String,
/// The format to format dates. Unsupported currently.
/// The format to format dates.
pub format: Option<String>,
/// The interval to chunk your data range. Each bucket spans a value range of
/// [0..fixed_interval). Accepted values
@@ -67,13 +67,6 @@ pub struct DateHistogramAggregationReq {
pub fixed_interval: Option<String>,
/// The interval implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
/// 1))`.
///
/// Offset makes it possible to shift this grid into
/// `[offset + interval * k, offset + interval * (k + 1))`. Offset has to be in the range [0,
/// interval).
///
/// The `offset` parameter has the same syntax as the `fixed_interval` parameter, but
/// also allows for negative values.
pub offset: Option<String>,
/// The minimum number of documents in a bucket to be returned. Defaults to 0.
pub min_doc_count: Option<u64>,
@@ -84,7 +77,7 @@ pub struct DateHistogramAggregationReq {
/// hard_bounds only limits the buckets; to force a range, set both extended_bounds and
/// hard_bounds to the same range.
///
/// Needs to be provided as a timestamp in millisecond precision.
/// Needs to be provided as a timestamp in microsecond precision.
///
/// ## Example
/// ```json
@@ -95,7 +88,7 @@ pub struct DateHistogramAggregationReq {
/// "interval": "1d",
/// "hard_bounds": {
/// "min": 0,
/// "max": 1420502400000
/// "max": 1420502400000000
/// }
/// }
/// }
@@ -121,32 +114,33 @@ impl DateHistogramAggregationReq {
self.validate()?;
Ok(HistogramAggregation {
field: self.field.to_string(),
interval: parse_into_milliseconds(self.fixed_interval.as_ref().unwrap())? as f64,
interval: parse_into_microseconds(self.fixed_interval.as_ref().unwrap())? as f64,
offset: self
.offset
.as_ref()
.map(|offset| parse_offset_into_milliseconds(offset))
.map(|offset| parse_offset_into_microseconds(offset))
.transpose()?
.map(|el| el as f64),
min_doc_count: self.min_doc_count,
hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds,
hard_bounds: None,
extended_bounds: None,
keyed: self.keyed,
is_normalized_to_ns: false,
})
}
fn validate(&self) -> crate::Result<()> {
if let Some(interval) = self.interval.as_ref() {
return Err(crate::TantivyError::InvalidArgument(format!(
"`interval` parameter {interval:?} in date histogram is unsupported, only \
`fixed_interval` is supported"
"`interval` parameter {:?} in date histogram is unsupported, only \
`fixed_interval` is supported",
interval
)));
}
if let Some(interval) = self.calendar_interval.as_ref() {
if let Some(interval) = self.date_interval.as_ref() {
return Err(crate::TantivyError::InvalidArgument(format!(
"`calendar_interval` parameter {interval:?} in date histogram is unsupported, \
only `fixed_interval` is supported"
"`date_interval` parameter {:?} in date histogram is unsupported, only \
`fixed_interval` is supported",
interval
)));
}
if self.format.is_some() {
@@ -161,7 +155,7 @@ impl DateHistogramAggregationReq {
));
}
parse_into_milliseconds(self.fixed_interval.as_ref().unwrap())?;
parse_into_microseconds(self.fixed_interval.as_ref().unwrap())?;
Ok(())
}
@@ -182,12 +176,9 @@ pub enum DateHistogramParseError {
/// Offset invalid
#[error("passed offset is invalid {0:?}")]
InvalidOffset(String),
/// Value out of bounds
#[error("passed value is out of bounds: {0:?}")]
OutOfBounds(String),
}
fn parse_offset_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
fn parse_offset_into_microseconds(input: &str) -> Result<i64, AggregationError> {
let is_sign = |byte| &[byte] == b"-" || &[byte] == b"+";
if input.is_empty() {
return Err(DateHistogramParseError::InvalidOffset(input.to_string()).into());
@@ -196,18 +187,18 @@ fn parse_offset_into_milliseconds(input: &str) -> Result<i64, AggregationError>
let has_sign = is_sign(input.as_bytes()[0]);
if has_sign {
let (sign, input) = input.split_at(1);
let val = parse_into_milliseconds(input)?;
let val = parse_into_microseconds(input)?;
if sign == "-" {
Ok(-val)
} else {
Ok(val)
}
} else {
parse_into_milliseconds(input)
parse_into_microseconds(input)
}
}
fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
fn parse_into_microseconds(input: &str) -> Result<i64, AggregationError> {
let split_boundary = input
.as_bytes()
.iter()
@@ -226,79 +217,73 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
// here and being defensive does not hurt.
.map_err(|_err| DateHistogramParseError::NumberMissing(input.to_string()))?;
let unit_in_ms = match unit {
"ms" | "milliseconds" => 1,
"s" | "seconds" => 1000,
"m" | "minutes" => 60 * 1000,
"h" | "hours" => 60 * 60 * 1000,
"d" | "days" => 24 * 60 * 60 * 1000,
let multiplier_from_unit = match unit {
"ms" => 1,
"s" => 1000,
"m" => 60 * 1000,
"h" => 60 * 60 * 1000,
"d" => 24 * 60 * 60 * 1000,
_ => return Err(DateHistogramParseError::UnitNotRecognized(unit.to_string()).into()),
};
let val = number * unit_in_ms;
// The field type is in nanosecond precision, so validate that the value fits the range
val.checked_mul(1_000_000)
.ok_or_else(|| DateHistogramParseError::OutOfBounds(input.to_string()))?;
Ok(val)
Ok(number * multiplier_from_unit * 1000)
}
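As a quick check on the unit arithmetic above (a sketch against the millisecond-based variant; the microsecond variant scales the same results by 1000):

```rust
// "30d" splits into number = 30 and unit = "d":
// 30 * 24 * 60 * 60 * 1000 = 2_592_000_000 ms.
assert_eq!(parse_into_milliseconds("30d").unwrap(), 2_592_000_000);
// A signed offset reuses the same parsing and only flips the sign.
assert_eq!(parse_offset_into_milliseconds("-4d").unwrap(), -345_600_000);
```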
#[cfg(test)]
pub mod tests {
mod tests {
use pretty_assertions::assert_eq;
use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request;
use crate::indexer::NoMergePolicy;
use crate::schema::{Schema, FAST, STRING};
use crate::{Index, IndexWriter, TantivyDocument};
use crate::schema::{Schema, FAST};
use crate::Index;
#[test]
fn test_parse_into_millisecs() {
assert_eq!(parse_into_milliseconds("1m").unwrap(), 60_000);
assert_eq!(parse_into_milliseconds("2m").unwrap(), 120_000);
assert_eq!(parse_into_milliseconds("2minutes").unwrap(), 120_000);
fn test_parse_into_microseconds() {
assert_eq!(parse_into_microseconds("1m").unwrap(), 60_000_000);
assert_eq!(parse_into_microseconds("2m").unwrap(), 120_000_000);
assert_eq!(
parse_into_milliseconds("2y").unwrap_err(),
parse_into_microseconds("2y").unwrap_err(),
DateHistogramParseError::UnitNotRecognized("y".to_string()).into()
);
assert_eq!(
parse_into_milliseconds("2000").unwrap_err(),
parse_into_microseconds("2000").unwrap_err(),
DateHistogramParseError::UnitMissing("2000".to_string()).into()
);
assert_eq!(
parse_into_milliseconds("ms").unwrap_err(),
parse_into_microseconds("ms").unwrap_err(),
DateHistogramParseError::NumberMissing("ms".to_string()).into()
);
}
#[test]
fn test_parse_offset_into_milliseconds() {
assert_eq!(parse_offset_into_milliseconds("1m").unwrap(), 60_000);
assert_eq!(parse_offset_into_milliseconds("+1m").unwrap(), 60_000);
assert_eq!(parse_offset_into_milliseconds("-1m").unwrap(), -60_000);
assert_eq!(parse_offset_into_milliseconds("2m").unwrap(), 120_000);
assert_eq!(parse_offset_into_milliseconds("+2m").unwrap(), 120_000);
assert_eq!(parse_offset_into_milliseconds("-2m").unwrap(), -120_000);
assert_eq!(parse_offset_into_milliseconds("-2ms").unwrap(), -2);
fn test_parse_offset_into_microseconds() {
assert_eq!(parse_offset_into_microseconds("1m").unwrap(), 60_000_000);
assert_eq!(parse_offset_into_microseconds("+1m").unwrap(), 60_000_000);
assert_eq!(parse_offset_into_microseconds("-1m").unwrap(), -60_000_000);
assert_eq!(parse_offset_into_microseconds("2m").unwrap(), 120_000_000);
assert_eq!(parse_offset_into_microseconds("+2m").unwrap(), 120_000_000);
assert_eq!(parse_offset_into_microseconds("-2m").unwrap(), -120_000_000);
assert_eq!(parse_offset_into_microseconds("-2ms").unwrap(), -2_000);
assert_eq!(
parse_offset_into_milliseconds("2y").unwrap_err(),
parse_offset_into_microseconds("2y").unwrap_err(),
DateHistogramParseError::UnitNotRecognized("y".to_string()).into()
);
assert_eq!(
parse_offset_into_milliseconds("2000").unwrap_err(),
parse_offset_into_microseconds("2000").unwrap_err(),
DateHistogramParseError::UnitMissing("2000".to_string()).into()
);
assert_eq!(
parse_offset_into_milliseconds("ms").unwrap_err(),
parse_offset_into_microseconds("ms").unwrap_err(),
DateHistogramParseError::NumberMissing("ms".to_string()).into()
);
}
#[test]
fn test_parse_into_milliseconds_do_not_accept_non_ascii() {
assert!(parse_into_milliseconds("m").is_err());
assert!(parse_into_microseconds("m").is_err());
}
pub fn get_test_index_from_docs(
@@ -307,8 +292,7 @@ pub mod tests {
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
schema_builder.add_date_field("date", FAST);
schema_builder.add_text_field("text", FAST | STRING);
schema_builder.add_text_field("text2", FAST | STRING);
schema_builder.add_text_field("text", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema.clone());
{
@@ -316,7 +300,7 @@ pub mod tests {
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_docs {
for doc_str in values {
let doc = TantivyDocument::parse_json(&schema, doc_str)?;
let doc = schema.parse_document(doc_str)?;
index_writer.add_document(doc)?;
}
// writing the segment
@@ -328,7 +312,7 @@ pub mod tests {
.searchable_segment_ids()
.expect("Searchable segments failed.");
if segment_ids.len() > 1 {
let mut index_writer: IndexWriter = index.writer_for_tests()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
@@ -338,316 +322,168 @@ pub mod tests {
}
#[test]
fn histogram_test_date_force_merge_segments() {
fn histogram_test_date_force_merge_segments() -> crate::Result<()> {
histogram_test_date_merge_segments(true)
}
#[test]
fn histogram_test_date() {
fn histogram_test_date() -> crate::Result<()> {
histogram_test_date_merge_segments(false)
}
fn histogram_test_date_merge_segments(merge_segments: bool) {
fn histogram_test_date_merge_segments(merge_segments: bool) -> crate::Result<()> {
let docs = vec![
vec![r#"{ "date": "2015-01-01T12:10:30Z", "text": "aaa" }"#],
vec![r#"{ "date": "2015-01-01T11:11:30Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-06T00:00:00Z", "text": "ccc" }"#],
];
let index = get_test_index_from_docs(merge_segments, &docs).unwrap();
{
// 30day + offset
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
}
let index = get_test_index_from_docs(merge_segments, &docs)?;
// 30day + offset
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
}
}
);
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0,
"doc_count" : 4
}
]
}
});
assert_eq!(res, expected_res);
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000000.0,
"doc_count" : 4
}
]
}
});
assert_eq!(res, expected_res);
{
// 30day + offset + sub_agg
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
},
"aggs": {
"texts": {
"terms": {"field": "text"}
}
}
// 30day + offset + sub_agg
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
},
"aggs": {
"texts": {
"terms": {"field": "text"}
}
}
);
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
println!("{}", serde_json::to_string_pretty(&res).unwrap());
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000000.0,
"doc_count" : 4,
"texts": {
"buckets": [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0,
"doc_count" : 4,
"texts": {
"buckets": [
{
"doc_count": 2,
"key": "bbb"
},
{
"doc_count": 1,
"key": "ccc"
},
{
"doc_count": 1,
"key": "aaa"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d"
}
"doc_count": 2,
"key": "bbb"
},
{
"doc_count": 1,
"key": "ccc"
},
{
"doc_count": 1,
"key": "aaa"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!( {
"sales_over_time": {
"buckets": [
{
"doc_count": 2,
"key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
]
}
});
assert_eq!(res, expected_res);
{
// 1day + extended_bounds
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": 1419984000000.0,
"max": 1420588800000.0
}
}
}
// 1day
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d"
}
);
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 0,
"key": 1419984000000.0,
"key_as_string": "2014-12-31T00:00:00Z"
},
{
"doc_count": 2,
"key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420588800000.0,
"key_as_string": "2015-01-07T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day + hard_bounds + extended_bounds
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"hard_bounds": {
"min": 1420156800000.0,
"max": 1420243200000.0
}
}
}
}
);
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
let expected_res = json!( {
"sales_over_time": {
"buckets": [
{
"doc_count": 2,
"key": 1420070400000000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000000.0,
"key_as_string": "2015-01-06T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day + hard_bounds as Rfc3339
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"hard_bounds": {
"min": "2015-01-02T00:00:00Z",
"max": "2015-01-02T12:00:00Z"
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
Ok(())
}
#[test]
fn histogram_test_invalid_req() {
fn histogram_test_invalid_req() -> crate::Result<()> {
let docs = vec![];
let index = get_test_index_from_docs(false, &docs).unwrap();
let index = get_test_index_from_docs(false, &docs)?;
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
@@ -668,5 +504,7 @@ pub mod tests {
err.to_string(),
r#"An invalid argument was passed: '`interval` parameter "30d" in date histogram is unsupported, only `fixed_interval` is supported'"#
);
Ok(())
}
}
File diff suppressed because it is too large
View File
@@ -1,39 +1,24 @@
//! Module for all bucket aggregations.
//!
//! BucketAggregations create buckets of documents.
//! Each bucket is associated with a rule which
//! determines whether or not a document falls into it. In other words, the buckets
//! effectively define document sets. Buckets are not necessarily disjoint, therefore a document can
//! fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
//! compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
//! metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
//! the buckets created by their "parent" bucket aggregation. There are different bucket
//! aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
//! define a fixed number of multiple buckets, and others dynamically create the buckets during the
//! aggregation process.
//! BucketAggregations create buckets of documents
//! [`BucketAggregation`](super::agg_req::BucketAggregation).
//!
//! Results of final buckets are [`BucketResult`](super::agg_result::BucketResult).
//! Results of intermediate buckets are
//! [`IntermediateBucketResult`](super::intermediate_agg_result::IntermediateBucketResult)
//!
//! ## Supported Bucket Aggregations
//! - [Histogram](HistogramAggregation)
//! - [DateHistogram](DateHistogramAggregationReq)
//! - [Range](RangeAggregation)
//! - [Terms](TermsAggregation)
mod histogram;
mod range;
mod term_agg;
mod term_missing_agg;
use std::collections::HashMap;
pub(crate) use histogram::SegmentHistogramCollector;
pub use histogram::*;
pub(crate) use range::SegmentRangeCollector;
pub use range::*;
use serde::{de, Deserialize, Deserializer, Serialize, Serializer};
pub use term_agg::*;
pub use term_missing_agg::*;
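To make the module description above concrete, here is a minimal sketch of a bucket aggregation carrying a metric sub-aggregation, expressed as the Elasticsearch-compatible JSON that the request types deserialize from (field names are placeholders):

```rust
// A terms bucket aggregation over "category", computing an average of "score"
// inside each bucket. Each bucket reports its doc_count plus the sub-aggregation.
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "by_category": {
        "terms": { "field": "category" },
        "aggs": {
            "avg_score": { "avg": { "field": "score" } }
        }
    }
}))
.unwrap();
```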
/// Order for buckets in a bucket aggregation.
#[derive(Clone, Copy, Debug, PartialEq, Serialize, Deserialize, Default)]
View File
@@ -5,17 +5,16 @@ use columnar::{ColumnType, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_limits::ResourceLimitGuard;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
IntermediateAggregationResults, IntermediateBucketResult, IntermediateRangeBucketEntry,
IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
};
use crate::aggregation::{
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey, VecWithNames,
};
use crate::TantivyError;
@@ -158,18 +157,16 @@ impl SegmentRangeBucketEntry {
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateRangeBucketEntry> {
let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation
.add_intermediate_aggregation_result(agg_with_accessor, &mut sub_aggregation_res)?
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry {
key: self.key.into(),
key: self.key,
doc_count: self.doc_count,
sub_aggregation: sub_aggregation_res,
sub_aggregation,
from: self.from,
to: self.to,
})
@@ -177,14 +174,13 @@ impl SegmentRangeBucketEntry {
}
impl SegmentAggregationCollector for SegmentRangeCollector {
fn add_intermediate_aggregation_result(
fn into_intermediate_aggregations_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
) -> crate::Result<IntermediateAggregationResults> {
let field_type = self.column_type;
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let sub_agg = &agg_with_accessor.aggs.values[self.accessor_idx].sub_aggregation;
let name = agg_with_accessor.buckets.keys[self.accessor_idx].to_string();
let sub_agg = &agg_with_accessor.buckets.values[self.accessor_idx].sub_aggregation;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
@@ -204,9 +200,12 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
column_type: Some(self.column_type),
});
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
let buckets = Some(VecWithNames::from_entries(vec![(name, bucket)]));
Ok(())
Ok(IntermediateAggregationResults {
metrics: None,
buckets,
})
}
#[inline]
@@ -224,7 +223,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
let bucket_agg_accessor = &mut agg_with_accessor.buckets.values[self.accessor_idx];
bucket_agg_accessor
.column_block_accessor
@@ -246,7 +245,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
fn flush(&mut self, agg_with_accessor: &mut AggregationsWithAccessor) -> crate::Result<()> {
let sub_aggregation_accessor =
&mut agg_with_accessor.aggs.values[self.accessor_idx].sub_aggregation;
&mut agg_with_accessor.buckets.values[self.accessor_idx].sub_aggregation;
for bucket in self.buckets.iter_mut() {
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
@@ -261,8 +260,8 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &mut AggregationsWithAccessor,
limits: &ResourceLimitGuard,
sub_aggregation: &AggregationsWithAccessor,
limits: &AggregationLimits,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
@@ -308,7 +307,8 @@ impl SegmentRangeCollector {
limits.add_memory_consumed(
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
)?;
);
limits.validate_memory_consumption()?;
Ok(SegmentRangeCollector {
buckets,
@@ -445,12 +445,14 @@ mod tests {
use serde_json::Value;
use super::*;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::metric::AverageAggregation;
use crate::aggregation::tests::{
exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs,
};
use crate::aggregation::AggregationLimits;
pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>,
@@ -464,8 +466,8 @@ mod tests {
SegmentRangeCollector::from_req_and_validate(
&req,
&mut Default::default(),
&AggregationLimits::default().new_guard(),
&Default::default(),
&Default::default(),
field_type,
0,
)
@@ -476,18 +478,22 @@ mod tests {
fn range_fraction_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
]
},
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -507,25 +513,31 @@ mod tests {
fn range_fraction_test_with_sub_agg() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let sub_agg_req: Aggregations = serde_json::from_value(json!({
"avg": { "avg": { "field": "score_f64", } }
let sub_agg_req: Aggregations = vec![(
"score_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
}))
.unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
]
},
"aggs": sub_agg_req
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
..Default::default()
}),
sub_aggregation: sub_agg_req,
}
.into(),
),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -545,19 +557,22 @@ mod tests {
fn range_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
],
"keyed": true
},
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
keyed: true,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -582,19 +597,33 @@ mod tests {
fn range_custom_key_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"key": "custom-key-0-to-0.1", "from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
],
"keyed": false
},
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![
RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
},
RangeAggregationRange {
key: None,
from: Some(0.1f64),
to: Some(0.2f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -628,19 +657,33 @@ mod tests {
fn range_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"date_ranges": {
"range": {
"field": "date",
"ranges": [
{"to": 1546300800000000000i64},
{"from": 1546300800000000000i64, "to": 1546387200000000000i64},
],
"keyed": false
},
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"date_ranges".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "date".to_string(),
ranges: vec![
RangeAggregationRange {
key: None,
from: None,
to: Some(1546300800000000.0f64),
},
RangeAggregationRange {
key: None,
from: Some(1546300800000000.0f64),
to: Some(1546387200000000.0f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_res = exec_request(agg_req, &index)?;
@@ -679,18 +722,26 @@ mod tests {
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"key": "custom-key-0-to-0.1", "from": 0.0, "to": 0.1},
],
"keyed": true
},
}
}))
.unwrap();
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
}],
keyed: true,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let res = exec_request_with_query(agg_req, &index, None)?;
File diff suppressed because it is too large
View File
@@ -1,476 +0,0 @@
use rustc_hash::FxHashMap;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateKey, IntermediateTermBucketEntry, IntermediateTermBucketResult,
};
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
/// The specialized missing term aggregation.
#[derive(Default, Debug, Clone)]
pub struct TermMissingAgg {
missing_count: u32,
accessor_idx: usize,
sub_agg: Option<Box<dyn SegmentAggregationCollector>>,
}
impl TermMissingAgg {
pub(crate) fn new(
accessor_idx: usize,
sub_aggregations: &mut AggregationsWithAccessor,
) -> crate::Result<Self> {
let has_sub_aggregations = !sub_aggregations.is_empty();
let sub_agg = if has_sub_aggregations {
let sub_aggregation = build_segment_agg_collector(sub_aggregations)?;
Some(sub_aggregation)
} else {
None
};
Ok(Self {
accessor_idx,
sub_agg,
..Default::default()
})
}
}
impl SegmentAggregationCollector for TermMissingAgg {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let term_agg = agg_with_accessor
.agg
.agg
.as_term()
.expect("TermMissingAgg collector must be term agg req");
let missing = term_agg
.missing
.as_ref()
.expect("TermMissingAgg collector, but no missing found in agg req")
.clone();
let mut entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry> =
Default::default();
let mut missing_entry = IntermediateTermBucketEntry {
doc_count: self.missing_count,
sub_aggregation: Default::default(),
};
if let Some(sub_agg) = self.sub_agg {
let mut res = IntermediateAggregationResults::default();
sub_agg.add_intermediate_aggregation_result(
&agg_with_accessor.sub_aggregation,
&mut res,
)?;
missing_entry.sub_aggregation = res;
}
entries.insert(missing.into(), missing_entry);
let bucket = IntermediateBucketResult::Terms(IntermediateTermBucketResult {
entries,
sum_other_doc_count: 0,
doc_count_error_upper_bound: 0,
});
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let agg = &mut agg_with_accessor.aggs.values[self.accessor_idx];
let has_value = agg.accessors.iter().any(|acc| acc.index.has_value(doc));
if !has_value {
self.missing_count += 1;
if let Some(sub_agg) = self.sub_agg.as_mut() {
sub_agg.collect(doc, &mut agg.sub_aggregation)?;
}
}
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
}
Ok(())
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::exec_request_with_query;
use crate::schema::{Schema, FAST};
use crate::{Index, IndexWriter};
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg_sub_agg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!(score => 5.0))?;
// index_writer.commit().unwrap();
//// => Segment with all values text
index_writer
.add_document(doc!(score => 1.0, json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!(score => 5.0))?;
// index_writer.commit().unwrap();
// => Segment with mixed values
index_writer.add_document(doc!(json => json!({"mixed_type": "red"})))?;
index_writer.add_document(doc!(json => json!({"mixed_type": -20.5})))?;
index_writer.add_document(doc!(json => json!({"mixed_type": true})))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_sub_agg_reg1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer.add_document(doc!(score => 1.0, json => json!({"mixed_type": 10.0})))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
10.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mult_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_single_seg_empty() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let score = schema_builder.add_f64_field("score", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.add_document(doc!(score => 5.0))?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
"aggs": {
"sum_score": {
"sum": {
"field": "score"
}
}
}
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(
res["replace_null"]["buckets"][0]["sum_score"]["value"],
15.0
);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_mult_seg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
//// => Segment with all values text
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
// => Segment with mixed values
index_writer
.add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": true})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
"replace_num": {
"terms": {
"field": "json.mixed_type",
"missing": 1337
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_num"]["buckets"][0]["key"], 1337.0);
assert_eq!(res["replace_num"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_str_on_numeric_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.add_document(doc!())?;
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
#[test]
fn terms_aggregation_missing_mixed_type_one_seg() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values numeric
index_writer
.add_document(doc!(json => json!({"mixed_type": 10.0})))
.unwrap();
index_writer.add_document(doc!())?;
//// => Segment with all values text
index_writer
.add_document(doc!(json => json!({"mixed_type": "blue"})))
.unwrap();
index_writer.add_document(doc!())?;
// => Segment with mixed values
index_writer
.add_document(doc!(json => json!({"mixed_type": "red"})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": -20.5})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"mixed_type": true})))
.unwrap();
index_writer.add_document(doc!())?;
index_writer.commit().unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"replace_null": {
"terms": {
"field": "json.mixed_type",
"missing": "NULL"
},
},
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// text field
assert_eq!(res["replace_null"]["buckets"][0]["key"], "NULL");
assert_eq!(res["replace_null"]["buckets"][0]["doc_count"], 3);
assert_eq!(res["replace_null"]["sum_other_doc_count"], 0);
assert_eq!(res["replace_null"]["doc_count_error_upper_bound"], 0);
Ok(())
}
}
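For contrast with the `missing` tests above, a minimal sketch (reusing the hypothetical index and the `exec_request_with_query` helper from this file) of the behaviour without `missing`: documents that lack `json.mixed_type` simply do not contribute to any bucket, which is exactly the gap the specialized `TermMissingAgg` collector fills when `missing` is set.

    let agg_req: Aggregations = serde_json::from_value(json!({
        "no_missing": {
            "terms": { "field": "json.mixed_type" },
        },
    }))
    .unwrap();
    let res = exec_request_with_query(agg_req, &index, None)?;
    // No "NULL" or 1337 bucket appears, and the doc_counts summed over all buckets
    // stay below the number of indexed documents whenever some docs miss the field.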

View File

@@ -35,12 +35,11 @@ impl BufAggregationCollector {
impl SegmentAggregationCollector for BufAggregationCollector {
#[inline]
fn add_intermediate_aggregation_result(
fn into_intermediate_aggregations_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
Box::new(self.collector).add_intermediate_aggregation_result(agg_with_accessor, results)
) -> crate::Result<IntermediateAggregationResults> {
Box::new(self.collector).into_intermediate_aggregations_result(agg_with_accessor)
}
#[inline]

View File

@@ -6,7 +6,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::{
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, SegmentReader, TantivyError};
@@ -104,7 +104,7 @@ impl Collector for AggregationCollector {
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
res.into_final_result(self.agg.clone(), &self.limits)
res.into_final_bucket_result(self.agg.clone(), &self.limits)
}
}
@@ -114,7 +114,7 @@ fn merge_fruits(
if let Some(fruit) = segment_fruits.pop() {
let mut fruit = fruit?;
for next_fruit in segment_fruits {
fruit.merge_fruits(next_fruit?)?;
fruit.merge_fruits(next_fruit?);
}
Ok(fruit)
} else {
@@ -137,10 +137,9 @@ impl AggregationSegmentCollector {
reader: &SegmentReader,
limits: &AggregationLimits,
) -> crate::Result<Self> {
let mut aggs_with_accessor =
get_aggs_with_segment_accessor_and_validate(agg, reader, limits)?;
let aggs_with_accessor = get_aggs_with_accessor_and_validate(agg, reader, limits)?;
let result =
BufAggregationCollector::new(build_segment_agg_collector(&mut aggs_with_accessor)?);
BufAggregationCollector::new(build_segment_agg_collector(&aggs_with_accessor)?);
Ok(AggregationSegmentCollector {
aggs_with_accessor,
agg_collector: result,
@@ -185,13 +184,6 @@ impl SegmentCollector for AggregationSegmentCollector {
return Err(err);
}
self.agg_collector.flush(&mut self.aggs_with_accessor)?;
let mut sub_aggregation_res = IntermediateAggregationResults::default();
Box::new(self.agg_collector).add_intermediate_aggregation_result(
&self.aggs_with_accessor,
&mut sub_aggregation_res,
)?;
Ok(sub_aggregation_res)
Box::new(self.agg_collector).into_intermediate_aggregations_result(&self.aggs_with_accessor)
}
}
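For context on how these pieces fit together at query time, here is a minimal sketch of driving the collector from a searcher. The `from_aggs` signature taking the request plus default `AggregationLimits` is an assumption based on the `limits` field visible above, and an existing `index` with an f64 fast field named `score` is likewise assumed:

    use tantivy::aggregation::agg_req::Aggregations;
    use tantivy::aggregation::agg_result::AggregationResults;
    use tantivy::aggregation::AggregationCollector;
    use tantivy::query::AllQuery;

    let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
        "avg_score": { "avg": { "field": "score" } }
    }))
    .unwrap();
    // Assumed constructor: the aggregation request plus default AggregationLimits.
    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
    let searcher = index.reader()?.searcher();
    // Each segment yields an IntermediateAggregationResults fruit; merge_fruits folds
    // them together and into_final_bucket_result produces the user-facing result.
    let agg_res: AggregationResults = searcher.search(&AllQuery, &collector)?;
    let agg_json = serde_json::to_value(&agg_res).unwrap();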

View File

@@ -4,11 +4,13 @@ use time::OffsetDateTime;
use crate::TantivyError;
pub(crate) fn format_date(val: i64) -> crate::Result<String> {
let datetime = OffsetDateTime::from_unix_timestamp_nanos(val as i128).map_err(|err| {
TantivyError::InvalidArgument(format!(
"Could not convert {val:?} to OffsetDateTime, err {err:?}"
))
})?;
let datetime =
OffsetDateTime::from_unix_timestamp_nanos(1_000 * (val as i128)).map_err(|err| {
TantivyError::InvalidArgument(format!(
"Could not convert {:?} to OffsetDateTime, err {:?}",
val, err
))
})?;
let key_as_string = datetime
.format(&Rfc3339)
.map_err(|_err| TantivyError::InvalidArgument("Could not serialize date".to_string()))?;
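The two versions of `format_date` shown above differ only in the input unit: one takes a Unix timestamp already in nanoseconds, the other receives microseconds and multiplies by 1_000 before building an `OffsetDateTime`. A stand-alone sketch of that conversion, using only the `time` APIs already imported in this file (illustrative test, not part of the diff):

    #[test]
    fn micros_to_rfc3339_sketch() {
        use time::format_description::well_known::Rfc3339;
        use time::OffsetDateTime;

        // 2019-01-01T00:00:00Z expressed in both units handled above.
        let micros: i64 = 1_546_300_800_000_000;
        let nanos: i128 = 1_000 * (micros as i128); // the microsecond path scales up to nanoseconds
        let datetime = OffsetDateTime::from_unix_timestamp_nanos(nanos).unwrap();
        let key_as_string = datetime.format(&Rfc3339).unwrap();
        assert!(key_as_string.starts_with("2019-01-01T00:00:00"));
    }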

Some files were not shown because too many files have changed in this diff