Compare commits

..

27 Commits

Author SHA1 Message Date
Pascal Seitz
b345c11786 add key_as_string for numbers in term agg 2024-07-25 13:20:56 +08:00
PSeitz
7ebcc15b17 add support for str fast field range query (#2453)
* add support for str fast field range query

Add support for range queries on str fast fields by converting term bounds to
term ordinal bounds (a sketch follows this entry).

closes https://github.com/quickwit-oss/tantivy/issues/2023

* extend tests, rename

* update comment

* update comment
2024-07-17 09:31:42 +08:00
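
A minimal sketch of the bound conversion described in the entry above, assuming a plain sorted term list in place of tantivy's actual dictionary (`TermDict`, `ord_ge`, and `ord_gt` are illustrative names): each string bound is mapped to an inclusive bound on term ordinals, so the range query can then run purely on the fast field's ordinals.

```rust
use std::ops::Bound;

// Hypothetical stand-in for a sorted term dictionary.
struct TermDict {
    terms: Vec<String>, // sorted; index == term ordinal
}

impl TermDict {
    // Ordinal of the first term >= key.
    fn ord_ge(&self, key: &str) -> u64 {
        self.terms.partition_point(|t| t.as_str() < key) as u64
    }
    // Ordinal of the first term > key.
    fn ord_gt(&self, key: &str) -> u64 {
        self.terms.partition_point(|t| t.as_str() <= key) as u64
    }
}

// Map a string lower bound onto an inclusive ordinal lower bound.
fn lower_bound_to_ord(dict: &TermDict, bound: Bound<&str>) -> Bound<u64> {
    match bound {
        Bound::Included(key) => Bound::Included(dict.ord_ge(key)),
        Bound::Excluded(key) => Bound::Included(dict.ord_gt(key)),
        Bound::Unbounded => Bound::Unbounded,
    }
}
```
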
PSeitz
1b4076691f refactor fast field query (#2452)
In preparation for #2023 and #1709

* Use Term to pass parameters
* merge u64 and ip fast field range query

Side note: range_query_u64_fastfield was not renamed, because git would then be unable to track the changes.
2024-07-15 18:08:05 +08:00
Robert Caulk
eab660873a doc: fix typo in readme (#2450) 2024-07-09 15:12:22 +08:00
PSeitz
232f37126e fix coverage (#2448) 2024-07-05 12:04:18 +08:00
PSeitz
13e9885dfd faster term aggregation fetch terms (#2447)
Big impact for term aggregations with a large `size` parameter (e.g. 1000).
Adds a top-1000 term agg bench.

```
full
terms_few                                      Memory: 27.3 KB (+79.09%)    Avg: 3.8058ms (+2.40%)      Median: 3.7192ms (+3.47%)       [3.6224ms .. 4.3721ms]
terms_many                                     Memory: 6.9 MB               Avg: 12.6102ms (-4.70%)     Median: 12.1389ms (-6.58%)      [10.2847ms .. 15.4857ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 15.8216ms (-83.19%)    Median: 15.4899ms (-83.46%)     [13.4250ms .. 20.6897ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 14.7820ms (-3.95%)     Median: 14.2236ms (-4.28%)      [12.6669ms .. 21.0968ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 551.6218ms (+7.18%)    Median: 549.8826ms (+11.01%)    [496.7371ms .. 592.1299ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 197.7029ms (+2.66%)    Median: 190.1564ms (+0.64%)     [167.9226ms .. 245.6651ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (+0.00%)     Avg: 242.0121ms (+0.92%)    Median: 237.7084ms (-2.85%)     [201.9959ms .. 302.2136ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 122.6036ms (+1.21%)    Median: 119.0033ms (+2.60%)     [109.2859ms .. 161.5858ms]
range_agg_with_term_agg_few                    Memory: 45.4 KB (+39.75%)    Avg: 24.5454ms (+2.14%)     Median: 24.2861ms (+2.44%)      [23.5109ms .. 27.8406ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 56.8049ms (+3.01%)     Median: 50.9706ms (+1.52%)      [41.4517ms .. 90.3934ms]
dense
terms_few                                      Memory: 28.8 KB (+81.74%)    Avg: 8.9092ms (-2.24%)      Median: 8.7143ms (-1.31%)      [8.6148ms .. 10.3868ms]
terms_many                                     Memory: 6.9 MB (-0.00%)      Avg: 17.9604ms (-10.18%)    Median: 17.1552ms (-11.93%)    [14.8979ms .. 26.2779ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 21.4963ms (-78.90%)    Median: 21.2924ms (-78.98%)    [18.2033ms .. 28.0087ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 20.4167ms (-9.13%)     Median: 19.5596ms (-11.37%)    [17.5153ms .. 29.5987ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 518.4474ms (-6.41%)    Median: 514.9180ms (-9.44%)    [471.5550ms .. 579.0220ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 263.6702ms (-2.78%)    Median: 260.8775ms (-2.55%)    [239.5754ms .. 304.6669ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB              Avg: 299.9791ms (-2.01%)    Median: 302.2180ms (-3.08%)    [239.2080ms .. 346.3649ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 136.3303ms (-3.12%)    Median: 132.3831ms (-2.88%)    [123.7564ms .. 164.7914ms]
range_agg_with_term_agg_few                    Memory: 47.1 KB (+37.81%)    Avg: 35.4538ms (+0.66%)     Median: 34.8754ms (-0.56%)     [34.2287ms .. 40.0884ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 72.2269ms (-4.38%)     Median: 66.1174ms (-4.98%)     [55.5125ms .. 124.1622ms]
sparse
terms_few                                      Memory: 27.3 KB (+69.68%)    Avg: 19.6053ms (-1.15%)     Median: 19.4543ms (-0.38%)     [19.3056ms .. 24.0547ms]
terms_many                                     Memory: 1.8 MB               Avg: 21.2886ms (-6.28%)     Median: 21.1287ms (-6.65%)     [20.6640ms .. 24.6144ms]
terms_many_top_1000                            Memory: 2.6 MB               Avg: 23.4869ms (-85.53%)    Median: 23.3393ms (-85.61%)    [22.7789ms .. 25.0896ms]
terms_many_order_by_term                       Memory: 1.8 MB               Avg: 21.7437ms (-7.78%)     Median: 21.6272ms (-7.66%)     [21.0409ms .. 23.6517ms]
terms_many_with_top_hits                       Memory: 13.1 MB              Avg: 43.7926ms (-2.76%)     Median: 44.3602ms (+0.01%)     [37.8039ms .. 51.0451ms]
terms_many_with_avg_sub_agg                    Memory: 7.5 MB               Avg: 34.6307ms (+3.72%)     Median: 33.4522ms (+1.16%)     [32.4418ms .. 41.4196ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 7.4 MB               Avg: 46.4318ms (+1.16%)     Median: 46.4050ms (+2.03%)     [44.5986ms .. 48.5142ms]
terms_few_with_cardinality_agg                 Memory: 680.0 KB (-0.04%)    Avg: 35.4410ms (+2.05%)     Median: 35.1384ms (+1.19%)     [34.4402ms .. 39.1082ms]
range_agg_with_term_agg_few                    Memory: 45.7 KB (+39.44%)    Avg: 22.7760ms (+0.44%)     Median: 22.5152ms (-0.35%)     [22.3078ms .. 26.1567ms]
range_agg_with_term_agg_many                   Memory: 1.8 MB               Avg: 25.7696ms (-4.45%)     Median: 25.4009ms (-5.61%)     [24.7874ms .. 29.6434ms]
multivalue
terms_few                                      Memory: 244.4 KB            Avg: 15.1253ms (-2.85%)     Median: 15.0988ms (-0.54%)     [14.8790ms .. 15.8193ms]
terms_many                                     Memory: 6.9 MB (-0.00%)     Avg: 26.3019ms (-6.24%)     Median: 26.3662ms (-4.94%)     [21.3553ms .. 31.0564ms]
terms_many_top_1000                            Memory: 6.9 MB              Avg: 29.5212ms (-72.90%)    Median: 29.4257ms (-72.84%)    [24.2645ms .. 35.1607ms]
terms_many_order_by_term                       Memory: 6.9 MB              Avg: 28.6076ms (-4.93%)     Median: 28.1059ms (-6.64%)     [24.0845ms .. 34.1493ms]
terms_many_with_top_hits                       Memory: 58.3 MB             Avg: 570.1548ms (+1.52%)    Median: 572.7759ms (+0.53%)    [525.9567ms .. 617.0862ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB             Avg: 305.5207ms (+0.24%)    Median: 296.0101ms (-0.22%)    [277.8579ms .. 373.5914ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (-0.00%)    Avg: 324.7342ms (-2.51%)    Median: 319.0025ms (-2.58%)    [298.7122ms .. 368.6144ms]
terms_few_with_cardinality_agg                 Memory: 10.8 MB             Avg: 151.6126ms (-2.54%)    Median: 149.0616ms (-0.32%)    [136.5592ms .. 181.8942ms]
range_agg_with_term_agg_few                    Memory: 248.2 KB            Avg: 49.5225ms (+3.11%)     Median: 48.3994ms (+3.18%)     [46.4134ms .. 60.5989ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB              Avg: 85.9824ms (-3.66%)    Median: 78.4266ms (-3.85%)    [64.1231ms .. 128.5279ms]
```
2024-07-03 12:42:59 +08:00
PSeitz
56d79cb203 fix cardinality aggregation performance (#2446)
* fix cardinality aggregation performance

Fix cardinality aggregation performance by fetching multiple terms at once. This
avoids decompressing the same block repeatedly and keeps the buffer state between
terms.
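
A rough sketch of the batching idea, with hypothetical names (`load_block` and the block layout are assumptions, not tantivy's API): ordinals arrive sorted, so each dictionary block is decompressed at most once and reused for every ordinal that falls inside it.

```rust
// Resolve sorted term ordinals to terms, reusing the decompressed block.
fn fetch_terms_batched(
    sorted_ords: &[u64],
    // Hypothetical: returns (block_start_ord, terms) for the block holding `ord`.
    mut load_block: impl FnMut(u64) -> (u64, Vec<String>),
    out: &mut Vec<String>,
) {
    let mut current: Option<(u64, Vec<String>)> = None;
    for &ord in sorted_ords {
        let needs_reload = match &current {
            Some((start, terms)) => ord < *start || ord >= *start + terms.len() as u64,
            None => true,
        };
        if needs_reload {
            current = Some(load_block(ord)); // decompress a new block only when needed
        }
        let (start, terms) = current.as_ref().unwrap();
        out.push(terms[(ord - *start) as usize].clone());
    }
}
```
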

add cardinality aggregation benchmark

bump rust version to 1.66

Performance comparison to before (AllQuery)
```
full
cardinality_agg                   Memory: 3.5 MB (-0.00%)    Avg: 21.2256ms (-97.78%)    Median: 21.0042ms (-97.82%)    [20.4717ms .. 23.6206ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 81.9293ms (-97.37%)    Median: 81.5526ms (-97.38%)    [79.7564ms .. 88.0374ms]
dense
cardinality_agg                   Memory: 3.6 MB (-0.00%)    Avg: 25.9372ms (-97.24%)    Median: 25.7744ms (-97.25%)    [24.7241ms .. 27.8793ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 93.9897ms (-96.91%)    Median: 92.7821ms (-96.94%)    [90.3312ms .. 117.4076ms]
sparse
cardinality_agg                   Memory: 895.4 KB (-0.00%)    Avg: 22.5113ms (-95.01%)    Median: 22.5629ms (-94.99%)    [22.1628ms .. 22.9436ms]
terms_few_with_cardinality_agg    Memory: 680.2 KB             Avg: 26.4250ms (-94.85%)    Median: 26.4135ms (-94.86%)    [26.3210ms .. 26.6774ms]
```

* clippy

* assert for sorted ordinals
2024-07-02 15:29:00 +08:00
Paul Masurel
0f4c2e27cf Fixes a bug that causes out-of-order sstable keys. (#2445)
The previous way to address the problem was to replace \u{0000}
with 0 in different places.

This logic had several flaws:
When done on the serializer side (as it was for the columnar), there was
a collision problem.

If a document in the segment contained a JSON field with a \0 and
another doc contained the same JSON field but with `0`, then we were sending
the same field path twice to the serializer.
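
To make the collision concrete, a tiny illustrative snippet (the `normalize` helper is hypothetical): after the substitution, two distinct JSON paths serialize to the same key, which breaks the sstable's strictly increasing key order.

```rust
// Replacing '\0' with '0' maps two distinct paths onto the same key.
fn normalize(path: &str) -> String {
    path.replace('\u{0000}', "0")
}

fn main() {
    // "a\0b" and "a0b" collide, so the serializer would receive the
    // same field path twice.
    assert_eq!(normalize("a\u{0000}b"), normalize("a0b"));
}
```
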

Another option would have been to normalize all values on the writer
side.

This PR simplifies the logic and simply ignores JSON paths containing a
\0, both in the columnar and the inverted index.

Closes #2442
2024-07-01 15:40:07 +08:00
落叶乌龟
f9ae295507 feat(query): Make BooleanQuery support minimum_number_should_match (#2405)
* feat(query): Make `BooleanQuery` support `minimum_number_should_match`. See issue #2398

This commit introduces a new scorer, `DisjunctionScorer`, which computes the union of posting lists subject to a minimum number of required matching clauses; it is implemented with a min-heap. The necessary modifications to `BooleanQuery` and `BooleanWeight` are made as well.
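
A toy illustration of such a min-heap union (not the actual `DisjunctionScorer`; posting lists are plain sorted vectors here): equal heads are popped and counted, and a doc id is emitted only if at least `min_should_match` lists contain it.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

fn disjunction(postings: Vec<Vec<u32>>, min_should_match: usize) -> Vec<u32> {
    // Min-heap of (next doc id, list index, cursor into that list).
    let mut heap: BinaryHeap<Reverse<(u32, usize, usize)>> = postings
        .iter()
        .enumerate()
        .filter(|(_, p)| !p.is_empty())
        .map(|(i, p)| Reverse((p[0], i, 0)))
        .collect();
    let mut out = Vec::new();
    while let Some(Reverse((doc, _, _))) = heap.peek().copied() {
        let mut matching = 0;
        // Pop every list currently positioned on `doc`, advancing its cursor.
        while let Some(&Reverse((d, i, c))) = heap.peek() {
            if d != doc {
                break;
            }
            heap.pop();
            matching += 1;
            if c + 1 < postings[i].len() {
                heap.push(Reverse((postings[i][c + 1], i, c + 1)));
            }
        }
        if matching >= min_should_match {
            out.push(doc);
        }
    }
    out
}
```
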

* fixup! fix test

* fixup!: refactor code.

1. More meaningful names.
2. Add Cache for `Disjunction`'s scorers, and fix bug.
3. Optimize `BooleanWeight::complex_scorer`

Thanks
 Paul Masurel <paul@quickwit.io>

* squash!: come up with better variable naming.

* squash!: fix naming issues.

* squash!: fix typo.

* squash!: Remove CombinationMethod::FullIntersection
2024-07-01 15:39:41 +08:00
Raphael Coeffic
d9db5302d9 feat: cardinality aggregation (#2337)
* WiP: cardinality aggregation

* Collect unique entries first, then insert into HyperLogLog

* Handle `missing`

* Hybrid approach

* Review changes

- insert `missing` value at most once
- `term_id` -> `term_ord`
- iterate directly over entries without collecting first

* Use salted hasher to include column type (see the sketch after this entry)

* fix: formatting

* More review fixes

* Add cardinality to test_aggregation_flushing

* Formatting
2024-07-01 07:49:42 +08:00
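
A small sketch of the salted-hasher idea mentioned above (the hasher choice and the type code are illustrative assumptions): mixing the column type into the hash keeps equal-looking values from differently typed columns distinct inside the HyperLogLog.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Salt the value hash with a column-type code so that, e.g., the string
// "1" and the number 1 in a mixed-type JSON field count as two values.
fn salted_hash(column_type_code: u8, term_ord: u64) -> u64 {
    let mut hasher = DefaultHasher::new();
    column_type_code.hash(&mut hasher);
    term_ord.hash(&mut hasher);
    hasher.finish()
}
```
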
Paul Masurel
e453848134 Recycling buffer in PrefixPhraseScorer (#2443) 2024-06-24 17:11:53 +09:00
PSeitz
59084143ef use optional index in multivalued index (#2439)
* use optional index in multivalued index

For mostly empty multivalued indices there was a large overhead during
creation when iterating all docids. This is alleviated by placing an
optional index in the multivalued index to mark the documents that have values.

There's some performance overhead when accessing values in a multivalued
index: the access cost is now optional index + multivalue index. The
sparse codec performs relatively badly with binary search when accessing
data. This is reflected in the benchmarks below.

This changes the columnar format to v2, but code is added to handle the v1
format.
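
A minimal sketch of the resulting two-step lookup, with simplified shapes (the closure and slice stand in for the real index types): the optional index maps a doc id to its rank among documents that have values, and the start-offset column is then indexed by rank instead of by doc id, which is what makes mostly-empty columns cheap.

```rust
use std::ops::Range;

// `rank_if_exists` stands in for the optional index: the doc's rank
// among docs with values, or None if the doc has no values at all.
fn value_range(
    rank_if_exists: impl Fn(u32) -> Option<u32>,
    start_offsets: &[u32], // one entry per doc with values, plus a final end offset
    doc_id: u32,
) -> Range<u32> {
    match rank_if_exists(doc_id) {
        None => 0..0, // empty range: the doc has no values
        Some(rank) => start_offsets[rank as usize]..start_offsets[rank as usize + 1],
    }
}
```
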

```
     Running benches/bench_access.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_access-ea323c028db88db4)
multi sparse 1/13
access_values_for_doc        Avg: 42.8946ms (+241.80%)    Median: 42.8869ms (+244.10%)    [42.7484ms .. 43.1074ms]
access_first_vals            Avg: 42.8022ms (+421.93%)    Median: 42.7553ms (+439.84%)    [42.6794ms .. 43.7404ms]
multi 2x
access_values_for_doc        Avg: 31.1244ms (+24.17%)    Median: 30.8339ms (+23.46%)    [30.7192ms .. 33.6059ms]
access_first_vals            Avg: 24.3070ms (+70.92%)    Median: 24.0966ms (+70.18%)    [23.9328ms .. 26.4851ms]
sparse 1/13
access_values_for_doc        Avg: 42.2490ms (+0.61%)    Median: 42.2346ms (+2.28%)    [41.8988ms .. 43.7821ms]
access_first_vals            Avg: 43.6272ms (+0.23%)    Median: 43.6197ms (+1.78%)    [43.4920ms .. 43.9009ms]
dense 1/12
access_values_for_doc        Avg: 8.6184ms (+23.18%)    Median: 8.6126ms (+23.78%)    [8.5843ms .. 8.7527ms]
access_first_vals            Avg: 6.8112ms (+4.47%)     Median: 6.8002ms (+4.55%)     [6.7887ms .. 6.8991ms]
full
access_values_for_doc        Avg: 9.4073ms (-5.09%)    Median: 9.4023ms (-2.23%)    [9.3694ms .. 9.4568ms]
access_first_vals            Avg: 4.9531ms (+6.24%)    Median: 4.9502ms (+7.85%)    [4.9423ms .. 4.9718ms]
```

```
     Running benches/bench_merge.rs (/home/pascal/Development/tantivy/optional_multivalues/target/release/deps/bench_merge-475697dfceb3639f)
merge_multi 2x_and_multi 2x                          Avg: 20.2280ms (+34.33%)    Median: 20.1829ms (+35.33%)    [19.9933ms .. 20.8806ms]
merge_multi sparse 1/13_and_multi sparse 1/13        Avg: 0.8961ms (-78.04%)     Median: 0.8943ms (-77.61%)     [0.8899ms .. 0.9272ms]
merge_dense 1/12_and_dense 1/12                      Avg: 0.6619ms (-1.26%)      Median: 0.6616ms (+2.20%)      [0.6473ms .. 0.6837ms]
merge_sparse 1/13_and_sparse 1/13                    Avg: 0.5508ms (-0.85%)      Median: 0.5508ms (+2.80%)      [0.5420ms .. 0.5634ms]
merge_sparse 1/13_and_dense 1/12                     Avg: 0.6046ms (-4.64%)      Median: 0.6038ms (+2.80%)      [0.5939ms .. 0.6296ms]
merge_multi sparse 1/13_and_dense 1/12               Avg: 0.9111ms (-83.48%)     Median: 0.9063ms (-83.50%)     [0.9047ms .. 0.9663ms]
merge_multi sparse 1/13_and_sparse 1/13              Avg: 0.8451ms (-89.49%)     Median: 0.8428ms (-89.43%)     [0.8411ms .. 0.8563ms]
merge_multi 2x_and_dense 1/12                        Avg: 10.6624ms (-4.82%)     Median: 10.6568ms (-4.49%)     [10.5738ms .. 10.8353ms]
merge_multi 2x_and_sparse 1/13                       Avg: 10.6336ms (-22.95%)    Median: 10.5925ms (-22.33%)    [10.5149ms .. 11.5657ms]
```

* Update columnar/src/columnar/format_version.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* Update columnar/src/column_index/mod.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2024-06-19 14:54:12 +08:00
PSeitz
511b027350 update columnar bench (#2438)
* update columnar bench

* fix compile
2024-06-14 10:42:35 +08:00
Philippe Noël
322f47eb47 Add ParadeDB to Companies List (#1) (#2437)
* Add ParadeDB logo
2024-06-14 09:12:58 +09:00
PSeitz
72f61ff89c remove index sorting (#2434)
closes https://github.com/quickwit-oss/tantivy/issues/2352
2024-06-13 15:51:53 +08:00
PSeitz
a141c3ec59 add columnar format compatibility tests (#2433)
* add columnar format compatibility tests

* always try to write current format
2024-06-13 15:04:52 +08:00
PSeitz
e90e7a25ae add access benchmark for columnar (#2432) 2024-06-12 14:29:15 +08:00
PSeitz
c3b92a5412 fix compiler warning, cleanup (#2393)
fix compiler warning for missing feature flag
remove unused variables
cleanup unused methods
2024-06-11 16:03:50 +08:00
PSeitz
2f55511064 extend indexwriter proptests (#2342)
* index random values in proptest

* add proptest with multiple docs
2024-06-11 16:02:57 +08:00
trinity-1686a
08b9fc0b31 fix de-escaping too much in query parser (#2427)
* fix de-escaping too much in query parser
2024-06-10 11:19:01 +02:00
PSeitz
714f363d43 add bench & test for columnar merging (#2428)
* add merge columnar proptest

* add columnar merge benchmark
2024-06-10 16:26:16 +08:00
PSeitz
93ff7365b0 reduce top hits aggregation memory consumption (#2426)
Move the request structure out of the top hits aggregation collector and use the
passed structure instead.
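
A sketch of the structural change, with hypothetical type names: per-bucket collectors borrow one shared request instead of each owning a copy, so memory no longer scales with the number of buckets times the size of the request.

```rust
// Shared once per aggregation request.
struct TopHitsReq {
    size: usize,
    sort_field: String,
}

// Created per bucket: borrows the request rather than cloning it.
struct TopHitsSegmentCollector<'a> {
    req: &'a TopHitsReq,
    hits: Vec<(u64, u32)>, // (sort value, doc id)
}

impl<'a> TopHitsSegmentCollector<'a> {
    fn for_bucket(req: &'a TopHitsReq) -> Self {
        Self {
            req,
            hits: Vec::with_capacity(req.size),
        }
    }
}
```
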

```
full
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 425.9680ms (-21.38%)    Median: 415.1097ms (-23.56%)    [395.5303ms .. 484.6325ms]
dense
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 440.0817ms (-19.68%)    Median: 432.2286ms (-21.10%)    [403.5632ms .. 497.7541ms]
sparse
terms_many_with_top_hits    Memory: 13.1 MB (-49.31%)    Avg: 33.3568ms (-32.19%)    Median: 33.0834ms (-31.86%)    [32.5126ms .. 35.7397ms]
multivalue
terms_many_with_top_hits    Memory: 58.2 MB (-43.64%)    Avg: 414.2340ms (-25.44%)    Median: 413.4144ms (-25.64%)    [403.9919ms .. 430.3170ms]
```
2024-06-06 22:32:58 +08:00
Adam Reichold
8151925068 Panicking in spawned Rayon tasks will abort the process by default. (#2409) 2024-06-04 17:04:30 +09:00
dependabot[bot]
b960e40bc8 Update sketches-ddsketch requirement from 0.2.1 to 0.3.0 (#2423)
Updates the requirements on [sketches-ddsketch](https://github.com/mheffner/rust-sketches-ddsketch) to permit the latest version.
- [Release notes](https://github.com/mheffner/rust-sketches-ddsketch/releases)
- [Commits](https://github.com/mheffner/rust-sketches-ddsketch/compare/v0.2.1...v0.3.0)

---
updated-dependencies:
- dependency-name: sketches-ddsketch
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-06-04 15:50:23 +08:00
giovannicuccu
1095c9b073 Issue 1787 extended stats (#2247)
* first version of extended stats along with its tests

* using IntermediateExtendStats instead of IntermediateStats with all tests passing

* Created struct for request and response

* first test with extended_stats

* kahan summation and tests with approximate equality (see the sketch after this entry)

* version ready for merge

* removed approx dependency

* refactor for using ExtendedStats only when needed

* interim version

* refined version with code formatted

* refactored a struct

* cosmetic refactor

* fix after merge

* fix format

* added extended_stat bench

* merge and new benchmark for extended stats

* split stat segment collectors

* wrapped intermediate extended stat with a box to limit memory usage

* Revert "wrapped intermediate extended stat with a box to limit memory usage"

This reverts commit 5b4aa9f393.

* some code reformat, commented kahan summation

* refactor after review

* refactor after code review

* fix after incorrectly restoring kahan summation

* modifications for code review + bug fix in merge_fruit

* refactor assert_nearly_equals macro

* update after code review

---------

Co-authored-by: Giovanni Cuccu <gcuccu@imolainformatica.it>
2024-06-04 14:25:17 +08:00
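
For reference, a sketch of the Kahan (compensated) summation mentioned in the bullets above (standalone, not the commit's actual code): a running compensation term recovers the low-order bits that plain accumulation loses over many f64 values.

```rust
// Compensated summation: `c` tracks the rounding error of each step.
fn kahan_sum(values: impl IntoIterator<Item = f64>) -> f64 {
    let mut sum = 0.0;
    let mut c = 0.0;
    for v in values {
        let y = v - c; // corrected next value
        let t = sum + y; // low-order bits of y may be lost here...
        c = (t - sum) - y; // ...and are recovered into the compensation
        sum = t;
    }
    sum
}
```
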
PSeitz
c0686515a9 update one_shot (#2420) 2024-05-31 11:07:35 +08:00
trinity-1686a
455156f51c improve query parser (#2416)
* support escape sequences in more places

and fix a bug with single-quoted strings

* add query parser test for range query on default field
2024-05-30 17:29:27 +02:00
112 changed files with 5849 additions and 4121 deletions

View File

@@ -15,11 +15,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -11,12 +11,11 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.63"
rust-version = "1.66"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"
@@ -39,7 +38,7 @@ levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
downcast-rs = "1.2.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
@@ -64,7 +63,8 @@ query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tanti
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"

View File

@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
## Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
performance for different types of queries/collections.
Your mileage WILL vary depending on the nature of queries and their load.
@@ -101,7 +101,8 @@ cargo test
## Companies Using Tantivy
<p align="left">
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />&nbsp;
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/paradedb.png" alt="ParadeDB" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" /> &nbsp;
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />

View File

@@ -47,13 +47,19 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, average_f64);
register!(group, average_f64_u64);
register!(group, stats_f64);
register!(group, extendedstats_f64);
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_top_1000);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
register!(group, cardinality_agg);
register!(group, terms_few_with_cardinality_agg);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
@@ -105,7 +111,12 @@ fn stats_f64(index: &Index) {
});
exec_term_with_agg(index, agg_req)
}
fn extendedstats_f64(index: &Index) {
let agg_req = json!({
"extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
});
exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
let agg_req = json!({
"mypercentiles": {
@@ -117,6 +128,33 @@ fn percentiles_f64(index: &Index) {
});
execute_agg(index, agg_req);
}
fn cardinality_agg(index: &Index) {
let agg_req = json!({
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
});
execute_agg(index, agg_req);
}
fn terms_few_with_cardinality_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"aggs": {
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
@@ -129,6 +167,12 @@ fn terms_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_top_1000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
@@ -165,7 +209,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
@@ -262,6 +306,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {

View File

@@ -23,6 +23,16 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
[[bench]]
name = "bench_merge"
harness = false
[[bench]]
name = "bench_access"
harness = false
[features]
unstable = []

View File

@@ -0,0 +1,67 @@
use binggan::{black_box, InputGroup};
use common::*;
use tantivy_columnar::Column;
pub mod common;
const NUM_DOCS: u32 = 2_000_000;
pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
let reader = generate_columnar_with_name(card, num_docs, "price");
reader.read_columns("price").unwrap()[0]
.open_u64_lenient()
.unwrap()
.unwrap()
}
fn main() {
let mut inputs = Vec::new();
let mut add_card = |card1: Card| {
inputs.push((
format!("{card1}"),
generate_columnar_and_open(card1, NUM_DOCS),
));
};
add_card(Card::MultiSparse);
add_card(Card::Multi);
add_card(Card::Sparse);
add_card(Card::Dense);
add_card(Card::Full);
bench_group(InputGroup::new_with_inputs(inputs));
}
fn bench_group(mut runner: InputGroup<Column>) {
runner.register("access_values_for_doc", |column| {
let mut sum = 0;
for i in 0..NUM_DOCS {
for value in column.values_for_doc(i) {
sum += value;
}
}
black_box(sum);
});
runner.register("access_first_vals", |column| {
let mut sum = 0;
const BLOCK_SIZE: usize = 32;
let mut docs = vec![0; BLOCK_SIZE];
let mut buffer = vec![None; BLOCK_SIZE];
for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
// fill docs
for idx in 0..BLOCK_SIZE {
docs[idx] = idx as u32 + i;
}
column.first_vals(&docs, &mut buffer);
for val in buffer.iter() {
let Some(val) = val else { continue };
sum += *val;
}
}
black_box(sum);
});
runner.run();
}

View File

@@ -31,7 +31,7 @@ fn get_test_columns() -> Columns {
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(data.len() as u32, None, &mut buffer)
.serialize(data.len() as u32, &mut buffer)
.unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();

View File

@@ -0,0 +1,49 @@
pub mod common;
use binggan::{black_box, BenchRunner};
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;
const NUM_DOCS: u32 = 100_000;
fn main() {
let mut inputs = Vec::new();
let mut add_combo = |card1: Card, card2: Card| {
inputs.push((
format!("merge_{card1}_and_{card2}"),
vec![
generate_columnar_with_name(card1, NUM_DOCS, "price"),
generate_columnar_with_name(card2, NUM_DOCS, "price"),
],
));
};
add_combo(Card::Multi, Card::Multi);
add_combo(Card::MultiSparse, Card::MultiSparse);
add_combo(Card::Dense, Card::Dense);
add_combo(Card::Sparse, Card::Sparse);
add_combo(Card::Sparse, Card::Dense);
add_combo(Card::MultiSparse, Card::Dense);
add_combo(Card::MultiSparse, Card::Sparse);
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(
input_name,
columnar_readers,
move |columnar_readers: &Vec<ColumnarReader>| {
let mut out = Vec::new();
let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
},
);
}
group.run();
}

View File

@@ -0,0 +1,59 @@
extern crate tantivy_columnar;
use core::fmt;
use std::fmt::{Display, Formatter};
use tantivy_columnar::{ColumnarReader, ColumnarWriter};
pub enum Card {
MultiSparse,
Multi,
Sparse,
Dense,
Full,
}
impl Display for Card {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Card::MultiSparse => write!(f, "multi sparse 1/13"),
Card::Multi => write!(f, "multi 2x"),
Card::Sparse => write!(f, "sparse 1/13"),
Card::Dense => write!(f, "dense 1/12"),
Card::Full => write!(f, "full"),
}
}
}
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
let mut columnar_writer = ColumnarWriter::default();
if let Card::MultiSparse = card {
columnar_writer.record_numerical(0, column_name, 10u64);
columnar_writer.record_numerical(0, column_name, 10u64);
}
for i in 0..num_docs {
match card {
Card::MultiSparse | Card::Sparse => {
if i % 13 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Dense => {
if i % 12 == 0 {
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
Card::Full => {
columnar_writer.record_numerical(i, column_name, i as u64);
}
Card::Multi => {
columnar_writer.record_numerical(i, column_name, i as u64);
columnar_writer.record_numerical(i, column_name, i as u64);
}
}
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
ColumnarReader::open(wrt).unwrap()
}

Binary file not shown.

Binary file not shown.

View File

@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
.map(|value_row_id: RowId| self.values.get_val(value_row_id))
}
/// Get the docids of values which are in the provided value range.
/// Get the docids of values which are in the provided value and docid range.
#[inline]
pub fn get_docids_for_value_range(
&self,

View File

@@ -12,7 +12,7 @@ use crate::column_values::{
CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
use crate::iterable::Iterable;
use crate::StrColumn;
use crate::{StrColumn, Version};
pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
column_index: SerializableColumnIndex<'_>,
@@ -40,25 +40,9 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
Ok(())
}
pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_values = load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
@@ -68,7 +52,27 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = load_u64_based_column_values(column_values_data)?;
Ok(Column {
index: column_index,
values: column_values,
})
}
pub fn open_column_u128<T: MonotonicallyMappableToU128>(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<T>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
.as_slice()
.try_into()
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
Ok(Column {
index: column_index,
@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
pub fn open_column_u128_as_compact_u64(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<Column<u64>> {
let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
let column_index_num_bytes = u32::from_le_bytes(
column_index_num_bytes_payload
@@ -88,7 +95,7 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
.unwrap(),
);
let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
let column_index = crate::column_index::open_column_index(column_index_data)?;
let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
Ok(Column {
index: column_index,
@@ -96,19 +103,19 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
})
}
pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
let (body, dictionary_len_bytes) = data.rsplit(4);
let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
Ok(BytesColumn {
dictionary,
term_ord_column,
})
}
pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data)?;
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
let bytes_column = open_column_bytes(data, format_version)?;
Ok(StrColumn::wrap(bytes_column))
}

View File

@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(
#[cfg(test)]
mod tests {
use common::OwnedBytes;
use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::multivalued_index::{
open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
};
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
@@ -171,7 +175,11 @@ mod tests {
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -200,11 +208,16 @@ mod tests {
],
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
let multivalue =
open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}
}

View File

@@ -1,6 +1,8 @@
use std::iter;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_index::{
SerializableColumnIndex, SerializableMultivalueIndex, SerializableOptionalIndex, Set,
};
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};
@@ -14,15 +16,24 @@ pub fn merge_column_index_shuffled<'a>(
Cardinality::Optional => {
let non_null_row_ids =
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows: shuffle_merge_order.num_rows(),
}
})
}
Cardinality::Multivalued => {
let multivalue_start_index =
merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(multivalue_start_index)
let non_null_row_ids =
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
SerializableColumnIndex::Multivalued(SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids,
num_rows: shuffle_merge_order.num_rows(),
},
start_offsets: merge_column_index_shuffled_multivalued(
column_indexes,
shuffle_merge_order,
),
})
}
}
}
@@ -102,11 +113,18 @@ fn iter_num_values<'a>(
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
///
/// This will filter values with 0 values as these are covered by the optional index in the
/// multivalue index.
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
*state += num_vals;
Some(*state)
}))
iter::once(0u32).chain(
num_vals
.filter(|num_vals| *num_vals != 0)
.scan(0, |state, num_vals| {
*state += num_vals;
Some(*state)
}),
)
}
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
@@ -134,7 +152,7 @@ mod tests {
#[test]
fn test_integrate_num_vals_several() {
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 13, 33].into_iter()));
}
#[test]
@@ -157,10 +175,10 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional {
let SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} = serializable_index
}) = serializable_index
else {
panic!()
};

View File

@@ -1,6 +1,8 @@
use std::iter;
use std::ops::Range;
use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
use crate::column_index::serialize::SerializableOptionalIndex;
use crate::column_index::SerializableColumnIndex;
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
@@ -15,23 +17,149 @@ pub fn merge_column_index_stacked<'a>(
) -> SerializableColumnIndex<'a> {
match cardinality_after_merge {
Cardinality::Full => SerializableColumnIndex::Full,
Cardinality::Optional => SerializableColumnIndex::Optional {
Cardinality::Optional => SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids: Box::new(StackedOptionalIndex {
columns,
stack_merge_order,
}),
num_rows: stack_merge_order.num_rows(),
},
}),
Cardinality::Multivalued => {
let stacked_multivalued_index = StackedMultivaluedIndex {
columns,
stack_merge_order,
};
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
let serializable_multivalue_index =
make_serializable_multivalued_index(columns, stack_merge_order);
SerializableColumnIndex::Multivalued(serializable_multivalue_index)
}
}
}
struct StackedDocIdsWithValues<'a> {
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
impl Iterable<u32> for StackedDocIdsWithValues<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new((0..self.column_indexes.len()).flat_map(|i| {
let column_index = &self.column_indexes[i];
let doc_range = self.stack_merge_order.columnar_range(i);
get_doc_ids_with_values(column_index, doc_range)
}))
}
}
fn get_doc_ids_with_values<'a>(
column_index: &'a ColumnIndex,
doc_range: Range<u32>,
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(0..0),
ColumnIndex::Full => Box::new(doc_range),
ColumnIndex::Optional(optional_index) => Box::new(
optional_index
.iter_rows()
.map(move |row| row + doc_range.start),
),
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
MultiValueIndex::MultiValueIndexV1(multivalued_index) => {
Box::new((0..multivalued_index.num_docs()).filter_map(move |docid| {
let range = multivalued_index.range(docid);
if range.is_empty() {
None
} else {
Some(docid + doc_range.start)
}
}))
}
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
multivalued_index
.optional_index
.iter_rows()
.map(move |row| row + doc_range.start),
),
},
}
}
fn stack_doc_ids_with_values<'a>(
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> SerializableOptionalIndex<'a> {
let num_rows = stack_merge_order.num_rows();
SerializableOptionalIndex {
non_null_row_ids: Box::new(StackedDocIdsWithValues {
column_indexes,
stack_merge_order,
}),
num_rows,
}
}
struct StackedStartOffsets<'a> {
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
fn get_num_values_iterator<'a>(
column_index: &'a ColumnIndex,
num_docs: u32,
) -> Box<dyn Iterator<Item = u32> + 'a> {
match column_index {
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
ColumnIndex::Optional(optional_index) => {
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
}
ColumnIndex::Multivalued(multivalued_index) => Box::new(
multivalued_index
.get_start_index_column()
.iter()
.scan(0u32, |previous_start_offset, current_start_offset| {
let num_vals = current_start_offset - *previous_start_offset;
*previous_start_offset = current_start_offset;
Some(num_vals)
})
.skip(1),
),
}
}
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
let column_index = &self.column_indexes[columnar_id];
get_num_values_iterator(column_index, num_docs)
});
Box::new(std::iter::once(0u32).chain(num_values_it.into_iter().scan(
0u32,
|cumulated, el| {
*cumulated += el;
Some(*cumulated)
},
)))
}
}
fn stack_start_offsets<'a>(
column_indexes: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> Box<dyn Iterable<u32> + 'a> {
Box::new(StackedStartOffsets {
column_indexes,
stack_merge_order,
})
}
fn make_serializable_multivalued_index<'a>(
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
) -> SerializableMultivalueIndex<'a> {
SerializableMultivalueIndex {
doc_ids_with_values: stack_doc_ids_with_values(columns, stack_merge_order),
start_offsets: stack_start_offsets(columns, stack_merge_order),
}
}
struct StackedOptionalIndex<'a> {
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
@@ -62,87 +190,3 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
)
}
}
#[derive(Clone, Copy)]
struct StackedMultivaluedIndex<'a> {
columns: &'a [ColumnIndex],
stack_merge_order: &'a StackMergeOrder,
}
fn convert_column_opt_to_multivalued_index<'a>(
column_index_opt: &'a ColumnIndex,
num_rows: RowId,
) -> Box<dyn Iterator<Item = RowId> + 'a> {
match column_index_opt {
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
ColumnIndex::Full => Box::new(0..num_rows + 1),
ColumnIndex::Optional(optional_index) => {
Box::new(
(0..num_rows)
// TODO optimize
.map(|row_id| optional_index.rank(row_id))
.chain(std::iter::once(optional_index.num_non_nulls())),
)
}
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
}
}
impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> {
let multivalued_indexes =
self.columns
.iter()
.enumerate()
.map(|(columnar_id, column_opt)| {
let num_rows =
self.stack_merge_order.columnar_range(columnar_id).len() as RowId;
convert_column_opt_to_multivalued_index(column_opt, num_rows)
});
stack_multivalued_indexes(multivalued_indexes)
}
}
// Refactor me
fn stack_multivalued_indexes<'a>(
mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
) -> Box<dyn Iterator<Item = RowId> + 'a> {
let mut offset = 0;
let mut last_row_id = 0;
let mut current_it = multivalued_indexes.next();
Box::new(std::iter::from_fn(move || loop {
if let Some(row_id) = current_it.as_mut()?.next() {
last_row_id = offset + row_id;
return Some(last_row_id);
}
offset = last_row_id;
loop {
current_it = multivalued_indexes.next();
if current_it.as_mut()?.next().is_some() {
break;
}
}
}))
}
#[cfg(test)]
mod tests {
use crate::RowId;
fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> {
Box::new(row_ids.iter().copied())
}
#[test]
fn test_stack() {
let columns = [
it(&[0u32, 0u32]),
it(&[0u32, 1u32, 1u32, 4u32]),
it(&[0u32, 3u32, 5u32]),
it(&[0u32, 4u32]),
]
.into_iter();
let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]);
}
}

View File

@@ -11,8 +11,11 @@ mod serialize;
use std::ops::Range;
pub use merge::merge_column_index;
pub(crate) use multivalued_index::SerializableMultivalueIndex;
pub use optional_index::{OptionalIndex, Set};
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
pub use serialize::{
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
};
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::{Cardinality, DocId, RowId};
@@ -131,15 +134,41 @@ impl ColumnIndex {
let row_end = optional_index.rank(doc_id_range.end);
row_start..row_end
}
ColumnIndex::Multivalued(multivalued_index) => {
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1;
let start_docid = doc_id_range.start.min(end_docid);
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
MultiValueIndex::MultiValueIndexV1(index) => {
let row_start = index.start_index_column.get_val(doc_id_range.start);
let row_end = index.start_index_column.get_val(doc_id_range.end);
row_start..row_end
}
MultiValueIndex::MultiValueIndexV2(index) => {
// In this case we will use the optional_index select the next values
// that are valid. There are different cases to consider:
// Not exists below means does not exist in the optional
// index, because it has no values.
// * doc_id_range may cover a range of docids which are non existent
// => rank
// will give us the next document outside the range with a value. They both
// get the same rank and therefore return a zero range
//
// * doc_id_range.start and doc_id_range.end may not exist, but docids in
// between may have values
// => rank will give us the next document outside the range with a value.
//
// * doc_id_range.start may be not existent but doc_id_range.end may exist
// * doc_id_range.start may exist but doc_id_range.end may not exist
// * doc_id_range.start and doc_id_range.end may exist
// => rank on doc_id_range.end will give us the next value, which matches
// how the `start_index_column` works, so we get the value start of the next
// docid which we use to create the exclusive range.
//
let rank_start = index.optional_index.rank(doc_id_range.start);
let row_start = index.start_index_column.get_val(rank_start);
let rank_end = index.optional_index.rank(doc_id_range.end);
let row_end = index.start_index_column.get_val(rank_end);
let row_start = multivalued_index.start_index_column.get_val(start_docid);
let row_end = multivalued_index.start_index_column.get_val(end_docid);
row_start..row_end
}
row_start..row_end
}
},
}
}

View File

@@ -3,64 +3,98 @@ use std::io::Write;
use std::ops::Range;
use std::sync::Arc;
use common::OwnedBytes;
use common::{CountingWriter, OwnedBytes};
use super::optional_index::{open_optional_index, serialize_optional_index};
use super::{OptionalIndex, SerializableOptionalIndex, Set};
use crate::column_values::{
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
};
use crate::iterable::Iterable;
use crate::{DocId, RowId};
use crate::{DocId, RowId, Version};
pub struct SerializableMultivalueIndex<'a> {
pub doc_ids_with_values: SerializableOptionalIndex<'a>,
pub start_offsets: Box<dyn Iterable<u32> + 'a>,
}
pub fn serialize_multivalued_index(
multivalued_index: &dyn Iterable<RowId>,
multivalued_index: &SerializableMultivalueIndex,
output: &mut impl Write,
) -> io::Result<()> {
let SerializableMultivalueIndex {
doc_ids_with_values,
start_offsets,
} = multivalued_index;
let mut count_writer = CountingWriter::wrap(output);
let SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} = doc_ids_with_values;
serialize_optional_index(&**non_null_row_ids, *num_rows, &mut count_writer)?;
let optional_len = count_writer.written_bytes() as u32;
let output = count_writer.finish();
serialize_u64_based_column_values(
multivalued_index,
&**start_offsets,
&[CodecType::Bitpacked, CodecType::Linear],
output,
)?;
output.write_all(&optional_len.to_le_bytes())?;
Ok(())
}
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
Ok(MultiValueIndex { start_index_column })
pub fn open_multivalued_index(
bytes: OwnedBytes,
format_version: Version,
) -> io::Result<MultiValueIndex> {
match format_version {
Version::V1 => {
let start_index_column: Arc<dyn ColumnValues<RowId>> =
load_u64_based_column_values(bytes)?;
Ok(MultiValueIndex::MultiValueIndexV1(MultiValueIndexV1 {
start_index_column,
}))
}
Version::V2 => {
let (body_bytes, optional_index_len) = bytes.rsplit(4);
let optional_index_len =
u32::from_le_bytes(optional_index_len.as_slice().try_into().unwrap());
let (optional_index_bytes, start_index_bytes) =
body_bytes.split(optional_index_len as usize);
let optional_index = open_optional_index(optional_index_bytes)?;
let start_index_column: Arc<dyn ColumnValues<RowId>> =
load_u64_based_column_values(start_index_bytes)?;
Ok(MultiValueIndex::MultiValueIndexV2(MultiValueIndexV2 {
optional_index,
start_index_column,
}))
}
}
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndex {
pub enum MultiValueIndex {
MultiValueIndexV1(MultiValueIndexV1),
MultiValueIndexV2(MultiValueIndexV2),
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndexV1 {
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl std::fmt::Debug for MultiValueIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("MultiValuedIndex")
.field("num_rows", &self.start_index_column.num_vals())
.finish_non_exhaustive()
}
}
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
MultiValueIndex { start_index_column }
}
}
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
let mut buffer = Vec::new();
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes).unwrap()
}
impl MultiValueIndexV1 {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
if doc_id >= self.num_docs() {
return 0..0;
}
let start = self.start_index_column.get_val(doc_id);
let end = self.start_index_column.get_val(doc_id + 1);
start..end
@@ -83,7 +117,6 @@ impl MultiValueIndex {
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
#[allow(clippy::bool_to_int_with_if)]
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
@@ -111,11 +144,170 @@ impl MultiValueIndex {
}
}
#[derive(Clone)]
/// Index to resolve value range for given doc_id.
/// Starts at 0.
pub struct MultiValueIndexV2 {
pub optional_index: OptionalIndex,
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
}
impl std::fmt::Debug for MultiValueIndex {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
let index = match self {
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
};
f.debug_struct("MultiValuedIndex")
.field("num_rows", &index.num_vals())
.finish_non_exhaustive()
}
}
impl MultiValueIndex {
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
assert!(!start_offsets.is_empty());
assert_eq!(start_offsets[0], 0);
let mut doc_with_values = Vec::new();
let mut compact_start_offsets: Vec<u32> = vec![0];
for doc in 0..start_offsets.len() - 1 {
if start_offsets[doc] < start_offsets[doc + 1] {
doc_with_values.push(doc as RowId);
compact_start_offsets.push(start_offsets[doc + 1]);
}
}
let serializable_multivalued_index = SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids: Box::new(&doc_with_values[..]),
num_rows: start_offsets.len() as u32 - 1,
},
start_offsets: Box::new(&compact_start_offsets[..]),
};
let mut buffer = Vec::new();
serialize_multivalued_index(&serializable_multivalued_index, &mut buffer).unwrap();
let bytes = OwnedBytes::new(buffer);
open_multivalued_index(bytes, Version::V2).unwrap()
}
pub fn get_start_index_column(&self) -> &Arc<dyn crate::ColumnValues<RowId>> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
}
}
/// Returns `[start, end)` values range, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => idx.range(doc_id),
MultiValueIndex::MultiValueIndexV2(idx) => idx.range(doc_id),
}
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => idx.start_index_column.num_vals() - 1,
MultiValueIndex::MultiValueIndexV2(idx) => idx.optional_index.num_docs(),
}
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// docids. Positions are converted inplace to docids.
///
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
/// index.
///
/// Correctness: positions needs to be sorted. idx_reader needs to contain monotonically
/// increasing positions.
///
/// TODO: Instead of a linear scan we can employ a exponential search into binary search to
/// match a docid to its value position.
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
match self {
MultiValueIndex::MultiValueIndexV1(idx) => {
idx.select_batch_in_place(docid_start, ranks)
}
MultiValueIndex::MultiValueIndexV2(idx) => {
idx.select_batch_in_place(docid_start, ranks)
}
}
}
}
impl MultiValueIndexV2 {
/// Returns `[start, end)`, such that the values associated with
/// the given document are `start..end`.
#[inline]
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
let Some(rank) = self.optional_index.rank_if_exists(doc_id) else {
return 0..0;
};
let start = self.start_index_column.get_val(rank);
let end = self.start_index_column.get_val(rank + 1);
start..end
}
/// Returns the number of documents in the index.
#[inline]
pub fn num_docs(&self) -> u32 {
self.optional_index.num_docs()
}
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
/// doc ids. The ranks are converted in place to doc ids.
///
/// Since there is no index from value position to doc id, only from doc id to value position
/// range, we scan the index.
///
/// Correctness: `ranks` must be sorted, and the index must contain monotonically increasing
/// positions.
///
/// TODO: Instead of a linear scan we could use an exponential search followed by a binary
/// search to match a doc id to its value position.
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
if ranks.is_empty() {
return;
}
let mut cur_pos_in_idx = self.optional_index.rank(docid_start);
let mut last_doc = None;
assert!(cur_pos_in_idx <= ranks[0]);
let mut write_doc_pos = 0;
for i in 0..ranks.len() {
let pos = ranks[i];
loop {
let end = self.start_index_column.get_val(cur_pos_in_idx + 1);
if end > pos {
ranks[write_doc_pos] = cur_pos_in_idx;
write_doc_pos += if last_doc == Some(cur_pos_in_idx) {
0
} else {
1
};
last_doc = Some(cur_pos_in_idx);
break;
}
cur_pos_in_idx += 1;
}
}
ranks.truncate(write_doc_pos);
for rank in ranks.iter_mut() {
*rank = self.optional_index.select(*rank);
}
}
}
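To make the rank-to-doc-id conversion concrete, a crate-internal sketch reusing the hypothetical offsets from the sketch above:

#[test]
fn select_batch_in_place_sketch() {
    // Value rows 0..2 belong to doc 0, value rows 2..5 to doc 2; doc 1 is empty.
    let index = MultiValueIndex::for_test(&[0, 2, 2, 5]);
    let mut ranks = vec![1, 3, 4];
    index.select_batch_in_place(0, &mut ranks);
    // Value rows 1, 3 and 4 live in docs 0, 2 and 2; the duplicate doc is collapsed.
    assert_eq!(ranks, vec![0, 2]);
}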
#[cfg(test)]
mod tests {
use std::ops::Range;
use super::MultiValueIndex;
use crate::{ColumnarReader, DynamicColumn};
fn index_to_pos_helper(
index: &MultiValueIndex,
@@ -134,6 +326,7 @@ mod tests {
let positions = &[10u32, 11, 15, 20, 21, 22];
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
@@ -141,4 +334,67 @@ mod tests {
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
}
#[test]
fn test_range_to_rowids() {
use crate::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
// This column gets coerced to u64
columnar_writer.record_numerical(1, "full", u64::MAX);
columnar_writer.record_numerical(1, "full", u64::MAX);
columnar_writer.record_numerical(5, "full", u64::MAX);
columnar_writer.record_numerical(5, "full", u64::MAX);
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(7, &mut wrt).unwrap();
let reader = ColumnarReader::open(wrt).unwrap();
// Open the column as u64
let column = reader.read_columns("full").unwrap()[0]
.open()
.unwrap()
.coerce_numerical(crate::NumericalType::U64)
.unwrap();
let DynamicColumn::U64(column) = column else {
panic!();
};
let row_id_range = column.index.docid_range_to_rowids(1..2);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(0..2);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(0..4);
assert_eq!(row_id_range, 0..2);
let row_id_range = column.index.docid_range_to_rowids(3..4);
assert_eq!(row_id_range, 2..2);
let row_id_range = column.index.docid_range_to_rowids(1..6);
assert_eq!(row_id_range, 0..4);
let row_id_range = column.index.docid_range_to_rowids(3..6);
assert_eq!(row_id_range, 2..4);
let row_id_range = column.index.docid_range_to_rowids(0..6);
assert_eq!(row_id_range, 0..4);
let row_id_range = column.index.docid_range_to_rowids(0..6);
assert_eq!(row_id_range, 0..4);
let check = |range, expected| {
let full_range = 0..=u64::MAX;
let mut docids = Vec::new();
column.get_docids_for_value_range(full_range, range, &mut docids);
assert_eq!(docids, expected);
};
// check(0..1, vec![]);
// check(0..2, vec![1]);
check(1..2, vec![1]);
}
}

View File

@@ -86,8 +86,14 @@ pub struct OptionalIndex {
block_metas: Arc<[BlockMeta]>,
}
impl<'a> Iterable<u32> for &'a OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_rows())
}
}
impl std::fmt::Debug for OptionalIndex {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("OptionalIndex")
.field("num_rows", &self.num_rows)
.field("num_non_null_rows", &self.num_non_null_rows)
@@ -196,6 +202,7 @@ impl Set<RowId> for OptionalIndex {
} = row_addr_from_row_id(doc_id);
let block_meta = self.block_metas[block_id as usize];
let block = self.block(block_meta);
let block_offset_row_id = match block {
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),

View File

@@ -28,10 +28,11 @@ pub trait Set<T> {
/// Returns true if the element is contained in the set
fn contains(&self, el: T) -> bool;
/// Returns the number of rows in the set that are < `el`
/// Returns the element's rank (its position in the set).
/// If the set does not contain the element, it will return the next existing element's rank.
fn rank(&self, el: T) -> T;
/// If the set contains `el` returns the element rank.
/// If the set contains `el`, returns the element's rank (its position in the set).
/// If the set does not contain the element, it returns `None`.
fn rank_if_exists(&self, el: T) -> Option<T>;
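To illustrate the contract, a sketch reusing the `DenseBlockCodec` test helpers exercised in the codec tests further below; the element values are hypothetical:

let mut buffer = Vec::new();
DenseBlockCodec::serialize([1u16, 10].iter().copied(), &mut buffer).unwrap();
let set = DenseBlockCodec::open(buffer.as_slice());
assert_eq!(set.rank(5), 1); // 5 is absent; the next existing element, 10, has rank 1
assert_eq!(set.rank_if_exists(5), None);
assert_eq!(set.rank_if_exists(10), Some(1));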

View File

@@ -22,8 +22,8 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
vals.iter().cloned().take_while(|v| *v < val).count() as u16
);
}
for rank in 0..vals.len() {
assert_eq!(tested_set.select(rank as u16), vals[rank]);
for (rank, val) in vals.iter().enumerate() {
assert_eq!(tested_set.select(rank as u16), *val);
}
buffer.len()
}
@@ -107,3 +107,41 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
assert_eq!(i, select_cursor.select(i));
}
}
#[test]
fn test_simple_translate_idx_to_value_idx_dense() {
let mut buffer = Vec::new();
DenseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
let tested_set = DenseBlockCodec::open(buffer.as_slice());
assert!(tested_set.contains(1));
assert!(!tested_set.contains(2));
assert_eq!(tested_set.rank(0), 0);
assert_eq!(tested_set.rank(1), 0);
for rank in 2..10 {
// elements not in the set: `rank` returns the next existing element's rank
assert_eq!(tested_set.rank_if_exists(rank), None);
assert_eq!(tested_set.rank(rank), 1);
}
assert_eq!(tested_set.rank(10), 1);
}
#[test]
fn test_simple_translate_idx_to_value_idx_sparse() {
let mut buffer = Vec::new();
SparseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
let tested_set = SparseBlockCodec::open(buffer.as_slice());
assert!(tested_set.contains(1));
assert!(!tested_set.contains(2));
assert_eq!(tested_set.rank(0), 0);
assert_eq!(tested_set.select(tested_set.rank(0)), 1);
assert_eq!(tested_set.rank(1), 0);
assert_eq!(tested_set.select(tested_set.rank(1)), 1);
for rank in 2..10 {
// elements not in the set: `rank` returns the next existing element's rank, so select() yields 10
assert_eq!(tested_set.rank_if_exists(rank), None);
assert_eq!(tested_set.rank(rank), 1);
assert_eq!(tested_set.select(tested_set.rank(rank)), 10);
}
assert_eq!(tested_set.rank(10), 1);
assert_eq!(tested_set.select(tested_set.rank(10)), 10);
}

View File

@@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(100, "score", 80i64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_docs, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();

View File

@@ -3,28 +3,39 @@ use std::io::Write;
use common::{CountingWriter, OwnedBytes};
use super::multivalued_index::SerializableMultivalueIndex;
use super::OptionalIndex;
use crate::column_index::multivalued_index::serialize_multivalued_index;
use crate::column_index::optional_index::serialize_optional_index;
use crate::column_index::ColumnIndex;
use crate::iterable::Iterable;
use crate::{Cardinality, RowId};
use crate::{Cardinality, RowId, Version};
pub struct SerializableOptionalIndex<'a> {
pub non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
pub num_rows: RowId,
}
impl<'a> From<&'a OptionalIndex> for SerializableOptionalIndex<'a> {
fn from(optional_index: &'a OptionalIndex) -> Self {
SerializableOptionalIndex {
non_null_row_ids: Box::new(optional_index),
num_rows: optional_index.num_docs(),
}
}
}
pub enum SerializableColumnIndex<'a> {
Full,
Optional {
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
num_rows: RowId,
},
// TODO remove the Arc<dyn> apart from serialization this is not
// dynamic at all.
Multivalued(Box<dyn Iterable<RowId> + 'a>),
Optional(SerializableOptionalIndex<'a>),
Multivalued(SerializableMultivalueIndex<'a>),
}
impl<'a> SerializableColumnIndex<'a> {
pub fn get_cardinality(&self) -> Cardinality {
match self {
SerializableColumnIndex::Full => Cardinality::Full,
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
SerializableColumnIndex::Optional(_) => Cardinality::Optional,
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
}
}
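A sketch of how the new `Optional` variant is assembled (hypothetical row ids; `&[RowId]` implements `Iterable<RowId>`):

let non_null_rows: Vec<RowId> = vec![0, 2, 5];
let column_index = SerializableColumnIndex::Optional(SerializableOptionalIndex {
    non_null_row_ids: Box::new(&non_null_rows[..]),
    num_rows: 6,
});
assert_eq!(column_index.get_cardinality(), Cardinality::Optional);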
@@ -40,12 +51,12 @@ pub fn serialize_column_index(
output.write_all(&[cardinality])?;
match column_index {
SerializableColumnIndex::Full => {}
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids,
num_rows,
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
}) => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
SerializableColumnIndex::Multivalued(multivalued_index) => {
serialize_multivalued_index(&*multivalued_index, &mut output)?
serialize_multivalued_index(&multivalued_index, &mut output)?
}
}
let column_index_num_bytes = output.written_bytes() as u32;
@@ -53,7 +64,10 @@ pub fn serialize_column_index(
}
/// Open a serialized column index.
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
pub fn open_column_index(
mut bytes: OwnedBytes,
format_version: Version,
) -> io::Result<ColumnIndex> {
if bytes.is_empty() {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
@@ -70,7 +84,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
Ok(ColumnIndex::Optional(optional_index))
}
Cardinality::Multivalued => {
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
let multivalue_index =
super::multivalued_index::open_multivalued_index(bytes, format_version)?;
Ok(ColumnIndex::Multivalued(multivalue_index))
}
}
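Callers now thread the footer's format version through when opening; a minimal sketch, assuming `buffer` holds bytes written by `serialize_column_index` in the current format:

let column_index: ColumnIndex = open_column_index(OwnedBytes::new(buffer), Version::V2)?;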

View File

@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
for val in data {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
codec_serializer
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
.unwrap();
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {

View File

@@ -1,3 +1,6 @@
use core::fmt;
use std::fmt::{Display, Formatter};
use crate::InvalidData;
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
@@ -8,7 +11,7 @@ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
footer_bytes[0..4].copy_from_slice(&Version::V1.to_bytes());
footer_bytes[0..4].copy_from_slice(&CURRENT_VERSION.to_bytes());
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
footer_bytes
}
@@ -20,10 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result<Vers
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
}
pub const CURRENT_VERSION: Version = Version::V2;
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
#[repr(u32)]
pub enum Version {
V1 = 1u32,
V2 = 2u32,
}
impl Display for Version {
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
match self {
Version::V1 => write!(f, "v1"),
Version::V2 => write!(f, "v2"),
}
}
}
impl Version {
@@ -35,6 +50,7 @@ impl Version {
let code = u32::from_le_bytes(bytes);
match code {
1u32 => Ok(Version::V1),
2u32 => Ok(Version::V2),
_ => Err(InvalidData),
}
}
@@ -47,9 +63,9 @@ mod tests {
use super::*;
#[test]
fn test_footer_dserialization() {
fn test_footer_deserialization() {
let parsed_version: Version = parse_footer(footer()).unwrap();
assert_eq!(Version::V1, parsed_version);
assert_eq!(Version::V2, parsed_version);
}
#[test]
@@ -63,11 +79,10 @@ mod tests {
for &i in &version_to_tests {
let version_res = Version::try_from_bytes(i.to_le_bytes());
if let Ok(version) = version_res {
assert_eq!(version, Version::V1);
assert_eq!(version.to_bytes(), i.to_le_bytes());
valid_versions.insert(i);
}
}
assert_eq!(valid_versions.len(), 1);
assert_eq!(valid_versions.len(), 2);
}
}
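For reference, a sketch of the footer layout implied by the code above (a little-endian version code followed by the magic bytes):

let footer_bytes = footer();
assert_eq!(&footer_bytes[0..4], &2u32.to_le_bytes()); // CURRENT_VERSION == Version::V2
assert_eq!(&footer_bytes[4..8], &[2, 113, 119, 66]); // MAGIC_BYTES
assert_eq!(parse_footer(footer_bytes).unwrap(), Version::V2);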

View File

@@ -7,7 +7,6 @@ use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer;
@@ -371,20 +370,8 @@ fn is_empty_after_merge(
true
}
ColumnIndex::Multivalued(multivalued_index) => {
for (doc_id, (start_index, end_index)) in multivalued_index
.start_index_column
.iter()
.tuple_windows()
.enumerate()
{
let doc_id = doc_id as u32;
if start_index == end_index {
// There are no values in this document
continue;
}
// The document contains values and is present in the alive bitset.
// The column is therefore not empty.
if alive_bitset.contains(doc_id) {
for alive_docid in alive_bitset.iter() {
if !multivalued_index.range(alive_docid).is_empty() {
return false;
}
}

View File

@@ -1,3 +1,5 @@
use itertools::Itertools;
use super::*;
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
@@ -12,7 +14,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(vals.len() as RowId, None, &mut buffer)
.serialize(vals.len() as RowId, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -157,9 +159,7 @@ fn make_numerical_columnar_multiple_columns(
.max()
.unwrap_or(0u32);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -182,9 +182,7 @@ fn make_byte_columnar_multiple_columns(
}
}
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
@@ -203,9 +201,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
.max()
.unwrap_or(0u32);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer
.serialize(num_rows, None, &mut buffer)
.unwrap();
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}

View File

@@ -5,6 +5,7 @@ mod reader;
mod writer;
pub use column_type::{ColumnType, HasAssociatedColumnType};
pub use format_version::{Version, CURRENT_VERSION};
#[cfg(test)]
pub(crate) use merge::ColumnTypeCategory;
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};

View File

@@ -6,7 +6,7 @@ use sstable::{Dictionary, RangeSSTable};
use crate::columnar::{format_version, ColumnType};
use crate::dynamic_column::DynamicColumnHandle;
use crate::RowId;
use crate::{RowId, Version};
fn io_invalid_data(msg: String) -> io::Error {
io::Error::new(io::ErrorKind::InvalidData, msg)
@@ -19,6 +19,7 @@ pub struct ColumnarReader {
column_dictionary: Dictionary<RangeSSTable>,
column_data: FileSlice,
num_rows: RowId,
format_version: Version,
}
impl fmt::Debug for ColumnarReader {
@@ -53,6 +54,7 @@ impl fmt::Debug for ColumnarReader {
fn read_all_columns_in_stream(
mut stream: sstable::Streamer<'_, RangeSSTable>,
column_data: &FileSlice,
format_version: Version,
) -> io::Result<Vec<DynamicColumnHandle>> {
let mut results = Vec::new();
while stream.advance() {
@@ -67,6 +69,7 @@ fn read_all_columns_in_stream(
let dynamic_column_handle = DynamicColumnHandle {
file_slice,
column_type,
format_version,
};
results.push(dynamic_column_handle);
}
@@ -88,7 +91,7 @@ impl ColumnarReader {
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
footer_bytes[12..].try_into().unwrap();
let _version = format_version::parse_footer(version_footer_bytes)?;
let format_version = format_version::parse_footer(version_footer_bytes)?;
let (column_data, sstable) =
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
let column_dictionary = Dictionary::open(sstable)?;
@@ -96,6 +99,7 @@ impl ColumnarReader {
column_dictionary,
column_data,
num_rows,
format_version,
})
}
@@ -126,6 +130,7 @@ impl ColumnarReader {
let column_handle = DynamicColumnHandle {
file_slice,
column_type,
format_version: self.format_version,
};
Some((column_name, column_handle))
} else {
@@ -167,7 +172,7 @@ impl ColumnarReader {
.stream_for_column_range(column_name)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data)
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Get all columns for the given column name.
@@ -176,7 +181,7 @@ impl ColumnarReader {
/// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let stream = self.stream_for_column_range(column_name).into_stream()?;
read_all_columns_in_stream(stream, &self.column_data)
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Return the number of columns in the columnar.
@@ -195,7 +200,7 @@ mod tests {
columnar_writer.record_column_type("col1", ColumnType::Str, false);
columnar_writer.record_column_type("col2", ColumnType::U64, false);
let mut buffer = Vec::new();
columnar_writer.serialize(1, None, &mut buffer).unwrap();
columnar_writer.serialize(1, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let columns = columnar.list_columns().unwrap();
assert_eq!(columns.len(), 2);
@@ -211,7 +216,7 @@ mod tests {
columnar_writer.record_column_type("count", ColumnType::U64, false);
columnar_writer.record_numerical(1, "count", 1u64);
let mut buffer = Vec::new();
columnar_writer.serialize(2, None, &mut buffer).unwrap();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
let columns = columnar.list_columns().unwrap();
assert_eq!(columns.len(), 1);

View File

@@ -41,31 +41,10 @@ impl ColumnWriter {
pub(super) fn operation_iterator<'a, V: SymbolValue>(
&self,
arena: &MemoryArena,
old_to_new_ids_opt: Option<&[RowId]>,
buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
buffer.clear();
self.values.read_to_end(arena, buffer);
if let Some(old_to_new_ids) = old_to_new_ids_opt {
// TODO avoid the extra deserialization / serialization.
let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
let mut new_doc = 0u32;
let mut cursor = &buffer[..];
for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
if let ColumnOperation::NewDoc(doc) = &op {
new_doc = old_to_new_ids[*doc as usize];
sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
} else {
sorted_ops.push((new_doc, op));
}
}
// stable sort is crucial here.
sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
buffer.clear();
for (_, op) in sorted_ops {
buffer.extend_from_slice(op.serialize().as_ref());
}
}
let mut cursor: &[u8] = &buffer[..];
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
}
@@ -231,11 +210,9 @@ impl NumericalColumnWriter {
pub(super) fn operation_iterator<'a>(
self,
arena: &MemoryArena,
old_to_new_ids: Option<&[RowId]>,
buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
self.column_writer
.operation_iterator(arena, old_to_new_ids, buffer)
self.column_writer.operation_iterator(arena, buffer)
}
}
@@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter {
pub(super) fn operation_iterator<'a>(
&self,
arena: &MemoryArena,
old_to_new_ids: Option<&[RowId]>,
byte_buffer: &'a mut Vec<u8>,
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
self.column_writer
.operation_iterator(arena, old_to_new_ids, byte_buffer)
self.column_writer.operation_iterator(arena, byte_buffer)
}
}

View File

@@ -8,11 +8,12 @@ use std::net::Ipv6Addr;
use column_operation::ColumnOperation;
pub(crate) use column_writers::CompatibleNumericalTypes;
use common::json_path_writer::JSON_END_OF_PATH;
use common::CountingWriter;
pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena};
use crate::column_index::SerializableColumnIndex;
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use crate::columnar::column_type::ColumnType;
use crate::columnar::writer::column_writers::{
@@ -43,7 +44,7 @@ struct SpareBuffers {
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integers and floats.
/// let mut wrt: Vec<u8> = Vec::new();
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
/// ```
#[derive(Default)]
pub struct ColumnarWriter {
@@ -75,63 +76,6 @@ impl ColumnarWriter {
.sum::<usize>()
}
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
/// column.
///
/// If the column is multivalued, use the first value for scoring.
/// If no value is associated to a specific row, the document is assigned
/// the lowest possible score.
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
.or_else(|| {
self.datetime_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
})
else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();
let mut start_doc_check_fill = 0;
let mut current_doc_opt: Option<RowId> = None;
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
// calls
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
match op {
ColumnOperation::NewDoc(doc) => {
current_doc_opt = Some(doc);
}
ColumnOperation::Value(numerical_value) => {
if let Some(current_doc) = current_doc_opt {
// Fill up with 0.0 since last doc
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
start_doc_check_fill = current_doc + 1;
// handle multi values
current_doc_opt = None;
let score: f32 = f64::coerce(numerical_value) as f32;
values.push((score, current_doc));
}
}
}
}
for doc in values.len() as u32..num_docs {
values.push((0.0f32, doc));
}
values.sort_by(|(left_score, _), (right_score, _)| {
if reversed {
right_score.total_cmp(left_score)
} else {
left_score.total_cmp(right_score)
}
});
values.into_iter().map(|(_score, doc)| doc).collect()
}
/// Records a column type. This is useful to bypass the coercion process,
/// to make sure the column is present in the resulting columnar even if it is empty, or
/// to set `sort_values_within_row`.
@@ -302,13 +246,9 @@ impl ColumnarWriter {
},
);
}
pub fn serialize(
&mut self,
num_docs: RowId,
old_to_new_row_ids: Option<&[RowId]>,
wrt: &mut dyn io::Write,
) -> io::Result<()> {
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
let mut serializer = ColumnarSerializer::new(wrt);
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
.numerical_field_hash_map
.iter()
@@ -322,7 +262,7 @@ impl ColumnarWriter {
columns.extend(
self.bytes_field_hash_map
.iter()
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
.map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
);
columns.extend(
self.str_field_hash_map
@@ -349,6 +289,12 @@ impl ColumnarWriter {
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
if column_name.contains(&JSON_END_OF_PATH) {
// Tantivy uses the 0 byte (`JSON_END_OF_PATH`) as a separator for nested fields in
// JSON. Column names containing this byte are not supported and are simply ignored
// by the columnar (and the inverted index).
continue;
}
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
@@ -358,11 +304,7 @@ impl ColumnarWriter {
serialize_bool_column(
cardinality,
num_docs,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -376,11 +318,7 @@ impl ColumnarWriter {
serialize_ip_addr_column(
cardinality,
num_docs,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -405,11 +343,8 @@ impl ColumnarWriter {
num_docs,
str_or_bytes_column_writer.sort_values_within_row,
dictionary_builder,
str_or_bytes_column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
str_or_bytes_column_writer
.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&self.arena,
&mut column_serializer,
@@ -427,11 +362,7 @@ impl ColumnarWriter {
cardinality,
num_docs,
numerical_type,
numerical_column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -446,11 +377,7 @@ impl ColumnarWriter {
cardinality,
num_docs,
NumericalType::I64,
column_writer.operation_iterator(
arena,
old_to_new_row_ids,
&mut symbol_byte_buffer,
),
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
buffers,
&mut column_serializer,
)?;
@@ -635,16 +562,16 @@ fn send_to_serialize_column_mappable_to_u128<
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
consume_operation_iterator(op_iterator, optional_index_builder, values);
let optional_index = optional_index_builder.finish(num_rows);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
num_rows,
non_null_row_ids: Box::new(optional_index),
}
})
}
Cardinality::Multivalued => {
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
}
};
crate::column::serialize_column_mappable_to_u128(
@@ -655,15 +582,6 @@ fn send_to_serialize_column_mappable_to_u128<
Ok(())
}
fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
let mut start_index: usize = 0;
for end_index in multivalued_index.iter().copied() {
let end_index = end_index as usize;
values[start_index..end_index].sort_unstable();
start_index = end_index;
}
}
fn send_to_serialize_column_mappable_to_u64(
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
cardinality: Cardinality,
@@ -687,19 +605,22 @@ fn send_to_serialize_column_mappable_to_u64(
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
consume_operation_iterator(op_iterator, optional_index_builder, values);
let optional_index = optional_index_builder.finish(num_rows);
SerializableColumnIndex::Optional {
SerializableColumnIndex::Optional(SerializableOptionalIndex {
non_null_row_ids: Box::new(optional_index),
num_rows,
}
})
}
Cardinality::Multivalued => {
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
let multivalued_index = multivalued_index_builder.finish(num_rows);
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
if sort_values_within_row {
sort_values_within_row_in_place(multivalued_index, values);
sort_values_within_row_in_place(
serializable_multivalued_index.start_offsets.boxed_iter(),
values,
);
}
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
}
};
crate::column::serialize_column_mappable_to_u64(
@@ -710,6 +631,18 @@ fn send_to_serialize_column_mappable_to_u64(
Ok(())
}
fn sort_values_within_row_in_place(
multivalued_index: impl Iterator<Item = RowId>,
values: &mut [u64],
) {
let mut start_index: usize = 0;
for end_index in multivalued_index {
let end_index = end_index as usize;
values[start_index..end_index].sort_unstable();
start_index = end_index;
}
}
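The builder now hands this function an iterator over row end offsets; a sketch with hypothetical offsets:

// End offsets [2, 3, 5] delimit the rows [0..2), [2..3) and [3..5).
let mut values = vec![3u64, 1, 2, 5, 4];
sort_values_within_row_in_place([2u32, 3, 5].into_iter(), &mut values);
assert_eq!(values, vec![1, 3, 2, 4, 5]); // each row is sorted independently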
fn coerce_numerical_symbol<T>(
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
) -> impl Iterator<Item = ColumnOperation<u64>>
@@ -757,7 +690,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 6);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -786,7 +719,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 4);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
@@ -809,7 +742,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 2);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
@@ -828,7 +761,7 @@ mod tests {
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
let mut buffer = Vec::new();
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
.operation_iterator(&arena, None, &mut buffer)
.operation_iterator(&arena, &mut buffer)
.collect();
assert_eq!(symbols.len(), 3);
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));

View File

@@ -1,6 +1,7 @@
use std::io;
use std::io::Write;
use common::json_path_writer::JSON_END_OF_PATH;
use common::{BinarySerializable, CountingWriter};
use sstable::value::RangeValueWriter;
use sstable::RangeSSTable;
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
/// code.
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
buffer.clear();
// Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
if key.contains(&0u8) {
buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
} else {
buffer.extend_from_slice(key);
}
buffer.push(0u8);
buffer.extend_from_slice(key);
buffer.push(JSON_END_OF_PATH);
buffer.push(column_type.to_code());
}
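With the zero-byte escaping gone (such column names are now skipped upstream in `ColumnarWriter::serialize`), the key layout is simply the column name, the `JSON_END_OF_PATH` byte, and the type code; a sketch with a hypothetical column name:

let mut buffer = Vec::new();
prepare_key(b"price", ColumnType::U64, &mut buffer);
assert_eq!(&buffer[..5], b"price");
assert_eq!(buffer[5], JSON_END_OF_PATH); // the 0 byte terminating the path
assert_eq!(buffer[6], ColumnType::U64.to_code());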
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
self.columnar_serializer.wrt.write_all(buf)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root0child");
assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Str.to_code());
}
}

View File

@@ -1,3 +1,4 @@
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
use crate::iterable::Iterable;
use crate::RowId;
@@ -59,31 +60,47 @@ impl IndexBuilder for OptionalIndexBuilder {
#[derive(Default)]
pub struct MultivaluedIndexBuilder {
start_offsets: Vec<RowId>,
doc_with_values: Vec<RowId>,
start_offsets: Vec<u32>,
total_num_vals_seen: u32,
current_row: RowId,
current_row_has_value: bool,
}
impl MultivaluedIndexBuilder {
pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
self.start_offsets
.resize(num_docs as usize + 1, self.total_num_vals_seen);
&self.start_offsets[..]
pub fn finish(&mut self, num_docs: RowId) -> SerializableMultivalueIndex<'_> {
self.start_offsets.push(self.total_num_vals_seen);
let non_null_row_ids: Box<dyn Iterable<RowId>> = Box::new(&self.doc_with_values[..]);
SerializableMultivalueIndex {
doc_ids_with_values: SerializableOptionalIndex {
non_null_row_ids,
num_rows: num_docs,
},
start_offsets: Box::new(&self.start_offsets[..]),
}
}
fn reset(&mut self) {
self.doc_with_values.clear();
self.start_offsets.clear();
self.start_offsets.push(0u32);
self.total_num_vals_seen = 0;
self.current_row = 0;
self.current_row_has_value = false;
}
}
impl IndexBuilder for MultivaluedIndexBuilder {
fn record_row(&mut self, row_id: RowId) {
self.start_offsets
.resize(row_id as usize + 1, self.total_num_vals_seen);
self.current_row = row_id;
self.current_row_has_value = false;
}
fn record_value(&mut self) {
if !self.current_row_has_value {
self.current_row_has_value = true;
self.doc_with_values.push(self.current_row);
self.start_offsets.push(self.total_num_vals_seen);
}
self.total_num_vals_seen += 1;
}
}
@@ -141,6 +158,32 @@ mod tests {
);
}
#[test]
fn test_multivalued_value_index_builder_simple() {
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
{
multivalued_value_index_builder.record_row(0u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
let start_offsets: Vec<u32> = serialized_multivalue_index
.start_offsets
.boxed_iter()
.collect();
assert_eq!(&start_offsets, &[0, 2]);
}
multivalued_value_index_builder.reset();
multivalued_value_index_builder.record_row(0u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
let start_offsets: Vec<u32> = serialized_multivalue_index
.start_offsets
.boxed_iter()
.collect();
assert_eq!(&start_offsets, &[0, 2]);
}
#[test]
fn test_multivalued_value_index_builder() {
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
@@ -149,17 +192,15 @@ mod tests {
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_row(2u32);
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
vec![0, 0, 2, 3, 3]
);
multivalued_value_index_builder.reset();
multivalued_value_index_builder.record_row(2u32);
multivalued_value_index_builder.record_value();
multivalued_value_index_builder.record_value();
assert_eq!(
multivalued_value_index_builder.finish(4u32).to_vec(),
vec![0, 0, 0, 2, 2]
);
let SerializableMultivalueIndex {
doc_ids_with_values,
start_offsets,
} = multivalued_value_index_builder.finish(4u32);
assert_eq!(doc_ids_with_values.num_rows, 4u32);
let doc_ids_with_values: Vec<u32> =
doc_ids_with_values.non_null_row_ids.boxed_iter().collect();
assert_eq!(&doc_ids_with_values, &[1u32, 2u32]);
let start_offsets: Vec<u32> = start_offsets.boxed_iter().collect();
assert_eq!(&start_offsets[..], &[0, 2, 3]);
}
}

View File

@@ -0,0 +1,183 @@
use std::path::PathBuf;
use itertools::Itertools;
use crate::{
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
CURRENT_VERSION,
};
const NUM_DOCS: u32 = u16::MAX as u32;
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
use crate::ColumnarWriter;
let mut columnar_writer = ColumnarWriter::default();
for i in 0..num_docs {
if i % 100 == 0 {
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
}
if i % 5 == 0 {
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
}
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
}
let mut wrt: Vec<u8> = Vec::new();
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
wrt
}
#[test]
/// Writes a columnar for the CURRENT_VERSION to disk.
fn create_format() {
let version = CURRENT_VERSION.to_string();
let file_path = path_for_version(&version);
if PathBuf::from(file_path.clone()).exists() {
return;
}
let columnar = generate_columnar(NUM_DOCS, 0);
std::fs::write(file_path, columnar).unwrap();
}
fn path_for_version(version: &str) -> String {
format!("./compat_tests_data/{}.columnar", version)
}
#[test]
fn test_format_v1() {
let path = path_for_version("v1");
test_format(&path);
}
#[test]
fn test_format_v2() {
let path = path_for_version("v2");
test_format(&path);
}
fn test_format(path: &str) {
let file_content = std::fs::read(path).unwrap();
let reader = ColumnarReader::open(file_content).unwrap();
check_columns(&reader);
// Test merge
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
let columnar_readers = vec![&reader, &reader2];
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
let mut out = Vec::new();
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
let reader = ColumnarReader::open(out).unwrap();
check_columns(&reader);
}
fn check_columns(reader: &ColumnarReader) {
let column = open_column(reader, "full");
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
assert_eq!(column.get_cardinality(), Cardinality::Full);
let column = open_column(reader, "multi");
check_column(&column, |doc_id| {
vec![
(doc_id * 2, doc_id as u64).into(),
(doc_id * 2 + 1, doc_id as u64).into(),
]
});
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
let column = open_column(reader, "sparse");
check_column(&column, |doc_id| {
if doc_id % 100 == 0 {
vec![(doc_id / 100, doc_id as u64).into()]
} else {
vec![]
}
});
assert_eq!(column.get_cardinality(), Cardinality::Optional);
let column = open_column(reader, "dense");
check_column(&column, |doc_id| {
if doc_id % 5 == 0 {
vec![(doc_id / 5, doc_id as u64).into()]
} else {
vec![]
}
});
assert_eq!(column.get_cardinality(), Cardinality::Optional);
}
struct RowIdAndValue {
row_id: u32,
value: u64,
}
impl From<(u32, u64)> for RowIdAndValue {
fn from((row_id, value): (u32, u64)) -> Self {
Self { row_id, value }
}
}
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
let num_docs = column.num_docs();
let test_doc = |doc: u32| {
if expected(doc).is_empty() {
assert_eq!(column.first(doc), None);
} else {
assert_eq!(column.first(doc), Some(expected(doc)[0].value));
}
let values = column.values_for_doc(doc).collect_vec();
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
let mut row_ids = Vec::new();
column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
assert_eq!(
row_ids,
expected(doc).iter().map(|x| x.row_id).collect_vec()
);
let values = column.values_for_doc(doc).collect_vec();
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
// Doc id to row id conversion
let mut row_ids = Vec::new();
let safe_next_doc = |doc: u32| (doc + 1).min(num_docs - 1);
column
.index
.docids_to_rowids(&[doc, safe_next_doc(doc)], &mut vec![], &mut row_ids);
let expected_rowids = expected(doc)
.iter()
.map(|x| x.row_id)
.chain(expected(safe_next_doc(doc)).iter().map(|x| x.row_id))
.collect_vec();
assert_eq!(row_ids, expected_rowids);
let rowid_range = column
.index
.docid_range_to_rowids(doc..safe_next_doc(doc) + 1);
if expected_rowids.is_empty() {
assert!(rowid_range.is_empty());
} else {
assert_eq!(
rowid_range,
expected_rowids[0]..expected_rowids.last().unwrap() + 1
);
}
};
test_doc(0);
test_doc(num_docs - 1);
test_doc(num_docs - 2);
test_doc(65000);
}
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
let column = reader.read_columns(name).unwrap()[0]
.open()
.unwrap()
.coerce_numerical(crate::NumericalType::U64)
.unwrap();
let DynamicColumn::U64(column) = column else {
panic!();
};
column
}

View File

@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
use crate::column::{BytesColumn, Column, StrColumn};
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
use crate::columnar::ColumnType;
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType};
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
#[derive(Clone)]
pub enum DynamicColumn {
@@ -232,6 +232,7 @@ static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
pub struct DynamicColumnHandle {
pub(crate) file_slice: FileSlice,
pub(crate) column_type: ColumnType,
pub(crate) format_version: Version,
}
impl DynamicColumnHandle {
@@ -260,11 +261,15 @@ impl DynamicColumnHandle {
let column_bytes = self.file_slice.read_bytes()?;
match self.column_type {
ColumnType::Str | ColumnType::Bytes => {
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
let column: BytesColumn =
crate::column::open_column_bytes(column_bytes, self.format_version)?;
Ok(Some(column.term_ord_column))
}
ColumnType::IpAddr => {
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
let column = crate::column::open_column_u128_as_compact_u64(
column_bytes,
self.format_version,
)?;
Ok(Some(column))
}
ColumnType::Bool
@@ -272,7 +277,8 @@ impl DynamicColumnHandle {
| ColumnType::U64
| ColumnType::F64
| ColumnType::DateTime => {
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
let column =
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
Ok(Some(column))
}
}
@@ -280,15 +286,31 @@ impl DynamicColumnHandle {
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
let dynamic_column: DynamicColumn = match self.column_type {
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
ColumnType::Bytes => {
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
}
ColumnType::Str => {
crate::column::open_column_str(column_bytes, self.format_version)?.into()
}
ColumnType::I64 => {
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
}
ColumnType::U64 => {
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
}
ColumnType::F64 => {
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
}
ColumnType::Bool => {
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
}
ColumnType::IpAddr => {
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
.into()
}
ColumnType::DateTime => {
crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
.into()
}
};
Ok(dynamic_column)

View File

@@ -1,4 +1,7 @@
use std::ops::Range;
use std::sync::Arc;
use crate::{ColumnValues, RowId};
pub trait Iterable<T = u64> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
@@ -17,3 +20,9 @@ where Range<T>: Iterator<Item = T>
Box::new(self.clone())
}
}
impl Iterable for Arc<dyn crate::ColumnValues<RowId>> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.iter().map(|row_id| row_id as u64))
}
}

View File

@@ -48,7 +48,7 @@ pub use column_values::{
};
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
};
use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue};
@@ -131,3 +131,6 @@ impl Cardinality {
#[cfg(test)]
mod tests;
#[cfg(test)]
mod compat_tests;

View File

@@ -21,7 +21,7 @@ fn test_dataframe_writer_str() {
dataframe_writer.record_str(1u32, "my_string", "hello");
dataframe_writer.record_str(3u32, "my_string", "helloeee");
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
@@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() {
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
@@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() {
dataframe_writer.record_bool(1u32, "bool.value", false);
dataframe_writer.record_bool(3u32, "bool.value", true);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
@@ -74,12 +74,12 @@ fn test_dataframe_writer_u64_multivalued() {
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(7, None, &mut buffer).unwrap();
dataframe_writer.serialize(7, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
assert_eq!(cols[0].num_bytes(), 50);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
@@ -97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
dataframe_writer.serialize(5, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
@@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() {
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
let mut buffer: Vec<u8> = Vec::new();
dataframe_writer.serialize(6, None, &mut buffer).unwrap();
dataframe_writer.serialize(6, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
@@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() {
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
}
#[test]
fn test_dataframe_sort_by_full() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 2, false);
assert_eq!(data, vec![0, 1]);
}
#[test]
fn test_dataframe_sort_by_opt() {
let mut dataframe_writer = ColumnarWriter::default();
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
let data = dataframe_writer.sort_order("value", 5, false);
// 0, 2, 4 is 0.0
assert_eq!(data, vec![0, 2, 4, 3, 1]);
let data = dataframe_writer.sort_order("value", 5, true);
assert_eq!(
data,
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
);
}
#[test]
fn test_dataframe_sort_by_multi() {
let mut dataframe_writer = ColumnarWriter::default();
// valid for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
// those are ignored for sort
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
// valid for sort
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
// ignored, would change sort order
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
let data = dataframe_writer.sort_order("value", 4, false);
assert_eq!(data, vec![0, 2, 1, 3]);
}
#[test]
fn test_dictionary_encoded_str() {
let mut buffer = Vec::new();
@@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() {
columnar_writer.record_str(3, "my.column", "c");
columnar_writer.record_str(3, "my.column2", "different_column!");
columnar_writer.record_str(4, "my.column", "b");
columnar_writer.serialize(5, None, &mut buffer).unwrap();
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
@@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() {
columnar_writer.record_bytes(3, "my.column", b"c");
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
columnar_writer.record_bytes(4, "my.column", b"b");
columnar_writer.serialize(5, None, &mut buffer).unwrap();
columnar_writer.serialize(5, &mut buffer).unwrap();
let columnar_reader = ColumnarReader::open(buffer).unwrap();
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
@@ -344,7 +304,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
ip_addr_byte
))),
1 => any::<bool>().prop_map(ColumnValue::Bool),
1 => (0_679_723_993i64..1_679_723_995i64)
1 => (679_723_993i64..1_679_723_995i64)
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
]
}
@@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, Colu
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
}
fn columnar_docs_and_mapping_strategy(
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
columnar_docs_strategy().prop_flat_map(|docs| {
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
})
}
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
Just((0u32..n as RowId).collect()).prop_shuffle()
}
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
let vals: Vec<usize> = (0..n).collect();
subsequence(vals, 0..=n).prop_shuffle()
}
fn build_columnar_with_mapping(
docs: &[Vec<(&'static str, ColumnValue)>],
old_to_new_row_ids_opt: Option<&[RowId]>,
) -> ColumnarReader {
fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
let num_docs = docs.len() as u32;
let mut buffer = Vec::new();
let mut columnar_writer = ColumnarWriter::default();
@@ -416,15 +362,13 @@ fn build_columnar_with_mapping(
}
}
}
columnar_writer
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
.unwrap();
columnar_writer.serialize(num_docs, &mut buffer).unwrap();
ColumnarReader::open(buffer).unwrap()
}
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
build_columnar_with_mapping(docs, None)
build_columnar_with_mapping(docs)
}
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
@@ -448,6 +392,7 @@ fn assert_columnar_eq(
}
}
#[track_caller]
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
left: &Column<T>,
right: &Column<T>,
@@ -683,54 +628,6 @@ proptest! {
}
}
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
proptest! {
#![proptest_config(ProptestConfig::with_cases(500))]
#[test]
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
assert_eq!(columnar.num_rows() as usize, docs.len());
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
for (doc_id, doc_vals) in docs.iter().enumerate() {
for (col_name, col_val) in doc_vals {
expected_columns
.entry((col_name, col_val.column_type_category()))
.or_default()
.entry(mapping[doc_id])
.or_default()
.push(col_val);
}
}
let column_list = columnar.list_columns().unwrap();
assert_eq!(expected_columns.len(), column_list.len());
for (column_name, column) in column_list {
let dynamic_column = column.open().unwrap();
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
for _doc_id in 0..columnar.num_rows() {
match &dynamic_column {
DynamicColumn::Bool(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::I64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::U64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::F64(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::IpAddr(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::DateTime(col) =>
assert_column_values(col, expected_col_values),
DynamicColumn::Bytes(col) =>
assert_bytes_column_values(col, expected_col_values, false),
DynamicColumn::Str(col) =>
assert_bytes_column_values(col, expected_col_values, true),
}
}
}
}
}
// This test creates 2 or 3 random small columnars and attempts to merge them.
// It compares the resulting merged dataframe with what would have been obtained by building the
// dataframe from the concatenated rows to begin with.
@@ -844,24 +741,68 @@ fn columnar_docs_and_remap(
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in
columnar_docs_and_remap()) {
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
}
}
fn test_columnar_merge_and_remap(
columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>,
shuffle_merge_order: Vec<RowAddr>,
) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order
.iter()
.map(|row_addr| {
columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone()
})
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs
.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_ref: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs
.iter()
.map(|docs| docs.len() as RowId)
.collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(
&columnar_readers_ref[..],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
}
#[test]
fn test_columnar_merge_and_remap_bug_1() {
let columnar_docs = vec![vec![
vec![
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
],
vec![],
]];
let shuffle_merge_order: Vec<RowAddr> = vec![
RowAddr {
segment_ord: 0,
row_id: 1,
},
RowAddr {
segment_ord: 0,
row_id: 0,
},
];
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
}
#[test]
fn test_columnar_merge_empty() {
let columnar_reader_1 = build_columnar(&[]);


@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
@@ -20,5 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
binggan = "0.8.1"
proptest = "1.0.0"
rand = "0.8.4"


@@ -1,39 +1,64 @@
#![feature(test)]
use binggan::{black_box, BenchRunner};
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
extern crate test;
fn bench_vint() {
let mut runner = BenchRunner::new();
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
let vals: Vec<u32> = (0..20_000).collect();
runner.bench_function("bench_vint", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
});
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
runner.bench_function("bench_vint_rand", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
});
}
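For context on what this benchmark measures: `serialize_vint_u32` encodes a `u32` as a variable-length integer. A minimal sketch of the classic 7-bit (LEB128-style) scheme follows; the exact wire format tantivy uses is an assumption here, and `vint_encode` is a hypothetical helper:

```rust
/// Encode `val` as a LEB128-style varint: 7 payload bits per byte,
/// high bit set on every byte except the last. Returns bytes written.
fn vint_encode(mut val: u32, buf: &mut [u8; 5]) -> usize {
    let mut i = 0;
    loop {
        let byte = (val & 0x7F) as u8;
        val >>= 7;
        if val == 0 {
            buf[i] = byte; // last byte: continuation bit clear
            return i + 1;
        }
        buf[i] = byte | 0x80; // more bytes follow
        i += 1;
    }
}

fn main() {
    let mut buf = [0u8; 5];
    assert_eq!(vint_encode(127, &mut buf), 1); // fits in one byte
    assert_eq!(vint_encode(128, &mut buf), 2); // spills into a second byte
    assert_eq!(buf[..2], [0x80, 0x01]);
}
```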
fn bench_bitset() {
let mut runner = BenchRunner::new();
runner.bench_function("bench_tinyset_pop", move |_| {
let mut tinyset = TinySet::singleton(black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
black_box(tinyset);
});
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
runner.bench_function("bench_tinyset_sum", move |_| {
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
let v = [10u32, 14u32, 21u32];
runner.bench_function("bench_tinyarr_sum", move |_| {
black_box(v.iter().cloned().sum::<u32>());
});
runner.bench_function("bench_bitset_initialize", move |_| {
black_box(BitSet::with_max_value(1_000_000));
});
}
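The TinySet benchmarks above hammer `pop_lowest` on a 64-bit set. The usual implementation of that operation isolates the lowest set bit with `trailing_zeros` and clears it; a sketch of that standard technique (`Tiny64` is a hypothetical stand-in, not necessarily tantivy's `TinySet`):

```rust
/// A 64-bit set of small integers, mirroring the shape of TinySet.
#[derive(Clone, Copy)]
struct Tiny64(u64);

impl Tiny64 {
    fn insert(self, el: u32) -> Self {
        Tiny64(self.0 | (1u64 << el))
    }

    /// Remove and return the lowest element, if any.
    fn pop_lowest(&mut self) -> Option<u32> {
        if self.0 == 0 {
            return None;
        }
        let lowest = self.0.trailing_zeros();
        self.0 &= self.0 - 1; // clear the lowest set bit
        Some(lowest)
    }
}

fn main() {
    let mut set = Tiny64(0).insert(10).insert(14).insert(21);
    assert_eq!(set.pop_lowest(), Some(10));
    assert_eq!(set.pop_lowest(), Some(14));
    assert_eq!(set.pop_lowest(), Some(21));
    assert_eq!(set.pop_lowest(), None);
}
```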
fn main() {
bench_vint();
bench_bitset();
}


@@ -696,43 +696,3 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use test;
use super::{BitSet, TinySet};
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

Binary file not shown (image, 30 KiB).


@@ -7,6 +7,11 @@
- [Other](#other)
- [Usage](#usage)
# Index Sorting has been removed!
More info here:
https://github.com/quickwit-oss/tantivy/issues/2352
# Index Sorting
Tantivy allows you to sort the index according to a property.


@@ -19,14 +19,13 @@ use tantivy::{doc, Index, IndexWriter, ReloadPolicy};
use tempfile::TempDir;
fn main() -> tantivy::Result<()> {
// Normally you would use `MmapDirectory` instead to persist data on disk.
// https://docs.rs/tantivy/latest/tantivy/directory/struct.MmapDirectory.html
// But for this example, we will use a temporary directory `TempDir`.
// Let's create a temporary directory for the
// sake of this example
let index_path = TempDir::new()?;
// # Defining the schema
//
// The Tantivy index requires a schema.
// The Tantivy index requires a very strict schema.
// The schema declares which fields are in the index,
// and for each field, its type and "the way it should
// be indexed".


@@ -1,3 +1,5 @@
use std::ops::Bound;
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, IndexWriter, Result, Term};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded, i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);


@@ -1,3 +1,4 @@
use std::borrow::Cow;
use std::iter::once;
use nom::branch::alt;
@@ -19,7 +20,7 @@ use crate::Occur;
// Note: the '-' char is only forbidden at the beginning of a field name; it would be clearer to
// add it to the special characters.
const SPECIAL_CHARS: &[char] = &[
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
];
/// consume a field name followed by colon. Return the field name with escape sequence
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
)(inp)
}
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
fn interpret_escape(source: &str) -> String {
let mut res = String::with_capacity(source.len());
let mut in_escape = false;
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
for c in source.chars() {
if in_escape {
if !require_escape(c) {
// we re-add the escape sequence
res.push('\\');
}
res.push(c);
in_escape = false;
} else if c == '\\' {
in_escape = true;
} else {
res.push(c);
}
}
res
}
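Concretely, `interpret_escape` only consumes a backslash when the next character genuinely requires escaping, and re-emits the backslash otherwise. A self-contained check, duplicating the function above together with its escape set:

```rust
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];

fn interpret_escape(source: &str) -> String {
    let mut res = String::with_capacity(source.len());
    let mut in_escape = false;
    let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
    for c in source.chars() {
        if in_escape {
            if !require_escape(c) {
                res.push('\\'); // re-add the unnecessary escape
            }
            res.push(c);
            in_escape = false;
        } else if c == '\\' {
            in_escape = true;
        } else {
            res.push(c);
        }
    }
    res
}

fn main() {
    assert_eq!(interpret_escape(r"abc\:def"), "abc:def"); // ':' needs escaping: consumed
    assert_eq!(interpret_escape(r"abc\*"), r"abc\*");     // '*' does not: backslash kept
    assert_eq!(interpret_escape(r"a\ b"), "a b");         // escaped whitespace unescaped
}
```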
/// Consume a word outside of any context.
// TODO should support escape sequences
fn word(inp: &str) -> IResult<&str, &str> {
fn word(inp: &str) -> IResult<&str, Cow<str>> {
map_res(
recognize(tuple((
satisfy(|c| {
!c.is_whitespace()
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
}),
many0(satisfy(|c: char| {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
alt((
preceded(char('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
)),
many0(alt((
preceded(char('\\'), anychar),
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
))),
))),
|s| match s {
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
_ => Ok(s),
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
s => Ok(Cow::Borrowed(s)),
},
)(inp)
}
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|inp| {
opt_i_err(
preceded(
multispace0,
recognize(many1(satisfy(|c| {
!c.is_whitespace() && !delimiter.contains(c)
}))),
fn word_infallible(
delimiter: &str,
emit_error: bool,
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
// `emit_error` controls whether receiving an unescaped `:` should emit an error
move |inp| {
map(
opt_i_err(
preceded(
multispace0,
recognize(many1(alt((
preceded(char::<&str, _>('\\'), anychar),
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
)))),
),
"expected word",
),
"expected word",
|(opt_s, mut errors)| match opt_s {
Some(s) => {
if emit_error
&& (s
.as_bytes()
.windows(2)
.any(|window| window[0] != b'\\' && window[1] == b':')
|| s.starts_with(':'))
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if s.contains('\\') {
(Some(Cow::Owned(interpret_escape(s))), errors)
} else {
(Some(Cow::Borrowed(s)), errors)
}
}
None => (None, errors),
},
)(inp)
}
}
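The `windows(2)` scan above flags words containing an unescaped `:` so the lenient parser can warn about a possibly malformed field. The predicate, distilled into a runnable snippet:

```rust
/// True if `s` contains a ':' that is not preceded by a backslash
/// (or starts with one), matching the check in word_infallible.
fn has_unescaped_colon(s: &str) -> bool {
    s.starts_with(':')
        || s.as_bytes()
            .windows(2)
            .any(|w| w[0] != b'\\' && w[1] == b':')
}

fn main() {
    assert!(has_unescaped_colon("abc:def"));   // warns: looks like a field
    assert!(has_unescaped_colon(":abc"));      // warns: leading colon
    assert!(!has_unescaped_colon(r"abc\:def")); // escaped colon: no warning
}
```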
@@ -159,7 +216,7 @@ fn simple_term_infallible(
(value((), char('\'')), simple_quotes),
),
// numbers are parsed with words in this case, as we allow strings starting with a -
map(word_infallible(delimiter), |(text, errors)| {
map(word_infallible(delimiter, true), |(text, errors)| {
(text.map(|text| (Delimiter::None, text.to_string())), errors)
}),
)(inp)
@@ -322,15 +379,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|((field_name, _, leaf), mut errors)| {
(
leaf.map(|leaf| {
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
&& field_name.is_none()
{
errors.push(LenientErrorInternal {
pos: inp.len(),
message: "parsed possible invalid field as term".to_string(),
});
}
if matches!(&leaf, UserInputLeaf::Literal(literal)
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
&& field_name.is_none()
@@ -449,20 +497,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
tuple_infallible((
opt_i(anychar),
space0_infallible,
word_infallible("]}"),
word_infallible("]}", false),
space1_infallible,
opt_i_err(
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
"missing keyword TO",
),
word_infallible("]}"),
word_infallible("]}", false),
opt_i_err(one_of("]}"), "missing range delimiter"),
)),
|(
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
errs,
)| {
let lower_bound = match (lower_bound_kind, lower) {
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
// if it is some, TO was actually the bound (i.e. [TO TO something])
@@ -471,7 +519,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
_ => unreachable!("precondition failed, range did not start with [ or {{"),
};
let upper_bound = match (upper_bound_kind, upper) {
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
(_, Some("*")) => UserInputBound::Unbounded,
(_, None) => UserInputBound::Unbounded,
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
@@ -488,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
(
(
value((), tag(">=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -502,7 +550,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<=")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -516,7 +564,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag(">")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
bound
@@ -530,7 +578,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
),
(
value((), tag("<")),
map(word_infallible(""), |(bound, err)| {
map(word_infallible("", false), |(bound, err)| {
(
(
UserInputBound::Unbounded,
@@ -1157,6 +1205,12 @@ mod test {
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
}
#[test]
@@ -1590,5 +1644,21 @@ mod test {
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
// we don't process escape sequences for chars which don't require them
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
}
#[test]
fn test_queries_with_colons() {
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
}
#[test]
fn test_invalid_field() {
test_is_parse_err(r#"!bc:def"#, "!bc:def");
}
}


@@ -34,8 +34,9 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
TopHitsAggregationReq,
};
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -146,6 +147,11 @@ pub enum AggregationVariants {
/// extracted values.
#[serde(rename = "stats")]
Stats(StatsAggregation),
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
/// `std_deviation_sampling`) over the extracted values.
#[serde(rename = "extended_stats")]
ExtendedStats(ExtendedStatsAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
@@ -154,7 +160,10 @@ pub enum AggregationVariants {
Percentiles(PercentilesAggregationReq),
/// Finds the top k values matching some order
#[serde(rename = "top_hits")]
TopHits(TopHitsAggregation),
TopHits(TopHitsAggregationReq),
/// Computes an estimate of the number of unique values
#[serde(rename = "cardinality")]
Cardinality(CardinalityAggregationReq),
}
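The new `extended_stats` variant reports variance and standard deviation derived from streaming accumulators (count, sum, sum of squares). For reference, a sketch of the standard formulas in plain Rust; the array layout is illustrative, and tantivy's handling of the `sigma` bounds parameter is not shown:

```rust
/// Derive extended statistics from the three streaming accumulators.
/// Population variance is E[x^2] - E[x]^2; the sampling variant divides by n - 1.
fn extended_stats(count: u64, sum: f64, sum_of_squares: f64) -> [f64; 5] {
    let n = count as f64;
    let avg = sum / n;
    let variance = sum_of_squares / n - avg * avg;
    let variance_sampling = (sum_of_squares - sum * sum / n) / (n - 1.0);
    [
        avg,
        variance,
        variance.sqrt(),          // std_deviation
        variance_sampling,
        variance_sampling.sqrt(), // std_deviation_sampling
    ]
}

fn main() {
    // Values 1, 2, 3: mean 2, population variance 2/3, sample variance 1.
    let [avg, var, _, var_s, _] = extended_stats(3, 6.0, 14.0);
    assert!((avg - 2.0).abs() < 1e-9);
    assert!((var - 2.0 / 3.0).abs() < 1e-9);
    assert!((var_s - 1.0).abs() < 1e-9);
}
```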
impl AggregationVariants {
@@ -170,9 +179,11 @@ impl AggregationVariants {
AggregationVariants::Max(max) => vec![max.field_name()],
AggregationVariants::Min(min) => vec![min.field_name()],
AggregationVariants::Stats(stats) => vec![stats.field_name()],
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
AggregationVariants::Cardinality(per) => vec![per.field_name()],
}
}
@@ -197,6 +208,12 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {


@@ -11,8 +11,8 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
field: ref field_name,
ref missing,
..
})
| Cardinality(CardinalityAggregationReq {
field: ref field_name,
ref missing,
..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
@@ -276,6 +281,10 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| ExtendedStats(ExtendedStatsAggregation {
field: ref field_name,
..
})
| Sum(SumAggregation {
field: ref field_name,
..


@@ -8,7 +8,9 @@ use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::bucket::GetDocCount;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
use super::metric::{
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
};
use super::{AggregationError, Key};
use crate::TantivyError;
@@ -88,12 +90,16 @@ pub enum MetricResult {
Min(SingleMetricResult),
/// Stats metric result.
Stats(Stats),
/// ExtendedStats metric result.
ExtendedStats(Box<ExtendedStats>),
/// Sum metric result.
Sum(SingleMetricResult),
/// Percentiles metric result.
Percentiles(PercentilesMetricResult),
/// Top hits metric result
TopHits(TopHitsMetricResult),
/// Cardinality metric result
Cardinality(SingleMetricResult),
}
impl MetricResult {
@@ -104,6 +110,7 @@ impl MetricResult {
MetricResult::Max(max) => Ok(max.value),
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
@@ -111,6 +118,7 @@ impl MetricResult {
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
)),
MetricResult::Cardinality(card) => Ok(card.value),
}
}
}


@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
}
}
}
},
"cardinality_string_id":{
"cardinality": {
"field": "string_id"
}
},
"cardinality_score":{
"cardinality": {
"field": "score"
}
}
});
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
)
);
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
assert_eq!(res["cardinality_score"]["value"], 80.0);
Ok(())
}
@@ -926,10 +939,10 @@ fn test_aggregation_on_json_object_mixed_types() {
},
"termagg": {
"buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 1, "key": 10.0, "key_as_string": "10", "min_price": { "value": 10.0 } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
{ "doc_count": 1, "key": -20.5, "key_as_string": "-20.5", "min_price": { "value": -20.5 } },
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
],
"sum_other_doc_count": 0


@@ -1,10 +1,9 @@
use std::fmt::Debug;
use std::io;
use std::net::Ipv6Addr;
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
};
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -466,49 +465,66 @@ impl SegmentTermCollector {
};
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let term_dict = agg_with_accessor
.str_dict_column
.as_ref()
.cloned()
.unwrap_or_else(|| {
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
});
let mut buffer = String::new();
for (term_id, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
// Special case for missing key
if term_id == u64::MAX {
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.push_str(missing);
dict.insert(
IntermediateKey::Str(buffer.to_string()),
intermediate_entry,
);
}
Key::F64(val) => {
buffer.push_str(&val.to_string());
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut buffer = Vec::new();
// special case for missing key
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
let entry = entries[index];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.extend_from_slice(missing.as_bytes());
dict.insert(
IntermediateKey::Str(
String::from_utf8(buffer.to_vec())
.expect("could not convert to String"),
),
intermediate_entry,
);
}
} else {
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
Key::F64(val) => {
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
}
entries.swap_remove(index);
}
// Sort by term ord
entries.sort_unstable_by_key(|bucket| bucket.0);
let mut idx = 0;
term_dict.sorted_ords_to_term_cb(
entries.iter().map(|(term_id, _)| *term_id),
|term| {
let entry = entries[idx];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
dict.insert(
IntermediateKey::Str(
String::from_utf8(term.to_vec()).expect("could not convert to String"),
),
intermediate_entry,
);
idx += 1;
Ok(())
},
)?;
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
let mut stream = term_dict.dictionary().stream()?;
let mut stream = term_dict.stream()?;
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
agg_with_accessor.agg.sub_aggregation(),
);
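The core of this hunk: instead of one random-access `ord_to_str` probe per bucket, the entries are first sorted by term ordinal so a single forward pass over the dictionary (`sorted_ords_to_term_cb`) can resolve them all. A schematic sketch of the access pattern with a stand-in dictionary that only supports forward seeks (hypothetical types, not tantivy's API):

```rust
/// Stand-in for a sorted term dictionary that only supports forward streaming.
struct DictStream<'a> {
    terms: &'a [&'a str],
    pos: usize,
}

impl<'a> DictStream<'a> {
    /// Advance to `ord`; valid only for non-decreasing ordinals.
    fn seek(&mut self, ord: usize) -> &'a str {
        assert!(ord >= self.pos, "stream can only move forward");
        self.pos = ord;
        self.terms[ord]
    }
}

/// Resolve (term_ordinal, doc_count) buckets in one streaming pass.
fn resolve_terms(dict: &mut DictStream, entries: &mut Vec<(u64, u64)>) -> Vec<(String, u64)> {
    // Sorting by ordinal lets a single forward pass over the dictionary
    // resolve every bucket, instead of one random probe per bucket.
    entries.sort_unstable_by_key(|&(ord, _)| ord);
    entries
        .iter()
        .map(|&(ord, count)| (dict.seek(ord as usize).to_string(), count))
        .collect()
}

fn main() {
    let terms = ["apple", "banana", "cherry"];
    let mut dict = DictStream { terms: &terms, pos: 0 };
    let mut entries = vec![(2u64, 5u64), (0, 3)];
    let resolved = resolve_terms(&mut dict, &mut entries);
    assert_eq!(resolved[0], ("apple".to_string(), 3));
    assert_eq!(resolved[1], ("cherry".to_string(), 5));
}
```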


@@ -19,13 +19,14 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::aggregation::metric::CardinalityCollector;
use crate::TantivyError;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
@@ -215,6 +216,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +226,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
Cardinality(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
),
}
}
@@ -282,10 +289,14 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate extended stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
TopHits(TopHitsTopNComputer),
/// Intermediate cardinality result
Cardinality(CardinalityCollector),
}
impl IntermediateMetricResult {
@@ -306,6 +317,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -316,6 +330,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result())
}
IntermediateMetricResult::Cardinality(cardinality) => {
MetricResult::Cardinality(cardinality.finalize().into())
}
}
}
@@ -346,6 +363,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}
@@ -358,6 +381,12 @@ impl IntermediateMetricResult {
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
left.merge_fruits(right)?;
}
(
IntermediateMetricResult::Cardinality(left),
IntermediateMetricResult::Cardinality(right),
) => {
left.merge_fruits(right)?;
}
_ => {
panic!("incompatible fruit types in tree or missing merge_fruits handler");
}
@@ -570,6 +599,7 @@ impl IntermediateTermBucketResult {
let val = if key { "true" } else { "false" };
Some(val.to_string())
}
IntermediateKey::F64(val) => Some(val.to_string()),
_ => None,
};
Ok(BucketEntry {


@@ -0,0 +1,466 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::Dictionary;
use common::f64_to_u64;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::TantivyError;
#[derive(Clone, Debug, Serialize, Deserialize)]
struct BuildSaltedHasher {
salt: u8,
}
impl BuildHasher for BuildSaltedHasher {
type Hasher = DefaultHasher;
fn build_hasher(&self) -> Self::Hasher {
let mut hasher = DefaultHasher::new();
hasher.write_u8(self.salt);
hasher
}
}
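The hasher above is seeded with a salt byte before anything else is hashed; `SegmentCardinalityCollector::from_req` passes the column type as the salt, so identical raw values from different column types land in different HyperLogLog registers. A self-contained illustration (duplicating the small builder from the diff; `salted_hash` is a hypothetical helper):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hash, Hasher};

struct BuildSaltedHasher {
    salt: u8,
}

impl BuildHasher for BuildSaltedHasher {
    type Hasher = DefaultHasher;

    fn build_hasher(&self) -> DefaultHasher {
        let mut hasher = DefaultHasher::new();
        hasher.write_u8(self.salt); // salt first, then the value
        hasher
    }
}

fn salted_hash(salt: u8, val: u64) -> u64 {
    let mut hasher = BuildSaltedHasher { salt }.build_hasher();
    val.hash(&mut hasher);
    hasher.finish()
}

fn main() {
    // Same value, different salts (e.g. column types) => different hashes.
    assert_ne!(salted_hash(0, 42), salted_hash(1, 42));
}
```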
/// # Cardinality
///
/// The cardinality aggregation allows for computing an estimate
/// of the number of different values in a data set based on the
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
/// uniqueness of values in a large dataset where counting each unique value
/// individually would be computationally expensive.
///
/// For example, you might use a cardinality aggregation to estimate the number
/// of unique visitors to a website by aggregating on a field that contains
/// user IDs or session IDs.
///
/// To use the cardinality aggregation, you'll need to provide a field to
/// aggregate on. The following example demonstrates a request for the cardinality
/// of the "user_id" field:
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id"
/// }
/// }
/// ```
///
/// This request will return an estimate of the number of unique values in the
/// "user_id" field.
///
/// ## Missing Values
///
/// The `missing` parameter defines how documents that are missing a value should be treated.
/// By default, documents without a value for the specified field are ignored. However, you can
/// specify a default value for these documents using the `missing` parameter. This can be useful
/// when you want to include documents with missing values in the aggregation.
///
/// For example, the following request treats documents with missing values in the "user_id"
/// field as if they had a value of "unknown":
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id",
/// "missing": "unknown"
/// }
/// }
/// ```
///
/// # Estimation Accuracy
///
/// The cardinality aggregation provides an approximate count, which is usually
/// accurate within a small error range. This trade-off allows for efficient
/// computation even on very large datasets.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CardinalityAggregationReq {
/// The field name to compute the cardinality on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(skip_serializing_if = "Option::is_none", default)]
pub missing: Option<Key>,
}
impl CardinalityAggregationReq {
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
&self.field
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentCardinalityCollector {
cardinality: CardinalityCollector,
entries: FxHashSet<u64>,
column_type: ColumnType,
accessor_idx: usize,
missing: Option<Key>,
}
impl SegmentCardinalityCollector {
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
Self {
cardinality: CardinalityCollector::new(column_type as u8),
entries: Default::default(),
column_type,
accessor_idx,
missing: missing.clone(),
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
if let Some(missing) = agg_accessor.missing_value_for_accessor {
agg_accessor.column_block_accessor.fetch_block_with_missing(
docs,
&agg_accessor.accessor,
missing,
);
} else {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
}
fn into_intermediate_metric_result(
mut self,
agg_with_accessor: &AggregationWithAccessor,
) -> crate::Result<IntermediateMetricResult> {
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let dict = agg_with_accessor
.str_dict_column
.as_ref()
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut has_missing = false;
// TODO: replace FxHashSet with something that allows iterating in order
// (e.g. sparse bitvec)
let mut term_ids = Vec::new();
for term_ord in self.entries.into_iter() {
if term_ord == u64::MAX {
has_missing = true;
} else {
// we can reasonably exclude values above u32::MAX
term_ids.push(term_ord as u32);
}
}
term_ids.sort_unstable();
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
self.cardinality.sketch.insert_any(&term);
Ok(())
})?;
if has_missing {
let missing_key = self
.missing
.as_ref()
.expect("Found placeholder term_ord but `missing` is None");
match missing_key {
Key::Str(missing) => {
self.cardinality.sketch.insert_any(&missing);
}
Key::F64(val) => {
let val = f64_to_u64(*val);
self.cardinality.sketch.insert_any(&val);
}
}
}
}
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
}
}
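`common::f64_to_u64`, used above for the missing key, maps floats to integers before hashing. The standard order-preserving trick flips the sign bit of positive values and all bits of negative ones; a sketch of that common technique (assumed to match `common`'s implementation, which this diff does not show):

```rust
/// Order-preserving f64 -> u64: flip the sign bit of positives, all bits of
/// negatives, so unsigned integer order matches float order.
fn f64_to_u64_sketch(val: f64) -> u64 {
    let bits = val.to_bits();
    if bits & (1 << 63) == 0 {
        bits ^ (1 << 63) // positive: flip sign bit
    } else {
        !bits // negative: flip everything
    }
}

fn main() {
    assert!(f64_to_u64_sketch(-1.5) < f64_to_u64_sketch(0.0));
    assert!(f64_to_u64_sketch(0.0) < f64_to_u64_sketch(2.5));
}
```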
impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)?;
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_with_accessor)
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
self.fetch_block_with_field(docs, bucket_agg_accessor);
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
if self.column_type == ColumnType::Str {
for term_ord in col_block_accessor.iter_vals() {
self.entries.insert(term_ord);
}
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = bucket_agg_accessor
.accessor
.values
.clone()
.downcast_arc::<CompactSpaceU64Accessor>()
.map_err(|_| {
TantivyError::AggregationError(
crate::aggregation::AggregationError::InternalError(
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
.to_string(),
),
)
})?;
for val in col_block_accessor.iter_vals() {
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
self.cardinality.sketch.insert_any(&val);
}
} else {
for val in col_block_accessor.iter_vals() {
self.cardinality.sketch.insert_any(&val);
}
}
Ok(())
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
/// The cardinality collector used during segment collection and for merging results.
pub struct CardinalityCollector {
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
}
impl Default for CardinalityCollector {
fn default() -> Self {
Self::new(0)
}
}
impl PartialEq for CardinalityCollector {
fn eq(&self, _other: &Self) -> bool {
false
}
}
impl CardinalityCollector {
/// Compute the final cardinality estimate.
pub fn finalize(self) -> Option<f64> {
Some(self.sketch.clone().count().trunc())
}
fn new(salt: u8) -> Self {
Self {
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
}
}
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
self.sketch.merge(&right.sketch).map_err(|err| {
TantivyError::AggregationError(AggregationError::InternalError(format!(
"Error while merging cardinality {err:?}"
)))
})?;
Ok(())
}
}
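`CardinalityCollector::new` fixes the HyperLogLog++ precision at 16, i.e. 2^16 registers per sketch. The classic HLL accuracy estimate is a relative error of about 1.04/√m, which works out to roughly 0.4% here; a back-of-the-envelope check:

```rust
fn main() {
    let precision: u32 = 16;            // as in HyperLogLogPlus::new(16, ..)
    let m = (1u64 << precision) as f64; // number of registers
    let rel_err = 1.04 / m.sqrt();      // classic HLL error estimate
    println!("typical relative error: {:.2}%", rel_err * 100.0); // ~0.41%
}
```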
#[cfg(test)]
mod tests {
use std::net::IpAddr;
use std::str::FromStr;
use columnar::MonotonicallyMappableToU64;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
use crate::schema::{IntoIpv6Addr, Schema, FAST};
use crate::Index;
#[test]
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
let values = vec![];
let index = get_test_index_from_terms(false, &values)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 0.0);
Ok(())
}
#[test]
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(true)
}
#[test]
fn cardinality_aggregation_test() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(false)
}
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
let segment_and_terms = vec![
vec!["terma"],
vec!["termb"],
vec!["termc"],
vec!["terma"],
vec!["terma"],
vec!["terma"],
vec!["termb"],
vec!["terma"],
];
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 3.0);
Ok(())
}
#[test]
fn cardinality_aggregation_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(id_field => 1u64))?;
writer.add_document(doc!(id_field => 2u64))?;
writer.add_document(doc!(id_field => 3u64))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "id",
"missing": 0u64
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
#[test]
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
// IpV6 loopback
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
// IpV4
writer.add_document(
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
)?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "ip_field"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 2.0);
Ok(())
}
#[test]
fn cardinality_aggregation_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("json", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(field => json!({"value": false})))?;
writer.add_document(doc!(field => json!({"value": true})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(0u64)})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(1u64)})))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "json.value"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
}

File diff suppressed because it is too large.

@@ -17,7 +17,9 @@
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod cardinality;
mod count;
mod extended_stats;
mod max;
mod min;
mod percentiles;
@@ -28,7 +30,9 @@ mod top_hits;
use std::collections::HashMap;
pub use average::*;
pub use cardinality::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;


@@ -1,3 +1,5 @@
use std::fmt::Debug;
use serde::{Deserialize, Serialize};
use super::*;
@@ -85,13 +87,15 @@ impl Stats {
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateStats {
/// The number of extracted values.
count: u64,
pub(crate) count: u64,
/// The sum of the extracted values.
sum: f64,
pub(crate) sum: f64,
/// delta for sum needed for [Kahan algorithm for summation](https://en.wikipedia.org/wiki/Kahan_summation_algorithm)
pub(crate) delta: f64,
/// The min value.
min: f64,
pub(crate) min: f64,
/// The max value.
max: f64,
pub(crate) max: f64,
}
impl Default for IntermediateStats {
@@ -99,6 +103,7 @@ impl Default for IntermediateStats {
Self {
count: 0,
sum: 0.0,
delta: 0.0,
min: f64::MAX,
max: f64::MIN,
}
@@ -109,7 +114,13 @@ impl IntermediateStats {
/// Merges the other stats intermediate result into self.
pub fn merge_fruits(&mut self, other: IntermediateStats) {
self.count += other.count;
self.sum += other.sum;
// kahan algorithm for sum
let y = other.sum - (self.delta + other.delta);
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(other.min);
self.max = self.max.max(other.max);
}
@@ -141,9 +152,15 @@ impl IntermediateStats {
}
#[inline]
fn collect(&mut self, value: f64) {
pub(in crate::aggregation::metric) fn collect(&mut self, value: f64) {
self.count += 1;
self.sum += value;
// kahan algorithm for sum
let y = value - self.delta;
let t = self.sum + y;
self.delta = (t - self.sum) - y;
self.sum = t;
self.min = self.min.min(value);
self.max = self.max.max(value);
}
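Both `merge_fruits` and `collect` above now use Kahan (compensated) summation: `delta` accumulates the low-order bits that a plain `sum += value` would round away. A standalone sketch of the algorithm, with an example of the drift it prevents:

```rust
/// Kahan compensated summation, mirroring the delta bookkeeping added above.
fn kahan_sum(values: impl Iterator<Item = f64>) -> f64 {
    let (mut sum, mut delta) = (0.0f64, 0.0f64);
    for value in values {
        let y = value - delta;
        let t = sum + y;
        delta = (t - sum) - y; // the part of y that did not make it into t
        sum = t;
    }
    sum
}

fn main() {
    // A large base plus many tiny increments: naive summation drifts,
    // compensated summation stays close to 1e9 + 1e6.
    let values = std::iter::once(1e9).chain(std::iter::repeat(0.1).take(10_000_000));
    let kahan = kahan_sum(values.clone());
    let naive: f64 = values.sum();
    println!("naive: {naive}, kahan: {kahan}");
}
```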
@@ -288,7 +305,6 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use serde_json::Value;
use crate::aggregation::agg_req::{Aggregation, Aggregations};


@@ -1,7 +1,7 @@
use std::collections::HashMap;
use std::net::Ipv6Addr;
use columnar::{ColumnarReader, DynamicColumn};
use columnar::{Column, ColumnType, ColumnarReader, DynamicColumn};
use common::json_path_writer::JSON_PATH_SEGMENT_SEP_STR;
use common::DateTime;
use regex::Regex;
@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct TopHitsAggregation {
pub struct TopHitsAggregationReq {
sort: Vec<KeyOrder>,
size: usize,
from: Option<usize>,
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
))
}
impl TopHitsAggregation {
impl TopHitsAggregationReq {
/// Validate and resolve field retrieval parameters
pub fn validate_and_resolve_field_names(
&mut self,
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
/// The TopHitsCollector used for collecting over segments and merging results.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct TopHitsTopNComputer {
req: TopHitsAggregation,
req: TopHitsAggregationReq,
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
}
@@ -443,10 +443,10 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregationReq) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req,
req: req.clone(),
}
}
@@ -491,18 +491,16 @@ impl TopHitsTopNComputer {
pub(crate) struct TopHitsSegmentCollector {
segment_ordinal: SegmentOrdinal,
accessor_idx: usize,
req: TopHitsAggregation,
top_n: TopNComputer<Vec<DocValueAndOrder>, DocAddress, false>,
}
impl TopHitsSegmentCollector {
pub fn from_req(
req: &TopHitsAggregation,
req: &TopHitsAggregationReq,
accessor_idx: usize,
segment_ordinal: SegmentOrdinal,
) -> Self {
Self {
req: req.clone(),
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
segment_ordinal,
accessor_idx,
@@ -511,14 +509,13 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(self.req.clone());
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
for res in top_results {
let doc_value_fields = self
.req
.get_document_field_data(value_accessors, res.doc.doc_id);
let doc_value_fields = req.get_document_field_data(value_accessors, res.doc.doc_id);
top_hits_computer.collect(
DocSortValuesAndFields {
sorts: res.feature,
@@ -530,34 +527,15 @@ impl TopHitsSegmentCollector {
top_hits_computer
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let intermediate_result =
IntermediateMetricResult::TopHits(self.into_top_hits_collector(value_accessors));
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
fn collect(
/// TODO add a specialized variant for a single sort field
fn collect_with(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
req: &TopHitsAggregationReq,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
let sorts: Vec<DocValueAndOrder> = self
.req
let sorts: Vec<DocValueAndOrder> = req
.sort
.iter()
.enumerate()
@@ -582,15 +560,62 @@ impl SegmentAggregationCollector for TopHitsSegmentCollector {
);
Ok(())
}
}
impl SegmentAggregationCollector for TopHitsSegmentCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
results: &mut crate::aggregation::intermediate_agg_result::IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let value_accessors = &agg_with_accessor.aggs.values[self.accessor_idx].value_accessors;
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let intermediate_result = IntermediateMetricResult::TopHits(
self.into_top_hits_collector(value_accessors, tophits_req),
);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)
}
/// TODO: Consider a caching layer to reduce the call overhead
fn collect(
&mut self,
doc_id: crate::DocId,
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
self.collect_with(doc_id, tophits_req, accessors)?;
Ok(())
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor,
) -> crate::Result<()> {
let tophits_req = &agg_with_accessor.aggs.values[self.accessor_idx]
.agg
.agg
.as_top_hits()
.expect("aggregation request must be of type top hits");
let accessors = &agg_with_accessor.aggs.values[self.accessor_idx].accessors;
// TODO: Consider getting fields with the column block accessor.
for doc in docs {
self.collect(*doc, agg_with_accessor)?;
self.collect_with(*doc, tophits_req, accessors)?;
}
Ok(())
}


@@ -44,11 +44,14 @@
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
//! - [Min](metric::MinAggregation)
//! - [Max](metric::MaxAggregation)
//! - [Sum](metric::SumAggregation)
//! - [Count](metric::CountAggregation)
//! - [Percentiles](metric::PercentilesAggregationReq)
//! - [Cardinality](metric::CardinalityAggregationReq)
//! - [TopHits](metric::TopHitsAggregationReq)
//!
//! # Example
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an


@@ -11,12 +11,15 @@ use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAcce
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::TopHitsSegmentCollector;
use crate::aggregation::metric::{
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
TopHitsSegmentCollector,
};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -148,6 +151,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
*missing,
))),
ExtendedStats(ExtendedStatsAggregation { missing, sigma, .. }) => Ok(Box::new(
SegmentExtendedStatsCollector::from_req(req.field_type, *sigma, accessor_idx, *missing),
)),
Sum(SumAggregation { missing, .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,
@@ -166,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
req.segment_ordinal,
))),
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
)),
}
}


@@ -871,7 +871,10 @@ mod tests {
use crate::schema::{Field, Schema, FAST, STORED, TEXT};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::{DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score, SegmentReader};
use crate::{
assert_nearly_equals, DateTime, DocAddress, DocId, Index, IndexWriter, Order, Score,
SegmentReader,
};
fn make_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();


@@ -195,7 +195,7 @@ mod tests {
let (tx, rx) = crossbeam_channel::bounded::<()>(0);
let rx = Arc::new(rx);
let executor = Executor::multi_thread(3, "search-test").unwrap();
for i in 0..1000 {
for _ in 0..1000 {
let counter_clone: Arc<AtomicU64> = counter.clone();
let other_counter_clone: Arc<AtomicU64> = other_counter.clone();
@@ -203,18 +203,18 @@ mod tests {
let rx_clone2 = rx.clone();
let fut = executor.spawn_blocking(move || {
counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone.recv().unwrap();
let _ = rx_clone.recv();
});
futures.push(fut);
let other_fut = executor.spawn_blocking(move || {
other_counter_clone.fetch_add(1, Ordering::SeqCst);
let () = rx_clone2.recv().unwrap();
let _ = rx_clone2.recv();
});
other_futures.push(other_fut);
}
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}
@@ -226,7 +226,7 @@ mod tests {
drop(other_futures);
// We execute 100 futures.
for i in 0..100 {
for _ in 0..100 {
tx.send(()).unwrap();
}


@@ -1,4 +1,4 @@
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
continue;
}
json_path_writer.push(json_path_segment);
index_json_value(
doc,


@@ -127,7 +127,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(*FIELD=>2u64))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -178,7 +178,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(*FIELD=>215u64))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -211,7 +211,7 @@ mod tests {
.add_document(&doc!(*FIELD=>100_000u64))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -243,7 +243,7 @@ mod tests {
.add_document(&doc!(*FIELD=>5_000_000_000_000_000_000u64 + doc_id))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -276,7 +276,7 @@ mod tests {
doc.add_i64(i64_field, i);
fast_field_writers.add_document(&doc).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -315,7 +315,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
@@ -348,7 +348,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
@@ -385,7 +385,7 @@ mod tests {
for &x in &permutation {
fast_field_writers.add_document(&doc!(*FIELD=>x)).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -770,7 +770,7 @@ mod tests {
fast_field_writers
.add_document(&doc!(field=>false))
.unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -802,7 +802,7 @@ mod tests {
.add_document(&doc!(field=>false))
.unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -827,7 +827,7 @@ mod tests {
let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
let doc = TantivyDocument::default();
fast_field_writers.add_document(&doc).unwrap();
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
@@ -855,7 +855,7 @@ mod tests {
for doc in docs {
fast_field_writers.add_document(doc).unwrap();
}
fast_field_writers.serialize(&mut write, None).unwrap();
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();
}
Ok(directory)


@@ -4,7 +4,6 @@ use columnar::{ColumnarWriter, NumericalValue};
use common::{DateTimePrecision, JsonPathWriter};
use tokenizer_api::Token;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::document::{Document, ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema, Type};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
@@ -106,16 +105,6 @@ impl FastFieldsWriter {
self.columnar_writer.mem_usage()
}
pub(crate) fn sort_order(
&self,
sort_field: &str,
num_docs: DocId,
reversed: bool,
) -> Vec<DocId> {
self.columnar_writer
.sort_order(sort_field, num_docs, reversed)
}
/// Indexes all of the fastfields of a new document.
pub fn add_document<D: Document>(&mut self, doc: &D) -> crate::Result<()> {
let doc_id = self.num_docs;
@@ -233,16 +222,9 @@ impl FastFieldsWriter {
/// Serializes all of the `FastFieldWriter`s by pushing them in
/// order to the fast field serializer.
pub fn serialize(
mut self,
wrt: &mut dyn io::Write,
doc_id_map_opt: Option<&DocIdMapping>,
) -> io::Result<()> {
pub fn serialize(mut self, wrt: &mut dyn io::Write) -> io::Result<()> {
let num_docs = self.num_docs;
let old_to_new_row_ids =
doc_id_map_opt.map(|doc_id_mapping| doc_id_mapping.old_to_new_ids());
self.columnar_writer
.serialize(num_docs, old_to_new_row_ids, wrt)?;
self.columnar_writer.serialize(num_docs, wrt)?;
Ok(())
}
}
@@ -392,7 +374,7 @@ mod tests {
}
let mut buffer = Vec::new();
columnar_writer
.serialize(json_docs.len() as DocId, None, &mut buffer)
.serialize(json_docs.len() as DocId, &mut buffer)
.unwrap();
ColumnarReader::open(buffer).unwrap()
}
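
For reference, a minimal sketch of the updated call shape, modeled on the crate-internal tests above (same `schema` and `write` setup as in those tests):

let mut fast_field_writers = FastFieldsWriter::from_schema(&schema).unwrap();
fast_field_writers.add_document(&TantivyDocument::default()).unwrap();
// The Option<&DocIdMapping> parameter is gone: fast fields are now always
// serialized in insertion (doc id) order.
fast_field_writers.serialize(&mut write).unwrap();
write.terminate().unwrap();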

View File

@@ -77,7 +77,7 @@ mod tests {
let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
fieldnorm_writers.record(3u32, *TXT_FIELD, 3);
fieldnorm_writers.serialize(serializer, None)?;
fieldnorm_writers.serialize(serializer)?;
}
let file = directory.open_read(path)?;
{

View File

@@ -2,7 +2,6 @@ use std::cmp::Ordering;
use std::{io, iter};
use super::{fieldnorm_to_id, FieldNormsSerializer};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::{Field, Schema};
use crate::DocId;
@@ -92,11 +91,7 @@ impl FieldNormsWriter {
}
/// Serialize the seen fieldnorm values to the serializer for all fields.
pub fn serialize(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> io::Result<()> {
pub fn serialize(&self, mut fieldnorms_serializer: FieldNormsSerializer) -> io::Result<()> {
for (field, fieldnorms_buffer) in self.fieldnorms_buffers.iter().enumerate().filter_map(
|(field_id, fieldnorms_buffer_opt)| {
fieldnorms_buffer_opt.as_ref().map(|fieldnorms_buffer| {
@@ -104,12 +99,7 @@ impl FieldNormsWriter {
})
},
) {
if let Some(doc_id_map) = doc_id_map {
let remapped_fieldnorm_buffer = doc_id_map.remap(fieldnorms_buffer);
fieldnorms_serializer.serialize_field(field, &remapped_fieldnorm_buffer)?;
} else {
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
}
fieldnorms_serializer.serialize_field(field, fieldnorms_buffer)?;
}
fieldnorms_serializer.close()?;
Ok(())
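
The fieldnorm writer gets the same simplification; a minimal sketch of the new call, modeled on the test shown earlier in this diff (`SCHEMA`, `TXT_FIELD` and `serializer` as set up there):

let mut fieldnorm_writers = FieldNormsWriter::for_schema(&SCHEMA);
fieldnorm_writers.record(2u32, *TXT_FIELD, 5);
// No Option<&DocIdMapping> branch anymore: each field's buffer is written as-is.
fieldnorm_writers.serialize(serializer)?;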

View File

@@ -7,7 +7,7 @@ use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
#[allow(deprecated)]
use crate::{doc, schema, Index, IndexSettings, IndexSortByField, IndexWriter, Order, Searcher};
use crate::{doc, schema, Index, IndexWriter, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
assert!(searcher.segment_readers().len() < 20);
@@ -65,71 +65,6 @@ fn get_num_iterations() -> usize {
.map(|str| str.parse().unwrap())
.unwrap_or(2000)
}
#[test]
#[ignore]
fn test_functional_indexing_sorted() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", INDEXED | FAST);
let multiples_field = schema_builder.add_u64_field("multiples", INDEXED);
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
index_builder = index_builder.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Desc,
}),
..Default::default()
});
let index = index_builder.create_from_tempdir().unwrap();
let reader = index.reader()?;
let mut rng = thread_rng();
let mut index_writer: IndexWriter =
index.writer_with_num_threads(3, 3 * MEMORY_BUDGET_NUM_BYTES_MIN)?;
let mut committed_docs: HashSet<u64> = HashSet::new();
let mut uncommitted_docs: HashSet<u64> = HashSet::new();
for _ in 0..get_num_iterations() {
let random_val = rng.gen_range(0..20);
if random_val == 0 {
index_writer.commit()?;
committed_docs.extend(&uncommitted_docs);
uncommitted_docs.clear();
reader.reload()?;
let searcher = reader.searcher();
// check that everything is correct.
check_index_content(
&searcher,
&committed_docs.iter().cloned().collect::<Vec<u64>>(),
)?;
} else if committed_docs.remove(&random_val) || uncommitted_docs.remove(&random_val) {
let doc_id_term = Term::from_field_u64(id_field, random_val);
index_writer.delete_term(doc_id_term);
} else {
uncommitted_docs.insert(random_val);
let mut doc = TantivyDocument::new();
doc.add_u64(id_field, random_val);
for i in 1u64..10u64 {
doc.add_u64(multiples_field, random_val * i);
}
doc.add_text(text_field, get_text());
index_writer.add_document(doc)?;
}
}
Ok(())
}
const LOREM: &str = "Doc Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod \
tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, \

View File

@@ -20,7 +20,7 @@ use crate::indexer::segment_updater::save_metas;
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
use crate::reader::{IndexReader, IndexReaderBuilder};
use crate::schema::document::Document;
use crate::schema::{Field, FieldType, Schema, Type};
use crate::schema::{Field, FieldType, Schema};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::SegmentReader;
@@ -232,31 +232,7 @@ impl IndexBuilder {
}
fn validate(&self) -> crate::Result<()> {
if let Some(schema) = self.schema.as_ref() {
if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref() {
let schema_field = schema.get_field(&sort_by_field.field).map_err(|_| {
TantivyError::InvalidArgument(format!(
"Field to sort index {} not found in schema",
sort_by_field.field
))
})?;
let entry = schema.get_field_entry(schema_field);
if !entry.is_fast() {
return Err(TantivyError::InvalidArgument(format!(
"Field {} is no fast field. Field needs to be a single value fast field \
to be used to sort an index",
sort_by_field.field
)));
}
let supported_field_types = [Type::I64, Type::U64, Type::F64, Type::Date];
let field_type = entry.field_type().value_type();
if !supported_field_types.contains(&field_type) {
return Err(TantivyError::InvalidArgument(format!(
"Unsupported field type in sort_by_field: {field_type:?}. Supported field \
types: {supported_field_types:?} ",
)));
}
}
if let Some(_schema) = self.schema.as_ref() {
Ok(())
} else {
Err(TantivyError::InvalidArgument(

View File

@@ -249,10 +249,6 @@ fn is_true(val: &bool) -> bool {
/// index, like presort documents.
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSettings {
/// Sorts the documents by information
/// provided in `IndexSortByField`
#[serde(skip_serializing_if = "Option::is_none")]
pub sort_by_field: Option<IndexSortByField>,
/// The `Compressor` used to compress the doc store.
#[serde(default)]
pub docstore_compression: Compressor,
@@ -275,7 +271,6 @@ fn default_docstore_blocksize() -> usize {
impl Default for IndexSettings {
fn default() -> Self {
Self {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_blocksize: default_docstore_blocksize(),
docstore_compress_dedicated_thread: true,
@@ -283,22 +278,6 @@ impl Default for IndexSettings {
}
}
/// Settings to presort the documents in an index
///
/// Presorting documents can greatly improve performance
/// in some scenarios, by applying top n
/// optimizations.
#[deprecated(
since = "0.22.0",
note = "We plan to remove index sorting in `0.23`. If you need index sorting, please comment on the related issue https://github.com/quickwit-oss/tantivy/issues/2352 and explain your use case."
)]
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub struct IndexSortByField {
/// The field to sort the documents by
pub field: String,
/// The order to sort the documents by
pub order: Order,
}
/// The order to sort by
#[derive(Clone, Debug, Serialize, Deserialize, Eq, PartialEq)]
pub enum Order {
@@ -417,7 +396,7 @@ mod tests {
use crate::store::Compressor;
#[cfg(feature = "zstd-compression")]
use crate::store::ZstdCompressor;
use crate::{IndexSettings, IndexSortByField, Order};
use crate::IndexSettings;
#[test]
fn test_serialize_metas() {
@@ -427,13 +406,7 @@ mod tests {
schema_builder.build()
};
let index_metas = IndexMeta {
index_settings: IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "text".to_string(),
order: Order::Asc,
}),
..Default::default()
},
index_settings: IndexSettings::default(),
segments: Vec::new(),
schema,
opstamp: 0u64,
@@ -442,7 +415,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"docstore_compression":"lz4","docstore_blocksize":16384},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -461,10 +434,6 @@ mod tests {
};
let index_metas = IndexMeta {
index_settings: IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "text".to_string(),
order: Order::Asc,
}),
docstore_compression: crate::store::Compressor::Zstd(ZstdCompressor {
compression_level: Some(4),
}),
@@ -479,7 +448,7 @@ mod tests {
let json = serde_json::ser::to_string(&index_metas).expect("serialization failed");
assert_eq!(
json,
r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
r#"{"index_settings":{"docstore_compression":"zstd(compression_level=4)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#
);
let deser_meta: UntrackedIndexMeta = serde_json::from_str(&json).unwrap();
@@ -491,35 +460,35 @@ mod tests {
#[test]
#[cfg(all(feature = "lz4-compression", feature = "zstd-compression"))]
fn test_serialize_metas_invalid_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zsstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown variant `zsstd`, expected one of `none`, `lz4`, `zstd`, \
`zstd(compression_level=5)` at line 1 column 96"
`zstd(compression_level=5)` at line 1 column 49"
.to_string()
);
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zstd(bla=10)","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unknown zstd option \"bla\" at line 1 column 103".to_string()
"unknown zstd option \"bla\" at line 1 column 56".to_string()
);
}
#[test]
#[cfg(not(feature = "zstd-compression"))]
fn test_serialize_metas_unsupported_comp() {
let json = r#"{"index_settings":{"sort_by_field":{"field":"text","order":"Asc"},"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let json = r#"{"index_settings":{"docstore_compression":"zstd","docstore_blocksize":1000000},"segments":[],"schema":[{"name":"text","type":"text","options":{"indexing":{"record":"position","fieldnorms":true,"tokenizer":"default"},"stored":false,"fast":false}}],"opstamp":0}"#;
let err = serde_json::from_str::<UntrackedIndexMeta>(json).unwrap_err();
assert_eq!(
err.to_string(),
"unsupported variant `zstd`, please enable Tantivy's `zstd-compression` feature at \
line 1 column 95"
line 1 column 48"
.to_string()
);
}
@@ -531,7 +500,6 @@ mod tests {
assert_eq!(
index_settings,
IndexSettings {
sort_by_field: None,
docstore_compression: Compressor::default(),
docstore_compress_dedicated_thread: true,
docstore_blocksize: 16_384
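
With `sort_by_field` gone, an `IndexSettings` value only configures the doc store. A minimal sketch of constructing one after this change (values mirror the defaults shown above; `schema` is assumed):

let settings = IndexSettings {
    docstore_compression: Compressor::default(),
    docstore_blocksize: 16_384,
    ..Default::default()
};
let index = Index::builder()
    .schema(schema)
    .settings(settings)
    .create_in_ram()?;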

View File

@@ -12,7 +12,7 @@ mod segment_reader;
pub use self::index::{Index, IndexBuilder};
pub(crate) use self::index_meta::SegmentMetaInventory;
pub use self::index_meta::{IndexMeta, IndexSettings, IndexSortByField, Order, SegmentMeta};
pub use self::index_meta::{IndexMeta, IndexSettings, Order, SegmentMeta};
pub use self::inverted_index_reader::InvertedIndexReader;
pub use self::segment::Segment;
pub use self::segment_component::SegmentComponent;

View File

@@ -3,15 +3,12 @@
use common::ReadOnlyBitSet;
use super::SegmentWriter;
use crate::schema::{Field, Schema};
use crate::{DocAddress, DocId, IndexSortByField, TantivyError};
use crate::DocAddress;
#[derive(Copy, Clone, Eq, PartialEq)]
pub enum MappingType {
Stacked,
StackedWithDeletes,
Shuffled,
}
/// Struct to provide mapping from new doc_id to old doc_id and segment.
@@ -46,537 +43,4 @@ impl SegmentDocIdMapping {
pub(crate) fn iter_old_doc_addrs(&self) -> impl Iterator<Item = DocAddress> + '_ {
self.new_doc_id_to_old_doc_addr.iter().copied()
}
/// This flag means the segments are simply stacked in the order of their ordinal.
/// e.g. [(0, 1), .. (n, 1), (0, 2)..., (m, 2)]
///
/// The different segments may contain deletes, in which case they are expressed by skipping a
/// `DocId`. [(0, 1), (0, 3)] <--- here doc_id=0 and doc_id=1 have been deleted
///
/// Being trivial is equivalent to having the `new_doc_id_to_old_doc_addr` array sorted.
///
/// This allows for some optimization.
pub(crate) fn is_trivial(&self) -> bool {
match self.mapping_type {
MappingType::Stacked | MappingType::StackedWithDeletes => true,
MappingType::Shuffled => false,
}
}
}
/// Struct to provide mapping from old doc_id to new doc_id and vice versa within a segment.
pub struct DocIdMapping {
new_doc_id_to_old: Vec<DocId>,
old_doc_id_to_new: Vec<DocId>,
}
impl DocIdMapping {
pub fn from_new_id_to_old_id(new_doc_id_to_old: Vec<DocId>) -> Self {
let max_doc = new_doc_id_to_old.len();
let old_max_doc = new_doc_id_to_old
.iter()
.cloned()
.max()
.map(|n| n + 1)
.unwrap_or(0);
let mut old_doc_id_to_new = vec![0; old_max_doc as usize];
for i in 0..max_doc {
old_doc_id_to_new[new_doc_id_to_old[i] as usize] = i as DocId;
}
DocIdMapping {
new_doc_id_to_old,
old_doc_id_to_new,
}
}
/// returns the new doc_id for the old doc_id
pub fn get_new_doc_id(&self, doc_id: DocId) -> DocId {
self.old_doc_id_to_new[doc_id as usize]
}
/// returns the old doc_id for the new doc_id
pub fn get_old_doc_id(&self, doc_id: DocId) -> DocId {
self.new_doc_id_to_old[doc_id as usize]
}
/// iterate over old doc_ids in order of the new doc_ids
pub fn iter_old_doc_ids(&self) -> impl Iterator<Item = DocId> + Clone + '_ {
self.new_doc_id_to_old.iter().cloned()
}
pub fn old_to_new_ids(&self) -> &[DocId] {
&self.old_doc_id_to_new[..]
}
/// Remaps a given array to the new doc ids.
pub fn remap<T: Copy>(&self, els: &[T]) -> Vec<T> {
self.new_doc_id_to_old
.iter()
.map(|old_doc| els[*old_doc as usize])
.collect()
}
pub fn num_new_doc_ids(&self) -> usize {
self.new_doc_id_to_old.len()
}
pub fn num_old_doc_ids(&self) -> usize {
self.old_doc_id_to_new.len()
}
}
pub(crate) fn expect_field_id_for_sort_field(
schema: &Schema,
sort_by_field: &IndexSortByField,
) -> crate::Result<Field> {
schema.get_field(&sort_by_field.field).map_err(|_| {
TantivyError::InvalidArgument(format!(
"field to sort index by not found: {:?}",
sort_by_field.field
))
})
}
// Generates a document mapping in the form of vec[new doc_id] -> old doc_id
// TODO detect if field is already sorted and discard mapping
pub(crate) fn get_doc_id_mapping_from_field(
sort_by_field: IndexSortByField,
segment_writer: &SegmentWriter,
) -> crate::Result<DocIdMapping> {
let schema = segment_writer.segment_serializer.segment().schema();
expect_field_id_for_sort_field(&schema, &sort_by_field)?; // for now expect
let new_doc_id_to_old = segment_writer.fast_field_writers.sort_order(
sort_by_field.field.as_str(),
segment_writer.max_doc(),
sort_by_field.order.is_desc(),
);
// create new doc_id to old doc_id index (used in fast_field_writers)
Ok(DocIdMapping::from_new_id_to_old_id(new_doc_id_to_old))
}
#[cfg(test)]
mod tests_indexsorting {
use common::DateTime;
use crate::collector::TopDocs;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::NoMergePolicy;
use crate::query::QueryParser;
use crate::schema::*;
use crate::{DocAddress, Index, IndexBuilder, IndexSettings, IndexSortByField, Order};
fn create_test_index(
index_settings: Option<IndexSettings>,
text_field_options: TextOptions,
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let my_text_field = schema_builder.add_text_field("text_field", text_field_options);
let my_string_field = schema_builder.add_text_field("string_field", STRING | STORED);
let my_number =
schema_builder.add_u64_field("my_number", NumericOptions::default().set_fast());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.add_document(doc!(my_number=>40_u64))?;
index_writer.add_document(
doc!(my_number=>20_u64, multi_numbers => 5_u64, multi_numbers => 6_u64),
)?;
index_writer.add_document(doc!(my_number=>100_u64))?;
index_writer.add_document(
doc!(my_number=>10_u64, my_string_field=> "blublub", my_text_field => "some text"),
)?;
index_writer.add_document(doc!(my_number=>30_u64, multi_numbers => 3_u64 ))?;
index_writer.commit()?;
Ok(index)
}
fn get_text_options() -> TextOptions {
TextOptions::default().set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::Basic),
)
}
#[test]
fn test_sort_index_test_text_field() -> crate::Result<()> {
// there are different serializers for different settings in postings/recorder.rs
// test remapping for all of them
let options = vec![
get_text_options(),
get_text_options().set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
),
get_text_options().set_indexing_options(
TextFieldIndexing::default()
.set_index_option(IndexRecordOption::WithFreqsAndPositions),
),
];
for option in options {
// let options = get_text_options();
// no index_sort
let index = create_test_index(None, option.clone())?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![3]
);
// sort by field asc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
option.clone(),
)?;
let my_text_field = index.schema().get_field("text_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let query = QueryParser::for_index(&index, vec![my_text_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![0]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
option.clone(),
)?;
let my_string_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader()?.searcher();
let query =
QueryParser::for_index(&index, vec![my_string_field]).parse_query("text")?;
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![4]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
}
}
Ok(())
}
#[test]
fn test_sort_index_get_documents() -> crate::Result<()> {
// default baseline
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert!(searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.is_none());
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 3))?
.get_first(my_string_field)
.unwrap()
.as_str(),
Some("blublub")
);
}
// sort by field asc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
assert_eq!(
searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))?
.get_first(my_string_field)
.unwrap()
.as_str(),
Some("blublub")
);
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert!(doc.get_first(my_string_field).is_none());
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
{
let doc = searcher.doc::<TantivyDocument>(DocAddress::new(0, 4))?;
assert_eq!(
doc.get_first(my_string_field).unwrap().as_str(),
Some("blublub")
);
}
Ok(())
}
#[test]
fn test_sort_index_test_string_field() -> crate::Result<()> {
let index = create_test_index(None, get_text_options())?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![3]
);
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let reader = index.reader()?;
let searcher = reader.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![0]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
}
// sort by field desc
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
get_text_options(),
)?;
let my_string_field = index.schema().get_field("string_field").unwrap();
let searcher = index.reader()?.searcher();
let query = QueryParser::for_index(&index, vec![my_string_field]).parse_query("blublub")?;
let top_docs: Vec<(f32, DocAddress)> = searcher.search(&query, &TopDocs::with_limit(3))?;
assert_eq!(
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>(),
vec![4]
);
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = searcher
.segment_reader(0)
.get_fieldnorms_reader(my_text_field)?;
assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
assert_eq!(fieldnorm_reader.fieldnorm(3), 0);
assert_eq!(fieldnorm_reader.fieldnorm(4), 2); // some text
}
Ok(())
}
#[test]
fn test_sort_index_fast_field() -> crate::Result<()> {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "my_number".to_string(),
order: Order::Asc,
}),
..Default::default()
}),
get_text_options(),
)?;
assert_eq!(
index.settings().sort_by_field.as_ref().unwrap().field,
"my_number".to_string()
);
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields
.u64("my_number")
.unwrap()
.first_or_default_col(999);
assert_eq!(fast_field.get_val(0), 10u64);
assert_eq!(fast_field.get_val(1), 20u64);
assert_eq!(fast_field.get_val(2), 30u64);
let multifield = fast_fields.u64("multi_numbers").unwrap();
let vals: Vec<u64> = multifield.values_for_doc(0u32).collect();
assert_eq!(vals, &[] as &[u64]);
let vals: Vec<_> = multifield.values_for_doc(1u32).collect();
assert_eq!(vals, &[5, 6]);
let vals: Vec<_> = multifield.values_for_doc(2u32).collect();
assert_eq!(vals, &[3]);
Ok(())
}
#[test]
fn test_with_sort_by_date_field() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field("date", INDEXED | STORED | FAST);
let schema = schema_builder.build();
let settings = IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "date".to_string(),
order: Order::Desc,
}),
..Default::default()
};
let index = Index::builder()
.schema(schema)
.settings(settings)
.create_in_ram()?;
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1000),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(999),
))?;
index_writer.add_document(doc!(
date_field => DateTime::from_timestamp_secs(1001),
))?;
index_writer.commit()?;
let searcher = index.reader()?.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_reader(0);
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields
.date("date")
.unwrap()
.first_or_default_col(DateTime::from_timestamp_secs(0));
assert_eq!(fast_field.get_val(0), DateTime::from_timestamp_secs(1001));
assert_eq!(fast_field.get_val(1), DateTime::from_timestamp_secs(1000));
assert_eq!(fast_field.get_val(2), DateTime::from_timestamp_secs(999));
Ok(())
}
#[test]
fn test_doc_mapping() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![3, 2, 5]);
assert_eq!(doc_mapping.get_old_doc_id(0), 3);
assert_eq!(doc_mapping.get_old_doc_id(1), 2);
assert_eq!(doc_mapping.get_old_doc_id(2), 5);
assert_eq!(doc_mapping.get_new_doc_id(0), 0);
assert_eq!(doc_mapping.get_new_doc_id(1), 0);
assert_eq!(doc_mapping.get_new_doc_id(2), 1);
assert_eq!(doc_mapping.get_new_doc_id(3), 0);
assert_eq!(doc_mapping.get_new_doc_id(4), 0);
assert_eq!(doc_mapping.get_new_doc_id(5), 2);
}
#[test]
fn test_doc_mapping_remap() {
let doc_mapping = DocIdMapping::from_new_id_to_old_id(vec![2, 8, 3]);
assert_eq!(
&doc_mapping.remap(&[0, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000]),
&[2000, 8000, 3000]
);
}
#[test]
fn test_text_sort() -> crate::Result<()> {
let mut schema_builder = SchemaBuilder::new();
schema_builder.add_text_field("id", STRING | FAST | STORED);
schema_builder.add_text_field("name", TEXT | STORED);
let resp = IndexBuilder::new()
.schema(schema_builder.build())
.settings(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "id".to_string(),
order: Order::Asc,
}),
..Default::default()
})
.create_in_ram();
assert!(resp
.unwrap_err()
.to_string()
.contains("Unsupported field type"));
Ok(())
}
}

File diff suppressed because it is too large

View File

@@ -0,0 +1,147 @@
#[cfg(test)]
mod tests {
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions,
};
use crate::{DocAddress, DocSet, IndexSettings, IndexWriter, Term};
fn create_test_index(index_settings: Option<IndexSettings>) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests()?;
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?;
index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?;
index_writer.commit()?;
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
let in_val = 1u64;
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
index_writer.commit()?;
let int_vals = [10u64, 5];
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?;
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?;
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?;
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
Ok(index)
}
#[test]
fn test_merge_index() {
let index = create_test_index(Some(IndexSettings {
..Default::default()
}))
.unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
}

View File

@@ -1,8 +1,7 @@
use std::sync::Arc;
use columnar::{
ColumnType, ColumnValues, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder,
StackMergeOrder,
ColumnType, ColumnarReader, MergeRowOrder, RowAddr, ShuffleMergeOrder, StackMergeOrder,
};
use common::ReadOnlyBitSet;
use itertools::Itertools;
@@ -11,7 +10,7 @@ use measure_time::debug_time;
use crate::directory::WritePtr;
use crate::docset::{DocSet, TERMINATED};
use crate::error::DataCorruption;
use crate::fastfield::{AliveBitSet, FastFieldNotAvailableError};
use crate::fastfield::AliveBitSet;
use crate::fieldnorm::{FieldNormReader, FieldNormReaders, FieldNormsSerializer, FieldNormsWriter};
use crate::index::{Segment, SegmentComponent, SegmentReader};
use crate::indexer::doc_id_mapping::{MappingType, SegmentDocIdMapping};
@@ -20,9 +19,7 @@ use crate::postings::{InvertedIndexSerializer, Postings, SegmentPostings};
use crate::schema::{value_type_to_column_type, Field, FieldType, Schema};
use crate::store::StoreWriter;
use crate::termdict::{TermMerger, TermOrdinal};
use crate::{
DocAddress, DocId, IndexSettings, IndexSortByField, InvertedIndexReader, Order, SegmentOrdinal,
};
use crate::{DocAddress, DocId, InvertedIndexReader};
/// Segment's max doc must be `< MAX_DOC_LIMIT`.
///
@@ -80,7 +77,6 @@ fn estimate_total_num_tokens(readers: &[SegmentReader], field: Field) -> crate::
}
pub struct IndexMerger {
index_settings: IndexSettings,
schema: Schema,
pub(crate) readers: Vec<SegmentReader>,
max_doc: u32,
@@ -116,7 +112,7 @@ fn convert_to_merge_order(
) -> MergeRowOrder {
match doc_id_mapping.mapping_type() {
MappingType::Stacked => MergeRowOrder::Stack(StackMergeOrder::stack(columnars)),
MappingType::StackedWithDeletes | MappingType::Shuffled => {
MappingType::StackedWithDeletes => {
// RUST/LLVM is amazing. The following conversion is actually a no-op:
// no allocation, no copy.
let new_row_id_to_old_row_id: Vec<RowAddr> = doc_id_mapping
@@ -149,13 +145,9 @@ fn extract_fast_field_required_columns(schema: &Schema) -> Vec<(String, ColumnTy
}
impl IndexMerger {
pub fn open(
schema: Schema,
index_settings: IndexSettings,
segments: &[Segment],
) -> crate::Result<IndexMerger> {
pub fn open(schema: Schema, segments: &[Segment]) -> crate::Result<IndexMerger> {
let alive_bitset = segments.iter().map(|_| None).collect_vec();
Self::open_with_custom_alive_set(schema, index_settings, segments, alive_bitset)
Self::open_with_custom_alive_set(schema, segments, alive_bitset)
}
// Create merge with a custom delete set.
@@ -172,7 +164,6 @@ impl IndexMerger {
// segments and partitions them e.g. by a value in a field.
pub fn open_with_custom_alive_set(
schema: Schema,
index_settings: IndexSettings,
segments: &[Segment],
alive_bitset_opt: Vec<Option<AliveBitSet>>,
) -> crate::Result<IndexMerger> {
@@ -186,9 +177,6 @@ impl IndexMerger {
}
let max_doc = readers.iter().map(|reader| reader.num_docs()).sum();
if let Some(sort_by_field) = index_settings.sort_by_field.as_ref() {
readers = Self::sort_readers_by_min_sort_field(readers, sort_by_field)?;
}
// sort segments by their natural sort setting
if max_doc >= MAX_DOC_LIMIT {
let err_msg = format!(
@@ -198,37 +186,12 @@ impl IndexMerger {
return Err(crate::TantivyError::InvalidArgument(err_msg));
}
Ok(IndexMerger {
index_settings,
schema,
readers,
max_doc,
})
}
fn sort_readers_by_min_sort_field(
readers: Vec<SegmentReader>,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<SegmentReader>> {
// presort the readers by their min_values, so that when they are disjunct, we can use
// the regular merge logic (implicitly sorted)
let mut readers_with_min_sort_values = readers
.into_iter()
.map(|reader| {
let accessor = Self::get_sort_field_accessor(&reader, sort_by_field)?;
Ok((reader, accessor.min_value()))
})
.collect::<crate::Result<Vec<_>>>()?;
if sort_by_field.order.is_asc() {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| *min_val);
} else {
readers_with_min_sort_values.sort_by_key(|(_, min_val)| std::cmp::Reverse(*min_val));
}
Ok(readers_with_min_sort_values
.into_iter()
.map(|(reader, _)| reader)
.collect())
}
fn write_fieldnorms(
&self,
mut fieldnorms_serializer: FieldNormsSerializer,
@@ -276,128 +239,6 @@ impl IndexMerger {
Ok(())
}
/// Checks if the readers are disjunct for their sort property and in the correct order to be
/// able to just stack them.
pub(crate) fn is_disjunct_and_sorted_on_sort_property(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<bool> {
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
let everything_is_in_order = reader_ordinal_and_field_accessors
.into_iter()
.map(|(_, col)| Arc::new(col))
.tuple_windows()
.all(|(field_accessor1, field_accessor2)| {
if sort_by_field.order.is_asc() {
field_accessor1.max_value() <= field_accessor2.min_value()
} else {
field_accessor1.min_value() >= field_accessor2.max_value()
}
});
Ok(everything_is_in_order)
}
pub(crate) fn get_sort_field_accessor(
reader: &SegmentReader,
sort_by_field: &IndexSortByField,
) -> crate::Result<Arc<dyn ColumnValues>> {
reader.schema().get_field(&sort_by_field.field)?;
let (value_accessor, _column_type) = reader
.fast_fields()
.u64_lenient(&sort_by_field.field)?
.ok_or_else(|| FastFieldNotAvailableError {
field_name: sort_by_field.field.to_string(),
})?;
Ok(value_accessor.first_or_default_col(0u64))
}
/// Collecting value_accessors into a vec to bind the lifetime.
pub(crate) fn get_reader_with_sort_field_accessor(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<Vec<(SegmentOrdinal, Arc<dyn ColumnValues>)>> {
let reader_ordinal_and_field_accessors = self
.readers
.iter()
.enumerate()
.map(|(reader_ordinal, _)| reader_ordinal as SegmentOrdinal)
.map(|reader_ordinal: SegmentOrdinal| {
let value_accessor = Self::get_sort_field_accessor(
&self.readers[reader_ordinal as usize],
sort_by_field,
)?;
Ok((reader_ordinal, value_accessor))
})
.collect::<crate::Result<Vec<_>>>()?;
Ok(reader_ordinal_and_field_accessors)
}
/// Generates the doc_id mapping, where the position in the vec is the new
/// doc_id.
/// ReaderWithOrdinal will include the ordinal position of the
/// reader in self.readers.
pub(crate) fn generate_doc_id_mapping_with_sort_by_field(
&self,
sort_by_field: &IndexSortByField,
) -> crate::Result<SegmentDocIdMapping> {
let reader_ordinal_and_field_accessors =
self.get_reader_with_sort_field_accessor(sort_by_field)?;
// Loading the field accessor on demand causes a 15x regression
// create iterators over segment/sort_accessor/doc_id tuple
let doc_id_reader_pair =
reader_ordinal_and_field_accessors
.iter()
.map(|(reader_ord, ff_reader)| {
let reader = &self.readers[*reader_ord as usize];
reader
.doc_ids_alive()
.map(move |doc_id| (doc_id, reader_ord, ff_reader))
});
let total_num_new_docs = self
.readers
.iter()
.map(|reader| reader.num_docs() as usize)
.sum();
let mut sorted_doc_ids: Vec<DocAddress> = Vec::with_capacity(total_num_new_docs);
// create iterator tuple of (old doc_id, reader) in order of the new doc_ids
sorted_doc_ids.extend(
doc_id_reader_pair
.into_iter()
.kmerge_by(|a, b| {
let val1 = a.2.get_val(a.0);
let val2 = b.2.get_val(b.0);
if sort_by_field.order == Order::Asc {
val1 < val2
} else {
val1 > val2
}
})
.map(|(doc_id, &segment_ord, _)| DocAddress {
doc_id,
segment_ord,
}),
);
let alive_bitsets: Vec<Option<ReadOnlyBitSet>> = self
.readers
.iter()
.map(|segment_reader| {
let alive_bitset = segment_reader.alive_bitset()?;
Some(alive_bitset.bitset().clone())
})
.collect();
Ok(SegmentDocIdMapping::new(
sorted_doc_ids,
MappingType::Shuffled,
alive_bitsets,
))
}
/// Creates a mapping if the segments are stacked. This is helpful to share the code path
/// between index sorting and the other cases.
pub(crate) fn get_doc_id_from_concatenated_data(&self) -> crate::Result<SegmentDocIdMapping> {
@@ -515,7 +356,6 @@ impl IndexMerger {
);
let mut segment_postings_containing_the_term: Vec<(usize, SegmentPostings)> = vec![];
let mut doc_id_and_positions = vec![];
while merged_terms.advance() {
segment_postings_containing_the_term.clear();
@@ -611,37 +451,13 @@ impl IndexMerger {
0u32
};
// if doc_id_mapping exists, the doc_ids are reordered; they are
// not just stacked. The field serializer expects monotonically increasing
// doc_ids, so we collect and sort them first, before writing.
//
// I think this is not strictly necessary, it would be possible to
// avoid the loading into a vec via some form of kmerge, but then the merge
// logic would deviate much more from the stacking case (unsorted index)
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.push((
remapped_doc_id,
term_freq,
positions_buffer.to_vec(),
));
} else {
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
let delta_positions = delta_computer.compute_delta(&positions_buffer);
field_serializer.write_doc(remapped_doc_id, term_freq, delta_positions);
}
doc = segment_postings.advance();
}
}
if !doc_id_mapping.is_trivial() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _, _)| doc_id);
for (doc_id, term_freq, positions) in &doc_id_and_positions {
let delta_positions = delta_computer.compute_delta(positions);
field_serializer.write_doc(*doc_id, *term_freq, delta_positions);
}
doc_id_and_positions.clear();
}
// closing the term.
field_serializer.close_term()?;
}
@@ -670,47 +486,13 @@ impl IndexMerger {
Ok(())
}
fn write_storable_fields(
&self,
store_writer: &mut StoreWriter,
doc_id_mapping: &SegmentDocIdMapping,
) -> crate::Result<()> {
fn write_storable_fields(&self, store_writer: &mut StoreWriter) -> crate::Result<()> {
debug_time!("write-storable-fields");
debug!("write-storable-field");
if !doc_id_mapping.is_trivial() {
debug!("non-trivial-doc-id-mapping");
let store_readers: Vec<_> = self
.readers
.iter()
.map(|reader| reader.get_store_reader(50))
.collect::<Result<_, _>>()?;
let mut document_iterators: Vec<_> = store_readers
.iter()
.enumerate()
.map(|(i, store)| store.iter_raw(self.readers[i].alive_bitset()))
.collect();
for old_doc_addr in doc_id_mapping.iter_old_doc_addrs() {
let doc_bytes_it = &mut document_iterators[old_doc_addr.segment_ord as usize];
if let Some(doc_bytes_res) = doc_bytes_it.next() {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
} else {
return Err(DataCorruption::comment_only(format!(
"unexpected missing document in docstore on merge, doc address \
{old_doc_addr:?}",
))
.into());
}
}
} else {
debug!("trivial-doc-id-mapping");
for reader in &self.readers {
let store_reader = reader.get_store_reader(1)?;
if reader.has_deletes()
for reader in &self.readers {
let store_reader = reader.get_store_reader(1)?;
if reader.has_deletes()
// If there is not enough data in the store, we avoid stacking in order to
// avoid creating many small blocks in the doc store. Once we have 5 full blocks,
// we start stacking. In the worst case 2/7 of the blocks would be very small.
@@ -726,14 +508,13 @@ impl IndexMerger {
// take 7 in order to not walk over all checkpoints.
|| store_reader.block_checkpoints().take(7).count() < 6
|| store_reader.decompressor() != store_writer.compressor().into()
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(store_reader)?;
{
for doc_bytes_res in store_reader.iter_raw(reader.alive_bitset()) {
let doc_bytes = doc_bytes_res?;
store_writer.store_bytes(&doc_bytes)?;
}
} else {
store_writer.stack(store_reader)?;
}
}
Ok(())
@@ -745,18 +526,7 @@ impl IndexMerger {
/// # Returns
/// The number of documents in the resulting segment.
pub fn write(&self, mut serializer: SegmentSerializer) -> crate::Result<u32> {
let doc_id_mapping = if let Some(sort_by_field) = self.index_settings.sort_by_field.as_ref()
{
// If the documents are already sorted and stackable, we ignore the mapping and execute
// it as if there was no sorting
if self.is_disjunct_and_sorted_on_sort_property(sort_by_field)? {
self.get_doc_id_from_concatenated_data()?
} else {
self.generate_doc_id_mapping_with_sort_by_field(sort_by_field)?
}
} else {
self.get_doc_id_from_concatenated_data()?
};
let doc_id_mapping = self.get_doc_id_from_concatenated_data()?;
debug!("write-fieldnorms");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
self.write_fieldnorms(fieldnorms_serializer, &doc_id_mapping)?;
@@ -773,7 +543,7 @@ impl IndexMerger {
)?;
debug!("write-storagefields");
self.write_storable_fields(serializer.get_store_writer(), &doc_id_mapping)?;
self.write_storable_fields(serializer.get_store_writer())?;
debug!("write-fastfields");
self.write_fast_fields(serializer.get_fast_field_write(), doc_id_mapping)?;
@@ -787,6 +557,8 @@ impl IndexMerger {
mod tests {
use columnar::Column;
use proptest::prop_oneof;
use proptest::strategy::Strategy;
use schema::FAST;
use crate::collector::tests::{
@@ -794,6 +566,7 @@ mod tests {
};
use crate::collector::{Count, FacetCollector};
use crate::index::{Index, SegmentId};
use crate::indexer::NoMergePolicy;
use crate::query::{AllQuery, BooleanQuery, EnableScoring, Scorer, TermQuery};
use crate::schema::{
Facet, FacetOptions, IndexRecordOption, NumericOptions, TantivyDocument, Term,
@@ -802,7 +575,7 @@ mod tests {
use crate::time::OffsetDateTime;
use crate::{
assert_nearly_equals, schema, DateTime, DocAddress, DocId, DocSet, IndexSettings,
IndexSortByField, IndexWriter, Order, Searcher,
IndexWriter, Searcher,
};
#[test]
@@ -1275,60 +1048,6 @@ mod tests {
test_merge_facets(None, true)
}
#[test]
fn test_merge_facets_sort_asc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data
// is sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
#[test]
fn test_merge_facets_sort_desc() {
// In the merge case this will go through the doc_id mapping code
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
true,
);
// In the merge case this will not go through the doc_id mapping code, because the data
// is sorted and disjunct
test_merge_facets(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
false,
);
}
// force_segment_value_overlap forces the int value used for sorting to have overlapping min and
// max ranges between segments, so that the merge algorithm can't apply certain optimizations
fn test_merge_facets(index_settings: Option<IndexSettings>, force_segment_value_overlap: bool) {
@@ -1531,6 +1250,112 @@ mod tests {
Ok(())
}
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
enum IndexingOp {
ZeroVal,
OneVal { val: u64 },
TwoVal { val: u64 },
Commit,
}
fn balanced_operation_strategy() -> impl Strategy<Value = IndexingOp> {
prop_oneof![
(0u64..1u64).prop_map(|_| IndexingOp::ZeroVal),
(0u64..1u64).prop_map(|val| IndexingOp::OneVal { val }),
(0u64..1u64).prop_map(|val| IndexingOp::TwoVal { val }),
(0u64..1u64).prop_map(|_| IndexingOp::Commit),
]
}
use proptest::prelude::*;
proptest! {
#[test]
fn test_merge_columnar_int_proptest(ops in proptest::collection::vec(balanced_operation_strategy(), 1..20)) {
assert!(test_merge_int_fields(&ops[..]).is_ok());
}
}
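// What the property above checks, in short: `test_merge_int_fields` (below)
// applies an arbitrary sequence of indexing ops, merges all resulting
// segments, and asserts that the merged "intvals" fast-field column returns,
// per document and in doc-id order, exactly the values that were indexed.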
fn test_merge_int_fields(ops: &[IndexingOp]) -> crate::Result<()> {
if ops.iter().all(|op| *op == IndexingOp::Commit) {
return Ok(());
}
let expected_doc_and_vals: Vec<(u32, Vec<u64>)> = ops
.iter()
.filter(|op| *op != &IndexingOp::Commit)
.map(|op| match op {
IndexingOp::ZeroVal => vec![],
IndexingOp::OneVal { val } => vec![*val],
IndexingOp::TwoVal { val } => vec![*val, *val],
IndexingOp::Commit => unreachable!(),
})
.enumerate()
.map(|(id, val)| (id as u32, val))
.collect();
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intvals", int_options);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_for_tests()?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
let index_doc = |index_writer: &mut IndexWriter, int_vals: &[u64]| {
let mut doc = TantivyDocument::default();
for &val in int_vals {
doc.add_u64(int_field, val);
}
index_writer.add_document(doc).unwrap();
};
for op in ops {
match op {
IndexingOp::ZeroVal => index_doc(&mut index_writer, &[]),
IndexingOp::OneVal { val } => index_doc(&mut index_writer, &[*val]),
IndexingOp::TwoVal { val } => index_doc(&mut index_writer, &[*val, *val]),
IndexingOp::Commit => {
index_writer.commit().expect("commit failed");
}
}
}
index_writer.commit().expect("commit failed");
}
{
let mut segment_ids = index.searchable_segment_ids()?;
segment_ids.sort();
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
let reader = index.reader()?;
reader.reload()?;
let mut vals: Vec<u64> = Vec::new();
let mut test_vals = move |col: &Column<u64>, doc: DocId, expected: &[u64]| {
vals.clear();
vals.extend(col.values_for_doc(doc));
assert_eq!(&vals[..], expected);
};
let mut test_col = move |col: &Column<u64>, column_expected: &[(u32, Vec<u64>)]| {
for (doc_id, vals) in column_expected.iter() {
test_vals(col, *doc_id, vals);
}
};
{
let searcher = reader.searcher();
let segment = searcher.segment_reader(0u32);
let col = segment
.fast_fields()
.column_opt::<u64>("intvals")
.unwrap()
.unwrap();
test_col(&col, &expected_doc_and_vals);
}
Ok(())
}
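// Usage note (assuming proptest's standard environment override): the number
// of generated operation sequences can be raised when running locally, e.g.
//     PROPTEST_CASES=1024 cargo test test_merge_columnar_int_proptest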
#[test]
fn test_merge_multivalued_int_fields_simple() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();

View File

@@ -1,579 +0,0 @@
#[cfg(test)]
mod tests {
use crate::collector::TopDocs;
use crate::fastfield::AliveBitSet;
use crate::index::Index;
use crate::postings::Postings;
use crate::query::QueryParser;
use crate::schema::{
self, BytesOptions, Facet, FacetOptions, IndexRecordOption, NumericOptions,
TextFieldIndexing, TextOptions, Value,
};
use crate::{
DocAddress, DocSet, IndexSettings, IndexSortByField, IndexWriter, Order, TantivyDocument,
Term,
};
fn create_test_index_posting_list_issue(index_settings: Option<IndexSettings>) -> Index {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(int_field=>3_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer
.add_document(doc!(int_field=>6_u64, facet_field=> Facet::from("/crime")))
.unwrap();
index_writer.commit().unwrap();
index_writer
.add_document(doc!(int_field=>5_u64, facet_field=> Facet::from("/fanta")))
.unwrap();
index_writer.commit().unwrap();
}
// Merging the segments
{
let segment_ids = index
.searchable_segment_ids()
.expect("Searchable segments failed.");
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
assert!(index_writer.merge(&segment_ids).wait().is_ok());
assert!(index_writer.wait_merging_threads().is_ok());
}
index
}
// force_disjunct_segment_sort_values forces the field by which the index is sorted to have
// disjunct ranges between segments, e.g. values in segments [1-3] [10-20] [50-500]
fn create_test_index(
index_settings: Option<IndexSettings>,
force_disjunct_segment_sort_values: bool,
) -> crate::Result<Index> {
let mut schema_builder = schema::Schema::builder();
let int_options = NumericOptions::default()
.set_fast()
.set_stored()
.set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let bytes_options = BytesOptions::default().set_fast().set_indexed();
let bytes_field = schema_builder.add_bytes_field("bytes", bytes_options);
let facet_field = schema_builder.add_facet_field("facet", FacetOptions::default());
let multi_numbers =
schema_builder.add_u64_field("multi_numbers", NumericOptions::default().set_fast());
let text_field_options = TextOptions::default()
.set_indexing_options(
TextFieldIndexing::default()
.set_index_option(schema::IndexRecordOption::WithFreqsAndPositions),
)
.set_stored();
let text_field = schema_builder.add_text_field("text_field", text_field_options);
let schema = schema_builder.build();
let mut index_builder = Index::builder().schema(schema);
if let Some(settings) = index_settings {
index_builder = index_builder.settings(settings);
}
let index = index_builder.create_in_ram()?;
{
let mut index_writer = index.writer_for_tests()?;
// segment 1 - range 1-3
index_writer.add_document(doc!(int_field=>1_u64))?;
index_writer.add_document(
doc!(int_field=>3_u64, multi_numbers => 3_u64, multi_numbers => 4_u64, bytes_field => vec![1, 2, 3], text_field => "some text", facet_field=> Facet::from("/book/crime")),
)?;
index_writer.add_document(
doc!(int_field=>1_u64, text_field=> "deleteme", text_field => "ok text more text"),
)?;
index_writer.add_document(
doc!(int_field=>2_u64, multi_numbers => 2_u64, multi_numbers => 3_u64, text_field => "ok text more text"),
)?;
index_writer.commit()?;
// segment 2 - range 1-20 , with force_disjunct_segment_sort_values 10-20
index_writer.add_document(doc!(int_field=>20_u64, multi_numbers => 20_u64))?;
let in_val = if force_disjunct_segment_sort_values {
10_u64
} else {
1
};
index_writer.add_document(doc!(int_field=>in_val, text_field=> "deleteme" , text_field => "ok text more text", facet_field=> Facet::from("/book/crime")))?;
index_writer.commit()?;
// segment 3 - range 5-1000, with force_disjunct_segment_sort_values 50-1000
let int_vals = if force_disjunct_segment_sort_values {
[100_u64, 50]
} else {
[10, 5]
};
index_writer.add_document( // position of this doc after delete in desc sorting = [2], in disjunct case [1]
doc!(int_field=>int_vals[0], multi_numbers => 10_u64, multi_numbers => 11_u64, text_field=> "blubber", facet_field=> Facet::from("/book/fantasy")),
)?;
index_writer.add_document(doc!(int_field=>int_vals[1], text_field=> "deleteme"))?;
index_writer.add_document(
doc!(int_field=>1_000u64, multi_numbers => 1001_u64, multi_numbers => 1002_u64, bytes_field => vec![5, 5],text_field => "the biggest num")
)?;
index_writer.delete_term(Term::from_field_text(text_field, "deleteme"));
index_writer.commit()?;
}
// Merging the segments
{
let segment_ids = index.searchable_segment_ids()?;
let mut index_writer: IndexWriter = index.writer_for_tests()?;
index_writer.merge(&segment_ids).wait()?;
index_writer.wait_merging_threads()?;
}
Ok(index)
}
#[test]
fn test_merge_sorted_postinglist_sort_issue() {
create_test_index_posting_list_issue(Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}));
}
#[test]
fn test_merge_sorted_index_desc_not_disjunct() {
test_merge_sorted_index_desc_(false);
}
#[test]
fn test_merge_sorted_index_desc_disjunct() {
test_merge_sorted_index_desc_(true);
}
fn test_merge_sorted_index_desc_(force_disjunct_segment_sort_values: bool) {
let index = create_test_index(
Some(IndexSettings {
sort_by_field: Some(IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
}),
..Default::default()
}),
force_disjunct_segment_sort_values,
)
.unwrap();
let int_field = index.schema().get_field("intval").unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let fast_fields = segment_reader.fast_fields();
let fast_field = fast_fields.u64("intval").unwrap();
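// The merged segment is sorted by `intval` in descending order, so doc 0 holds
// the largest value and higher doc ids hold smaller values.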
assert_eq!(fast_field.first(5), Some(1u64));
assert_eq!(fast_field.first(4), Some(2u64));
assert_eq!(fast_field.first(3), Some(3u64));
if force_disjunct_segment_sort_values {
assert_eq!(fast_field.first(2), Some(20u64));
assert_eq!(fast_field.first(1), Some(100u64));
} else {
assert_eq!(fast_field.first(2), Some(10u64));
assert_eq!(fast_field.first(1), Some(20u64));
}
assert_eq!(fast_field.first(0), Some(1_000u64));
// test new field norm mapping
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
assert_eq!(fieldnorm_reader.fieldnorm(0), 3); // the biggest num
if force_disjunct_segment_sort_values {
assert_eq!(fieldnorm_reader.fieldnorm(1), 1); // blubber
assert_eq!(fieldnorm_reader.fieldnorm(2), 0);
} else {
assert_eq!(fieldnorm_reader.fieldnorm(1), 0);
assert_eq!(fieldnorm_reader.fieldnorm(2), 1); // blubber
}
assert_eq!(fieldnorm_reader.fieldnorm(3), 2); // some text
assert_eq!(fieldnorm_reader.fieldnorm(5), 0);
}
let my_text_field = index.schema().get_field("text_field").unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![3]);
if force_disjunct_segment_sort_values {
assert_eq!(do_search("blubber"), vec![1]);
} else {
assert_eq!(do_search("blubber"), vec![2]);
}
assert_eq!(do_search("biggest"), vec![0]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
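// The fallback bitset (doc 0 deleted out of 100 docs) is only used if the
// merged segment carries no alive bitset of its own.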
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
// access doc store
{
let blubber_pos = if force_disjunct_segment_sort_values {
1
} else {
2
};
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, blubber_pos))
.unwrap();
assert_eq!(
doc.get_first(my_text_field).unwrap().as_value().as_str(),
Some("blubber")
);
let doc = searcher
.doc::<TantivyDocument>(DocAddress::new(0, 0))
.unwrap();
assert_eq!(
doc.get_first(int_field).unwrap().as_value().as_u64(),
Some(1000)
);
}
}
#[test]
fn test_merge_unsorted_index() {
let index = create_test_index(
Some(IndexSettings {
..Default::default()
}),
false,
)
.unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
assert_eq!(searcher.segment_readers().len(), 1);
let segment_reader = searcher.segment_readers().last().unwrap();
let searcher = index.reader().unwrap().searcher();
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let do_search = |term: &str| {
let query = QueryParser::for_index(&index, vec![my_text_field])
.parse_query(term)
.unwrap();
let top_docs: Vec<(f32, DocAddress)> =
searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
};
assert_eq!(do_search("some"), vec![1]);
assert_eq!(do_search("blubber"), vec![3]);
assert_eq!(do_search("biggest"), vec![4]);
}
// postings file
{
let my_text_field = index.schema().get_field("text_field").unwrap();
let term_a = Term::from_field_text(my_text_field, "text");
let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
let mut postings = inverted_index
.read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
.unwrap()
.unwrap();
assert_eq!(postings.doc_freq(), 2);
let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
assert_eq!(
postings.doc_freq_given_deletes(
segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
),
2
);
assert_eq!(postings.term_freq(), 1);
let mut output = vec![];
postings.positions(&mut output);
assert_eq!(output, vec![1]);
postings.advance();
assert_eq!(postings.term_freq(), 2);
postings.positions(&mut output);
assert_eq!(output, vec![1, 3]);
}
}
// #[test]
// fn test_merge_sorted_index_asc() {
// let index = create_test_index(
// Some(IndexSettings {
// sort_by_field: Some(IndexSortByField {
// field: "intval".to_string(),
// order: Order::Asc,
// }),
// ..Default::default()
// }),
// false,
// )
// .unwrap();
// let int_field = index.schema().get_field("intval").unwrap();
// let multi_numbers = index.schema().get_field("multi_numbers").unwrap();
// let bytes_field = index.schema().get_field("bytes").unwrap();
// let reader = index.reader().unwrap();
// let searcher = reader.searcher();
// assert_eq!(searcher.segment_readers().len(), 1);
// let segment_reader = searcher.segment_readers().last().unwrap();
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64(int_field).unwrap();
// assert_eq!(fast_field.get_val(0), 1u64);
// assert_eq!(fast_field.get_val(1), 2u64);
// assert_eq!(fast_field.get_val(2), 3u64);
// assert_eq!(fast_field.get_val(3), 10u64);
// assert_eq!(fast_field.get_val(4), 20u64);
// assert_eq!(fast_field.get_val(5), 1_000u64);
// let get_vals = |fast_field: &MultiValuedFastFieldReader<u64>, doc_id: u32| -> Vec<u64> {
// let mut vals = vec![];
// fast_field.get_vals(doc_id, &mut vals);
// vals
// };
// let fast_fields = segment_reader.fast_fields();
// let fast_field = fast_fields.u64s(multi_numbers).unwrap();
// assert_eq!(&get_vals(&fast_field, 0), &[] as &[u64]);
// assert_eq!(&get_vals(&fast_field, 1), &[2, 3]);
// assert_eq!(&get_vals(&fast_field, 2), &[3, 4]);
// assert_eq!(&get_vals(&fast_field, 3), &[10, 11]);
// assert_eq!(&get_vals(&fast_field, 4), &[20]);
// assert_eq!(&get_vals(&fast_field, 5), &[1001, 1002]);
// let fast_field = fast_fields.bytes(bytes_field).unwrap();
// assert_eq!(fast_field.get_bytes(0), &[] as &[u8]);
// assert_eq!(fast_field.get_bytes(2), &[1, 2, 3]);
// assert_eq!(fast_field.get_bytes(5), &[5, 5]);
// // test new field norm mapping
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let fieldnorm_reader = segment_reader.get_fieldnorms_reader(my_text_field).unwrap();
// assert_eq!(fieldnorm_reader.fieldnorm(0), 0);
// assert_eq!(fieldnorm_reader.fieldnorm(1), 4);
// assert_eq!(fieldnorm_reader.fieldnorm(2), 2); // some text
// assert_eq!(fieldnorm_reader.fieldnorm(3), 1);
// assert_eq!(fieldnorm_reader.fieldnorm(5), 3); // the biggest num
// }
// let searcher = index.reader().unwrap().searcher();
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let do_search = |term: &str| {
// let query = QueryParser::for_index(&index, vec![my_text_field])
// .parse_query(term)
// .unwrap();
// let top_docs: Vec<(f32, DocAddress)> =
// searcher.search(&query, &TopDocs::with_limit(3)).unwrap();
// top_docs.iter().map(|el| el.1.doc_id).collect::<Vec<_>>()
// };
// assert_eq!(do_search("some"), vec![2]);
// assert_eq!(do_search("blubber"), vec![3]);
// assert_eq!(do_search("biggest"), vec![5]);
// }
// // postings file
// {
// let my_text_field = index.schema().get_field("text_field").unwrap();
// let term_a = Term::from_field_text(my_text_field, "text");
// let inverted_index = segment_reader.inverted_index(my_text_field).unwrap();
// let mut postings = inverted_index
// .read_postings(&term_a, IndexRecordOption::WithFreqsAndPositions)
// .unwrap()
// .unwrap();
// assert_eq!(postings.doc_freq(), 2);
// let fallback_bitset = AliveBitSet::for_test_from_deleted_docs(&[0], 100);
// assert_eq!(
// postings.doc_freq_given_deletes(
// segment_reader.alive_bitset().unwrap_or(&fallback_bitset)
// ),
// 2
// );
// let mut output = vec![];
// postings.positions(&mut output);
// assert_eq!(output, vec![1, 3]);
// postings.advance();
// postings.positions(&mut output);
// assert_eq!(output, vec![1]);
// }
// // access doc store
// {
// let doc = searcher.doc(DocAddress::new(0, 0)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1));
// let doc = searcher.doc(DocAddress::new(0, 1)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(2));
// let doc = searcher.doc(DocAddress::new(0, 2)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(3));
// let doc = searcher.doc(DocAddress::new(0, 3)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(10));
// let doc = searcher.doc(DocAddress::new(0, 4)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(20));
// let doc = searcher.doc(DocAddress::new(0, 5)).unwrap();
// assert_eq!(doc.get_first(int_field).unwrap().as_u64(), Some(1_000));
// }
// }
}
#[cfg(all(test, feature = "unstable"))]
mod bench_sorted_index_merge {
use test::{self, Bencher};
use crate::index::Index;
use crate::indexer::merger::IndexMerger;
use crate::schema::{NumericOptions, Schema};
use crate::{IndexSettings, IndexSortByField, IndexWriter, Order};
fn create_index(sort_by_field: Option<IndexSortByField>) -> Index {
let mut schema_builder = Schema::builder();
let int_options = NumericOptions::default().set_fast().set_indexed();
let int_field = schema_builder.add_u64_field("intval", int_options);
let schema = schema_builder.build();
let index_builder = Index::builder().schema(schema).settings(IndexSettings {
sort_by_field,
..Default::default()
});
let index = index_builder.create_in_ram().unwrap();
{
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
let index_doc = |index_writer: &mut IndexWriter, val: u64| {
index_writer.add_document(doc!(int_field=>val)).unwrap();
};
// 3 segments, each with 10_000 values in the fast field
for _ in 0..3 {
index_doc(&mut index_writer, 5000); // out-of-order value so the segment is not pre-sorted
for i in 0..10_000 {
index_doc(&mut index_writer, i);
}
index_writer.commit().unwrap();
}
}
index
}
// #[bench]
// fn create_sorted_index_walk_over_kmerge_on_merge_fastfield(
// b: &mut Bencher,
// ) -> crate::Result<()> {
// let sort_by_field = IndexSortByField {
// field: "intval".to_string(),
// order: Order::Desc,
// };
// let index = create_index(Some(sort_by_field.clone()));
// let segments = index.searchable_segments().unwrap();
// let merger: IndexMerger =
// IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
// let doc_id_mapping = merger.generate_doc_id_mapping(&sort_by_field).unwrap();
// b.iter(|| {
// let sorted_doc_ids = doc_id_mapping.iter_old_doc_addrs().map(|doc_addr| {
// let reader = &merger.readers[doc_addr.segment_ord as usize];
// let u64_reader: Arc<dyn Column<u64>> = reader
// .fast_fields()
// .typed_fast_field_reader("intval")
// .expect(
// "Failed to find a reader for single fast field. This is a tantivy bug and \
// it should never happen.",
// );
// (doc_addr.doc_id, reader, u64_reader)
// });
// // add values in order of the new doc_ids
// let mut val = 0;
// for (doc_id, _reader, field_reader) in sorted_doc_ids {
// val = field_reader.get_val(doc_id);
// }
// val
// });
// Ok(())
// }
#[bench]
fn create_sorted_index_create_doc_id_mapping(b: &mut Bencher) -> crate::Result<()> {
let sort_by_field = IndexSortByField {
field: "intval".to_string(),
order: Order::Desc,
};
let index = create_index(Some(sort_by_field.clone()));
// let field = index.schema().get_field("intval").unwrap();
let segments = index.searchable_segments().unwrap();
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
b.iter(|| {
merger
.generate_doc_id_mapping_with_sort_by_field(&sort_by_field)
.unwrap();
});
Ok(())
}
}

View File

@@ -13,10 +13,10 @@ mod flat_map_with_buffer;
pub(crate) mod index_writer;
pub(crate) mod index_writer_status;
mod log_merge_policy;
mod merge_index_test;
mod merge_operation;
pub(crate) mod merge_policy;
pub(crate) mod merger;
mod merger_sorted_index_test;
pub(crate) mod operation;
pub(crate) mod prepared_commit;
mod segment_entry;
@@ -145,15 +145,27 @@ mod tests_mmap {
}
}
#[test]
fn test_json_field_null_byte() {
// Test when field name contains a zero byte, which has special meaning in tantivy.
// As a workaround, we convert the zero byte to the ASCII character '0'.
// https://github.com/quickwit-oss/tantivy/issues/2340
// https://github.com/quickwit-oss/tantivy/issues/2193
let field_name_in = "\u{0000}";
let field_name_out = "0";
test_json_field_name(field_name_in, field_name_out);
fn test_json_field_null_byte_is_ignored() {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inv_indexer = segment_reader.inverted_index(field).unwrap();
let term_dict = inv_indexer.terms();
assert_eq!(term_dict.num_terms(), 1);
let mut term_bytes = Vec::new();
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
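// The single remaining term encodes the JSON path "key", the JSON_END_OF_PATH
// byte (\0), the type code b's' (Str) and the value "test1"; the null-byte key
// was dropped.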
assert_eq!(term_bytes, b"key\0stest1");
}
#[test]
fn test_json_field_1byte() {
// Test when field name contains a '1' byte, which has special meaning in tantivy.
@@ -291,7 +303,7 @@ mod tests_mmap {
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(field_name_out_internal.to_string(), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();

View File

@@ -38,7 +38,8 @@ impl PathToUnorderedId {
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
self.map.insert(path.to_string(), next_id);
let new_path = path.to_string();
self.map.insert(new_path, next_id);
next_id
}

View File

@@ -18,27 +18,9 @@ pub struct SegmentSerializer {
impl SegmentSerializer {
/// Creates a new `SegmentSerializer`.
pub fn for_segment(
mut segment: Segment,
is_in_merge: bool,
) -> crate::Result<SegmentSerializer> {
// If the segment is going to be sorted, we stream the docs first to a temporary file.
// In the merge case this is not necessary because we can kmerge the already sorted
// segments
let remapping_required = segment.index().settings().sort_by_field.is_some() && !is_in_merge;
pub fn for_segment(mut segment: Segment) -> crate::Result<SegmentSerializer> {
let settings = segment.index().settings().clone();
let store_writer = if remapping_required {
let store_write = segment.open_write(SegmentComponent::TempStore)?;
StoreWriter::new(
store_write,
crate::store::Compressor::None,
// We want fast random access on the docs, so we choose a small block size.
// If this is zero, the skip index will contain too many checkpoints and
// therefore will be relatively slow.
16000,
settings.docstore_compress_dedicated_thread,
)?
} else {
let store_writer = {
let store_write = segment.open_write(SegmentComponent::Store)?;
StoreWriter::new(
store_write,
@@ -72,10 +54,6 @@ impl SegmentSerializer {
&self.segment
}
pub fn segment_mut(&mut self) -> &mut Segment {
&mut self.segment
}
/// Accessor to the `PostingsSerializer`.
pub fn get_postings_serializer(&mut self) -> &mut InvertedIndexSerializer {
&mut self.postings_serializer

View File

@@ -115,11 +115,10 @@ fn merge(
.collect();
// An IndexMerger is like a "view" of our merged segments.
let merger: IndexMerger =
IndexMerger::open(index.schema(), index.settings().clone(), &segments[..])?;
let merger: IndexMerger = IndexMerger::open(index.schema(), &segments[..])?;
// ... we just serialize this index merger in our new segment to merge the segments.
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone(), true)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment.clone())?;
let num_docs = merger.write(segment_serializer)?;
@@ -220,13 +219,9 @@ pub fn merge_filtered_segments<T: Into<Box<dyn Directory>>>(
)?;
let merged_segment = merged_index.new_segment();
let merged_segment_id = merged_segment.id();
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
segments,
filter_doc_ids,
)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment, true)?;
let merger: IndexMerger =
IndexMerger::open_with_custom_alive_set(merged_index.schema(), segments, filter_doc_ids)?;
let segment_serializer = SegmentSerializer::for_segment(merged_segment)?;
let num_docs = merger.write(segment_serializer)?;
let segment_meta = merged_index.new_segment_meta(merged_segment_id, num_docs);
@@ -1067,7 +1062,6 @@ mod tests {
)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;
@@ -1083,7 +1077,6 @@ mod tests {
Index::create(RamDirectory::default(), target_schema, target_settings)?;
let merger: IndexMerger = IndexMerger::open_with_custom_alive_set(
merged_index.schema(),
merged_index.settings().clone(),
&segments[..],
filter_segments,
)?;

View File

@@ -3,7 +3,6 @@ use common::JsonPathWriter;
use itertools::Itertools;
use tokenizer_api::BoxTokenStream;
use super::doc_id_mapping::{get_doc_id_mapping_from_field, DocIdMapping};
use super::operation::AddOperation;
use crate::fastfield::FastFieldsWriter;
use crate::fieldnorm::{FieldNormReaders, FieldNormsWriter};
@@ -16,7 +15,6 @@ use crate::postings::{
};
use crate::schema::document::{Document, Value};
use crate::schema::{FieldEntry, FieldType, Schema, Term, DATE_TIME_PRECISION_INDEXED};
use crate::store::{StoreReader, StoreWriter};
use crate::tokenizer::{FacetTokenizer, PreTokenizedStream, TextAnalyzer, Tokenizer};
use crate::{DocId, Opstamp, TantivyError};
@@ -41,20 +39,6 @@ fn compute_initial_table_size(per_thread_memory_budget: usize) -> crate::Result<
})
}
fn remap_doc_opstamps(
opstamps: Vec<Opstamp>,
doc_id_mapping_opt: Option<&DocIdMapping>,
) -> Vec<Opstamp> {
if let Some(doc_id_mapping_opt) = doc_id_mapping_opt {
doc_id_mapping_opt
.iter_old_doc_ids()
.map(|doc| opstamps[doc as usize])
.collect()
} else {
opstamps
}
}
/// A `SegmentWriter` is in charge of creating segment index from a
/// set of documents.
///
@@ -90,7 +74,7 @@ impl SegmentWriter {
let tokenizer_manager = segment.index().tokenizers().clone();
let tokenizer_manager_fast_field = segment.index().fast_field_tokenizer().clone();
let table_size = compute_initial_table_size(memory_budget_in_bytes)?;
let segment_serializer = SegmentSerializer::for_segment(segment, false)?;
let segment_serializer = SegmentSerializer::for_segment(segment)?;
let per_field_postings_writers = PerFieldPostingsWriter::for_schema(&schema);
let per_field_text_analyzers = schema
.fields()
@@ -139,15 +123,6 @@ impl SegmentWriter {
/// be used afterwards.
pub fn finalize(mut self) -> crate::Result<Vec<u64>> {
self.fieldnorms_writer.fill_up_to_max_doc(self.max_doc);
let mapping: Option<DocIdMapping> = self
.segment_serializer
.segment()
.index()
.settings()
.sort_by_field
.clone()
.map(|sort_by_field| get_doc_id_mapping_from_field(sort_by_field, &self))
.transpose()?;
remap_and_write(
self.schema,
&self.per_field_postings_writers,
@@ -155,10 +130,8 @@ impl SegmentWriter {
self.fast_field_writers,
&self.fieldnorms_writer,
self.segment_serializer,
mapping.as_ref(),
)?;
let doc_opstamps = remap_doc_opstamps(self.doc_opstamps, mapping.as_ref());
Ok(doc_opstamps)
Ok(self.doc_opstamps)
}
/// Returns an estimation of the current memory usage of the segment writer.
@@ -202,9 +175,8 @@ impl SegmentWriter {
match field_entry.field_type() {
FieldType::Facet(_) => {
let mut facet_tokenizer = FacetTokenizer::default(); // this can be global
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let facet_str = value.as_facet().ok_or_else(make_schema_error)?;
let mut facet_tokenizer = facet_tokenizer.token_stream(facet_str);
@@ -220,15 +192,14 @@ impl SegmentWriter {
}
FieldType::Str(_) => {
let mut indexing_position = IndexingPosition::default();
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
let mut token_stream = if let Some(text) = value.as_str() {
let text_analyzer =
&mut self.per_field_text_analyzers[field.field_id() as usize];
text_analyzer.token_stream(text)
} else if let Some(tok_str) = value.as_pre_tokenized_text() {
} else if let Some(tok_str) = value.into_pre_tokenized_text() {
BoxTokenStream::new(PreTokenizedStream::from(*tok_str.clone()))
} else {
continue;
@@ -250,9 +221,8 @@ impl SegmentWriter {
}
FieldType::U64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let u64_val = value.as_u64().ok_or_else(make_schema_error)?;
@@ -265,10 +235,8 @@ impl SegmentWriter {
}
FieldType::Date(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value_access = value_access as D::Value<'_>;
let value = value_access.as_value();
for value in values {
let value = value.as_value();
num_vals += 1;
let date_val = value.as_datetime().ok_or_else(make_schema_error)?;
@@ -282,9 +250,8 @@ impl SegmentWriter {
}
FieldType::I64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let i64_val = value.as_i64().ok_or_else(make_schema_error)?;
@@ -297,10 +264,8 @@ impl SegmentWriter {
}
FieldType::F64(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let f64_val = value.as_f64().ok_or_else(make_schema_error)?;
term_buffer.set_f64(f64_val);
@@ -312,10 +277,8 @@ impl SegmentWriter {
}
FieldType::Bool(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bool_val = value.as_bool().ok_or_else(make_schema_error)?;
term_buffer.set_bool(bool_val);
@@ -327,10 +290,8 @@ impl SegmentWriter {
}
FieldType::Bytes(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let bytes = value.as_bytes().ok_or_else(make_schema_error)?;
term_buffer.set_bytes(bytes);
@@ -364,9 +325,8 @@ impl SegmentWriter {
}
FieldType::IpAddr(_) => {
let mut num_vals = 0;
for value_access in values {
// Used to help with linting and type checking.
let value = value_access as D::Value<'_>;
for value in values {
let value = value.as_value();
num_vals += 1;
let ip_addr = value.as_ip_addr().ok_or_else(make_schema_error)?;
@@ -432,11 +392,10 @@ fn remap_and_write(
fast_field_writers: FastFieldsWriter,
fieldnorms_writer: &FieldNormsWriter,
mut serializer: SegmentSerializer,
doc_id_map: Option<&DocIdMapping>,
) -> crate::Result<()> {
debug!("remap-and-write");
if let Some(fieldnorms_serializer) = serializer.extract_fieldnorms_serializer() {
fieldnorms_writer.serialize(fieldnorms_serializer, doc_id_map)?;
fieldnorms_writer.serialize(fieldnorms_serializer)?;
}
let fieldnorm_data = serializer
.segment()
@@ -447,39 +406,10 @@ fn remap_and_write(
schema,
per_field_postings_writers,
fieldnorm_readers,
doc_id_map,
serializer.get_postings_serializer(),
)?;
debug!("fastfield-serialize");
fast_field_writers.serialize(serializer.get_fast_field_write(), doc_id_map)?;
// finalize the temp docstore and create a version that reflects the doc_id_map
if let Some(doc_id_map) = doc_id_map {
debug!("resort-docstore");
let store_write = serializer
.segment_mut()
.open_write(SegmentComponent::Store)?;
let settings = serializer.segment().index().settings();
let store_writer = StoreWriter::new(
store_write,
settings.docstore_compression,
settings.docstore_blocksize,
settings.docstore_compress_dedicated_thread,
)?;
let old_store_writer = std::mem::replace(&mut serializer.store_writer, store_writer);
old_store_writer.close()?;
let store_read = StoreReader::open(
serializer
.segment()
.open_read(SegmentComponent::TempStore)?,
1, /* The docstore is configured to have one doc per block, and each doc is accessed
* only once: we don't need caching. */
)?;
for old_doc_id in doc_id_map.iter_old_doc_ids() {
let doc_bytes = store_read.get_document_bytes(old_doc_id)?;
serializer.get_store_writer().store_bytes(&doc_bytes)?;
}
}
fast_field_writers.serialize(serializer.get_fast_field_write())?;
debug!("serializer-close");
serializer.close()?;

View File

@@ -222,8 +222,8 @@ pub use crate::core::{Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
#[allow(deprecated)] // Remove with index sorting
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, IndexSortByField, InvertedIndexReader, Order,
Segment, SegmentMeta, SegmentReader,
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader,
};
pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
@@ -397,16 +397,20 @@ pub mod tests {
#[macro_export]
macro_rules! assert_nearly_equals {
($left:expr, $right:expr) => {{
match (&$left, &$right) {
(left_val, right_val) => {
assert_nearly_equals!($left, $right, 0.0005);
}};
($left:expr, $right:expr, $epsilon:expr) => {{
match (&$left, &$right, &$epsilon) {
(left_val, right_val, epsilon_val) => {
let diff = (left_val - right_val).abs();
let add = left_val.abs() + right_val.abs();
if diff > 0.0005 * add {
if diff > *epsilon_val {
panic!(
r#"assertion failed: `(left ~= right)`
left: `{:?}`,
right: `{:?}`"#,
&*left_val, &*right_val
r#"assertion failed: `abs(left-right)>epsilon`
left: `{:?}`,
right: `{:?}`,
epsilon: `{:?}`"#,
&*left_val, &*right_val, &*epsilon_val
)
}
}
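// Usage sketch: `assert_nearly_equals!(1.0, 1.0004)` relies on the default
// epsilon of 0.0005, while `assert_nearly_equals!(1.0, 1.1, 0.2)` supplies one.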

View File

@@ -3,7 +3,7 @@
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
//! This information is useful to run phrase queries.
//!
//! The [position](crate::SegmentComponent::Positions) file contains all of the
//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
//! bitpacked positions delta, for all terms of a given field, one term after the other.
//!
//! Each term is encoded independently.
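//! As an illustration, the positions `[3, 7, 12]` of a term within a document
//! would be stored as the deltas `[3, 4, 5]` before bitpacking.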

View File

@@ -3,7 +3,6 @@ use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use stacker::Addr;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::postings_writer::SpecializedPostingsWriter;
use crate::postings::recorder::{BufferLender, DocIdRecorder, Recorder};
@@ -60,9 +59,8 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
@@ -71,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
let mut prev_term_id = u32::MAX;
let mut term_path_len = 0; // this will be set in the first iteration
for (_field, path_id, term, addr) in term_addrs {
for (_field, path_id, term, addr) in ordered_term_addrs {
if prev_term_id != path_id.path_id() {
term_buffer.truncate_value_bytes(0);
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
@@ -87,7 +85,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
SpecializedPostingsWriter::<Rec>::serialize_one_term(
term_buffer.serialized_value_bytes(),
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,
@@ -96,7 +93,6 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
SpecializedPostingsWriter::<DocIdRecorder>::serialize_one_term(
term_buffer.serialized_value_bytes(),
*addr,
doc_id_map,
&mut buffer_lender,
ctx,
serializer,

View File

@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
fn term_freq(&self) -> u32;
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
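// A minimal usage sketch, assuming `postings` is positioned on a document:
// let mut positions = Vec::new();
// postings.positions_with_offset(0, &mut positions);
// assert_eq!(positions.len() as u32, postings.term_freq());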

View File

@@ -5,7 +5,6 @@ use std::ops::Range;
use stacker::Addr;
use crate::fieldnorm::FieldNormReaders;
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::indexer::path_to_unordered_id::OrderedPathId;
use crate::postings::recorder::{BufferLender, Recorder};
use crate::postings::{
@@ -50,7 +49,6 @@ pub(crate) fn serialize_postings(
schema: Schema,
per_field_postings_writers: &PerFieldPostingsWriter,
fieldnorm_readers: FieldNormReaders,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut InvertedIndexSerializer,
) -> crate::Result<()> {
// Replace unordered ids by ordered ids to be able to sort
@@ -86,7 +84,6 @@ pub(crate) fn serialize_postings(
postings_writer.serialize(
&term_offsets[byte_offsets],
&ordered_id_to_path,
doc_id_map,
&ctx,
&mut field_serializer,
)?;
@@ -122,7 +119,6 @@ pub(crate) trait PostingsWriter: Send + Sync {
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()>;
@@ -187,7 +183,6 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
pub(crate) fn serialize_one_term(
term: &[u8],
addr: Addr,
doc_id_map: Option<&DocIdMapping>,
buffer_lender: &mut BufferLender,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -195,7 +190,7 @@ impl<Rec: Recorder> SpecializedPostingsWriter<Rec> {
let recorder: Rec = ctx.term_index.read(addr);
let term_doc_freq = recorder.term_doc_freq().unwrap_or(0u32);
serializer.new_term(term, term_doc_freq, recorder.has_term_freq())?;
recorder.serialize(&ctx.arena, doc_id_map, serializer, buffer_lender);
recorder.serialize(&ctx.arena, serializer, buffer_lender);
serializer.close_term()?;
Ok(())
}
@@ -229,13 +224,12 @@ impl<Rec: Recorder> PostingsWriter for SpecializedPostingsWriter<Rec> {
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
_ordered_id_to_path: &[&str],
doc_id_map: Option<&DocIdMapping>,
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
) -> io::Result<()> {
let mut buffer_lender = BufferLender::default();
for (_field, _path_id, term, addr) in term_addrs {
Self::serialize_one_term(term, *addr, doc_id_map, &mut buffer_lender, ctx, serializer)?;
Self::serialize_one_term(term, *addr, &mut buffer_lender, ctx, serializer)?;
}
Ok(())
}

View File

@@ -1,7 +1,6 @@
use common::read_u32_vint;
use stacker::{ExpUnrolledLinkedList, MemoryArena};
use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::postings::FieldSerializer;
use crate::DocId;
@@ -71,7 +70,6 @@ pub(crate) trait Recorder: Copy + Default + Send + Sync + 'static {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
);
@@ -115,26 +113,15 @@ impl Recorder for DocIdRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let (buffer, doc_ids) = buffer_lender.lend_all();
let buffer = buffer_lender.lend_u8();
// TODO avoid reading twice.
self.stack.read_to_end(arena, buffer);
if let Some(doc_id_map) = doc_id_map {
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
doc_ids.extend(iter.map(|old_doc_id| doc_id_map.get_new_doc_id(old_doc_id)));
doc_ids.sort_unstable();
for doc in doc_ids {
serializer.write_doc(*doc, 0u32, &[][..]);
}
} else {
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
for doc_id in iter {
serializer.write_doc(doc_id, 0u32, &[][..]);
}
let iter = get_sum_reader(VInt32Reader::new(&buffer[..]));
for doc_id in iter {
serializer.write_doc(doc_id, 0u32, &[][..]);
}
}
@@ -194,35 +181,18 @@ impl Recorder for TermFrequencyRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let buffer = buffer_lender.lend_u8();
self.stack.read_to_end(arena, buffer);
let mut u32_it = VInt32Reader::new(&buffer[..]);
if let Some(doc_id_map) = doc_id_map {
let mut doc_id_and_tf = vec![];
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
doc_id_and_tf.push((doc_id_map.get_new_doc_id(doc_id), term_freq));
}
doc_id_and_tf.sort_unstable_by_key(|&(doc_id, _)| doc_id);
for (doc_id, tf) in doc_id_and_tf {
serializer.write_doc(doc_id, tf, &[][..]);
}
} else {
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc_id, term_freq, &[][..]);
}
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
prev_doc = doc_id;
let term_freq = u32_it.next().unwrap_or(self.current_tf);
serializer.write_doc(doc_id, term_freq, &[][..]);
}
}
@@ -268,14 +238,12 @@ impl Recorder for TfAndPositionRecorder {
fn serialize(
&self,
arena: &MemoryArena,
doc_id_map: Option<&DocIdMapping>,
serializer: &mut FieldSerializer<'_>,
buffer_lender: &mut BufferLender,
) {
let (buffer_u8, buffer_positions) = buffer_lender.lend_all();
self.stack.read_to_end(arena, buffer_u8);
let mut u32_it = VInt32Reader::new(&buffer_u8[..]);
let mut doc_id_and_positions = vec![];
let mut prev_doc = 0;
while let Some(delta_doc_id) = u32_it.next() {
let doc_id = prev_doc + delta_doc_id;
@@ -294,19 +262,7 @@ impl Recorder for TfAndPositionRecorder {
}
}
}
if let Some(doc_id_map) = doc_id_map {
// this simple variant to remap may consume too much memory
doc_id_and_positions
.push((doc_id_map.get_new_doc_id(doc_id), buffer_positions.to_vec()));
} else {
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
}
}
if doc_id_map.is_some() {
doc_id_and_positions.sort_unstable_by_key(|&(doc_id, _)| doc_id);
for (doc_id, positions) in doc_id_and_positions {
serializer.write_doc(doc_id, positions.len() as u32, &positions);
}
serializer.write_doc(doc_id, buffer_positions.len() as u32, buffer_positions);
}
}

View File

@@ -22,10 +22,7 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer {
doc: 0u32,
max_doc: reader.max_doc(),
};
let all_scorer = AllScorer::new(reader.max_doc());
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
}
@@ -43,6 +40,13 @@ pub struct AllScorer {
max_doc: DocId,
}
impl AllScorer {
/// Creates a new AllScorer with `max_doc` docs.
pub fn new(max_doc: DocId) -> AllScorer {
AllScorer { doc: 0u32, max_doc }
}
}
impl DocSet for AllScorer {
#[inline(always)]
fn advance(&mut self) -> DocId {

View File

@@ -66,6 +66,10 @@ use crate::schema::{IndexRecordOption, Term};
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// ));
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic
/// ));
/// // A TermQuery with "found" in the body
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(body, "found"),
@@ -74,7 +78,7 @@ use crate::schema::{IndexRecordOption, Term};
/// // TermQuery "diary" must and "girl" must not be present
/// let queries_with_occurs1 = vec![
/// (Occur::Must, diary_term_query.box_clone()),
/// (Occur::MustNot, girl_term_query),
/// (Occur::MustNot, girl_term_query.box_clone()),
/// ];
/// // Make a BooleanQuery equivalent to
/// // title:+diary title:-girl
@@ -82,15 +86,10 @@ use crate::schema::{IndexRecordOption, Term};
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
/// assert_eq!(count1, 1);
///
/// // TermQuery for "cow" in the title
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic,
/// ));
/// // "title:diary OR title:cow"
/// let title_diary_or_cow = BooleanQuery::new(vec![
/// (Occur::Should, diary_term_query.box_clone()),
/// (Occur::Should, cow_term_query),
/// (Occur::Should, cow_term_query.box_clone()),
/// ]);
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
/// assert_eq!(count2, 4);
@@ -118,21 +117,38 @@ use crate::schema::{IndexRecordOption, Term};
/// ]);
/// let count4 = searcher.search(&nested_query, &Count)?;
/// assert_eq!(count4, 1);
///
/// // You may call `with_minimum_required_clauses` to
/// // specify the number of should clauses the returned documents must match.
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
/// (Occur::Should, cow_term_query.box_clone()),
/// (Occur::Should, girl_term_query.box_clone()),
/// (Occur::Should, diary_term_query.box_clone()),
/// ], 2);
/// // Returns documents containing "Diary Cow", "Diary Girl" or "Cow Girl"
/// // Notice: "Diary" isn't "Dairy". ;-)
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
/// assert_eq!(count5, 1);
/// Ok(())
/// }
/// ```
#[derive(Debug)]
pub struct BooleanQuery {
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
}
impl Clone for BooleanQuery {
fn clone(&self) -> Self {
self.subqueries
let subqueries = self
.subqueries
.iter()
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
.collect::<Vec<_>>()
.into()
.collect::<Vec<_>>();
Self {
subqueries,
minimum_number_should_match: self.minimum_number_should_match,
}
}
}
@@ -149,8 +165,9 @@ impl Query for BooleanQuery {
.iter()
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
.collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new(
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
sub_weights,
self.minimum_number_should_match,
enable_scoring.is_scoring_enabled(),
Box::new(SumWithCoordsCombiner::default),
)))
@@ -166,7 +183,41 @@ impl Query for BooleanQuery {
impl BooleanQuery {
/// Creates a new boolean query.
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
BooleanQuery { subqueries }
// If the bool query includes at least one Should clause and no Must or MustNot
// clauses, the default value is 1. Otherwise, the default value is 0.
// This matches Elasticsearch's behavior.
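// e.g. [Should, Should] defaults to 1, while [Must, Should] defaults to 0.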
let mut minimum_required = 0;
for (occur, _) in &subqueries {
match occur {
Occur::Should => minimum_required = 1,
Occur::Must | Occur::MustNot => {
minimum_required = 0;
break;
}
}
}
Self::with_minimum_required_clauses(subqueries, minimum_required)
}
/// Create a new boolean query with minimum number of required should clauses specified.
pub fn with_minimum_required_clauses(
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
) -> BooleanQuery {
BooleanQuery {
subqueries,
minimum_number_should_match,
}
}
/// Getter for `minimum_number_should_match`
pub fn get_minimum_number_should_match(&self) -> usize {
self.minimum_number_should_match
}
/// Setter for `minimum_number_should_match`
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
self.minimum_number_should_match = minimum_number_should_match;
}
/// Returns the intersection of the queries.
@@ -181,6 +232,18 @@ impl BooleanQuery {
BooleanQuery::new(subqueries)
}
/// Returns the union of the queries with minimum required clause.
pub fn union_with_minimum_required_clauses(
queries: Vec<Box<dyn Query>>,
minimum_required_clauses: usize,
) -> BooleanQuery {
let subqueries = queries
.into_iter()
.map(|sub_query| (Occur::Should, sub_query))
.collect();
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
}
/// Helper method to create a boolean query matching a given list of terms.
/// The resulting query is a disjunction of the terms.
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
@@ -203,11 +266,13 @@ impl BooleanQuery {
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::BooleanQuery;
use crate::collector::{Count, DocSetCollector};
use crate::query::{QueryClone, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, Index, Term};
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, DocId, Index, Term};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -223,6 +288,73 @@ mod tests {
Ok(index)
}
#[test]
fn test_minimum_required() -> crate::Result<()> {
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
docs: T,
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
for doc in docs {
writer.add_document(doc!(text => doc))?;
}
writer.commit()?;
Ok(index)
}
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
queries: T,
field: Field,
mr: usize,
) -> BooleanQuery {
let terms = queries
.into_iter()
.map(|t| Term::from_field_text(field, t))
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
.map(|q| -> Box<dyn Query> { Box::new(q) })
.collect();
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
}
fn check_doc_id<T: IntoIterator<Item = DocId>>(
expected: T,
actually: HashSet<DocAddress>,
seg: u32,
) {
assert_eq!(
actually,
expected
.into_iter()
.map(|id| DocAddress::new(seg, id))
.collect()
);
}
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
let searcher = index.reader()?.searcher();
let text = index.schema().get_field("text").unwrap();
// Documents containing 'a c', 'a z', 'a i', 'c z', 'c i' or 'z i' shall be returned.
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
let docs = searcher.search(&q1, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
// Documents containing 'a b c', 'a b e', 'a c e' or 'b c e' shall be returned.
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
let docs = searcher.search(&q2, &DocSetCollector)?;
check_doc_id([0, 1], docs, 0);
// Nothing is returned since minimum_required is too large.
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
let docs = searcher.search(&q3, &DocSetCollector)?;
assert!(docs.is_empty());
// When mr is set to zero or one, there is no difference from a plain boolean union.
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
let docs = searcher.search(&q4, &DocSetCollector)?;
check_doc_id([0, 1, 3], docs, 0);
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
let docs = searcher.search(&q5, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
Ok(())
}
#[test]
fn test_union() -> crate::Result<()> {
let index = create_test_index()?;

View File

@@ -3,6 +3,7 @@ use std::collections::HashMap;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::disjunction::Disjunction;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
@@ -18,6 +19,26 @@ enum SpecializedScorer {
Other(Box<dyn Scorer>),
}
fn scorer_disjunction<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner: TScoreCombiner,
minimum_match_required: usize,
) -> Box<dyn Scorer>
where
TScoreCombiner: ScoreCombiner,
{
debug_assert!(!scorers.is_empty());
debug_assert!(minimum_match_required > 1);
if scorers.len() == 1 {
return scorers.into_iter().next().unwrap(); // Safe unwrap.
}
Box::new(Disjunction::new(
scorers,
score_combiner,
minimum_match_required,
))
}
fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
/// Weight associated to the `BoolQuery`.
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
}
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
weights,
scoring_enabled,
score_combiner_fn,
minimum_number_should_match: 1,
}
}
/// Create a new boolean weight with minimum number of required should clauses specified.
pub fn with_minimum_number_should_match(
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
) -> BooleanWeight<TScoreCombiner> {
BooleanWeight {
weights,
minimum_number_should_match,
scoring_enabled,
score_combiner_fn,
}
}
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
.remove(&Occur::Should)
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
// Indicates how the Should clauses are combined with the other clauses.
enum CombinationMethod {
Ignored,
// Only contributes to final score.
Optional(SpecializedScorer),
// Must be satisfied.
Required(Box<dyn Scorer>),
}
let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
{
let num_of_should_scorers = should_scorers.len();
if self.minimum_number_should_match > num_of_should_scorers {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
}
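// 0 => contributes to scores only; 1 => a plain union is required; n equal to
// the number of Should clauses => treat them as Must clauses; otherwise => a
// disjunction with a match threshold.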
match self.minimum_number_should_match {
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
1 => CombinationMethod::Required(into_box_scorer(
scorer_union(should_scorers, &score_combiner_fn),
&score_combiner_fn,
)),
n if num_of_should_scorers == n => {
// When minimum_number_should_match equals the number of Should clauses,
// they are no different from Must clauses.
must_scorers = match must_scorers.take() {
Some(mut must_scorers) => {
must_scorers.append(&mut should_scorers);
Some(must_scorers)
}
None => Some(should_scorers),
};
CombinationMethod::Ignored
}
_ => CombinationMethod::Required(scorer_disjunction(
should_scorers,
score_combiner_fn(),
self.minimum_number_should_match,
)),
}
} else {
// No Should clauses were provided.
if self.minimum_number_should_match > 0 {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
} else {
CombinationMethod::Ignored
}
};
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
.map(|specialized_scorer| {
.map(|specialized_scorer: SpecializedScorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
});
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Must)
.map(intersect_scorers);
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
(Some(should_scorer), Some(must_scorer)) => {
let positive_scorer = match (should_opt, must_scorers) {
(CombinationMethod::Ignored, Some(must_scorers)) => {
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
let must_scorer = intersect_scorers(must_scorers);
if self.scoring_enabled {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
Box<dyn Scorer>,
Box<dyn Scorer>,
TComplexScoreCombiner,
>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
)))
SpecializedScorer::Other(Box::new(
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
),
))
} else {
SpecializedScorer::Other(must_scorer)
}
}
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
(Some(should_scorer), None) => should_scorer,
(None, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
must_scorers.push(should_scorer);
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Ignored, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
}
(CombinationMethod::Required(should_scorer), None) => {
SpecializedScorer::Other(should_scorer)
}
// Optional scorers are promoted to required if no Must scorers exist.
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(

src/query/disjunction.rs Normal file
View File

@@ -0,0 +1,327 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ScoreCombiner, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
/// `Disjunction` is responsible for merging `DocSet`s from multiple
/// sources. Specifically, it takes the union of two or more `DocSet`s,
/// then filters out elements that appear fewer times than a
/// specified threshold.
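/// For example, the doc sets `[1, 3, 5]` and `[1, 2, 3]` with a threshold of 2
/// yield `[1, 3]`.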
pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
chains: BinaryHeap<ScorerWrapper<TScorer>>,
minimum_matches_required: usize,
score_combiner: TScoreCombiner,
current_doc: DocId,
current_score: Score,
}
/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
/// The `Ord` trait and its relatives are implemented in reverse order, so that a
/// `std::BinaryHeap<ScorerWrapper<T>>` behaves as a min-heap keyed on the current doc id.
struct ScorerWrapper<T> {
scorer: T,
current_doc: DocId,
}
impl<T: Scorer> ScorerWrapper<T> {
fn new(scorer: T) -> Self {
let current_doc = scorer.doc();
Self {
scorer,
current_doc,
}
}
}
impl<T: Scorer> PartialEq for ScorerWrapper<T> {
fn eq(&self, other: &Self) -> bool {
self.doc() == other.doc()
}
}
impl<T: Scorer> Eq for ScorerWrapper<T> {}
impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: Scorer> Ord for ScorerWrapper<T> {
fn cmp(&self, other: &Self) -> Ordering {
self.doc().cmp(&other.doc()).reverse()
}
}
impl<T: Scorer> DocSet for ScorerWrapper<T> {
fn advance(&mut self) -> DocId {
let doc_id = self.scorer.advance();
self.current_doc = doc_id;
doc_id
}
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.scorer.size_hint()
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
pub fn new<T: IntoIterator<Item = TScorer>>(
docsets: T,
score_combiner: TScoreCombiner,
minimum_matches_required: usize,
) -> Self {
debug_assert!(
minimum_matches_required > 1,
"union scorer works better if just one matches required"
);
let chains = docsets
.into_iter()
.map(|doc| ScorerWrapper::new(doc))
.collect();
let mut disjunction = Self {
chains,
score_combiner,
current_doc: TERMINATED,
minimum_matches_required,
current_score: 0.0,
};
if minimum_matches_required > disjunction.chains.len() {
return disjunction;
}
disjunction.advance();
disjunction
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
for Disjunction<TScorer, TScoreCombiner>
{
fn advance(&mut self) -> DocId {
let mut current_num_matches = 0;
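// Scorers sit in a min-heap keyed on doc id. We count how many scorers are on
// `self.current_doc` and emit it once the heap moves past it with enough matches.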
while let Some(mut candidate) = self.chains.pop() {
let next = candidate.doc();
if next != TERMINATED {
// Peek next doc.
if self.current_doc != next {
if current_num_matches >= self.minimum_matches_required {
self.chains.push(candidate);
self.current_score = self.score_combiner.score();
return self.current_doc;
}
// Reset current_num_matches and scores.
current_num_matches = 0;
self.current_doc = next;
self.score_combiner.clear();
}
current_num_matches += 1;
self.score_combiner.update(&mut candidate.scorer);
candidate.advance();
self.chains.push(candidate);
}
}
if current_num_matches < self.minimum_matches_required {
self.current_doc = TERMINATED;
}
self.current_score = self.score_combiner.score();
self.current_doc
}
#[inline]
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.chains
.iter()
.map(|docset| docset.size_hint())
.max()
.unwrap_or(0u32)
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
for Disjunction<TScorer, TScoreCombiner>
{
fn score(&mut self) -> Score {
self.current_score
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use super::Disjunction;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
use crate::{DocId, DocSet, Score, TERMINATED};
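// Reference implementation: keep the elements that occur in at least `pass_line`
// of the input arrays (assuming each array holds distinct values).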
fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
let mut counts = BTreeMap::new();
for array in arrays {
for &element in array {
*counts.entry(element).or_insert(0) += 1;
}
}
counts
.iter()
.filter_map(|(&element, &count)| {
if count >= pass_line {
Some(element)
} else {
None
}
})
.collect()
}
fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
let make_scorer = || {
Disjunction::new(
vals.iter()
.cloned()
.map(VecDocSet::from)
.map(|d| ConstScorer::new(d, 1.0)),
DoNothingCombiner,
min_match,
)
};
let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
let mut count = 0;
while scorer.doc() != TERMINATED {
assert_eq!(union_expected.doc(), scorer.doc());
assert_eq!(union_expected.advance(), scorer.advance());
count += 1;
}
assert_eq!(union_expected.advance(), TERMINATED);
assert_eq!(count, make_scorer().count_including_deleted());
}
#[should_panic]
#[test]
fn test_arg_check1() {
aux_test_conjunction(vec![], 0);
}
#[should_panic]
#[test]
fn test_arg_check2() {
aux_test_conjunction(vec![], 1);
}
#[test]
fn test_corner_case() {
aux_test_conjunction(vec![], 2);
aux_test_conjunction(vec![vec![]; 1000], 2);
aux_test_conjunction(vec![vec![]; 100], usize::MAX);
aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
}
#[test]
fn test_conjunction() {
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
2,
);
aux_test_conjunction(
vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
2,
);
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
3,
)
}
// A dummy scorer that yields increasing doc ids, each with the score
// recorded for it (1.0 in the tests below).
#[derive(Clone)]
struct DummyScorer {
cursor: usize,
foo: Vec<(DocId, f32)>,
}
impl DummyScorer {
fn new(doc_score: Vec<(DocId, f32)>) -> Self {
Self {
cursor: 0,
foo: doc_score,
}
}
}
impl DocSet for DummyScorer {
fn advance(&mut self) -> DocId {
self.cursor += 1;
self.doc()
}
fn doc(&self) -> DocId {
self.foo.get(self.cursor).map(|x| x.0).unwrap_or(TERMINATED)
}
fn size_hint(&self) -> u32 {
self.foo.len() as u32
}
}
impl Scorer for DummyScorer {
fn score(&mut self) -> Score {
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
}
}
#[test]
fn test_score_calculate() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (4, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
],
SumCombiner::default(),
3,
);
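// Doc 1 is matched by all five scorers (sum = 5.0); doc 2 by three of them
// (sum = 3.0); docs 3 and 4 are matched by a single scorer each and are
// skipped because they fall below `minimum_matches_required`.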
assert_eq!(scorer.score(), 5.0);
assert_eq!(scorer.advance(), 2);
assert_eq!(scorer.score(), 3.0);
}
#[test]
fn test_score_calculate_corner_case() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
],
SumCombiner::default(),
2,
);
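// Doc 1 is matched by all three scorers (sum = 3.0), doc 3 by two of them
// (sum = 2.0); doc 2 has a single match and is skipped.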
assert_eq!(scorer.doc(), 1);
assert_eq!(scorer.score(), 3.0);
assert_eq!(scorer.advance(), 3);
assert_eq!(scorer.score(), 2.0);
}
}

View File

@@ -149,7 +149,7 @@ mod tests {
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{Index, Searcher};
use crate::{Index, Searcher, Term};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
@@ -188,9 +188,8 @@ mod tests {
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
@@ -198,10 +197,9 @@ mod tests {
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);

View File

@@ -5,6 +5,7 @@ mod bm25;
mod boolean_query;
mod boost_query;
mod const_score_query;
mod disjunction;
mod disjunction_max_query;
mod empty_query;
mod exclude;
@@ -53,7 +54,7 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
pub use self::range_query::{FastFieldRangeWeight, RangeQuery};
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{

View File

@@ -145,15 +145,7 @@ impl Query for PhrasePrefixQuery {
Bound::Unbounded
};
let mut range_query = RangeQuery::new_term_bounds(
enable_scoring
.schema()
.get_field_name(self.field)
.to_owned(),
self.prefix.1.typ(),
&Bound::Included(self.prefix.1.clone()),
&end_term,
);
let mut range_query = RangeQuery::new(Bound::Included(self.prefix.1.clone()), end_term);
range_query.limit(self.max_expansions as u64);
range_query.weight(enable_scoring)
}

View File

@@ -97,6 +97,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
suffixes: Vec<TPostings>,
suffix_offset: u32,
phrase_count: u32,
suffix_position_buffer: Vec<u32>,
}
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
@@ -140,6 +141,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
phrase_prefix_scorer.advance();
@@ -153,7 +155,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let mut positions = Vec::new();
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
@@ -162,8 +163,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
}
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut positions);
count += intersection_count(pos_matching, &positions);
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
count += intersection_count(pos_matching, &self.suffix_position_buffer);
}
}
self.phrase_count = count as u32;

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Field, Term, Type};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -14,14 +14,10 @@ pub enum LogicalLiteral {
prefix: bool,
},
Range {
field: String,
value_type: Type,
lower: Bound<Term>,
upper: Bound<Term>,
},
Set {
field: Field,
value_type: Type,
elements: Vec<Term>,
},
All,

View File

@@ -790,8 +790,6 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let mut errors = Vec::new();
let lower = match self.resolve_bound(field, json_path, &lower) {
Ok(bound) => bound,
@@ -812,12 +810,8 @@ impl QueryParser {
// we failed to parse something. Either way, there is no point emitting it
return (None, errors);
}
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field: self.schema.get_field_name(field).to_string(),
value_type,
lower,
upper,
}));
let logical_ast =
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
(Some(logical_ast), errors)
}
UserInputLeaf::Set {
@@ -832,17 +826,11 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let (elements, errors) = elements
.into_iter()
.map(|element| self.compute_boundary_term(field, json_path, &element))
.partition_result();
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set {
elements,
field,
value_type,
}));
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Set { elements }));
(Some(logical_ast), errors)
}
UserInputLeaf::Exists { .. } => (
@@ -890,14 +878,7 @@ fn convert_literal_to_query(
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
}
}
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
@@ -1142,8 +1123,8 @@ mod test {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{query:?}"),
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
upper_bound: Included([98]), limit: None }"
"RangeQuery { lower_bound: Included(Term(field=0, type=Str, \"a\")), upper_bound: \
Included(Term(field=0, type=Str, \"b\")), limit: None }"
);
}
@@ -1821,7 +1802,8 @@ mod test {
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
minimum_number_should_match: 1 }"
);
}
@@ -1886,7 +1868,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
minimum_number_should_match: 1 }"
);
}
@@ -1903,7 +1886,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
distance: 2, transposition_cost_one: false, prefix: true })] }"
distance: 2, transposition_cost_one: false, prefix: true })], \
minimum_number_should_match: 1 }"
);
}
}

View File

@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
#[cfg(test)]
mod tests {
use std::ops::Bound;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, IndexBuilder, TantivyDocument};
use crate::{schema, IndexBuilder, TantivyDocument, Term};
#[test]
fn range_query_fast_optional_field_minimum() {
@@ -218,10 +220,9 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
let query = RangeQuery::new(
Bound::Included(Term::from_field_u64(score_field, 70)),
Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();

View File

@@ -2,21 +2,19 @@ use std::ops::Bound;
use crate::schema::Type;
mod fast_field_range_query;
mod fast_field_range_doc_set;
mod range_query;
mod range_query_ip_fastfield;
mod range_query_u64_fastfield;
pub use self::range_query::RangeQuery;
pub use self::range_query_ip_fastfield::IPFastFieldRangeWeight;
pub use self::range_query_u64_fastfield::FastFieldRangeWeight;
// TODO is this correct?
pub(crate) fn is_type_valid_for_fastfield_range_query(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::Str | Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => true,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
Type::Facet | Type::Bytes | Type::Json => false,
}
}

View File

@@ -1,21 +1,17 @@
use std::io;
use std::net::Ipv6Addr;
use std::ops::{Bound, Range};
use std::ops::Bound;
use columnar::MonotonicallyMappableToU128;
use common::{BinarySerializable, BitSet};
use common::BitSet;
use super::map_bound;
use super::range_query_u64_fastfield::FastFieldRangeWeight;
use crate::error::TantivyError;
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::range_query::range_query_ip_fastfield::IPFastFieldRangeWeight;
use crate::query::range_query::{is_type_valid_for_fastfield_range_query, map_bound_res};
use crate::query::range_query::is_type_valid_for_fastfield_range_query;
use crate::query::{BitSetDocSet, ConstScorer, EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
use crate::termdict::{TermDictionary, TermStreamer};
use crate::{DateTime, DocId, Score};
use crate::{DocId, Score};
/// `RangeQuery` matches all documents that have at least one term within a defined range.
///
@@ -40,8 +36,10 @@ use crate::{DateTime, DocId, Score};
/// ```rust
/// use tantivy::collector::Count;
/// use tantivy::query::RangeQuery;
/// use tantivy::Term;
/// use tantivy::schema::{Schema, INDEXED};
/// use tantivy::{doc, Index, IndexWriter};
/// use std::ops::Bound;
/// # fn test() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let year_field = schema_builder.add_u64_field("year", INDEXED);
@@ -59,7 +57,10 @@ use crate::{DateTime, DocId, Score};
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
/// let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
/// let docs_in_the_sixties = RangeQuery::new(
/// Bound::Included(Term::from_field_u64(year_field, 1960)),
/// Bound::Excluded(Term::from_field_u64(year_field, 1970)),
/// );
/// let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
/// assert_eq!(num_60s_books, 2285);
/// Ok(())
@@ -68,246 +69,46 @@ use crate::{DateTime, DocId, Score};
/// ```
#[derive(Clone, Debug)]
pub struct RangeQuery {
field: String,
value_type: Type,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
lower_bound: Bound<Term>,
upper_bound: Bound<Term>,
limit: Option<u64>,
}
/// Returns the inner value of a `Bound`
pub(crate) fn inner_bound(val: &Bound<Term>) -> Option<&Term> {
match val {
Bound::Included(term) | Bound::Excluded(term) => Some(term),
Bound::Unbounded => None,
}
}
impl RangeQuery {
/// Creates a new `RangeQuery` from bounded start and end terms.
///
/// If the terms' value type does not match the field type, building the
/// `Weight` object will fail or behave incorrectly.
pub fn new_term_bounds(
field: String,
value_type: Type,
lower_bound: &Bound<Term>,
upper_bound: &Bound<Term>,
) -> RangeQuery {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
pub fn new(lower_bound: Bound<Term>, upper_bound: Bound<Term>) -> RangeQuery {
RangeQuery {
field,
value_type,
lower_bound: map_bound(lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(upper_bound, verify_and_unwrap_term),
lower_bound,
upper_bound,
limit: None,
}
}
/// Creates a new `RangeQuery` over a `i64` field.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64(field: String, range: Range<i64>) -> RangeQuery {
RangeQuery::new_i64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `i64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `i64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_i64_bounds(
field: String,
lower_bound: Bound<i64>,
upper_bound: Bound<i64>,
) -> RangeQuery {
let make_term_val = |val: &i64| {
Term::from_field_i64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::I64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Creates a new `RangeQuery` over a `f64` field.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64(field: String, range: Range<f64>) -> RangeQuery {
RangeQuery::new_f64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `f64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `f64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_f64_bounds(
field: String,
lower_bound: Bound<f64>,
upper_bound: Bound<f64>,
) -> RangeQuery {
let make_term_val = |val: &f64| {
Term::from_field_f64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::F64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64_bounds(
field: String,
lower_bound: Bound<u64>,
upper_bound: Bound<u64>,
) -> RangeQuery {
let make_term_val = |val: &u64| {
Term::from_field_u64(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::U64,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `ip` field.
///
/// If the field is not of the type `ip`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_ip_bounds(
field: String,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
) -> RangeQuery {
let make_term_val = |val: &Ipv6Addr| {
Term::from_field_ip_addr(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::IpAddr,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `u64` field.
///
/// If the field is not of the type `u64`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_u64(field: String, range: Range<u64>) -> RangeQuery {
RangeQuery::new_u64_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `date` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date_bounds(
field: String,
lower_bound: Bound<DateTime>,
upper_bound: Bound<DateTime>,
) -> RangeQuery {
let make_term_val = |val: &DateTime| {
Term::from_field_date(Field::from_field_id(0), *val)
.serialized_value_bytes()
.to_owned()
};
RangeQuery {
field,
value_type: Type::Date,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `date` field.
///
/// If the field is not of the type `date`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_date(field: String, range: Range<DateTime>) -> RangeQuery {
RangeQuery::new_date_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// The two `Bound` arguments make it possible to create more complex
/// ranges than semi-inclusive range.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str_bounds(
field: String,
lower_bound: Bound<&str>,
upper_bound: Bound<&str>,
) -> RangeQuery {
let make_term_val = |val: &&str| val.as_bytes().to_vec();
RangeQuery {
field,
value_type: Type::Str,
lower_bound: map_bound(&lower_bound, make_term_val),
upper_bound: map_bound(&upper_bound, make_term_val),
limit: None,
}
}
/// Create a new `RangeQuery` over a `Str` field.
///
/// If the field is not of the type `Str`, tantivy
/// will panic when the `Weight` object is created.
pub fn new_str(field: String, range: Range<&str>) -> RangeQuery {
RangeQuery::new_str_bounds(
field,
Bound::Included(range.start),
Bound::Excluded(range.end),
)
}
/// Field to search over
pub fn field(&self) -> &str {
&self.field
pub fn field(&self) -> Field {
self.get_term().field()
}
/// The value type of the field
pub fn value_type(&self) -> Type {
self.get_term().typ()
}
pub(crate) fn get_term(&self) -> &Term {
inner_bound(&self.lower_bound)
.or(inner_bound(&self.upper_bound))
.expect("At least one bound must be set")
}
/// Limit the number of term the `RangeQuery` will go through.
@@ -319,70 +120,23 @@ impl RangeQuery {
}
}
/// Returns true if the type maps to a u64 fast field
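/// (`i64`, `f64`, `bool` and date values are stored in fast fields through
/// order-preserving mappings to `u64`, so their ranges can be evaluated in
/// `u64` value space.)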
pub(crate) fn maps_to_u64_fastfield(typ: Type) -> bool {
match typ {
Type::U64 | Type::I64 | Type::F64 | Type::Bool | Type::Date => true,
Type::IpAddr => false,
Type::Str | Type::Facet | Type::Bytes | Type::Json => false,
}
}
impl Query for RangeQuery {
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let schema = enable_scoring.schema();
let field_type = schema
.get_field_entry(schema.get_field(&self.field)?)
.field_type();
let value_type = field_type.value_type();
if value_type != self.value_type {
let err_msg = format!(
"Create a range query of the type {:?}, when the field given was of type \
{value_type:?}",
self.value_type
);
return Err(TantivyError::SchemaError(err_msg));
}
let field_type = schema.get_field_entry(self.field()).field_type();
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type) {
if field_type.is_ip_addr() {
let parse_ip_from_bytes = |data: &Vec<u8>| {
let ip_u128_bytes: [u8; 16] = data.as_slice().try_into().map_err(|_| {
crate::TantivyError::InvalidArgument(
"Expected 8 bytes for ip address".to_string(),
)
})?;
let ip_u128 = u128::from_be_bytes(ip_u128_bytes);
crate::Result::<Ipv6Addr>::Ok(Ipv6Addr::from_u128(ip_u128))
};
let lower_bound = map_bound_res(&self.lower_bound, parse_ip_from_bytes)?;
let upper_bound = map_bound_res(&self.upper_bound, parse_ip_from_bytes)?;
Ok(Box::new(IPFastFieldRangeWeight::new(
self.field.to_string(),
lower_bound,
upper_bound,
)))
} else {
// We run the range query on u64 value space for performance reasons and simplicity.
// Assert that the type maps to u64.
assert!(maps_to_u64_fastfield(self.value_type));
let parse_from_bytes = |data: &Vec<u8>| {
u64::from_be(BinarySerializable::deserialize(&mut &data[..]).unwrap())
};
let lower_bound = map_bound(&self.lower_bound, parse_from_bytes);
let upper_bound = map_bound(&self.upper_bound, parse_from_bytes);
Ok(Box::new(FastFieldRangeWeight::new_u64_lenient(
self.field.to_string(),
lower_bound,
upper_bound,
)))
}
if field_type.is_fast() && is_type_valid_for_fastfield_range_query(self.value_type()) {
Ok(Box::new(FastFieldRangeWeight::new(
self.field(),
self.lower_bound.clone(),
self.upper_bound.clone(),
)))
} else {
let verify_and_unwrap_term = |val: &Term| val.serialized_value_bytes().to_owned();
Ok(Box::new(RangeWeight {
field: self.field.to_string(),
lower_bound: self.lower_bound.clone(),
upper_bound: self.upper_bound.clone(),
field: self.field(),
lower_bound: map_bound(&self.lower_bound, verify_and_unwrap_term),
upper_bound: map_bound(&self.upper_bound, verify_and_unwrap_term),
limit: self.limit,
}))
}
@@ -390,7 +144,7 @@ impl Query for RangeQuery {
}
pub struct RangeWeight {
field: String,
field: Field,
lower_bound: Bound<Vec<u8>>,
upper_bound: Bound<Vec<u8>>,
limit: Option<u64>,
@@ -423,7 +177,7 @@ impl Weight for RangeWeight {
let max_doc = reader.max_doc();
let mut doc_bitset = BitSet::with_max_value(max_doc);
let inverted_index = reader.inverted_index(reader.schema().get_field(&self.field)?)?;
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_range = self.term_range(term_dict)?;
let mut processed_count = 0;
@@ -477,7 +231,7 @@ mod tests {
use crate::schema::{
Field, IntoIpv6Addr, Schema, TantivyDocument, FAST, INDEXED, STORED, TEXT,
};
use crate::{Index, IndexWriter};
use crate::{Index, IndexWriter, Term};
#[test]
fn test_range_query_simple() -> crate::Result<()> {
@@ -499,7 +253,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// ... or `1960..=1969` if inclusive range is enabled.
let count = searcher.search(&docs_in_the_sixties, &Count)?;
@@ -530,7 +287,10 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
let mut docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960u64..1970u64);
let mut docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
docs_in_the_sixties.limit(5);
// due to the limit and no docs in 1963, it's really only 1960..=1965
@@ -575,29 +335,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_i64("intfield".to_string(), 10..11)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Excluded(Term::from_field_i64(int_field, 11)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(10),
Bound::Included(11)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 10)),
Bound::Included(Term::from_field_i64(int_field, 11)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Excluded(9),
Bound::Included(10)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_i64(int_field, 9)),
Bound::Included(Term::from_field_i64(int_field, 10)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_i64_bounds(
"intfield".to_string(),
Bound::Included(9),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_i64(int_field, 9)),
Bound::Unbounded
)),
91
@@ -646,29 +406,29 @@ mod tests {
|range_query: RangeQuery| searcher.search(&range_query, &Count).unwrap();
assert_eq!(
count_multiples(RangeQuery::new_f64("floatfield".to_string(), 10.0..11.0)),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Excluded(Term::from_field_f64(float_field, 11.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(10.0),
Bound::Included(11.0)
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 10.0)),
Bound::Included(Term::from_field_f64(float_field, 11.0)),
)),
18
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Excluded(9.0),
Bound::Included(10.0)
count_multiples(RangeQuery::new(
Bound::Excluded(Term::from_field_f64(float_field, 9.0)),
Bound::Included(Term::from_field_f64(float_field, 10.0)),
)),
9
);
assert_eq!(
count_multiples(RangeQuery::new_f64_bounds(
"floatfield".to_string(),
Bound::Included(9.0),
count_multiples(RangeQuery::new(
Bound::Included(Term::from_field_f64(float_field, 9.0)),
Bound::Unbounded
)),
91

View File

@@ -1,512 +0,0 @@
//! IP Fastfields support efficient scanning for range queries.
//! We use this variant only if the fastfield exists, otherwise the default in `range_query` is
//! used, which uses the term dictionary + postings.
use std::net::Ipv6Addr;
use std::ops::{Bound, RangeInclusive};
use columnar::{Column, MonotonicallyMappableToU128};
use crate::query::range_query::fast_field_range_query::RangeDocSet;
use crate::query::{ConstScorer, EmptyScorer, Explanation, Scorer, Weight};
use crate::{DocId, DocSet, Score, SegmentReader, TantivyError};
/// `IPFastFieldRangeWeight` uses the ip address fast field to execute range queries.
pub struct IPFastFieldRangeWeight {
field: String,
lower_bound: Bound<Ipv6Addr>,
upper_bound: Bound<Ipv6Addr>,
}
impl IPFastFieldRangeWeight {
/// Creates a new IPFastFieldRangeWeight.
pub fn new(field: String, lower_bound: Bound<Ipv6Addr>, upper_bound: Bound<Ipv6Addr>) -> Self {
Self {
field,
lower_bound,
upper_bound,
}
}
}
impl Weight for IPFastFieldRangeWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let Some(ip_addr_column): Option<Column<Ipv6Addr>> =
reader.fast_fields().column_opt(&self.field)?
else {
return Ok(Box::new(EmptyScorer));
};
let value_range = bound_to_value_range(
&self.lower_bound,
&self.upper_bound,
ip_addr_column.min_value(),
ip_addr_column.max_value(),
);
let docset = RangeDocSet::new(value_range, ip_addr_column);
Ok(Box::new(ConstScorer::new(docset, boost)))
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let mut scorer = self.scorer(reader, 1.0)?;
if scorer.seek(doc) != doc {
return Err(TantivyError::InvalidArgument(format!(
"Document #({doc}) does not match"
)));
}
let explanation = Explanation::new("Const", scorer.score());
Ok(explanation)
}
}
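// Turns the pair of bounds into an inclusive value range by stepping the
// u128 representation of excluded endpoints by one. Note that stepping an
// excluded bound at the numeric extremes (`u128::MAX` / 0) would overflow.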
fn bound_to_value_range(
lower_bound: &Bound<Ipv6Addr>,
upper_bound: &Bound<Ipv6Addr>,
min_value: Ipv6Addr,
max_value: Ipv6Addr,
) -> RangeInclusive<Ipv6Addr> {
let start_value = match lower_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() + 1),
Bound::Unbounded => min_value,
};
let end_value = match upper_bound {
Bound::Included(ip_addr) => *ip_addr,
Bound::Excluded(ip_addr) => Ipv6Addr::from(ip_addr.to_u128() - 1),
Bound::Unbounded => max_value,
};
start_value..=end_value
}
#[cfg(test)]
pub mod tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::schema::{Schema, FAST, INDEXED, STORED, STRING};
use crate::{Index, IndexWriter};
#[derive(Clone, Debug)]
pub struct Doc {
pub id: String,
pub ip: Ipv6Addr,
}
fn operation_strategy() -> impl Strategy<Value = Doc> {
prop_oneof![
(0u64..10_000u64).prop_map(doc_from_id_1),
(1u64..10_000u64).prop_map(doc_from_id_2),
]
}
pub fn doc_from_id_1(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: id.to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
fn doc_from_id_2(id: u64) -> Doc {
let id = id * 1000;
Doc {
// ip != id
id: (id - 1).to_string(),
ip: Ipv6Addr::from_u128(id as u128),
}
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(10))]
#[test]
fn test_ip_range_for_docs_prop(ops in proptest::collection::vec(operation_strategy(), 1..1000)) {
assert!(test_ip_range_for_docs(&ops).is_ok());
}
}
#[test]
fn test_ip_range_regression1() {
let ops = &[doc_from_id_1(0)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression2() {
let ops = &[
doc_from_id_1(52),
doc_from_id_1(63),
doc_from_id_1(12),
doc_from_id_2(91),
doc_from_id_2(33),
];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression3() {
let ops = &[doc_from_id_1(1), doc_from_id_1(2), doc_from_id_1(3)];
assert!(test_ip_range_for_docs(ops).is_ok());
}
#[test]
fn test_ip_range_regression3_simple() {
let mut schema_builder = Schema::builder();
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer: IndexWriter = index.writer_for_tests().unwrap();
let ip_addrs: Vec<Ipv6Addr> = [1000, 2000, 3000]
.into_iter()
.map(Ipv6Addr::from_u128)
.collect();
for &ip_addr in &ip_addrs {
writer
.add_document(doc!(ips_field=>ip_addr, ips_field=>ip_addr))
.unwrap();
}
writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let range_weight = IPFastFieldRangeWeight {
field: "ips".to_string(),
lower_bound: Bound::Included(ip_addrs[1]),
upper_bound: Bound::Included(ip_addrs[2]),
};
let count = range_weight.count(searcher.segment_reader(0)).unwrap();
assert_eq!(count, 2);
}
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
let mut schema_builder = Schema::builder();
let ip_field = schema_builder.add_ip_addr_field("ip", STORED | FAST);
let ips_field = schema_builder.add_ip_addr_field("ips", FAST | INDEXED);
let text_field = schema_builder.add_text_field("id", STRING | STORED);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer = index.writer_with_num_threads(2, 60_000_000).unwrap();
for doc in docs.iter() {
index_writer
.add_document(doc!(
ips_field => doc.ip,
ips_field => doc.ip,
ip_field => doc.ip,
text_field => doc.id.to_string(),
))
.unwrap();
}
index_writer.commit().unwrap();
}
index
}
fn test_ip_range_for_docs(docs: &[Doc]) -> crate::Result<()> {
let index = create_index_from_docs(docs);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let get_num_hits = |query| searcher.search(&query, &Count).unwrap();
let query_from_text = |text: &str| {
QueryParser::for_index(&index, vec![])
.parse_query(text)
.unwrap()
};
let gen_query_inclusive = |field: &str, ip_range: &RangeInclusive<Ipv6Addr>| {
format!("{field}:[{} TO {}]", ip_range.start(), ip_range.end())
};
let test_sample = |sample_docs: &[Doc]| {
let mut ips: Vec<Ipv6Addr> = sample_docs.iter().map(|doc| doc.ip).collect();
ips.sort();
let ip_range = ips[0]..=ips[1];
let expected_num_hits = docs
.iter()
.filter(|doc| (ips[0]..=ips[1]).contains(&doc.ip))
.count();
let query = gen_query_inclusive("ip", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
let query = gen_query_inclusive("ips", &ip_range);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search
let id_filter = sample_docs[0].id.to_string();
let expected_num_hits = docs
.iter()
.filter(|doc| ip_range.contains(&doc.ip) && doc.id == id_filter)
.count();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ip", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
// Intersection search on multivalue ip field
let id_filter = sample_docs[0].id.to_string();
let query = format!(
"{} AND id:{}",
gen_query_inclusive("ips", &ip_range),
&id_filter
);
assert_eq!(get_num_hits(query_from_text(&query)), expected_num_hits);
};
test_sample(&[docs[0].clone(), docs[0].clone()]);
if docs.len() > 1 {
test_sample(&[docs[0].clone(), docs[1].clone()]);
test_sample(&[docs[1].clone(), docs[1].clone()]);
}
if docs.len() > 2 {
test_sample(&[docs[1].clone(), docs[2].clone()]);
}
Ok(())
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use test::Bencher;
use super::tests::*;
use super::*;
use crate::collector::Count;
use crate::query::QueryParser;
use crate::Index;
fn get_index_0_to_100() -> Index {
let mut rng = StdRng::from_seed([1u8; 32]);
let num_vals = 100_000;
let docs: Vec<_> = (0..num_vals)
.map(|_i| {
let id = if rng.gen_bool(0.01) {
"veryfew".to_string() // 1%
} else if rng.gen_bool(0.1) {
"few".to_string() // 9%
} else {
"many".to_string() // 90%
};
Doc {
id,
// Multiply by 1000, so that we create many buckets in the compact space
// The benches depend on this range to select n-percent of elements with the
// methods below.
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
}
})
.collect();
create_index_from_docs(&docs)
}
fn get_90_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(90 * 1000);
start..=end
}
fn get_10_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(0);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn get_1_percent() -> RangeInclusive<Ipv6Addr> {
let start = Ipv6Addr::from_u128(10 * 1000);
let end = Ipv6Addr::from_u128(10 * 1000);
start..=end
}
fn execute_query(
field: &str,
ip_range: RangeInclusive<Ipv6Addr>,
suffix: &str,
index: &Index,
) -> usize {
let gen_query_inclusive = |from: &Ipv6Addr, to: &Ipv6Addr| {
format!(
"{}:[{} TO {}] {}",
field,
&from.to_string(),
&to.to_string(),
suffix
)
};
let query = gen_query_inclusive(ip_range.start(), ip_range.end());
let query_from_text = |text: &str| {
QueryParser::for_index(index, vec![])
.parse_query(text)
.unwrap()
};
let query = query_from_text(&query);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
searcher.search(&query, &Count).unwrap()
}
#[bench]
fn bench_ip_range_hit_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ip", get_90_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_1_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_1_percent(), "AND id:veryfew", &index));
}
#[bench]
fn bench_ip_range_hit_10_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_10_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_90_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:many", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_10_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:few", &index));
}
#[bench]
fn bench_ip_range_hit_90_percent_intersect_with_1_percent_multi(bench: &mut Bencher) {
let index = get_index_0_to_100();
bench.iter(|| execute_query("ips", get_90_percent(), "AND id:veryfew", &index));
}
}

Some files were not shown because too many files have changed in this diff.