Compare commits

..

31 Commits

Author SHA1 Message Date
Pascal Seitz
80538175e8 fix build 2024-10-16 10:33:24 +08:00
dependabot[bot]
8dc942e8e7 Update binggan requirement from 0.10.0 to 0.12.0
---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
2024-10-15 20:05:11 +00:00
Bruce Mitchener
c17e513377 Reduce typo count. (#2510) 2024-10-10 09:55:37 +08:00
PSeitz
2f5a269c70 update packages (#2500)
fixes some warnings
2024-09-25 17:46:18 +08:00
PSeitz
50532260e3 update changelog (#2496) 2024-09-25 10:28:53 +08:00
Tri
8bd6eb06e6 feat: make SegmentMeta.with_max_doc public (#2499)
* chore: add container

* feat: make max doc editable externally

* chore: expose another method

* chore: remove comments

* remove unused devcontainer

* chore: manually match nightly format

* chore: change weird formatting

* revert format change

* fix: format with nightly
2024-09-23 12:39:36 +08:00
PSeitz
55b0b52457 Fix AggregationLimits (#2495)
* change AggregationLimits behavior

This fixes an issue encountered with the current behavior of
AggregationLimits.
Previously we had AggregationLimits and RessourceLimitGuard, which both
tracked memory, but only RessourceLimitGuard released memory when
dropped, while AggregationLimits did not.

This PR changes AggregationLimits to be a guard itself and removes the
RessourceLimitGuard.

* rename AggregationLimits to AggregationLimitsGuard
2024-09-17 14:25:47 +08:00
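A minimal sketch of the guard behavior this commit describes, under illustrative names (`MemoryLimitGuard` is not tantivy's type): clones share one atomic counter, each instance tracks its own allocations, and `Drop` releases only that instance's share.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct MemoryLimitGuard {
    shared: Arc<AtomicU64>, // total bytes across all clones of the guard
    limit: u64,             // request-wide memory limit in bytes
    tracked: u64,           // bytes allocated through *this* instance
}

impl MemoryLimitGuard {
    fn try_allocate(&mut self, bytes: u64) -> Result<(), String> {
        let total = self.shared.fetch_add(bytes, Ordering::Relaxed) + bytes;
        self.tracked += bytes;
        if total > self.limit {
            return Err(format!("memory limit exceeded: {total} > {}", self.limit));
        }
        Ok(())
    }
}

impl Clone for MemoryLimitGuard {
    // A clone shares the counter but starts with zero tracked bytes,
    // mirroring the Clone impl shown in the diff further below.
    fn clone(&self) -> Self {
        MemoryLimitGuard {
            shared: Arc::clone(&self.shared),
            limit: self.limit,
            tracked: 0,
        }
    }
}

impl Drop for MemoryLimitGuard {
    fn drop(&mut self) {
        // Give back only what this instance tracked; the flaw described in
        // the commit was that the non-guard type never released its share.
        self.shared.fetch_sub(self.tracked, Ordering::Relaxed);
    }
}

fn main() {
    let mut guard = MemoryLimitGuard {
        shared: Arc::new(AtomicU64::new(0)),
        limit: 1_000,
        tracked: 0,
    };
    assert!(guard.try_allocate(800).is_ok());
    assert!(guard.try_allocate(800).is_err()); // over the shared limit
    drop(guard); // the shared counter returns to 0
}
```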
dependabot[bot]
56fc56c5b9 Update binggan requirement from 0.8.0 to 0.10.0 (#2493)
* Update binggan requirement from 0.8.0 to 0.10.0

---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

* update PR

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2024-09-10 14:26:06 +08:00
trinity-1686a
85395d942a fix clippy lints from 1.80-1.81 (#2488)
* fix some clippy lints

* fix clippy::doc_lazy_continuation

* fix some lints for 1.82
2024-09-05 14:33:05 +02:00
PSeitz
a206c3ccd3 add compat tests (#2485) 2024-09-04 18:26:57 +08:00
Chaya
dc5d31c116 grammar and misspellings (#2483)
* grammar

* grammar

* misspelling
2024-09-04 12:45:31 +08:00
gezihuzi
95a4ddea3e Fix: Improve collapse_overlapped_ranges function (#2474)
* Fix: Improve collapse_overlapped_ranges function

- Refactor into separate sort_and_deduplicate_ranges and merge_overlapping_ranges functions
- Enhance sorting to consider both start and end of ranges
- Optimize merging logic to handle adjacent ranges
- Add comprehensive examples in function documentation
- Ensure proper handling of duplicate and unsorted input ranges
- Improve overall efficiency and readability of range collapsing algorithm

* move debug_assert

---------

Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>
2024-09-04 12:39:13 +08:00
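A short sketch of the collapse behavior this commit describes (sort, deduplicate, then merge overlapping or adjacent ranges); it mirrors the described behavior, not tantivy's exact implementation.

```rust
use std::ops::Range;

/// Sort by (start, end), drop duplicates, then merge any range that
/// overlaps or is adjacent to the previous merged range.
fn collapse_overlapped_ranges(mut ranges: Vec<Range<usize>>) -> Vec<Range<usize>> {
    ranges.sort_by_key(|r| (r.start, r.end));
    ranges.dedup();
    let mut out: Vec<Range<usize>> = Vec::new();
    for r in ranges {
        match out.last_mut() {
            // `start <= end` also merges adjacent ranges like 0..2 and 2..3.
            Some(last) if r.start <= last.end => last.end = last.end.max(r.end),
            _ => out.push(r),
        }
    }
    out
}

fn main() {
    let collapsed = collapse_overlapped_ranges(vec![4..6, 0..2, 2..3, 5..8]);
    assert_eq!(collapsed, vec![0..3, 4..8]); // 2..3 is adjacent to 0..2
}
```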
trinity-1686a
ab5125d3dc remove unused trait bounds and outdated doc comment (#2478) 2024-09-03 16:31:51 +02:00
trinity-1686a
9f81d59ecd make find_field_with_default return json fields without path (#2476)
* make find_field_with_default return json fields without path

* add tests for find_field_with_default
2024-08-19 15:25:29 +02:00
PSeitz
c71ec8086d add FastFieldRangeQuery, rename (#2477)
* add FastFieldRangeQuery, rename

* remove Query impl
2024-08-19 09:02:00 +02:00
PSeitz
27be6aed91 lift clauses in LogicalAst (#2449)
(a OR b) OR (c OR d) can be simplified to (a OR b OR c OR d)
(a AND b) AND (c AND d) can be simplified to (a AND b AND c AND d)

This directly affects how queries are executed.

Also removes the unused SumWithCoordsCombiner; the number of fields it
carried was unused and private.
2024-08-14 19:21:26 +02:00
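A minimal sketch of the clause lifting described above, on a hypothetical AST (not tantivy's `LogicalAst`): nested clauses of the same kind are flattened into their parent.

```rust
// Hypothetical AST for illustration only.
#[derive(Debug)]
enum Ast {
    Term(&'static str),
    Or(Vec<Ast>),
    And(Vec<Ast>),
}

/// (a OR b) OR (c OR d) => (a OR b OR c OR d), and likewise for AND.
fn lift(ast: Ast) -> Ast {
    match ast {
        Ast::Or(children) => {
            let mut flat = Vec::new();
            for child in children.into_iter().map(lift) {
                match child {
                    Ast::Or(grandchildren) => flat.extend(grandchildren),
                    other => flat.push(other),
                }
            }
            Ast::Or(flat)
        }
        Ast::And(children) => {
            let mut flat = Vec::new();
            for child in children.into_iter().map(lift) {
                match child {
                    Ast::And(grandchildren) => flat.extend(grandchildren),
                    other => flat.push(other),
                }
            }
            Ast::And(flat)
        }
        term => term,
    }
}

fn main() {
    let ast = Ast::Or(vec![
        Ast::Or(vec![Ast::Term("a"), Ast::Term("b")]),
        Ast::Or(vec![Ast::Term("c"), Ast::Term("d")]),
    ]);
    // Prints a single OR clause with four terms.
    println!("{:?}", lift(ast));
}
```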
PSeitz
3d1c4b313a support ff range queries on json fields (#2456)
* support ff range queries on json fields

* fix term date truncation

* use inverted index range query for phrase prefix queries

* rename to InvertedIndexRangeQuery

* fix column filter, add mixed column test
2024-08-02 00:06:50 +08:00
PSeitz
0d4e319965 add Key::I64 and Key::U64 variants in aggregation (#2468)
* add Key::I64 and Key::U64 variants in aggregation

Currently all numerical `Key` values are returned as f64. This causes problems in some
cases with precision and with the way f64 is serialized.

This PR adds `Key::I64` and `Key::U64` variants and uses them in the term
aggregation.

* add clarification comment
2024-07-31 20:29:32 +08:00
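A small demonstration of the f64 precision problem motivating this change; only the added variant names come from the commit, the enum shape here is a sketch.

```rust
fn main() {
    // u64 values above 2^53 cannot be represented exactly as f64,
    // which is why returning numerical keys as f64 loses precision.
    let key: u64 = (1u64 << 53) + 1; // 9_007_199_254_740_993
    let as_f64 = key as f64;
    assert_eq!(as_f64 as u64, key - 1); // the round trip lands on a different value

    // Dedicated integer variants keep the exact value.
    #[allow(dead_code)]
    enum Key {
        F64(f64),
        I64(i64),
        U64(u64),
    }
    let _exact = Key::U64(key); // serialized as an integer, no precision loss
}
```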
PSeitz
75dc3eb298 extend custom order deserialization (#2451)
allow arrays
improve validation
closes https://github.com/quickwit-oss/tantivy/issues/2435
2024-07-30 18:36:08 +08:00
PSeitz
3f6d225086 fix potential endless loop in merge (#2457)
avoid single segments lists without deletes as merge candidates, as they will be moved
to a merge operation and filtered for merging in the next
consider_merge_options call. In rare cases this may end up in a endless
merge loop where only single segments where nothing is to be done are
merged.
2024-07-30 16:37:20 +08:00
PSeitz
d8843c608c make FastFieldRangeWeight::new pub (#2460) 2024-07-29 10:39:27 +08:00
PSeitz
7ebcc15b17 add support for str fast field range query (#2453)
* add support for str fast field range query

Add support for range queries on fast fields by converting term bounds to
term-ordinal bounds.

closes https://github.com/quickwit-oss/tantivy/issues/2023

* extend tests, rename

* update comment

* update comment
2024-07-17 09:31:42 +08:00
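A sketch of the bound conversion idea, using a sorted slice as a stand-in for the term dictionary; function and variable names are illustrative. Once both bounds are ordinals, the query runs as a cheap numeric range over the fast field. The upper bound is handled symmetrically.

```rust
use std::ops::Bound;

/// Convert the lower string bound of a range query into a bound over
/// term ordinals of the sorted dictionary `dict`.
fn lower_str_bound_to_ord(dict: &[&str], bound: Bound<&str>) -> Bound<u64> {
    match bound {
        // first ordinal whose term is >= s
        Bound::Included(s) => Bound::Included(dict.partition_point(|t| *t < s) as u64),
        // first ordinal whose term is > s
        Bound::Excluded(s) => Bound::Included(dict.partition_point(|t| *t <= s) as u64),
        Bound::Unbounded => Bound::Unbounded,
    }
}

fn main() {
    let dict = ["apple", "banana", "cherry"]; // sorted term dictionary
    assert_eq!(lower_str_bound_to_ord(&dict, Bound::Included("banana")), Bound::Included(1));
    assert_eq!(lower_str_bound_to_ord(&dict, Bound::Excluded("banana")), Bound::Included(2));
}
```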
PSeitz
1b4076691f refactor fast field query (#2452)
As preparation for #2023 and #1709

* Use Term to pass parameters
* merge u64 and ip fast field range query

Side note: I did not rename range_query_u64_fastfield, because then git can't track the changes.
2024-07-15 18:08:05 +08:00
Robert Caulk
eab660873a doc: fix typo in readme (#2450) 2024-07-09 15:12:22 +08:00
PSeitz
232f37126e fix coverage (#2448) 2024-07-05 12:04:18 +08:00
PSeitz
13e9885dfd faster term aggregation fetch terms (#2447)
Big impact for term aggregations with a large `size` parameter (e.g. 1000).
Adds a top-1000 term agg bench.

full
terms_few                                      Memory: 27.3 KB (+79.09%)    Avg: 3.8058ms (+2.40%)      Median: 3.7192ms (+3.47%)       [3.6224ms .. 4.3721ms]
terms_many                                     Memory: 6.9 MB               Avg: 12.6102ms (-4.70%)     Median: 12.1389ms (-6.58%)      [10.2847ms .. 15.4857ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 15.8216ms (-83.19%)    Median: 15.4899ms (-83.46%)     [13.4250ms .. 20.6897ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 14.7820ms (-3.95%)     Median: 14.2236ms (-4.28%)      [12.6669ms .. 21.0968ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 551.6218ms (+7.18%)    Median: 549.8826ms (+11.01%)    [496.7371ms .. 592.1299ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 197.7029ms (+2.66%)    Median: 190.1564ms (+0.64%)     [167.9226ms .. 245.6651ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (+0.00%)     Avg: 242.0121ms (+0.92%)    Median: 237.7084ms (-2.85%)     [201.9959ms .. 302.2136ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 122.6036ms (+1.21%)    Median: 119.0033ms (+2.60%)     [109.2859ms .. 161.5858ms]
range_agg_with_term_agg_few                    Memory: 45.4 KB (+39.75%)    Avg: 24.5454ms (+2.14%)     Median: 24.2861ms (+2.44%)      [23.5109ms .. 27.8406ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 56.8049ms (+3.01%)     Median: 50.9706ms (+1.52%)      [41.4517ms .. 90.3934ms]
dense
terms_few                                      Memory: 28.8 KB (+81.74%)    Avg: 8.9092ms (-2.24%)      Median: 8.7143ms (-1.31%)      [8.6148ms .. 10.3868ms]
terms_many                                     Memory: 6.9 MB (-0.00%)      Avg: 17.9604ms (-10.18%)    Median: 17.1552ms (-11.93%)    [14.8979ms .. 26.2779ms]
terms_many_top_1000                            Memory: 6.9 MB               Avg: 21.4963ms (-78.90%)    Median: 21.2924ms (-78.98%)    [18.2033ms .. 28.0087ms]
terms_many_order_by_term                       Memory: 6.9 MB               Avg: 20.4167ms (-9.13%)     Median: 19.5596ms (-11.37%)    [17.5153ms .. 29.5987ms]
terms_many_with_top_hits                       Memory: 58.2 MB              Avg: 518.4474ms (-6.41%)    Median: 514.9180ms (-9.44%)    [471.5550ms .. 579.0220ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB              Avg: 263.6702ms (-2.78%)    Median: 260.8775ms (-2.55%)    [239.5754ms .. 304.6669ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB              Avg: 299.9791ms (-2.01%)    Median: 302.2180ms (-3.08%)    [239.2080ms .. 346.3649ms]
terms_few_with_cardinality_agg                 Memory: 10.6 MB              Avg: 136.3303ms (-3.12%)    Median: 132.3831ms (-2.88%)    [123.7564ms .. 164.7914ms]
range_agg_with_term_agg_few                    Memory: 47.1 KB (+37.81%)    Avg: 35.4538ms (+0.66%)     Median: 34.8754ms (-0.56%)     [34.2287ms .. 40.0884ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB               Avg: 72.2269ms (-4.38%)     Median: 66.1174ms (-4.98%)     [55.5125ms .. 124.1622ms]
sparse
terms_few                                      Memory: 27.3 KB (+69.68%)    Avg: 19.6053ms (-1.15%)     Median: 19.4543ms (-0.38%)     [19.3056ms .. 24.0547ms]
terms_many                                     Memory: 1.8 MB               Avg: 21.2886ms (-6.28%)     Median: 21.1287ms (-6.65%)     [20.6640ms .. 24.6144ms]
terms_many_top_1000                            Memory: 2.6 MB               Avg: 23.4869ms (-85.53%)    Median: 23.3393ms (-85.61%)    [22.7789ms .. 25.0896ms]
terms_many_order_by_term                       Memory: 1.8 MB               Avg: 21.7437ms (-7.78%)     Median: 21.6272ms (-7.66%)     [21.0409ms .. 23.6517ms]
terms_many_with_top_hits                       Memory: 13.1 MB              Avg: 43.7926ms (-2.76%)     Median: 44.3602ms (+0.01%)     [37.8039ms .. 51.0451ms]
terms_many_with_avg_sub_agg                    Memory: 7.5 MB               Avg: 34.6307ms (+3.72%)     Median: 33.4522ms (+1.16%)     [32.4418ms .. 41.4196ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 7.4 MB               Avg: 46.4318ms (+1.16%)     Median: 46.4050ms (+2.03%)     [44.5986ms .. 48.5142ms]
terms_few_with_cardinality_agg                 Memory: 680.0 KB (-0.04%)    Avg: 35.4410ms (+2.05%)     Median: 35.1384ms (+1.19%)     [34.4402ms .. 39.1082ms]
range_agg_with_term_agg_few                    Memory: 45.7 KB (+39.44%)    Avg: 22.7760ms (+0.44%)     Median: 22.5152ms (-0.35%)     [22.3078ms .. 26.1567ms]
range_agg_with_term_agg_many                   Memory: 1.8 MB               Avg: 25.7696ms (-4.45%)     Median: 25.4009ms (-5.61%)     [24.7874ms .. 29.6434ms]
multivalue
terms_few                                      Memory: 244.4 KB            Avg: 15.1253ms (-2.85%)     Median: 15.0988ms (-0.54%)     [14.8790ms .. 15.8193ms]
terms_many                                     Memory: 6.9 MB (-0.00%)     Avg: 26.3019ms (-6.24%)     Median: 26.3662ms (-4.94%)     [21.3553ms .. 31.0564ms]
terms_many_top_1000                            Memory: 6.9 MB              Avg: 29.5212ms (-72.90%)    Median: 29.4257ms (-72.84%)    [24.2645ms .. 35.1607ms]
terms_many_order_by_term                       Memory: 6.9 MB              Avg: 28.6076ms (-4.93%)     Median: 28.1059ms (-6.64%)     [24.0845ms .. 34.1493ms]
terms_many_with_top_hits                       Memory: 58.3 MB             Avg: 570.1548ms (+1.52%)    Median: 572.7759ms (+0.53%)    [525.9567ms .. 617.0862ms]
terms_many_with_avg_sub_agg                    Memory: 27.8 MB             Avg: 305.5207ms (+0.24%)    Median: 296.0101ms (-0.22%)    [277.8579ms .. 373.5914ms]
terms_many_json_mixed_type_with_avg_sub_agg    Memory: 42.0 MB (-0.00%)    Avg: 324.7342ms (-2.51%)    Median: 319.0025ms (-2.58%)    [298.7122ms .. 368.6144ms]
terms_few_with_cardinality_agg                 Memory: 10.8 MB             Avg: 151.6126ms (-2.54%)    Median: 149.0616ms (-0.32%)    [136.5592ms .. 181.8942ms]
range_agg_with_term_agg_few                    Memory: 248.2 KB            Avg: 49.5225ms (+3.11%)     Median: 48.3994ms (+3.18%)     [46.4134ms .. 60.5989ms]
range_agg_with_term_agg_many                   Memory: 6.9 MB              Avg: 85.9824ms (-3.66%)     Median: 78.4266ms (-3.85%)     [64.1231ms .. 128.5279ms]
2024-07-03 12:42:59 +08:00
PSeitz
56d79cb203 fix cardinality aggregation performance (#2446)
* fix cardinality aggregation performance

fix cardinality performance by fetching multiple terms at once. This
avoids decompressing the same block and keeps the buffer state between
terms.

add cardinality aggregation benchmark

bump rust version to 1.66

Performance comparison to before (AllQuery)
```
full
cardinality_agg                   Memory: 3.5 MB (-0.00%)    Avg: 21.2256ms (-97.78%)    Median: 21.0042ms (-97.82%)    [20.4717ms .. 23.6206ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 81.9293ms (-97.37%)    Median: 81.5526ms (-97.38%)    [79.7564ms .. 88.0374ms]
dense
cardinality_agg                   Memory: 3.6 MB (-0.00%)    Avg: 25.9372ms (-97.24%)    Median: 25.7744ms (-97.25%)    [24.7241ms .. 27.8793ms]
terms_few_with_cardinality_agg    Memory: 10.6 MB            Avg: 93.9897ms (-96.91%)    Median: 92.7821ms (-96.94%)    [90.3312ms .. 117.4076ms]
sparse
cardinality_agg                   Memory: 895.4 KB (-0.00%)    Avg: 22.5113ms (-95.01%)    Median: 22.5629ms (-94.99%)    [22.1628ms .. 22.9436ms]
terms_few_with_cardinality_agg    Memory: 680.2 KB             Avg: 26.4250ms (-94.85%)    Median: 26.4135ms (-94.86%)    [26.3210ms .. 26.6774ms]
```

* clippy

* assert for sorted ordinals
2024-07-02 15:29:00 +08:00
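A sketch of the batching idea: because the requested ordinals are sorted (see the assert mentioned above), one forward pass resolves all terms and visits each dictionary block at most once. The block layout here is illustrative, not tantivy's sstable format.

```rust
struct Block {
    first_ord: u64,
    terms: Vec<String>,
}

/// Resolve sorted term ordinals to terms in one forward pass, so each
/// block is located (and, in the real dictionary, decompressed) at most once.
fn fetch_terms_batched(blocks: &[Block], sorted_ords: &[u64]) -> Vec<String> {
    let mut out = Vec::with_capacity(sorted_ords.len());
    let mut block_idx = 0;
    for &ord in sorted_ords {
        // Advance to the block containing `ord`; the cursor never moves back.
        while block_idx + 1 < blocks.len() && blocks[block_idx + 1].first_ord <= ord {
            block_idx += 1;
        }
        let block = &blocks[block_idx];
        out.push(block.terms[(ord - block.first_ord) as usize].clone());
    }
    out
}

fn main() {
    let blocks = vec![
        Block { first_ord: 0, terms: vec!["a".into(), "b".into()] },
        Block { first_ord: 2, terms: vec!["c".into(), "d".into()] },
    ];
    assert_eq!(fetch_terms_batched(&blocks, &[0, 1, 2, 3]), ["a", "b", "c", "d"]);
}
```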
Paul Masurel
0f4c2e27cf Fixes bug that causes out-of-order sstable key. (#2445)
The previous way to address the problem was to replace \u{0000}
with 0 in different places.

This logic had several flaws. Done on the serializer side (as it was
for the columnar), there was a collision problem:

if a document in the segment contained a json field with a \0 and
another doc contained the same json field but with `0`, then we were
sending the same field path twice to the serializer.

Another option would have been to normalize all values on the writer
side.

This PR simplifies the logic and simply ignores json paths containing a
\0, both in the columnar and the inverted index.

Closes #2442
2024-07-01 15:40:07 +08:00
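A two-line demonstration of the collision described above, assuming the old replace-`\0`-with-`0` normalization:

```rust
fn main() {
    // The flaw: replacing '\0' with '0' on the serializer side makes two
    // distinct json paths collide on the same column key.
    let path_with_nul = "field\u{0000}sub";
    let path_with_zero = "field0sub";
    let normalized = path_with_nul.replace('\u{0000}', "0");
    assert_eq!(normalized, path_with_zero); // same field path sent twice

    // The fix in this PR sidesteps the collision: paths containing '\0'
    // are simply skipped, in both the columnar and the inverted index.
    assert!(path_with_nul.contains('\u{0000}'));
}
```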
落叶乌龟
f9ae295507 feat(query): Make BooleanQuery support minimum_number_should_match (#2405)
* feat(query): Make `BooleanQuery` support `minimum_number_should_match`. See issue #2398.

In this commit, a novel scorer named DisjunctionScorer is introduced, which performs the union of posting lists with a minimum number of required matching clauses; it is implemented via a min-heap. The necessary modifications to `BooleanQuery` and `BooleanWeight` are performed as well.

* fixup! fix test

* fixup!: refactor code.

1. More meaningful names.
2. Add Cache for `Disjunction`'s scorers, and fix bug.
3. Optimize `BooleanWeight::complex_scorer`

Thanks
 Paul Masurel <paul@quickwit.io>

* squash!: come up with better variable naming.

* squash!: fix naming issues.

* squash!: fix typo.

* squash!: Remove CombinationMethod::FullIntersection
2024-07-01 15:39:41 +08:00
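A minimal sketch of a min-heap disjunction with a minimum match count, using plain sorted slices as stand-ins for tantivy's scorers; this shows the general technique, not the PR's `DisjunctionScorer`.

```rust
use std::cmp::Reverse;
use std::collections::BinaryHeap;

/// Merge sorted posting lists and keep only documents that appear in at
/// least `min_match` of them.
fn disjunction(postings: &[&[u32]], min_match: usize) -> Vec<u32> {
    // Min-heap of (next_doc, list_index), smallest doc first.
    let mut heap: BinaryHeap<Reverse<(u32, usize)>> = postings
        .iter()
        .enumerate()
        .filter(|(_, p)| !p.is_empty())
        .map(|(i, p)| Reverse((p[0], i)))
        .collect();
    let mut cursors = vec![1usize; postings.len()];
    let mut out = Vec::new();
    while let Some(Reverse((doc, _))) = heap.peek().copied() {
        // Pop every list currently positioned on `doc` and count them.
        let mut count = 0;
        while let Some(&Reverse((d, i))) = heap.peek() {
            if d != doc {
                break;
            }
            heap.pop();
            count += 1;
            if cursors[i] < postings[i].len() {
                heap.push(Reverse((postings[i][cursors[i]], i)));
                cursors[i] += 1;
            }
        }
        if count >= min_match {
            out.push(doc);
        }
    }
    out
}

fn main() {
    let a = [1, 3, 5];
    let b = [1, 2, 5];
    let c = [5, 7];
    // Docs matched by at least two of the three clauses.
    assert_eq!(disjunction(&[&a, &b, &c], 2), vec![1, 5]);
}
```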
Raphael Coeffic
d9db5302d9 feat: cardinality aggregation (#2337)
* WiP: cardinality aggregation

* Collect unique entries first, then insert into HyperLogLog

* Handle `missing`

* Hybrid approach

* Review changes

- insert `missing` value at most once
- `term_id` -> `term_ord`
- iterate directly over entries without collecting first

* Use salted hasher to include column type

* fix: formatting

* More review fixes

* Add cardinality to test_aggregation_flushing

* Formatting
2024-07-01 07:49:42 +08:00
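A minimal sketch of the approach using the `hyperloglogplus` crate this compare adds to Cargo.toml; the precision and hasher here are illustrative (the PR salts the hasher with the column type so identical values of different types hash differently).

```rust
use std::collections::hash_map::RandomState;

use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};

fn main() {
    // Feed term ordinals into a HyperLogLog sketch and read off an
    // approximate distinct count in constant memory.
    let mut hll: HyperLogLogPlus<u64, RandomState> =
        HyperLogLogPlus::new(16, RandomState::new()).expect("valid precision");
    for term_ord in [1u64, 2, 2, 3, 3, 3] {
        hll.insert(&term_ord);
    }
    // The estimate for three distinct values is ~3; HyperLogLog trades
    // exactness for constant memory.
    println!("estimated cardinality: {}", hll.count());
}
```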
Paul Masurel
e453848134 Recycling buffer in PrefixPhraseScorer (#2443) 2024-06-24 17:11:53 +09:00
150 changed files with 5210 additions and 2251 deletions

View File

@@ -15,11 +15,11 @@ jobs:
steps:
- uses: actions/checkout@v4
- name: Install Rust
run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
- uses: Swatinem/rust-cache@v2
- uses: taiki-e/install-action@cargo-llvm-cov
- name: Generate code coverage
run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
- name: Upload coverage to Codecov
uses: codecov/codecov-action@v3
continue-on-error: true

View File

@@ -1,3 +1,69 @@
Tantivy 0.23 - Unreleased
================================
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21.
#### Bugfixes
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
#### Breaking API Changes
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)
#### Features/Improvements
- **Aggregation**
- Support for cardinality aggregation [#2337](https://github.com/quickwit-oss/tantivy/pull/2337) [#2446](https://github.com/quickwit-oss/tantivy/pull/2446) (@raphaelcoeffic @PSeitz)
- Support for extended stats aggregation [#2247](https://github.com/quickwit-oss/tantivy/pull/2247)(@giovannicuccu)
- Add Key::I64 and Key::U64 variants in aggregation to avoid f64 precision issues [#2468](https://github.com/quickwit-oss/tantivy/pull/2468)(@PSeitz)
- Faster term aggregation fetch terms [#2447](https://github.com/quickwit-oss/tantivy/pull/2447)(@PSeitz)
- Improve custom order deserialization [#2451](https://github.com/quickwit-oss/tantivy/pull/2451)(@PSeitz)
- Change AggregationLimits behavior [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- lower contention on AggregationLimits [#2394](https://github.com/quickwit-oss/tantivy/pull/2394)(@PSeitz)
- fix postcard compatibility for top_hits, add postcard test [#2346](https://github.com/quickwit-oss/tantivy/pull/2346)(@PSeitz)
- reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
- check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
- Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- **Range Queries**
- Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
- Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
- modify fastfield range query heuristic [#2375](https://github.com/quickwit-oss/tantivy/pull/2375)(@trinity-1686a)
- add FastFieldRangeQuery for explicit range queries on fast field (for `RangeQuery` it is autodetected) [#2477](https://github.com/quickwit-oss/tantivy/pull/2477)(@PSeitz)
- add format backwards-compatibility tests [#2485](https://github.com/quickwit-oss/tantivy/pull/2485)(@PSeitz)
- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
- feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
- **Optional Index in Multivalue Columnar Index** For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case). This is alleviated by placing an optional index in the multivalued index to mark documents that have values. This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
- **Performace/Memory**
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
- Recycling buffer in PrefixPhraseScorer [#2443](https://github.com/quickwit-oss/tantivy/pull/2443)(@fulmicoton)
- **Json Type**
- JSON supports now all values on the root level. Previously an object was required. This enables support for flat mixed types. allow more JSON values, fix i64 special case [#2383](https://github.com/quickwit-oss/tantivy/pull/2383)(@PSeitz)
- add json path constructor to term [#2367](https://github.com/quickwit-oss/tantivy/pull/2367)(@PSeitz)
- **QueryParser**
- fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
- improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
- Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)
- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
- use bingang for agg benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)(@PSeitz)
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
Tantivy 0.22
================================
@@ -8,7 +74,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
- Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
- Fix bug occuring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix bug occurring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
- Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)
@@ -26,7 +92,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
- Support ip adresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support ip addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)
@@ -116,7 +182,7 @@ Tantivy 0.20
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
@@ -163,13 +229,13 @@ Tantivy 0.20
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
@@ -208,7 +274,7 @@ Tantivy 0.19
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
- Aggregation
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
- Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
- Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
- Faster indexing
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)

View File

@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.63"
rust-version = "1.66"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
@@ -29,7 +29,7 @@ tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
@@ -38,7 +38,7 @@ levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
downcast-rs = "1.2.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
@@ -47,7 +47,7 @@ rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.10", features = ["serde-well-known"] }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
@@ -64,6 +64,7 @@ tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
fnv = "1.0.7"
@@ -71,7 +72,7 @@ fnv = "1.0.7"
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.8.0"
binggan = "0.12.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"

View File

@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
## Benchmark
The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
performance for different types of queries/collections.
Your mileage WILL vary depending on the nature of queries and their load.

View File

@@ -1,7 +1,7 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems
u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
u64 , etc. should return Result<Option> now that we support optional missing a column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load

View File

@@ -1,3 +1,4 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -17,7 +18,10 @@ pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| $func(index))
$runner.register(stringify!($func), move |index| {
$func(index);
None
})
};
}
@@ -42,7 +46,8 @@ fn main() {
}
fn bench_agg(mut group: InputGroup<Index>) {
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
register!(group, average_u64);
register!(group, average_f64);
register!(group, average_f64_u64);
@@ -51,10 +56,15 @@ fn bench_agg(mut group: InputGroup<Index>) {
register!(group, percentiles_f64);
register!(group, terms_few);
register!(group, terms_many);
register!(group, terms_many_top_1000);
register!(group, terms_many_order_by_term);
register!(group, terms_many_with_top_hits);
register!(group, terms_many_with_avg_sub_agg);
register!(group, terms_many_json_mixed_type_with_sub_agg_card);
register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
register!(group, cardinality_agg);
register!(group, terms_few_with_cardinality_agg);
register!(group, range_agg);
register!(group, range_agg_with_avg_sub_agg);
register!(group, range_agg_with_term_agg_few);
@@ -123,6 +133,33 @@ fn percentiles_f64(index: &Index) {
});
execute_agg(index, agg_req);
}
fn cardinality_agg(index: &Index) {
let agg_req = json!({
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
});
execute_agg(index, agg_req);
}
fn terms_few_with_cardinality_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "text_few_terms" },
"aggs": {
"cardinality": {
"cardinality": {
"field": "text_many_terms"
},
}
}
},
});
execute_agg(index, agg_req);
}
fn terms_few(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
@@ -135,6 +172,12 @@ fn terms_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_top_1000(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
});
execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
let agg_req = json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
@@ -171,7 +214,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
});
execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
let agg_req = json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
@@ -268,6 +311,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
});
execute_agg(index, agg_req);
}
fn histogram(index: &Index) {
let agg_req = json!({
"rangef64": {

View File

@@ -368,9 +368,9 @@ mod test {
for start_idx in 0u32..32u32 {
output.resize(len, 0);
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
for i in 0..len {
for (i, output_byte) in output.iter().enumerate() {
let expected = (start_idx + i as u32) & mask;
assert_eq!(output[i], expected);
assert_eq!(*output_byte, expected);
}
}
}

View File

@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
binggan = "0.12.0"
[[bench]]
name = "bench_merge"

View File

@@ -31,7 +31,7 @@ restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot
# Columnar format
This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
The `(column_name, columne_type)` couple however uniquely identifies a column.
The `(column_name, column_type)` couple however uniquely identifies a column.
That couple is serialized as a column `column_key`. The format of that key is:
`[column_name][ZERO_BYTE][column_type_header: u8]`

View File

@@ -42,6 +42,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
}
}
black_box(sum);
None
});
runner.register("access_first_vals", |column| {
let mut sum = 0;
@@ -62,6 +63,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
}
black_box(sum);
None
});
runner.run();
}

View File

@@ -41,7 +41,7 @@ fn main() {
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
black_box(out);
Some(out.len() as u64)
},
);
}

View File

@@ -10,7 +10,7 @@
# Perf and Size
* remove alloc in `ord_to_term`
+ multivaued range queries restrat frm the beginning all of the time.
+ multivaued range queries restart from the beginning all of the time.
* re-add ZSTD compression for dictionaries
no systematic monotonic mapping
consider removing multilinear
@@ -30,7 +30,7 @@ investigate if should have better errors? io::Error is overused at the moment.
rename rank/select in unit tests
Review the public API via cargo doc
go through TODOs
remove all doc_id occurences -> row_id
remove all doc_id occurrences -> row_id
use the rank & select naming in unit tests branch.
multi-linear -> blockwise
linear codec -> simply a multiplication for the index column
@@ -43,5 +43,5 @@ isolate u128_based and uniform naming
# Other
fix enhance column-cli
# Santa claus
# Santa Claus
autodetect datetime ipaddr, plug customizable tokenizer.

View File

@@ -173,7 +173,7 @@ mod tests {
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
panic!("Expected a multivalued index")
};
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
@@ -211,7 +211,7 @@ mod tests {
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
panic!("Expected a multivalued index")
};
let mut output = Vec::new();
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();

View File

@@ -28,7 +28,7 @@ pub enum ColumnIndex {
Full,
Optional(OptionalIndex),
/// In addition, at index num_rows, an extra value is added
/// containing the overal number of values.
/// containing the overall number of values.
Multivalued(MultiValueIndex),
}

View File

@@ -174,7 +174,9 @@ impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
}
impl Set<RowId> for OptionalIndex {
type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
type SelectCursor<'b>
= OptionalIndexSelectCursor<'b>
where Self: 'b;
// Check if value at position is not null.
#[inline]
fn contains(&self, row_id: RowId) -> bool {

View File

@@ -123,7 +123,9 @@ impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
}
impl<'a> Set<u16> for DenseBlock<'a> {
type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
type SelectCursor<'b>
= DenseBlockSelectCursor<'a>
where Self: 'b;
#[inline(always)]
fn contains(&self, el: u16) -> bool {

View File

@@ -32,7 +32,9 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
}
impl<'a> Set<u16> for SparseBlock<'a> {
type SelectCursor<'b> = Self where Self: 'b;
type SelectCursor<'b>
= Self
where Self: 'b;
#[inline(always)]
fn contains(&self, el: u16) -> bool {

View File

@@ -110,8 +110,8 @@ fn test_null_index(data: &[bool]) {
.map(|(pos, _val)| pos as u32)
.collect();
let mut select_iter = null_index.select_cursor();
for i in 0..orig_idx_with_value.len() {
assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
for (i, expected) in orig_idx_with_value.iter().enumerate() {
assert_eq!(select_iter.select(i as u32), *expected);
}
let step_size = (orig_idx_with_value.len() / 100).max(1);

View File

@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
fn value_iter() -> impl Iterator<Item = u64> {
0..20_000
}
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
let mut bytes = Vec::new();
let stats = compute_stats(data.iter().cloned());
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
for val in data {
codec_serializer.collect(*val);
}
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
codec_serializer
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
.unwrap();
Codec::load(OwnedBytes::new(bytes)).unwrap()
}
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
let col = get_reader_for_bench::<Codec>(data);
b.iter(|| {

View File

@@ -184,7 +184,7 @@ impl CompactSpaceBuilder {
let mut covered_space = Vec::with_capacity(self.blanks.len());
// begining of the blanks
// beginning of the blanks
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
if *first_blank_start != 0 {
covered_space.push(0..=first_blank_start - 1);

View File

@@ -122,12 +122,11 @@ impl Line {
line
}
/// Returns a line that attemps to approximate a function
/// Returns a line that attempts to approximate a function
/// f: i in 0..[ys.num_vals()) -> ys[i].
///
/// - The approximation is always lower than the actual value.
/// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
/// for any i in [0..ys.len()).
/// - The approximation is always lower than the actual value. Or more rigorously, formally
/// `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
/// - It computes without panicking for any value of it.
///
/// This function is only invariable by translation if all of the

View File

@@ -25,7 +25,7 @@ use crate::{
/// After merge, all columns belonging to the same category are coerced to
/// the same column type.
///
/// In practise, today, only Numerical colummns are coerced into one type today.
/// In practise, today, only Numerical columns are coerced into one type today.
///
/// See also [README.md].
///
@@ -63,11 +63,10 @@ impl From<ColumnType> for ColumnTypeCategory {
/// `require_columns` makes it possible to ensure that some columns will be present in the
/// resulting columnar. When a required column is a numerical column type, one of two things can
/// happen:
/// - If the required column type is compatible with all of the input columnar, the resulsting
/// merged
/// columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged
/// will fail with an InvalidData error.
/// - If the required column type is compatible with all of the input columnar, the resulting merged
/// columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged will
/// fail with an InvalidData error.
///
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
/// `Columnar` table.

View File

@@ -35,8 +35,7 @@ impl<'a> Ord for HeapItem<'a> {
///
/// The item yield is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
/// - a slice with the ordinal of the segments containing the terms.
pub struct TermMerger<'a> {
heap: BinaryHeap<HeapItem<'a>>,
current_streamers: Vec<HeapItem<'a>>,

View File

@@ -87,7 +87,7 @@ impl<V: SymbolValue> ColumnOperation<V> {
minibuf
}
/// Deserialize a colummn operation.
/// Deserialize a column operation.
/// Returns None if the buffer is empty.
///
/// Panics if the payload is invalid:

View File

@@ -8,6 +8,7 @@ use std::net::Ipv6Addr;
use column_operation::ColumnOperation;
pub(crate) use column_writers::CompatibleNumericalTypes;
use common::json_path_writer::JSON_END_OF_PATH;
use common::CountingWriter;
pub(crate) use serializer::ColumnarSerializer;
use stacker::{Addr, ArenaHashMap, MemoryArena};
@@ -283,12 +284,17 @@ impl ColumnarWriter {
.iter()
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
);
// TODO: replace JSON_END_OF_PATH with b'0' in columns
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
if column_name.contains(&JSON_END_OF_PATH) {
// Tantivy uses b'0' as a separator for nested fields in JSON.
// Column names with a b'0' are not simply ignored by the columnar (and the inverted
// index).
continue;
}
match column_type {
ColumnType::Bool => {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);

View File

@@ -93,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
self.columnar_serializer.wrt.write_all(buf)
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_prepare_key_bytes() {
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
assert_eq!(buffer.len(), 12);
assert_eq!(&buffer[..10], b"root0child");
assert_eq!(buffer[10], 0u8);
assert_eq!(buffer[11], ColumnType::Str.to_code());
}
}

View File

@@ -17,6 +17,31 @@ impl NumericalValue {
NumericalValue::F64(_) => NumericalType::F64,
}
}
/// Tries to normalize the numerical value in the following priorities:
/// i64, i64, f64
pub fn normalize(self) -> Self {
match self {
NumericalValue::U64(val) => {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::F64(val as f64)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
NumericalValue::F64(val) => {
let fract = val.fract();
if fract == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 {
NumericalValue::I64(val as i64)
} else if fract == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 {
NumericalValue::U64(val as u64)
} else {
NumericalValue::F64(val)
}
}
}
}
}
impl From<u64> for NumericalValue {

View File

@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
[dependencies]
@@ -20,8 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
binggan = "0.12.0"
proptest = "1.0.0"
rand = "0.8.4"
[features]
unstable = [] # useful for benches.

View File

@@ -1,39 +1,70 @@
#![feature(test)]
use binggan::{black_box, BenchRunner};
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
extern crate test;
fn bench_vint() {
let mut runner = BenchRunner::new();
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
let vals: Vec<u32> = (0..20_000).collect();
runner.bench_function("bench_vint", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
None
});
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
runner.bench_function("bench_vint_rand", move |_| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
black_box(out);
None
});
}
fn bench_bitset() {
let mut runner = BenchRunner::new();
runner.bench_function("bench_tinyset_pop", move |_| {
let mut tinyset = TinySet::singleton(black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
black_box(tinyset);
None
});
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
runner.bench_function("bench_tinyset_sum", move |_| {
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
None
});
let v = [10u32, 14u32, 21u32];
runner.bench_function("bench_tinyarr_sum", move |_| {
black_box(v.iter().cloned().sum::<u32>());
None
});
runner.bench_function("bench_bitset_initialize", move |_| {
black_box(BitSet::with_max_value(1_000_000));
None
});
}
fn main() {
bench_vint();
bench_bitset();
}

View File

@@ -696,43 +696,3 @@ mod tests {
}
}
}
#[cfg(all(test, feature = "unstable"))]
mod bench {
use test;
use super::{BitSet, TinySet};
#[bench]
fn bench_tinyset_pop(b: &mut test::Bencher) {
b.iter(|| {
let mut tinyset = TinySet::singleton(test::black_box(31u32));
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
tinyset.pop_lowest();
});
}
#[bench]
fn bench_tinyset_sum(b: &mut test::Bencher) {
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
b.iter(|| {
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
});
}
#[bench]
fn bench_tinyarr_sum(b: &mut test::Bencher) {
let v = [10u32, 14u32, 21u32];
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
}
#[bench]
fn bench_bitset_initialize(b: &mut test::Bencher) {
b.iter(|| BitSet::with_max_value(1_000_000));
}
}

common/src/bounds.rs Normal file (130 lines)
View File

@@ -0,0 +1,130 @@
use std::io;
use std::ops::Bound;
#[derive(Clone, Debug)]
pub struct BoundsRange<T> {
pub lower_bound: Bound<T>,
pub upper_bound: Bound<T>,
}
impl<T> BoundsRange<T> {
pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
BoundsRange {
lower_bound,
upper_bound,
}
}
pub fn is_unbounded(&self) -> bool {
matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
}
pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
BoundsRange {
lower_bound: map_bound(&self.lower_bound, &transform),
upper_bound: map_bound(&self.upper_bound, &transform),
}
}
pub fn map_bound_res<TTo, Err>(
&self,
transform: impl Fn(&T) -> Result<TTo, Err>,
) -> Result<BoundsRange<TTo>, Err> {
Ok(BoundsRange {
lower_bound: map_bound_res(&self.lower_bound, &transform)?,
upper_bound: map_bound_res(&self.upper_bound, &transform)?,
})
}
pub fn transform_inner<TTo>(
&self,
transform_lower: impl Fn(&T) -> TransformBound<TTo>,
transform_upper: impl Fn(&T) -> TransformBound<TTo>,
) -> BoundsRange<TTo> {
BoundsRange {
lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
}
}
/// Returns the first set inner value
pub fn get_inner(&self) -> Option<&T> {
inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
}
}
pub enum TransformBound<T> {
/// Overwrite the bounds
NewBound(Bound<T>),
/// Use Existing bounds with new value
Existing(T),
}
/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner_res<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
) -> io::Result<Bound<TTo>> {
use self::Bound::*;
Ok(match bound {
Excluded(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val),
},
Included(ref from_val) => match transform(from_val)? {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val),
},
Unbounded => Unbounded,
})
}
/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> TransformBound<TTo>,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Excluded(new_val),
},
Included(ref from_val) => match transform(from_val) {
TransformBound::NewBound(new_val) => new_val,
TransformBound::Existing(new_val) => Included(new_val),
},
Unbounded => Unbounded,
}
}
/// Returns the inner value of a `Bound`
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
match val {
Bound::Included(term) | Bound::Excluded(term) => Some(term),
Bound::Unbounded => None,
}
}
pub fn map_bound<TFrom, TTo>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> TTo,
) -> Bound<TTo> {
use self::Bound::*;
match bound {
Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
Included(ref from_val) => Bound::Included(transform(from_val)),
Unbounded => Unbounded,
}
}
pub fn map_bound_res<TFrom, TTo, Err>(
bound: &Bound<TFrom>,
transform: impl Fn(&TFrom) -> Result<TTo, Err>,
) -> Result<Bound<TTo>, Err> {
use self::Bound::*;
Ok(match bound {
Excluded(ref from_val) => Excluded(transform(from_val)?),
Included(ref from_val) => Included(transform(from_val)?),
Unbounded => Unbounded,
})
}
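A short usage sketch for the new module above; it assumes `BoundsRange` and `map_bound` are in scope (e.g. via `use common::bounds::{map_bound, BoundsRange};` when compiled alongside the file shown).

```rust
use std::ops::Bound;

fn demo() {
    // A numeric range, mapped to a string range without touching the
    // bound variants.
    let range = BoundsRange::new(Bound::Included(1960u64), Bound::Excluded(1970u64));
    let as_strings = range.map_bound(|v| v.to_string());
    assert!(!as_strings.is_unbounded());

    // The free function works on a single bound.
    assert_eq!(map_bound(&Bound::Included(5u64), |v| v * 2), Bound::Included(10));
}
```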

View File

@@ -5,6 +5,7 @@ use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;
mod bitset;
pub mod bounds;
mod byte_count;
mod datetime;
pub mod file_slice;

View File

@@ -1,3 +1,5 @@
use std::ops::Bound;
// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, IndexWriter, Result, Term};
fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);

View File

@@ -28,7 +28,7 @@ fn main() -> tantivy::Result<()> {
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
index_writer.add_document(doc!(title => "The modern Prometheus"))?;
index_writer.commit()?;
let reader = index.reader()?;

View File

@@ -109,6 +109,9 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
move |input: I| match f.parse(input) {
Ok((input, (output, _err))) => Ok((input, output)),
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
// old versions don't understand this is uninhabited and need the empty match to help,
// newer versions warn because this arm is unreachable (which it is indeed).
#[allow(unreachable_patterns)]
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
}
}

View File

@@ -6,12 +6,12 @@ use std::fmt::Write;
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
pub enum Occur {
/// For a given document to be considered for scoring,
/// at least one of the terms with the Should or the Must
/// at least one of the queries with the Should or the Must
/// Occur constraint must be within the document.
Should,
/// Document without the term are excluded from the search.
/// Document without the queries are excluded from the search.
Must,
/// Document that contain the term are excluded from the
/// Document that contain the query are excluded from the
/// search.
MustNot,
}

View File

@@ -833,7 +833,7 @@ fn aggregate_infallible_expressions(
if early_operand {
err.push(LenientErrorInternal {
pos: 0,
message: "Found unexpeted boolean operator before term".to_string(),
message: "Found unexpected boolean operator before term".to_string(),
});
}
@@ -856,7 +856,7 @@ fn aggregate_infallible_expressions(
_ => Some(Occur::Should),
};
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
clauses.push(vec![(
Some(Occur::Should),
ast.clone().unary(Occur::MustNot),
@@ -872,7 +872,7 @@ fn aggregate_infallible_expressions(
None => None,
};
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
clauses.push(vec![(
Some(Occur::Should),
ast.clone().unary(Occur::MustNot),
@@ -897,7 +897,7 @@ fn aggregate_infallible_expressions(
}
Some(BinaryOperand::Or) => {
if last_occur == Some(Occur::MustNot) {
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
} else {
clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
@@ -1057,7 +1057,7 @@ mod test {
valid_parse("1", 1.0, "");
valid_parse("0.234234 aaa", 0.234234f64, " aaa");
error_parse(".3332");
// TODO trinity-1686a: I disagree that it should fail, I think it should succeeed,
// TODO trinity-1686a: I disagree that it should fail, I think it should succeed,
// consuming only "1", and leave "." for the next thing (which will likely fail then)
// error_parse("1.");
error_parse("-1.");
@@ -1467,7 +1467,7 @@ mod test {
}
#[test]
fn test_parse_query_to_triming_spaces() {
fn test_parse_query_to_trimming_spaces() {
test_parse_query_to_ast_helper(" abc", "abc");
test_parse_query_to_ast_helper("abc ", "abc");
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");

View File

@@ -267,7 +267,7 @@ impl fmt::Debug for UserInputAst {
match *self {
UserInputAst::Clause(ref subqueries) => {
if subqueries.is_empty() {
// TODO this will break ast reserialization, is writing "( )" enought?
// TODO this will break ast reserialization, is writing "( )" enough?
write!(formatter, "<emptyclause>")?;
} else {
write!(formatter, "(")?;

View File

@@ -21,7 +21,10 @@ impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
/// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
/// (500MB). The limit is shared by all SegmentCollectors
pub struct AggregationLimits {
///
/// The memory limit is also a guard, which tracks how much memory it allocated and releases it
/// on the shared counter when dropped. Cloning will create a new guard.
pub struct AggregationLimitsGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
@@ -29,28 +32,41 @@ pub struct AggregationLimits {
/// The maximum number of buckets _returned_
/// This is not counting intermediate buckets.
bucket_limit: u32,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}
impl Clone for AggregationLimits {
impl Clone for AggregationLimitsGuard {
fn clone(&self) -> Self {
Self {
memory_consumption: Arc::clone(&self.memory_consumption),
memory_limit: self.memory_limit,
bucket_limit: self.bucket_limit,
allocated_with_the_guard: 0,
}
}
}
impl Default for AggregationLimits {
impl Drop for AggregationLimitsGuard {
/// Removes the memory consumption tracked by this _instance_ of AggregationLimitsGuard.
/// This is used to clear the segment-specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
impl Default for AggregationLimitsGuard {
fn default() -> Self {
Self {
memory_consumption: Default::default(),
memory_limit: DEFAULT_MEMORY_LIMIT.into(),
bucket_limit: DEFAULT_BUCKET_LIMIT,
allocated_with_the_guard: 0,
}
}
}
impl AggregationLimits {
impl AggregationLimitsGuard {
/// *memory_limit*
/// memory_limit is defined in bytes.
/// Aggregation fails when the estimated memory consumption of the aggregation is higher than
@@ -67,24 +83,15 @@ impl AggregationLimits {
memory_consumption: Default::default(),
memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
}
}
/// Create a new ResourceLimitGuard, that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard {
// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes
memory_limit: self.memory_limit,
allocated_with_the_guard: 0,
}
}
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
pub(crate) fn add_memory_consumed(&mut self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
self.allocated_with_the_guard += add_num_bytes;
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
@@ -109,34 +116,6 @@ fn validate_memory_consumption(
Ok(())
}
pub struct ResourceLimitGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
memory_limit: ByteCount,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}
impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
}
impl Drop for ResourceLimitGuard {
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
/// This is used to clear the segment specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}
#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;
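
The change boils down to folding the old `ResourceLimitGuard` into the limits type itself. A minimal, self-contained sketch of the pattern (illustrative names, not the crate's API): a clonable handle adds to a shared counter and gives back exactly what it added on `Drop`.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct MemoryGuard {
    shared: Arc<AtomicU64>, // counter shared by all clones
    allocated: u64,         // what *this* handle added
}

impl MemoryGuard {
    fn add(&mut self, bytes: u64) {
        self.shared.fetch_add(bytes, Ordering::Relaxed);
        self.allocated += bytes;
    }
}

impl Clone for MemoryGuard {
    fn clone(&self) -> Self {
        // A clone shares the counter but starts with no allocation of its own.
        Self {
            shared: Arc::clone(&self.shared),
            allocated: 0,
        }
    }
}

impl Drop for MemoryGuard {
    fn drop(&mut self) {
        // Release only what this instance tracked.
        self.shared.fetch_sub(self.allocated, Ordering::Relaxed);
    }
}
```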

View File

@@ -34,8 +34,9 @@ use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
TopHitsAggregationReq,
};
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -159,7 +160,10 @@ pub enum AggregationVariants {
Percentiles(PercentilesAggregationReq),
/// Finds the top k values matching some order
#[serde(rename = "top_hits")]
TopHits(TopHitsAggregation),
TopHits(TopHitsAggregationReq),
/// Computes an estimate of the number of unique values
#[serde(rename = "cardinality")]
Cardinality(CardinalityAggregationReq),
}
impl AggregationVariants {
@@ -179,6 +183,7 @@ impl AggregationVariants {
AggregationVariants::Sum(sum) => vec![sum.field_name()],
AggregationVariants::Percentiles(per) => vec![per.field_name()],
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
AggregationVariants::Cardinality(per) => vec![per.field_name()],
}
}
@@ -203,7 +208,7 @@ impl AggregationVariants {
_ => None,
}
}
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
match &self {
AggregationVariants::TopHits(top_hits) => Some(top_hits),
_ => None,

View File

@@ -5,16 +5,15 @@ use std::io;
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
use super::agg_limits::ResourceLimitGuard;
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
StatsAggregation, SumAggregation,
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
};
use super::segment_agg_result::AggregationLimits;
use super::segment_agg_result::AggregationLimitsGuard;
use super::VecWithNames;
use crate::aggregation::{f64_to_fastfield_u64, Key};
use crate::index::SegmentReader;
@@ -46,7 +45,7 @@ pub struct AggregationWithAccessor {
pub(crate) str_dict_column: Option<StrColumn>,
pub(crate) field_type: ColumnType,
pub(crate) sub_aggregation: AggregationsWithAccessor,
pub(crate) limits: ResourceLimitGuard,
pub(crate) limits: AggregationLimitsGuard,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
/// Used for missing term aggregation, which checks all columns for existence.
/// And also for `top_hits` aggregation, which may sort on multiple fields.
@@ -69,7 +68,7 @@ impl AggregationWithAccessor {
sub_aggregation: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
) -> crate::Result<Vec<AggregationWithAccessor>> {
let mut agg = agg.clone();
@@ -91,7 +90,7 @@ impl AggregationWithAccessor {
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
limits: limits.clone(),
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
@@ -106,6 +105,7 @@ impl AggregationWithAccessor {
value_accessors: HashMap<String, Vec<DynamicColumn>>|
-> crate::Result<()> {
let (accessor, field_type) = accessors.first().expect("at least one accessor");
let limits = limits.clone();
let res = AggregationWithAccessor {
segment_ordinal,
// TODO: We should do away with the `accessor` field altogether
@@ -120,7 +120,7 @@ impl AggregationWithAccessor {
&limits,
)?,
agg: agg.clone(),
limits: limits.new_guard(),
limits,
missing_value_for_accessor: None,
str_dict_column: None,
column_block_accessor: Default::default(),
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
field: ref field_name,
ref missing,
..
})
| Cardinality(CardinalityAggregationReq {
field: ref field_name,
ref missing,
..
}) => {
let str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
@@ -181,6 +186,8 @@ impl AggregationWithAccessor {
.map(|missing| match missing {
Key::Str(_) => ColumnType::Str,
Key::F64(_) => ColumnType::F64,
Key::I64(_) => ColumnType::I64,
Key::U64(_) => ColumnType::U64,
})
.unwrap_or(ColumnType::U64);
let column_and_types = get_all_ff_reader_or_empty(
@@ -227,14 +234,18 @@ impl AggregationWithAccessor {
missing.clone()
};
let missing_value_for_accessor = if let Some(missing) =
missing_value_term_agg.as_ref()
{
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
} else {
None
};
let missing_value_for_accessor =
if let Some(missing) = missing_value_term_agg.as_ref() {
get_missing_val_as_u64_lenient(
column_type,
missing,
agg.agg.get_fast_field_names()[0],
)?
} else {
None
};
let limits = limits.clone();
let agg = AggregationWithAccessor {
segment_ordinal,
missing_value_for_accessor,
@@ -250,7 +261,7 @@ impl AggregationWithAccessor {
)?,
agg: agg.clone(),
str_dict_column: str_dict_column.clone(),
limits: limits.new_guard(),
limits,
column_block_accessor: Default::default(),
};
res.push(agg);
@@ -325,7 +336,14 @@ impl AggregationWithAccessor {
}
}
fn get_missing_val(
/// Get the missing value in its internal u64 representation.
///
/// For terms we use u64::MAX as a sentinel value.
/// For numerical data we convert the value into the representation
/// we would get from the fast field when we open it as u64_lenient_for_type.
///
/// That way we can use it the same way as if it came from the fast field.
fn get_missing_val_as_u64_lenient(
column_type: ColumnType,
missing: &Key,
field_name: &str,
@@ -334,9 +352,18 @@ fn get_missing_val(
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
// Allow fallback to number on text fields
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
Key::F64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val, &column_type)
}
// NOTE: We may lose precision of the passed missing value when casting i64 and u64 to f64.
Key::I64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val as f64, &column_type)
}
Key::U64(val) if column_type.numerical_type().is_some() => {
f64_to_fastfield_u64(*val as f64, &column_type)
}
_ => {
return Err(crate::TantivyError::InvalidArgument(format!(
"Missing value {missing:?} for field {field_name} is not supported for column \
@@ -360,7 +387,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
aggs: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits,
limits: &AggregationLimitsGuard,
) -> crate::Result<AggregationsWithAccessor> {
let mut aggss = Vec::new();
for (key, agg) in aggs.iter() {

View File

@@ -98,6 +98,8 @@ pub enum MetricResult {
Percentiles(PercentilesMetricResult),
/// Top hits metric result
TopHits(TopHitsMetricResult),
/// Cardinality metric result
Cardinality(SingleMetricResult),
}
impl MetricResult {
@@ -116,6 +118,7 @@ impl MetricResult {
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
)),
MetricResult::Cardinality(card) => Ok(card.value),
}
}
}

View File

@@ -5,7 +5,7 @@ use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::segment_agg_result::AggregationLimits;
use crate::aggregation::segment_agg_result::AggregationLimitsGuard;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector;
use crate::query::{AllQuery, TermQuery};
@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
}
}
}
},
"cardinality_string_id":{
"cardinality": {
"field": "string_id"
}
},
"cardinality_score":{
"cardinality": {
"field": "score"
}
}
});
@@ -120,7 +130,7 @@ fn test_aggregation_flushing(
let agg_res: AggregationResults = if use_distributed_collector {
let collector = DistributedAggregationCollector::from_aggs(
agg_req.clone(),
AggregationLimits::default(),
AggregationLimitsGuard::default(),
);
let searcher = reader.searcher();
@@ -136,7 +146,7 @@ fn test_aggregation_flushing(
.expect("Post deserialization failed");
intermediate_agg_result
.into_final_result(agg_req, &Default::default())
.into_final_result(agg_req, Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req);
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
)
);
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
assert_eq!(res["cardinality_score"]["value"], 80.0);
Ok(())
}
@@ -447,7 +460,7 @@ fn test_aggregation_level2(
let searcher = reader.searcher();
let res = searcher.search(&term_query, &collector).unwrap();
res.into_final_result(agg_req.clone(), &Default::default())
res.into_final_result(agg_req.clone(), Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req.clone());
@@ -857,7 +870,7 @@ fn test_aggregation_on_json_object_mixed_types() {
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
.unwrap();
index_writer.commit().unwrap();
// => Segment with all boolen
// => Segment with all boolean
index_writer
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
.unwrap();
@@ -926,11 +939,11 @@ fn test_aggregation_on_json_object_mixed_types() {
},
"termagg": {
"buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } },
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
{ "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } },
],
"sum_other_doc_count": 0
}
@@ -938,3 +951,60 @@ fn test_aggregation_on_json_object_mixed_types() {
)
);
}
#[test]
fn test_aggregation_on_json_object_mixed_numerical_segments() {
let mut schema_builder = Schema::builder();
let json = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
// => Segment with all values f64 numeric
index_writer
.add_document(doc!(json => json!({"mixed_price": 10.5})))
.unwrap();
// Gets converted to f64!
index_writer
.add_document(doc!(json => json!({"mixed_price": 10})))
.unwrap();
index_writer.commit().unwrap();
// => Segment with all values i64 numeric
index_writer
.add_document(doc!(json => json!({"mixed_price": 10})))
.unwrap();
index_writer.commit().unwrap();
index_writer.commit().unwrap();
// All bucket types
let agg_req_str = r#"
{
"termagg": {
"terms": {
"field": "json.mixed_price"
}
}
} "#;
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let aggregation_collector = get_collector(agg);
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
use pretty_assertions::assert_eq;
assert_eq!(
&aggregation_res_json,
&serde_json::json!({
"termagg": {
"buckets": [
{ "doc_count": 2, "key": 10},
{ "doc_count": 1, "key": 10.5},
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
)
);
}

View File

@@ -438,7 +438,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
buckets: Vec<IntermediateHistogramBucketEntry>,
histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<Vec<BucketEntry>> {
// Generate the full list of buckets without gaps.
//
@@ -496,7 +496,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
is_date_agg: bool,
histogram_req: &HistogramAggregation,
sub_aggregation: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<Vec<BucketEntry>> {
// Normalization is column type dependent.
// The request used in the call to final is not yet normalized.
@@ -750,7 +750,7 @@ mod tests {
agg_req,
&index,
None,
AggregationLimits::new(Some(5_000), None),
AggregationLimitsGuard::new(Some(5_000), None),
)
.unwrap_err();
assert!(res.to_string().starts_with(

View File

@@ -112,18 +112,64 @@ impl Serialize for CustomOrder {
impl<'de> Deserialize<'de> for CustomOrder {
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
where D: Deserializer<'de> {
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
if let Some((key, value)) = map.into_iter().next() {
let value = serde_json::Value::deserialize(deserializer)?;
let return_err = |message, val: serde_json::Value| {
de::Error::custom(format!(
"{}, but got {}",
message,
serde_json::to_string(&val).unwrap()
))
};
match value {
serde_json::Value::Object(map) => {
if map.len() != 1 {
return Err(return_err(
"expected exactly one key-value pair in the order map",
map.into(),
));
}
let (key, value) = map.into_iter().next().unwrap();
let order = serde_json::from_value(value).map_err(de::Error::custom)?;
Ok(CustomOrder {
target: key.as_str().into(),
order: value,
order,
})
} else {
Err(de::Error::custom(
"unexpected empty map in order".to_string(),
))
}
})
serde_json::Value::Array(arr) => {
if arr.is_empty() {
return Err(return_err("unexpected empty array in order", arr.into()));
}
if arr.len() != 1 {
return Err(return_err(
"only one sort order supported currently",
arr.into(),
));
}
let entry = arr.into_iter().next().unwrap();
let map = entry
.as_object()
.ok_or_else(|| return_err("expected object as sort order", entry.clone()))?;
let (key, value) = map.into_iter().next().ok_or_else(|| {
return_err(
"expected exactly one key-value pair in the order map",
entry.clone(),
)
})?;
let order = serde_json::from_value(value.clone()).map_err(de::Error::custom)?;
Ok(CustomOrder {
target: key.as_str().into(),
order,
})
}
_ => Err(return_err(
"unexpected type, expected an object or array",
value,
)),
}
}
}
@@ -138,11 +184,23 @@ fn custom_order_serde_test() {
assert_eq!(order_str, "{\"_key\":\"desc\"}");
let order_deser = serde_json::from_str(&order_str).unwrap();
assert_eq!(order, order_deser);
let order_deser: CustomOrder = serde_json::from_str("[{\"_key\":\"desc\"}]").unwrap();
assert_eq!(order, order_deser);
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
assert!(order_deser.is_err());
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
assert!(order_deser.is_err());
assert!(order_deser
.unwrap_err()
.to_string()
.contains("unexpected empty array in order"));
let order_deser: serde_json::Result<CustomOrder> =
serde_json::from_str(r#"[{"_key":"desc"},{"_key":"desc"}]"#);
assert_eq!(
order_deser.unwrap_err().to_string(),
r#"only one sort order supported currently, but got [{"_key":"desc"},{"_key":"desc"}]"#
);
}
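
To summarize the rewritten deserializer: both of the following forms now parse to the same `CustomOrder`, while empty objects, empty arrays, and multi-element arrays are rejected with the descriptive errors tested above (each line below is a separate accepted input):

```JSON
{ "_key": "desc" }
[ { "_key": "desc" } ]
```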

View File

@@ -4,7 +4,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_limits::ResourceLimitGuard;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
@@ -270,7 +269,7 @@ impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &mut AggregationsWithAccessor,
limits: &ResourceLimitGuard,
limits: &mut AggregationLimitsGuard,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
@@ -471,7 +470,7 @@ mod tests {
SegmentRangeCollector::from_req_and_validate(
&req,
&mut Default::default(),
&AggregationLimits::default().new_guard(),
&mut AggregationLimitsGuard::default(),
field_type,
0,
)

View File

@@ -1,9 +1,10 @@
use std::fmt::Debug;
use std::io;
use std::net::Ipv6Addr;
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalValue,
};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -20,11 +21,11 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
use crate::aggregation::{format_date, Key};
use crate::error::DataCorruption;
use crate::TantivyError;
/// Creates a bucket for every unique term and counts the number of occurences.
/// Creates a bucket for every unique term and counts the number of occurrences.
/// Note that doc_count in the response buckets equals term count here.
///
/// If the text is untokenized and single value, that means one term per document and therefore it
@@ -157,7 +158,7 @@ pub struct TermsAggregation {
/// when loading the text.
/// Special Case 1:
/// If we have multiple columns on one field, we need to have a union on the indices on both
/// columns, to find docids without a value. That requires a special missing aggreggation.
/// columns, to find docids without a value. That requires a special missing aggregation.
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
/// the special missing aggregation, since there is no mechanism in the numerical column to
/// add text.
@@ -363,7 +364,7 @@ impl SegmentTermCollector {
let term_buckets = TermBuckets::default();
if let Some(custom_order) = req.order.as_ref() {
// Validate sub aggregtion exists
// Validate sub aggregation exists
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
@@ -466,49 +467,72 @@ impl SegmentTermCollector {
};
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let term_dict = agg_with_accessor
.str_dict_column
.as_ref()
.cloned()
.unwrap_or_else(|| {
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
});
let mut buffer = String::new();
for (term_id, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
// Special case for missing key
if term_id == u64::MAX {
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.push_str(missing);
dict.insert(
IntermediateKey::Str(buffer.to_string()),
intermediate_entry,
);
}
Key::F64(val) => {
buffer.push_str(&val.to_string());
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut buffer = Vec::new();
// special case for missing key
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
let entry = entries[index];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.extend_from_slice(missing.as_bytes());
dict.insert(
IntermediateKey::Str(
String::from_utf8(buffer.to_vec())
.expect("could not convert to String"),
),
intermediate_entry,
);
}
} else {
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
Key::F64(val) => {
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
Key::U64(val) => {
dict.insert(IntermediateKey::U64(*val), intermediate_entry);
}
Key::I64(val) => {
dict.insert(IntermediateKey::I64(*val), intermediate_entry);
}
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
}
entries.swap_remove(index);
}
// Sort by term ord
entries.sort_unstable_by_key(|bucket| bucket.0);
let mut idx = 0;
term_dict.sorted_ords_to_term_cb(
entries.iter().map(|(term_id, _)| *term_id),
|term| {
let entry = entries[idx];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
dict.insert(
IntermediateKey::Str(
String::from_utf8(term.to_vec()).expect("could not convert to String"),
),
intermediate_entry,
);
idx += 1;
Ok(())
},
)?;
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
let mut stream = term_dict.dictionary().stream()?;
let mut stream = term_dict.stream()?;
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
agg_with_accessor.agg.sub_aggregation(),
);
@@ -567,8 +591,26 @@ impl SegmentTermCollector {
} else {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = f64_from_fastfield_u64(val, &self.column_type);
dict.insert(IntermediateKey::F64(val), intermediate_entry);
if self.column_type == ColumnType::U64 {
dict.insert(IntermediateKey::U64(val), intermediate_entry);
} else if self.column_type == ColumnType::I64 {
dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
} else {
let val = f64::from_u64(val);
let val: NumericalValue = val.into();
match val.normalize() {
NumericalValue::U64(val) => {
dict.insert(IntermediateKey::U64(val), intermediate_entry);
}
NumericalValue::I64(val) => {
dict.insert(IntermediateKey::I64(val), intermediate_entry);
}
NumericalValue::F64(val) => {
dict.insert(IntermediateKey::F64(val), intermediate_entry);
}
}
};
}
};
@@ -627,7 +669,7 @@ mod tests {
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
get_test_index_from_terms, get_test_index_from_values_and_terms,
};
use crate::aggregation::AggregationLimits;
use crate::aggregation::AggregationLimitsGuard;
use crate::indexer::NoMergePolicy;
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
use crate::{Index, IndexWriter};
@@ -1382,7 +1424,7 @@ mod tests {
agg_req,
&index,
None,
AggregationLimits::new(Some(50_000), None),
AggregationLimitsGuard::new(Some(50_000), None),
)
.unwrap_err();
assert!(res
@@ -1643,7 +1685,7 @@ mod tests {
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
// text field with number as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
@@ -1703,6 +1745,54 @@ mod tests {
Ok(())
}
#[test]
fn terms_aggregation_u64_value() -> crate::Result<()> {
// Make sure that large u64 values are not truncated
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 9_223_372_036_854_775_807u64,
))?;
index_writer.add_document(doc!(
id_field => 1_769_070_189_829_214_202u64,
))?;
index_writer.add_document(doc!(
id_field => 1_769_070_189_829_214_202u64,
))?;
index_writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"my_ids": {
"terms": {
"field": "id"
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
// id field
assert_eq!(
res["my_ids"]["buckets"][0]["key"],
1_769_070_189_829_214_202u64
);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["my_ids"]["buckets"][1]["key"],
9_223_372_036_854_775_807u64
);
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
Ok(())
}
#[test]
fn terms_aggregation_missing1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -1769,7 +1859,7 @@ mod tests {
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
// text field with number as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);

View File

@@ -70,7 +70,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
)?;
missing_entry.sub_aggregation = res;
}
entries.insert(missing.into(), missing_entry);
let bucket = IntermediateBucketResult::Terms {

View File

@@ -4,7 +4,7 @@ use super::agg_result::AggregationResults;
use super::buf_collector::BufAggregationCollector;
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::{
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
build_segment_agg_collector, AggregationLimitsGuard, SegmentAggregationCollector,
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
@@ -22,7 +22,7 @@ pub const DEFAULT_MEMORY_LIMIT: u64 = 500_000_000;
/// The collector collects all aggregations by the underlying aggregation request.
pub struct AggregationCollector {
agg: Aggregations,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
}
impl AggregationCollector {
@@ -30,7 +30,7 @@ impl AggregationCollector {
///
/// Aggregation fails when the limits in `AggregationLimitsGuard` are exceeded (memory limit and
/// bucket limit).
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
Self { agg, limits }
}
}
@@ -45,7 +45,7 @@ impl AggregationCollector {
/// into the final `AggregationResults` via the `into_final_result()` method.
pub struct DistributedAggregationCollector {
agg: Aggregations,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
}
impl DistributedAggregationCollector {
@@ -53,7 +53,7 @@ impl DistributedAggregationCollector {
///
/// Aggregation fails when the limits in `AggregationLimitsGuard` are exceeded (memory limit and
/// bucket limit).
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
Self { agg, limits }
}
}
@@ -115,7 +115,7 @@ impl Collector for AggregationCollector {
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
res.into_final_result(self.agg.clone(), &self.limits)
res.into_final_result(self.agg.clone(), self.limits.clone())
}
}
@@ -147,7 +147,7 @@ impl AggregationSegmentCollector {
agg: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits,
limits: &AggregationLimitsGuard,
) -> crate::Result<Self> {
let mut aggs_with_accessor =
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
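
Tying the rename together from the caller's side, a sketch assembled from the tests elsewhere in this diff (the `avg_score`/`score` names are illustrative):

```rust
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::{AggregationCollector, AggregationLimitsGuard};
use tantivy::query::AllQuery;
use tantivy::Searcher;

fn run_aggs(searcher: &Searcher) -> tantivy::Result<serde_json::Value> {
    let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
        "avg_score": { "avg": { "field": "score" } }
    }))
    .expect("valid aggregation request");
    // The guard doubles as the shared memory tracker (default limit: 500MB).
    let collector = AggregationCollector::from_aggs(agg_req, AggregationLimitsGuard::default());
    let agg_res = searcher.search(&AllQuery, &collector)?;
    Ok(serde_json::to_value(agg_res).expect("aggregation results serialize to JSON"))
}
```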

View File

@@ -22,10 +22,11 @@ use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::segment_agg_result::AggregationLimitsGuard;
use super::{format_date, AggregationError, Key, SerializedKey};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::aggregation::metric::CardinalityCollector;
use crate::TantivyError;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
@@ -50,12 +51,18 @@ pub enum IntermediateKey {
Str(String),
/// `f64` key
F64(f64),
/// `i64` key
I64(i64),
/// `u64` key
U64(u64),
}
impl From<Key> for IntermediateKey {
fn from(value: Key) -> Self {
match value {
Key::Str(s) => Self::Str(s),
Key::F64(f) => Self::F64(f),
Key::U64(f) => Self::U64(f),
Key::I64(f) => Self::I64(f),
}
}
}
@@ -72,7 +79,9 @@ impl From<IntermediateKey> for Key {
}
}
IntermediateKey::F64(f) => Self::F64(f),
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
IntermediateKey::Bool(f) => Self::U64(f as u64),
IntermediateKey::U64(f) => Self::U64(f),
IntermediateKey::I64(f) => Self::I64(f),
}
}
}
@@ -85,6 +94,8 @@ impl std::hash::Hash for IntermediateKey {
match self {
IntermediateKey::Str(text) => text.hash(state),
IntermediateKey::F64(val) => val.to_bits().hash(state),
IntermediateKey::U64(val) => val.hash(state),
IntermediateKey::I64(val) => val.hash(state),
IntermediateKey::Bool(val) => val.hash(state),
IntermediateKey::IpAddr(val) => val.hash(state),
}
@@ -111,9 +122,9 @@ impl IntermediateAggregationResults {
pub fn into_final_result(
self,
req: Aggregations,
limits: &AggregationLimits,
mut limits: AggregationLimitsGuard,
) -> crate::Result<AggregationResults> {
let res = self.into_final_result_internal(&req, limits)?;
let res = self.into_final_result_internal(&req, &mut limits)?;
let bucket_count = res.get_bucket_count() as u32;
if bucket_count > limits.get_bucket_limit() {
return Err(TantivyError::AggregationError(
@@ -130,7 +141,7 @@ impl IntermediateAggregationResults {
pub(crate) fn into_final_result_internal(
self,
req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<AggregationResults> {
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
for (key, agg_res) in self.aggs_res.into_iter() {
@@ -227,6 +238,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
Cardinality(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
),
}
}
@@ -243,7 +257,7 @@ impl IntermediateAggregationResult {
pub(crate) fn into_final_result(
self,
req: &Aggregation,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<AggregationResult> {
let res = match self {
IntermediateAggregationResult::Bucket(bucket) => {
@@ -291,6 +305,8 @@ pub enum IntermediateMetricResult {
Sum(IntermediateSum),
/// Intermediate top_hits result
TopHits(TopHitsTopNComputer),
/// Intermediate cardinality result
Cardinality(CardinalityCollector),
}
impl IntermediateMetricResult {
@@ -324,6 +340,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result())
}
IntermediateMetricResult::Cardinality(cardinality) => {
MetricResult::Cardinality(cardinality.finalize().into())
}
}
}
@@ -372,6 +391,12 @@ impl IntermediateMetricResult {
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
left.merge_fruits(right)?;
}
(
IntermediateMetricResult::Cardinality(left),
IntermediateMetricResult::Cardinality(right),
) => {
left.merge_fruits(right)?;
}
_ => {
panic!("incompatible fruit types in tree or missing merge_fruits handler");
}
@@ -407,7 +432,7 @@ impl IntermediateBucketResult {
pub(crate) fn into_final_bucket_result(
self,
req: &Aggregation,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketResult> {
match self {
IntermediateBucketResult::Range(range_res) => {
@@ -571,7 +596,7 @@ impl IntermediateTermBucketResult {
self,
req: &TermsAggregation,
sub_aggregation_req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
@@ -698,7 +723,7 @@ impl IntermediateHistogramBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketEntry> {
Ok(BucketEntry {
key_as_string: None,
@@ -733,7 +758,7 @@ impl IntermediateRangeBucketEntry {
req: &Aggregations,
_range_req: &RangeAggregation,
column_type: Option<ColumnType>,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<RangeBucketEntry> {
let mut range_bucket_entry = RangeBucketEntry {
key: self.key.into(),
@@ -835,7 +860,7 @@ mod tests {
}
}
fn get_intermediat_tree_with_ranges(
fn get_intermediate_tree_with_ranges(
data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults {
let mut map = HashMap::new();
@@ -871,18 +896,18 @@ mod tests {
#[test]
fn test_merge_fruits_tree_1() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_intermediat_tree_with_ranges(&[
let tree_right = get_intermediate_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("blue".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(tree_right).unwrap();
let tree_expected = get_intermediat_tree_with_ranges(&[
let tree_expected = get_intermediate_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 55, "1900".to_string(), 80),
]);
@@ -892,18 +917,18 @@ mod tests {
#[test]
fn test_merge_fruits_tree_2() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_intermediat_tree_with_ranges(&[
let tree_right = get_intermediate_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(tree_right).unwrap();
let tree_expected = get_intermediat_tree_with_ranges(&[
let tree_expected = get_intermediate_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 30, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
@@ -914,7 +939,7 @@ mod tests {
#[test]
fn test_merge_fruits_tree_empty() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);

View File

@@ -0,0 +1,473 @@
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};
use columnar::column_values::CompactSpaceU64Accessor;
use columnar::Dictionary;
use common::f64_to_u64;
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
use rustc_hash::FxHashSet;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::*;
use crate::TantivyError;
#[derive(Clone, Debug, Serialize, Deserialize)]
struct BuildSaltedHasher {
salt: u8,
}
impl BuildHasher for BuildSaltedHasher {
type Hasher = DefaultHasher;
fn build_hasher(&self) -> Self::Hasher {
let mut hasher = DefaultHasher::new();
hasher.write_u8(self.salt);
hasher
}
}
/// # Cardinality
///
/// The cardinality aggregation allows for computing an estimate
/// of the number of different values in a data set based on the
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
/// uniqueness of values in a large dataset where counting each unique value
/// individually would be computationally expensive.
///
/// For example, you might use a cardinality aggregation to estimate the number
/// of unique visitors to a website by aggregating on a field that contains
/// user IDs or session IDs.
///
/// To use the cardinality aggregation, you'll need to provide a field to
/// aggregate on. The following example demonstrates a request for the cardinality
/// of the "user_id" field:
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id"
/// }
/// }
/// ```
///
/// This request will return an estimate of the number of unique values in the
/// "user_id" field.
///
/// ## Missing Values
///
/// The `missing` parameter defines how documents that are missing a value should be treated.
/// By default, documents without a value for the specified field are ignored. However, you can
/// specify a default value for these documents using the `missing` parameter. This can be useful
/// when you want to include documents with missing values in the aggregation.
///
/// For example, the following request treats documents with missing values in the "user_id"
/// field as if they had a value of "unknown":
///
/// ```JSON
/// {
/// "cardinality": {
/// "field": "user_id",
/// "missing": "unknown"
/// }
/// }
/// ```
///
/// # Estimation Accuracy
///
/// The cardinality aggregation provides an approximate count, which is usually
/// accurate within a small error range. This trade-off allows for efficient
/// computation even on very large datasets.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct CardinalityAggregationReq {
/// The field name to compute the cardinality on.
pub field: String,
/// The missing parameter defines how documents that are missing a value should be treated.
/// By default they will be ignored, but it is also possible to treat them as if they had a
/// value. Examples in JSON format:
/// { "field": "my_numbers", "missing": "10.0" }
#[serde(skip_serializing_if = "Option::is_none", default)]
pub missing: Option<Key>,
}
impl CardinalityAggregationReq {
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
Self {
field: field_name,
missing: None,
}
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
&self.field
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentCardinalityCollector {
cardinality: CardinalityCollector,
entries: FxHashSet<u64>,
column_type: ColumnType,
accessor_idx: usize,
missing: Option<Key>,
}
impl SegmentCardinalityCollector {
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
Self {
cardinality: CardinalityCollector::new(column_type as u8),
entries: Default::default(),
column_type,
accessor_idx,
missing: missing.clone(),
}
}
fn fetch_block_with_field(
&mut self,
docs: &[crate::DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
if let Some(missing) = agg_accessor.missing_value_for_accessor {
agg_accessor.column_block_accessor.fetch_block_with_missing(
docs,
&agg_accessor.accessor,
missing,
);
} else {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
}
fn into_intermediate_metric_result(
mut self,
agg_with_accessor: &AggregationWithAccessor,
) -> crate::Result<IntermediateMetricResult> {
if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let dict = agg_with_accessor
.str_dict_column
.as_ref()
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut has_missing = false;
// TODO: replace FxHashSet with something that allows iterating in order
// (e.g. sparse bitvec)
let mut term_ids = Vec::new();
for term_ord in self.entries.into_iter() {
if term_ord == u64::MAX {
has_missing = true;
} else {
// we can reasonably exclude values above u32::MAX
term_ids.push(term_ord as u32);
}
}
term_ids.sort_unstable();
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
self.cardinality.sketch.insert_any(&term);
Ok(())
})?;
if has_missing {
// Replace missing with the actual value provided
let missing_key = self
.missing
.as_ref()
.expect("Found sentinel value u64::MAX for term_ord but `missing` is not set");
match missing_key {
Key::Str(missing) => {
self.cardinality.sketch.insert_any(&missing);
}
Key::F64(val) => {
let val = f64_to_u64(*val);
self.cardinality.sketch.insert_any(&val);
}
Key::U64(val) => {
self.cardinality.sketch.insert_any(&val);
}
Key::I64(val) => {
self.cardinality.sketch.insert_any(&val);
}
}
}
}
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
}
}
impl SegmentAggregationCollector for SegmentCardinalityCollector {
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_result),
)?;
Ok(())
}
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
self.collect_block(&[doc], agg_with_accessor)
}
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
self.fetch_block_with_field(docs, bucket_agg_accessor);
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
if self.column_type == ColumnType::Str {
for term_ord in col_block_accessor.iter_vals() {
self.entries.insert(term_ord);
}
} else if self.column_type == ColumnType::IpAddr {
let compact_space_accessor = bucket_agg_accessor
.accessor
.values
.clone()
.downcast_arc::<CompactSpaceU64Accessor>()
.map_err(|_| {
TantivyError::AggregationError(
crate::aggregation::AggregationError::InternalError(
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
.to_string(),
),
)
})?;
for val in col_block_accessor.iter_vals() {
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
self.cardinality.sketch.insert_any(&val);
}
} else {
for val in col_block_accessor.iter_vals() {
self.cardinality.sketch.insert_any(&val);
}
}
Ok(())
}
}
#[derive(Clone, Debug, Serialize, Deserialize)]
/// The cardinality collector used during segment collection and for merging results.
pub struct CardinalityCollector {
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
}
impl Default for CardinalityCollector {
fn default() -> Self {
Self::new(0)
}
}
impl PartialEq for CardinalityCollector {
fn eq(&self, _other: &Self) -> bool {
false
}
}
impl CardinalityCollector {
/// Compute the final cardinality estimate.
pub fn finalize(self) -> Option<f64> {
Some(self.sketch.clone().count().trunc())
}
fn new(salt: u8) -> Self {
Self {
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
}
}
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
self.sketch.merge(&right.sketch).map_err(|err| {
TantivyError::AggregationError(AggregationError::InternalError(format!(
"Error while merging cardinality {err:?}"
)))
})?;
Ok(())
}
}
#[cfg(test)]
mod tests {
use std::net::IpAddr;
use std::str::FromStr;
use columnar::MonotonicallyMappableToU64;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
use crate::schema::{IntoIpv6Addr, Schema, FAST};
use crate::Index;
#[test]
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
let values = vec![];
let index = get_test_index_from_terms(false, &values)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 0.0);
Ok(())
}
#[test]
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(true)
}
#[test]
fn cardinality_aggregation_test() -> crate::Result<()> {
cardinality_aggregation_test_merge_segment(false)
}
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
let segment_and_terms = vec![
vec!["terma"],
vec!["termb"],
vec!["termc"],
vec!["terma"],
vec!["terma"],
vec!["terma"],
vec!["termb"],
vec!["terma"],
];
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "string_id",
}
},
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 3.0);
Ok(())
}
#[test]
fn cardinality_aggregation_u64() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(id_field => 1u64))?;
writer.add_document(doc!(id_field => 2u64))?;
writer.add_document(doc!(id_field => 3u64))?;
writer.add_document(doc!())?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "id",
"missing": 0u64
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
#[test]
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
// IpV6 loopback
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
// IpV4
writer.add_document(
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
)?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "ip_field"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 2.0);
Ok(())
}
#[test]
fn cardinality_aggregation_json() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_json_field("json", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut writer = index.writer_for_tests()?;
writer.add_document(doc!(field => json!({"value": false})))?;
writer.add_document(doc!(field => json!({"value": true})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(0u64)})))?;
writer.add_document(doc!(field => json!({"value": i64::from_u64(1u64)})))?;
writer.commit()?;
}
let agg_req: Aggregations = serde_json::from_value(json!({
"cardinality": {
"cardinality": {
"field": "json.value"
},
}
}))
.unwrap();
let res = exec_request(agg_req, &index)?;
assert_eq!(res["cardinality"]["value"], 4.0);
Ok(())
}
}
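
For reference, the estimator underneath in isolation: a sketch of the `hyperloglogplus` calls the collector relies on, reusing the salted-hasher setup and precision 16 from above.

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{BuildHasher, Hasher};

use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};

// Same shape as BuildSaltedHasher above: the salt keeps hash values from
// colliding across differently-typed columns.
struct SaltedBuild {
    salt: u8,
}

impl BuildHasher for SaltedBuild {
    type Hasher = DefaultHasher;

    fn build_hasher(&self) -> Self::Hasher {
        let mut hasher = DefaultHasher::new();
        hasher.write_u8(self.salt);
        hasher
    }
}

fn approximate_distinct(values: &[u64]) -> f64 {
    let mut sketch: HyperLogLogPlus<u64, SaltedBuild> =
        HyperLogLogPlus::new(16, SaltedBuild { salt: 0 }).unwrap();
    for value in values {
        sketch.insert_any(value);
    }
    // `count` finalizes the estimate; truncation matches CardinalityCollector::finalize.
    sketch.count().trunc()
}
```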

View File

@@ -17,6 +17,7 @@
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod cardinality;
mod count;
mod extended_stats;
mod max;
@@ -29,6 +30,7 @@ mod top_hits;
use std::collections::HashMap;
pub use average::*;
pub use cardinality::*;
pub use count::*;
pub use extended_stats::*;
pub use max::*;

View File

@@ -163,8 +163,8 @@ impl PartialEq for PercentilesCollector {
}
}
fn format_percentil(percentil: f64) -> String {
let mut out = percentil.to_string();
fn format_percentile(percentile: f64) -> String {
let mut out = percentile.to_string();
// Slightly silly way to format trailing decimals
if !out.contains('.') {
out.push_str(".0");
@@ -197,7 +197,7 @@ impl PercentilesCollector {
let values = if req.keyed {
PercentileValues::HashMap(
iter_quantile_and_values
.map(|(val, quantil)| (format_percentil(val), quantil))
.map(|(val, quantil)| (format_percentile(val), quantil))
.collect(),
)
} else {

View File

@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
/// }
/// ```
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
pub struct TopHitsAggregation {
pub struct TopHitsAggregationReq {
sort: Vec<KeyOrder>,
size: usize,
from: Option<usize>,
@@ -139,7 +139,7 @@ impl<'de> Deserialize<'de> for KeyOrder {
}
}
// Tranform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
// Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
))
}
impl TopHitsAggregation {
impl TopHitsAggregationReq {
/// Validate and resolve field retrieval parameters
pub fn validate_and_resolve_field_names(
&mut self,
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
/// The TopHitsCollector used for collecting over segments and merging results.
#[derive(Clone, Serialize, Deserialize, Debug)]
pub struct TopHitsTopNComputer {
req: TopHitsAggregation,
req: TopHitsAggregationReq,
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
}
@@ -443,7 +443,7 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
impl TopHitsTopNComputer {
/// Create a new TopHitsCollector
pub fn new(req: &TopHitsAggregation) -> Self {
pub fn new(req: &TopHitsAggregationReq) -> Self {
Self {
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
req: req.clone(),
@@ -496,7 +496,7 @@ pub(crate) struct TopHitsSegmentCollector {
impl TopHitsSegmentCollector {
pub fn from_req(
req: &TopHitsAggregation,
req: &TopHitsAggregationReq,
accessor_idx: usize,
segment_ordinal: SegmentOrdinal,
) -> Self {
@@ -509,7 +509,7 @@ impl TopHitsSegmentCollector {
fn into_top_hits_collector(
self,
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
req: &TopHitsAggregation,
req: &TopHitsAggregationReq,
) -> TopHitsTopNComputer {
let mut top_hits_computer = TopHitsTopNComputer::new(req);
let top_results = self.top_n.into_vec();
@@ -532,7 +532,7 @@ impl TopHitsSegmentCollector {
fn collect_with(
&mut self,
doc_id: crate::DocId,
req: &TopHitsAggregation,
req: &TopHitsAggregationReq,
accessors: &[(Column<u64>, ColumnType)],
) -> crate::Result<()> {
let sorts: Vec<DocValueAndOrder> = req

View File

@@ -44,11 +44,14 @@
//! - [Metric](metric)
//! - [Average](metric::AverageAggregation)
//! - [Stats](metric::StatsAggregation)
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
//! - [Min](metric::MinAggregation)
//! - [Max](metric::MaxAggregation)
//! - [Sum](metric::SumAggregation)
//! - [Count](metric::CountAggregation)
//! - [Percentiles](metric::PercentilesAggregationReq)
//! - [Cardinality](metric::CardinalityAggregationReq)
//! - [TopHits](metric::TopHitsAggregationReq)
//!
//! # Example
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
@@ -145,7 +148,7 @@ mod agg_tests;
use core::fmt;
pub use agg_limits::AggregationLimits;
pub use agg_limits::AggregationLimitsGuard;
pub use collector::{
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
DEFAULT_BUCKET_LIMIT,
@@ -333,10 +336,16 @@ pub type SerializedKey = String;
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
/// The key to identify a bucket.
///
/// The order of the variants matters: with serde untagged, we try to deserialize into `i64` first.
#[serde(untagged)]
pub enum Key {
/// String key
Str(String),
/// `i64` key
I64(i64),
/// `u64` key
U64(u64),
/// `f64` key
F64(f64),
}
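// Sketch of why the variant order matters under #[serde(untagged)] (illustrative,
// using serde_json): a JSON integer cannot be a Str, is tried as i64 first, and
// only falls through to U64 when it is out of i64 range.
//     let small: Key = serde_json::from_value(json!(42)).unwrap();
//     assert!(matches!(small, Key::I64(42)));
//     let big: Key = serde_json::from_value(json!(u64::MAX)).unwrap();
//     assert!(matches!(big, Key::U64(u64::MAX)));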
@@ -347,6 +356,8 @@ impl std::hash::Hash for Key {
match self {
Key::Str(text) => text.hash(state),
Key::F64(val) => val.to_bits().hash(state),
Key::U64(val) => val.hash(state),
Key::I64(val) => val.hash(state),
}
}
}
@@ -366,6 +377,8 @@ impl Display for Key {
match self {
Key::Str(val) => f.write_str(val),
Key::F64(val) => f.write_str(&val.to_string()),
Key::U64(val) => f.write_str(&val.to_string()),
Key::I64(val) => f.write_str(&val.to_string()),
}
}
}
@@ -445,7 +458,7 @@ mod tests {
agg_req: Aggregations,
index: &Index,
query: Option<(&str, &str)>,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
) -> crate::Result<Value> {
let collector = AggregationCollector::from_aggs(agg_req, limits);
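A hedged sketch of the renamed guard in use; `AggregationLimitsGuard` is assumed to offer a `Default` construction like the old `AggregationLimits`, and now releases its memory accounting when dropped:
let limits = AggregationLimitsGuard::default();
let collector = AggregationCollector::from_aggs(agg_req.clone(), limits);
let agg_res = searcher.search(&AllQuery, &collector)?;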

View File

@@ -5,7 +5,7 @@
use std::fmt::Debug;
pub(crate) use super::agg_limits::AggregationLimits;
pub(crate) use super::agg_limits::AggregationLimitsGuard;
use super::agg_req::AggregationVariants;
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
@@ -16,7 +16,10 @@ use super::metric::{
SumAggregation,
};
use crate::aggregation::bucket::TermMissingAgg;
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
use crate::aggregation::metric::{
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
TopHitsSegmentCollector,
};
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn add_intermediate_aggregation_result(
@@ -100,7 +103,7 @@ pub(crate) fn build_single_agg_segment_collector(
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
range_req,
&mut req.sub_aggregation,
&req.limits,
&mut req.limits,
req.field_type,
accessor_idx,
)?)),
@@ -169,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
accessor_idx,
req.segment_ordinal,
))),
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
)),
}
}

View File

@@ -15,11 +15,6 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
/// should be reversed, which is useful for achieving, for example, largest-first
/// semantics without having to wrap the feature in a `Reverse`.
///
/// WARNING: equality is not what you would expect here.
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
/// is equal. This should be perfectly fine for this usage, but let's make sure this
/// struct is never public.
#[derive(Clone, Default, Serialize, Deserialize)]
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
/// The feature of the document. In practice, this is

View File

@@ -3,7 +3,6 @@ use std::marker::PhantomData;
use std::sync::Arc;
use columnar::ColumnValues;
use serde::de::DeserializeOwned;
use serde::{Deserialize, Serialize};
use super::Collector;
@@ -790,7 +789,7 @@ impl<Score, D, const R: bool> From<TopNComputerDeser<Score, D, R>> for TopNCompu
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
where
Score: PartialOrd + Clone,
D: Serialize + DeserializeOwned + Ord + Clone,
D: Ord,
{
/// Create a new `TopNComputer`.
/// Internally it will allocate a buffer of size `2 * top_n`.

80
src/compat_tests.rs Normal file
View File

@@ -0,0 +1,80 @@
use std::path::PathBuf;
use schema::*;
use crate::*;
fn create_index(path: &str) {
let mut schema_builder = Schema::builder();
let label = schema_builder.add_text_field("label", TEXT | STORED);
let date = schema_builder.add_date_field("date", INDEXED | STORED);
let schema = schema_builder.build();
std::fs::create_dir_all(path).unwrap();
let index = Index::create_in_dir(path, schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap();
index_writer
.add_document(doc!(label => "dateformat", date => DateTime::from_timestamp_nanos(123456)))
.unwrap();
index_writer.commit().unwrap();
}
#[test]
/// Writes an Index for the current INDEX_FORMAT_VERSION to disk.
fn create_format() {
let version = INDEX_FORMAT_VERSION.to_string();
let file_path = path_for_version(&version);
if PathBuf::from(file_path.clone()).exists() {
return;
}
create_index(&file_path);
}
fn path_for_version(version: &str) -> String {
format!("./tests/compat_tests_data/index_v{}/", version)
}
/// With the `quickwit` feature flag, a different dictionary type is used.
#[test]
#[cfg(not(feature = "quickwit"))]
fn test_format_6() {
let path = path_for_version("6");
let index = Index::open_in_dir(path).expect("Failed to open index");
// dates are truncated to Microseconds in v6
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
}
#[cfg(not(feature = "quickwit"))]
fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
use collector::TopDocs;
let reader = index.reader().expect("Failed to create reader");
let searcher = reader.searcher();
let schema = index.schema();
let label_field = schema.get_field("label").expect("Field 'label' not found");
let query_parser = query::QueryParser::for_index(index, vec![label_field]);
let query = query_parser
.parse_query("dateformat")
.expect("Failed to parse query");
let top_docs = searcher
.search(&query, &TopDocs::with_limit(1))
.expect("Search failed");
assert_eq!(top_docs.len(), 1, "Expected 1 search result");
let doc_address = top_docs[0].1;
let retrieved_doc: TantivyDocument = searcher
.doc(doc_address)
.expect("Failed to retrieve document");
let date_field = schema.get_field("date").expect("Field 'date' not found");
let date_value = retrieved_doc
.get_first(date_field)
.expect("Date field not found in document")
.as_datetime()
.unwrap();
let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
assert_eq!(date_value, expected);
}

View File

@@ -100,7 +100,7 @@ impl Executor {
/// Spawn a task on the pool, returning a future completing on task success.
///
/// If the task panic, returns `Err(())`.
/// If the task panics, returns `Err(())`.
#[cfg(feature = "quickwit")]
pub fn spawn_blocking<T: Send + 'static>(
&self,

View File

@@ -1,10 +1,10 @@
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
use common::{replace_in_place, JsonPathWriter};
use rustc_hash::FxHashMap;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
use crate::schema::Type;
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
positions_per_path: &mut IndexingPositionsPerPath,
) {
for (json_path_segment, json_value_visitor) in json_visitor {
if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
continue;
}
json_path_writer.push(json_path_segment);
index_json_value(
doc,
@@ -186,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
ctx.path_to_unordered_id
.get_or_allocate_unordered_id(json_path_writer.as_str()),
);
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
term_buffer.append_type_and_fast_value(val);
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
}
@@ -236,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
/// Tries to infer a JSON type from a string and append it to the term.
///
/// The term must be json + JSON path.
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
pub fn convert_to_fast_value_and_append_to_json_term(
mut term: Term,
phrase: &str,
truncate_date_for_search: bool,
) -> Option<Term> {
assert_eq!(
term.value()
.as_json_value_bytes()
@@ -247,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
"JSON value bytes should be empty"
);
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
let dt_utc = dt.to_offset(UtcOffset::UTC);
term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
if truncate_date_for_search {
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
}
term.append_type_and_fast_value(dt);
return Some(term);
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {

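A small worked example of the truncation involved (sketch; `DateTimePrecision::Microseconds` matches the precision asserted for format v6 in the compat test above):
let dt = DateTime::from_timestamp_nanos(123_456_789);
// 123_456_789 ns = 123_456.789 µs; truncation drops the sub-microsecond tail.
assert_eq!(
    dt.truncate(DateTimePrecision::Microseconds),
    DateTime::from_timestamp_micros(123_456)
);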
View File

@@ -102,10 +102,8 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
///
/// There are currently two implementations of `Directory`
///
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
/// should be your default choice.
/// - The [`RamDirectory`][crate::directory::RamDirectory], which
/// should be used mostly for tests.
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this should be your default choice.
/// - The [`RamDirectory`][crate::directory::RamDirectory], which should be used mostly for tests.
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
/// Opens a file and returns a boxed `FileHandle`.
///

View File

@@ -25,10 +25,9 @@ impl FacetReader {
/// Creates a new `FacetReader`.
///
/// A facet reader just wraps :
/// - a `MultiValuedFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionary` that helps associating a facet to
/// an ordinal and vice versa.
/// - a `MultiValuedFastFieldReader` that makes it possible to access the list of facet ords for
/// a given document.
/// - a `TermDictionary` that helps associating a facet to an ordinal and vice versa.
pub fn new(facet_column: StrColumn) -> FacetReader {
FacetReader { facet_column }
}

View File

@@ -942,10 +942,10 @@ mod tests {
let numbers = [100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
assert_eq!(vec.len(), expected_count);
};
test_range(50..=50);
test_range(150..=150);
@@ -1020,10 +1020,10 @@ mod tests {
let numbers = [1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
assert_eq!(vec.len(), expected_count);
};
let test_range_variant = |start, stop| {
let start_range = start..=stop;

View File

@@ -70,13 +70,13 @@ impl FastFieldReaders {
///
/// This function transforms `attributes.color` into a column key to be used in the `columnar`.
///
/// The logic works as follows, first we identify which field is targetted by calling
/// The logic works as follows, first we identify which field is targeted by calling
/// `schema.find_field(..)`. This method will attempt to split the user-supplied fast field
/// name by non-escaped dots, and find the longest matching schema field name.
/// In our case, it would return the (attribute_field, "color").
///
/// If no field is found, but a dynamic field is supplied, then we
/// will simply assuem the user is targetting the dynamic field. (This feature is used in
/// will simply assume the user is targeting the dynamic field. (This feature is used in
/// Quickwit.)
///
/// We then encode the `(field, path)` into the right `columnar_key`.

View File

@@ -11,8 +11,8 @@ use crate::TantivyError;
/// progress. Dropping the `FutureResult` does not cancel the task being executed
/// either.
///
/// - In a sync context, you can call `FutureResult::wait()`. The function
/// does not rely on `block_on`.
/// - In a sync context, you can call `FutureResult::wait()`. The function does not rely on
/// `block_on`.
/// - In an async context, you can simply use `FutureResult` as a future.
pub struct FutureResult<T> {
inner: Inner<T>,

View File

@@ -49,10 +49,8 @@ fn load_metas(
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(
@@ -529,12 +527,12 @@ impl Index {
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
/// - `num_threads` defines the number of indexing workers that should work at the same time.
///
/// - `overall_memory_budget_in_bytes` sets the amount of memory
/// allocated for all indexing thread.
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
/// - `overall_memory_budget_in_bytes` sets the amount of memory allocated for all indexing
/// threads.
///
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
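To make the budget split concrete, a minimal sketch using the same constructor as elsewhere in this changeset:
// A 60 MB overall budget across 3 indexing threads gives each thread 20 MB.
let mut index_writer = index.writer_with_num_threads(3, 60_000_000)?;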

View File

@@ -176,10 +176,7 @@ impl SegmentMeta {
}
/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
pub fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
assert_eq!(self.tracked.max_doc, 0);
assert!(self.tracked.deletes.is_none());
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {

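A hedged usage sketch of the now-public method; `segment` is a hypothetical `Segment`, and per the asserts above the call is only valid on a fresh segment:
// Finalize the doc count of a freshly written segment from outside tantivy.
let meta: SegmentMeta = segment.meta().clone();
let finalized = meta.with_max_doc(42); // panics if max_doc != 0 or deletes exist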
View File

@@ -71,7 +71,7 @@ impl InvertedIndexReader {
&self.termdict
}
/// Return the fields and types encoded in the dictionary in lexicographic oder.
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and is therefore **very expensive**.

View File

@@ -358,7 +358,7 @@ impl SegmentReader {
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' seperated for all cases.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)

View File

@@ -179,8 +179,7 @@ impl DeleteCursor {
/// Skips operations and positions the cursor so that
/// - either all of the delete operations currently in the queue are consumed and the next get
/// will return `None`.
/// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`.
/// - the next get will return the first operation with an `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
// TODO Can be optimized as we work with blocks.
while self.is_behind_opstamp(target_opstamp) {

View File

@@ -482,7 +482,7 @@ impl<D: Document> IndexWriter<D> {
/// let index = Index::create_in_ram(schema.clone());
///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
/// index_writer.add_document(doc!(title => "The modern Prometheus"))?;
/// index_writer.commit()?;
///
/// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -491,7 +491,7 @@ impl<D: Document> IndexWriter<D> {
///
/// let searcher = index.reader()?.searcher();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query_promo = query_parser.parse_query("Promotheus")?;
/// let query_promo = query_parser.parse_query("Prometheus")?;
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
///
/// assert!(top_docs_promo.is_empty());
@@ -815,8 +815,9 @@ mod tests {
use crate::indexer::NoMergePolicy;
use crate::query::{QueryParser, TermQuery};
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions,
TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, JsonObjectOptions,
NumericOptions, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
STRING, TEXT,
};
use crate::store::DOCSTORE_CACHE_CAPACITY;
use crate::{
@@ -1573,11 +1574,11 @@ mod tests {
deleted_ids.remove(id);
}
IndexingOp::DeleteDoc { id } => {
existing_ids.remove(&id);
existing_ids.remove(id);
deleted_ids.insert(*id);
}
IndexingOp::DeleteDocQuery { id } => {
existing_ids.remove(&id);
existing_ids.remove(id);
deleted_ids.insert(*id);
}
_ => {}
@@ -2092,7 +2093,7 @@ mod tests {
//
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
sample.sort_by_key(|(k, _num_occurrences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
@@ -2101,7 +2102,7 @@ mod tests {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.map(|(_id, num_occurrences)| **num_occurrences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(
@@ -2378,11 +2379,11 @@ mod tests {
#[test]
fn test_bug_1617_2() {
assert!(test_operation_strategy(
test_operation_strategy(
&[
IndexingOp::AddDoc {
id: 13,
value: Default::default()
value: Default::default(),
},
IndexingOp::DeleteDoc { id: 13 },
IndexingOp::Commit,
@@ -2390,9 +2391,9 @@ mod tests {
IndexingOp::Commit,
IndexingOp::Merge,
],
true
true,
)
.is_ok());
.unwrap();
}
#[test]
@@ -2492,9 +2493,9 @@ mod tests {
}
#[test]
fn test_bug_2442() -> crate::Result<()> {
fn test_bug_2442_reserved_character_fast_field() -> crate::Result<()> {
let mut schema_builder = schema::Schema::builder();
let json_field = schema_builder.add_json_field("json", TEXT | FAST);
let json_field = schema_builder.add_json_field("json", FAST | TEXT);
let schema = schema_builder.build();
let index = Index::builder().schema(schema).create_in_ram()?;
@@ -2515,4 +2516,21 @@ mod tests {
Ok(())
}
#[test]
fn test_bug_2442_reserved_character_columnar() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"\u{0000}": "A"})))
.unwrap();
index_writer
.add_document(doc!(field=>json!({format!("\u{0000}\u{0000}"): "A"})))
.unwrap();
index_writer.commit().unwrap();
Ok(())
}
}

View File

@@ -29,8 +29,8 @@ impl MergeOperationInventory {
/// A `MergeOperation` has two roles.
/// It carries all of the information required to describe a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `target_opstamp` is the opstamp up to which we want to consume the delete queue and reflect
/// their deletes.
/// - `segment_ids` is the list of segments to be merged.
///
/// The second role is to keep track of the fact that these

View File

@@ -673,7 +673,7 @@ mod tests {
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(
get_doc_ids(vec![Term::from_field_date_for_search(
date_field,
DateTime::from_utc(curr_time)
)])?,

View File

@@ -145,15 +145,27 @@ mod tests_mmap {
}
}
#[test]
fn test_json_field_null_byte() {
// Test when field name contains a zero byte, which has special meaning in tantivy.
// As a workaround, we convert the zero byte to the ASCII character '0'.
// https://github.com/quickwit-oss/tantivy/issues/2340
// https://github.com/quickwit-oss/tantivy/issues/2193
let field_name_in = "\u{0000}";
let field_name_out = "0";
test_json_field_name(field_name_in, field_name_out);
fn test_json_field_null_byte_is_ignored() {
let mut schema_builder = Schema::builder();
let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
let field = schema_builder.add_json_field("json", options);
let index = Index::create_in_ram(schema_builder.build());
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
.unwrap();
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0);
let inv_indexer = segment_reader.inverted_index(field).unwrap();
let term_dict = inv_indexer.terms();
assert_eq!(term_dict.num_terms(), 1);
let mut term_bytes = Vec::new();
term_dict.ord_to_term(0, &mut term_bytes).unwrap();
assert_eq!(term_bytes, b"key\0stest1");
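// Layout of the surviving term: `<json path><JSON_END_OF_PATH><type byte><value>`,
// i.e. path "key", the \0 separator, 's' for a string value, then "test1".
// The "invalidkey\u{0000}" entry was skipped at indexing, hence num_terms() == 1.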
}
#[test]
fn test_json_field_1byte() {
// Test when field name contains a '1' byte, which has special meaning in tantivy.
@@ -291,7 +303,7 @@ mod tests_mmap {
Type::Str,
),
(format!("{field_name_out_internal}a"), Type::Str),
(format!("{field_name_out_internal}"), Type::Str),
(field_name_out_internal.to_string(), Type::Str),
(format!("num{field_name_out_internal}"), Type::I64),
];
expected_fields.sort();

View File

@@ -1,5 +1,3 @@
use common::json_path_writer::JSON_END_OF_PATH;
use common::replace_in_place;
use fnv::FnvHashMap;
/// `Field` is represented by an unsigned 32-bit integer type.
@@ -40,18 +38,12 @@ impl PathToUnorderedId {
#[cold]
fn insert_new_path(&mut self, path: &str) -> u32 {
let next_id = self.map.len() as u32;
let mut new_path = path.to_string();
// The unsafe below is safe as long as b'.' and JSON_PATH_SEGMENT_SEP are
// valid single byte ut8 strings.
// By utf-8 design, they cannot be part of another codepoint.
unsafe { replace_in_place(JSON_END_OF_PATH, b'0', new_path.as_bytes_mut()) };
let new_path = path.to_string();
self.map.insert(new_path, next_id);
next_id
}
/// Retuns ids which reflect the lexical order of the paths.
/// Returns ids which reflect the lexical order of the paths.
///
/// The returned vec can be indexed with the unordered id to get the ordered id.
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
@@ -65,7 +57,7 @@ impl PathToUnorderedId {
result
}
/// Retuns the paths so they can be queried by the ordered id (which is the index).
/// Returns the paths so they can be queried by the ordered id (which is the index).
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
paths.sort_unstable();

View File

@@ -10,12 +10,9 @@ use crate::indexer::delete_queue::DeleteCursor;
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `alive_bitset` is a bitset describing
/// documents that were alive during the commit
/// itself.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the `alive_bitset`.
/// - `alive_bitset` is a bitset describing documents that were alive during the commit itself.
/// - `delete_cursor` is the position in the delete queue. Deletes happening before the cursor are
/// reflected either in the .del file or in the `alive_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,

View File

@@ -30,10 +30,8 @@ const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written
/// and flushed.
/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written and flushed.
///
/// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
@@ -379,7 +377,7 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.index;
let directory = index.directory();
let mut commited_segment_metas = self.segment_manager.committed_segment_metas();
let mut committed_segment_metas = self.segment_manager.committed_segment_metas();
// We sort segment_readers by number of documents.
// This is a heuristic to make multithreading more efficient.
@@ -394,10 +392,10 @@ impl SegmentUpdater {
// from the different drives.
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: commited_segment_metas,
segments: committed_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message,
@@ -542,7 +540,13 @@ impl SegmentUpdater {
}
fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
let (mut committed_segments, mut uncommitted_segments) = self.get_mergeable_segments();
if committed_segments.len() == 1 && committed_segments[0].num_deleted_docs() == 0 {
committed_segments.clear();
}
if uncommitted_segments.len() == 1 && uncommitted_segments[0].num_deleted_docs() == 0 {
uncommitted_segments.clear();
}
// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.

View File

@@ -64,9 +64,9 @@ impl SegmentWriter {
///
/// The arguments are defined as follows
///
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
/// is stored in a memory arena. This makes it possible for the user to define
/// the flushing behavior as a memory limit.
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
/// stored in a memory arena. This makes it possible for the user to define the flushing
/// behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
@@ -431,7 +431,7 @@ mod tests {
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
@@ -651,7 +651,8 @@ mod tests {
set_fast_val(
DateTime::from_utc(
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
),
)
.truncate(DATE_TIME_PRECISION_INDEXED),
term
)
.serialized_value_bytes()

View File

@@ -125,8 +125,8 @@
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. The list of [supported
//! queries](query::Query#implementors). Custom Queries are supported by implementing the
//! [Query](query::Query) trait.
//! queries](query::Query#implementors). Custom Queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!
@@ -202,12 +202,15 @@ pub mod space_usage;
pub mod store;
pub mod termdict;
mod docset;
mod reader;
#[cfg(test)]
mod compat_tests;
pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
pub mod snippet;
mod docset;
use std::fmt;
pub use census::{Inventory, TrackedObject};
@@ -229,9 +232,9 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6;
pub const INDEX_FORMAT_VERSION: u32 = 6;
/// Oldest index format version this tantivy version can read.
const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]

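With the two format-version constants now public, a downstream crate can surface them; a minimal sketch:
// Report the supported on-disk format range.
println!(
    "writes index format v{}, reads formats >= v{}",
    tantivy::INDEX_FORMAT_VERSION,
    tantivy::INDEX_FORMAT_OLDEST_SUPPORTED_VERSION
);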
View File

@@ -3,7 +3,7 @@
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
//! This information is useful to run phrase queries.
//!
//! The [position](crate::SegmentComponent::Positions) file contains all of the
//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
//! bitpacked positions delta, for all terms of a given field, one term after the other.
//!
//! Each term is encoded independently.

View File

@@ -18,7 +18,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
/// end of the last block for instance.
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0;

View File

@@ -59,7 +59,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
/// The actual serialization format is handled by the `PostingsSerializer`.
fn serialize(
&self,
term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
ordered_id_to_path: &[&str],
ctx: &IndexingContext,
serializer: &mut FieldSerializer,
@@ -69,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
let mut prev_term_id = u32::MAX;
let mut term_path_len = 0; // this will be set in the first iteration
for (_field, path_id, term, addr) in term_addrs {
for (_field, path_id, term, addr) in ordered_term_addrs {
if prev_term_id != path_id.path_id() {
term_buffer.truncate_value_bytes(0);
term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());

View File

@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
fn term_freq(&self) -> u32;
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);

View File

@@ -7,7 +7,7 @@ use crate::{DocId, Score};
/// Query that matches all of the documents.
///
/// All of the document get the score 1.0.
/// All of the documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct AllQuery;
@@ -22,10 +22,7 @@ pub struct AllWeight;
impl Weight for AllWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let all_scorer = AllScorer {
doc: 0u32,
max_doc: reader.max_doc(),
};
let all_scorer = AllScorer::new(reader.max_doc());
Ok(Box::new(BoostScorer::new(all_scorer, boost)))
}
@@ -43,6 +40,13 @@ pub struct AllScorer {
max_doc: DocId,
}
impl AllScorer {
/// Creates a new AllScorer with `max_doc` docs.
pub fn new(max_doc: DocId) -> AllScorer {
AllScorer { doc: 0u32, max_doc }
}
}
impl DocSet for AllScorer {
#[inline(always)]
fn advance(&mut self) -> DocId {

View File

@@ -1,19 +1,14 @@
use super::boolean_weight::BooleanWeight;
use crate::query::{EnableScoring, Occur, Query, SumWithCoordsCombiner, TermQuery, Weight};
use crate::query::{EnableScoring, Occur, Query, SumCombiner, TermQuery, Weight};
use crate::schema::{IndexRecordOption, Term};
/// The boolean query returns a set of documents
/// that matches the Boolean combination of constituent subqueries.
///
/// The documents matched by the boolean query are
/// those which
/// * match all of the sub queries associated with the
/// `Must` occurrence
/// * match none of the sub queries associated with the
/// `MustNot` occurrence.
/// * match at least one of the sub queries associated
/// with the `Must` or `Should` occurrence.
///
/// The documents matched by the boolean query are those which
/// - match all of the sub queries associated with the `Must` occurrence
/// - match none of the sub queries associated with the `MustNot` occurrence.
/// - match at least one of the sub queries associated with the `Must` or `Should` occurrence.
///
/// You can combine other query types and their `Occur`ances into one `BooleanQuery`
///
@@ -66,6 +61,10 @@ use crate::schema::{IndexRecordOption, Term};
/// Term::from_field_text(title, "diary"),
/// IndexRecordOption::Basic,
/// ));
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic
/// ));
/// // A TermQuery with "found" in the body
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(body, "found"),
@@ -74,7 +73,7 @@ use crate::schema::{IndexRecordOption, Term};
/// // TermQuery "diary" must and "girl" must not be present
/// let queries_with_occurs1 = vec![
/// (Occur::Must, diary_term_query.box_clone()),
/// (Occur::MustNot, girl_term_query),
/// (Occur::MustNot, girl_term_query.box_clone()),
/// ];
/// // Make a BooleanQuery equivalent to
/// // title:+diary title:-girl
@@ -82,15 +81,10 @@ use crate::schema::{IndexRecordOption, Term};
/// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
/// assert_eq!(count1, 1);
///
/// // TermQuery for "cow" in the title
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
/// Term::from_field_text(title, "cow"),
/// IndexRecordOption::Basic,
/// ));
/// // "title:diary OR title:cow"
/// let title_diary_or_cow = BooleanQuery::new(vec![
/// (Occur::Should, diary_term_query.box_clone()),
/// (Occur::Should, cow_term_query),
/// (Occur::Should, cow_term_query.box_clone()),
/// ]);
/// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
/// assert_eq!(count2, 4);
@@ -118,21 +112,38 @@ use crate::schema::{IndexRecordOption, Term};
/// ]);
/// let count4 = searcher.search(&nested_query, &Count)?;
/// assert_eq!(count4, 1);
///
/// // You may call `with_minimum_required_clauses` to
/// // specify the number of should clauses the returned documents must match.
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
/// (Occur::Should, cow_term_query.box_clone()),
/// (Occur::Should, girl_term_query.box_clone()),
/// (Occur::Should, diary_term_query.box_clone()),
/// ], 2);
/// // Returns documents containing "Diary Cow", "Diary Girl" or "Cow Girl"
/// // Notice: "Diary" isn't "Dairy". ;-)
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
/// assert_eq!(count5, 1);
/// Ok(())
/// }
/// ```
#[derive(Debug)]
pub struct BooleanQuery {
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
}
impl Clone for BooleanQuery {
fn clone(&self) -> Self {
self.subqueries
let subqueries = self
.subqueries
.iter()
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
.collect::<Vec<_>>()
.into()
.collect::<Vec<_>>();
Self {
subqueries,
minimum_number_should_match: self.minimum_number_should_match,
}
}
}
@@ -149,10 +160,11 @@ impl Query for BooleanQuery {
.iter()
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
.collect::<crate::Result<_>>()?;
Ok(Box::new(BooleanWeight::new(
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
sub_weights,
self.minimum_number_should_match,
enable_scoring.is_scoring_enabled(),
Box::new(SumWithCoordsCombiner::default),
Box::new(SumCombiner::default),
)))
}
@@ -166,7 +178,41 @@ impl Query for BooleanQuery {
impl BooleanQuery {
/// Creates a new boolean query.
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
BooleanQuery { subqueries }
// If the bool query includes at least one Should clause
// and no Must or MustNot clauses, the default value is 1. Otherwise, the default
// value is 0. This matches Elasticsearch's behavior.
let mut minimum_required = 0;
for (occur, _) in &subqueries {
match occur {
Occur::Should => minimum_required = 1,
Occur::Must | Occur::MustNot => {
minimum_required = 0;
break;
}
}
}
Self::with_minimum_required_clauses(subqueries, minimum_required)
}
/// Create a new boolean query with minimum number of required should clauses specified.
pub fn with_minimum_required_clauses(
subqueries: Vec<(Occur, Box<dyn Query>)>,
minimum_number_should_match: usize,
) -> BooleanQuery {
BooleanQuery {
subqueries,
minimum_number_should_match,
}
}
/// Getter for `minimum_number_should_match`
pub fn get_minimum_number_should_match(&self) -> usize {
self.minimum_number_should_match
}
/// Setter for `minimum_number_should_match`
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
self.minimum_number_should_match = minimum_number_should_match;
}
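// Hedged sketch of the defaulting rule implemented in `new` above
// (`term_a` / `term_b` are hypothetical terms on an indexed text field):
//
//     let a: Box<dyn Query> = Box::new(TermQuery::new(term_a, IndexRecordOption::Basic));
//     let b: Box<dyn Query> = Box::new(TermQuery::new(term_b, IndexRecordOption::Basic));
//     // Only Should clauses: the minimum defaults to 1.
//     let q1 = BooleanQuery::new(vec![(Occur::Should, a.box_clone()), (Occur::Should, b.box_clone())]);
//     assert_eq!(q1.get_minimum_number_should_match(), 1);
//     // Any Must or MustNot clause resets the default to 0.
//     let q2 = BooleanQuery::new(vec![(Occur::Should, a), (Occur::Must, b)]);
//     assert_eq!(q2.get_minimum_number_should_match(), 0);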
/// Returns the intersection of the queries.
@@ -181,6 +227,18 @@ impl BooleanQuery {
BooleanQuery::new(subqueries)
}
/// Returns the union of the queries with minimum required clause.
pub fn union_with_minimum_required_clauses(
queries: Vec<Box<dyn Query>>,
minimum_required_clauses: usize,
) -> BooleanQuery {
let subqueries = queries
.into_iter()
.map(|sub_query| (Occur::Should, sub_query))
.collect();
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
}
/// Helper method to create a boolean query matching a given list of terms.
/// The resulting query is a disjunction of the terms.
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
@@ -203,11 +261,13 @@ impl BooleanQuery {
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use super::BooleanQuery;
use crate::collector::{Count, DocSetCollector};
use crate::query::{QueryClone, QueryParser, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, Index, Term};
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
use crate::{DocAddress, DocId, Index, Term};
fn create_test_index() -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -223,6 +283,73 @@ mod tests {
Ok(index)
}
#[test]
fn test_minimum_required() -> crate::Result<()> {
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
docs: T,
) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut writer = index.writer_for_tests()?;
for doc in docs {
writer.add_document(doc!(text => doc))?;
}
writer.commit()?;
Ok(index)
}
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
queries: T,
field: Field,
mr: usize,
) -> BooleanQuery {
let terms = queries
.into_iter()
.map(|t| Term::from_field_text(field, t))
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
.map(|q| -> Box<dyn Query> { Box::new(q) })
.collect();
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
}
fn check_doc_id<T: IntoIterator<Item = DocId>>(
expected: T,
actually: HashSet<DocAddress>,
seg: u32,
) {
assert_eq!(
actually,
expected
.into_iter()
.map(|id| DocAddress::new(seg, id))
.collect()
);
}
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
let searcher = index.reader()?.searcher();
let text = index.schema().get_field("text").unwrap();
// Documents containing 'a c', 'a z', 'a i', 'c z', 'c i' or 'z i' shall be returned.
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
let docs = searcher.search(&q1, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
// Documents containing 'a b c', 'a b e', 'a c e' or 'b c e' shall be returned.
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
let docs = searcher.search(&q2, &DocSetCollector)?;
check_doc_id([0, 1], docs, 0);
// Nothing matches since minimum_required is too large.
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
let docs = searcher.search(&q3, &DocSetCollector)?;
assert!(docs.is_empty());
// When mr is set to zero or one, there is no difference from a plain boolean union.
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
let docs = searcher.search(&q4, &DocSetCollector)?;
check_doc_id([0, 1, 3], docs, 0);
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
let docs = searcher.search(&q5, &DocSetCollector)?;
check_doc_id([0, 1, 4], docs, 0);
Ok(())
}
#[test]
fn test_union() -> crate::Result<()> {
let index = create_test_index()?;

View File

@@ -3,6 +3,7 @@ use std::collections::HashMap;
use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
use crate::index::SegmentReader;
use crate::postings::FreqReadingOption;
use crate::query::disjunction::Disjunction;
use crate::query::explanation::does_not_match;
use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
@@ -18,6 +19,26 @@ enum SpecializedScorer {
Other(Box<dyn Scorer>),
}
fn scorer_disjunction<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner: TScoreCombiner,
minimum_match_required: usize,
) -> Box<dyn Scorer>
where
TScoreCombiner: ScoreCombiner,
{
debug_assert!(!scorers.is_empty());
debug_assert!(minimum_match_required > 1);
if scorers.len() == 1 {
return scorers.into_iter().next().unwrap(); // Safe unwrap.
}
Box::new(Disjunction::new(
scorers,
score_combiner,
minimum_match_required,
))
}
fn scorer_union<TScoreCombiner>(
scorers: Vec<Box<dyn Scorer>>,
score_combiner_fn: impl Fn() -> TScoreCombiner,
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
/// Weight associated to the `BoolQuery`.
pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
}
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
weights,
scoring_enabled,
score_combiner_fn,
minimum_number_should_match: 1,
}
}
/// Create a new boolean weight with minimum number of required should clauses specified.
pub fn with_minimum_number_should_match(
weights: Vec<(Occur, Box<dyn Weight>)>,
minimum_number_should_match: usize,
scoring_enabled: bool,
score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
) -> BooleanWeight<TScoreCombiner> {
BooleanWeight {
weights,
minimum_number_should_match,
scoring_enabled,
score_combiner_fn,
}
}
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
) -> crate::Result<SpecializedScorer> {
let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
.remove(&Occur::Should)
.map(|scorers| scorer_union(scorers, &score_combiner_fn));
// Indicates how Should clauses are combined with the other clauses.
enum CombinationMethod {
Ignored,
// Only contributes to final score.
Optional(SpecializedScorer),
// Must be matched.
Required(Box<dyn Scorer>),
}
let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
{
let num_of_should_scorers = should_scorers.len();
if self.minimum_number_should_match > num_of_should_scorers {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
}
match self.minimum_number_should_match {
0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
1 => CombinationMethod::Required(into_box_scorer(
scorer_union(should_scorers, &score_combiner_fn),
&score_combiner_fn,
)),
n if num_of_should_scorers == n => {
// When minimum_number_should_match equals the number of should scorers,
// the should clauses are no different from must clauses.
must_scorers = match must_scorers.take() {
Some(mut must_scorers) => {
must_scorers.append(&mut should_scorers);
Some(must_scorers)
}
None => Some(should_scorers),
};
CombinationMethod::Ignored
}
_ => CombinationMethod::Required(scorer_disjunction(
should_scorers,
score_combiner_fn(),
self.minimum_number_should_match,
)),
}
} else {
// No should clauses were provided.
if self.minimum_number_should_match > 0 {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
} else {
CombinationMethod::Ignored
}
};
let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::MustNot)
.map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
.map(|specialized_scorer| {
.map(|specialized_scorer: SpecializedScorer| {
into_box_scorer(specialized_scorer, DoNothingCombiner::default)
});
let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
.remove(&Occur::Must)
.map(intersect_scorers);
let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
(Some(should_scorer), Some(must_scorer)) => {
let positive_scorer = match (should_opt, must_scorers) {
(CombinationMethod::Ignored, Some(must_scorers)) => {
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
let must_scorer = intersect_scorers(must_scorers);
if self.scoring_enabled {
SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
Box<dyn Scorer>,
Box<dyn Scorer>,
TComplexScoreCombiner,
>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
)))
SpecializedScorer::Other(Box::new(
RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
must_scorer,
into_box_scorer(should_scorer, &score_combiner_fn),
),
))
} else {
SpecializedScorer::Other(must_scorer)
}
}
(None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
(Some(should_scorer), None) => should_scorer,
(None, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
(CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
must_scorers.push(should_scorer);
SpecializedScorer::Other(intersect_scorers(must_scorers))
}
(CombinationMethod::Ignored, None) => {
return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
}
(CombinationMethod::Required(should_scorer), None) => {
SpecializedScorer::Other(should_scorer)
}
// Optional scorers are promoted to required if no must scorers exist.
(CombinationMethod::Optional(should_scorer), None) => should_scorer,
};
if let Some(exclude_scorer) = exclude_scorer_opt {
let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
Ok(SpecializedScorer::Other(Box::new(Exclude::new(

View File

@@ -12,11 +12,10 @@ mod tests {
use super::*;
use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
use crate::collector::TopDocs;
use crate::query::score_combiner::SumWithCoordsCombiner;
use crate::query::term_query::TermScorer;
use crate::query::{
EnableScoring, Intersection, Occur, Query, QueryParser, RequiredOptionalScorer, Scorer,
TermQuery,
SumCombiner, TermQuery,
};
use crate::schema::*;
use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score};
@@ -90,11 +89,8 @@ mod tests {
let query = query_parser.parse_query("+a b")?;
let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
assert!(scorer.is::<RequiredOptionalScorer<
Box<dyn Scorer>,
Box<dyn Scorer>,
SumWithCoordsCombiner,
>>());
assert!(scorer
.is::<RequiredOptionalScorer<Box<dyn Scorer>, Box<dyn Scorer>, SumCombiner>>());
}
{
let query = query_parser.parse_query("+a b")?;

327
src/query/disjunction.rs Normal file
View File

@@ -0,0 +1,327 @@
use std::cmp::Ordering;
use std::collections::BinaryHeap;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ScoreCombiner, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
/// `Disjunction` is responsible for merging `DocSet`s from multiple
/// sources. Specifically, it takes the union of two or more `DocSet`s,
/// then filters out elements that appear fewer times than a
/// specified threshold.
pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
chains: BinaryHeap<ScorerWrapper<TScorer>>,
minimum_matches_required: usize,
score_combiner: TScoreCombiner,
current_doc: DocId,
current_score: Score,
}
/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
/// Also, the `Ord` trait and its family are implemented in reverse, so that
/// `std::BinaryHeap<ScorerWrapper<T>>` behaves as a min-heap keyed on the current doc id.
struct ScorerWrapper<T> {
scorer: T,
current_doc: DocId,
}
impl<T: Scorer> ScorerWrapper<T> {
fn new(scorer: T) -> Self {
let current_doc = scorer.doc();
Self {
scorer,
current_doc,
}
}
}
impl<T: Scorer> PartialEq for ScorerWrapper<T> {
fn eq(&self, other: &Self) -> bool {
self.doc() == other.doc()
}
}
impl<T: Scorer> Eq for ScorerWrapper<T> {}
impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
Some(self.cmp(other))
}
}
impl<T: Scorer> Ord for ScorerWrapper<T> {
fn cmp(&self, other: &Self) -> Ordering {
self.doc().cmp(&other.doc()).reverse()
}
}
impl<T: Scorer> DocSet for ScorerWrapper<T> {
fn advance(&mut self) -> DocId {
let doc_id = self.scorer.advance();
self.current_doc = doc_id;
doc_id
}
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.scorer.size_hint()
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
pub fn new<T: IntoIterator<Item = TScorer>>(
docsets: T,
score_combiner: TScoreCombiner,
minimum_matches_required: usize,
) -> Self {
debug_assert!(
minimum_matches_required > 1,
"union scorer works better if just one matches required"
);
let chains = docsets
.into_iter()
.map(|doc| ScorerWrapper::new(doc))
.collect();
let mut disjunction = Self {
chains,
score_combiner,
current_doc: TERMINATED,
minimum_matches_required,
current_score: 0.0,
};
if minimum_matches_required > disjunction.chains.len() {
return disjunction;
}
disjunction.advance();
disjunction
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
for Disjunction<TScorer, TScoreCombiner>
{
fn advance(&mut self) -> DocId {
let mut current_num_matches = 0;
while let Some(mut candidate) = self.chains.pop() {
let next = candidate.doc();
if next != TERMINATED {
// Peek next doc.
if self.current_doc != next {
if current_num_matches >= self.minimum_matches_required {
self.chains.push(candidate);
self.current_score = self.score_combiner.score();
return self.current_doc;
}
// Reset current_num_matches and scores.
current_num_matches = 0;
self.current_doc = next;
self.score_combiner.clear();
}
current_num_matches += 1;
self.score_combiner.update(&mut candidate.scorer);
candidate.advance();
self.chains.push(candidate);
}
}
if current_num_matches < self.minimum_matches_required {
self.current_doc = TERMINATED;
}
self.current_score = self.score_combiner.score();
self.current_doc
}
#[inline]
fn doc(&self) -> DocId {
self.current_doc
}
fn size_hint(&self) -> u32 {
self.chains
.iter()
.map(|docset| docset.size_hint())
.max()
.unwrap_or(0u32)
}
}
impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
for Disjunction<TScorer, TScoreCombiner>
{
fn score(&mut self) -> Score {
self.current_score
}
}
#[cfg(test)]
mod tests {
use std::collections::BTreeMap;
use super::Disjunction;
use crate::query::score_combiner::DoNothingCombiner;
use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
use crate::{DocId, DocSet, Score, TERMINATED};
fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
let mut counts = BTreeMap::new();
for array in arrays {
for &element in array {
*counts.entry(element).or_insert(0) += 1;
}
}
counts
.iter()
.filter_map(|(&element, &count)| {
if count >= pass_line {
Some(element)
} else {
None
}
})
.collect()
}
fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
let make_scorer = || {
Disjunction::new(
vals.iter()
.cloned()
.map(VecDocSet::from)
.map(|d| ConstScorer::new(d, 1.0)),
DoNothingCombiner,
min_match,
)
};
let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
let mut count = 0;
while scorer.doc() != TERMINATED {
assert_eq!(union_expected.doc(), scorer.doc());
assert_eq!(union_expected.advance(), scorer.advance());
count += 1;
}
assert_eq!(union_expected.advance(), TERMINATED);
assert_eq!(count, make_scorer().count_including_deleted());
}
#[should_panic]
#[test]
fn test_arg_check1() {
aux_test_conjunction(vec![], 0);
}
#[should_panic]
#[test]
fn test_arg_check2() {
aux_test_conjunction(vec![], 1);
}
#[test]
fn test_corner_case() {
aux_test_conjunction(vec![], 2);
aux_test_conjunction(vec![vec![]; 1000], 2);
aux_test_conjunction(vec![vec![]; 100], usize::MAX);
aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
}
#[test]
fn test_conjunction() {
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
2,
);
aux_test_conjunction(
vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
2,
);
aux_test_conjunction(
vec![
vec![1, 3333, 100000000u32],
vec![1, 2, 100000000u32],
vec![1, 2, 100000000u32],
],
3,
)
}
// This dummy scorer does nothing but yield doc ids in increasing order,
// each paired with its associated score.
#[derive(Clone)]
struct DummyScorer {
cursor: usize,
foo: Vec<(DocId, f32)>,
}
impl DummyScorer {
fn new(doc_score: Vec<(DocId, f32)>) -> Self {
Self {
cursor: 0,
foo: doc_score,
}
}
}
impl DocSet for DummyScorer {
fn advance(&mut self) -> DocId {
self.cursor += 1;
self.doc()
}
fn doc(&self) -> DocId {
self.foo.get(self.cursor).map(|x| x.0).unwrap_or(TERMINATED)
}
fn size_hint(&self) -> u32 {
self.foo.len() as u32
}
}
impl Scorer for DummyScorer {
fn score(&mut self) -> Score {
self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
}
}
#[test]
fn test_score_calculate() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (4, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
],
SumCombiner::default(),
3,
);
assert_eq!(scorer.score(), 5.0);
assert_eq!(scorer.advance(), 2);
assert_eq!(scorer.score(), 3.0);
}
#[test]
fn test_score_calculate_corner_case() {
let mut scorer = Disjunction::new(
vec![
DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
],
SumCombiner::default(),
2,
);
assert_eq!(scorer.doc(), 1);
assert_eq!(scorer.score(), 3.0);
assert_eq!(scorer.advance(), 3);
assert_eq!(scorer.score(), 2.0);
}
}
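To make the threshold semantics concrete, a small sketch reusing the test helpers above (`VecDocSet`, `ConstScorer`, `DoNothingCombiner`):
let mut d = Disjunction::new(
    vec![vec![1u32, 2], vec![1, 3], vec![1, 3]]
        .into_iter()
        .map(|v| ConstScorer::new(VecDocSet::from(v), 1.0)),
    DoNothingCombiner,
    2,
);
// Doc 1 appears in all three docsets, doc 3 in two, doc 2 in only one.
assert_eq!(d.doc(), 1);
assert_eq!(d.advance(), 3);
assert_eq!(d.advance(), TERMINATED);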

View File

@@ -149,7 +149,7 @@ mod tests {
use crate::query::exist_query::ExistsQuery;
use crate::query::{BooleanQuery, RangeQuery};
use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
use crate::{Index, Searcher};
use crate::{Index, Searcher, Term};
#[test]
fn test_exists_query_simple() -> crate::Result<()> {
@@ -188,9 +188,8 @@ mod tests {
// exercise seek
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
@@ -198,10 +197,9 @@ mod tests {
assert_eq!(searcher.search(&query, &Count)?, 25);
let query = BooleanQuery::intersection(vec![
Box::new(RangeQuery::new_u64_bounds(
"all".to_string(),
Bound::Included(0),
Bound::Excluded(50),
Box::new(RangeQuery::new(
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
]);
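
For reference, the hunk above migrates from the string-based `RangeQuery::new_u64_bounds` to the term-based `RangeQuery::new`. A minimal sketch of the new call shape, assuming a hypothetical u64 field handle `score_field`:

use std::ops::Bound;
use tantivy::query::RangeQuery;
use tantivy::schema::Field;
use tantivy::Term;

// `score_field` is a hypothetical u64 field handle from the schema.
fn score_range(score_field: Field) -> RangeQuery {
    // Matches documents whose score lies in [70, 100).
    RangeQuery::new(
        Bound::Included(Term::from_field_u64(score_field, 70)),
        Bound::Excluded(Term::from_field_u64(score_field, 100)),
    )
}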

View File

@@ -5,6 +5,7 @@ mod bm25;
mod boolean_query;
mod boost_query;
mod const_score_query;
mod disjunction;
mod disjunction_max_query;
mod empty_query;
mod exclude;
@@ -53,12 +54,10 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
pub use self::range_query::*;
pub use self::regex_query::RegexQuery;
pub use self::reqopt_scorer::RequiredOptionalScorer;
pub use self::score_combiner::{
DisjunctionMaxCombiner, ScoreCombiner, SumCombiner, SumWithCoordsCombiner,
};
pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner};
pub use self::scorer::Scorer;
pub use self::set_query::TermSetQuery;
pub use self::term_query::TermQuery;

View File

@@ -209,7 +209,7 @@ impl MoreLikeThis {
}
};
// TOOD: Validate these changed align with the HEAD branch.
// TODO: Validate these changed align with the HEAD branch.
for value in values {
if let Some(text) = value.as_str() {
let tokenizer = match &mut tokenizer_opt {
@@ -241,7 +241,7 @@ impl MoreLikeThis {
let timestamp = value.as_datetime().ok_or_else(|| {
TantivyError::InvalidArgument("invalid value".to_string())
})?;
let term = Term::from_field_date(field, timestamp);
let term = Term::from_field_date_for_search(field, timestamp);
*term_frequencies.entry(term).or_insert(0) += 1;
}
}
@@ -295,7 +295,7 @@ impl MoreLikeThis {
self.stop_words.contains(&word)
}
/// Couputes the score for each term while ignoring not useful terms
/// Computes the score for each term while ignoring not useful terms
fn create_score_term(
&self,
searcher: &Searcher,

View File

@@ -2,7 +2,7 @@ use std::ops::Bound;
use super::{prefix_end, PhrasePrefixWeight};
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, RangeQuery, Weight};
use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight};
use crate::schema::{Field, IndexRecordOption, Term};
const DEFAULT_MAX_EXPANSIONS: u32 = 50;
@@ -86,7 +86,7 @@ impl PhrasePrefixQuery {
///
/// This function is the same as [`Query::weight()`] except it returns
/// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
/// If the query was only one term long, this returns `None` wherease [`Query::weight`]
/// If the query was only one term long, this returns `None` whereas [`Query::weight`]
/// returns a boxed [`RangeWeight`]
pub(crate) fn phrase_prefix_query_weight(
&self,
@@ -145,17 +145,15 @@ impl Query for PhrasePrefixQuery {
Bound::Unbounded
};
let mut range_query = RangeQuery::new_term_bounds(
enable_scoring
.schema()
.get_field_name(self.field)
.to_owned(),
self.prefix.1.typ(),
&Bound::Included(self.prefix.1.clone()),
&end_term,
);
range_query.limit(self.max_expansions as u64);
range_query.weight(enable_scoring)
let lower_bound = Bound::Included(self.prefix.1.clone());
let upper_bound = end_term;
Ok(Box::new(InvertedIndexRangeWeight::new(
self.field,
&lower_bound,
&upper_bound,
Some(self.max_expansions as u64),
)))
}
}

View File

@@ -6,6 +6,9 @@ use crate::query::phrase_query::{intersection_count, PhraseScorer};
use crate::query::Scorer;
use crate::{DocId, Score};
// MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
// though, it would be interesting to slim it down if possible.
#[allow(clippy::large_enum_variant)]
enum PhraseKind<TPostings: Postings> {
SinglePrefix {
position_offset: u32,
@@ -97,6 +100,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
suffixes: Vec<TPostings>,
suffix_offset: u32,
phrase_count: u32,
suffix_position_buffer: Vec<u32>,
}
impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
@@ -140,6 +144,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
suffixes,
suffix_offset: (max_offset - suffix_pos) as u32,
phrase_count: 0,
suffix_position_buffer: Vec::with_capacity(100),
};
if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
phrase_prefix_scorer.advance();
@@ -153,7 +158,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
fn matches_prefix(&mut self) -> bool {
let mut count = 0;
let mut positions = Vec::new();
let current_doc = self.doc();
let pos_matching = self.phrase_scorer.get_intersection();
for suffix in &mut self.suffixes {
@@ -162,8 +166,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
}
let doc = suffix.seek(current_doc);
if doc == current_doc {
suffix.positions_with_offset(self.suffix_offset, &mut positions);
count += intersection_count(pos_matching, &positions);
suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
count += intersection_count(pos_matching, &self.suffix_position_buffer);
}
}
self.phrase_count = count as u32;
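
The `suffix_position_buffer` change above is a standard allocation-hoisting pattern: keep a scratch `Vec` on the struct and refill it on each call, rather than allocating a fresh `Vec` inside the hot loop. A toy illustration of the pattern (names and logic hypothetical, not tantivy code):

// The scratch buffer lives on the struct, so repeated calls reuse one
// allocation once it has grown to its working size.
struct SuffixMatcher {
    scratch: Vec<u32>,
}

impl SuffixMatcher {
    fn count_even_positions(&mut self, batches: &[Vec<u32>]) -> usize {
        let mut count = 0;
        for batch in batches {
            // clear() keeps the capacity, so this loop stays allocation-free.
            self.scratch.clear();
            self.scratch.extend_from_slice(batch);
            count += self.scratch.iter().filter(|&&p| p % 2 == 0).count();
        }
        count
    }
}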

View File

@@ -5,8 +5,8 @@ use crate::schema::{Field, IndexRecordOption, Term};
/// `PhraseQuery` matches a specific sequence of words.
///
/// For instance the phrase query for `"part time"` will match
/// the sentence
/// For instance, the phrase query for `"part time"` will match
/// the sentence:
///
/// **Alan just got a part time job.**
///
@@ -15,7 +15,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
/// **This is my favorite part of the job.**
///
/// [Slop](PhraseQuery::set_slop) allows leniency in term proximity
/// for some performance tradeof.
/// for some performance trade-off.
///
/// Using a `PhraseQuery` on a field requires positions
/// to be indexed for this field.
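
To make the slop doc concrete, here is a hedged sketch of building a phrase query with slop; the schema and the `body` field are illustrative:

use tantivy::query::PhraseQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::Term;

fn part_time_phrase() -> PhraseQuery {
    let mut schema_builder = Schema::builder();
    // TEXT records term positions, which phrase queries require.
    let body = schema_builder.add_text_field("body", TEXT);
    let _schema = schema_builder.build();
    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(body, "part"),
        Term::from_field_text(body, "time"),
    ]);
    // Slop 1 tolerates one position of distance, so one intervening
    // token (e.g. "part of time") should still match.
    query.set_slop(1);
    query
}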

View File

@@ -219,8 +219,8 @@ fn intersection_exists_with_slop(
/// In contrast to the regular algorithm this solves some issues:
/// - Keep track of the slop so far. Slop is a budget that is spent on the distance between terms.
/// - When encountering a match between two positions, which position is the best match is unclear
/// and depends on intersections afterwards, therefore this algorithm keeps left and right as
/// matches, but only counts one.
/// and depends on intersections afterwards, therefore this algorithm keeps left and right as
/// matches, but only counts one.
///
/// This algorithm may return an incorrect count in some cases (e.g. when both the left and the
/// right expansion then match on the same following term).

View File

@@ -115,10 +115,10 @@ impl<'a> EnableScoring<'a> {
///
/// So to sum it up :
/// - a `Query` is a recipe to define a set of documents as well the way to score them.
/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance
/// hold statistics about the different term of the query. It is created by the query.
/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific
/// [`SegmentReader`]. It is created by the [`Weight`].
/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance hold
/// statistics about the different term of the query. It is created by the query.
/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific [`SegmentReader`].
/// It is created by the [`Weight`].
///
/// When implementing a new type of `Query`, it is normal to implement a
/// dedicated `Query`, [`Weight`] and [`Scorer`].
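
A hedged sketch of that Query -> Weight -> Scorer chain; the searcher and the `title` field handle are assumed to exist:

use tantivy::query::{EnableScoring, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::{Searcher, Term, TERMINATED};

fn run_term_query(searcher: &Searcher, title: Field) -> tantivy::Result<()> {
    let query = TermQuery::new(
        Term::from_field_text(title, "obama"),
        IndexRecordOption::Basic,
    );
    // The Weight is the query tied to this Searcher (e.g. term statistics).
    let weight = query.weight(EnableScoring::enabled_from_searcher(searcher))?;
    // The Scorer is a cursor over one segment's matching documents.
    let mut scorer = weight.scorer(searcher.segment_reader(0), 1.0)?;
    while scorer.doc() != TERMINATED {
        let (_doc, _score) = (scorer.doc(), scorer.score());
        scorer.advance();
    }
    Ok(())
}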

View File

@@ -2,7 +2,7 @@ use std::fmt;
use std::ops::Bound;
use crate::query::Occur;
use crate::schema::{Term, Type};
use crate::schema::Term;
use crate::Score;
#[derive(Clone)]
@@ -14,8 +14,6 @@ pub enum LogicalLiteral {
prefix: bool,
},
Range {
field: String,
value_type: Type,
lower: Bound<Term>,
upper: Bound<Term>,
},
@@ -39,6 +37,34 @@ impl LogicalAst {
LogicalAst::Boost(Box::new(self), boost)
}
}
pub fn simplify(self) -> LogicalAst {
match self {
LogicalAst::Clause(clauses) => {
let mut new_clauses: Vec<(Occur, LogicalAst)> = Vec::new();
for (occur, sub_ast) in clauses {
let simplified_sub_ast = sub_ast.simplify();
// If clauses below have the same `Occur`, we can pull them up
match simplified_sub_ast {
LogicalAst::Clause(sub_clauses)
if (occur == Occur::Should || occur == Occur::Must)
&& sub_clauses.iter().all(|(o, _)| *o == occur) =>
{
for sub_clause in sub_clauses {
new_clauses.push(sub_clause);
}
}
_ => new_clauses.push((occur, simplified_sub_ast)),
}
}
LogicalAst::Clause(new_clauses)
}
LogicalAst::Leaf(_) | LogicalAst::Boost(_, _) => self,
}
}
}
fn occur_letter(occur: Occur) -> &'static str {
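
In effect, `simplify` flattens nested clauses that share the same `Occur`, so `(+a +(+b +c))` becomes `(+a +b +c)`; the updated parser test expectations further down (e.g. `test_parse_query_to_ast_two_terms`) reflect exactly this lifting.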

View File

@@ -58,7 +58,7 @@ pub enum QueryParserError {
#[error("Invalid query: Only excluding terms given")]
AllButQueryForbidden,
/// If no default field is declared, running a query without any
/// field specified is forbbidden.
/// field specified is forbidden.
#[error("No default field declared and no field specified in query")]
NoDefaultFieldDeclared,
/// The field searched for is not declared
@@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// so-called default fields (as set up in the constructor).
///
/// Assuming that the default fields are `body` and `title`, and the query parser is set with
/// conjunction as a default, our query will be interpreted as.
/// conjunction as a default, our query will be interpreted as.
/// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
/// By default, all tokenized and indexed fields are default fields.
///
@@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
/// `body:Barack OR (body:Barack OR text:Obama)` .
///
/// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
/// interpreted
/// as `(a AND b) OR c`.
/// interpreted as `(a AND b) OR c`.
///
/// * In addition to the boolean operators, the `-` and `+` prefix operators can be used. These operators are
/// sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
@@ -272,8 +271,7 @@ impl QueryParser {
/// Creates a `QueryParser`, given
/// * an index
/// * a set of default fields used to search if no field is specifically defined
/// in the query.
/// * a set of default fields used to search if no field is specifically defined in the query.
pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
}
@@ -379,7 +377,7 @@ impl QueryParser {
if !err.is_empty() {
return Err(err.swap_remove(0));
}
Ok(ast)
Ok(ast.simplify())
}
/// Parse the user query into an AST.
@@ -482,16 +480,33 @@ impl QueryParser {
});
if terms.len() != 1 {
return Err(QueryParserError::UnsupportedQuery(format!(
"Range query boundary cannot have multiple tokens: {phrase:?}."
"Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]."
)));
}
Ok(terms.into_iter().next().unwrap())
}
FieldType::JsonObject(_) => {
// Json range are not supported.
Err(QueryParserError::UnsupportedQuery(
"Range query are not supported on json field.".to_string(),
))
FieldType::JsonObject(ref json_options) => {
let get_term_with_path = || {
Term::from_field_json_path(
field,
json_path,
json_options.is_expand_dots_enabled(),
)
};
if let Some(term) =
// Try to convert the phrase to a fast value
convert_to_fast_value_and_append_to_json_term(
get_term_with_path(),
phrase,
false,
)
{
Ok(term)
} else {
let mut term = get_term_with_path();
term.append_type_and_str(phrase);
Ok(term)
}
}
FieldType::Facet(_) => match Facet::from_text(phrase) {
Ok(facet) => Ok(Term::from_facet(field, &facet)),
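
Following the change above, range bounds on JSON paths resolve to terms instead of returning `UnsupportedQuery`. A hedged sketch of the user-facing effect; the `attributes` JSON field is illustrative:

use tantivy::query::QueryParser;
use tantivy::Index;

fn parse_json_range(index: &Index) -> tantivy::Result<()> {
    let parser = QueryParser::for_index(index, vec![]);
    // Previously rejected with "Range query are not supported on json field";
    // each bound is now tried as a fast value first, then as a string term.
    let _query = parser.parse_query("attributes.price:[10 TO 20]")?;
    Ok(())
}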
@@ -553,7 +568,7 @@ impl QueryParser {
}
FieldType::Date(_) => {
let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
Ok(vec![LogicalLiteral::Term(dt_term)])
}
FieldType::Str(ref str_options) => {
@@ -685,8 +700,8 @@ impl QueryParser {
///
/// The terms are identified by a triplet:
/// - tantivy field
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
/// object by naturally extending the json field name with a "." separated field_path
/// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
/// naturally extending the json field name with a "." separated field_path
/// - field_phrase: the phrase that is being searched.
///
/// The literal identifies the targeted field by a so-called *full field path*,
@@ -790,8 +805,6 @@ impl QueryParser {
let (field, json_path) = try_tuple!(self
.split_full_path(&full_path)
.ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
let field_entry = self.schema.get_field_entry(field);
let value_type = field_entry.field_type().value_type();
let mut errors = Vec::new();
let lower = match self.resolve_bound(field, json_path, &lower) {
Ok(bound) => bound,
@@ -809,15 +822,11 @@ impl QueryParser {
};
if lower == Bound::Unbounded && upper == Bound::Unbounded {
// this range is useless, either because a user requested [* TO *], or because
// we failed to parse something. Either way, there is no point emiting it
// we failed to parse something. Either way, there is no point emitting it
return (None, errors);
}
let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
field: self.schema.get_field_name(field).to_string(),
value_type,
lower,
upper,
}));
let logical_ast =
LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
(Some(logical_ast), errors)
}
UserInputLeaf::Set {
@@ -884,14 +893,7 @@ fn convert_literal_to_query(
Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
}
}
LogicalLiteral::Range {
field,
value_type,
lower,
upper,
} => Box::new(RangeQuery::new_term_bounds(
field, value_type, &lower, &upper,
)),
LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
LogicalLiteral::All => Box::new(AllQuery),
}
@@ -962,7 +964,8 @@ fn generate_literals_for_json_object(
|| Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled());
// Try to convert the phrase to a fast value
if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase)
if let Some(term) =
convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
{
logical_literals.push(LogicalLiteral::Term(term));
}
@@ -1136,8 +1139,8 @@ mod test {
let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
assert_eq!(
format!("{query:?}"),
"RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
upper_bound: Included([98]), limit: None }"
"RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \
\"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }"
);
}
@@ -1304,7 +1307,7 @@ mod test {
}
#[test]
fn test_json_field_query_with_espaced_dot() {
fn test_json_field_query_with_escaped_dot() {
assert_eq!(
extract_query_term_json_path(r#"json.k8s.node.name:hello"#),
"k8s\u{1}node\u{1}name\0shello"
@@ -1434,7 +1437,7 @@ mod test {
);
test_parse_query_to_logical_ast_helper(
"(+title:a +title:b) title:c",
r#"(+(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) +Term(field=0, type=Str, "c"))"#,
r#"(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b") +Term(field=0, type=Str, "c"))"#,
true,
);
}
@@ -1470,7 +1473,7 @@ mod test {
pub fn test_parse_query_to_ast_two_terms() {
test_parse_query_to_logical_ast_helper(
"title:a b",
r#"(Term(field=0, type=Str, "a") (Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#,
r#"(Term(field=0, type=Str, "a") Term(field=0, type=Str, "b") Term(field=1, type=Str, "b"))"#,
false,
);
test_parse_query_to_logical_ast_helper(
@@ -1702,6 +1705,21 @@ mod test {
);
}
#[test]
pub fn test_parse_query_negative() {
test_parse_query_to_logical_ast_helper(
"title:b -title:a",
r#"(+Term(field=0, type=Str, "b") -Term(field=0, type=Str, "a"))"#,
true,
);
test_parse_query_to_logical_ast_helper(
"title:b -(-title:a -title:c)",
r#"(+Term(field=0, type=Str, "b") -(-Term(field=0, type=Str, "a") -Term(field=0, type=Str, "c")))"#,
true,
);
}
#[test]
pub fn test_query_parser_hyphen() {
test_parse_query_to_logical_ast_helper(
@@ -1815,7 +1833,8 @@ mod test {
\"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
(Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
minimum_number_should_match: 1 }"
);
}
@@ -1880,7 +1899,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
(Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
minimum_number_should_match: 1 }"
);
}
@@ -1897,7 +1917,8 @@ mod test {
format!("{query:?}"),
"BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
\"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
distance: 2, transposition_cost_one: false, prefix: true })] }"
distance: 2, transposition_cost_one: false, prefix: true })], \
minimum_number_should_match: 1 }"
);
}
}

View File

@@ -49,10 +49,10 @@ pub(crate) struct RangeDocSet<T> {
///
/// There are two patterns.
/// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
/// will come, so we start with small chunks
/// will come, so we start with small chunks
/// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
/// should load small chunks. When the seeks are small, we can employ the same strategy as on a
/// full scan.
/// should load small chunks. When the seeks are small, we can employ the same strategy as on
/// a full scan.
fetch_horizon: u32,
/// Current batch of loaded docs.
loaded_docs: VecCursor,
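
A toy sketch of the adaptive horizon policy the comment describes; the constants and names are illustrative, not tantivy internals:

const MIN_HORIZON: u32 = 128;
const MAX_HORIZON: u32 = 32_768;

// Grow the readahead while access looks sequential; shrink after a big seek.
fn next_fetch_horizon(current: u32, last_seek_jump: u32) -> u32 {
    if last_seek_jump > current {
        MIN_HORIZON
    } else {
        (current * 2).min(MAX_HORIZON)
    }
}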
@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
#[cfg(test)]
mod tests {
use std::ops::Bound;
use crate::collector::Count;
use crate::directory::RamDirectory;
use crate::query::RangeQuery;
use crate::{schema, IndexBuilder, TantivyDocument};
use crate::{schema, IndexBuilder, TantivyDocument, Term};
#[test]
fn range_query_fast_optional_field_minimum() {
@@ -218,10 +220,9 @@ mod tests {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let query = RangeQuery::new_u64_bounds(
"score".to_string(),
std::ops::Bound::Included(70),
std::ops::Bound::Unbounded,
let query = RangeQuery::new(
Bound::Included(Term::from_field_u64(score_field, 70)),
Bound::Unbounded,
);
let count = searcher.search(&query, &Count).unwrap();

Some files were not shown because too many files have changed in this diff.