Compare commits

...

102 Commits

Author SHA1 Message Date
François Massot
cfa326363e Test. 2023-07-05 21:58:52 +02:00
François Massot
3c300666ad Merge pull request #2110 from quickwit-oss/fulmicoton/dynamic-follow-up
Add dynamic filters to text analyzer builder.
2023-07-03 21:49:24 +02:00
François Massot
b91d3f6be4 Clean comment on 'TextAnalyzerBuilder::filter_dynamic' method. 2023-07-03 18:45:59 +02:00
François Massot
a8e76513bb Remove useless clone. 2023-07-03 22:05:11 +09:00
François Massot
0a23201338 Fix stackoverflow and add docs. 2023-07-03 22:05:11 +09:00
François Massot
81330aaf89 WIP 2023-07-03 22:05:10 +09:00
Paul Masurel
98a3b01992 Removing the BoxedTokenizer 2023-07-03 22:05:10 +09:00
Paul Masurel
d341520938 Dynamic follow up 2023-07-03 22:05:10 +09:00
François Massot
5c9af73e41 Followup fulmicoton poc. 2023-07-03 22:05:10 +09:00
Paul Masurel
ad4c940fa3 proof of concept for dynamic tokenizer. 2023-07-03 22:05:10 +09:00
Paul Masurel
910b0b0c61 Cargo fmt 2023-07-03 22:03:31 +09:00
PSeitz
3fef052bf1 fix flaky test (#2107)
closes #2099
2023-06-29 14:30:56 +08:00
PSeitz
040554f2f9 Update to lz4_flex 0.11 (#2106) 2023-06-29 14:16:00 +08:00
PSeitz
17186ca9c9 improve docs (#2105) 2023-06-27 13:37:14 +08:00
François Massot
212d59c9ab Merge pull request #2102 from quickwit-oss/fmassot/ngram-new-should-return-error
Ngram tokenizer now returns an error with invalid arguments.
2023-06-27 05:36:09 +02:00
dependabot[bot]
1a1f252a3f Update memmap2 requirement from 0.6.0 to 0.7.1 (#2104)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.6.0...v0.7.1)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-06-27 05:15:43 +02:00
François Massot
d73706dede Ngram tokenizer now returns an error with invalid arguments. 2023-06-25 20:13:24 +02:00
PSeitz
44850e1036 move fail dep to dev only (#2094)
wasm compilation fails with dep only
2023-06-22 06:59:11 +02:00
Adam Reichold
3b0cbf8102 Cosmetic updates to the warmer example. (#2095)
Just some cosmetic tweaks to make the example easier on the eyes as a colleague
was staring at this for quite some time this week.
2023-06-22 11:25:01 +09:00
Adam Reichold
4aa131c3db Make TextAnalyzerBuilder publically accessible (#2097)
This way, client code can name the type to e.g. store it inside structs without
resorting to generics and it means that its documentation is part of the crate
documentation generated by `cargo doc`.
2023-06-22 11:24:21 +09:00
Naveen Aiathurai
59962097d0 fix: #2078 return error when tokenizer not found while indexing (#2093)
* fix: #2078 return error when tokenizer not found while indexing

* chore: formatting issues

* chore: fix review comments
2023-06-16 04:33:55 +02:00
Adam Reichold
ebc78127f3 Add BytesFilterCollector to support filtering based on a bytes fast field (#2075)
* Do some Clippy- and Cargo-related boy-scouting.

* Add BytesFilterCollector to support filtering based on a bytes fast field

This is basically a copy of the existing FilterCollector but modified and
specialised to work on a bytes fast field.

* Changed semantics of filter collectors to consider multi-valued fields
2023-06-13 14:19:58 +09:00
PSeitz
8199aa7de7 bump version to 0.20.2 (#2089) 2023-06-12 18:56:54 +08:00
PSeitz
657f0cd3bd add missing Bytes validation to term_agg (#2077)
returns empty for now instead of failing like before
2023-06-12 16:38:07 +08:00
Adam Reichold
3a82ef2560 Fix is_child_of function not considering the root facet. (#2086) 2023-06-12 08:35:18 +02:00
PSeitz
3546e7fc63 small agg limit docs improvement (#2073)
small docs improvement as follow up on bug https://github.com/quickwit-oss/quickwit/issues/3503
2023-06-12 10:55:24 +09:00
PSeitz
862f367f9e release without Alice in Wonderland, bump version to 0.20.1 (#2087)
* Release without Alice in Wonderland

* bump version to 0.20.1
2023-06-12 10:54:03 +09:00
PSeitz
14137d91c4 Update CHANGELOG.md (#2081) 2023-06-12 10:53:40 +09:00
François Massot
924fc70cb5 Merge pull request #2088 from quickwit-oss/fmassot/align-type-priorities-for-json-numbers
Align numerical type priority order on the search side.
2023-06-11 22:04:54 +02:00
François Massot
07023948aa Add test that indexes and searches a JSON field. 2023-06-11 21:47:52 +02:00
François Massot
0cb53207ec Fix tests. 2023-06-11 12:13:35 +02:00
François Massot
17c783b4db Align numerical type priority order on the search side. 2023-06-11 11:49:27 +02:00
Harrison Burt
7220df8a09 Fix building on windows with mmap (#2070)
* Fix windows build

* Make pub

* Update docs

* Re arrange

* Fix compilation error on unix

* Fix unix borrows

* Revert "Fix unix borrows"

This reverts commit c1d94fd12b.

* Fix unix borrows and revert original change

* Fix warning

* Cleaner code.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-06-10 18:32:39 +02:00
PSeitz
e3eacb4388 release tantivy (#2083)
* prerelease

* chore: Release
2023-06-09 10:47:46 +02:00
PSeitz
fdecb79273 tokenizer-api: reduce Tokenizer overhead (#2062)
* tokenizer-api: reduce Tokenizer overhead

Previously a new `Token` for each text encountered was created, which
contains `String::with_capacity(200)`
In the new API the token_stream gets mutable access to the tokenizer,
this allows state to be shared (in this PR Token is shared).
Ideally the allocation for the BoxTokenStream would also be removed, but
this may require some lifetime tricks.

* simplify api

* move lowercase and ascii folding buffer to global

* empty Token text as default
2023-06-08 18:37:58 +08:00
PSeitz
27f202083c Improve Termmap Indexing Performance +~30% (#2058)
* update benchmark

* Improve Termmap Indexing Performance +~30%

This contains many small changes to improve Termmap performance.
Most notably:
* Specialized byte compare and equality versions, instead of glibc calls.
* ExpUnrolledLinkedList to not contain inline items.

Allow compare hash only via a feature flag compare_hash_only:
64bits should be enough with a good hash function to compare strings by
their hashes instead of comparing the strings. Disabled by default

CreateHashMap/alice/174693
                        time:   [642.23 µs 643.80 µs 645.24 µs]
                        thrpt:  [258.20 MiB/s 258.78 MiB/s 259.41 MiB/s]
                 change:
                        time:   [-14.429% -13.303% -12.348%] (p = 0.00 < 0.05)
                        thrpt:  [+14.088% +15.344% +16.862%]
                        Performance has improved.
CreateHashMap/alice_expull/174693
                        time:   [877.03 µs 880.44 µs 884.67 µs]
                        thrpt:  [188.32 MiB/s 189.22 MiB/s 189.96 MiB/s]
                 change:
                        time:   [-26.460% -26.274% -26.091%] (p = 0.00 < 0.05)
                        thrpt:  [+35.301% +35.637% +35.981%]
                        Performance has improved.
CreateHashMap/numbers_zipf/8000000
                        time:   [9.1198 ms 9.1573 ms 9.1961 ms]
                        thrpt:  [829.64 MiB/s 833.15 MiB/s 836.57 MiB/s]
                 change:
                        time:   [-35.229% -34.828% -34.384%] (p = 0.00 < 0.05)
                        thrpt:  [+52.403% +53.440% +54.390%]
                        Performance has improved.

* clippy

* add bench for ids

* inline(always) to inline whole block with bounds checks

* cleanup
2023-06-08 11:13:52 +02:00
PSeitz
ccb09aaa83 allow histogram bounds to be passed as Rfc3339 (#2076) 2023-06-08 09:07:08 +02:00
Valerii
4b7c485a08 feat: add stop words for Hungarian language (#2069) 2023-06-02 07:26:03 +02:00
PSeitz
3942fc6d2b update CHANGELOG (#2068) 2023-06-02 05:00:12 +02:00
Adam Reichold
b325d569ad Expose phrase-prefix queries via the built-in query parser (#2044)
* Expose phrase-prefix queries via the built-in query parser

This proposes the less-than-imaginative syntax `field:"phrase ter"*` to
perform a phrase prefix query against `field` using `phrase` and `ter` as the
terms. The aim of this is to make this type of query more discoverable and
simplify manual testing.

I did consider exposing the `max_expansions` parameter similar to how slop is
handled, but I think that this is rather something that should be configured via
the querser parser (similar to `set_field_boost` and `set_field_fuzzy`) as
choosing it requires rather intimiate knowledge of the backing index.

* Prevent construction of zero or one term phrase-prefix queries via the query parser.

* Add example using phrase-prefix search via surface API to improve feature discoverability.
2023-06-01 13:03:16 +02:00
Paul Masurel
7ee78bda52 Readding s in datetime precision variant names (#2065)
There is no clear win and it change some serialization in quickwit.
2023-06-01 06:39:46 +02:00
Paul Masurel
184a9daa8a Cancels concurrently running actions for the same PR. (#2067) 2023-06-01 12:57:38 +09:00
Paul Masurel
47e01b345b Simplified linear probing code (#2066) 2023-06-01 04:58:42 +02:00
PSeitz
3af456972e Fix min doc_count empty merge bug (#2057)
This fixes an issue when min_doc==0 loads terms from the dictionary from
one segment and merges the same term with a subaggregation from another
segment.
Previously the empty structure was not correctly initialized to contain
the subaggregation so the merge was incorrect.
2023-05-29 14:20:50 +08:00
PSeitz
e56addc63e enable tokenizer on json fields (#2053)
* enable tokenizer on json fields

enable tokenizer on json fields for type text

* Avoid making the tokenizer within the TextAnalyzer pub(crate)

* Moving BoxableTokenizer to tantivy.

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-05-24 10:47:39 +02:00
dependabot[bot]
4be6f83b0a Update criterion requirement from 0.4 to 0.5 (#2056)
Updates the requirements on [criterion](https://github.com/bheisler/criterion.rs) to permit the latest version.
- [Changelog](https://github.com/bheisler/criterion.rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/bheisler/criterion.rs/compare/0.4.0...0.5.0)

---
updated-dependencies:
- dependency-name: criterion
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-05-24 15:59:51 +09:00
Adrien Guillo
a789ad9aee Rename DatePrecision to DateTimePrecision (#2051) 2023-05-23 17:09:11 +02:00
Sergei Lavrentev
8cf26da4b2 Add possibility to set up highlighten prefix and postfix for snippet (#1422)
* add possibility to change highlight prefix and postfix

* add comment to Snippet::new

* add test for highlighten elements

* add default highlight prefix and postfix constants

* fix spelling

* fix tests

* fix spelling

* do fixes after code review

* reduce test_snippet_generator_custom_highlighted_elements code

* fix fmt

* change names to more convenient

---------

Co-authored-by: Sergei Lavrentev <23312691+lavrxxx@users.noreply.github.com>
2023-05-23 15:09:24 +02:00
trinity-1686a
a3f001360f add support for warming up range of terms (#2042)
* add support for warming up range of terms

* simplify handling of limit
2023-05-22 14:29:35 +02:00
trinity-1686a
6564e0c467 fix phrase prefix query (#2043)
* fix phrase prefix query

it would fail spectacularly when no doc in the segment would match the phrase part of the query

* clippy
2023-05-22 12:36:20 +02:00
Paul Masurel
d7e97331e5 Minor refactoring find field (#2055)
* Minor refactoring

Moving find_field_with_default to Schema.

* Clippy comments
2023-05-22 15:00:48 +09:00
Paul Masurel
4417be165d Minor refactoring (#2054)
Moving find_field_with_default to Schema.
2023-05-22 14:56:38 +09:00
PSeitz
6239697a02 switch to ms in histogram for date type (#2045)
* switch to ms in histogram for date type

switch to ms in histogram, by adding a normalization step that converts
to nanoseconds precision when creating the collector.

closes #2028
related to #2026

* add missing unit long variants

* use single thread to avoid handling test case

* fix docs

* revert CI

* cleanup

* improve docs

* Update src/aggregation/bucket/histogram/histogram.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-05-19 08:15:44 +02:00
Paul Masurel
62709b8094 Change in the query grammar. (#2050)
* Change in the query grammar.

Quotation mark can now be used for phrase queries.
The delimiter is part of the `UserInputLeaf`.
That information is meant to be used in Quickwit to solve #3364.

This PR also adds support for quotation marks escaping in phrase
queries.

* Apply suggestions from code review
2023-05-19 12:07:10 +09:00
PSeitz
04562c0318 add fastfield tokenizer to IndexBuilder (#2046) 2023-05-18 04:33:42 +02:00
PSeitz
2dfe37940d handle multiple types in term aggregation (#2041) 2023-05-15 11:57:38 +02:00
Denis Bazhenov
e248a4959f Enforcing "NOT" and "-" queries consistency in UserInputAst (#1609)
* Enforcing "NOT" and "-" queries consistency in UserInputAst

* Mutable implementation if rewrite_ast_clause()
2023-05-13 00:27:48 +09:00
PSeitz
00c5df610c update termmap benchmark (#2040) 2023-05-12 07:35:06 +02:00
Adam Reichold
fedd9559e7 Expose create a query from a user input AST. (#2039) 2023-05-11 21:53:18 +09:00
Paul Masurel
fe3ecf9567 Added support for madvise (#2036)
Added support for madvise
2023-05-11 05:39:17 +02:00
PSeitz
ba3a885a3b handle multiple agg results (#2035)
handle multiple intermediate aggregation results with the same name.
2023-05-10 15:00:38 +02:00
PSeitz
d1988be8e9 fix and extend benchmark (#2030)
* add benchmark, add missing inlines

* fix stacker bench

* add wiki benchmark

* move line split out of bench
2023-05-10 13:01:56 +02:00
PSeitz
0eafbaab8e fix slop (#2031)
Fix slop by carrying slop so far for multiterms.
Define slop contract in the API
2023-05-10 11:45:14 +02:00
PSeitz
d3357a8426 fix ArenaHashMap default (#2034)
an empty ArenaHashMap is invalid and causes a panic when combined with `get`
2023-05-10 11:39:47 +02:00
Yuri Astrakhan
74275b76a6 Inline format arguments where makes sense (#2038)
Applied this command to the code, making it a bit shorter and slightly
more readable.

```
cargo +nightly clippy --all-features --benches --tests --workspace --fix -- -A clippy::all -W clippy::uninlined_format_args
cargo +nightly fmt --all
```
2023-05-10 18:03:59 +09:00
dependabot[bot]
f479840a1b Update memmap2 requirement from 0.5.3 to 0.6.0 (#2033)
Updates the requirements on [memmap2](https://github.com/RazrFalcon/memmap2-rs) to permit the latest version.
- [Changelog](https://github.com/RazrFalcon/memmap2-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/RazrFalcon/memmap2-rs/compare/v0.5.3...v0.6.0)

---
updated-dependencies:
- dependency-name: memmap2
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2023-05-10 03:50:14 +02:00
PSeitz
4ee1b5cda0 add seperate tokenizer manager for fast fields (#2019)
* add seperate tokenizer manager for fast fields

* rename
2023-05-08 11:22:31 +02:00
PSeitz
45ff0e3c5c clear memory consumption in AggregationLimits (#2022)
* clear memory consumption in AggregationLimits

clear memory consumption in AggregationLimits at the end of segment collection

* switch to ResourceLimitGuard

* unduplicate code

* merge methods

* Apply suggestions from code review

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-05-08 10:15:09 +02:00
PSeitz
4c58b0086d allow slop in both directions (#2020)
* allow slop in both directions

allow slop in both directions
so "big wolf"~3 can also match "wolf big"

This also fixes #1934, when the docsets were reordered by size and didn't
match the terms.

* remove count

* add test for repeating tokens, unduplicate tests
2023-05-07 12:05:21 +09:00
Tomoko Uchida
85df322ceb fix typo in the architecture doc (#2009) 2023-05-07 12:04:07 +09:00
François Massot
38c863830f Merge pull request #2027 from quickwit-oss/fmassot/fix-date-histogram
Fix date histogram bounds and field name.
2023-05-05 13:03:25 +02:00
François Massot
992f755298 Fix clippy. 2023-05-05 10:51:29 +02:00
François Massot
c8df843f96 Fix date histogram bounds and field name. 2023-05-05 00:52:55 +02:00
Paul Masurel
f28ddb711e Exposing u64-based FastFieldRangeWeight (#2024) 2023-05-03 18:32:00 +09:00
tottoto
73452284ae Remove unused crates from dependencies (#2018)
* Remove unused crates from dependencies

* Revert rand to columnar

* Revert criterion to stacker
2023-05-02 12:34:20 +02:00
PSeitz
ba309e18a1 switch to nanosecond precision (#2016) 2023-05-01 03:32:20 +02:00
PSeitz
cbf2bdc75b change bucket count type (#2013)
* change bucket count type

closes #2012

* Update src/aggregation/agg_limits.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* Update src/directory/managed_directory.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* fix test

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-27 15:47:31 +08:00
PSeitz
1f06997d04 fix single collector special case (#2014) 2023-04-27 09:30:19 +02:00
PSeitz
c599bf3b6c chore!:drop JSON support on intermediate agg result (#1992)
* chore!:drop JSON support on intermediate agg result

add support for other formats by removing skip_serialize and untagged
JSON support is broken anyway due it's lack on f64::INF etc. handling

* Update src/aggregation/intermediate_agg_result.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* move from impl

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-26 13:05:16 +02:00
PSeitz
80df1d9835 Handle error for exists on MMapDirectory (#1988)
`exists` will return false in case of other io errors, like permission denied
2023-04-25 09:20:33 +02:00
PSeitz
2e369db936 switch to Aggregation without serde_untagged (#2003)
* refactor result handling

* remove Internal stuff

* merge different accessors

* switch to Aggregation without serde_untagged

* fix doctests
2023-04-25 08:54:51 +02:00
PSeitz
7b31100208 refactor vint (#2010)
- improve performance of vint
vint serialization shows up in performance profiles during indexing.
It would also make sense to limit the value space to u29 and operate on 4 bytes only.
- remove unused code
- add missing inlines
- fix regex test
2023-04-25 08:49:36 +02:00
trinity-1686a
9c93bfeb51 optimise warmup code path (#2007)
* optimise warmup code path

* better function naming
2023-04-21 11:23:09 +02:00
PSeitz
74f9eafefc refactor Term (#2006)
* refactor Term

add ValueBytes for serialized term values
add missing debug for ip
skip unnecessary json path validation
remove code duplication
add DATE_TIME_PRECISION_INDEXED constant
add missing Term clarification
remove weird value_bytes_mut() API

* fix naming
2023-04-20 15:31:43 +02:00
RT_Enzyme
ff3d3313c4 fix BooleanQuery document (#1999)
* fix BooleanQuery document

* Update src/query/boolean_query/boolean_query.rs

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-20 11:37:20 +02:00
Paul Masurel
fbda511a1a Making more things public for quickwit. (#2005) 2023-04-20 11:37:45 +09:00
Adam Reichold
c1defdda05 Bump aho-corasick dependency to version 1.0 and adjust to API changes (#2002)
* Drop additional Arc-layer as the automaton itself is now cheap-to-clone.
* Drop state ID type parameter as it is not exposed by the library any more.
2023-04-18 07:34:30 +02:00
PSeitz
e522163a1c use json in agg tests (#1998)
* switch to JSON in tests, add flat aggregation types

* use method

* clippy

* remove commented file
2023-04-17 14:08:48 +02:00
PSeitz
e83abbfe4a perf: faster term hash map (#1940)
* add term hashmap benchmark

* refactor arena hashmap

add inlines
remove occupied array and use table_entry.is_empty instead (saves 4 bytes per entry)
reduce saturation threshold from 1/3 to 1/2 to reduce memory
use u32 for UnorderedId (we have the 4billion limit anyways on the Columnar stuff)
fix naming LinearProbing
remove byteorder dependency

memory consumption went down from 2Gb to 1.8GB on indexing wikipedia dataset in tantivy

* Update stacker/src/arena_hashmap.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-17 09:07:33 +02:00
trinity-1686a
780e26331d sstable compression (#1946)
* compress sstable with zstd

* add some details to sstable readme

* compress only block which benefit from it

* multiple changes to sstable

make compression optional
use OwnedBytes instead of impl Read in sstable, required for next point
use zstd bulk api, which is much faster on small records

* cleanup and use bulk api for compression

* use dedicated byte for compression

* switch block len and compression flag

* change default zstd level in sstable
2023-04-14 16:25:50 +02:00
trinity-1686a
0286ecea09 re-export a few sstable functions on dicitonary (#1996)
* re-export a few sstable functions on dicitonary

* Update documentation

Co-authored-by: François Massot <francois.massot@gmail.com>

---------

Co-authored-by: François Massot <francois.massot@gmail.com>
2023-04-14 11:13:48 +02:00
PSeitz
b0ef9a6252 use crates.io dependency (#1990) 2023-04-14 09:35:20 +08:00
François Massot
36138c493b Merge pull request #1994 from quickwit-oss/fmassot/expose-simple-token-stream
Expose `SimpleTokenStream` to use it in quickwit for the multilanguage tokenizer
2023-04-13 18:55:02 +02:00
François Massot
64bce340b2 Expose to use it in quickwit. 2023-04-13 18:28:53 +02:00
trinity-1686a
205e8a0a92 encode dictionary type in fst footer (#1968)
* encode additional footer for dictionary kind in fst
2023-04-12 09:43:01 +02:00
Paul Masurel
4b01cc4c49 Made BooleanWeight and BoostWeight public (#1991) 2023-04-12 10:26:30 +09:00
PSeitz
0ed13eeea8 add sparse to agg benchmark (#1986)
* add sparse to agg benchmark

* Update src/aggregation/agg_bench.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-11 08:13:32 +02:00
Tony-X
91a38058fe Fix typo in READEME.md (#1989) 2023-04-11 12:07:20 +09:00
PSeitz
41af70799d add percentiles aggregations (#1984)
* add percentiles aggregations

add percentiles aggregation
fix disabled agg benchmark

* Update src/aggregation/metric/percentiles.rs

Co-authored-by: Paul Masurel <paul@quickwit.io>

* Apply suggestions from code review

Co-authored-by: Paul Masurel <paul@quickwit.io>

* fix import

* fix import

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-07 07:18:28 +02:00
Paul Masurel
f853bf204b Align the numerical type priority order with columnar. (#1978)
Closes #1956
2023-04-07 10:07:54 +09:00
Tony-X
11ae48d3bc Update benchmarks section in READEME.md to link to the bench repo (#1985)
* Update benchmarks section in READEME.md to link to the bench repo

* Apply suggestions from code review

---------

Co-authored-by: Paul Masurel <paul@quickwit.io>
2023-04-07 10:07:06 +09:00
Paul Masurel
5eb12173d6 Proptest merge columnar (#1976)
* Added proptest on columnar merge with a shuffle

Made column serialization more explicit.
Bugfix when a bytes column is missing, and with a shuffle.
Improved the cardinality detection logic / column detection.

* Code review

* CR comments

* Following CR
2023-04-04 11:28:42 +09:00
197 changed files with 10925 additions and 5093 deletions

View File

@@ -6,6 +6,11 @@ on:
pull_request:
branches: [main]
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
coverage:
runs-on: ubuntu-latest

View File

@@ -8,6 +8,11 @@ env:
CARGO_TERM_COLOR: always
NUM_FUNCTIONAL_TEST_ITERATIONS: 20000
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
test:

View File

@@ -9,6 +9,11 @@ on:
env:
CARGO_TERM_COLOR: always
# Ensures that we cancel running jobs for the same PR / same workflow.
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
cancel-in-progress: true
jobs:
check:

View File

@@ -254,7 +254,7 @@ The token positions of all of the terms are then stored in a separate file with
The [TermInfo](src/postings/term_info.rs) gives an offset (expressed in position this time) in this file. As we iterate through the docset,
we advance the position reader by the number of term frequencies of the current document.
## [fieldnorms/](src/fieldnorms): Here is my doc, how many tokens in this field?
## [fieldnorm/](src/fieldnorm): Here is my doc, how many tokens in this field?
The [BM25](https://en.wikipedia.org/wiki/Okapi_BM25) formula also requires to know the number of tokens stored in a specific field for a given document. We store this information on one byte per document in the fieldnorm.
The fieldnorm is therefore compressed. Values up to 40 are encoded unchanged.

View File

@@ -1,3 +1,81 @@
Tantivy 0.20 [Unreleased]
================================
#### Bugfixes
- Fix phrase queries with slop (slop supports now transpositions, algorithm that carries slop so far for num terms > 2) [#2031](https://github.com/quickwit-oss/tantivy/issues/2031)[#2020](https://github.com/quickwit-oss/tantivy/issues/2020)(@PSeitz)
- Handle error for exists on MMapDirectory [#1988](https://github.com/quickwit-oss/tantivy/issues/1988) (@PSeitz)
- Aggregation
- Fix min doc_count empty merge bug [#2057](https://github.com/quickwit-oss/tantivy/issues/2057) (@PSeitz)
- Fix: Sort order for term aggregations (sort order on key was inverted) [#1858](https://github.com/quickwit-oss/tantivy/issues/1858) (@PSeitz)
#### Features/Improvements
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
- Unified access for fast fields over different cardinalities.
- Unified storage for typed and untyped fields.
- Move fastfield codecs into columnar. [#1782](https://github.com/quickwit-oss/tantivy/issues/1782) (@fulmicoton)
- Sparse dense index for optional values [#1716](https://github.com/quickwit-oss/tantivy/issues/1716) (@PSeitz)
- Switch to nanosecond precision in DateTime fastfield [#2016](https://github.com/quickwit-oss/tantivy/issues/2016) (@PSeitz)
- **Aggregation**
- Add `date_histogram` aggregation (only `fixed_interval` for now) [#1900](https://github.com/quickwit-oss/tantivy/issues/1900) (@PSeitz)
- Add `percentiles` aggregations [#1984](https://github.com/quickwit-oss/tantivy/issues/1984) (@PSeitz)
- [**breaking**] Drop JSON support on intermediate agg result (we use postcard as format in `quickwit` to send intermediate results) [#1992](https://github.com/quickwit-oss/tantivy/issues/1992) (@PSeitz)
- Set memory limit in bytes for aggregations after which they abort (Previously there was only the bucket limit) [#1942](https://github.com/quickwit-oss/tantivy/issues/1942)[#1957](https://github.com/quickwit-oss/tantivy/issues/1957)(@PSeitz)
- Add support for u64,i64,f64 fields in term aggregation [#1883](https://github.com/quickwit-oss/tantivy/issues/1883) (@PSeitz)
- Allow histogram bounds to be passed as Rfc3339 [#2076](https://github.com/quickwit-oss/tantivy/issues/2076) (@PSeitz)
- Add count, min, max, and sum aggregations [#1794](https://github.com/quickwit-oss/tantivy/issues/1794) (@guilload)
- Switch to Aggregation without serde_untagged => better deserialization errors. [#2003](https://github.com/quickwit-oss/tantivy/issues/2003) (@PSeitz)
- Switch to ms in histogram for date type (ES compatibility) [#2045](https://github.com/quickwit-oss/tantivy/issues/2045) (@PSeitz)
- Reduce term aggregation memory consumption [#2013](https://github.com/quickwit-oss/tantivy/issues/2013) (@PSeitz)
- Reduce agg memory consumption: Replace generic aggregation collector (which has a high memory requirement per instance) in aggregation tree with optimized versions behind a trait.
- Split term collection count and sub_agg (Faster term agg with less memory consumption for cases without sub-aggs) [#1921](https://github.com/quickwit-oss/tantivy/issues/1921) (@PSeitz)
- Schemaless aggregations: In combination with stacker tantivy supports now schemaless aggregations via the JSON type.
- Add aggregation support for JSON type [#1888](https://github.com/quickwit-oss/tantivy/issues/1888) (@PSeitz)
- Mixed types support on JSON fields in aggs [#1971](https://github.com/quickwit-oss/tantivy/issues/1971) (@PSeitz)
- Perf: Fetch blocks of vals in aggregation for all cardinality [#1950](https://github.com/quickwit-oss/tantivy/issues/1950) (@PSeitz)
- `Searcher` with disabled scoring via `EnableScoring::Disabled` [#1780](https://github.com/quickwit-oss/tantivy/issues/1780) (@shikhar)
- Enable tokenizer on json fields [#2053](https://github.com/quickwit-oss/tantivy/issues/2053) (@PSeitz)
- Enforcing "NOT" and "-" queries consistency in UserInputAst [#1609](https://github.com/quickwit-oss/tantivy/issues/1609) (@bazhenov)
- Faster indexing
- Refactor tokenization pipeline to use GATs [#1924](https://github.com/quickwit-oss/tantivy/issues/1924) (@trinity-1686a)
- Faster term hash map [#2058](https://github.com/quickwit-oss/tantivy/issues/2058)[#1940](https://github.com/quickwit-oss/tantivy/issues/1940) (@PSeitz)
- Refactor vint [#2010](https://github.com/quickwit-oss/tantivy/issues/2010) (@PSeitz)
- Faster search
- Work in batches of docs on the SegmentCollector (Only for cases without score for now) [#1937](https://github.com/quickwit-oss/tantivy/issues/1937) (@PSeitz)
- Faster fast field range queries using SIMD [#1954](https://github.com/quickwit-oss/tantivy/issues/1954) (@fulmicoton)
- Improve fast field range query performance [#1864](https://github.com/quickwit-oss/tantivy/issues/1864) (@PSeitz)
- Make BM25 scoring more flexible [#1855](https://github.com/quickwit-oss/tantivy/issues/1855) (@alexcole)
- Switch fs2 to fs4 as it is now unmaintained and does not support illumos [#1944](https://github.com/quickwit-oss/tantivy/issues/1944) (@Toasterson)
- Made BooleanWeight and BoostWeight public [#1991](https://github.com/quickwit-oss/tantivy/issues/1991) (@fulmicoton)
- Make index compatible with virtual drives on Windows [#1843](https://github.com/quickwit-oss/tantivy/issues/1843) (@gyk)
- Add stop words for Hungarian language [#2069](https://github.com/quickwit-oss/tantivy/issues/2069) (@tnxbutno)
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
- PhrasePrefixQuery is supported in the query parser via: `field:"phrase ter"*` [#2044](https://github.com/quickwit-oss/tantivy/issues/2044) (@adamreichold)
- Docs
- Update examples for literate docs [#1880](https://github.com/quickwit-oss/tantivy/issues/1880) (@PSeitz)
- Add ip field example [#1775](https://github.com/quickwit-oss/tantivy/issues/1775) (@PSeitz)
- Fix doc store cache documentation [#1821](https://github.com/quickwit-oss/tantivy/issues/1821) (@PSeitz)
- Fix BooleanQuery document [#1999](https://github.com/quickwit-oss/tantivy/issues/1999) (@RT_Enzyme)
- Update comments in the faceted search example [#1737](https://github.com/quickwit-oss/tantivy/issues/1737) (@DawChihLiou)
Tantivy 0.19
================================
#### Bugfixes

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.19.0"
version = "0.20.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]
@@ -12,6 +12,7 @@ readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.62"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
oneshot = "0.1.5"
@@ -20,10 +21,10 @@ byteorder = "1.4.3"
crc32fast = "1.3.2"
once_cell = "1.10.0"
regex = { version = "1.5.5", default-features = false, features = ["std", "unicode"] }
aho-corasick = "0.7"
aho-corasick = "1.0"
tantivy-fst = "0.4.0"
memmap2 = { version = "0.5.3", optional = true }
lz4_flex = { version = "0.10", default-features = false, features = ["checked-decode"], optional = true }
memmap2 = { version = "0.7.1", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
brotli = { version = "3.3.4", optional = true }
zstd = { version = "0.12", optional = true, default-features = false }
snap = { version = "1.0.5", optional = true }
@@ -43,7 +44,7 @@ census = "0.4.0"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
htmlescape = "0.3.1"
fail = "0.5.0"
fail = { version = "0.5.0", optional = true }
murmurhash32 = "0.3.0"
time = { version = "0.3.10", features = ["serde-well-known"] }
smallvec = "1.8.0"
@@ -55,13 +56,15 @@ measure_time = "0.8.2"
async-trait = "0.1.53"
arc-swap = "1.5.0"
columnar = { version="0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version="0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version="0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.19.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.3", path="./bitpacker" }
columnar = { version= "0.1", path="./columnar", package ="tantivy-columnar" }
sstable = { version= "0.1", path="./sstable", package ="tantivy-sstable", optional = true }
stacker = { version= "0.1", path="./stacker", package ="tantivy-stacker" }
query-grammar = { version= "0.20.0", path="./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version= "0.4", path="./bitpacker" }
common = { version= "0.5", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version="0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
tokenizer-api = { version= "0.1", path="./tokenizer-api", package="tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
futures-util = { version = "0.3.28", optional = true }
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
@@ -72,12 +75,14 @@ maplit = "1.0.2"
matches = "0.1.9"
pretty_assertions = "1.2.1"
proptest = "1.0.0"
criterion = "0.4"
criterion = "0.5"
test-log = "0.2.10"
env_logger = "0.10.0"
pprof = { version = "0.11.0", features = ["flamegraph", "criterion"] }
pprof = { git = "https://github.com/PSeitz/pprof-rs/", rev = "53af24b", features = ["flamegraph", "criterion"] } # temp fork that works with criterion 0.5
futures = "0.3.21"
paste = "1.0.11"
more-asserts = "0.3.1"
rand_distr = "0.4.3"
[dev-dependencies.fail]
version = "0.5.0"
@@ -88,6 +93,11 @@ opt-level = 3
debug = false
debug-assertions = false
[profile.bench]
opt-level = 3
debug = true
debug-assertions = false
[profile.test]
debug-assertions = true
overflow-checks = true
@@ -102,10 +112,10 @@ lz4-compression = ["lz4_flex"]
snappy-compression = ["snap"]
zstd-compression = ["zstd"]
failpoints = ["fail/failpoints"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable"]
quickwit = ["sstable", "futures-util"]
[workspace]
members = ["query-grammar", "bitpacker", "common", "ownedbytes", "stacker", "sstable", "tokenizer-api", "columnar"]
@@ -129,4 +139,3 @@ harness = false
[[bench]]
name = "index-bench"
harness = false

View File

@@ -1,5 +1,5 @@
test:
echo "Run test only... No examples."
@echo "Run test only... No examples."
cargo test --tests --lib
fmt:

View File

@@ -26,6 +26,8 @@ Your mileage WILL vary depending on the nature of queries and their load.
<img src="doc/assets/images/searchbenchmark.png">
Details about the benchmark can be found at this [repository](https://github.com/quickwit-oss/search-benchmark-game).
# Features
- Full-text search

21
RELEASE.md Normal file
View File

@@ -0,0 +1,21 @@
# Release a new Tantivy Version
## Steps
1. Identify new packages in workspace since last release
2. Identify changed packages in workspace since last release
3. Bump version in `Cargo.toml` and their dependents for all changed packages
4. Update version of root `Cargo.toml`
5. Publish version starting with leaf nodes
6. Set git tag with new version
In conjucation with `cargo-release` Steps 1-4 (I'm not sure if the change detection works):
Set new packages to version 0.0.0
Replace prev-tag-name
```bash
cargo release --workspace --no-publish -v --prev-tag-name 0.19 --push-remote origin minor --no-tag --execute
```
no-tag or it will create tags for all the subpackages

View File

@@ -1,23 +0,0 @@
# Appveyor configuration template for Rust using rustup for Rust installation
# https://github.com/starkat99/appveyor-rust
os: Visual Studio 2015
environment:
matrix:
- channel: stable
target: x86_64-pc-windows-msvc
install:
- appveyor DownloadFile https://win.rustup.rs/ -FileName rustup-init.exe
- rustup-init -yv --default-toolchain %channel% --default-host %target%
- set PATH=%PATH%;%USERPROFILE%\.cargo\bin
- if defined msys_bits set PATH=%PATH%;C:\msys64\mingw%msys_bits%\bin
- rustc -vV
- cargo -vV
build: false
test_script:
- REM SET RUST_LOG=tantivy,test & cargo test --all --verbose --no-default-features --features lz4-compression --features mmap
- REM SET RUST_LOG=tantivy,test & cargo test test_store --verbose --no-default-features --features lz4-compression --features snappy-compression --features brotli-compression --features mmap
- REM SET RUST_BACKTRACE=1 & cargo build --examples

View File

@@ -1,11 +1,13 @@
use criterion::{criterion_group, criterion_main, Criterion};
use tantivy::tokenizer::TokenizerManager;
use tantivy::tokenizer::{
LowerCaser, RemoveLongFilter, SimpleTokenizer, TextAnalyzer, TokenizerManager,
};
const ALICE_TXT: &str = include_str!("alice.txt");
pub fn criterion_benchmark(c: &mut Criterion) {
let tokenizer_manager = TokenizerManager::default();
let tokenizer = tokenizer_manager.get("default").unwrap();
let mut tokenizer = tokenizer_manager.get("default").unwrap();
c.bench_function("default-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
@@ -16,7 +18,26 @@ pub fn criterion_benchmark(c: &mut Criterion) {
assert_eq!(word_count, 30_731);
})
});
let mut dynamic_analyzer = TextAnalyzer::builder(SimpleTokenizer::default())
.dynamic()
.filter_dynamic(RemoveLongFilter::limit(40))
.filter_dynamic(LowerCaser)
.build();
c.bench_function("dynamic-tokenize-alice", |b| {
b.iter(|| {
let mut word_count = 0;
let mut token_stream = dynamic_analyzer.token_stream(ALICE_TXT);
while token_stream.advance() {
word_count += 1;
}
assert_eq!(word_count, 30_731);
})
});
}
criterion_group!(benches, criterion_benchmark);
criterion_group! {
name = benches;
config = Criterion::default().sample_size(200);
targets = criterion_benchmark
}
criterion_main!(benches);

1000
benches/gh.json Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1,10 +1,15 @@
use criterion::{criterion_group, criterion_main, Criterion};
use criterion::{criterion_group, criterion_main, Criterion, Throughput};
use pprof::criterion::{Output, PProfProfiler};
use tantivy::schema::{INDEXED, STORED, STRING, TEXT};
use tantivy::schema::{FAST, INDEXED, STORED, STRING, TEXT};
use tantivy::Index;
const HDFS_LOGS: &str = include_str!("hdfs.json");
const NUM_REPEATS: usize = 2;
const GH_LOGS: &str = include_str!("gh.json");
const WIKI: &str = include_str!("wiki.json");
fn get_lines(input: &str) -> Vec<&str> {
input.trim().split('\n').collect()
}
pub fn hdfs_index_benchmark(c: &mut Criterion) {
let schema = {
@@ -28,85 +33,147 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
};
let mut group = c.benchmark_group("index-hdfs");
group.throughput(Throughput::Bytes(HDFS_LOGS.len() as u64));
group.sample_size(20);
group.bench_function("index-hdfs-no-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-hdfs-with-commit", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-hdfs-with-commit-with-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(schema_with_store.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let doc = schema.parse_document(doc_json).unwrap();
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-no-commit-json-without-docstore", |b| {
let lines = get_lines(HDFS_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
group.bench_function("index-hdfs-with-commit-json-without-docstore", |b| {
}
pub fn gh_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-gh");
group.throughput(Throughput::Bytes(GH_LOGS.len() as u64));
group.bench_function("index-gh-no-commit", |b| {
let lines = get_lines(GH_LOGS);
b.iter(|| {
let index = Index::create_in_ram(dynamic_schema.clone());
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-gh-with-commit", |b| {
let lines = get_lines(GH_LOGS);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for _ in 0..NUM_REPEATS {
for doc_json in HDFS_LOGS.trim().split('\n') {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
});
}
pub fn wiki_index_benchmark(c: &mut Criterion) {
let dynamic_schema = {
let mut schema_builder = tantivy::schema::SchemaBuilder::new();
schema_builder.add_json_field("json", TEXT | FAST);
schema_builder.build()
};
let mut group = c.benchmark_group("index-wiki");
group.throughput(Throughput::Bytes(WIKI.len() as u64));
group.bench_function("index-wiki-no-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
})
});
group.bench_function("index-wiki-with-commit", |b| {
let lines = get_lines(WIKI);
b.iter(|| {
let json_field = dynamic_schema.get_field("json").unwrap();
let index = Index::create_in_ram(dynamic_schema.clone());
let mut index_writer = index.writer_with_num_threads(1, 100_000_000).unwrap();
for doc_json in &lines {
let json_val: serde_json::Map<String, serde_json::Value> =
serde_json::from_str(doc_json).unwrap();
let doc = tantivy::doc!(json_field=>json_val);
index_writer.add_document(doc).unwrap();
}
index_writer.commit().unwrap();
})
@@ -115,7 +182,17 @@ pub fn hdfs_index_benchmark(c: &mut Criterion) {
criterion_group! {
name = benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
config = Criterion::default();
targets = hdfs_index_benchmark
}
criterion_main!(benches);
criterion_group! {
name = gh_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = gh_index_benchmark
}
criterion_group! {
name = wiki_benches;
config = Criterion::default().with_profiler(PProfProfiler::new(100, Output::Flamegraph(None)));
targets = wiki_index_benchmark
}
criterion_main!(benches, gh_benches, wiki_benches);

1000
benches/wiki.json Normal file

File diff suppressed because one or more lines are too long

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.3.0"
version = "0.4.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

View File

@@ -1,5 +1,5 @@
//! SIMD filtering of a vector as described in the following blog post.
//! https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512
//! <https://quickwit.io/blog/filtering%20a%20vector%20with%20simd%20instructions%20avx-2%20and%20avx-512>
use std::arch::x86_64::{
__m256i as DataType, _mm256_add_epi32 as op_add, _mm256_cmpgt_epi32 as op_greater,
_mm256_lddqu_si256 as load_unaligned, _mm256_or_si256 as op_or, _mm256_set1_epi32 as set1,

View File

@@ -1,6 +1,6 @@
use std::ops::RangeInclusive;
#[cfg(any(target_arch = "x86_64"))]
#[cfg(target_arch = "x86_64")]
mod avx2;
mod scalar;

89
cliff.toml Normal file
View File

@@ -0,0 +1,89 @@
# configuration file for git-cliff{ pattern = "foo", replace = "bar"}
# see https://github.com/orhun/git-cliff#configuration-file
[changelog]
# changelog header
header = """
"""
# template for the changelog body
# https://tera.netlify.app/docs/#introduction
body = """
{% if version %}\
{{ version | trim_start_matches(pat="v") }} ({{ timestamp | date(format="%Y-%m-%d") }})
==================
{% else %}\
## [unreleased]
{% endif %}\
{% for commit in commits %}
- {% if commit.breaking %}[**breaking**] {% endif %}{{ commit.message | split(pat="\n") | first | trim | upper_first }}(@{{ commit.author.name }})\
{% endfor %}
"""
# remove the leading and trailing whitespace from the template
trim = true
# changelog footer
footer = """
"""
postprocessors = [
{ pattern = 'Paul Masurel', replace = "fulmicoton"}, # replace with github user
{ pattern = 'PSeitz', replace = "PSeitz"}, # replace with github user
{ pattern = 'Adam Reichold', replace = "adamreichold"}, # replace with github user
{ pattern = 'trinity-1686a', replace = "trinity-1686a"}, # replace with github user
{ pattern = 'Michael Kleen', replace = "mkleen"}, # replace with github user
{ pattern = 'Adrien Guillo', replace = "guilload"}, # replace with github user
{ pattern = 'François Massot', replace = "fmassot"}, # replace with github user
{ pattern = '', replace = ""}, # replace with github user
]
[git]
# parse the commits based on https://www.conventionalcommits.org
# This is required or commit.message contains the whole commit message and not just the title
conventional_commits = true
# filter out the commits that are not conventional
filter_unconventional = false
# process each line of a commit as an individual commit
split_commits = false
# regex for preprocessing the commit messages
commit_preprocessors = [
{ pattern = '\((\w+\s)?#([0-9]+)\)', replace = "[#${2}](https://github.com/quickwit-oss/tantivy/issues/${2})"}, # replace issue numbers
]
#link_parsers = [
#{ pattern = "#(\\d+)", href = "https://github.com/quickwit-oss/tantivy/pulls/$1"},
#]
# regex for parsing and grouping commits
commit_parsers = [
{ message = "^feat", group = "Features"},
{ message = "^fix", group = "Bug Fixes"},
{ message = "^doc", group = "Documentation"},
{ message = "^perf", group = "Performance"},
{ message = "^refactor", group = "Refactor"},
{ message = "^style", group = "Styling"},
{ message = "^test", group = "Testing"},
{ message = "^chore\\(release\\): prepare for", skip = true},
{ message = "(?i)clippy", skip = true},
{ message = "(?i)dependabot", skip = true},
{ message = "(?i)fmt", skip = true},
{ message = "(?i)bump", skip = true},
{ message = "(?i)readme", skip = true},
{ message = "(?i)comment", skip = true},
{ message = "(?i)spelling", skip = true},
{ message = "^chore", group = "Miscellaneous Tasks"},
{ body = ".*security", group = "Security"},
{ message = ".*", group = "Other", default_scope = "other"},
]
# protect breaking changes from being skipped due to matching a skipping commit_parser
protect_breaking_commits = false
# filter out the commits that are not matched by commit parsers
filter_commits = false
# glob pattern for matching git tags
tag_pattern = "v[0-9]*"
# regex for skipping tags
skip_tags = "v0.1.0-beta.1"
# regex for ignoring tags
ignore_tags = ""
# sort the tags topologically
topo_order = false
# sort the commits inside sections by oldest/newest order
sort_commits = "newest"
# limit the number of commits included in the changelog.
# limit_commits = 42

View File

@@ -3,26 +3,26 @@ name = "tantivy-columnar"
version = "0.1.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"
description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.10.5"
log = "0.4.17"
fnv = "1.0.7"
fastdivide = "0.4.0"
rand = { version = "0.8.5", optional = true }
measure_time = { version = "0.8.2", optional = true }
prettytable-rs = { version = "0.10.0", optional = true }
stacker = { path = "../stacker", package="tantivy-stacker"}
sstable = { path = "../sstable", package = "tantivy-sstable" }
common = { path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.3", path = "../bitpacker/" }
stacker = { version= "0.1", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.1", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.5", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.4", path = "../bitpacker/" }
serde = "1.0.152"
[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8.5"
rand = "0.8"
[features]
unstable = []

View File

@@ -1,19 +1,73 @@
mod shuffled;
mod stacked;
use common::ReadOnlyBitSet;
use shuffled::merge_column_index_shuffled;
use stacked::merge_column_index_stacked;
use crate::column_index::SerializableColumnIndex;
use crate::{Cardinality, ColumnIndex, MergeRowOrder};
// For simplification, we never have cardinality go down due to deletes.
fn detect_cardinality(columns: &[ColumnIndex]) -> Cardinality {
columns
.iter()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full)
fn detect_cardinality_single_column_index(
column_index: &ColumnIndex,
alive_bitset_opt: &Option<ReadOnlyBitSet>,
) -> Cardinality {
let Some(alive_bitset) = alive_bitset_opt else {
return column_index.get_cardinality();
};
let cardinality_before_deletes = column_index.get_cardinality();
if cardinality_before_deletes == Cardinality::Full {
// The columnar cardinality can only become more restrictive in the presence of deletes
// (where cardinality sorted from the more restrictive to the least restrictive are Full,
// Optional, Multivalued)
//
// If we are already "Full", we are guaranteed to stay "Full" after deletes.
return Cardinality::Full;
}
let mut cardinality_so_far = Cardinality::Full;
for doc_id in alive_bitset.iter() {
let num_values = column_index.value_row_ids(doc_id).len();
let row_cardinality = match num_values {
0 => Cardinality::Optional,
1 => Cardinality::Full,
_ => Cardinality::Multivalued,
};
cardinality_so_far = cardinality_so_far.max(row_cardinality);
if cardinality_so_far >= cardinality_before_deletes {
// There won't be any improvement in the cardinality.
// We can early exit.
return cardinality_before_deletes;
}
}
cardinality_so_far
}
fn detect_cardinality(
column_indexes: &[ColumnIndex],
merge_row_order: &MergeRowOrder,
) -> Cardinality {
match merge_row_order {
MergeRowOrder::Stack(_) => column_indexes
.iter()
.map(ColumnIndex::get_cardinality)
.max()
.unwrap_or(Cardinality::Full),
MergeRowOrder::Shuffled(shuffle_merge_order) => {
let mut merged_cardinality = Cardinality::Full;
for (column_index, alive_bitset_opt) in column_indexes
.iter()
.zip(shuffle_merge_order.alive_bitsets.iter())
{
let cardinality: Cardinality =
detect_cardinality_single_column_index(column_index, alive_bitset_opt);
if cardinality == Cardinality::Multivalued {
return cardinality;
}
merged_cardinality = merged_cardinality.max(cardinality);
}
merged_cardinality
}
}
}
pub fn merge_column_index<'a>(
@@ -22,7 +76,7 @@ pub fn merge_column_index<'a>(
) -> SerializableColumnIndex<'a> {
// For simplification, we do not try to detect whether the cardinality could be
// downgraded thanks to deletes.
let cardinality_after_merge = detect_cardinality(columns);
let cardinality_after_merge = detect_cardinality(columns, merge_row_order);
match merge_row_order {
MergeRowOrder::Stack(stack_merge_order) => {
merge_column_index_stacked(columns, cardinality_after_merge, stack_merge_order)
@@ -44,34 +98,54 @@ mod tests {
use crate::column_index::merge::detect_cardinality;
use crate::column_index::multivalued_index::MultiValueIndex;
use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
use crate::{Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder};
use crate::{
Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,
};
#[test]
fn test_detect_cardinality() {
assert_eq!(detect_cardinality(&[]), Cardinality::Full);
assert_eq!(
detect_cardinality(&[], &StackMergeOrder::stack_for_test(&[]).into()),
Cardinality::Full
);
let optional_index: ColumnIndex = OptionalIndex::for_test(1, &[]).into();
let multivalued_index: ColumnIndex = MultiValueIndex::for_test(&[0, 1]).into();
assert_eq!(
detect_cardinality(&[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }]),
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Empty { num_docs: 0 }],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(&[optional_index.clone(), ColumnIndex::Full]),
detect_cardinality(
&[optional_index.clone(), ColumnIndex::Full],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
Cardinality::Optional
);
assert_eq!(
detect_cardinality(&[
multivalued_index.clone(),
ColumnIndex::Empty { num_docs: 0 }
]),
detect_cardinality(
&[
multivalued_index.clone(),
ColumnIndex::Empty { num_docs: 0 }
],
&StackMergeOrder::stack_for_test(&[1, 0]).into()
),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(&[multivalued_index.clone(), optional_index.clone()]),
detect_cardinality(
&[multivalued_index.clone(), optional_index.clone()],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
Cardinality::Multivalued
);
assert_eq!(
detect_cardinality(&[optional_index, multivalued_index]),
detect_cardinality(
&[optional_index, multivalued_index],
&StackMergeOrder::stack_for_test(&[1, 1]).into()
),
Cardinality::Multivalued
);
}
@@ -94,8 +168,9 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5]);
}
@@ -126,8 +201,9 @@ mod tests {
)
.into();
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index
else { panic!("Excpected a multivalued index") };
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
panic!("Excpected a multivalued index")
};
let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
assert_eq!(&start_indexes, &[0, 3, 5, 6]);
}

View File

@@ -157,7 +157,13 @@ mod tests {
Cardinality::Optional,
&shuffle_merge_order,
);
let SerializableColumnIndex::Optional { non_null_row_ids, num_rows } = serializable_index else { panic!() };
let SerializableColumnIndex::Optional {
non_null_row_ids,
num_rows,
} = serializable_index
else {
panic!()
};
assert_eq!(num_rows, 2);
let non_null_rows: Vec<RowId> = non_null_row_ids.boxed_iter().collect();
assert_eq!(&non_null_rows, &[1]);

View File

@@ -2,7 +2,7 @@
//! # `fastfield_codecs`
//!
//! - Columnar storage of data for tantivy [`Column`].
//! - Columnar storage of data for tantivy [`crate::Column`].
//! - Encode data in different codecs.
//! - Monotonically map values to u64/u128

View File

@@ -139,12 +139,12 @@ impl MonotonicallyMappableToU64 for i64 {
impl MonotonicallyMappableToU64 for DateTime {
#[inline(always)]
fn to_u64(self) -> u64 {
common::i64_to_u64(self.into_timestamp_micros())
common::i64_to_u64(self.into_timestamp_nanos())
}
#[inline(always)]
fn from_u64(val: u64) -> Self {
DateTime::from_timestamp_micros(common::u64_to_i64(val))
DateTime::from_timestamp_nanos(common::u64_to_i64(val))
}
}

View File

@@ -83,7 +83,8 @@ impl ColumnValues for BitpackedReader {
doc_id_range: Range<u32>,
positions: &mut Vec<u32>,
) {
let Some(transformed_range) = transform_range_before_linear_transformation(&self.stats, range)
let Some(transformed_range) =
transform_range_before_linear_transformation(&self.stats, range)
else {
positions.clear();
return;

View File

@@ -27,7 +27,7 @@ pub struct StatsCollector {
// This is the same as computing the difference between the values and the first value.
//
// This way, we can compress i64-converted-to-u64 (e.g. timestamp that were supplied in
// seconds, only to be converted in microseconds).
// seconds, only to be converted in nanoseconds).
increment_gcd_opt: Option<(NonZeroU64, DividerU64)>,
first_value_opt: Option<u64>,
}

View File

@@ -34,7 +34,7 @@ impl fmt::Display for ColumnType {
ColumnType::IpAddr => "ip",
ColumnType::DateTime => "datetime",
};
write!(f, "{}", short_str)
write!(f, "{short_str}")
}
}
@@ -54,6 +54,9 @@ impl ColumnType {
pub fn to_code(self) -> u8 {
self as u8
}
pub fn is_date_time(&self) -> bool {
self == &ColumnType::DateTime
}
pub(crate) fn try_from_code(code: u8) -> Result<ColumnType, InvalidData> {
COLUMN_TYPES.get(code as usize).copied().ok_or(InvalidData)

View File

@@ -1,7 +1,7 @@
use std::io::{self, Write};
use common::{BitSet, CountingWriter, ReadOnlyBitSet};
use sstable::{SSTable, TermOrdinal};
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};
use super::term_merger::TermMerger;
use crate::column::serialize_column_mappable_to_u64;
@@ -52,18 +52,23 @@ impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
impl<'a> RemappedTermOrdinalsValues<'a> {
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
let iter = self.bytes_columns.iter().flatten().enumerate().flat_map(
move |(seg_ord_with_column, bytes_column)| {
let term_ord_after_merge_mapping = self
.term_ord_mapping
.get_segment(seg_ord_with_column as u32);
let iter = self
.bytes_columns
.iter()
.enumerate()
.flat_map(|(seg_ord, bytes_column_opt)| {
let bytes_column = bytes_column_opt.as_ref()?;
Some((seg_ord, bytes_column))
})
.flat_map(move |(seg_ord, bytes_column)| {
let term_ord_after_merge_mapping =
self.term_ord_mapping.get_segment(seg_ord as u32);
bytes_column
.ords()
.values
.iter()
.map(move |term_ord| term_ord_after_merge_mapping[term_ord as usize])
},
);
});
Box::new(iter)
}
@@ -121,10 +126,15 @@ fn serialize_merged_dict(
let mut term_ord_mapping = TermOrdinalMapping::default();
let mut field_term_streams = Vec::new();
for column in bytes_columns.iter().flatten() {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms = column.dictionary.stream()?;
field_term_streams.push(terms);
for column_opt in bytes_columns.iter() {
if let Some(column) = column_opt {
term_ord_mapping.add_segment(column.dictionary.num_terms());
let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
field_term_streams.push(terms);
} else {
term_ord_mapping.add_segment(0);
field_term_streams.push(Streamer::empty());
}
}
let mut merged_terms = TermMerger::new(field_term_streams);

View File

@@ -11,6 +11,17 @@ pub struct StackMergeOrder {
}
impl StackMergeOrder {
#[cfg(test)]
pub fn stack_for_test(num_rows_per_columnar: &[u32]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(num_rows_per_columnar.len());
let mut cumulated_row_id = 0;
for &num_rows in num_rows_per_columnar {
cumulated_row_id += num_rows;
cumulated_row_ids.push(cumulated_row_id);
}
StackMergeOrder { cumulated_row_ids }
}
pub fn stack(columnars: &[&ColumnarReader]) -> StackMergeOrder {
let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
let mut cumulated_row_id = 0;
@@ -41,8 +52,8 @@ pub enum MergeRowOrder {
/// Columnar tables are simply stacked one above the other.
/// If the i-th columnar_readers has n_rows_i rows, then
/// in the resulting columnar,
/// rows [r0..n_row_0) contains the row of columnar_readers[0], in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of columnar_readers[1], in order.
/// rows [r0..n_row_0) contains the row of `columnar_readers[0]`, in ordder
/// rows [n_row_0..n_row_0 + n_row_1 contains the row of `columnar_readers[1]`, in order.
/// ..
/// No documents is deleted.
Stack(StackMergeOrder),

View File

@@ -7,6 +7,7 @@ use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;
use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
use super::writer::ColumnarSerializer;
@@ -82,10 +83,11 @@ pub fn merge_columnar(
.iter()
.map(|reader| reader.num_rows())
.collect::<Vec<u32>>();
let columns_to_merge = group_columns_for_merge(columnar_readers, required_columns)?;
let columns_to_merge =
group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
for ((column_name, column_type), columns) in columns_to_merge {
let mut column_serializer =
serializer.serialize_column(column_name.as_bytes(), column_type);
serializer.start_serialize_column(column_name.as_bytes(), column_type);
merge_column(
column_type,
&num_rows_per_columnar,
@@ -93,6 +95,7 @@ pub fn merge_columnar(
&merge_row_order,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
serializer.finalize(merge_row_order.num_rows())?;
Ok(())
@@ -287,10 +290,69 @@ fn merged_numerical_columns_type<'a>(
compatible_numerical_types.to_numerical_type()
}
fn is_empty_after_merge(
merge_row_order: &MergeRowOrder,
column: &DynamicColumn,
columnar_id: usize,
) -> bool {
if column.num_values() == 0u32 {
// It was empty before the merge.
return true;
}
match merge_row_order {
MergeRowOrder::Stack(_) => {
// If we are stacking the columnar, no rows are being deleted.
false
}
MergeRowOrder::Shuffled(shuffled) => {
if let Some(alive_bitset) = &shuffled.alive_bitsets[columnar_id] {
let column_index = column.column_index();
match column_index {
ColumnIndex::Empty { .. } => true,
ColumnIndex::Full => alive_bitset.len() == 0,
ColumnIndex::Optional(optional_index) => {
for doc in optional_index.iter_rows() {
if alive_bitset.contains(doc) {
return false;
}
}
true
}
ColumnIndex::Multivalued(multivalued_index) => {
for (doc_id, (start_index, end_index)) in multivalued_index
.start_index_column
.iter()
.tuple_windows()
.enumerate()
{
let doc_id = doc_id as u32;
if start_index == end_index {
// There are no values in this document
continue;
}
// The document contains values and is present in the alive bitset.
// The column is therefore not empty.
if alive_bitset.contains(doc_id) {
return false;
}
}
true
}
}
} else {
// No document is being deleted.
// The shuffle is applying a permutation.
false
}
}
}
}
#[allow(clippy::type_complexity)]
fn group_columns_for_merge(
columnar_readers: &[&ColumnarReader],
required_columns: &[(String, ColumnType)],
merge_row_order: &MergeRowOrder,
) -> io::Result<BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>>> {
// Each column name may have multiple types of column associated.
// For merging we are interested in the same column type category since they can be merged.
@@ -307,9 +369,16 @@ fn group_columns_for_merge(
for (columnar_id, columnar_reader) in columnar_readers.iter().enumerate() {
let column_name_and_handle = columnar_reader.list_columns()?;
// We skip columns that end up with 0 documents.
// That way, we make sure they don't end up influencing the merge type or
// creating empty columns.
for (column_name, handle) in column_name_and_handle {
let column_category: ColumnTypeCategory = handle.column_type().into();
let column = handle.open()?;
if is_empty_after_merge(merge_row_order, &column, columnar_id) {
continue;
}
columns_grouped
.entry((column_name, column_category))
.or_insert_with(|| {

View File

@@ -25,8 +25,10 @@ fn test_column_coercion_to_u64() {
let columnar1 = make_columnar("numbers", &[1i64]);
// u64 type
let columnar2 = make_columnar("numbers", &[u64::MAX]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
@@ -35,8 +37,10 @@ fn test_column_coercion_to_u64() {
fn test_column_no_coercion_if_all_the_same() {
let columnar1 = make_columnar("numbers", &[1u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::U64)));
}
@@ -45,8 +49,10 @@ fn test_column_no_coercion_if_all_the_same() {
fn test_column_coercion_to_i64() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 1);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
}
@@ -54,10 +60,13 @@ fn test_column_coercion_to_i64() {
#[test]
fn test_impossible_coercion_returns_an_error() {
let columnar1 = make_columnar("numbers", &[u64::MAX]);
let group_error =
group_columns_for_merge(&[&columnar1], &[("numbers".to_string(), ColumnType::I64)])
.map(|_| ())
.unwrap_err();
let merge_order = StackMergeOrder::stack(&[&columnar1]).into();
let group_error = group_columns_for_merge(
&[&columnar1],
&[("numbers".to_string(), ColumnType::I64)],
&merge_order,
)
.unwrap_err();
assert_eq!(group_error.kind(), io::ErrorKind::InvalidInput);
}
@@ -65,10 +74,13 @@ fn test_impossible_coercion_returns_an_error() {
fn test_group_columns_with_required_column() {
let columnar1 = make_columnar("numbers", &[1i64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
@@ -79,10 +91,13 @@ fn test_group_columns_with_required_column() {
fn test_group_columns_required_column_with_no_existing_columns() {
let columnar1 = make_columnar("numbers", &[2u64]);
let columnar2 = make_columnar("numbers", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
columnars,
&[("required_col".to_string(), ColumnType::Str)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 2);
@@ -98,10 +113,13 @@ fn test_group_columns_required_column_with_no_existing_columns() {
fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_rule() {
let columnar1 = make_columnar("numbers", &[2i64]);
let columnar2 = make_columnar("numbers", &[2i64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(
&[&columnar1, &columnar2],
columnars,
&[("numbers".to_string(), ColumnType::U64)],
&merge_order,
)
.unwrap();
assert_eq!(column_map.len(), 1);
@@ -112,8 +130,10 @@ fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_ru
fn test_missing_column() {
let columnar1 = make_columnar("numbers", &[-1i64]);
let columnar2 = make_columnar("numbers2", &[2u64]);
let columnars = &[&columnar1, &columnar2];
let merge_order = StackMergeOrder::stack(columnars).into();
let column_map: BTreeMap<(String, ColumnType), Vec<Option<DynamicColumn>>> =
group_columns_for_merge(&[&columnar1, &columnar2], &[]).unwrap();
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
assert_eq!(column_map.len(), 2);
assert!(column_map.contains_key(&("numbers".to_string(), ColumnType::I64)));
{
@@ -224,7 +244,9 @@ fn test_merge_columnar_numbers() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("numbers").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::F64(vals) = dynamic_column else { panic!() };
let DynamicColumn::F64(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.first(0u32), Some(-1f64));
assert_eq!(vals.first(1u32), None);
@@ -250,7 +272,9 @@ fn test_merge_columnar_texts() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("texts").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
@@ -297,7 +321,9 @@ fn test_merge_columnar_byte() {
assert_eq!(columnar_reader.num_columns(), 1);
let cols = columnar_reader.read_columns("bytes").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -351,7 +377,9 @@ fn test_merge_columnar_byte_with_missing() {
assert_eq!(columnar_reader.num_columns(), 2);
let cols = columnar_reader.read_columns("col").unwrap();
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::Bytes(vals) = dynamic_column else { panic!() };
let DynamicColumn::Bytes(vals) = dynamic_column else {
panic!()
};
let get_bytes_for_ord = |ord| {
let mut out = Vec::new();
vals.ord_to_bytes(ord, &mut out).unwrap();
@@ -403,7 +431,9 @@ fn test_merge_columnar_different_types() {
// numeric column
let dynamic_column = cols[0].open().unwrap();
let DynamicColumn::I64(vals) = dynamic_column else { panic!() };
let DynamicColumn::I64(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
@@ -413,7 +443,9 @@ fn test_merge_columnar_different_types() {
// text column
let dynamic_column = cols[1].open().unwrap();
let DynamicColumn::Str(vals) = dynamic_column else { panic!() };
let DynamicColumn::Str(vals) = dynamic_column else {
panic!()
};
assert_eq!(vals.ords().get_cardinality(), Cardinality::Optional);
let get_str_for_ord = |ord| {
let mut out = String::new();

View File

@@ -98,9 +98,11 @@ impl ColumnarWriter {
///
/// The sort applied is stable.
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
let Some(numerical_col_writer) =
self.numerical_field_hash_map.get::<NumericalColumnWriter>(sort_field.as_bytes()) else {
return Vec::new();
let Some(numerical_col_writer) = self
.numerical_field_hash_map
.get::<NumericalColumnWriter>(sort_field.as_bytes())
else {
return Vec::new();
};
let mut symbols_buffer = Vec::new();
let mut values = Vec::new();
@@ -266,7 +268,7 @@ impl ColumnarWriter {
let mut column: ColumnWriter = column_opt.unwrap_or_default();
column.record(
doc,
NumericalValue::I64(datetime.into_timestamp_micros()),
NumericalValue::I64(datetime.into_timestamp_nanos()),
arena,
);
column
@@ -370,7 +372,7 @@ impl ColumnarWriter {
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, column_type);
serializer.start_serialize_column(column_name, column_type);
serialize_bool_column(
cardinality,
num_docs,
@@ -382,12 +384,13 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::IpAddr => {
let column_writer: ColumnWriter = self.ip_addr_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::IpAddr);
serializer.start_serialize_column(column_name, ColumnType::IpAddr);
serialize_ip_addr_column(
cardinality,
num_docs,
@@ -399,6 +402,7 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::Bytes | ColumnType::Str => {
let str_or_bytes_column_writer: StrOrBytesColumnWriter =
@@ -413,7 +417,7 @@ impl ColumnarWriter {
.column_writer
.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, column_type);
serializer.start_serialize_column(column_name, column_type);
serialize_bytes_or_str_column(
cardinality,
num_docs,
@@ -427,13 +431,14 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::F64 | ColumnType::I64 | ColumnType::U64 => {
let numerical_column_writer: NumericalColumnWriter =
self.numerical_field_hash_map.read(addr);
let cardinality = numerical_column_writer.cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, column_type);
serializer.start_serialize_column(column_name, column_type);
let numerical_type = column_type.numerical_type().unwrap();
serialize_numerical_column(
cardinality,
@@ -447,12 +452,13 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
ColumnType::DateTime => {
let column_writer: ColumnWriter = self.datetime_field_hash_map.read(addr);
let cardinality = column_writer.get_cardinality(num_docs);
let mut column_serializer =
serializer.serialize_column(column_name, ColumnType::DateTime);
serializer.start_serialize_column(column_name, ColumnType::DateTime);
serialize_numerical_column(
cardinality,
num_docs,
@@ -465,6 +471,7 @@ impl ColumnarWriter {
buffers,
&mut column_serializer,
)?;
column_serializer.finalize()?;
}
};
}

View File

@@ -34,11 +34,12 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
pub fn serialize_column<'a>(
/// Creates a ColumnSerializer.
pub fn start_serialize_column<'a>(
&'a mut self,
column_name: &[u8],
column_type: ColumnType,
) -> impl io::Write + 'a {
) -> ColumnSerializer<'a, W> {
let start_offset = self.wrt.written_bytes();
prepare_key(column_name, column_type, &mut self.prepare_key_buffer);
ColumnSerializer {
@@ -60,20 +61,21 @@ impl<W: io::Write> ColumnarSerializer<W> {
}
}
struct ColumnSerializer<'a, W: io::Write> {
pub struct ColumnSerializer<'a, W: io::Write> {
columnar_serializer: &'a mut ColumnarSerializer<W>,
start_offset: u64,
}
impl<'a, W: io::Write> Drop for ColumnSerializer<'a, W> {
fn drop(&mut self) {
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
pub fn finalize(self) -> io::Result<()> {
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
let byte_range = self.start_offset..end_offset;
self.columnar_serializer.sstable_range.insert_cannot_fail(
self.columnar_serializer.sstable_range.insert(
&self.columnar_serializer.prepare_key_buffer[..],
&byte_range,
);
)?;
self.columnar_serializer.prepare_key_buffer.clear();
Ok(())
}
}

View File

@@ -26,14 +26,14 @@ impl fmt::Debug for DynamicColumn {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(f, "[{} {} |", self.get_cardinality(), self.column_type())?;
match self {
DynamicColumn::Bool(col) => write!(f, " {:?}", col)?,
DynamicColumn::I64(col) => write!(f, " {:?}", col)?,
DynamicColumn::U64(col) => write!(f, " {:?}", col)?,
DynamicColumn::F64(col) => write!(f, "{:?}", col)?,
DynamicColumn::IpAddr(col) => write!(f, "{:?}", col)?,
DynamicColumn::DateTime(col) => write!(f, "{:?}", col)?,
DynamicColumn::Bytes(col) => write!(f, "{:?}", col)?,
DynamicColumn::Str(col) => write!(f, "{:?}", col)?,
DynamicColumn::Bool(col) => write!(f, " {col:?}")?,
DynamicColumn::I64(col) => write!(f, " {col:?}")?,
DynamicColumn::U64(col) => write!(f, " {col:?}")?,
DynamicColumn::F64(col) => write!(f, "{col:?}")?,
DynamicColumn::IpAddr(col) => write!(f, "{col:?}")?,
DynamicColumn::DateTime(col) => write!(f, "{col:?}")?,
DynamicColumn::Bytes(col) => write!(f, "{col:?}")?,
DynamicColumn::Str(col) => write!(f, "{col:?}")?,
}
write!(f, "]")
}

View File

@@ -39,7 +39,7 @@ pub use self::dynamic_column::{DynamicColumn, DynamicColumnHandle};
pub type RowId = u32;
pub type DocId = u32;
#[derive(Clone, Copy)]
#[derive(Clone, Copy, Debug)]
pub struct RowAddr {
pub segment_ord: u32,
pub row_id: RowId,

View File

@@ -4,13 +4,15 @@ use std::net::Ipv6Addr;
use common::DateTime;
use proptest::prelude::*;
use proptest::sample::subsequence;
use crate::column_values::MonotonicallyMappableToU128;
use crate::columnar::{ColumnType, ColumnTypeCategory};
use crate::dynamic_column::{DynamicColumn, DynamicColumnHandle};
use crate::value::{Coerce, NumericalValue};
use crate::{
BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowId, StackMergeOrder,
BytesColumn, Cardinality, Column, ColumnarReader, ColumnarWriter, RowAddr, RowId,
ShuffleMergeOrder, StackMergeOrder,
};
#[test]
@@ -24,7 +26,7 @@ fn test_dataframe_writer_str() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 89);
assert_eq!(cols[0].num_bytes(), 87);
}
#[test]
@@ -38,7 +40,7 @@ fn test_dataframe_writer_bytes() {
assert_eq!(columnar.num_columns(), 1);
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 89);
assert_eq!(cols[0].num_bytes(), 87);
}
#[test]
@@ -55,7 +57,9 @@ fn test_dataframe_writer_bool() {
assert_eq!(cols[0].num_bytes(), 22);
assert_eq!(cols[0].column_type(), ColumnType::Bool);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::Bool(bool_col) = dyn_bool_col else { panic!(); };
let DynamicColumn::Bool(bool_col) = dyn_bool_col else {
panic!();
};
let vals: Vec<Option<bool>> = (0..5).map(|row_id| bool_col.first(row_id)).collect();
assert_eq!(&vals, &[None, Some(false), None, Some(true), None,]);
}
@@ -77,7 +81,9 @@ fn test_dataframe_writer_u64_multivalued() {
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].num_bytes(), 29);
let dyn_i64_col = cols[0].open().unwrap();
let DynamicColumn::I64(divisor_col) = dyn_i64_col else { panic!(); };
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
panic!();
};
assert_eq!(
divisor_col.get_cardinality(),
crate::Cardinality::Multivalued
@@ -99,7 +105,9 @@ fn test_dataframe_writer_ip_addr() {
assert_eq!(cols[0].num_bytes(), 42);
assert_eq!(cols[0].column_type(), ColumnType::IpAddr);
let dyn_bool_col = cols[0].open().unwrap();
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else { panic!(); };
let DynamicColumn::IpAddr(ip_col) = dyn_bool_col else {
panic!();
};
let vals: Vec<Option<Ipv6Addr>> = (0..5).map(|row_id| ip_col.first(row_id)).collect();
assert_eq!(
&vals,
@@ -132,7 +140,9 @@ fn test_dataframe_writer_numerical() {
// - null footer 6 bytes
assert_eq!(cols[0].num_bytes(), 33);
let column = cols[0].open().unwrap();
let DynamicColumn::I64(column_i64) = column else { panic!(); };
let DynamicColumn::I64(column_i64) = column else {
panic!();
};
assert_eq!(column_i64.index.get_cardinality(), Cardinality::Optional);
assert_eq!(column_i64.first(0), None);
assert_eq!(column_i64.first(1), Some(12i64));
@@ -196,7 +206,9 @@ fn test_dictionary_encoded_str() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else { panic!(); };
let DynamicColumn::Str(str_col) = col_handles[0].open().unwrap() else {
panic!();
};
let index: Vec<Option<u64>> = (0..5).map(|row_id| str_col.ords().first(row_id)).collect();
assert_eq!(index, &[None, Some(0), None, Some(2), Some(1)]);
assert_eq!(str_col.num_rows(), 5);
@@ -228,7 +240,9 @@ fn test_dictionary_encoded_bytes() {
assert_eq!(columnar_reader.num_columns(), 2);
let col_handles = columnar_reader.read_columns("my.column").unwrap();
assert_eq!(col_handles.len(), 1);
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else { panic!(); };
let DynamicColumn::Bytes(bytes_col) = col_handles[0].open().unwrap() else {
panic!();
};
let index: Vec<Option<u64>> = (0..5)
.map(|row_id| bytes_col.ords().first(row_id))
.collect();
@@ -260,12 +274,15 @@ fn test_dictionary_encoded_bytes() {
fn num_strategy() -> impl Strategy<Value = NumericalValue> {
prop_oneof![
Just(NumericalValue::U64(0u64)),
Just(NumericalValue::U64(u64::MAX)),
Just(NumericalValue::I64(0i64)),
Just(NumericalValue::I64(i64::MIN)),
Just(NumericalValue::I64(i64::MAX)),
Just(NumericalValue::F64(1.2f64)),
3 => Just(NumericalValue::U64(0u64)),
3 => Just(NumericalValue::U64(u64::MAX)),
3 => Just(NumericalValue::I64(0i64)),
3 => Just(NumericalValue::I64(i64::MIN)),
3 => Just(NumericalValue::I64(i64::MAX)),
3 => Just(NumericalValue::F64(1.2f64)),
1 => any::<f64>().prop_map(NumericalValue::from),
1 => any::<u64>().prop_map(NumericalValue::from),
1 => any::<i64>().prop_map(NumericalValue::from),
]
}
@@ -279,6 +296,12 @@ enum ColumnValue {
DateTime(DateTime),
}
impl<T: Into<NumericalValue>> From<T> for ColumnValue {
fn from(val: T) -> ColumnValue {
ColumnValue::Numerical(val.into())
}
}
impl ColumnValue {
pub(crate) fn column_type_category(&self) -> ColumnTypeCategory {
match self {
@@ -328,12 +351,22 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
// A document contains up to 4 values.
fn doc_strategy() -> impl Strategy<Value = Vec<(&'static str, ColumnValue)>> {
proptest::collection::vec((column_name_strategy(), column_value_strategy()), 0..4)
proptest::collection::vec((column_name_strategy(), column_value_strategy()), 0..=4)
}
fn num_docs_strategy() -> impl Strategy<Value = usize> {
prop_oneof!(
// We focus heavily on the 0..2 case as we assume it is sufficient to cover all edge cases.
0usize..=3usize,
// We leave 50% of the effort exploring more defensively.
3usize..=12usize
)
}
// A columnar contains up to 2 docs.
fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, ColumnValue)>>> {
proptest::collection::vec(doc_strategy(), 0..=2)
num_docs_strategy()
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
}
fn columnar_docs_and_mapping_strategy(
@@ -347,6 +380,11 @@ fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
Just((0u32..n as RowId).collect()).prop_shuffle()
}
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
let vals: Vec<usize> = (0..n).collect();
subsequence(vals, 0..=n).prop_shuffle()
}
fn build_columnar_with_mapping(
docs: &[Vec<(&'static str, ColumnValue)>],
old_to_new_row_ids_opt: Option<&[RowId]>,
@@ -389,7 +427,15 @@ fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
build_columnar_with_mapping(docs, None)
}
fn assert_columnar_eq(left: &ColumnarReader, right: &ColumnarReader) {
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
assert_columnar_eq(left, right, false);
}
fn assert_columnar_eq(
left: &ColumnarReader,
right: &ColumnarReader,
lenient_on_numerical_value: bool,
) {
assert_eq!(left.num_rows(), right.num_rows());
let left_columns = left.list_columns().unwrap();
let right_columns = right.list_columns().unwrap();
@@ -398,7 +444,7 @@ fn assert_columnar_eq(left: &ColumnarReader, right: &ColumnarReader) {
assert_eq!(left_columns[i].0, right_columns[i].0);
let left_column = left_columns[i].1.open().unwrap();
let right_column = right_columns[i].1.open().unwrap();
assert_dyn_column_eq(&left_column, &right_column);
assert_dyn_column_eq(&left_column, &right_column, lenient_on_numerical_value);
}
}
@@ -442,11 +488,11 @@ fn assert_bytes_column_eq(left: &BytesColumn, right: &BytesColumn) {
assert!(!right_terms.advance());
}
fn assert_dyn_column_eq(left_dyn_column: &DynamicColumn, right_dyn_column: &DynamicColumn) {
assert_eq!(
&left_dyn_column.column_type(),
&right_dyn_column.column_type()
);
fn assert_dyn_column_eq(
left_dyn_column: &DynamicColumn,
right_dyn_column: &DynamicColumn,
lenient_on_numerical_value: bool,
) {
assert_eq!(
&left_dyn_column.get_cardinality(),
&right_dyn_column.get_cardinality()
@@ -476,8 +522,19 @@ fn assert_dyn_column_eq(left_dyn_column: &DynamicColumn, right_dyn_column: &Dyna
(DynamicColumn::Str(left_col), DynamicColumn::Str(right_col)) => {
assert_bytes_column_eq(left_col, right_col);
}
_ => {
unreachable!()
(left, right) => {
if lenient_on_numerical_value {
assert_eq!(
ColumnTypeCategory::from(left.column_type()),
ColumnTypeCategory::from(right.column_type())
);
} else {
panic!(
"Column type are not the same: {:?} vs {:?}",
left.column_type(),
right.column_type()
);
}
}
}
}
@@ -488,28 +545,36 @@ trait AssertEqualToColumnValue {
impl AssertEqualToColumnValue for bool {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Bool(val) = column_value else { panic!() };
let ColumnValue::Bool(val) = column_value else {
panic!()
};
assert_eq!(self, val);
}
}
impl AssertEqualToColumnValue for Ipv6Addr {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::IpAddr(val) = column_value else { panic!() };
let ColumnValue::IpAddr(val) = column_value else {
panic!()
};
assert_eq!(self, val);
}
}
impl<T: Coerce + PartialEq + Debug + Into<NumericalValue>> AssertEqualToColumnValue for T {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::Numerical(num) = column_value else { panic!() };
let ColumnValue::Numerical(num) = column_value else {
panic!()
};
assert_eq!(self, &T::coerce(*num));
}
}
impl AssertEqualToColumnValue for DateTime {
fn assert_equal_to_column_value(&self, column_value: &ColumnValue) {
let ColumnValue::DateTime(dt) = column_value else { panic!() };
let ColumnValue::DateTime(dt) = column_value else {
panic!()
};
assert_eq!(self, dt);
}
}
@@ -683,7 +748,7 @@ proptest! {
let merged_columnar = ColumnarReader::open(output).unwrap();
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> = columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
}
@@ -709,7 +774,7 @@ fn test_columnar_merging_empty_columnar() {
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
#[test]
@@ -746,8 +811,135 @@ fn test_columnar_merging_number_columns() {
let concat_rows: Vec<Vec<(&'static str, ColumnValue)>> =
columnar_docs.iter().cloned().flatten().collect();
let expected_merged_columnar = build_columnar(&concat_rows[..]);
assert_columnar_eq(&merged_columnar, &expected_merged_columnar);
assert_columnar_eq_strict(&merged_columnar, &expected_merged_columnar);
}
// TODO add non trivial remap and merge
// TODO test required_columns
// TODO document edge case: required_columns incompatible with values.
fn columnar_docs_and_remap(
) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|columnars_docs: Vec<Vec<Vec<(&str, ColumnValue)>>>| {
let row_addrs: Vec<RowAddr> = columnars_docs
.iter()
.enumerate()
.flat_map(|(segment_ord, columnar_docs)| {
(0u32..columnar_docs.len() as u32).map(move |row_id| RowAddr {
segment_ord: segment_ord as u32,
row_id,
})
})
.collect();
permutation_and_subset_strategy(row_addrs.len()).prop_map(move |shuffled_subset| {
let shuffled_row_addr_subset: Vec<RowAddr> =
shuffled_subset.iter().map(|ord| row_addrs[*ord]).collect();
(columnars_docs.clone(), shuffled_row_addr_subset)
})
},
)
}
proptest! {
#![proptest_config(ProptestConfig::with_cases(1000))]
#[test]
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
.collect();
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
.map(|docs| build_columnar(&docs[..]))
.collect::<Vec<_>>();
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
}
}
#[test]
fn test_columnar_merge_empty() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 0];
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, vec![]);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 0);
assert_eq!(merged_columnar.num_columns(), 0);
}
#[test]
fn test_columnar_merge_single_str_column() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[vec![("c1", ColumnValue::Str("a"))]][..];
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let segment_num_rows: Vec<RowId> = vec![0, 1];
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&segment_num_rows,
vec![RowAddr {
segment_ord: 1u32,
row_id: 0u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
}
#[test]
fn test_delete_decrease_cardinality() {
let columnar_reader_1 = build_columnar(&[]);
let rows: &[Vec<_>] = &[
vec![
("c", ColumnValue::from(0i64)),
("c", ColumnValue::from(0i64)),
],
vec![("c", ColumnValue::from(0i64))],
][..];
// c is multivalued here
let columnar_reader_2 = build_columnar(rows);
let mut output: Vec<u8> = Vec::new();
let shuffle_merge_order = ShuffleMergeOrder::for_test(
&[0, 2],
vec![RowAddr {
segment_ord: 1u32,
row_id: 1u32,
}],
);
crate::merge_columnar(
&[&columnar_reader_1, &columnar_reader_2],
&[],
shuffle_merge_order.into(),
&mut output,
)
.unwrap();
let merged_columnar = ColumnarReader::open(output).unwrap();
assert_eq!(merged_columnar.num_rows(), 1);
assert_eq!(merged_columnar.num_columns(), 1);
let cols = merged_columnar.read_columns("c").unwrap();
assert_eq!(cols.len(), 1);
assert_eq!(cols[0].column_type(), ColumnType::I64);
assert_eq!(cols[0].open().unwrap().get_cardinality(), Cardinality::Full);
}

View File

@@ -109,7 +109,7 @@ impl Coerce for f64 {
impl Coerce for DateTime {
fn coerce(value: NumericalValue) -> Self {
let timestamp_micros = i64::coerce(value);
DateTime::from_timestamp_micros(timestamp_micros)
DateTime::from_timestamp_nanos(timestamp_micros)
}
}

39
common/benches/bench.rs Normal file
View File

@@ -0,0 +1,39 @@
#![feature(test)]
extern crate test;
#[cfg(test)]
mod tests {
use rand::seq::IteratorRandom;
use rand::thread_rng;
use tantivy_common::serialize_vint_u32;
use test::Bencher;
#[bench]
fn bench_vint(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).collect();
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
#[bench]
fn bench_vint_rand(b: &mut Bencher) {
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
b.iter(|| {
let mut out = 0u64;
for val in vals.iter().cloned() {
let mut buf = [0u8; 8];
serialize_vint_u32(val, &mut buf);
out += u64::from(buf[0]);
}
out
});
}
}

View File

@@ -37,7 +37,7 @@ impl ByteCount {
for (suffix, threshold) in SUFFIX_AND_THRESHOLD.iter().rev() {
if self.get_bytes() >= *threshold {
let unit_num = self.get_bytes() as f64 / *threshold as f64;
return format!("{:.2} {}", unit_num, suffix);
return format!("{unit_num:.2} {suffix}");
}
}
format!("{:.2} B", self.get_bytes())

View File

@@ -1,25 +1,33 @@
#![allow(deprecated)]
use std::fmt;
use serde::{Deserialize, Serialize};
use time::format_description::well_known::Rfc3339;
use time::{OffsetDateTime, PrimitiveDateTime, UtcOffset};
/// DateTime Precision
/// Precision with which datetimes are truncated when stored in fast fields. This setting is only
/// relevant for fast fields. In the docstore, datetimes are always saved with nanosecond precision.
#[derive(
Clone, Copy, Debug, Hash, PartialEq, Eq, PartialOrd, Ord, Serialize, Deserialize, Default,
)]
#[serde(rename_all = "lowercase")]
pub enum DatePrecision {
/// Seconds precision
pub enum DateTimePrecision {
/// Second precision.
#[default]
Seconds,
/// Milli-seconds precision.
/// Millisecond precision.
Milliseconds,
/// Micro-seconds precision.
/// Microsecond precision.
Microseconds,
/// Nanosecond precision.
Nanoseconds,
}
/// A date/time value with microsecond precision.
#[deprecated(since = "0.20.0", note = "Use `DateTimePrecision` instead")]
pub type DatePrecision = DateTimePrecision;
/// A date/time value with nanoseconds precision.
///
/// This timestamp does not carry any explicit time zone information.
/// Users are responsible for applying the provided conversion
@@ -31,39 +39,46 @@ pub enum DatePrecision {
/// to prevent unintended usage.
#[derive(Clone, Default, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
pub struct DateTime {
// Timestamp in microseconds.
pub(crate) timestamp_micros: i64,
// Timestamp in nanoseconds.
pub(crate) timestamp_nanos: i64,
}
impl DateTime {
/// Minimum possible `DateTime` value.
pub const MIN: DateTime = DateTime {
timestamp_micros: i64::MIN,
timestamp_nanos: i64::MIN,
};
/// Maximum possible `DateTime` value.
pub const MAX: DateTime = DateTime {
timestamp_micros: i64::MAX,
timestamp_nanos: i64::MAX,
};
/// Create new from UNIX timestamp in seconds
pub const fn from_timestamp_secs(seconds: i64) -> Self {
Self {
timestamp_micros: seconds * 1_000_000,
timestamp_nanos: seconds * 1_000_000_000,
}
}
/// Create new from UNIX timestamp in milliseconds
pub const fn from_timestamp_millis(milliseconds: i64) -> Self {
Self {
timestamp_micros: milliseconds * 1_000,
timestamp_nanos: milliseconds * 1_000_000,
}
}
/// Create new from UNIX timestamp in microseconds.
pub const fn from_timestamp_micros(microseconds: i64) -> Self {
Self {
timestamp_micros: microseconds,
timestamp_nanos: microseconds * 1_000,
}
}
/// Create new from UNIX timestamp in nanoseconds.
pub const fn from_timestamp_nanos(nanoseconds: i64) -> Self {
Self {
timestamp_nanos: nanoseconds,
}
}
@@ -71,9 +86,9 @@ impl DateTime {
///
/// The given date/time is converted to UTC and the actual
/// time zone is discarded.
pub const fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_micros = dt.unix_timestamp() * 1_000_000 + dt.microsecond() as i64;
Self { timestamp_micros }
pub fn from_utc(dt: OffsetDateTime) -> Self {
let timestamp_nanos = dt.unix_timestamp_nanos() as i64;
Self { timestamp_nanos }
}
/// Create new from `PrimitiveDateTime`
@@ -87,23 +102,27 @@ impl DateTime {
/// Convert to UNIX timestamp in seconds.
pub const fn into_timestamp_secs(self) -> i64 {
self.timestamp_micros / 1_000_000
self.timestamp_nanos / 1_000_000_000
}
/// Convert to UNIX timestamp in milliseconds.
pub const fn into_timestamp_millis(self) -> i64 {
self.timestamp_micros / 1_000
self.timestamp_nanos / 1_000_000
}
/// Convert to UNIX timestamp in microseconds.
pub const fn into_timestamp_micros(self) -> i64 {
self.timestamp_micros
self.timestamp_nanos / 1_000
}
/// Convert to UNIX timestamp in nanoseconds.
pub const fn into_timestamp_nanos(self) -> i64 {
self.timestamp_nanos
}
/// Convert to UTC `OffsetDateTime`
pub fn into_utc(self) -> OffsetDateTime {
let timestamp_nanos = self.timestamp_micros as i128 * 1000;
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(timestamp_nanos)
let utc_datetime = OffsetDateTime::from_unix_timestamp_nanos(self.timestamp_nanos as i128)
.expect("valid UNIX timestamp");
debug_assert_eq!(UtcOffset::UTC, utc_datetime.offset());
utc_datetime
@@ -126,20 +145,21 @@ impl DateTime {
}
/// Truncates the microseconds value to the corresponding precision.
pub fn truncate(self, precision: DatePrecision) -> Self {
pub fn truncate(self, precision: DateTimePrecision) -> Self {
let truncated_timestamp_micros = match precision {
DatePrecision::Seconds => (self.timestamp_micros / 1_000_000) * 1_000_000,
DatePrecision::Milliseconds => (self.timestamp_micros / 1_000) * 1_000,
DatePrecision::Microseconds => self.timestamp_micros,
DateTimePrecision::Seconds => (self.timestamp_nanos / 1_000_000_000) * 1_000_000_000,
DateTimePrecision::Milliseconds => (self.timestamp_nanos / 1_000_000) * 1_000_000,
DateTimePrecision::Microseconds => (self.timestamp_nanos / 1_000) * 1_000,
DateTimePrecision::Nanoseconds => self.timestamp_nanos,
};
Self {
timestamp_micros: truncated_timestamp_micros,
timestamp_nanos: truncated_timestamp_micros,
}
}
}
impl fmt::Debug for DateTime {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let utc_rfc3339 = self.into_utc().format(&Rfc3339).map_err(|_| fmt::Error)?;
f.write_str(&utc_rfc3339)
}

View File

@@ -1,63 +0,0 @@
use std::io::{self, Read, Write};
use crate::BinarySerializable;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(u32)]
pub enum DictionaryKind {
Fst = 1,
SSTable = 2,
}
#[derive(Debug, Clone, PartialEq)]
pub struct DictionaryFooter {
pub kind: DictionaryKind,
pub version: u32,
}
impl DictionaryFooter {
pub fn verify_equal(&self, other: &DictionaryFooter) -> io::Result<()> {
if self.kind != other.kind {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Invalid dictionary type, expected {:?}, found {:?}",
self.kind, other.kind
),
));
}
if self.version != other.version {
return Err(io::Error::new(
io::ErrorKind::Other,
format!(
"Unsuported dictionary version, expected {}, found {}",
self.version, other.version
),
));
}
Ok(())
}
}
impl BinarySerializable for DictionaryFooter {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
self.version.serialize(writer)?;
(self.kind as u32).serialize(writer)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Self> {
let version = u32::deserialize(reader)?;
let kind = u32::deserialize(reader)?;
let kind = match kind {
1 => DictionaryKind::Fst,
2 => DictionaryKind::SSTable,
_ => {
return Err(io::Error::new(
io::ErrorKind::Other,
format!("invalid dictionary kind: {kind}"),
))
}
};
Ok(DictionaryFooter { kind, version })
}
}

View File

@@ -7,7 +7,6 @@ pub use byteorder::LittleEndian as Endianness;
mod bitset;
mod byte_count;
mod datetime;
mod dictionary_footer;
pub mod file_slice;
mod group_by;
mod serialize;
@@ -15,14 +14,14 @@ mod vint;
mod writer;
pub use bitset::*;
pub use byte_count::ByteCount;
pub use datetime::{DatePrecision, DateTime};
pub use dictionary_footer::*;
#[allow(deprecated)]
pub use datetime::DatePrecision;
pub use datetime::{DateTime, DateTimePrecision};
pub use group_by::GroupByIteratorExtended;
pub use ownedbytes::{OwnedBytes, StableDeref};
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
pub use vint::{
deserialize_vint_u128, read_u32_vint, read_u32_vint_no_advance, serialize_vint_u128,
serialize_vint_u32, write_u32_vint, VInt, VIntU128,
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
};
pub use writer::{AntiCallToken, CountingWriter, TerminatingWrite};

View File

@@ -1,8 +1,6 @@
use std::io;
use std::io::{Read, Write};
use byteorder::{ByteOrder, LittleEndian};
use super::BinarySerializable;
/// Variable int serializes a u128 number
@@ -19,26 +17,6 @@ pub fn serialize_vint_u128(mut val: u128, output: &mut Vec<u8>) {
}
}
/// Deserializes a u128 number
///
/// Returns the number and the slice after the vint
pub fn deserialize_vint_u128(data: &[u8]) -> io::Result<(u128, &[u8])> {
let mut result = 0u128;
let mut shift = 0u64;
for i in 0..19 {
let b = data[i];
result |= u128::from(b % 128u8) << shift;
if b >= STOP_BIT {
return Ok((result, &data[i + 1..]));
}
shift += 7;
}
Err(io::Error::new(
io::ErrorKind::InvalidData,
"Failed to deserialize u128 vint",
))
}
/// Wrapper over a `u128` that serializes as a variable int.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub struct VIntU128(pub u128);
@@ -80,17 +58,13 @@ pub struct VInt(pub u64);
const STOP_BIT: u8 = 128;
#[inline]
pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
const START_2: u64 = 1 << 7;
const START_3: u64 = 1 << 14;
const START_4: u64 = 1 << 21;
const START_5: u64 = 1 << 28;
const STOP_1: u64 = START_2 - 1;
const STOP_2: u64 = START_3 - 1;
const STOP_3: u64 = START_4 - 1;
const STOP_4: u64 = START_5 - 1;
const MASK_1: u64 = 127;
const MASK_2: u64 = MASK_1 << 7;
const MASK_3: u64 = MASK_2 << 7;
@@ -99,25 +73,29 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
let val = u64::from(val);
const STOP_BIT: u64 = 128u64;
let (res, num_bytes) = match val {
0..=STOP_1 => (val | STOP_BIT, 1),
START_2..=STOP_2 => (
let (res, num_bytes) = if val < START_2 {
(val | STOP_BIT, 1)
} else if val < START_3 {
(
(val & MASK_1) | ((val & MASK_2) << 1) | (STOP_BIT << (8)),
2,
),
START_3..=STOP_3 => (
)
} else if val < START_4 {
(
(val & MASK_1) | ((val & MASK_2) << 1) | ((val & MASK_3) << 2) | (STOP_BIT << (8 * 2)),
3,
),
START_4..=STOP_4 => (
)
} else if val < START_5 {
(
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
| ((val & MASK_4) << 3)
| (STOP_BIT << (8 * 3)),
4,
),
_ => (
)
} else {
(
(val & MASK_1)
| ((val & MASK_2) << 1)
| ((val & MASK_3) << 2)
@@ -125,9 +103,9 @@ pub fn serialize_vint_u32(val: u32, buf: &mut [u8; 8]) -> &[u8] {
| ((val & MASK_5) << 4)
| (STOP_BIT << (8 * 4)),
5,
),
)
};
LittleEndian::write_u64(&mut buf[..], res);
*buf = res.to_le_bytes();
&buf[0..num_bytes]
}
@@ -245,7 +223,6 @@ impl BinarySerializable for VInt {
mod tests {
use super::{serialize_vint_u32, BinarySerializable, VInt};
use crate::vint::{deserialize_vint_u128, serialize_vint_u128, VIntU128};
fn aux_test_vint(val: u64) {
let mut v = [14u8; 10];
@@ -284,27 +261,7 @@ mod tests {
let mut buffer2 = [0u8; 8];
let len_vint = VInt(val as u64).serialize_into(&mut buffer);
let res2 = serialize_vint_u32(val, &mut buffer2);
assert_eq!(&buffer[..len_vint], res2, "array wrong for {}", val);
}
fn aux_test_vint_u128(val: u128) {
let mut data = vec![];
serialize_vint_u128(val, &mut data);
let (deser_val, _data) = deserialize_vint_u128(&data).unwrap();
assert_eq!(val, deser_val);
let mut out = vec![];
VIntU128(val).serialize(&mut out).unwrap();
let deser_val = VIntU128::deserialize(&mut &out[..]).unwrap();
assert_eq!(val, deser_val.0);
}
#[test]
fn test_vint_u128() {
aux_test_vint_u128(0);
aux_test_vint_u128(1);
aux_test_vint_u128(u128::MAX / 3);
aux_test_vint_u128(u128::MAX);
assert_eq!(&buffer[..len_vint], res2, "array wrong for {val}");
}
#[test]

View File

@@ -7,13 +7,8 @@
// ---
use serde_json::{Deserializer, Value};
use tantivy::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
RangeAggregation,
};
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::bucket::RangeAggregationRange;
use tantivy::aggregation::metric::AverageAggregation;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{self, IndexRecordOption, Schema, TextFieldIndexing, FAST};
@@ -194,56 +189,9 @@ fn main() -> tantivy::Result<()> {
let agg_req: Aggregations = serde_json::from_str(agg_req_str)?;
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res2: Value = serde_json::to_value(agg_res)?;
// ### Request Rust API
//
// This is exactly the same request as above, but via the rust structures.
//
let agg_req: Aggregations = vec![(
"group_by_stock".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "stock".to_string(),
ranges: vec![
RangeAggregationRange {
key: Some("few".into()),
from: None,
to: Some(1f64),
},
RangeAggregationRange {
key: Some("some".into()),
from: Some(1f64),
to: Some(10f64),
},
RangeAggregationRange {
key: Some("many".into()),
from: Some(10f64),
to: None,
},
],
..Default::default()
}),
sub_aggregation: vec![(
"average_price".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("price".to_string()),
)),
)]
.into_iter()
.collect(),
})),
)]
.into_iter()
.collect();
let collector = AggregationCollector::from_aggs(agg_req, Default::default());
// We use the `AllQuery` which will pass all documents to the AggregationCollector.
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res1: Value = serde_json::to_value(agg_res)?;
let res: Value = serde_json::to_value(agg_res)?;
// ### Aggregation Result
//
@@ -261,8 +209,7 @@ fn main() -> tantivy::Result<()> {
}
"#;
let expected_json: Value = serde_json::from_str(expected_res)?;
assert_eq!(expected_json, res1);
assert_eq!(expected_json, res2);
assert_eq!(expected_json, res);
// ### Request 2
//

View File

@@ -53,7 +53,7 @@ fn main() -> tantivy::Result<()> {
// this will store tokens of 3 characters each
index
.tokenizers()
.register("ngram3", NgramTokenizer::new(3, 3, false));
.register("ngram3", NgramTokenizer::new(3, 3, false).unwrap());
// To insert document we need an index writer.
// There must be only one writer at a time.

View File

@@ -13,7 +13,7 @@ fn main() -> tantivy::Result<()> {
let opts = DateOptions::from(INDEXED)
.set_stored()
.set_fast()
.set_precision(tantivy::DatePrecision::Seconds);
.set_precision(tantivy::DateTimePrecision::Seconds);
// Add `occurred_at` date field type
let occurred_at = schema_builder.add_date_field("occurred_at", opts);
let event_type = schema_builder.add_text_field("event", STRING | STORED);

View File

@@ -96,7 +96,7 @@ fn main() -> tantivy::Result<()> {
let mut index_writer_wlock = index_writer.write().unwrap();
index_writer_wlock.commit()?
};
println!("committed with opstamp {}", opstamp);
println!("committed with opstamp {opstamp}");
thread::sleep(Duration::from_millis(500));
}

View File

@@ -84,7 +84,7 @@ fn main() -> tantivy::Result<()> {
// Doc 0: TermFreq 2: [0, 4]
// Doc 2: TermFreq 1: [0]
// ```
println!("Doc {}: TermFreq {}: {:?}", doc_id, term_freq, positions);
println!("Doc {doc_id}: TermFreq {term_freq}: {positions:?}");
doc_id = segment_postings.advance();
}
}
@@ -125,7 +125,7 @@ fn main() -> tantivy::Result<()> {
// Once again these docs MAY contains deleted documents as well.
let docs = block_segment_postings.docs();
// Prints `Docs [0, 2].`
println!("Docs {:?}", docs);
println!("Docs {docs:?}");
block_segment_postings.advance();
}
}

View File

@@ -0,0 +1,79 @@
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{doc, Index, ReloadPolicy, Result};
use tempfile::TempDir;
fn main() -> Result<()> {
let index_path = TempDir::new()?;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("body", TEXT);
let schema = schema_builder.build();
let title = schema.get_field("title").unwrap();
let body = schema.get_field("body").unwrap();
let index = Index::create_in_dir(&index_path, schema)?;
let mut index_writer = index.writer(50_000_000)?;
index_writer.add_document(doc!(
title => "The Old Man and the Sea",
body => "He was an old man who fished alone in a skiff in the Gulf Stream and he had gone \
eighty-four days now without taking a fish.",
))?;
index_writer.add_document(doc!(
title => "Of Mice and Men",
body => "A few miles south of Soledad, the Salinas River drops in close to the hillside \
bank and runs deep and green. The water is warm too, for it has slipped twinkling \
over the yellow sands in the sunlight before reaching the narrow pool. On one \
side of the river the golden foothill slopes curve up to the strong and rocky \
Gabilan Mountains, but on the valley side the water is lined with trees—willows \
fresh and green with every spring, carrying in their lower leaf junctures the \
debris of the winters flooding; and sycamores with mottled, white, recumbent \
limbs and branches that arch over the pool"
))?;
// Multivalued field just need to be repeated.
index_writer.add_document(doc!(
title => "Frankenstein",
title => "The Modern Prometheus",
body => "You will rejoice to hear that no disaster has accompanied the commencement of an \
enterprise which you have regarded with such evil forebodings. I arrived here \
yesterday, and my first task is to assure my dear sister of my welfare and \
increasing confidence in the success of my undertaking."
))?;
index_writer.commit()?;
let reader = index
.reader_builder()
.reload_policy(ReloadPolicy::OnCommit)
.try_into()?;
let searcher = reader.searcher();
let query_parser = QueryParser::for_index(&index, vec![title, body]);
// This will match documents containing the phrase "in the"
// followed by some word starting with "su",
// i.e. it will match "in the sunlight" and "in the success",
// but not "in the Gulf Stream".
let query = query_parser.parse_query("\"in the su\"*")?;
let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
let mut titles = top_docs
.into_iter()
.map(|(_score, doc_address)| {
let doc = searcher.doc(doc_address)?;
let title = doc.get_first(title).unwrap().as_text().unwrap().to_owned();
Ok(title)
})
.collect::<Result<Vec<_>>>()?;
titles.sort_unstable();
assert_eq!(titles, ["Frankenstein", "Of Mice and Men"]);
Ok(())
}

View File

@@ -17,7 +17,8 @@ use tantivy::{doc, Index, ReloadPolicy};
use tempfile::TempDir;
fn pre_tokenize_text(text: &str) -> Vec<Token> {
let mut token_stream = SimpleTokenizer.token_stream(text);
let mut tokenizer = SimpleTokenizer::default();
let mut token_stream = tokenizer.token_stream(text);
let mut tokens = vec![];
while token_stream.advance() {
tokens.push(token_stream.token().clone());

View File

@@ -56,7 +56,7 @@ fn main() -> tantivy::Result<()> {
for (score, doc_address) in top_docs {
let doc = searcher.doc(doc_address)?;
let snippet = snippet_generator.snippet_from_doc(&doc);
println!("Document score {}:", score);
println!("Document score {score}:");
println!(
"title: {}",
doc.get_first(title).unwrap().as_text().unwrap()

View File

@@ -50,7 +50,7 @@ fn main() -> tantivy::Result<()> {
// This tokenizer lowers all of the text (to help with stop word matching)
// then removes all instances of `the` and `and` from the corpus
let tokenizer = TextAnalyzer::builder(SimpleTokenizer)
let tokenizer = TextAnalyzer::builder(SimpleTokenizer::default())
.filter(LowerCaser)
.filter(StopWordFilter::remove(vec![
"the".to_string(),
@@ -106,7 +106,7 @@ fn main() -> tantivy::Result<()> {
for (score, doc_address) in top_docs {
let retrieved_doc = searcher.doc(doc_address)?;
println!("\n==\nDocument score {}:", score);
println!("\n==\nDocument score {score}:");
println!("{}", schema.to_json(&retrieved_doc));
}

View File

@@ -6,12 +6,14 @@ use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, FAST, TEXT};
use tantivy::{
doc, DocAddress, DocId, Index, IndexReader, Opstamp, Searcher, SearcherGeneration, SegmentId,
SegmentReader, Warmer,
doc, DocAddress, DocId, Index, Opstamp, Searcher, SearcherGeneration, SegmentId, SegmentReader,
Warmer,
};
// This example shows how warmers can be used to
// load a values from an external sources using the Warmer API.
// load values from an external sources and
// tie their lifecycle to that of the index segments
// using the Warmer API.
//
// In this example, we assume an e-commerce search engine.
@@ -23,9 +25,11 @@ pub trait PriceFetcher: Send + Sync + 'static {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price>;
}
type SegmentKey = (SegmentId, Option<Opstamp>);
struct DynamicPriceColumn {
field: String,
price_cache: RwLock<HashMap<(SegmentId, Option<Opstamp>), Arc<Vec<Price>>>>,
price_cache: RwLock<HashMap<SegmentKey, Arc<Vec<Price>>>>,
price_fetcher: Box<dyn PriceFetcher>,
}
@@ -46,7 +50,6 @@ impl DynamicPriceColumn {
impl Warmer for DynamicPriceColumn {
fn warm(&self, searcher: &Searcher) -> tantivy::Result<()> {
for segment in searcher.segment_readers() {
let key = (segment.segment_id(), segment.delete_opstamp());
let product_id_reader = segment
.fast_fields()
.u64(&self.field)?
@@ -55,37 +58,40 @@ impl Warmer for DynamicPriceColumn {
.doc_ids_alive()
.map(|doc| product_id_reader.get_val(doc))
.collect();
let mut prices_it = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let mut price_vals: Vec<Price> = Vec::new();
for doc in 0..segment.max_doc() {
if segment.is_deleted(doc) {
price_vals.push(0);
} else {
price_vals.push(prices_it.next().unwrap())
}
}
let mut prices = self.price_fetcher.fetch_prices(&product_ids).into_iter();
let prices: Vec<Price> = (0..segment.max_doc())
.map(|doc| {
if !segment.is_deleted(doc) {
prices.next().unwrap()
} else {
0
}
})
.collect();
let key = (segment.segment_id(), segment.delete_opstamp());
self.price_cache
.write()
.unwrap()
.insert(key, Arc::new(price_vals));
.insert(key, Arc::new(prices));
}
Ok(())
}
fn garbage_collect(&self, live_generations: &[&SearcherGeneration]) {
let live_segment_id_and_delete_ops: HashSet<(SegmentId, Option<Opstamp>)> =
live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
let mut price_cache_wrt = self.price_cache.write().unwrap();
// let price_cache = std::mem::take(&mut *price_cache_wrt);
// Drain would be nicer here.
*price_cache_wrt = std::mem::take(&mut *price_cache_wrt)
.into_iter()
.filter(|(seg_id_and_op, _)| !live_segment_id_and_delete_ops.contains(seg_id_and_op))
let live_keys: HashSet<SegmentKey> = live_generations
.iter()
.flat_map(|gen| gen.segments())
.map(|(&segment_id, &opstamp)| (segment_id, opstamp))
.collect();
self.price_cache
.write()
.unwrap()
.retain(|key, _| live_keys.contains(key));
}
}
@@ -100,17 +106,17 @@ pub struct ExternalPriceTable {
impl ExternalPriceTable {
pub fn update_price(&self, product_id: ProductId, price: Price) {
let mut prices_wrt = self.prices.write().unwrap();
prices_wrt.insert(product_id, price);
self.prices.write().unwrap().insert(product_id, price);
}
}
impl PriceFetcher for ExternalPriceTable {
fn fetch_prices(&self, product_ids: &[ProductId]) -> Vec<Price> {
let prices_read = self.prices.read().unwrap();
let prices = self.prices.read().unwrap();
product_ids
.iter()
.map(|product_id| prices_read.get(product_id).cloned().unwrap_or(0))
.map(|product_id| prices.get(product_id).cloned().unwrap_or(0))
.collect()
}
}
@@ -143,11 +149,8 @@ fn main() -> tantivy::Result<()> {
writer.add_document(doc!(product_id=>SNEAKERS, text=>"uber sweet sneakers"))?;
writer.commit()?;
let warmers: Vec<Weak<dyn Warmer>> = vec![Arc::downgrade(
&(price_dynamic_column.clone() as Arc<dyn Warmer>),
)];
let reader: IndexReader = index.reader_builder().warmers(warmers).try_into()?;
reader.reload()?;
let warmers = vec![Arc::downgrade(&price_dynamic_column) as Weak<dyn Warmer>];
let reader = index.reader_builder().warmers(warmers).try_into()?;
let query_parser = QueryParser::for_index(&index, vec![text]);
let query = query_parser.parse_query("cooking")?;

View File

@@ -139,6 +139,16 @@ impl OwnedBytes {
self.advance(8);
u64::from_le_bytes(octlet)
}
/// Reads an `u32` encoded as little-endian from the `OwnedBytes` and advance by 4 bytes.
#[inline]
pub fn read_u32(&mut self) -> u32 {
assert!(self.len() > 3);
let quad: [u8; 4] = self.as_slice()[..4].try_into().unwrap();
self.advance(4);
u32::from_le_bytes(quad)
}
}
impl fmt::Debug for OwnedBytes {
@@ -150,7 +160,7 @@ impl fmt::Debug for OwnedBytes {
} else {
self.as_slice()
};
write!(f, "OwnedBytes({:?}, len={})", bytes_truncated, self.len())
write!(f, "OwnedBytes({bytes_truncated:?}, len={})", self.len())
}
}
@@ -249,12 +259,12 @@ mod tests {
fn test_owned_bytes_debug() {
let short_bytes = OwnedBytes::new(b"abcd".as_ref());
assert_eq!(
format!("{:?}", short_bytes),
format!("{short_bytes:?}"),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{:?}", long_bytes),
format!("{long_bytes:?}"),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105, 106], len=17)"
);
}

View File

@@ -1,6 +1,6 @@
[package]
name = "tantivy-query-grammar"
version = "0.19.0"
version = "0.20.0"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

View File

@@ -7,7 +7,9 @@ use combine::parser::Parser;
pub use crate::occur::Occur;
use crate::query_grammar::parse_to_ast;
pub use crate::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
pub use crate::user_input_ast::{
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
};
pub struct Error;

View File

@@ -5,13 +5,14 @@ use combine::parser::range::{take_while, take_while1};
use combine::parser::repeat::escaped;
use combine::parser::Parser;
use combine::{
attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
any, attempt, between, choice, eof, many, many1, one_of, optional, parser, satisfy, sep_by,
skip_many1, value,
};
use once_cell::sync::Lazy;
use regex::Regex;
use super::user_input_ast::{UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral};
use crate::user_input_ast::Delimiter;
use crate::Occur;
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
@@ -56,7 +57,7 @@ fn word<'a>() -> impl Parser<&'a str, Output = String> {
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
.map(|(s1, s2): (char, String)| format!("{s1}{s2}"))
.and_then(|s: String| match s.as_str() {
"OR" | "AND " | "NOT" => Err(StringStreamError::UnexpectedParse),
_ => Ok(s),
@@ -74,7 +75,7 @@ fn relaxed_word<'a>() -> impl Parser<&'a str, Output = String> {
!c.is_whitespace() && !['{', '}', '"', '[', ']', '(', ')'].contains(&c)
})),
)
.map(|(s1, s2): (char, String)| format!("{}{}", s1, s2))
.map(|(s1, s2): (char, String)| format!("{s1}{s2}"))
}
/// Parses a date time according to rfc3339
@@ -133,17 +134,50 @@ fn date_time<'a>() -> impl Parser<&'a str, Output = String> {
recognize((date, char('T'), time))
}
fn term_val<'a>() -> impl Parser<&'a str, Output = String> {
let phrase = char('"').with(many1(satisfy(|c| c != '"'))).skip(char('"'));
negative_number().or(phrase.or(word()))
fn escaped_character<'a>() -> impl Parser<&'a str, Output = char> {
(char('\\'), any()).map(|(_, x)| x)
}
fn escaped_string<'a>(delimiter: char) -> impl Parser<&'a str, Output = String> {
(
char(delimiter),
many(choice((
escaped_character(),
satisfy(move |c: char| c != delimiter),
))),
char(delimiter),
)
.map(|(_, s, _)| s)
}
fn term_val<'a>() -> impl Parser<&'a str, Output = (Delimiter, String)> {
let double_quotes = escaped_string('"').map(|phrase| (Delimiter::DoubleQuotes, phrase));
let single_quotes = escaped_string('\'').map(|phrase| (Delimiter::SingleQuotes, phrase));
let text_no_delimiter = word().map(|text| (Delimiter::None, text));
negative_number()
.map(|negative_number_str| (Delimiter::None, negative_number_str))
.or(double_quotes)
.or(single_quotes)
.or(text_no_delimiter)
}
fn term_query<'a>() -> impl Parser<&'a str, Output = UserInputLiteral> {
(field_name(), term_val(), slop_val()).map(|(field_name, phrase, slop)| UserInputLiteral {
field_name: Some(field_name),
phrase,
slop,
})
(field_name(), term_val(), slop_or_prefix_val()).map(
|(field_name, (delimiter, phrase), (slop, prefix))| UserInputLiteral {
field_name: Some(field_name),
phrase,
delimiter,
slop,
prefix,
},
)
}
fn slop_or_prefix_val<'a>() -> impl Parser<&'a str, Output = (u32, bool)> {
let prefix_val = char('*').map(|_ast| (0, true));
let slop_val = slop_val().map(|slop| (slop, false));
prefix_val.or(slop_val)
}
fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
@@ -159,11 +193,16 @@ fn slop_val<'a>() -> impl Parser<&'a str, Output = u32> {
}
fn literal<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_default_field = (term_val(), slop_val()).map(|(phrase, slop)| UserInputLiteral {
field_name: None,
phrase,
slop,
});
let term_default_field =
(term_val(), slop_or_prefix_val()).map(|((delimiter, phrase), (slop, prefix))| {
UserInputLiteral {
field_name: None,
phrase,
delimiter,
slop,
prefix,
}
});
attempt(term_query())
.or(term_default_field)
@@ -178,9 +217,9 @@ fn negative_number<'a>() -> impl Parser<&'a str, Output = String> {
)
.map(|(s1, s2, s3): (char, String, Option<(char, String)>)| {
if let Some(('.', s3)) = s3 {
format!("{}{}.{}", s1, s2, s3)
format!("{s1}{s2}.{s3}")
} else {
format!("{}{}", s1, s2)
format!("{s1}{s2}")
}
})
}
@@ -268,7 +307,11 @@ fn range<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
/// Function that parses a set out of a Stream
/// Supports ranges like: `IN [val1 val2 val3]`
fn set<'a>() -> impl Parser<&'a str, Output = UserInputLeaf> {
let term_list = between(char('['), char(']'), sep_by(term_val(), spaces()));
let term_list = between(
char('['),
char(']'),
sep_by(term_val().map(|(_delimiter, text)| text), spaces()),
);
let set_content = ((string("IN"), spaces()), term_list).map(|(_, elements)| elements);
@@ -401,6 +444,28 @@ pub fn parse_to_ast<'a>() -> impl Parser<&'a str, Output = UserInputAst> {
spaces()
.with(optional(ast()).skip(eof()))
.map(|opt_ast| opt_ast.unwrap_or_else(UserInputAst::empty_query))
.map(rewrite_ast)
}
/// Removes unnecessary children clauses in AST
///
/// Motivated by [issue #1433](https://github.com/quickwit-oss/tantivy/issues/1433)
fn rewrite_ast(mut input: UserInputAst) -> UserInputAst {
if let UserInputAst::Clause(terms) = &mut input {
for term in terms {
rewrite_ast_clause(term);
}
}
input
}
fn rewrite_ast_clause(input: &mut (Option<Occur>, UserInputAst)) {
match input {
(None, UserInputAst::Clause(ref mut clauses)) if clauses.len() == 1 => {
*input = clauses.pop().unwrap(); // safe because clauses.len() == 1
}
_ => {}
}
}
#[cfg(test)]
@@ -419,9 +484,7 @@ mod test {
fn assert_nearly_equals(expected: f64, val: f64) {
assert!(
nearly_equals(val, expected),
"Got {}, expected {}.",
val,
expected
"Got {val}, expected {expected}."
);
}
@@ -466,9 +529,10 @@ mod test {
assert_eq!(remaining, "");
}
#[track_caller]
fn test_parse_query_to_ast_helper(query: &str, expected: &str) {
let query = parse_to_ast().parse(query).unwrap().0;
let query_str = format!("{:?}", query);
let query_str = format!("{query:?}");
assert_eq!(query_str, expected);
}
@@ -484,8 +548,9 @@ mod test {
#[test]
fn test_parse_query_to_ast_hyphen() {
test_parse_query_to_ast_helper("\"www-form-encoded\"", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("www-form-encoded", "\"www-form-encoded\"");
test_parse_query_to_ast_helper("'www-form-encoded'", "'www-form-encoded'");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
test_parse_query_to_ast_helper("www-form-encoded", "www-form-encoded");
}
#[test]
@@ -494,25 +559,25 @@ mod test {
format!("{:?}", parse_to_ast().parse("NOT")),
"Err(UnexpectedParse)"
);
test_parse_query_to_ast_helper("NOTa", "\"NOTa\"");
test_parse_query_to_ast_helper("NOT a", "(-\"a\")");
test_parse_query_to_ast_helper("NOTa", "NOTa");
test_parse_query_to_ast_helper("NOT a", "(-a)");
}
#[test]
fn test_boosting() {
assert!(parse_to_ast().parse("a^2^3").is_err());
assert!(parse_to_ast().parse("a^2^").is_err());
test_parse_query_to_ast_helper("a^3", "(\"a\")^3");
test_parse_query_to_ast_helper("a^3 b^2", "(*(\"a\")^3 *(\"b\")^2)");
test_parse_query_to_ast_helper("a^1", "\"a\"");
test_parse_query_to_ast_helper("a^3", "(a)^3");
test_parse_query_to_ast_helper("a^3 b^2", "(*(a)^3 *(b)^2)");
test_parse_query_to_ast_helper("a^1", "a");
}
#[test]
fn test_parse_query_to_ast_binary_op() {
test_parse_query_to_ast_helper("a AND b", "(+\"a\" +\"b\")");
test_parse_query_to_ast_helper("a OR b", "(?\"a\" ?\"b\")");
test_parse_query_to_ast_helper("a OR b AND c", "(?\"a\" ?(+\"b\" +\"c\"))");
test_parse_query_to_ast_helper("a AND b AND c", "(+\"a\" +\"b\" +\"c\")");
test_parse_query_to_ast_helper("a AND b", "(+a +b)");
test_parse_query_to_ast_helper("a OR b", "(?a ?b)");
test_parse_query_to_ast_helper("a OR b AND c", "(?a ?(+b +c))");
test_parse_query_to_ast_helper("a AND b AND c", "(+a +b +c)");
assert_eq!(
format!("{:?}", parse_to_ast().parse("a OR b aaa")),
"Err(UnexpectedParse)"
@@ -554,7 +619,7 @@ mod test {
fn test_occur_leaf() {
let ((occur, ast), _) = super::occur_leaf().parse("+abc").unwrap();
assert_eq!(occur, Some(Occur::Must));
assert_eq!(format!("{:?}", ast), "\"abc\"");
assert_eq!(format!("{ast:?}"), "abc");
}
#[test]
@@ -613,7 +678,7 @@ mod test {
let escaped_special_chars_re = Regex::new(ESCAPED_SPECIAL_CHARS_PATTERN).unwrap();
for special_char in SPECIAL_CHARS.iter() {
assert_eq!(
escaped_special_chars_re.replace_all(&format!("\\{}", special_char), "$1"),
escaped_special_chars_re.replace_all(&format!("\\{special_char}"), "$1"),
special_char.to_string()
);
}
@@ -708,56 +773,62 @@ mod test {
#[test]
fn test_parse_query_to_triming_spaces() {
test_parse_query_to_ast_helper(" abc", "\"abc\"");
test_parse_query_to_ast_helper("abc ", "\"abc\"");
test_parse_query_to_ast_helper("( a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc)", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("a OR abc ", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc )", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper("(a OR abc) ", "(?\"a\" ?\"abc\")");
test_parse_query_to_ast_helper(" abc", "abc");
test_parse_query_to_ast_helper("abc ", "abc");
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc)", "(?a ?abc)");
test_parse_query_to_ast_helper("a OR abc ", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc )", "(?a ?abc)");
test_parse_query_to_ast_helper("(a OR abc) ", "(?a ?abc)");
}
#[test]
fn test_parse_query_single_term() {
test_parse_query_to_ast_helper("abc", "\"abc\"");
test_parse_query_to_ast_helper("abc", "abc");
}
#[test]
fn test_parse_query_default_clause() {
test_parse_query_to_ast_helper("a b", "(*\"a\" *\"b\")");
test_parse_query_to_ast_helper("a b", "(*a *b)");
}
#[test]
fn test_parse_query_must_default_clause() {
test_parse_query_to_ast_helper("+(a b)", "(*\"a\" *\"b\")");
test_parse_query_to_ast_helper("+(a b)", "(*a *b)");
}
#[test]
fn test_parse_query_must_single_term() {
test_parse_query_to_ast_helper("+d", "\"d\"");
test_parse_query_to_ast_helper("+d", "d");
}
#[test]
fn test_single_term_with_field() {
test_parse_query_to_ast_helper("abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto");
}
#[test]
fn test_phrase_with_field() {
test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\"");
test_parse_query_to_ast_helper("abc:'happy tax payer'", "\"abc\":'happy tax payer'");
}
#[test]
fn test_single_term_with_float() {
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":\"1.1\"");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":\"1.1\"");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":\"1.1\"");
test_parse_query_to_ast_helper("abc:1.1", "\"abc\":1.1");
test_parse_query_to_ast_helper("a.b.c:1.1", "\"a.b.c\":1.1");
test_parse_query_to_ast_helper("a\\ b\\ c:1.1", "\"a b c\":1.1");
}
#[test]
fn test_must_clause() {
test_parse_query_to_ast_helper("(+a +b)", "(+\"a\" +\"b\")");
test_parse_query_to_ast_helper("(+a +b)", "(+a +b)");
}
#[test]
fn test_parse_test_query_plus_a_b_plus_d() {
test_parse_query_to_ast_helper("+(a b) +d", "(+(*\"a\" *\"b\") +\"d\")");
test_parse_query_to_ast_helper("+(a b) +d", "(+(*a *b) +d)");
}
#[test]
@@ -770,13 +841,13 @@ mod test {
#[test]
fn test_parse_test_query_other() {
test_parse_query_to_ast_helper("(+a +b) d", "(*(+\"a\" +\"b\") *\"d\")");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":\"toto\"");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":\"toto\"");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":\"toto\" -\"titi\")");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":\"toto\")");
test_parse_query_to_ast_helper("(+a +b) d", "(*(+a +b) *d)");
test_parse_query_to_ast_helper("+abc:toto", "\"abc\":toto");
test_parse_query_to_ast_helper("+a\\+b\\+c:toto", "\"a+b+c\":toto");
test_parse_query_to_ast_helper("(+abc:toto -titi)", "(+\"abc\":toto -titi)");
test_parse_query_to_ast_helper("-abc:toto", "(-\"abc\":toto)");
test_is_parse_err("--abc:toto");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":\"a\" *\"b\")");
test_parse_query_to_ast_helper("abc:a b", "(*\"abc\":a *b)");
test_parse_query_to_ast_helper("abc:\"a b\"", "\"abc\":\"a b\"");
test_parse_query_to_ast_helper("foo:[1 TO 5]", "\"foo\":[\"1\" TO \"5\"]");
}
@@ -801,15 +872,42 @@ mod test {
assert!(parse_to_ast().parse("foo:\"a b\"~").is_err());
assert!(parse_to_ast().parse("\"a b\"~a").is_err());
assert!(parse_to_ast().parse("\"a b\"~100000000000000000").is_err());
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *\"~4\")");
test_parse_query_to_ast_helper("\"a b\"^2~4", "(*(\"a b\")^2 *~4)");
test_parse_query_to_ast_helper("\"~Document\"", "\"~Document\"");
test_parse_query_to_ast_helper("~Document", "\"~Document\"");
test_parse_query_to_ast_helper("a~2", "\"a~2\"");
test_parse_query_to_ast_helper("~Document", "~Document");
test_parse_query_to_ast_helper("a~2", "a~2");
test_parse_query_to_ast_helper("\"a b\"~0", "\"a b\"");
test_parse_query_to_ast_helper("\"a b\"~1", "\"a b\"~1");
test_parse_query_to_ast_helper("\"a b\"~3", "\"a b\"~3");
test_parse_query_to_ast_helper("foo:\"a b\"~300", "\"foo\":\"a b\"~300");
test_parse_query_to_ast_helper("\"a b\"~300^2", "(\"a b\"~300)^2");
}
#[test]
fn test_phrase_prefix() {
test_parse_query_to_ast_helper("\"a b\"*", "\"a b\"*");
test_parse_query_to_ast_helper("\"a\"*", "\"a\"*");
test_parse_query_to_ast_helper("\"\"*", "\"\"*");
test_parse_query_to_ast_helper("foo:\"a b\"*", "\"foo\":\"a b\"*");
test_parse_query_to_ast_helper("foo:\"a\"*", "\"foo\":\"a\"*");
test_parse_query_to_ast_helper("foo:\"\"*", "\"foo\":\"\"*");
}
#[test]
fn test_not_queries_are_consistent() {
test_parse_query_to_ast_helper("tata -toto", "(*tata -toto)");
test_parse_query_to_ast_helper("tata NOT toto", "(*tata -toto)");
}
#[test]
fn test_escaping() {
test_parse_query_to_ast_helper(
r#"myfield:"hello\"happy\'tax""#,
r#""myfield":"hello"happy'tax""#,
);
test_parse_query_to_ast_helper(
r#"myfield:'hello\"happy\'tax'"#,
r#""myfield":'hello"happy'tax'"#,
);
}
}

View File

@@ -19,7 +19,7 @@ pub enum UserInputLeaf {
}
impl Debug for UserInputLeaf {
fn fmt(&self, formatter: &mut Formatter<'_>) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut Formatter) -> Result<(), fmt::Error> {
match self {
UserInputLeaf::Literal(literal) => literal.fmt(formatter),
UserInputLeaf::Range {
@@ -28,7 +28,7 @@ impl Debug for UserInputLeaf {
ref upper,
} => {
if let Some(ref field) = field {
write!(formatter, "\"{}\":", field)?;
write!(formatter, "\"{field}\":")?;
}
lower.display_lower(formatter)?;
write!(formatter, " TO ")?;
@@ -37,14 +37,14 @@ impl Debug for UserInputLeaf {
}
UserInputLeaf::Set { field, elements } => {
if let Some(ref field) = field {
write!(formatter, "\"{}\": ", field)?;
write!(formatter, "\"{field}\": ")?;
}
write!(formatter, "IN [")?;
for (i, element) in elements.iter().enumerate() {
for (i, text) in elements.iter().enumerate() {
if i != 0 {
write!(formatter, " ")?;
}
write!(formatter, "\"{}\"", element)?;
write!(formatter, "\"{text}\"")?;
}
write!(formatter, "]")
}
@@ -53,21 +53,42 @@ impl Debug for UserInputLeaf {
}
}
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
pub enum Delimiter {
SingleQuotes,
DoubleQuotes,
None,
}
#[derive(PartialEq)]
pub struct UserInputLiteral {
pub field_name: Option<String>,
pub phrase: String,
pub delimiter: Delimiter,
pub slop: u32,
pub prefix: bool,
}
impl fmt::Debug for UserInputLiteral {
fn fmt(&self, formatter: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
fn fmt(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
if let Some(ref field) = self.field_name {
write!(formatter, "\"{}\":", field)?;
write!(formatter, "\"{field}\":")?;
}
match self.delimiter {
Delimiter::SingleQuotes => {
write!(formatter, "'{}'", self.phrase)?;
}
Delimiter::DoubleQuotes => {
write!(formatter, "\"{}\"", self.phrase)?;
}
Delimiter::None => {
write!(formatter, "{}", self.phrase)?;
}
}
write!(formatter, "\"{}\"", self.phrase)?;
if self.slop > 0 {
write!(formatter, "~{}", self.slop)?;
} else if self.prefix {
write!(formatter, "*")?;
}
Ok(())
}
@@ -83,16 +104,16 @@ pub enum UserInputBound {
impl UserInputBound {
fn display_lower(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{}\"", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{}\"", word),
UserInputBound::Inclusive(ref word) => write!(formatter, "[\"{word}\""),
UserInputBound::Exclusive(ref word) => write!(formatter, "{{\"{word}\""),
UserInputBound::Unbounded => write!(formatter, "{{\"*\""),
}
}
fn display_upper(&self, formatter: &mut fmt::Formatter) -> Result<(), fmt::Error> {
match *self {
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{}\"]", word),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{}\"}}", word),
UserInputBound::Inclusive(ref word) => write!(formatter, "\"{word}\"]"),
UserInputBound::Exclusive(ref word) => write!(formatter, "\"{word}\"}}"),
UserInputBound::Unbounded => write!(formatter, "\"*\"}}"),
}
}
@@ -163,9 +184,9 @@ fn print_occur_ast(
formatter: &mut fmt::Formatter,
) -> fmt::Result {
if let Some(occur) = occur_opt {
write!(formatter, "{}{:?}", occur, ast)?;
write!(formatter, "{occur}{ast:?}")?;
} else {
write!(formatter, "*{:?}", ast)?;
write!(formatter, "*{ast:?}")?;
}
Ok(())
}
@@ -187,8 +208,8 @@ impl fmt::Debug for UserInputAst {
}
Ok(())
}
UserInputAst::Leaf(ref subquery) => write!(formatter, "{:?}", subquery),
UserInputAst::Boost(ref leaf, boost) => write!(formatter, "({:?})^{}", leaf, boost),
UserInputAst::Leaf(ref subquery) => write!(formatter, "{subquery:?}"),
UserInputAst::Boost(ref leaf, boost) => write!(formatter, "({leaf:?})^{boost}"),
}
}
}

View File

@@ -1,19 +1,36 @@
#[cfg(all(test, feature = "unstable"))]
mod bench {
use columnar::Cardinality;
use rand::prelude::SliceRandom;
use rand::{thread_rng, Rng};
use rand::rngs::StdRng;
use rand::{Rng, SeedableRng};
use rand_distr::Distribution;
use serde_json::json;
use test::{self, Bencher};
use super::*;
use crate::aggregation::bucket::{
CustomOrder, HistogramAggregation, HistogramBounds, Order, OrderTarget, TermsAggregation,
};
use crate::aggregation::metric::StatsAggregation;
use crate::query::AllQuery;
use crate::schema::{Schema, TextFieldIndexing, FAST, STRING};
use crate::Index;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
use crate::schema::{IndexRecordOption, Schema, TextFieldIndexing, FAST, STRING};
use crate::{Index, Term};
#[derive(Clone, Copy, Hash, Default, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum Cardinality {
/// All documents contain exactly one value.
/// `Full` is the default for auto-detecting the Cardinality, since it is the most strict.
#[default]
Full = 0,
/// All documents contain at most one value.
Optional = 1,
/// All documents may contain any number of values.
Multivalued = 2,
/// 1 / 20 documents has a value
Sparse = 3,
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
AggregationCollector::from_aggs(agg_req, Default::default())
}
fn get_test_index_bench(cardinality: Cardinality) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
@@ -23,6 +40,7 @@ mod bench {
)
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let json_field = schema_builder.add_json_field("json", FAST);
let text_field_many_terms = schema_builder.add_text_field("text_many_terms", STRING | FAST);
let text_field_few_terms = schema_builder.add_text_field("text_few_terms", STRING | FAST);
let score_fieldtype = crate::schema::NumericOptions::default().set_fast();
@@ -31,12 +49,15 @@ mod bench {
let score_field_i64 = schema_builder.add_i64_field("score_i64", score_fieldtype);
let index = Index::create_from_tempdir(schema_builder.build())?;
let few_terms_data = vec!["INFO", "ERROR", "WARN", "DEBUG"];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let many_terms_data = (0..150_000)
.map(|num| format!("author{}", num))
.collect::<Vec<_>>();
{
let mut rng = thread_rng();
let mut index_writer = index.writer_with_num_threads(1, 100_000_000)?;
let mut rng = StdRng::from_seed([1u8; 32]);
let mut index_writer = index.writer_with_num_threads(1, 200_000_000)?;
// To make the different test cases comparable we just change one doc to force the
// cardinality
if cardinality == Cardinality::Optional {
@@ -44,6 +65,8 @@ mod bench {
}
if cardinality == Cardinality::Multivalued {
index_writer.add_document(doc!(
json_field => json!({"mixed_type": 10.0}),
json_field => json!({"mixed_type": 10.0}),
text_field => "cool",
text_field => "cool",
text_field_many_terms => "cool",
@@ -52,22 +75,39 @@ mod bench {
text_field_few_terms => "cool",
score_field => 1u64,
score_field => 1u64,
score_field_f64 => 1.0,
score_field_f64 => 1.0,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => 1i64,
score_field_i64 => 1i64,
))?;
}
for _ in 0..1_000_000 {
let mut doc_with_value = 1_000_000;
if cardinality == Cardinality::Sparse {
doc_with_value /= 20;
}
let val_max = 1_000_000.0;
for _ in 0..doc_with_value {
let val: f64 = rng.gen_range(0.0..1_000_000.0);
let json = if rng.gen_bool(0.1) {
// 10% are numeric values
json!({ "mixed_type": val })
} else {
json!({"mixed_type": many_terms_data.choose(&mut rng).unwrap().to_string()})
};
index_writer.add_document(doc!(
text_field => "cool",
json_field => json,
text_field_many_terms => many_terms_data.choose(&mut rng).unwrap().to_string(),
text_field_few_terms => few_terms_data.choose(&mut rng).unwrap().to_string(),
score_field => val as u64,
score_field_f64 => val,
score_field_f64 => lg_norm.sample(&mut rng),
score_field_i64 => val as i64,
))?;
if cardinality == Cardinality::Sparse {
for _ in 0..20 {
index_writer.add_document(doc!(text_field => "cool"))?;
}
}
}
// writing the segment
index_writer.commit()?;
@@ -95,6 +135,12 @@ mod bench {
fn [<$x _multi>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Multivalued)
}
#[bench]
fn [<$x _sparse>](b: &mut Bencher) {
[<$x _card>](b, Cardinality::Sparse)
}
}
};
}
@@ -112,14 +158,10 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = vec![(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": { "avg": { "field": "score", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -141,14 +183,10 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score_f64".to_string(),
))),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "stats": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -170,14 +208,10 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64", } }
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -186,6 +220,31 @@ mod bench {
});
}
bench_all_cardinalities!(bench_aggregation_percentiles_f64);
fn bench_aggregation_percentiles_f64_card(b: &mut Bencher, cardinality: Cardinality) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_average_u64_and_f64);
fn bench_aggregation_average_u64_and_f64_card(b: &mut Bencher, cardinality: Cardinality) {
@@ -199,22 +258,11 @@ mod bench {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = vec![
(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
),
(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
),
]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average_f64": { "avg": { "field": "score_f64" } },
"average": { "avg": { "field": "score" } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -230,21 +278,10 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_few_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_few_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
@@ -260,30 +297,42 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: sub_agg_req,
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "text_many_terms" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
.into(),
),
)]
.into_iter()
.collect();
},
}))
.unwrap();
let collector = get_collector(agg_req);
let searcher = reader.searcher();
searcher.search(&AllQuery, &collector).unwrap()
});
}
bench_all_cardinalities!(bench_aggregation_terms_many_json_mixed_type_with_sub_agg);
fn bench_aggregation_terms_many_json_mixed_type_with_sub_agg_card(
b: &mut Bencher,
cardinality: Cardinality,
) {
let index = get_test_index_bench(cardinality).unwrap();
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": {
"terms": { "field": "json.mixed_type" },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
},
}))
.unwrap();
let collector = get_collector(agg_req);
@@ -299,21 +348,10 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms" } },
}))
.unwrap();
let collector = get_collector(agg_req);
@@ -329,25 +367,10 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req: Aggregations = vec![(
"my_texts".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text_many_terms".to_string(),
order: Some(CustomOrder {
order: Order::Desc,
target: OrderTarget::Key,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
}))
.unwrap();
let collector = get_collector(agg_req);
@@ -363,29 +386,17 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..30000f64).into(),
(30000f64..40000f64).into(),
(40000f64..50000f64).into(),
(50000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"range_f64": { "range": { "field": "score_f64", "ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
] } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -401,38 +412,25 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..30000f64).into(),
(30000f64..40000f64).into(),
(40000f64..50000f64).into(),
(50000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req,
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 30000 },
{ "from": 30000, "to": 40000 },
{ "from": 40000, "to": 50000 },
{ "from": 50000, "to": 60000 }
]
},
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
.into(),
),
)]
.into_iter()
.collect();
},
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -453,26 +451,10 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64,
hard_bounds: Some(HistogramBounds {
min: 1000.0,
max: 300_000.0,
}),
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": { "histogram": { "field": "score_f64", "interval": 100, "hard_bounds": { "min": 1000, "max": 300000 } } },
}))
.unwrap();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
@@ -487,31 +469,15 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let sub_agg_req: Aggregations = vec![(
"average_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: sub_agg_req,
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": { "field": "score_f64", "interval": 100 },
"aggs": {
"average_f64": { "avg": { "field": "score_f64" } }
}
.into(),
),
)]
.into_iter()
.collect();
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -527,22 +493,15 @@ mod bench {
let reader = index.reader().unwrap();
b.iter(|| {
let agg_req_1: Aggregations = vec![(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Histogram(HistogramAggregation {
field: "score_f64".to_string(),
interval: 100f64, // 1000 buckets
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"histogram": {
"field": "score_f64",
"interval": 100 // 1000 buckets
},
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -564,43 +523,23 @@ mod bench {
IndexRecordOption::Basic,
);
let sub_agg_req_1: Aggregations = vec![(
"average_in_range".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = vec![
(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score".to_string()),
)),
),
(
"rangef64".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7000f64).into(),
(7000f64..20000f64).into(),
(20000f64..60000f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req_1,
}
.into(),
),
),
]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"rangef64": {
"range": {
"field": "score_f64",
"ranges": [
{ "from": 3, "to": 7000 },
{ "from": 7000, "to": 20000 },
{ "from": 20000, "to": 60000 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "score" } }
}
},
"average": { "avg": { "field": "score" } }
}))
.unwrap();
let collector = get_collector(agg_req_1);

View File

@@ -1,12 +1,11 @@
use std::collections::HashMap;
use std::sync::atomic::AtomicU64;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use common::ByteCount;
use super::collector::DEFAULT_MEMORY_LIMIT;
use super::{AggregationError, DEFAULT_BUCKET_LIMIT};
use crate::TantivyError;
/// An estimate for memory consumption. Non recursive
pub trait MemoryConsumption {
@@ -15,8 +14,8 @@ pub trait MemoryConsumption {
impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
fn memory_consumption(&self) -> usize {
let num_items = self.capacity();
(std::mem::size_of::<K>() + std::mem::size_of::<V>()) * num_items
let capacity = self.capacity();
(std::mem::size_of::<K>() + std::mem::size_of::<V>() + 1) * capacity
}
}
@@ -61,6 +60,8 @@ impl AggregationLimits {
/// *bucket_limit*
/// Limits the maximum number of buckets returned from an aggregation request.
/// bucket_limit will default to `DEFAULT_BUCKET_LIMIT` (65000)
///
/// Note: The returned instance contains a Arc shared counter to track memory consumption.
pub fn new(memory_limit: Option<u64>, bucket_limit: Option<u32>) -> Self {
Self {
memory_consumption: Default::default(),
@@ -68,28 +69,68 @@ impl AggregationLimits {
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
}
}
pub(crate) fn validate_memory_consumption(&self) -> crate::Result<()> {
if self.get_memory_consumed() > self.memory_limit {
return Err(TantivyError::AggregationError(
AggregationError::MemoryExceeded {
limit: self.memory_limit,
current: self.get_memory_consumed(),
},
));
/// Create a new ResourceLimitGuard, that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption),
/// The memory_limit in bytes
memory_limit: self.memory_limit,
allocated_with_the_guard: 0,
}
}
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
Ok(())
}
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) {
self.memory_consumption
.fetch_add(num_bytes, std::sync::atomic::Ordering::Relaxed);
}
/// Returns the estimated memory consumed by the aggregations
pub fn get_memory_consumed(&self) -> ByteCount {
self.memory_consumption
.load(std::sync::atomic::Ordering::Relaxed)
.into()
}
pub(crate) fn get_bucket_limit(&self) -> u32 {
self.bucket_limit
}
}
fn validate_memory_consumption(
memory_consumption: &AtomicU64,
memory_limit: ByteCount,
) -> Result<(), AggregationError> {
// Load the estimated memory consumed by the aggregations
let memory_consumed: ByteCount = memory_consumption.load(Ordering::Relaxed).into();
if memory_consumed > memory_limit {
return Err(AggregationError::MemoryExceeded {
limit: memory_limit,
current: memory_consumed,
});
}
Ok(())
}
pub struct ResourceLimitGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
memory_limit: ByteCount,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}
impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, num_bytes: u64) -> crate::Result<()> {
self.memory_consumption
.fetch_add(num_bytes, Ordering::Relaxed);
validate_memory_consumption(&self.memory_consumption, self.memory_limit)?;
Ok(())
}
}
impl Drop for ResourceLimitGuard {
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
/// This is used to clear the segment specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}

View File

@@ -9,25 +9,7 @@
//! # Example
//!
//! ```
//! use tantivy::aggregation::bucket::RangeAggregation;
//! use tantivy::aggregation::agg_req::BucketAggregationType;
//! use tantivy::aggregation::agg_req::{Aggregation, Aggregations};
//! use tantivy::aggregation::agg_req::BucketAggregation;
//! let agg_req1: Aggregations = vec![
//! (
//! "range".to_string(),
//! Aggregation::Bucket(Box::new(BucketAggregation {
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
//! field: "score".to_string(),
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
//! keyed: false,
//! }),
//! sub_aggregation: Default::default(),
//! })),
//! ),
//! ]
//! .into_iter()
//! .collect();
//! use tantivy::aggregation::agg_req::Aggregations;
//!
//! let elasticsearch_compatible_json_req = r#"
//! {
@@ -41,89 +23,51 @@
//! }
//! }
//! }"#;
//! let agg_req2: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! assert_eq!(agg_req1, agg_req2);
//! let _agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
//! ```
use std::collections::{HashMap, HashSet};
use serde::{Deserialize, Serialize};
pub use super::bucket::RangeAggregation;
use super::bucket::{DateHistogramAggregationReq, HistogramAggregation, TermsAggregation};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
SumAggregation,
use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
PercentilesAggregationReq, StatsAggregation, SumAggregation,
};
use super::VecWithNames;
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
/// defined names. It is also used in [buckets](BucketAggregation) to define sub-aggregations.
/// defined names. It is also used in buckets aggregations to define sub-aggregations.
///
/// The key is the user defined name of the aggregation.
pub type Aggregations = HashMap<String, Aggregation>;
/// Like Aggregations, but optimized to work with the aggregation result
#[derive(Clone, Debug)]
pub(crate) struct AggregationsInternal {
pub(crate) metrics: VecWithNames<MetricAggregation>,
pub(crate) buckets: VecWithNames<BucketAggregationInternal>,
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Aggregation request.
///
/// An aggregation is either a bucket or a metric.
pub struct Aggregation {
/// The aggregation variant, which can be either a bucket or a metric.
#[serde(flatten)]
pub agg: AggregationVariants,
/// The sub_aggregations, only valid for bucket type aggregations. Each bucket will aggregate
/// on the document set in the bucket.
#[serde(rename = "aggs")]
#[serde(default)]
#[serde(skip_serializing_if = "Aggregations::is_empty")]
pub sub_aggregation: Aggregations,
}
impl From<Aggregations> for AggregationsInternal {
fn from(aggs: Aggregations) -> Self {
let mut metrics = vec![];
let mut buckets = vec![];
for (key, agg) in aggs {
match agg {
Aggregation::Bucket(bucket) => buckets.push((
key,
BucketAggregationInternal {
bucket_agg: bucket.bucket_agg,
sub_aggregation: bucket.sub_aggregation.into(),
},
)),
Aggregation::Metric(metric) => metrics.push((key, metric)),
}
}
Self {
metrics: VecWithNames::from_entries(metrics),
buckets: VecWithNames::from_entries(buckets),
}
impl Aggregation {
pub(crate) fn sub_aggregation(&self) -> &Aggregations {
&self.sub_aggregation
}
}
#[derive(Clone, Debug)]
// Like BucketAggregation, but optimized to work with the result
pub(crate) struct BucketAggregationInternal {
/// Bucket aggregation strategy to group documents.
pub bucket_agg: BucketAggregationType,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
pub sub_aggregation: AggregationsInternal,
}
impl BucketAggregationInternal {
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
match &self.bucket_agg {
BucketAggregationType::Range(range) => Some(range),
_ => None,
}
}
pub(crate) fn as_histogram(&self) -> crate::Result<Option<HistogramAggregation>> {
match &self.bucket_agg {
BucketAggregationType::Histogram(histogram) => Ok(Some(histogram.clone())),
BucketAggregationType::DateHistogram(histogram) => {
Ok(Some(histogram.to_histogram_req()?))
}
_ => Ok(None),
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self.bucket_agg {
BucketAggregationType::Terms(terms) => Some(terms),
_ => None,
}
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
fast_field_names.insert(self.agg.get_fast_field_name().to_string());
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
}
}
@@ -136,97 +80,24 @@ pub fn get_fast_field_names(aggs: &Aggregations) -> HashSet<String> {
fast_field_names
}
/// Aggregation request of [`BucketAggregation`] or [`MetricAggregation`].
///
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum Aggregation {
/// Bucket aggregation, see [`BucketAggregation`] for details.
Bucket(Box<BucketAggregation>),
/// Metric aggregation, see [`MetricAggregation`] for details.
Metric(MetricAggregation),
}
impl Aggregation {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
match self {
Aggregation::Bucket(bucket) => bucket.get_fast_field_names(fast_field_names),
Aggregation::Metric(metric) => {
fast_field_names.insert(metric.get_fast_field_name().to_string());
}
}
}
}
/// BucketAggregations create buckets of documents. Each bucket is associated with a rule which
/// determines whether or not a document in the falls into it. In other words, the buckets
/// effectively define document sets. Buckets are not necessarily disjunct, therefore a document can
/// fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
/// compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
/// metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
/// the buckets created by their "parent" bucket aggregation. There are different bucket
/// aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
/// define fixed number of multiple buckets, and others dynamically create the buckets during the
/// aggregation process.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct BucketAggregation {
/// Bucket aggregation strategy to group documents.
#[serde(flatten)]
pub bucket_agg: BucketAggregationType,
/// The sub_aggregations in the buckets. Each bucket will aggregate on the document set in the
/// bucket.
#[serde(rename = "aggs")]
#[serde(default)]
#[serde(skip_serializing_if = "Aggregations::is_empty")]
pub sub_aggregation: Aggregations,
}
impl BucketAggregation {
fn get_fast_field_names(&self, fast_field_names: &mut HashSet<String>) {
let fast_field_name = self.bucket_agg.get_fast_field_name();
fast_field_names.insert(fast_field_name.to_string());
fast_field_names.extend(get_fast_field_names(&self.sub_aggregation));
}
}
/// The bucket aggregation types.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum BucketAggregationType {
/// All aggregation types.
pub enum AggregationVariants {
// Bucket aggregation types
/// Put data into buckets of user-defined ranges.
#[serde(rename = "range")]
Range(RangeAggregation),
/// Put data into buckets of user-defined ranges.
/// Put data into a histogram.
#[serde(rename = "histogram")]
Histogram(HistogramAggregation),
/// Put data into buckets of user-defined ranges.
/// Put data into a date histogram.
#[serde(rename = "date_histogram")]
DateHistogram(DateHistogramAggregationReq),
/// Put data into buckets of terms.
#[serde(rename = "terms")]
Terms(TermsAggregation),
}
impl BucketAggregationType {
fn get_fast_field_name(&self) -> &str {
match self {
BucketAggregationType::Terms(terms) => terms.field.as_str(),
BucketAggregationType::Range(range) => range.field.as_str(),
BucketAggregationType::Histogram(histogram) => histogram.field.as_str(),
BucketAggregationType::DateHistogram(histogram) => histogram.field.as_str(),
}
}
}
/// The aggregations in this family compute metrics based on values extracted
/// from the documents that are being aggregated. Values are extracted from the fast field of
/// the document.
/// Some aggregations output a single numeric metric (e.g. Average) and are called
/// single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
/// called multi-value numeric metrics aggregation.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum MetricAggregation {
// Metric aggregation types
/// Computes the average of the extracted values.
#[serde(rename = "avg")]
Average(AverageAggregation),
@@ -246,25 +117,107 @@ pub enum MetricAggregation {
/// Computes the sum of the extracted values.
#[serde(rename = "sum")]
Sum(SumAggregation),
/// Computes the sum of the extracted values.
#[serde(rename = "percentiles")]
Percentiles(PercentilesAggregationReq),
}
impl MetricAggregation {
impl AggregationVariants {
fn get_fast_field_name(&self) -> &str {
match self {
MetricAggregation::Average(avg) => avg.field_name(),
MetricAggregation::Count(count) => count.field_name(),
MetricAggregation::Max(max) => max.field_name(),
MetricAggregation::Min(min) => min.field_name(),
MetricAggregation::Stats(stats) => stats.field_name(),
MetricAggregation::Sum(sum) => sum.field_name(),
AggregationVariants::Terms(terms) => terms.field.as_str(),
AggregationVariants::Range(range) => range.field.as_str(),
AggregationVariants::Histogram(histogram) => histogram.field.as_str(),
AggregationVariants::DateHistogram(histogram) => histogram.field.as_str(),
AggregationVariants::Average(avg) => avg.field_name(),
AggregationVariants::Count(count) => count.field_name(),
AggregationVariants::Max(max) => max.field_name(),
AggregationVariants::Min(min) => min.field_name(),
AggregationVariants::Stats(stats) => stats.field_name(),
AggregationVariants::Sum(sum) => sum.field_name(),
AggregationVariants::Percentiles(per) => per.field_name(),
}
}
pub(crate) fn as_range(&self) -> Option<&RangeAggregation> {
match &self {
AggregationVariants::Range(range) => Some(range),
_ => None,
}
}
pub(crate) fn as_histogram(&self) -> crate::Result<Option<HistogramAggregation>> {
match &self {
AggregationVariants::Histogram(histogram) => Ok(Some(histogram.clone())),
AggregationVariants::DateHistogram(histogram) => {
Ok(Some(histogram.to_histogram_req()?))
}
_ => Ok(None),
}
}
pub(crate) fn as_term(&self) -> Option<&TermsAggregation> {
match &self {
AggregationVariants::Terms(terms) => Some(terms),
_ => None,
}
}
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
match &self {
AggregationVariants::Percentiles(percentile_req) => Some(percentile_req),
_ => None,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn deser_json_test() {
let agg_req_json = r#"{
"price_avg": { "avg": { "field": "price" } },
"price_count": { "value_count": { "field": "price" } },
"price_max": { "max": { "field": "price" } },
"price_min": { "min": { "field": "price" } },
"price_stats": { "stats": { "field": "price" } },
"price_sum": { "sum": { "field": "price" } }
}"#;
let _agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
}
#[test]
fn deser_json_test_bucket() {
let agg_req_json = r#"
{
"termagg": {
"terms": {
"field": "json.mixed_type",
"order": { "min_price": "desc" }
},
"aggs": {
"min_price": { "min": { "field": "json.mixed_type" } }
}
},
"rangeagg": {
"range": {
"field": "json.mixed_type",
"ranges": [
{ "to": 3.0 },
{ "from": 19.0, "to": 20.0 },
{ "from": 20.0 }
]
},
"aggs": {
"average_in_range": { "avg": { "field": "json.mixed_type" } }
}
}
} "#;
let _agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
}
#[test]
fn test_metric_aggregations_deser() {
let agg_req_json = r#"{
@@ -278,46 +231,27 @@ mod tests {
let agg_req: Aggregations = serde_json::from_str(agg_req_json).unwrap();
assert!(
matches!(agg_req.get("price_avg").unwrap(), Aggregation::Metric(MetricAggregation::Average(avg)) if avg.field == "price")
matches!(&agg_req.get("price_avg").unwrap().agg, AggregationVariants::Average(avg) if avg.field == "price")
);
assert!(
matches!(agg_req.get("price_count").unwrap(), Aggregation::Metric(MetricAggregation::Count(count)) if count.field == "price")
matches!(&agg_req.get("price_count").unwrap().agg, AggregationVariants::Count(count) if count.field == "price")
);
assert!(
matches!(agg_req.get("price_max").unwrap(), Aggregation::Metric(MetricAggregation::Max(max)) if max.field == "price")
matches!(&agg_req.get("price_max").unwrap().agg, AggregationVariants::Max(max) if max.field == "price")
);
assert!(
matches!(agg_req.get("price_min").unwrap(), Aggregation::Metric(MetricAggregation::Min(min)) if min.field == "price")
matches!(&agg_req.get("price_min").unwrap().agg, AggregationVariants::Min(min) if min.field == "price")
);
assert!(
matches!(agg_req.get("price_stats").unwrap(), Aggregation::Metric(MetricAggregation::Stats(stats)) if stats.field == "price")
matches!(&agg_req.get("price_stats").unwrap().agg, AggregationVariants::Stats(stats) if stats.field == "price")
);
assert!(
matches!(agg_req.get("price_sum").unwrap(), Aggregation::Metric(MetricAggregation::Sum(sum)) if sum.field == "price")
matches!(&agg_req.get("price_sum").unwrap().agg, AggregationVariants::Sum(sum) if sum.field == "price")
);
}
#[test]
fn serialize_to_json_test() {
let agg_req1: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
],
keyed: true,
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let elasticsearch_compatible_json_req = r#"{
"range": {
"range": {
@@ -342,57 +276,56 @@ mod tests {
}
}
}"#;
let agg_req1: Aggregations =
{ serde_json::from_str(elasticsearch_compatible_json_req).unwrap() };
let agg_req2: String = serde_json::to_string_pretty(&agg_req1).unwrap();
assert_eq!(agg_req2, elasticsearch_compatible_json_req);
}
#[test]
fn test_get_fast_field_names() {
let agg_req2: Aggregations = vec![
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score2".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"metric".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("field123".to_string()),
)),
),
]
.into_iter()
.collect();
let agg_req1: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(f64::MIN..3f64).into(),
(3f64..7f64).into(),
(7f64..20f64).into(),
(20f64..f64::MAX).into(),
let range_agg: Aggregation = {
serde_json::from_value(json!({
"range": {
"field": "score",
"ranges": [
{ "to": 3.0 },
{ "from": 3.0, "to": 7.0 },
{ "from": 7.0, "to": 20.0 },
{ "from": 20.0 }
],
..Default::default()
}),
sub_aggregation: agg_req2,
})),
)]
.into_iter()
.collect();
}
}))
.unwrap()
};
let agg_req1: Aggregations = {
serde_json::from_value(json!({
"range1": range_agg,
"range2":{
"range": {
"field": "score2",
"ranges": [
{ "to": 3.0 },
{ "from": 3.0, "to": 7.0 },
{ "from": 7.0, "to": 20.0 },
{ "from": 20.0 }
],
},
"aggs": {
"metric": {
"avg": {
"field": "field123"
}
}
}
}
}))
.unwrap()
};
assert_eq!(
get_fast_field_names(&agg_req1),

View File

@@ -2,7 +2,8 @@
use columnar::{Column, ColumnBlockAccessor, ColumnType, StrColumn};
use super::agg_req::{Aggregation, Aggregations, BucketAggregationType, MetricAggregation};
use super::agg_limits::ResourceLimitGuard;
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
use super::bucket::{
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
};
@@ -14,36 +15,114 @@ use super::segment_agg_result::AggregationLimits;
use super::VecWithNames;
use crate::SegmentReader;
#[derive(Clone, Default)]
#[derive(Default)]
pub(crate) struct AggregationsWithAccessor {
pub metrics: VecWithNames<MetricAggregationWithAccessor>,
pub buckets: VecWithNames<BucketAggregationWithAccessor>,
pub aggs: VecWithNames<AggregationWithAccessor>,
}
impl AggregationsWithAccessor {
fn from_data(
metrics: VecWithNames<MetricAggregationWithAccessor>,
buckets: VecWithNames<BucketAggregationWithAccessor>,
) -> Self {
Self { metrics, buckets }
fn from_data(aggs: VecWithNames<AggregationWithAccessor>) -> Self {
Self { aggs }
}
pub fn is_empty(&self) -> bool {
self.metrics.is_empty() && self.buckets.is_empty()
self.aggs.is_empty()
}
}
#[derive(Clone)]
pub struct BucketAggregationWithAccessor {
pub struct AggregationWithAccessor {
/// In general there can be buckets without fast field access, e.g. buckets that are created
/// based on search terms. So eventually this needs to be Option or moved.
/// based on search terms. That is not that case currently, but eventually this needs to be
/// Option or moved.
pub(crate) accessor: Column<u64>,
pub(crate) str_dict_column: Option<StrColumn>,
pub(crate) field_type: ColumnType,
pub(crate) bucket_agg: BucketAggregationType,
/// In case there are multiple types of fast fields, e.g. string and numeric.
/// Only used for term aggregations currently.
pub(crate) accessor2: Option<(Column<u64>, ColumnType)>,
pub(crate) sub_aggregation: AggregationsWithAccessor,
pub(crate) limits: AggregationLimits,
pub(crate) limits: ResourceLimitGuard,
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
pub(crate) agg: Aggregation,
}
impl AggregationWithAccessor {
fn try_from_agg(
agg: &Aggregation,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
limits: AggregationLimits,
) -> crate::Result<AggregationWithAccessor> {
let mut str_dict_column = None;
let mut accessor2 = None;
use AggregationVariants::*;
let (accessor, field_type) = match &agg.agg {
Range(RangeAggregation {
field: field_name, ..
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
Histogram(HistogramAggregation {
field: field_name, ..
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
DateHistogram(DateHistogramAggregationReq {
field: field_name, ..
}) => get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?,
Terms(TermsAggregation {
field: field_name, ..
}) => {
str_dict_column = reader.fast_fields().str(field_name)?;
let allowed_column_types = [
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
// ColumnType::Bytes Unsupported
// ColumnType::Bool Unsupported
// ColumnType::IpAddr Unsupported
// ColumnType::DateTime Unsupported
];
let mut columns =
get_all_ff_reader_or_empty(reader, field_name, Some(&allowed_column_types))?;
let first = columns.pop().unwrap();
accessor2 = columns.pop();
first
}
Average(AverageAggregation { field: field_name })
| Count(CountAggregation { field: field_name })
| Max(MaxAggregation { field: field_name })
| Min(MinAggregation { field: field_name })
| Stats(StatsAggregation { field: field_name })
| Sum(SumAggregation { field: field_name }) => {
let (accessor, field_type) =
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
(accessor, field_type)
}
Percentiles(percentiles) => {
let (accessor, field_type) = get_ff_reader(
reader,
percentiles.field_name(),
Some(get_numeric_or_date_column_types()),
)?;
(accessor, field_type)
}
};
let sub_aggregation = sub_aggregation.clone();
Ok(AggregationWithAccessor {
accessor,
accessor2,
field_type,
sub_aggregation: get_aggs_with_segment_accessor_and_validate(
&sub_aggregation,
reader,
&limits,
)?,
agg: agg.clone(),
str_dict_column,
limits: limits.new_guard(),
column_block_accessor: Default::default(),
})
}
}
fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
@@ -55,131 +134,30 @@ fn get_numeric_or_date_column_types() -> &'static [ColumnType] {
]
}
impl BucketAggregationWithAccessor {
fn try_from_bucket(
bucket: &BucketAggregationType,
sub_aggregation: &Aggregations,
reader: &SegmentReader,
limits: AggregationLimits,
) -> crate::Result<BucketAggregationWithAccessor> {
let mut str_dict_column = None;
let (accessor, field_type) = match &bucket {
BucketAggregationType::Range(RangeAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::Histogram(HistogramAggregation {
field: field_name, ..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::DateHistogram(DateHistogramAggregationReq {
field: field_name,
..
}) => get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?,
BucketAggregationType::Terms(TermsAggregation {
field: field_name, ..
}) => {
str_dict_column = reader.fast_fields().str(field_name)?;
get_ff_reader_and_validate(reader, field_name, None)?
}
};
let sub_aggregation = sub_aggregation.clone();
Ok(BucketAggregationWithAccessor {
accessor,
field_type,
sub_aggregation: get_aggs_with_accessor_and_validate(
&sub_aggregation,
reader,
&limits.clone(),
)?,
bucket_agg: bucket.clone(),
str_dict_column,
limits,
column_block_accessor: Default::default(),
})
}
}
/// Contains the metric request and the fast field accessor.
#[derive(Clone)]
pub struct MetricAggregationWithAccessor {
pub metric: MetricAggregation,
pub field_type: ColumnType,
pub accessor: Column<u64>,
pub column_block_accessor: ColumnBlockAccessor<u64>,
}
impl MetricAggregationWithAccessor {
fn try_from_metric(
metric: &MetricAggregation,
reader: &SegmentReader,
) -> crate::Result<MetricAggregationWithAccessor> {
match &metric {
MetricAggregation::Average(AverageAggregation { field: field_name })
| MetricAggregation::Count(CountAggregation { field: field_name })
| MetricAggregation::Max(MaxAggregation { field: field_name })
| MetricAggregation::Min(MinAggregation { field: field_name })
| MetricAggregation::Stats(StatsAggregation { field: field_name })
| MetricAggregation::Sum(SumAggregation { field: field_name }) => {
let (accessor, field_type) = get_ff_reader_and_validate(
reader,
field_name,
Some(get_numeric_or_date_column_types()),
)?;
Ok(MetricAggregationWithAccessor {
accessor,
field_type,
metric: metric.clone(),
column_block_accessor: Default::default(),
})
}
}
}
}
pub(crate) fn get_aggs_with_accessor_and_validate(
pub(crate) fn get_aggs_with_segment_accessor_and_validate(
aggs: &Aggregations,
reader: &SegmentReader,
limits: &AggregationLimits,
) -> crate::Result<AggregationsWithAccessor> {
let mut metrics = vec![];
let mut buckets = vec![];
let mut aggss = Vec::new();
for (key, agg) in aggs.iter() {
match agg {
Aggregation::Bucket(bucket) => buckets.push((
key.to_string(),
BucketAggregationWithAccessor::try_from_bucket(
&bucket.bucket_agg,
&bucket.sub_aggregation,
reader,
limits.clone(),
)?,
)),
Aggregation::Metric(metric) => metrics.push((
key.to_string(),
MetricAggregationWithAccessor::try_from_metric(metric, reader)?,
)),
}
aggss.push((
key.to_string(),
AggregationWithAccessor::try_from_agg(
agg,
agg.sub_aggregation(),
reader,
limits.clone(),
)?,
));
}
Ok(AggregationsWithAccessor::from_data(
VecWithNames::from_entries(metrics),
VecWithNames::from_entries(buckets),
VecWithNames::from_entries(aggss),
))
}
/// Get fast field reader with given cardinatility.
fn get_ff_reader_and_validate(
/// Get fast field reader or empty as default.
fn get_ff_reader(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
@@ -195,3 +173,23 @@ fn get_ff_reader_and_validate(
});
Ok(ff_field_with_type)
}
/// Get all fast field reader or empty as default.
///
/// Is guaranteed to return at least one column.
fn get_all_ff_reader_or_empty(
reader: &SegmentReader,
field_name: &str,
allowed_column_types: Option<&[ColumnType]>,
) -> crate::Result<Vec<(columnar::Column<u64>, ColumnType)>> {
let ff_fields = reader.fast_fields();
let mut ff_field_with_type =
ff_fields.u64_lenient_for_type_all(allowed_column_types, field_name)?;
if ff_field_with_type.is_empty() {
ff_field_with_type.push((
Column::build_empty_column(reader.num_docs()),
ColumnType::U64,
));
}
Ok(ff_field_with_type)
}

View File

@@ -7,12 +7,9 @@
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use super::agg_req::BucketAggregationInternal;
use super::bucket::GetDocCount;
use super::intermediate_agg_result::{IntermediateBucketResult, IntermediateMetricResult};
use super::metric::{SingleMetricResult, Stats};
use super::segment_agg_result::AggregationLimits;
use super::Key;
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats};
use super::{AggregationError, Key};
use crate::TantivyError;
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
@@ -37,8 +34,7 @@ impl AggregationResults {
} else {
// Validation is be done during request parsing, so we can't reach this state.
Err(TantivyError::InternalError(format!(
"Can't find aggregation {:?} in sub-aggregations",
name
"Can't find aggregation {name:?} in sub-aggregations"
)))
}
}
@@ -94,6 +90,8 @@ pub enum MetricResult {
Stats(Stats),
/// Sum metric result.
Sum(SingleMetricResult),
/// Sum metric result.
Percentiles(PercentilesMetricResult),
}
impl MetricResult {
@@ -105,30 +103,9 @@ impl MetricResult {
MetricResult::Min(min) => Ok(min.value),
MetricResult::Stats(stats) => stats.get_value(agg_property),
MetricResult::Sum(sum) => Ok(sum.value),
}
}
}
impl From<IntermediateMetricResult> for MetricResult {
fn from(metric: IntermediateMetricResult) -> Self {
match metric {
IntermediateMetricResult::Average(intermediate_avg) => {
MetricResult::Average(intermediate_avg.finalize().into())
}
IntermediateMetricResult::Count(intermediate_count) => {
MetricResult::Count(intermediate_count.finalize().into())
}
IntermediateMetricResult::Max(intermediate_max) => {
MetricResult::Max(intermediate_max.finalize().into())
}
IntermediateMetricResult::Min(intermediate_min) => {
MetricResult::Min(intermediate_min.finalize().into())
}
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
)),
}
}
}
@@ -183,14 +160,6 @@ impl BucketResult {
} => buckets.iter().map(|bucket| bucket.get_bucket_count()).sum(),
}
}
pub(crate) fn empty_from_req(
req: &BucketAggregationInternal,
limits: &AggregationLimits,
) -> crate::Result<Self> {
let empty_bucket = IntermediateBucketResult::empty_from_req(&req.bucket_agg);
empty_bucket.into_final_bucket_result(req, limits)
}
}
/// This is the wrapper of buckets entries, which can be vector or hashmap

View File

@@ -1,14 +1,9 @@
use serde_json::Value;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::bucket::{RangeAggregation, TermsAggregation};
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
use crate::aggregation::collector::AggregationCollector;
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use crate::aggregation::metric::AverageAggregation;
use crate::aggregation::segment_agg_result::AggregationLimits;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
use crate::aggregation::DistributedAggregationCollector;
@@ -17,9 +12,12 @@ use crate::schema::{IndexRecordOption, Schema, FAST};
use crate::{Index, Term};
fn get_avg_req(field_name: &str) -> Aggregation {
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name(field_name.to_string()),
))
serde_json::from_value(json!({
"avg": {
"field": field_name,
}
}))
.unwrap()
}
fn get_collector(agg_req: Aggregations) -> AggregationCollector {
@@ -111,7 +109,7 @@ fn test_aggregation_flushing(
let searcher = reader.searcher();
let intermediate_agg_result = searcher.search(&AllQuery, &collector).unwrap();
intermediate_agg_result
.into_final_bucket_result(agg_req, &Default::default())
.into_final_result(agg_req, &Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req);
@@ -198,6 +196,74 @@ fn test_aggregation_flushing_variants() {
test_aggregation_flushing(true, true).unwrap();
}
#[test]
fn test_aggregation_level1_simple() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
let reader = index.reader()?;
let text_field = reader.searcher().schema().get_field("text").unwrap();
let term_query = TermQuery::new(
Term::from_field_text(text_field, "cool"),
IndexRecordOption::Basic,
);
let range_agg = |field_name: &str| -> Aggregation {
serde_json::from_value(json!({
"range": {
"field": field_name,
"ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ]
}
}))
.unwrap()
};
let agg_req_1: Aggregations = vec![
("average".to_string(), get_avg_req("score")),
("range".to_string(), range_agg("score")),
]
.into_iter()
.collect();
let collector = get_collector(agg_req_1);
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&term_query, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(res["average"]["value"], 12.142857142857142);
assert_eq!(
res["range"]["buckets"],
json!(
[
{
"key": "*-3",
"doc_count": 1,
"to": 3.0
},
{
"key": "3-7",
"doc_count": 2,
"from": 3.0,
"to": 7.0
},
{
"key": "7-20",
"doc_count": 3,
"from": 7.0,
"to": 20.0
},
{
"key": "20-*",
"doc_count": 1,
"from": 20.0
}
])
);
Ok(())
}
#[test]
fn test_aggregation_level1() -> crate::Result<()> {
let index = get_test_index_2_segments(true)?;
@@ -210,43 +276,23 @@ fn test_aggregation_level1() -> crate::Result<()> {
IndexRecordOption::Basic,
);
let range_agg = |field_name: &str| -> Aggregation {
serde_json::from_value(json!({
"range": {
"field": field_name,
"ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 20.0f64 } ]
}
}))
.unwrap()
};
let agg_req_1: Aggregations = vec![
("average_i64".to_string(), get_avg_req("score_i64")),
("average_f64".to_string(), get_avg_req("score_f64")),
("average".to_string(), get_avg_req("score")),
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"rangef64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
(
"rangei64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_i64".to_string(),
ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
("range".to_string(), range_agg("score")),
("rangef64".to_string(), range_agg("score_f64")),
("rangei64".to_string(), range_agg("score_i64")),
]
.into_iter()
.collect();
@@ -295,7 +341,6 @@ fn test_aggregation_level1() -> crate::Result<()> {
fn test_aggregation_level2(
merge_segments: bool,
use_distributed_collector: bool,
use_elastic_json_req: bool,
) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
@@ -312,23 +357,7 @@ fn test_aggregation_level2(
IndexRecordOption::Basic,
);
let sub_agg_req: Aggregations = vec![
("average_in_range".to_string(), get_avg_req("score")),
(
"term_agg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "text".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
),
]
.into_iter()
.collect();
let agg_req: Aggregations = if use_elastic_json_req {
let elasticsearch_compatible_json_req = r#"
let elasticsearch_compatible_json_req = r#"
{
"rangef64": {
"range": {
@@ -383,61 +412,7 @@ fn test_aggregation_level2(
}
}
"#;
let value: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
value
} else {
let agg_req: Aggregations = vec![
("average".to_string(), get_avg_req("score")),
(
"range".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req.clone(),
})),
),
(
"rangef64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_f64".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req.clone(),
})),
),
(
"rangei64".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score_i64".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: sub_agg_req,
})),
),
]
.into_iter()
.collect();
agg_req
};
let agg_req: Aggregations = serde_json::from_str(elasticsearch_compatible_json_req).unwrap();
let agg_res: AggregationResults = if use_distributed_collector {
let collector =
@@ -445,10 +420,7 @@ fn test_aggregation_level2(
let searcher = reader.searcher();
let res = searcher.search(&term_query, &collector).unwrap();
// Test de/serialization roundtrip on intermediate_agg_result
let res: IntermediateAggregationResults =
serde_json::from_str(&serde_json::to_string(&res).unwrap()).unwrap();
res.into_final_bucket_result(agg_req.clone(), &Default::default())
res.into_final_result(agg_req.clone(), &Default::default())
.unwrap()
} else {
let collector = get_collector(agg_req.clone());
@@ -518,42 +490,22 @@ fn test_aggregation_level2(
#[test]
fn test_aggregation_level2_multi_segments() -> crate::Result<()> {
test_aggregation_level2(false, false, false)
test_aggregation_level2(false, false)
}
#[test]
fn test_aggregation_level2_single_segment() -> crate::Result<()> {
test_aggregation_level2(true, false, false)
test_aggregation_level2(true, false)
}
#[test]
fn test_aggregation_level2_multi_segments_distributed_collector() -> crate::Result<()> {
test_aggregation_level2(false, true, false)
test_aggregation_level2(false, true)
}
#[test]
fn test_aggregation_level2_single_segment_distributed_collector() -> crate::Result<()> {
test_aggregation_level2(true, true, false)
}
#[test]
fn test_aggregation_level2_multi_segments_use_json() -> crate::Result<()> {
test_aggregation_level2(false, false, true)
}
#[test]
fn test_aggregation_level2_single_segment_use_json() -> crate::Result<()> {
test_aggregation_level2(true, false, true)
}
#[test]
fn test_aggregation_level2_multi_segments_distributed_collector_use_json() -> crate::Result<()> {
test_aggregation_level2(false, true, true)
}
#[test]
fn test_aggregation_level2_single_segment_distributed_collector_use_json() -> crate::Result<()> {
test_aggregation_level2(true, true, true)
test_aggregation_level2(true, true)
}
#[test]
@@ -563,14 +515,14 @@ fn test_aggregation_invalid_requests() -> crate::Result<()> {
let reader = index.reader()?;
let avg_on_field = |field_name: &str| {
let agg_req_1: Aggregations = vec![(
"average".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name(field_name.to_string()),
)),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"average": {
"avg": {
"field": field_name,
},
}
}))
.unwrap();
let collector = get_collector(agg_req_1);
@@ -581,10 +533,36 @@ fn test_aggregation_invalid_requests() -> crate::Result<()> {
let agg_res = avg_on_field("dummy_text").unwrap_err();
assert_eq!(
format!("{:?}", agg_res),
format!("{agg_res:?}"),
r#"InvalidArgument("Field \"dummy_text\" is not configured as fast field")"#
);
let agg_req_1: Result<Aggregations, serde_json::Error> = serde_json::from_value(json!({
"average": {
"avg": {
"fieldd": "a",
},
}
}));
assert_eq!(agg_req_1.is_err(), true);
assert_eq!(agg_req_1.unwrap_err().to_string(), "missing field `field`");
let agg_req_1: Result<Aggregations, serde_json::Error> = serde_json::from_value(json!({
"average": {
"doesnotmatchanyagg": {
"field": "a",
},
}
}));
assert_eq!(agg_req_1.is_err(), true);
// TODO: This should list valid values
assert_eq!(
agg_req_1.unwrap_err().to_string(),
"no variant of enum AggregationVariants found in flattened data"
);
// TODO: This should return an error
// let agg_res = avg_on_field("not_exist_field").unwrap_err();
// assert_eq!(
@@ -618,18 +596,16 @@ fn test_aggregation_on_json_object() {
index_writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = vec![(
"jsonagg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "json.color".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg": {
"terms": {
"field": "json.color",
}
}
}))
.unwrap();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
@@ -687,18 +663,15 @@ fn test_aggregation_on_json_object_empty_columns() {
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let agg: Aggregations = vec![(
"jsonagg".to_string(),
Aggregation::Bucket(Box::new(BucketAggregation {
bucket_agg: BucketAggregationType::Terms(TermsAggregation {
field: "json.color".to_string(),
..Default::default()
}),
sub_aggregation: Default::default(),
})),
)]
.into_iter()
.collect();
let agg: Aggregations = serde_json::from_value(json!({
"jsonagg": {
"terms": {
"field": "json.color",
}
}
}))
.unwrap();
let aggregation_collector = get_collector(agg);
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
@@ -853,10 +826,9 @@ fn test_aggregation_on_json_object_mixed_types() {
"buckets": [
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
// TODO red is missing since there is no multi aggregation within one
// segment for multiple types
// TODO bool is also not yet handled in aggregation
{ "doc_count": 1, "key": "blue", "min_price": { "value": null } }
{ "doc_count": 1, "key": "blue", "min_price": { "value": null } },
{ "doc_count": 1, "key": "red", "min_price": { "value": null } },
],
"sum_other_doc_count": 0
}

View File

@@ -37,10 +37,10 @@ pub struct DateHistogramAggregationReq {
interval: Option<String>,
#[doc(hidden)]
/// Only for validation
date_interval: Option<String>,
calendar_interval: Option<String>,
/// The field to aggregate on.
pub field: String,
/// The format to format dates.
/// The format to format dates. Unsupported currently.
pub format: Option<String>,
/// The interval to chunk your data range. Each bucket spans a value range of
/// [0..fixed_interval). Accepted values
@@ -67,6 +67,13 @@ pub struct DateHistogramAggregationReq {
pub fixed_interval: Option<String>,
/// Intervals implicitly defines an absolute grid of buckets `[interval * k, interval * (k +
/// 1))`.
///
/// Offset makes it possible to shift this grid into
/// `[offset + interval * k, offset + interval * (k + 1))`. Offset has to be in the range [0,
/// interval).
///
/// The `offset` parameter is has the same syntax as the `fixed_interval` parameter, but
/// also allows for negative values.
pub offset: Option<String>,
/// The minimum number of documents in a bucket to be returned. Defaults to 0.
pub min_doc_count: Option<u64>,
@@ -77,7 +84,7 @@ pub struct DateHistogramAggregationReq {
/// hard_bounds only limits the buckets, to force a range set both extended_bounds and
/// hard_bounds to the same range.
///
/// Needs to be provided as timestamp in microseconds precision.
/// Needs to be provided as timestamp in millisecond precision.
///
/// ## Example
/// ```json
@@ -88,7 +95,7 @@ pub struct DateHistogramAggregationReq {
/// "interval": "1d",
/// "hard_bounds": {
/// "min": 0,
/// "max": 1420502400000000
/// "max": 1420502400000
/// }
/// }
/// }
@@ -114,16 +121,16 @@ impl DateHistogramAggregationReq {
self.validate()?;
Ok(HistogramAggregation {
field: self.field.to_string(),
interval: parse_into_microseconds(self.fixed_interval.as_ref().unwrap())? as f64,
interval: parse_into_milliseconds(self.fixed_interval.as_ref().unwrap())? as f64,
offset: self
.offset
.as_ref()
.map(|offset| parse_offset_into_microseconds(offset))
.map(|offset| parse_offset_into_milliseconds(offset))
.transpose()?
.map(|el| el as f64),
min_doc_count: self.min_doc_count,
hard_bounds: None,
extended_bounds: None,
hard_bounds: self.hard_bounds,
extended_bounds: self.extended_bounds,
keyed: self.keyed,
})
}
@@ -131,16 +138,14 @@ impl DateHistogramAggregationReq {
fn validate(&self) -> crate::Result<()> {
if let Some(interval) = self.interval.as_ref() {
return Err(crate::TantivyError::InvalidArgument(format!(
"`interval` parameter {:?} in date histogram is unsupported, only \
`fixed_interval` is supported",
interval
"`interval` parameter {interval:?} in date histogram is unsupported, only \
`fixed_interval` is supported"
)));
}
if let Some(interval) = self.date_interval.as_ref() {
if let Some(interval) = self.calendar_interval.as_ref() {
return Err(crate::TantivyError::InvalidArgument(format!(
"`date_interval` parameter {:?} in date histogram is unsupported, only \
`fixed_interval` is supported",
interval
"`calendar_interval` parameter {interval:?} in date histogram is unsupported, \
only `fixed_interval` is supported"
)));
}
if self.format.is_some() {
@@ -155,7 +160,7 @@ impl DateHistogramAggregationReq {
));
}
parse_into_microseconds(self.fixed_interval.as_ref().unwrap())?;
parse_into_milliseconds(self.fixed_interval.as_ref().unwrap())?;
Ok(())
}
@@ -176,9 +181,12 @@ pub enum DateHistogramParseError {
/// Offset invalid
#[error("passed offset is invalid {0:?}")]
InvalidOffset(String),
/// Value out of bounds
#[error("passed value is out of bounds: {0:?}")]
OutOfBounds(String),
}
fn parse_offset_into_microseconds(input: &str) -> Result<i64, AggregationError> {
fn parse_offset_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
let is_sign = |byte| &[byte] == b"-" || &[byte] == b"+";
if input.is_empty() {
return Err(DateHistogramParseError::InvalidOffset(input.to_string()).into());
@@ -187,18 +195,18 @@ fn parse_offset_into_microseconds(input: &str) -> Result<i64, AggregationError>
let has_sign = is_sign(input.as_bytes()[0]);
if has_sign {
let (sign, input) = input.split_at(1);
let val = parse_into_microseconds(input)?;
let val = parse_into_milliseconds(input)?;
if sign == "-" {
Ok(-val)
} else {
Ok(val)
}
} else {
parse_into_microseconds(input)
parse_into_milliseconds(input)
}
}
fn parse_into_microseconds(input: &str) -> Result<i64, AggregationError> {
fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
let split_boundary = input
.as_bytes()
.iter()
@@ -217,16 +225,21 @@ fn parse_into_microseconds(input: &str) -> Result<i64, AggregationError> {
// here and being defensive does not hurt.
.map_err(|_err| DateHistogramParseError::NumberMissing(input.to_string()))?;
let multiplier_from_unit = match unit {
"ms" => 1,
"s" => 1000,
"m" => 60 * 1000,
"h" => 60 * 60 * 1000,
"d" => 24 * 60 * 60 * 1000,
let unit_in_ms = match unit {
"ms" | "milliseconds" => 1,
"s" | "seconds" => 1000,
"m" | "minutes" => 60 * 1000,
"h" | "hours" => 60 * 60 * 1000,
"d" | "days" => 24 * 60 * 60 * 1000,
_ => return Err(DateHistogramParseError::UnitNotRecognized(unit.to_string()).into()),
};
Ok(number * multiplier_from_unit * 1000)
let val = number * unit_in_ms;
// The field type is in nanoseconds precision, so validate the value to fit the range
val.checked_mul(1_000_000)
.ok_or_else(|| DateHistogramParseError::OutOfBounds(input.to_string()))?;
Ok(val)
}
#[cfg(test)]
@@ -241,49 +254,50 @@ mod tests {
use crate::Index;
#[test]
fn test_parse_into_microseconds() {
assert_eq!(parse_into_microseconds("1m").unwrap(), 60_000_000);
assert_eq!(parse_into_microseconds("2m").unwrap(), 120_000_000);
fn test_parse_into_millisecs() {
assert_eq!(parse_into_milliseconds("1m").unwrap(), 60_000);
assert_eq!(parse_into_milliseconds("2m").unwrap(), 120_000);
assert_eq!(parse_into_milliseconds("2minutes").unwrap(), 120_000);
assert_eq!(
parse_into_microseconds("2y").unwrap_err(),
parse_into_milliseconds("2y").unwrap_err(),
DateHistogramParseError::UnitNotRecognized("y".to_string()).into()
);
assert_eq!(
parse_into_microseconds("2000").unwrap_err(),
parse_into_milliseconds("2000").unwrap_err(),
DateHistogramParseError::UnitMissing("2000".to_string()).into()
);
assert_eq!(
parse_into_microseconds("ms").unwrap_err(),
parse_into_milliseconds("ms").unwrap_err(),
DateHistogramParseError::NumberMissing("ms".to_string()).into()
);
}
#[test]
fn test_parse_offset_into_microseconds() {
assert_eq!(parse_offset_into_microseconds("1m").unwrap(), 60_000_000);
assert_eq!(parse_offset_into_microseconds("+1m").unwrap(), 60_000_000);
assert_eq!(parse_offset_into_microseconds("-1m").unwrap(), -60_000_000);
assert_eq!(parse_offset_into_microseconds("2m").unwrap(), 120_000_000);
assert_eq!(parse_offset_into_microseconds("+2m").unwrap(), 120_000_000);
assert_eq!(parse_offset_into_microseconds("-2m").unwrap(), -120_000_000);
assert_eq!(parse_offset_into_microseconds("-2ms").unwrap(), -2_000);
fn test_parse_offset_into_milliseconds() {
assert_eq!(parse_offset_into_milliseconds("1m").unwrap(), 60_000);
assert_eq!(parse_offset_into_milliseconds("+1m").unwrap(), 60_000);
assert_eq!(parse_offset_into_milliseconds("-1m").unwrap(), -60_000);
assert_eq!(parse_offset_into_milliseconds("2m").unwrap(), 120_000);
assert_eq!(parse_offset_into_milliseconds("+2m").unwrap(), 120_000);
assert_eq!(parse_offset_into_milliseconds("-2m").unwrap(), -120_000);
assert_eq!(parse_offset_into_milliseconds("-2ms").unwrap(), -2);
assert_eq!(
parse_offset_into_microseconds("2y").unwrap_err(),
parse_offset_into_milliseconds("2y").unwrap_err(),
DateHistogramParseError::UnitNotRecognized("y".to_string()).into()
);
assert_eq!(
parse_offset_into_microseconds("2000").unwrap_err(),
parse_offset_into_milliseconds("2000").unwrap_err(),
DateHistogramParseError::UnitMissing("2000".to_string()).into()
);
assert_eq!(
parse_offset_into_microseconds("ms").unwrap_err(),
parse_offset_into_milliseconds("ms").unwrap_err(),
DateHistogramParseError::NumberMissing("ms".to_string()).into()
);
}
#[test]
fn test_parse_into_milliseconds_do_not_accept_non_ascii() {
assert!(parse_into_microseconds("m").is_err());
assert!(parse_into_milliseconds("m").is_err());
}
pub fn get_test_index_from_docs(
@@ -322,168 +336,316 @@ mod tests {
}
#[test]
fn histogram_test_date_force_merge_segments() -> crate::Result<()> {
fn histogram_test_date_force_merge_segments() {
histogram_test_date_merge_segments(true)
}
#[test]
fn histogram_test_date() -> crate::Result<()> {
fn histogram_test_date() {
histogram_test_date_merge_segments(false)
}
fn histogram_test_date_merge_segments(merge_segments: bool) -> crate::Result<()> {
fn histogram_test_date_merge_segments(merge_segments: bool) {
let docs = vec![
vec![r#"{ "date": "2015-01-01T12:10:30Z", "text": "aaa" }"#],
vec![r#"{ "date": "2015-01-01T11:11:30Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-02T00:00:00Z", "text": "bbb" }"#],
vec![r#"{ "date": "2015-01-06T00:00:00Z", "text": "ccc" }"#],
];
let index = get_test_index_from_docs(merge_segments, &docs).unwrap();
let index = get_test_index_from_docs(merge_segments, &docs)?;
// 30day + offset
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
}
}
}
);
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000000.0,
"doc_count" : 4
}
]
}
});
assert_eq!(res, expected_res);
// 30day + offset + sub_agg
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
},
"aggs": {
"texts": {
"terms": {"field": "text"}
{
// 30day + offset
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
}
}
}
}
);
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
println!("{}", serde_json::to_string_pretty(&res).unwrap());
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000000.0,
"doc_count" : 4,
"texts": {
"buckets": [
{
"doc_count": 2,
"key": "bbb"
},
{
"doc_count": 1,
"key": "ccc"
},
{
"doc_count": 1,
"key": "aaa"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
]
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0,
"doc_count" : 4
}
]
}
});
assert_eq!(res, expected_res);
}
});
assert_eq!(res, expected_res);
// 1day
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d"
{
// 30day + offset + sub_agg
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "30d",
"offset": "-4d"
},
"aggs": {
"texts": {
"terms": {"field": "text"}
}
}
}
}
}
}
);
);
let agg_req: Aggregations =
serde_json::from_str(&serde_json::to_string(&elasticsearch_compatible_json).unwrap())
.unwrap();
let res = exec_request(agg_req, &index)?;
let expected_res = json!( {
"sales_over_time": {
"buckets": [
{
"doc_count": 2,
"key": 1420070400000000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000000.0,
"key_as_string": "2015-01-06T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets" : [
{
"key_as_string" : "2015-01-01T00:00:00Z",
"key" : 1420070400000.0,
"doc_count" : 4,
"texts": {
"buckets": [
{
"doc_count": 2,
"key": "bbb"
},
{
"doc_count": 1,
"key": "ccc"
},
{
"doc_count": 1,
"key": "aaa"
}
],
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0
}
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d"
}
}
}
);
Ok(())
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!( {
"sales_over_time": {
"buckets": [
{
"doc_count": 2,
"key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day + extended_bounds
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"extended_bounds": {
"min": 1419984000000.0,
"max": 1420588800000.0
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 0,
"key": 1419984000000.0,
"key_as_string": "2014-12-31T00:00:00Z"
},
{
"doc_count": 2,
"key": 1420070400000.0,
"key_as_string": "2015-01-01T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420243200000.0,
"key_as_string": "2015-01-03T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420329600000.0,
"key_as_string": "2015-01-04T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420416000000.0,
"key_as_string": "2015-01-05T00:00:00Z"
},
{
"doc_count": 1,
"key": 1420502400000.0,
"key_as_string": "2015-01-06T00:00:00Z"
},
{
"doc_count": 0,
"key": 1420588800000.0,
"key_as_string": "2015-01-07T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day + hard_bounds + extended_bounds
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"hard_bounds": {
"min": 1420156800000.0,
"max": 1420243200000.0
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
{
// 1day + hard_bounds as Rfc3339
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
"date_histogram": {
"field": "date",
"fixed_interval": "1d",
"hard_bounds": {
"min": "2015-01-02T00:00:00Z",
"max": "2015-01-02T12:00:00Z"
}
}
}
}
);
let agg_req: Aggregations = serde_json::from_str(
&serde_json::to_string(&elasticsearch_compatible_json).unwrap(),
)
.unwrap();
let res = exec_request(agg_req, &index).unwrap();
let expected_res = json!({
"sales_over_time" : {
"buckets": [
{
"doc_count": 1,
"key": 1420156800000.0,
"key_as_string": "2015-01-02T00:00:00Z"
}
]
}
});
assert_eq!(res, expected_res);
}
}
#[test]
fn histogram_test_invalid_req() -> crate::Result<()> {
fn histogram_test_invalid_req() {
let docs = vec![];
let index = get_test_index_from_docs(false, &docs)?;
let index = get_test_index_from_docs(false, &docs).unwrap();
let elasticsearch_compatible_json = json!(
{
"sales_over_time": {
@@ -504,7 +666,5 @@ mod tests {
err.to_string(),
r#"An invalid argument was passed: '`interval` parameter "30d" in date histogram is unsupported, only `fixed_interval` is supported'"#
);
Ok(())
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -1,11 +1,26 @@
//! Module for all bucket aggregations.
//!
//! BucketAggregations create buckets of documents
//! [`BucketAggregation`](super::agg_req::BucketAggregation).
//! BucketAggregations create buckets of documents.
//! Each bucket is associated with a rule which
//! determines whether or not a document in the falls into it. In other words, the buckets
//! effectively define document sets. Buckets are not necessarily disjunct, therefore a document can
//! fall into multiple buckets. In addition to the buckets themselves, the bucket aggregations also
//! compute and return the number of documents for each bucket. Bucket aggregations, as opposed to
//! metric aggregations, can hold sub-aggregations. These sub-aggregations will be aggregated for
//! the buckets created by their "parent" bucket aggregation. There are different bucket
//! aggregators, each with a different "bucketing" strategy. Some define a single bucket, some
//! define fixed number of multiple buckets, and others dynamically create the buckets during the
//! aggregation process.
//!
//! Results of final buckets are [`BucketResult`](super::agg_result::BucketResult).
//! Results of intermediate buckets are
//! [`IntermediateBucketResult`](super::intermediate_agg_result::IntermediateBucketResult)
//!
//! ## Supported Bucket Aggregations
//! - [Histogram](HistogramAggregation)
//! - [DateHistogram](DateHistogramAggregationReq)
//! - [Range](RangeAggregation)
//! - [Terms](TermsAggregation)
mod histogram;
mod range;

View File

@@ -5,16 +5,17 @@ use columnar::{ColumnType, MonotonicallyMappableToU64};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
use crate::aggregation::agg_limits::ResourceLimitGuard;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateBucketResult, IntermediateRangeBucketEntry,
IntermediateRangeBucketResult,
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
IntermediateRangeBucketEntry, IntermediateRangeBucketResult,
};
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey, VecWithNames,
f64_from_fastfield_u64, f64_to_fastfield_u64, format_date, Key, SerializedKey,
};
use crate::TantivyError;
@@ -157,16 +158,18 @@ impl SegmentRangeBucketEntry {
self,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateRangeBucketEntry> {
let sub_aggregation = if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation.into_intermediate_aggregations_result(agg_with_accessor)?
let mut sub_aggregation_res = IntermediateAggregationResults::default();
if let Some(sub_aggregation) = self.sub_aggregation {
sub_aggregation
.add_intermediate_aggregation_result(agg_with_accessor, &mut sub_aggregation_res)?
} else {
Default::default()
};
Ok(IntermediateRangeBucketEntry {
key: self.key,
key: self.key.into(),
doc_count: self.doc_count,
sub_aggregation,
sub_aggregation: sub_aggregation_res,
from: self.from,
to: self.to,
})
@@ -174,13 +177,14 @@ impl SegmentRangeBucketEntry {
}
impl SegmentAggregationCollector for SegmentRangeCollector {
fn into_intermediate_aggregations_result(
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let field_type = self.column_type;
let name = agg_with_accessor.buckets.keys[self.accessor_idx].to_string();
let sub_agg = &agg_with_accessor.buckets.values[self.accessor_idx].sub_aggregation;
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let sub_agg = &agg_with_accessor.aggs.values[self.accessor_idx].sub_aggregation;
let buckets: FxHashMap<SerializedKey, IntermediateRangeBucketEntry> = self
.buckets
@@ -200,12 +204,9 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
column_type: Some(self.column_type),
});
let buckets = Some(VecWithNames::from_entries(vec![(name, bucket)]));
results.push(name, IntermediateAggregationResult::Bucket(bucket))?;
Ok(IntermediateAggregationResults {
metrics: None,
buckets,
})
Ok(())
}
#[inline]
@@ -223,7 +224,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let bucket_agg_accessor = &mut agg_with_accessor.buckets.values[self.accessor_idx];
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
bucket_agg_accessor
.column_block_accessor
@@ -245,7 +246,7 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
fn flush(&mut self, agg_with_accessor: &mut AggregationsWithAccessor) -> crate::Result<()> {
let sub_aggregation_accessor =
&mut agg_with_accessor.buckets.values[self.accessor_idx].sub_aggregation;
&mut agg_with_accessor.aggs.values[self.accessor_idx].sub_aggregation;
for bucket in self.buckets.iter_mut() {
if let Some(sub_agg) = bucket.bucket.sub_aggregation.as_mut() {
@@ -260,8 +261,8 @@ impl SegmentAggregationCollector for SegmentRangeCollector {
impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &AggregationsWithAccessor,
limits: &AggregationLimits,
sub_aggregation: &mut AggregationsWithAccessor,
limits: &mut ResourceLimitGuard,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
@@ -307,8 +308,7 @@ impl SegmentRangeCollector {
limits.add_memory_consumed(
buckets.len() as u64 * std::mem::size_of::<SegmentRangeAndBucketEntry>() as u64,
);
limits.validate_memory_consumption()?;
)?;
Ok(SegmentRangeCollector {
buckets,
@@ -445,14 +445,12 @@ mod tests {
use serde_json::Value;
use super::*;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
};
use crate::aggregation::metric::AverageAggregation;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::tests::{
exec_request, exec_request_with_query, get_test_index_2_segments,
get_test_index_with_num_docs,
};
use crate::aggregation::AggregationLimits;
pub fn get_collector_from_ranges(
ranges: Vec<RangeAggregationRange>,
@@ -466,8 +464,8 @@ mod tests {
SegmentRangeCollector::from_req_and_validate(
&req,
&Default::default(),
&Default::default(),
&mut Default::default(),
&mut AggregationLimits::default().new_guard(),
field_type,
0,
)
@@ -478,22 +476,18 @@ mod tests {
fn range_fraction_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
..Default::default()
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
]
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -513,31 +507,25 @@ mod tests {
fn range_fraction_test_with_sub_agg() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let sub_agg_req: Aggregations = vec![(
"score_f64".to_string(),
Aggregation::Metric(MetricAggregation::Average(
AverageAggregation::from_field_name("score_f64".to_string()),
)),
)]
.into_iter()
.collect();
let sub_agg_req: Aggregations = serde_json::from_value(json!({
"avg": { "avg": { "field": "score_f64", } }
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
..Default::default()
}),
sub_aggregation: sub_agg_req,
}
.into(),
),
)]
.into_iter()
.collect();
}))
.unwrap();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
]
},
"aggs": sub_agg_req
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -557,22 +545,19 @@ mod tests {
fn range_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![(0f64..0.1f64).into(), (0.1f64..0.2f64).into()],
keyed: true,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
],
"keyed": true
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -597,33 +582,19 @@ mod tests {
fn range_custom_key_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![
RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
},
RangeAggregationRange {
key: None,
from: Some(0.1f64),
to: Some(0.2f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"key": "custom-key-0-to-0.1", "from": 0.0, "to": 0.1},
{"from": 0.1, "to": 0.2},
],
"keyed": false
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;
@@ -657,33 +628,19 @@ mod tests {
fn range_date_test_with_opt(merge_segments: bool) -> crate::Result<()> {
let index = get_test_index_2_segments(merge_segments)?;
let agg_req: Aggregations = vec![(
"date_ranges".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "date".to_string(),
ranges: vec![
RangeAggregationRange {
key: None,
from: None,
to: Some(1546300800000000.0f64),
},
RangeAggregationRange {
key: None,
from: Some(1546300800000000.0f64),
to: Some(1546387200000000.0f64),
},
],
keyed: false,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"date_ranges": {
"range": {
"field": "date",
"ranges": [
{"to": 1546300800000000000i64},
{"from": 1546300800000000000i64, "to": 1546387200000000000i64},
],
"keyed": false
},
}
}))
.unwrap();
let agg_res = exec_request(agg_req, &index)?;
@@ -722,26 +679,18 @@ mod tests {
fn range_custom_key_keyed_buckets_test() -> crate::Result<()> {
let index = get_test_index_with_num_docs(false, 100)?;
let agg_req: Aggregations = vec![(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "fraction_f64".to_string(),
ranges: vec![RangeAggregationRange {
key: Some("custom-key-0-to-0.1".to_string()),
from: Some(0f64),
to: Some(0.1f64),
}],
keyed: true,
}),
sub_aggregation: Default::default(),
}
.into(),
),
)]
.into_iter()
.collect();
let agg_req: Aggregations = serde_json::from_value(json!({
"range": {
"range": {
"field": "fraction_f64",
"ranges": [
{"key": "custom-key-0-to-0.1", "from": 0.0, "to": 0.1},
],
"keyed": true
},
}
}))
.unwrap();
let res = exec_request_with_query(agg_req, &index, None)?;

File diff suppressed because it is too large Load Diff

View File

@@ -35,11 +35,12 @@ impl BufAggregationCollector {
impl SegmentAggregationCollector for BufAggregationCollector {
#[inline]
fn into_intermediate_aggregations_result(
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
Box::new(self.collector).into_intermediate_aggregations_result(agg_with_accessor)
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
Box::new(self.collector).add_intermediate_aggregation_result(agg_with_accessor, results)
}
#[inline]

View File

@@ -6,7 +6,7 @@ use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::{
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_accessor_and_validate;
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
use crate::{DocId, SegmentReader, TantivyError};
@@ -104,7 +104,7 @@ impl Collector for AggregationCollector {
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
res.into_final_bucket_result(self.agg.clone(), &self.limits)
res.into_final_result(self.agg.clone(), &self.limits)
}
}
@@ -114,7 +114,7 @@ fn merge_fruits(
if let Some(fruit) = segment_fruits.pop() {
let mut fruit = fruit?;
for next_fruit in segment_fruits {
fruit.merge_fruits(next_fruit?);
fruit.merge_fruits(next_fruit?)?;
}
Ok(fruit)
} else {
@@ -137,9 +137,10 @@ impl AggregationSegmentCollector {
reader: &SegmentReader,
limits: &AggregationLimits,
) -> crate::Result<Self> {
let aggs_with_accessor = get_aggs_with_accessor_and_validate(agg, reader, limits)?;
let mut aggs_with_accessor =
get_aggs_with_segment_accessor_and_validate(agg, reader, limits)?;
let result =
BufAggregationCollector::new(build_segment_agg_collector(&aggs_with_accessor)?);
BufAggregationCollector::new(build_segment_agg_collector(&mut aggs_with_accessor)?);
Ok(AggregationSegmentCollector {
aggs_with_accessor,
agg_collector: result,
@@ -184,6 +185,13 @@ impl SegmentCollector for AggregationSegmentCollector {
return Err(err);
}
self.agg_collector.flush(&mut self.aggs_with_accessor)?;
Box::new(self.agg_collector).into_intermediate_aggregations_result(&self.aggs_with_accessor)
let mut sub_aggregation_res = IntermediateAggregationResults::default();
Box::new(self.agg_collector).add_intermediate_aggregation_result(
&self.aggs_with_accessor,
&mut sub_aggregation_res,
)?;
Ok(sub_aggregation_res)
}
}

View File

@@ -4,13 +4,11 @@ use time::OffsetDateTime;
use crate::TantivyError;
pub(crate) fn format_date(val: i64) -> crate::Result<String> {
let datetime =
OffsetDateTime::from_unix_timestamp_nanos(1_000 * (val as i128)).map_err(|err| {
TantivyError::InvalidArgument(format!(
"Could not convert {:?} to OffsetDateTime, err {:?}",
val, err
))
})?;
let datetime = OffsetDateTime::from_unix_timestamp_nanos(val as i128).map_err(|err| {
TantivyError::InvalidArgument(format!(
"Could not convert {val:?} to OffsetDateTime, err {err:?}"
))
})?;
let key_as_string = datetime
.format(&Rfc3339)
.map_err(|_err| TantivyError::InvalidArgument("Could not serialize date".to_string()))?;

View File

@@ -5,6 +5,12 @@ use super::bucket::DateHistogramParseError;
/// Error that may occur when opening a directory
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum AggregationError {
/// InternalError Aggregation Request
#[error("InternalError: {0:?}")]
InternalError(String),
/// Invalid Aggregation Request
#[error("InvalidRequest: {0:?}")]
InvalidRequest(String),
/// Date histogram parse error
#[error("Date histogram parse error: {0:?}")]
DateHistogramParseError(#[from] DateHistogramParseError),

View File

@@ -3,50 +3,101 @@
//! indices.
use std::cmp::Ordering;
use std::collections::hash_map::Entry;
use std::hash::Hash;
use columnar::ColumnType;
use itertools::Itertools;
use rustc_hash::FxHashMap;
use serde::ser::SerializeSeq;
use serde::{Deserialize, Deserializer, Serialize, Serializer};
use serde::{Deserialize, Serialize};
use super::agg_req::{
Aggregations, AggregationsInternal, BucketAggregationInternal, BucketAggregationType,
MetricAggregation, RangeAggregation,
};
use super::agg_result::{AggregationResult, BucketResult, RangeBucketEntry};
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
use super::agg_result::{AggregationResult, BucketResult, MetricResult, RangeBucketEntry};
use super::bucket::{
cut_off_buckets, get_agg_name_and_property, intermediate_histogram_buckets_to_final_buckets,
GetDocCount, Order, OrderTarget, SegmentHistogramBucketEntry, TermsAggregation,
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum,
IntermediateSum, PercentilesCollector,
};
use super::segment_agg_result::AggregationLimits;
use super::{format_date, AggregationError, Key, SerializedKey, VecWithNames};
use super::{format_date, AggregationError, Key, SerializedKey};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::TantivyError;
/// Contains the intermediate aggregation result, which is optimized to be merged with other
/// intermediate results.
///
/// Notice: This struct should not be de/serialized via JSON format.
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateAggregationResults {
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) metrics: Option<VecWithNames<IntermediateMetricResult>>,
#[serde(skip_serializing_if = "Option::is_none")]
pub(crate) buckets: Option<VecWithNames<IntermediateBucketResult>>,
pub(crate) aggs_res: FxHashMap<String, IntermediateAggregationResult>,
}
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd, PartialEq)]
/// The key to identify a bucket.
/// This might seem redundant with `Key`, but the point is to have a different
/// Serialize implementation.
pub enum IntermediateKey {
/// String key
Str(String),
/// `f64` key
F64(f64),
}
impl From<Key> for IntermediateKey {
fn from(value: Key) -> Self {
match value {
Key::Str(s) => Self::Str(s),
Key::F64(f) => Self::F64(f),
}
}
}
impl From<IntermediateKey> for Key {
fn from(value: IntermediateKey) -> Self {
match value {
IntermediateKey::Str(s) => Self::Str(s),
IntermediateKey::F64(f) => Self::F64(f),
}
}
}
impl Eq for IntermediateKey {}
impl std::hash::Hash for IntermediateKey {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
core::mem::discriminant(self).hash(state);
match self {
IntermediateKey::Str(text) => text.hash(state),
IntermediateKey::F64(val) => val.to_bits().hash(state),
}
}
}
impl IntermediateAggregationResults {
/// Add a result
pub fn push(&mut self, key: String, value: IntermediateAggregationResult) -> crate::Result<()> {
let entry = self.aggs_res.entry(key);
match entry {
Entry::Occupied(mut e) => {
// In case of term aggregation over different types, we need to merge the results.
e.get_mut().merge_fruits(value)?;
}
Entry::Vacant(e) => {
e.insert(value);
}
}
Ok(())
}
/// Convert intermediate result and its aggregation request to the final result.
pub fn into_final_bucket_result(
pub fn into_final_result(
self,
req: Aggregations,
limits: &AggregationLimits,
) -> crate::Result<AggregationResults> {
let res = self.into_final_bucket_result_internal(&(req.into()), limits)?;
let res = self.into_final_result_internal(&req, limits)?;
let bucket_count = res.get_bucket_count() as u32;
if bucket_count > limits.get_bucket_limit() {
return Err(TantivyError::AggregationError(
@@ -63,149 +114,90 @@ impl IntermediateAggregationResults {
///
/// Internal function, AggregationsInternal is used instead Aggregations, which is optimized
/// for internal processing, by splitting metric and buckets into separate groups.
pub(crate) fn into_final_bucket_result_internal(
pub(crate) fn into_final_result_internal(
self,
req: &AggregationsInternal,
req: &Aggregations,
limits: &AggregationLimits,
) -> crate::Result<AggregationResults> {
// Important assumption:
// When the tree contains buckets/metric, we expect it to have all buckets/metrics from the
// request
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
if let Some(buckets) = self.buckets {
convert_and_add_final_buckets_to_result(&mut results, buckets, &req.buckets, limits)?
} else {
// When there are no buckets, we create empty buckets, so that the serialized json
// format is constant
add_empty_final_buckets_to_result(&mut results, &req.buckets, limits)?
};
if let Some(metrics) = self.metrics {
convert_and_add_final_metrics_to_result(&mut results, metrics);
} else {
// When there are no metrics, we create empty metric results, so that the serialized
// json format is constant
add_empty_final_metrics_to_result(&mut results, &req.metrics)?;
for (key, agg_res) in self.aggs_res.into_iter() {
let req = req.get(key.as_str()).unwrap();
results.insert(key, agg_res.into_final_result(req, limits)?);
}
// Handle empty results
if results.len() != req.len() {
for (key, req) in req.iter() {
if !results.contains_key(key) {
let empty_res = empty_from_req(req);
results.insert(key.to_string(), empty_res.into_final_result(req, limits)?);
}
}
}
Ok(AggregationResults(results))
}
pub(crate) fn empty_from_req(req: &AggregationsInternal) -> Self {
let metrics = if req.metrics.is_empty() {
None
} else {
let metrics = req
.metrics
.iter()
.map(|(key, req)| {
(
key.to_string(),
IntermediateMetricResult::empty_from_req(req),
)
})
.collect();
Some(VecWithNames::from_entries(metrics))
};
pub(crate) fn empty_from_req(req: &Aggregations) -> Self {
let mut aggs_res: FxHashMap<String, IntermediateAggregationResult> = FxHashMap::default();
for (key, req) in req.iter() {
let empty_res = empty_from_req(req);
aggs_res.insert(key.to_string(), empty_res);
}
let buckets = if req.buckets.is_empty() {
None
} else {
let buckets = req
.buckets
.iter()
.map(|(key, req)| {
(
key.to_string(),
IntermediateBucketResult::empty_from_req(&req.bucket_agg),
)
})
.collect();
Some(VecWithNames::from_entries(buckets))
};
Self { metrics, buckets }
Self { aggs_res }
}
/// Merge another intermediate aggregation result into this result.
///
/// The order of the values need to be the same on both results. This is ensured when the same
/// (key values) are present on the underlying `VecWithNames` struct.
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) {
if let (Some(buckets_left), Some(buckets_right)) = (&mut self.buckets, other.buckets) {
for (bucket_left, bucket_right) in
buckets_left.values_mut().zip(buckets_right.into_values())
{
bucket_left.merge_fruits(bucket_right);
}
pub fn merge_fruits(&mut self, other: IntermediateAggregationResults) -> crate::Result<()> {
for (left, right) in self.aggs_res.values_mut().zip(other.aggs_res.into_values()) {
left.merge_fruits(right)?;
}
Ok(())
}
}
if let (Some(metrics_left), Some(metrics_right)) = (&mut self.metrics, other.metrics) {
for (metric_left, metric_right) in
metrics_left.values_mut().zip(metrics_right.into_values())
{
metric_left.merge_fruits(metric_right);
}
pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult {
use AggregationVariants::*;
match req.agg {
Terms(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Terms(
Default::default(),
)),
Range(_) => IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
Default::default(),
)),
Histogram(_) | DateHistogram(_) => {
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Histogram {
buckets: Vec::new(),
column_type: None,
})
}
Average(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Average(
IntermediateAverage::default(),
)),
Count(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Count(
IntermediateCount::default(),
)),
Max(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Max(
IntermediateMax::default(),
)),
Min(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Min(
IntermediateMin::default(),
)),
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
Percentiles(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
}
}
fn convert_and_add_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>,
metrics: VecWithNames<IntermediateMetricResult>,
) {
results.extend(
metrics
.into_iter()
.map(|(key, metric)| (key, AggregationResult::MetricResult(metric.into()))),
);
}
fn add_empty_final_metrics_to_result(
results: &mut FxHashMap<String, AggregationResult>,
req_metrics: &VecWithNames<MetricAggregation>,
) -> crate::Result<()> {
results.extend(req_metrics.iter().map(|(key, req)| {
let empty_bucket = IntermediateMetricResult::empty_from_req(req);
(
key.to_string(),
AggregationResult::MetricResult(empty_bucket.into()),
)
}));
Ok(())
}
fn add_empty_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
limits: &AggregationLimits,
) -> crate::Result<()> {
let requested_buckets = req_buckets.iter();
for (key, req) in requested_buckets {
let empty_bucket =
AggregationResult::BucketResult(BucketResult::empty_from_req(req, limits)?);
results.insert(key.to_string(), empty_bucket);
}
Ok(())
}
fn convert_and_add_final_buckets_to_result(
results: &mut FxHashMap<String, AggregationResult>,
buckets: VecWithNames<IntermediateBucketResult>,
req_buckets: &VecWithNames<BucketAggregationInternal>,
limits: &AggregationLimits,
) -> crate::Result<()> {
assert_eq!(buckets.len(), req_buckets.len());
let buckets_with_request = buckets.into_iter().zip(req_buckets.values());
for ((key, bucket), req) in buckets_with_request {
let result = AggregationResult::BucketResult(bucket.into_final_bucket_result(req, limits)?);
results.insert(key, result);
}
Ok(())
}
/// An aggregation is either a bucket or a metric.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum IntermediateAggregationResult {
@@ -215,9 +207,42 @@ pub enum IntermediateAggregationResult {
Metric(IntermediateMetricResult),
}
impl IntermediateAggregationResult {
pub(crate) fn into_final_result(
self,
req: &Aggregation,
limits: &AggregationLimits,
) -> crate::Result<AggregationResult> {
let res = match self {
IntermediateAggregationResult::Bucket(bucket) => {
AggregationResult::BucketResult(bucket.into_final_bucket_result(req, limits)?)
}
IntermediateAggregationResult::Metric(metric) => {
AggregationResult::MetricResult(metric.into_final_metric_result(req))
}
};
Ok(res)
}
fn merge_fruits(&mut self, other: IntermediateAggregationResult) -> crate::Result<()> {
match (self, other) {
(
IntermediateAggregationResult::Bucket(b1),
IntermediateAggregationResult::Bucket(b2),
) => b1.merge_fruits(b2),
(
IntermediateAggregationResult::Metric(m1),
IntermediateAggregationResult::Metric(m2),
) => m1.merge_fruits(m2),
_ => panic!("aggregation result type mismatch (mixed metric and buckets)"),
}
}
}
/// Holds the intermediate data for metric results
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub enum IntermediateMetricResult {
/// Intermediate average result.
Percentiles(PercentilesCollector),
/// Intermediate average result.
Average(IntermediateAverage),
/// Intermediate count result.
@@ -233,23 +258,34 @@ pub enum IntermediateMetricResult {
}
impl IntermediateMetricResult {
pub(crate) fn empty_from_req(req: &MetricAggregation) -> Self {
match req {
MetricAggregation::Average(_) => {
IntermediateMetricResult::Average(IntermediateAverage::default())
fn into_final_metric_result(self, req: &Aggregation) -> MetricResult {
match self {
IntermediateMetricResult::Average(intermediate_avg) => {
MetricResult::Average(intermediate_avg.finalize().into())
}
MetricAggregation::Count(_) => {
IntermediateMetricResult::Count(IntermediateCount::default())
IntermediateMetricResult::Count(intermediate_count) => {
MetricResult::Count(intermediate_count.finalize().into())
}
MetricAggregation::Max(_) => IntermediateMetricResult::Max(IntermediateMax::default()),
MetricAggregation::Min(_) => IntermediateMetricResult::Min(IntermediateMin::default()),
MetricAggregation::Stats(_) => {
IntermediateMetricResult::Stats(IntermediateStats::default())
IntermediateMetricResult::Max(intermediate_max) => {
MetricResult::Max(intermediate_max.finalize().into())
}
MetricAggregation::Sum(_) => IntermediateMetricResult::Sum(IntermediateSum::default()),
IntermediateMetricResult::Min(intermediate_min) => {
MetricResult::Min(intermediate_min.finalize().into())
}
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
IntermediateMetricResult::Percentiles(percentiles) => MetricResult::Percentiles(
percentiles
.into_final_result(req.agg.as_percentile().expect("unexpected metric type")),
),
}
}
fn merge_fruits(&mut self, other: IntermediateMetricResult) {
fn merge_fruits(&mut self, other: IntermediateMetricResult) -> crate::Result<()> {
match (self, other) {
(
IntermediateMetricResult::Average(avg_left),
@@ -278,10 +314,18 @@ impl IntermediateMetricResult {
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}
(
IntermediateMetricResult::Percentiles(left),
IntermediateMetricResult::Percentiles(right),
) => {
left.merge_fruits(right)?;
}
_ => {
panic!("incompatible fruit types in tree");
panic!("incompatible fruit types in tree or missing merge_fruits handler");
}
}
Ok(())
}
}
@@ -307,7 +351,7 @@ pub enum IntermediateBucketResult {
impl IntermediateBucketResult {
pub(crate) fn into_final_bucket_result(
self,
req: &BucketAggregationInternal,
req: &Aggregation,
limits: &AggregationLimits,
) -> crate::Result<BucketResult> {
match self {
@@ -317,8 +361,9 @@ impl IntermediateBucketResult {
.into_values()
.map(|bucket| {
bucket.into_final_bucket_entry(
&req.sub_aggregation,
req.as_range()
req.sub_aggregation(),
req.agg
.as_range()
.expect("unexpected aggregation, expected histogram aggregation"),
range_res.column_type,
limits,
@@ -333,6 +378,7 @@ impl IntermediateBucketResult {
});
let is_keyed = req
.agg
.as_range()
.expect("unexpected aggregation, expected range aggregation")
.keyed;
@@ -353,13 +399,14 @@ impl IntermediateBucketResult {
buckets,
} => {
let histogram_req = &req
.agg
.as_histogram()?
.expect("unexpected aggregation, expected histogram aggregation");
let buckets = intermediate_histogram_buckets_to_final_buckets(
buckets,
column_type,
histogram_req,
&req.sub_aggregation,
req.sub_aggregation(),
limits,
)?;
@@ -376,33 +423,22 @@ impl IntermediateBucketResult {
Ok(BucketResult::Histogram { buckets })
}
IntermediateBucketResult::Terms(terms) => terms.into_final_result(
req.as_term()
req.agg
.as_term()
.expect("unexpected aggregation, expected term aggregation"),
&req.sub_aggregation,
req.sub_aggregation(),
limits,
),
}
}
pub(crate) fn empty_from_req(req: &BucketAggregationType) -> Self {
match req {
BucketAggregationType::Terms(_) => IntermediateBucketResult::Terms(Default::default()),
BucketAggregationType::Range(_) => IntermediateBucketResult::Range(Default::default()),
BucketAggregationType::Histogram(_) | BucketAggregationType::DateHistogram(_) => {
IntermediateBucketResult::Histogram {
buckets: vec![],
column_type: None,
}
}
}
}
fn merge_fruits(&mut self, other: IntermediateBucketResult) {
fn merge_fruits(&mut self, other: IntermediateBucketResult) -> crate::Result<()> {
match (self, other) {
(
IntermediateBucketResult::Terms(term_res_left),
IntermediateBucketResult::Terms(term_res_right),
) => {
merge_key_maps(&mut term_res_left.entries, term_res_right.entries);
merge_maps(&mut term_res_left.entries, term_res_right.entries)?;
term_res_left.sum_other_doc_count += term_res_right.sum_other_doc_count;
term_res_left.doc_count_error_upper_bound +=
term_res_right.doc_count_error_upper_bound;
@@ -412,7 +448,7 @@ impl IntermediateBucketResult {
IntermediateBucketResult::Range(range_res_left),
IntermediateBucketResult::Range(range_res_right),
) => {
merge_serialized_key_maps(&mut range_res_left.buckets, range_res_right.buckets);
merge_maps(&mut range_res_left.buckets, range_res_right.buckets)?;
}
(
IntermediateBucketResult::Histogram {
@@ -424,22 +460,23 @@ impl IntermediateBucketResult {
..
},
) => {
let buckets = buckets_left
.drain(..)
.merge_join_by(buckets_right.into_iter(), |left, right| {
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
})
.map(|either| match either {
itertools::EitherOrBoth::Both(mut left, right) => {
left.merge_fruits(right);
left
}
itertools::EitherOrBoth::Left(left) => left,
itertools::EitherOrBoth::Right(right) => right,
})
.collect();
let buckets: Result<Vec<IntermediateHistogramBucketEntry>, TantivyError> =
buckets_left
.drain(..)
.merge_join_by(buckets_right.into_iter(), |left, right| {
left.key.partial_cmp(&right.key).unwrap_or(Ordering::Equal)
})
.map(|either| match either {
itertools::EitherOrBoth::Both(mut left, right) => {
left.merge_fruits(right)?;
Ok(left)
}
itertools::EitherOrBoth::Left(left) => Ok(left),
itertools::EitherOrBoth::Right(right) => Ok(right),
})
.collect::<Result<_, _>>();
*buckets_left = buckets;
*buckets_left = buckets?;
}
(IntermediateBucketResult::Range(_), _) => {
panic!("try merge on different types")
@@ -451,6 +488,7 @@ impl IntermediateBucketResult {
panic!("try merge on different types")
}
}
Ok(())
}
}
@@ -464,59 +502,31 @@ pub struct IntermediateRangeBucketResult {
#[derive(Default, Clone, Debug, PartialEq, Serialize, Deserialize)]
/// Term aggregation including error counts
pub struct IntermediateTermBucketResult {
#[serde(
serialize_with = "serialize_entries",
deserialize_with = "deserialize_entries"
)]
pub(crate) entries: FxHashMap<Key, IntermediateTermBucketEntry>,
pub(crate) entries: FxHashMap<IntermediateKey, IntermediateTermBucketEntry>,
pub(crate) sum_other_doc_count: u64,
pub(crate) doc_count_error_upper_bound: u64,
}
// Serialize into a Vec to circument the JSON limitation, where keys can't be numbers
fn serialize_entries<S>(
entries: &FxHashMap<Key, IntermediateTermBucketEntry>,
serializer: S,
) -> Result<S::Ok, S::Error>
where
S: Serializer,
{
let mut seq = serializer.serialize_seq(Some(entries.len()))?;
for (k, v) in entries {
seq.serialize_element(&(k, v))?;
}
seq.end()
}
fn deserialize_entries<'de, D>(
deserializer: D,
) -> Result<FxHashMap<Key, IntermediateTermBucketEntry>, D::Error>
where D: Deserializer<'de> {
let vec_entries: Vec<(Key, IntermediateTermBucketEntry)> =
Deserialize::deserialize(deserializer)?;
Ok(vec_entries.into_iter().collect())
}
impl IntermediateTermBucketResult {
pub(crate) fn into_final_result(
self,
req: &TermsAggregation,
sub_aggregation_req: &AggregationsInternal,
sub_aggregation_req: &Aggregations,
limits: &AggregationLimits,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
.entries
.into_iter()
.filter(|bucket| bucket.1.doc_count >= req.min_doc_count)
.filter(|bucket| bucket.1.doc_count as u64 >= req.min_doc_count)
.map(|(key, entry)| {
Ok(BucketEntry {
key_as_string: None,
key,
doc_count: entry.doc_count,
key: key.into(),
doc_count: entry.doc_count as u64,
sub_aggregation: entry
.sub_aggregation
.into_final_bucket_result_internal(sub_aggregation_req, limits)?,
.into_final_result_internal(sub_aggregation_req, limits)?,
})
})
.collect::<crate::Result<_>>()?;
@@ -587,37 +597,23 @@ impl IntermediateTermBucketResult {
}
trait MergeFruits {
fn merge_fruits(&mut self, other: Self);
fn merge_fruits(&mut self, other: Self) -> crate::Result<()>;
}
fn merge_serialized_key_maps<V: MergeFruits + Clone>(
entries_left: &mut FxHashMap<SerializedKey, V>,
mut entries_right: FxHashMap<SerializedKey, V>,
) {
fn merge_maps<V: MergeFruits + Clone, T: Eq + PartialEq + Hash>(
entries_left: &mut FxHashMap<T, V>,
mut entries_right: FxHashMap<T, V>,
) -> crate::Result<()> {
for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) {
entry_left.merge_fruits(entry_right);
}
}
for (key, res) in entries_right.into_iter() {
entries_left.entry(key).or_insert(res);
}
}
fn merge_key_maps<V: MergeFruits + Clone>(
entries_left: &mut FxHashMap<Key, V>,
mut entries_right: FxHashMap<Key, V>,
) {
for (name, entry_left) in entries_left.iter_mut() {
if let Some(entry_right) = entries_right.remove(name) {
entry_left.merge_fruits(entry_right);
entry_left.merge_fruits(entry_right)?;
}
}
for (key, res) in entries_right.into_iter() {
entries_left.entry(key).or_insert(res);
}
Ok(())
}
/// This is the histogram entry for a bucket, which contains a key, count, and optionally
@@ -635,7 +631,7 @@ pub struct IntermediateHistogramBucketEntry {
impl IntermediateHistogramBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &AggregationsInternal,
req: &Aggregations,
limits: &AggregationLimits,
) -> crate::Result<BucketEntry> {
Ok(BucketEntry {
@@ -644,53 +640,41 @@ impl IntermediateHistogramBucketEntry {
doc_count: self.doc_count,
sub_aggregation: self
.sub_aggregation
.into_final_bucket_result_internal(req, limits)?,
.into_final_result_internal(req, limits)?,
})
}
}
impl From<SegmentHistogramBucketEntry> for IntermediateHistogramBucketEntry {
fn from(entry: SegmentHistogramBucketEntry) -> Self {
IntermediateHistogramBucketEntry {
key: entry.key,
doc_count: entry.doc_count,
sub_aggregation: Default::default(),
}
}
}
/// This is the range entry for a bucket, which contains a key, count, and optionally
/// sub_aggregations.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateRangeBucketEntry {
/// The unique the bucket is identified.
pub key: Key,
/// The unique key the bucket is identified with.
pub key: IntermediateKey,
/// The number of documents in the bucket.
pub doc_count: u64,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
/// The from range of the bucket. Equals `f64::MIN` when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub from: Option<f64>,
/// The to range of the bucket. Equals `f64::MAX` when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub to: Option<f64>,
}
impl IntermediateRangeBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &AggregationsInternal,
req: &Aggregations,
_range_req: &RangeAggregation,
column_type: Option<ColumnType>,
limits: &AggregationLimits,
) -> crate::Result<RangeBucketEntry> {
let mut range_bucket_entry = RangeBucketEntry {
key: self.key,
key: self.key.into(),
doc_count: self.doc_count,
sub_aggregation: self
.sub_aggregation
.into_final_bucket_result_internal(req, limits)?,
.into_final_result_internal(req, limits)?,
to: self.to,
from: self.from,
to_as_string: None,
@@ -719,29 +703,32 @@ impl IntermediateRangeBucketEntry {
#[derive(Clone, Default, Debug, PartialEq, Serialize, Deserialize)]
pub struct IntermediateTermBucketEntry {
/// The number of documents in the bucket.
pub doc_count: u64,
pub doc_count: u32,
/// The sub_aggregation in this bucket.
pub sub_aggregation: IntermediateAggregationResults,
}
impl MergeFruits for IntermediateTermBucketEntry {
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) {
fn merge_fruits(&mut self, other: IntermediateTermBucketEntry) -> crate::Result<()> {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
Ok(())
}
}
impl MergeFruits for IntermediateRangeBucketEntry {
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) {
fn merge_fruits(&mut self, other: IntermediateRangeBucketEntry) -> crate::Result<()> {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
Ok(())
}
}
impl MergeFruits for IntermediateHistogramBucketEntry {
fn merge_fruits(&mut self, other: IntermediateHistogramBucketEntry) {
fn merge_fruits(&mut self, other: IntermediateHistogramBucketEntry) -> crate::Result<()> {
self.doc_count += other.doc_count;
self.sub_aggregation.merge_fruits(other.sub_aggregation);
self.sub_aggregation.merge_fruits(other.sub_aggregation)?;
Ok(())
}
}
@@ -760,7 +747,7 @@ mod tests {
buckets.insert(
key.to_string(),
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
key: IntermediateKey::Str(key.to_string()),
doc_count: *doc_count,
sub_aggregation: Default::default(),
from: None,
@@ -770,14 +757,15 @@ mod tests {
}
map.insert(
"my_agg_level2".to_string(),
IntermediateBucketResult::Range(IntermediateRangeBucketResult {
buckets,
column_type: None,
}),
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
IntermediateRangeBucketResult {
buckets,
column_type: None,
},
)),
);
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
metrics: Default::default(),
aggs_res: map.into_iter().collect(),
}
}
@@ -790,7 +778,7 @@ mod tests {
buckets.insert(
key.to_string(),
IntermediateRangeBucketEntry {
key: Key::Str(key.to_string()),
key: IntermediateKey::Str(key.to_string()),
doc_count: *doc_count,
from: None,
to: None,
@@ -803,14 +791,15 @@ mod tests {
}
map.insert(
"my_agg_level1".to_string(),
IntermediateBucketResult::Range(IntermediateRangeBucketResult {
buckets,
column_type: None,
}),
IntermediateAggregationResult::Bucket(IntermediateBucketResult::Range(
IntermediateRangeBucketResult {
buckets,
column_type: None,
},
)),
);
IntermediateAggregationResults {
buckets: Some(VecWithNames::from_entries(map.into_iter().collect())),
metrics: Default::default(),
aggs_res: map.into_iter().collect(),
}
}
@@ -825,7 +814,7 @@ mod tests {
("blue".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(tree_right);
tree_left.merge_fruits(tree_right).unwrap();
let tree_expected = get_intermediat_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
@@ -846,7 +835,7 @@ mod tests {
("green".to_string(), 25, "1900".to_string(), 50),
]);
tree_left.merge_fruits(tree_right);
tree_left.merge_fruits(tree_right).unwrap();
let tree_expected = get_intermediat_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
@@ -866,30 +855,10 @@ mod tests {
let orig = tree_left.clone();
tree_left.merge_fruits(IntermediateAggregationResults::default());
tree_left
.merge_fruits(IntermediateAggregationResults::default())
.unwrap();
assert_eq!(tree_left, orig);
}
#[test]
fn test_term_bucket_json_roundtrip() {
let term_buckets = IntermediateTermBucketResult {
entries: vec![(
Key::F64(5.0),
IntermediateTermBucketEntry {
doc_count: 10,
sub_aggregation: Default::default(),
},
)]
.into_iter()
.collect(),
sum_other_doc_count: 0,
doc_count_error_upper_bound: 0,
};
let term_buckets_round: IntermediateTermBucketResult =
serde_json::from_str(&serde_json::to_string(&term_buckets).unwrap()).unwrap();
assert_eq!(term_buckets, term_buckets_round);
}
}

View File

@@ -1,17 +1,34 @@
//! Module for all metric aggregations.
//!
//! The aggregations in this family compute metrics, see [super::agg_req::MetricAggregation] for
//! details.
//! The aggregations in this family compute metrics based on values extracted
//! from the documents that are being aggregated. Values are extracted from the fast field of
//! the document.
//! Some aggregations output a single numeric metric (e.g. Average) and are called
//! single-value numeric metrics aggregation, others generate multiple metrics (e.g. Stats) and are
//! called multi-value numeric metrics aggregation.
//!
//! ## Supported Metric Aggregations
//! - [Average](AverageAggregation)
//! - [Stats](StatsAggregation)
//! - [Min](MinAggregation)
//! - [Max](MaxAggregation)
//! - [Sum](SumAggregation)
//! - [Count](CountAggregation)
//! - [Percentiles](PercentilesAggregationReq)
mod average;
mod count;
mod max;
mod min;
mod percentiles;
mod stats;
mod sum;
pub use average::*;
pub use count::*;
pub use max::*;
pub use min::*;
pub use percentiles::*;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
pub use stats::*;
pub use sum::*;
@@ -37,6 +54,33 @@ impl From<Option<f64>> for SingleMetricResult {
}
}
/// This is the wrapper of percentile entries, which can be vector or hashmap
/// depending on if it's keyed or not.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum PercentileValues {
/// Vector format percentile entries
Vec(Vec<PercentileValuesVecEntry>),
/// HashMap format percentile entries. Key is the serialized percentile
HashMap(FxHashMap<String, f64>),
}
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
/// The entry when requesting percentiles with keyed: false
pub struct PercentileValuesVecEntry {
key: f64,
value: f64,
}
/// Single-metric aggregations use this common result structure.
///
/// Main reason to wrap it in value is to match elasticsearch output structure.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PercentilesMetricResult {
/// The result of the percentile metric.
pub values: PercentileValues,
}
#[cfg(test)]
mod tests {
use crate::aggregation::agg_req::Aggregations;

View File

@@ -0,0 +1,548 @@
use std::fmt::Debug;
use columnar::ColumnType;
use serde::{Deserialize, Serialize};
use super::*;
use crate::aggregation::agg_req_with_accessor::{
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{f64_from_fastfield_u64, AggregationError};
use crate::{DocId, TantivyError};
/// # Percentiles
///
/// The percentiles aggregation is a useful tool for understanding the distribution
/// of a data set. It calculates the values below which a given percentage of the
/// data falls. For instance, the 95th percentile indicates the value below which
/// 95% of the data points can be found.
///
/// This aggregation can be particularly interesting for analyzing website or service response
/// times. For example, if the 95th percentile website load time is significantly higher than the
/// median, this indicates that a small percentage of users are experiencing much slower load times
/// than the majority.
///
/// To use the percentiles aggregation, you'll need to provide a field to
/// aggregate on. In the case of website load times, this would typically be a
/// field containing the duration of time it takes for the site to load.
///
/// The following example demonstrates a request for the percentiles of the "load_time"
/// field:
///
/// ```JSON
/// {
/// "percentiles": {
/// "field": "load_time"
/// }
/// }
/// ```
///
/// This request will return an object containing the default percentiles (1, 5,
/// 25, 50 (median), 75, 95, and 99). You can also customize the percentiles you want to
/// calculate by providing an array of values in the "percents" parameter:
///
/// ```JSON
/// {
/// "percentiles": {
/// "field": "load_time",
/// "percents": [10, 20, 30, 40, 50, 60, 70, 80, 90]
/// }
/// }
/// ```
///
/// In this example, the aggregation will return the 10th, 20th, 30th, 40th, 50th,
/// 60th, 70th, 80th, and 90th percentiles of the "load_time" field.
///
/// Analyzing the percentiles of website load times can help you understand the
/// user experience and identify areas for optimization. For example, if the 95th
/// percentile load time is significantly higher than the median, this indicates
/// that a small percentage of users are experiencing much slower load times than
/// the majority.
///
/// # Estimating Percentiles
///
/// While percentiles provide valuable insights into the distribution of data, it's
/// important to understand that they are often estimates. This is because
/// calculating exact percentiles for large data sets can be computationally
/// expensive and time-consuming. As a result, many percentile aggregation
/// algorithms use approximation techniques to provide faster results.
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
pub struct PercentilesAggregationReq {
/// The field name to compute the percentiles on.
pub field: String,
/// The percentiles to compute.
/// Defaults to [1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
pub percents: Option<Vec<f64>>,
/// Whether to return the percentiles as a hash map
#[serde(default = "default_as_true")]
pub keyed: bool,
}
fn default_percentiles() -> &'static [f64] {
&[1.0, 5.0, 25.0, 50.0, 75.0, 95.0, 99.0]
}
fn default_as_true() -> bool {
true
}
impl PercentilesAggregationReq {
/// Creates a new [`PercentilesAggregationReq`] instance from a field name.
pub fn from_field_name(field_name: String) -> Self {
PercentilesAggregationReq {
field: field_name,
percents: None,
keyed: default_as_true(),
}
}
/// Returns the field name the aggregation is computed on.
pub fn field_name(&self) -> &str {
&self.field
}
fn validate(&self) -> crate::Result<()> {
if let Some(percents) = self.percents.as_ref() {
let all_in_range = percents
.iter()
.cloned()
.all(|percent| (0.0..=100.0).contains(&percent));
if !all_in_range {
return Err(TantivyError::AggregationError(
AggregationError::InvalidRequest(
"All percentiles have to be between 0.0 and 100.0".to_string(),
),
));
}
}
Ok(())
}
}
#[derive(Clone, Debug, PartialEq)]
pub(crate) struct SegmentPercentilesCollector {
field_type: ColumnType,
pub(crate) percentiles: PercentilesCollector,
pub(crate) accessor_idx: usize,
val_cache: Vec<u64>,
}
#[derive(Clone, Serialize, Deserialize)]
/// The percentiles collector used during segment collection and for merging results.
pub struct PercentilesCollector {
sketch: sketches_ddsketch::DDSketch,
}
impl Default for PercentilesCollector {
fn default() -> Self {
Self::new()
}
}
impl Debug for PercentilesCollector {
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
f.debug_struct("IntermediatePercentiles")
.field("sketch_len", &self.sketch.length())
.finish()
}
}
impl PartialEq for PercentilesCollector {
fn eq(&self, _other: &Self) -> bool {
false
}
}
fn format_percentil(percentil: f64) -> String {
let mut out = percentil.to_string();
// Slightly silly way to format trailing decimals
if !out.contains('.') {
out.push_str(".0");
}
out
}
impl PercentilesCollector {
/// Convert result into final result. This will query the quantils from the underlying quantil
/// collector.
pub fn into_final_result(self, req: &PercentilesAggregationReq) -> PercentilesMetricResult {
let percentiles: &[f64] = req
.percents
.as_ref()
.map(|el| el.as_ref())
.unwrap_or(default_percentiles());
let iter_quantile_and_values = percentiles.iter().cloned().map(|percentile| {
(
percentile,
self.sketch
.quantile(percentile / 100.0)
.expect(
"quantil out of range. This error should have been caught during \
validation phase",
)
.unwrap_or(f64::NAN),
)
});
let values = if req.keyed {
PercentileValues::HashMap(
iter_quantile_and_values
.map(|(val, quantil)| (format_percentil(val), quantil))
.collect(),
)
} else {
PercentileValues::Vec(
iter_quantile_and_values
.map(|(key, value)| PercentileValuesVecEntry { key, value })
.collect(),
)
};
PercentilesMetricResult { values }
}
fn new() -> Self {
let ddsketch_config = sketches_ddsketch::Config::defaults();
let sketch = sketches_ddsketch::DDSketch::new(ddsketch_config);
Self { sketch }
}
fn collect(&mut self, val: f64) {
self.sketch.add(val);
}
pub(crate) fn merge_fruits(&mut self, right: PercentilesCollector) -> crate::Result<()> {
self.sketch.merge(&right.sketch).map_err(|err| {
TantivyError::AggregationError(AggregationError::InternalError(format!(
"Error while merging percentiles {err:?}"
)))
})?;
Ok(())
}
}
impl SegmentPercentilesCollector {
pub fn from_req_and_validate(
req: &PercentilesAggregationReq,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
req.validate()?;
Ok(Self {
field_type,
percentiles: PercentilesCollector::new(),
accessor_idx,
val_cache: Default::default(),
})
}
#[inline]
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
agg_accessor: &mut AggregationWithAccessor,
) {
agg_accessor
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
for val in agg_accessor.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.percentiles.collect(val1);
}
}
}
impl SegmentAggregationCollector for SegmentPercentilesCollector {
#[inline]
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let intermediate_metric_result = IntermediateMetricResult::Percentiles(self.percentiles);
results.push(
name,
IntermediateAggregationResult::Metric(intermediate_metric_result),
)?;
Ok(())
}
#[inline]
fn collect(
&mut self,
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let field = &agg_with_accessor.aggs.values[self.accessor_idx].accessor;
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.percentiles.collect(val1);
}
Ok(())
}
#[inline]
fn collect_block(
&mut self,
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let field = &mut agg_with_accessor.aggs.values[self.accessor_idx];
self.collect_block_with_field(docs, field);
Ok(())
}
}
#[cfg(test)]
mod tests {
use itertools::Itertools;
use more_asserts::{assert_ge, assert_le};
use rand::rngs::StdRng;
use rand::SeedableRng;
use serde_json::Value;
use crate::aggregation::agg_req::Aggregations;
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::tests::{
get_test_index_from_values, get_test_index_from_values_and_terms,
};
use crate::aggregation::AggregationCollector;
use crate::query::AllQuery;
#[test]
fn test_aggregation_percentiles_empty_index() -> crate::Result<()> {
// test index without segments
let values = vec![];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = serde_json::from_value(json!({
"percentiles": {
"percentiles": {
"field": "score",
}
},
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
assert_eq!(
res["percentiles"]["values"],
json!({
"1.0": Value::Null,
"5.0": Value::Null,
"25.0": Value::Null,
"50.0": Value::Null,
"75.0": Value::Null,
"95.0": Value::Null,
"99.0": Value::Null,
})
);
Ok(())
}
#[test]
fn test_aggregation_percentile_simple() -> crate::Result<()> {
let values = vec![10.0];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = serde_json::from_value(json!({
"percentiles": {
"percentiles": {
"field": "score",
}
},
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
let percents = vec!["1.0", "5.0", "25.0", "50.0", "75.0", "95.0", "99.0"];
let range = 9.9..10.1;
for percent in percents {
let val = res["percentiles"]["values"][percent].as_f64().unwrap();
assert!(range.contains(&val));
}
Ok(())
}
#[test]
fn test_aggregation_percentile_parameters() -> crate::Result<()> {
let values = vec![10.0];
let index = get_test_index_from_values(false, &values)?;
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
let percents = vec!["95.0", "99.0", "99.9"];
let expected_range = 9.9..10.1;
for percent in percents {
let val = res["mypercentiles"]["values"][percent].as_f64().unwrap();
assert!(expected_range.contains(&val));
}
// Keyed false
//
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score",
"percents": [ 95, 99, 99.9 ],
"keyed": false
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let reader = index.reader()?;
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
let vals = &res["mypercentiles"]["values"];
assert_eq!(vals[0]["key"].as_f64().unwrap(), 95.0);
assert_eq!(vals[1]["key"].as_f64().unwrap(), 99.0);
assert_eq!(vals[2]["key"].as_f64().unwrap(), 99.9);
assert_eq!(vals[3]["key"], serde_json::Value::Null);
assert!(expected_range.contains(&vals[0]["value"].as_f64().unwrap()));
assert!(expected_range.contains(&vals[1]["value"].as_f64().unwrap()));
assert!(expected_range.contains(&vals[2]["value"].as_f64().unwrap()));
Ok(())
}
#[test]
fn test_aggregation_percentiles_single_seg() -> crate::Result<()> {
test_aggregation_percentiles(true)
}
#[test]
fn test_aggregation_percentiles_multi_seg() -> crate::Result<()> {
test_aggregation_percentiles(false)
}
fn test_aggregation_percentiles(merge_segments: bool) -> crate::Result<()> {
use rand_distr::Distribution;
let num_values_in_segment = vec![100, 30_000, 8000];
let lg_norm = rand_distr::LogNormal::new(2.996f64, 0.979f64).unwrap();
let mut rng = StdRng::from_seed([1u8; 32]);
let segment_data = |i| {
(0..num_values_in_segment[i])
.map(|_| lg_norm.sample(&mut rng))
.collect_vec()
};
let values = (0..=2).map(segment_data).collect_vec();
let mut all_values = values
.iter()
.flat_map(|el| el.iter().cloned())
.collect_vec();
all_values.sort_unstable_by(|a, b| a.total_cmp(b));
fn get_exact_quantil(q: f64, all_values: &[f64]) -> f64 {
let q = q / 100.0;
assert!((0f64..=1f64).contains(&q));
let index = (all_values.len() as f64 * q).ceil() as usize;
let index = index.min(all_values.len() - 1);
all_values[index]
}
let segment_and_values = values
.into_iter()
.map(|segment_data| {
segment_data
.into_iter()
.map(|val| (val, val.to_string()))
.collect_vec()
})
.collect_vec();
let index =
get_test_index_from_values_and_terms(merge_segments, &segment_and_values).unwrap();
let reader = index.reader()?;
let agg_req_str = r#"
{
"mypercentiles": {
"percentiles": {
"field": "score_f64",
"percents": [ 95, 99, 99.9 ]
}
}
} "#;
let agg_req_1: Aggregations = serde_json::from_str(agg_req_str).unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
let searcher = reader.searcher();
let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
let res: Value = serde_json::from_str(&serde_json::to_string(&agg_res)?)?;
let vals = &res["mypercentiles"]["values"];
let check_quantil = |exact_quantil: f64, val: f64| {
let lower = exact_quantil - exact_quantil * 0.02;
let upper = exact_quantil + exact_quantil * 0.02;
assert_le!(val, upper);
assert_ge!(val, lower);
};
let val = vals["95.0"].as_f64().unwrap();
let exact_quantil = get_exact_quantil(95.0, &all_values);
check_quantil(exact_quantil, val);
let val = vals["99.0"].as_f64().unwrap();
let exact_quantil = get_exact_quantil(99.0, &all_values);
check_quantil(exact_quantil, val);
let val = vals["99.9"].as_f64().unwrap();
let exact_quantil = get_exact_quantil(99.9, &all_values);
check_quantil(exact_quantil, val);
Ok(())
}
}

View File

@@ -3,13 +3,13 @@ use serde::{Deserialize, Serialize};
use super::*;
use crate::aggregation::agg_req_with_accessor::{
AggregationsWithAccessor, MetricAggregationWithAccessor,
AggregationWithAccessor, AggregationsWithAccessor,
};
use crate::aggregation::f64_from_fastfield_u64;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResults, IntermediateMetricResult,
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
};
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
use crate::aggregation::{f64_from_fastfield_u64, VecWithNames};
use crate::{DocId, TantivyError};
/// A multi-value metric aggregation that computes a collection of statistics on numeric values that
@@ -66,8 +66,7 @@ impl Stats {
"max" => Ok(self.max),
"avg" => Ok(self.avg),
_ => Err(TantivyError::InvalidArgument(format!(
"Unknown property {} on stats metric aggregation",
agg_property
"Unknown property {agg_property} on stats metric aggregation"
))),
}
}
@@ -179,7 +178,7 @@ impl SegmentStatsCollector {
pub(crate) fn collect_block_with_field(
&mut self,
docs: &[DocId],
agg_accessor: &mut MetricAggregationWithAccessor,
agg_accessor: &mut AggregationWithAccessor,
) {
agg_accessor
.column_block_accessor
@@ -194,11 +193,12 @@ impl SegmentStatsCollector {
impl SegmentAggregationCollector for SegmentStatsCollector {
#[inline]
fn into_intermediate_aggregations_result(
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
let name = agg_with_accessor.metrics.keys[self.accessor_idx].to_string();
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
let intermediate_metric_result = match self.collecting_for {
SegmentStatsType::Average => {
@@ -219,15 +219,12 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
}
};
let metrics = Some(VecWithNames::from_entries(vec![(
results.push(
name,
intermediate_metric_result,
)]));
IntermediateAggregationResult::Metric(intermediate_metric_result),
)?;
Ok(IntermediateAggregationResults {
metrics,
buckets: None,
})
Ok(())
}
#[inline]
@@ -236,7 +233,7 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
doc: crate::DocId,
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let field = &agg_with_accessor.metrics.values[self.accessor_idx].accessor;
let field = &agg_with_accessor.aggs.values[self.accessor_idx].accessor;
for val in field.values_for_doc(doc) {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
@@ -252,7 +249,7 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
let field = &mut agg_with_accessor.metrics.values[self.accessor_idx];
let field = &mut agg_with_accessor.aggs.values[self.accessor_idx];
self.collect_block_with_field(docs, field);
Ok(())
}
@@ -261,16 +258,10 @@ impl SegmentAggregationCollector for SegmentStatsCollector {
#[cfg(test)]
mod tests {
use std::iter;
use serde_json::Value;
use crate::aggregation::agg_req::{
Aggregation, Aggregations, BucketAggregation, BucketAggregationType, MetricAggregation,
RangeAggregation,
};
use crate::aggregation::agg_req::{Aggregation, Aggregations};
use crate::aggregation::agg_result::AggregationResults;
use crate::aggregation::metric::StatsAggregation;
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values};
use crate::aggregation::AggregationCollector;
use crate::query::{AllQuery, TermQuery};
@@ -284,14 +275,14 @@ mod tests {
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = vec![(
"stats".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score".to_string(),
))),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"stats": {
"stats": {
"field": "score",
},
}
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
@@ -316,19 +307,18 @@ mod tests {
#[test]
fn test_aggregation_stats_simple() -> crate::Result<()> {
// test index without segments
let values = vec![10.0];
let index = get_test_index_from_values(false, &values)?;
let agg_req_1: Aggregations = vec![(
"stats".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score".to_string(),
))),
)]
.into_iter()
.collect();
let agg_req_1: Aggregations = serde_json::from_value(json!({
"stats": {
"stats": {
"field": "score",
},
}
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());
@@ -363,52 +353,42 @@ mod tests {
IndexRecordOption::Basic,
);
let agg_req_1: Aggregations = vec![
(
"stats_i64".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score_i64".to_string(),
))),
),
(
"stats_f64".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score_f64".to_string(),
))),
),
(
"stats".to_string(),
Aggregation::Metric(MetricAggregation::Stats(StatsAggregation::from_field_name(
"score".to_string(),
))),
),
(
"range".to_string(),
Aggregation::Bucket(
BucketAggregation {
bucket_agg: BucketAggregationType::Range(RangeAggregation {
field: "score".to_string(),
ranges: vec![
(3f64..7f64).into(),
(7f64..19f64).into(),
(19f64..20f64).into(),
],
..Default::default()
}),
sub_aggregation: iter::once((
"stats".to_string(),
Aggregation::Metric(MetricAggregation::Stats(
StatsAggregation::from_field_name("score".to_string()),
)),
))
.collect(),
let range_agg: Aggregation = {
serde_json::from_value(json!({
"range": {
"field": "score",
"ranges": [ { "from": 3.0f64, "to": 7.0f64 }, { "from": 7.0f64, "to": 19.0f64 }, { "from": 19.0f64, "to": 20.0f64 } ]
},
"aggs": {
"stats": {
"stats": {
"field": "score"
}
}
.into(),
),
),
]
.into_iter()
.collect();
}
}))
.unwrap()
};
let agg_req_1: Aggregations = serde_json::from_value(json!({
"stats_i64": {
"stats": {
"field": "score_i64",
},
},
"stats_f64": {
"stats": {
"field": "score_f64",
},
},
"stats": {
"stats": {
"field": "score",
},
},
"range": range_agg
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req_1, Default::default());

View File

@@ -24,6 +24,9 @@
//! ## JSON Format
//! Aggregations request and result structures de/serialize into elasticsearch compatible JSON.
//!
//! Notice: Intermediate aggregation results should not be de/serialized via JSON format.
//! Postcard is a good choice.
//!
//! ```verbatim
//! let agg_req: Aggregations = serde_json::from_str(json_request_string).unwrap();
//! let collector = AggregationCollector::from_aggs(agg_req, None);
@@ -35,6 +38,7 @@
//! ## Supported Aggregations
//! - [Bucket](bucket)
//! - [Histogram](bucket::HistogramAggregation)
//! - [DateHistogram](bucket::DateHistogramAggregationReq)
//! - [Range](bucket::RangeAggregation)
//! - [Terms](bucket::TermsAggregation)
//! - [Metric](metric)
@@ -44,39 +48,12 @@
//! - [Max](metric::MaxAggregation)
//! - [Sum](metric::SumAggregation)
//! - [Count](metric::CountAggregation)
//! - [Percentiles](metric::PercentilesAggregationReq)
//!
//! # Example
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
//! `(String, agg_req::Aggregation)` iterator.
//!
//! ```
//! use tantivy::aggregation::agg_req::{Aggregations, Aggregation, MetricAggregation};
//! use tantivy::aggregation::AggregationCollector;
//! use tantivy::aggregation::metric::AverageAggregation;
//! use tantivy::query::AllQuery;
//! use tantivy::aggregation::agg_result::AggregationResults;
//! use tantivy::IndexReader;
//!
//! # #[allow(dead_code)]
//! fn aggregate_on_index(reader: &IndexReader) {
//! let agg_req: Aggregations = vec![
//! (
//! "average".to_string(),
//! Aggregation::Metric(MetricAggregation::Average(
//! AverageAggregation::from_field_name("score".to_string()),
//! )),
//! ),
//! ]
//! .into_iter()
//! .collect();
//!
//! let collector = AggregationCollector::from_aggs(agg_req, Default::default());
//!
//! let searcher = reader.searcher();
//! let agg_res: AggregationResults = searcher.search(&AllQuery, &collector).unwrap();
//! }
//! ```
//! # Example JSON
//! Requests are compatible with the elasticsearch JSON request format.
//!
//! ```
@@ -116,32 +93,24 @@
//! aggregation and then calculate the average on each bucket.
//! ```
//! use tantivy::aggregation::agg_req::*;
//! use tantivy::aggregation::metric::AverageAggregation;
//! use tantivy::aggregation::bucket::RangeAggregation;
//! let sub_agg_req_1: Aggregations = vec![(
//! "average_in_range".to_string(),
//! Aggregation::Metric(MetricAggregation::Average(
//! AverageAggregation::from_field_name("score".to_string()),
//! )),
//! )]
//! .into_iter()
//! .collect();
//! use serde_json::json;
//!
//! let agg_req_1: Aggregations = vec![
//! (
//! "range".to_string(),
//! Aggregation::Bucket(Box::new(BucketAggregation {
//! bucket_agg: BucketAggregationType::Range(RangeAggregation{
//! field: "score".to_string(),
//! ranges: vec![(3f64..7f64).into(), (7f64..20f64).into()],
//! keyed: false,
//! }),
//! sub_aggregation: sub_agg_req_1.clone(),
//! })),
//! ),
//! ]
//! .into_iter()
//! .collect();
//! let agg_req_1: Aggregations = serde_json::from_value(json!({
//! "rangef64": {
//! "range": {
//! "field": "score",
//! "ranges": [
//! { "from": 3, "to": 7000 },
//! { "from": 7000, "to": 20000 },
//! { "from": 50000, "to": 60000 }
//! ]
//! },
//! "aggs": {
//! "average_in_range": { "avg": { "field": "score" } }
//! }
//! },
//! }))
//! .unwrap();
//! ```
//!
//! # Distributed Aggregation
@@ -153,7 +122,7 @@
//! [`merge_fruits`](intermediate_agg_result::IntermediateAggregationResults::merge_fruits) method
//! to merge multiple results. The merged result can then be converted into
//! [`AggregationResults`](agg_result::AggregationResults) via the
//! [`into_final_bucket_result`](intermediate_agg_result::IntermediateAggregationResults::into_final_bucket_result) method.
//! [`into_final_result`](intermediate_agg_result::IntermediateAggregationResults::into_final_result) method.
mod agg_limits;
pub mod agg_req;
@@ -174,6 +143,8 @@ use std::fmt::Display;
#[cfg(test)]
mod agg_tests;
mod agg_bench;
pub use agg_limits::AggregationLimits;
pub use collector::{
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
@@ -186,13 +157,22 @@ use itertools::Itertools;
use serde::{Deserialize, Serialize};
/// Represents an associative array `(key => values)` in a very efficient manner.
#[derive(Clone, PartialEq, Serialize, Deserialize)]
pub(crate) struct VecWithNames<T: Clone> {
#[derive(PartialEq, Serialize, Deserialize)]
pub(crate) struct VecWithNames<T> {
pub(crate) values: Vec<T>,
keys: Vec<String>,
}
impl<T: Clone> Default for VecWithNames<T> {
impl<T: Clone> Clone for VecWithNames<T> {
fn clone(&self) -> Self {
Self {
values: self.values.clone(),
keys: self.keys.clone(),
}
}
}
impl<T> Default for VecWithNames<T> {
fn default() -> Self {
Self {
values: Default::default(),
@@ -201,24 +181,19 @@ impl<T: Clone> Default for VecWithNames<T> {
}
}
impl<T: Clone + std::fmt::Debug> std::fmt::Debug for VecWithNames<T> {
impl<T: std::fmt::Debug> std::fmt::Debug for VecWithNames<T> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_map().entries(self.iter()).finish()
}
}
impl<T: Clone> From<HashMap<String, T>> for VecWithNames<T> {
impl<T> From<HashMap<String, T>> for VecWithNames<T> {
fn from(map: HashMap<String, T>) -> Self {
VecWithNames::from_entries(map.into_iter().collect_vec())
}
}
impl<T: Clone> VecWithNames<T> {
fn extend(&mut self, entries: VecWithNames<T>) {
self.keys.extend(entries.keys);
self.values.extend(entries.values);
}
impl<T> VecWithNames<T> {
fn from_entries(mut entries: Vec<(String, T)>) -> Self {
// Sort to ensure order of elements match across multiple instances
entries.sort_by(|left, right| left.0.cmp(&right.0));
@@ -233,21 +208,12 @@ impl<T: Clone> VecWithNames<T> {
keys: data_names,
}
}
fn into_iter(self) -> impl Iterator<Item = (String, T)> {
self.keys.into_iter().zip(self.values.into_iter())
}
fn iter(&self) -> impl Iterator<Item = (&str, &T)> + '_ {
self.keys().zip(self.values.iter())
}
fn keys(&self) -> impl Iterator<Item = &str> + '_ {
self.keys.iter().map(|key| key.as_str())
}
fn into_values(self) -> impl Iterator<Item = T> {
self.values.into_iter()
}
fn values(&self) -> impl Iterator<Item = &T> + '_ {
self.values.iter()
}
fn values_mut(&mut self) -> impl Iterator<Item = &mut T> + '_ {
self.values.iter_mut()
}
@@ -316,7 +282,7 @@ pub(crate) fn f64_from_fastfield_u64(val: u64, field_type: &ColumnType) -> f64 {
ColumnType::I64 | ColumnType::DateTime => i64::from_u64(val) as f64,
ColumnType::F64 => f64::from_u64(val),
_ => {
panic!("unexpected type {:?}. This should not happen", field_type)
panic!("unexpected type {field_type:?}. This should not happen")
}
}
}
@@ -461,7 +427,7 @@ mod tests {
let index = Index::create_in_ram(schema_builder.build());
{
// let mut index_writer = index.writer_for_tests()?;
let mut index_writer = index.writer_with_num_threads(1, 30_000_000)?;
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
for values in segment_and_values {
for (i, term) in values {

View File

@@ -6,24 +6,23 @@
use std::fmt::Debug;
pub(crate) use super::agg_limits::AggregationLimits;
use super::agg_req::MetricAggregation;
use super::agg_req_with_accessor::{
AggregationsWithAccessor, BucketAggregationWithAccessor, MetricAggregationWithAccessor,
};
use super::agg_req::AggregationVariants;
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::metric::{
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, SegmentStatsCollector,
SegmentStatsType, StatsAggregation, SumAggregation,
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
SegmentPercentilesCollector, SegmentStatsCollector, SegmentStatsType, StatsAggregation,
SumAggregation,
};
use super::VecWithNames;
use crate::aggregation::agg_req::BucketAggregationType;
use crate::aggregation::bucket::SegmentTermCollectorComposite;
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
fn into_intermediate_aggregations_result(
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults>;
results: &mut IntermediateAggregationResults,
) -> crate::Result<()>;
fn collect(
&mut self,
@@ -63,92 +62,101 @@ impl Clone for Box<dyn SegmentAggregationCollector> {
}
pub(crate) fn build_segment_agg_collector(
req: &AggregationsWithAccessor,
req: &mut AggregationsWithAccessor,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
// Single metric special case
if req.buckets.is_empty() && req.metrics.len() == 1 {
let req = &req.metrics.values[0];
// Single collector special case
if req.aggs.len() == 1 {
let req = &mut req.aggs.values[0];
let accessor_idx = 0;
return build_metric_segment_agg_collector(req, accessor_idx);
}
// Single bucket special case
if req.metrics.is_empty() && req.buckets.len() == 1 {
let req = &req.buckets.values[0];
let accessor_idx = 0;
return build_bucket_segment_agg_collector(req, accessor_idx);
return build_single_agg_segment_collector(req, accessor_idx);
}
let agg = GenericSegmentAggregationResultsCollector::from_req_and_validate(req)?;
Ok(Box::new(agg))
}
pub(crate) fn build_metric_segment_agg_collector(
req: &MetricAggregationWithAccessor,
pub(crate) fn build_single_agg_segment_collector(
req: &mut AggregationWithAccessor,
accessor_idx: usize,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
let stats_collector = match &req.metric {
MetricAggregation::Average(AverageAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Average, accessor_idx)
use AggregationVariants::*;
match &req.agg.agg {
Terms(terms_req) => {
if let Some(acc2) = req.accessor2.as_ref() {
Ok(Box::new(
SegmentTermCollectorComposite::from_req_and_validate(
terms_req,
&mut req.sub_aggregation,
req.field_type,
acc2.1,
accessor_idx,
)?,
))
} else {
Ok(Box::new(SegmentTermCollector::from_req_and_validate(
terms_req,
&mut req.sub_aggregation,
req.field_type,
accessor_idx,
)?))
}
}
MetricAggregation::Count(CountAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Count, accessor_idx)
}
MetricAggregation::Max(MaxAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Max, accessor_idx)
}
MetricAggregation::Min(MinAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Min, accessor_idx)
}
MetricAggregation::Stats(StatsAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Stats, accessor_idx)
}
MetricAggregation::Sum(SumAggregation { .. }) => {
SegmentStatsCollector::from_req(req.field_type, SegmentStatsType::Sum, accessor_idx)
}
};
Ok(Box::new(stats_collector))
}
pub(crate) fn build_bucket_segment_agg_collector(
req: &BucketAggregationWithAccessor,
accessor_idx: usize,
) -> crate::Result<Box<dyn SegmentAggregationCollector>> {
match &req.bucket_agg {
BucketAggregationType::Terms(terms_req) => {
Ok(Box::new(SegmentTermCollector::from_req_and_validate(
terms_req,
&req.sub_aggregation,
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
range_req,
&mut req.sub_aggregation,
&mut req.limits,
req.field_type,
accessor_idx,
)?)),
Histogram(histogram) => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
histogram.clone(),
&mut req.sub_aggregation,
req.field_type,
accessor_idx,
)?)),
DateHistogram(histogram) => Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
histogram.to_histogram_req()?,
&mut req.sub_aggregation,
req.field_type,
accessor_idx,
)?)),
Average(AverageAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Average,
accessor_idx,
))),
Count(CountAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Count,
accessor_idx,
))),
Max(MaxAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Max,
accessor_idx,
))),
Min(MinAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Min,
accessor_idx,
))),
Stats(StatsAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Stats,
accessor_idx,
))),
Sum(SumAggregation { .. }) => Ok(Box::new(SegmentStatsCollector::from_req(
req.field_type,
SegmentStatsType::Sum,
accessor_idx,
))),
Percentiles(percentiles_req) => Ok(Box::new(
SegmentPercentilesCollector::from_req_and_validate(
percentiles_req,
req.field_type,
accessor_idx,
)?))
}
BucketAggregationType::Range(range_req) => {
Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
range_req,
&req.sub_aggregation,
&req.limits,
req.field_type,
accessor_idx,
)?))
}
BucketAggregationType::Histogram(histogram) => {
Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
histogram,
&req.sub_aggregation,
req.field_type,
accessor_idx,
)?))
}
BucketAggregationType::DateHistogram(histogram) => {
Ok(Box::new(SegmentHistogramCollector::from_req_and_validate(
&histogram.to_histogram_req()?,
&req.sub_aggregation,
req.field_type,
accessor_idx,
)?))
}
)?,
)),
}
}
@@ -157,50 +165,28 @@ pub(crate) fn build_bucket_segment_agg_collector(
/// can handle arbitrary complexity of sub-aggregations. Ideally we never have to pick this one
/// and can provide specialized versions instead, that remove some of its overhead.
pub(crate) struct GenericSegmentAggregationResultsCollector {
pub(crate) metrics: Option<Vec<Box<dyn SegmentAggregationCollector>>>,
pub(crate) buckets: Option<Vec<Box<dyn SegmentAggregationCollector>>>,
pub(crate) aggs: Vec<Box<dyn SegmentAggregationCollector>>,
}
impl Debug for GenericSegmentAggregationResultsCollector {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("SegmentAggregationResultsCollector")
.field("metrics", &self.metrics)
.field("buckets", &self.buckets)
.field("aggs", &self.aggs)
.finish()
}
}
impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
fn into_intermediate_aggregations_result(
fn add_intermediate_aggregation_result(
self: Box<Self>,
agg_with_accessor: &AggregationsWithAccessor,
) -> crate::Result<IntermediateAggregationResults> {
let buckets = if let Some(buckets) = self.buckets {
let mut intermeditate_buckets = VecWithNames::default();
for bucket in buckets {
// TODO too many allocations?
let res = bucket.into_intermediate_aggregations_result(agg_with_accessor)?;
// unwrap is fine since we only have buckets here
intermeditate_buckets.extend(res.buckets.unwrap());
}
Some(intermeditate_buckets)
} else {
None
};
let metrics = if let Some(metrics) = self.metrics {
let mut intermeditate_metrics = VecWithNames::default();
for metric in metrics {
// TODO too many allocations?
let res = metric.into_intermediate_aggregations_result(agg_with_accessor)?;
// unwrap is fine since we only have metrics here
intermeditate_metrics.extend(res.metrics.unwrap());
}
Some(intermeditate_metrics)
} else {
None
};
results: &mut IntermediateAggregationResults,
) -> crate::Result<()> {
for agg in self.aggs {
agg.add_intermediate_aggregation_result(agg_with_accessor, results)?;
}
Ok(IntermediateAggregationResults { metrics, buckets })
Ok(())
}
fn collect(
@@ -218,66 +204,30 @@ impl SegmentAggregationCollector for GenericSegmentAggregationResultsCollector {
docs: &[crate::DocId],
agg_with_accessor: &mut AggregationsWithAccessor,
) -> crate::Result<()> {
if let Some(metrics) = self.metrics.as_mut() {
for collector in metrics {
collector.collect_block(docs, agg_with_accessor)?;
}
}
if let Some(buckets) = self.buckets.as_mut() {
for collector in buckets {
collector.collect_block(docs, agg_with_accessor)?;
}
for collector in &mut self.aggs {
collector.collect_block(docs, agg_with_accessor)?;
}
Ok(())
}
fn flush(&mut self, agg_with_accessor: &mut AggregationsWithAccessor) -> crate::Result<()> {
if let Some(metrics) = &mut self.metrics {
for collector in metrics {
collector.flush(agg_with_accessor)?;
}
}
if let Some(buckets) = &mut self.buckets {
for collector in buckets {
collector.flush(agg_with_accessor)?;
}
for collector in &mut self.aggs {
collector.flush(agg_with_accessor)?;
}
Ok(())
}
}
impl GenericSegmentAggregationResultsCollector {
pub(crate) fn from_req_and_validate(req: &AggregationsWithAccessor) -> crate::Result<Self> {
let buckets = req
.buckets
.iter()
pub(crate) fn from_req_and_validate(req: &mut AggregationsWithAccessor) -> crate::Result<Self> {
let aggs = req
.aggs
.values_mut()
.enumerate()
.map(|(accessor_idx, (_key, req))| {
build_bucket_segment_agg_collector(req, accessor_idx)
})
.collect::<crate::Result<Vec<Box<dyn SegmentAggregationCollector>>>>()?;
let metrics = req
.metrics
.iter()
.enumerate()
.map(|(accessor_idx, (_key, req))| {
build_metric_segment_agg_collector(req, accessor_idx)
})
.map(|(accessor_idx, req)| build_single_agg_segment_collector(req, accessor_idx))
.collect::<crate::Result<Vec<Box<dyn SegmentAggregationCollector>>>>()?;
let metrics = if metrics.is_empty() {
None
} else {
Some(metrics)
};
let buckets = if buckets.is_empty() {
None
} else {
Some(buckets)
};
Ok(GenericSegmentAggregationResultsCollector { metrics, buckets })
Ok(GenericSegmentAggregationResultsCollector { aggs })
}
}

View File

@@ -161,6 +161,21 @@ fn facet_depth(facet_bytes: &[u8]) -> usize {
/// ]);
/// }
///
/// {
/// let mut facet_collector = FacetCollector::for_field("facet");
/// facet_collector.add_facet("/");
/// let facet_counts = searcher.search(&AllQuery, &facet_collector)?;
///
/// // This lists all of the facet counts
/// let facets: Vec<(&Facet, u64)> = facet_counts
/// .get("/")
/// .collect();
/// assert_eq!(facets, vec![
/// (&Facet::from("/category"), 4),
/// (&Facet::from("/lang"), 4)
/// ]);
/// }
///
/// Ok(())
/// }
/// # assert!(example().is_ok());
@@ -285,6 +300,9 @@ fn is_child_facet(parent_facet: &[u8], possible_child_facet: &[u8]) -> bool {
if !possible_child_facet.starts_with(parent_facet) {
return false;
}
if parent_facet.is_empty() {
return true;
}
possible_child_facet.get(parent_facet.len()).copied() == Some(0u8)
}
@@ -414,8 +432,8 @@ impl FacetCounts {
pub fn get<T>(&self, facet_from: T) -> FacetChildIterator<'_>
where Facet: From<T> {
let facet = Facet::from(facet_from);
let left_bound = Bound::Excluded(facet.clone());
let right_bound = if facet.is_root() {
let lower_bound = Bound::Excluded(facet.clone());
let upper_bound = if facet.is_root() {
Bound::Unbounded
} else {
let mut facet_after_bytes: String = facet.encoded_str().to_owned();
@@ -424,7 +442,7 @@ impl FacetCounts {
Bound::Excluded(facet_after)
};
let underlying: btree_map::Range<'_, _, _> =
self.facet_counts.range((left_bound, right_bound));
self.facet_counts.range((lower_bound, upper_bound));
FacetChildIterator { underlying }
}
@@ -789,6 +807,15 @@ mod tests {
);
Ok(())
}
#[test]
fn is_child_facet() {
assert!(super::is_child_facet(&b"foo"[..], &b"foo\0bar"[..]));
assert!(super::is_child_facet(&b""[..], &b"foo\0bar"[..]));
assert!(super::is_child_facet(&b""[..], &b"foo"[..]));
assert!(!super::is_child_facet(&b"foo\0bar"[..], &b"foo"[..]));
assert!(!super::is_child_facet(&b"foo"[..], &b"foobar\0baz"[..]));
}
}
#[cfg(all(test, feature = "unstable"))]
@@ -812,7 +839,7 @@ mod bench {
let mut docs = vec![];
for val in 0..50 {
let facet = Facet::from(&format!("/facet_{}", val));
let facet = Facet::from(&format!("/facet_{val}"));
for _ in 0..val * val {
docs.push(doc!(facet_field=>facet.clone()));
}

View File

@@ -6,32 +6,35 @@
//
// Of course, you can have a look at the tantivy's built-in collectors
// such as the `CountCollector` for more examples.
// ---
// Importing tantivy...
use std::fmt::Debug;
use std::marker::PhantomData;
use std::sync::Arc;
use columnar::{ColumnValues, DynamicColumn, HasAssociatedColumnType};
use columnar::{BytesColumn, Column, DynamicColumn, HasAssociatedColumnType};
use crate::collector::{Collector, SegmentCollector};
use crate::schema::Field;
use crate::{Score, SegmentReader, TantivyError};
use crate::{DocId, Score, SegmentReader, TantivyError};
/// The `FilterCollector` filters docs using a fast field value and a predicate.
/// Only the documents for which the predicate returned "true" will be passed on to the next
/// collector.
///
/// Only the documents containing at least one value for which the predicate returns `true`
/// will be passed on to the next collector.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
///
/// ```rust
/// use tantivy::collector::{TopDocs, FilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, INDEXED, FAST};
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let price = schema_builder.add_u64_field("price", INDEXED | FAST);
/// let price = schema_builder.add_u64_field("price", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
@@ -47,20 +50,24 @@ use crate::{Score, SegmentReader, TantivyError};
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let no_filter_collector = FilterCollector::new(price, &|value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let no_filter_collector = FilterCollector::new(price, |value: u64| value > 20_120u64, TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &no_filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 1));
///
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, &|value| value < 5u64, TopDocs::with_limit(2));
/// let filter_all_collector: FilterCollector<_, _, u64> = FilterCollector::new(price, |value| value < 5u64, TopDocs::with_limit(2));
/// let filtered_top_docs = searcher.search(&query, &filter_all_collector)?;
///
/// assert_eq!(filtered_top_docs.len(), 0);
/// # Ok(())
/// # }
/// ```
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue: Default>
///
/// Note that this is limited to fast fields which implement the
/// [`FastValue`][crate::fastfield::FastValue] trait, e.g. `u64` but not `&[u8]`.
/// To filter based on a bytes fast field, use a [`BytesFilterCollector`] instead.
pub struct FilterCollector<TCollector, TPredicate, TPredicateValue>
where TPredicate: 'static + Clone
{
field: Field,
@@ -69,19 +76,15 @@ where TPredicate: 'static + Clone
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TCollector, TPredicate, TPredicateValue: Default>
impl<TCollector, TPredicate, TPredicateValue>
FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(TPredicateValue) -> bool + Send + Sync + Clone,
{
/// Create a new FilterCollector.
pub fn new(
field: Field,
predicate: TPredicate,
collector: TCollector,
) -> FilterCollector<TCollector, TPredicate, TPredicateValue> {
FilterCollector {
/// Create a new `FilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
@@ -90,7 +93,7 @@ where
}
}
impl<TCollector, TPredicate, TPredicateValue: Default> Collector
impl<TCollector, TPredicate, TPredicateValue> Collector
for FilterCollector<TCollector, TPredicate, TPredicateValue>
where
TCollector: Collector + Send + Sync,
@@ -98,8 +101,6 @@ where
TPredicateValue: HasAssociatedColumnType,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
// That's the type of our result.
// Our standard deviation will be a float.
type Fruit = TCollector::Fruit;
type Child = FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>;
@@ -108,7 +109,7 @@ where
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<FilterSegmentCollector<TCollector::Child, TPredicate, TPredicateValue>> {
) -> crate::Result<Self::Child> {
let schema = segment_reader.schema();
let field_entry = schema.get_field_entry(self.field);
if !field_entry.is_fast() {
@@ -118,16 +119,16 @@ where
)));
}
let fast_field_reader = segment_reader
let column_opt = segment_reader
.fast_fields()
.column_first_or_default(schema.get_field_name(self.field))?;
.column_opt(field_entry.name())?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(FilterSegmentCollector {
fast_field_reader,
column_opt,
segment_collector,
predicate: self.predicate.clone(),
t_predicate_value: PhantomData,
@@ -146,35 +147,208 @@ where
}
}
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicate: 'static,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
{
fast_field_reader: Arc<dyn ColumnValues<TPredicateValue>>,
pub struct FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue> {
column_opt: Option<Column<TPredicateValue>>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
t_predicate_value: PhantomData<TPredicateValue>,
}
impl<TSegmentCollector, TPredicate, TPredicateValue>
FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TPredicateValue: PartialOrd + Copy + Debug + Send + Sync + 'static,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for val in column.values_for_doc(doc_id) {
if (self.predicate)(val) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate, TPredicateValue> SegmentCollector
for FilterSegmentCollector<TSegmentCollector, TPredicate, TPredicateValue>
where
TSegmentCollector: SegmentCollector,
TPredicateValue: HasAssociatedColumnType,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync,
DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>>,
TPredicate: 'static + Fn(TPredicateValue) -> bool + Send + Sync, /* DynamicColumn: Into<Option<columnar::Column<TPredicateValue>>> */
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
let value = self.fast_field_reader.get_val(doc);
if (self.predicate)(value) {
self.segment_collector.collect(doc, score)
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> <TSegmentCollector as SegmentCollector>::Fruit {
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
/// it transparently wraps an inner [`Collector`] but filters documents
/// based on the result of applying the predicate to the bytes fast field.
///
/// A document is accepted if and only if the predicate returns `true` for at least one value.
///
/// In other words,
/// - documents with no values are filtered out.
/// - documents with several values are accepted if at least one value matches the predicate.
///
/// ```rust
/// use tantivy::collector::{TopDocs, BytesFilterCollector};
/// use tantivy::query::QueryParser;
/// use tantivy::schema::{Schema, TEXT, FAST};
/// use tantivy::{doc, DocAddress, Index};
///
/// # fn main() -> tantivy::Result<()> {
/// let mut schema_builder = Schema::builder();
/// let title = schema_builder.add_text_field("title", TEXT);
/// let barcode = schema_builder.add_bytes_field("barcode", FAST);
/// let schema = schema_builder.build();
/// let index = Index::create_in_ram(schema);
///
/// let mut index_writer = index.writer_with_num_threads(1, 10_000_000)?;
/// index_writer.add_document(doc!(title => "The Name of the Wind", barcode => &b"010101"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of Muadib", barcode => &b"110011"[..]))?;
/// index_writer.add_document(doc!(title => "A Dairy Cow", barcode => &b"110111"[..]))?;
/// index_writer.add_document(doc!(title => "The Diary of a Young Girl", barcode => &b"011101"[..]))?;
/// index_writer.add_document(doc!(title => "Bridget Jones's Diary"))?;
/// index_writer.commit()?;
///
/// let reader = index.reader()?;
/// let searcher = reader.searcher();
///
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query = query_parser.parse_query("diary")?;
/// let filter_collector = BytesFilterCollector::new(barcode, |bytes: &[u8]| bytes.starts_with(b"01"), TopDocs::with_limit(2));
/// let top_docs = searcher.search(&query, &filter_collector)?;
///
/// assert_eq!(top_docs.len(), 1);
/// assert_eq!(top_docs[0].1, DocAddress::new(0, 3));
/// # Ok(())
/// # }
/// ```
pub struct BytesFilterCollector<TCollector, TPredicate>
where TPredicate: 'static + Clone
{
field: Field,
collector: TCollector,
predicate: TPredicate,
}
impl<TCollector, TPredicate> BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: Fn(&[u8]) -> bool + Send + Sync + Clone,
{
/// Create a new `BytesFilterCollector`.
pub fn new(field: Field, predicate: TPredicate, collector: TCollector) -> Self {
Self {
field,
predicate,
collector,
}
}
}
impl<TCollector, TPredicate> Collector for BytesFilterCollector<TCollector, TPredicate>
where
TCollector: Collector + Send + Sync,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync + Clone,
{
type Fruit = TCollector::Fruit;
type Child = BytesFilterSegmentCollector<TCollector::Child, TPredicate>;
fn for_segment(
&self,
segment_local_id: u32,
segment_reader: &SegmentReader,
) -> crate::Result<Self::Child> {
let schema = segment_reader.schema();
let field_name = schema.get_field_name(self.field);
let column_opt = segment_reader.fast_fields().bytes(field_name)?;
let segment_collector = self
.collector
.for_segment(segment_local_id, segment_reader)?;
Ok(BytesFilterSegmentCollector {
column_opt,
segment_collector,
predicate: self.predicate.clone(),
buffer: Vec::new(),
})
}
fn requires_scoring(&self) -> bool {
self.collector.requires_scoring()
}
fn merge_fruits(
&self,
segment_fruits: Vec<<TCollector::Child as SegmentCollector>::Fruit>,
) -> crate::Result<TCollector::Fruit> {
self.collector.merge_fruits(segment_fruits)
}
}
pub struct BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where TPredicate: 'static
{
column_opt: Option<BytesColumn>,
segment_collector: TSegmentCollector,
predicate: TPredicate,
buffer: Vec<u8>,
}
impl<TSegmentCollector, TPredicate> BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
#[inline]
fn accept_document(&mut self, doc_id: DocId) -> bool {
if let Some(column) = &self.column_opt {
for ord in column.term_ords(doc_id) {
self.buffer.clear();
let found = column.ord_to_bytes(ord, &mut self.buffer).unwrap_or(false);
if found && (self.predicate)(&self.buffer) {
return true;
}
}
}
false
}
}
impl<TSegmentCollector, TPredicate> SegmentCollector
for BytesFilterSegmentCollector<TSegmentCollector, TPredicate>
where
TSegmentCollector: SegmentCollector,
TPredicate: 'static + Fn(&[u8]) -> bool + Send + Sync,
{
type Fruit = TSegmentCollector::Fruit;
fn collect(&mut self, doc: u32, score: Score) {
if self.accept_document(doc) {
self.segment_collector.collect(doc, score);
}
}
fn harvest(self) -> TSegmentCollector::Fruit {
self.segment_collector.harvest()
}
}

View File

@@ -295,7 +295,7 @@ mod tests {
DateTime::from_primitive(
Date::from_calendar_date(1980, Month::January, 1)?.with_hms(0, 0, 0)?,
),
3_600_000_000 * 24 * 365, // it is just for a unit test... sorry leap years.
3_600_000_000_000 * 24 * 365, // it is just for a unit test... sorry leap years.
10,
);
let week_histogram = searcher.search(&all_query, &week_histogram_collector)?;

View File

@@ -112,7 +112,7 @@ mod docset_collector;
pub use self::docset_collector::DocSetCollector;
mod filter_collector_wrapper;
pub use self::filter_collector_wrapper::FilterCollector;
pub use self::filter_collector_wrapper::{BytesFilterCollector, FilterCollector};
/// `Fruit` is the type for the result of our collection.
/// e.g. `usize` for the `Count` collector.

View File

@@ -52,10 +52,8 @@ where
let requested_type = field_entry.field_type().value_type();
if schema_type != requested_type {
return Err(TantivyError::SchemaError(format!(
"Field {:?} is of type {:?}!={:?}",
field_entry.name(),
schema_type,
requested_type
"Field {:?} is of type {schema_type:?}!={requested_type:?}",
field_entry.name()
)));
}
self.collector.for_segment(segment_local_id, segment)

View File

@@ -26,7 +26,7 @@ impl Executor {
pub fn multi_thread(num_threads: usize, prefix: &'static str) -> crate::Result<Executor> {
let pool = ThreadPoolBuilder::new()
.num_threads(num_threads)
.thread_name(move |num| format!("{}{}", prefix, num))
.thread_name(move |num| format!("{prefix}{num}"))
.build()?;
Ok(Executor::ThreadPool(pool))
}

View File

@@ -39,10 +39,7 @@ fn load_metas(
.map_err(|e| {
DataCorruption::new(
META_FILEPATH.to_path_buf(),
format!(
"Meta file cannot be deserialized. {:?}. Content: {:?}",
e, meta_string
),
format!("Meta file cannot be deserialized. {e:?}. Content: {meta_string:?}"),
)
})
.map_err(From::from)
@@ -110,6 +107,7 @@ pub struct IndexBuilder {
schema: Option<Schema>,
index_settings: IndexSettings,
tokenizer_manager: TokenizerManager,
fast_field_tokenizer_manager: TokenizerManager,
}
impl Default for IndexBuilder {
fn default() -> Self {
@@ -123,6 +121,7 @@ impl IndexBuilder {
schema: None,
index_settings: IndexSettings::default(),
tokenizer_manager: TokenizerManager::default(),
fast_field_tokenizer_manager: TokenizerManager::default(),
}
}
@@ -140,12 +139,18 @@ impl IndexBuilder {
self
}
/// Set the tokenizers .
/// Set the tokenizers.
pub fn tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
self.tokenizer_manager = tokenizers;
self
}
/// Set the fast field tokenizers.
pub fn fast_field_tokenizers(mut self, tokenizers: TokenizerManager) -> Self {
self.fast_field_tokenizer_manager = tokenizers;
self
}
/// Creates a new index using the [`RamDirectory`].
///
/// The index will be allocated in anonymous memory.
@@ -270,6 +275,7 @@ impl IndexBuilder {
metas.index_settings = self.index_settings;
let mut index = Index::open_from_metas(directory, &metas, SegmentMetaInventory::default());
index.set_tokenizers(self.tokenizer_manager);
index.set_fast_field_tokenizers(self.fast_field_tokenizer_manager);
Ok(index)
}
}
@@ -282,6 +288,7 @@ pub struct Index {
settings: IndexSettings,
executor: Arc<Executor>,
tokenizers: TokenizerManager,
fast_field_tokenizers: TokenizerManager,
inventory: SegmentMetaInventory,
}
@@ -394,6 +401,7 @@ impl Index {
directory,
schema,
tokenizers: TokenizerManager::default(),
fast_field_tokenizers: TokenizerManager::default(),
executor: Arc::new(Executor::single_thread()),
inventory,
}
@@ -409,6 +417,16 @@ impl Index {
&self.tokenizers
}
/// Setter for the fast field tokenizer manager.
pub fn set_fast_field_tokenizers(&mut self, tokenizers: TokenizerManager) {
self.fast_field_tokenizers = tokenizers;
}
/// Accessor for the fast field tokenizer manager.
pub fn fast_field_tokenizer(&self) -> &TokenizerManager {
&self.fast_field_tokenizers
}
/// Get the tokenizer associated with a specific field.
pub fn tokenizer_for_field(&self, field: Field) -> crate::Result<TextAnalyzer> {
let field_entry = self.schema.get_field_entry(field);
@@ -426,8 +444,7 @@ impl Index {
};
let indexing_options = indexing_options_opt.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No indexing options set for field {:?}",
field_entry
"No indexing options set for field {field_entry:?}"
))
})?;
@@ -435,8 +452,7 @@ impl Index {
.get(indexing_options.tokenizer())
.ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"No Tokenizer found for field {:?}",
field_entry
"No Tokenizer found for field {field_entry:?}"
))
})
}

View File

@@ -61,7 +61,7 @@ impl InvertedIndexReader {
/// Returns the term info associated with the term.
pub fn get_term_info(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get(term.value_bytes())
self.termdict.get(term.serialized_value_bytes())
}
/// Return the term dictionary datastructure.
@@ -203,32 +203,104 @@ impl InvertedIndexReader {
#[cfg(feature = "quickwit")]
impl InvertedIndexReader {
pub(crate) async fn get_term_info_async(&self, term: &Term) -> io::Result<Option<TermInfo>> {
self.termdict.get_async(term.value_bytes()).await
self.termdict.get_async(term.serialized_value_bytes()).await
}
/// Returns a block postings given a `Term`.
async fn get_term_range_async(
&self,
terms: impl std::ops::RangeBounds<Term>,
limit: Option<u64>,
) -> io::Result<impl Iterator<Item = TermInfo> + '_> {
use std::ops::Bound;
let range_builder = self.termdict.range();
let range_builder = match terms.start_bound() {
Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()),
Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()),
Bound::Unbounded => range_builder,
};
let range_builder = match terms.end_bound() {
Bound::Included(bound) => range_builder.le(bound.serialized_value_bytes()),
Bound::Excluded(bound) => range_builder.lt(bound.serialized_value_bytes()),
Bound::Unbounded => range_builder,
};
let range_builder = if let Some(limit) = limit {
range_builder.limit(limit)
} else {
range_builder
};
let mut stream = range_builder.into_stream_async().await?;
let iter = std::iter::from_fn(move || stream.next().map(|(_k, v)| v.clone()));
// limit on stream is only an optimization to load less data, the stream may still return
// more than limit elements.
let limit = limit.map(|limit| limit as usize).unwrap_or(usize::MAX);
let iter = iter.take(limit);
Ok(iter)
}
/// Warmup a block postings given a `Term`.
/// This method is for an advanced usage only.
///
/// Most users should prefer using [`Self::read_postings()`] instead.
pub async fn warm_postings(&self, term: &Term, with_positions: bool) -> io::Result<()> {
let term_info_opt: Option<TermInfo> = self.get_term_info_async(term).await?;
if let Some(term_info) = term_info_opt {
self.postings_file_slice
.read_bytes_slice_async(term_info.postings_range.clone())
.await?;
let postings = self
.postings_file_slice
.read_bytes_slice_async(term_info.postings_range.clone());
if with_positions {
self.positions_file_slice
.read_bytes_slice_async(term_info.positions_range.clone())
.await?;
let positions = self
.positions_file_slice
.read_bytes_slice_async(term_info.positions_range.clone());
futures_util::future::try_join(postings, positions).await?;
} else {
postings.await?;
}
}
Ok(())
}
/// Read the block postings for all terms.
/// Warmup a block postings given a range of `Term`s.
/// This method is for an advanced usage only.
pub async fn warm_postings_range(
&self,
terms: impl std::ops::RangeBounds<Term>,
limit: Option<u64>,
with_positions: bool,
) -> io::Result<()> {
let mut term_info = self.get_term_range_async(terms, limit).await?;
let Some(first_terminfo) = term_info.next() else {
// no key matches, nothing more to load
return Ok(());
};
let last_terminfo = term_info.last().unwrap_or_else(|| first_terminfo.clone());
let postings_range = first_terminfo.postings_range.start..last_terminfo.postings_range.end;
let positions_range =
first_terminfo.positions_range.start..last_terminfo.positions_range.end;
let postings = self
.postings_file_slice
.read_bytes_slice_async(postings_range);
if with_positions {
let positions = self
.positions_file_slice
.read_bytes_slice_async(positions_range);
futures_util::future::try_join(postings, positions).await?;
} else {
postings.await?;
}
Ok(())
}
/// Warmup the block postings for all terms.
/// This method is for an advanced usage only.
///
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] instead.
/// If you know which terms to pre-load, prefer using [`Self::warm_postings`] or
/// [`Self::warm_postings`] instead.
pub async fn warm_postings_full(&self, with_positions: bool) -> io::Result<()> {
self.postings_file_slice.read_bytes_async().await?;
if with_positions {

View File

@@ -5,12 +5,12 @@ use rustc_hash::FxHashMap;
use crate::fastfield::FastValue;
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
use crate::schema::term::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type};
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{Field, Type, DATE_TIME_PRECISION_INDEXED};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::{OffsetDateTime, UtcOffset};
use crate::tokenizer::TextAnalyzer;
use crate::{DatePrecision, DateTime, DocId, Term};
use crate::{DateTime, DocId, Term};
/// This object is a map storing the last position for a given path for the current document
/// being indexed.
@@ -59,7 +59,7 @@ struct IndexingPositionsPerPath {
impl IndexingPositionsPerPath {
fn get_position(&mut self, term: &Term) -> &mut IndexingPosition {
self.positions_per_path
.entry(murmurhash2(term.as_slice()))
.entry(murmurhash2(term.serialized_term()))
.or_insert_with(Default::default)
}
}
@@ -67,7 +67,7 @@ impl IndexingPositionsPerPath {
pub(crate) fn index_json_values<'a>(
doc: DocId,
json_values: impl Iterator<Item = crate::Result<&'a serde_json::Map<String, serde_json::Value>>>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
expand_dots_enabled: bool,
term_buffer: &mut Term,
postings_writer: &mut dyn PostingsWriter,
@@ -93,7 +93,7 @@ pub(crate) fn index_json_values<'a>(
fn index_json_object(
doc: DocId,
json_value: &serde_json::Map<String, serde_json::Value>,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
@@ -117,7 +117,7 @@ fn index_json_object(
fn index_json_value(
doc: DocId,
json_value: &serde_json::Value,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
json_term_writer: &mut JsonTermWriter,
postings_writer: &mut dyn PostingsWriter,
ctx: &mut IndexingContext,
@@ -130,10 +130,10 @@ fn index_json_value(
postings_writer.subscribe(doc, 0u32, json_term_writer.term(), ctx);
}
serde_json::Value::Number(number) => {
if let Some(number_u64) = number.as_u64() {
json_term_writer.set_fast_value(number_u64);
} else if let Some(number_i64) = number.as_i64() {
if let Some(number_i64) = number.as_i64() {
json_term_writer.set_fast_value(number_i64);
} else if let Some(number_u64) = number.as_u64() {
json_term_writer.set_fast_value(number_u64);
} else if let Some(number_f64) = number.as_f64() {
json_term_writer.set_fast_value(number_f64);
}
@@ -201,7 +201,7 @@ fn infer_type_from_str(text: &str) -> TextOrDateTime {
}
// Tries to infer a JSON type from a string.
pub(crate) fn convert_to_fast_value_and_get_term(
pub fn convert_to_fast_value_and_get_term(
json_term_writer: &mut JsonTermWriter,
phrase: &str,
) -> Option<Term> {
@@ -212,12 +212,12 @@ pub(crate) fn convert_to_fast_value_and_get_term(
DateTime::from_utc(dt_utc),
));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(i64_val) = str::parse::<i64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, i64_val));
}
if let Ok(u64_val) = str::parse::<u64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, u64_val));
}
if let Ok(f64_val) = str::parse::<f64>(phrase) {
return Some(set_fastvalue_and_get_term(json_term_writer, f64_val));
}
@@ -239,7 +239,7 @@ pub(crate) fn set_fastvalue_and_get_term<T: FastValue>(
pub(crate) fn set_string_and_get_terms(
json_term_writer: &mut JsonTermWriter,
value: &str,
text_analyzer: &TextAnalyzer,
text_analyzer: &mut TextAnalyzer,
) -> Vec<(usize, Term)> {
let mut positions_and_terms = Vec::<(usize, Term)>::new();
json_term_writer.close_path_and_set_type(Type::Str);
@@ -257,6 +257,9 @@ pub(crate) fn set_string_and_get_terms(
positions_and_terms
}
/// Writes a value of a JSON field to a `Term`.
/// The Term format is as follows:
/// `[JSON_TYPE][JSON_PATH][JSON_END_OF_PATH][VALUE_BYTES]`
pub struct JsonTermWriter<'a> {
term_buffer: &'a mut Term,
path_stack: Vec<usize>,
@@ -355,27 +358,23 @@ impl<'a> JsonTermWriter<'a> {
pub fn close_path_and_set_type(&mut self, typ: Type) {
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
buffer[buffer_len - 1] = JSON_END_OF_PATH;
self.term_buffer.set_json_path_end();
self.term_buffer.append_bytes(&[typ.to_code()]);
}
pub fn push_path_segment(&mut self, segment: &str) {
// the path stack should never be empty.
self.trim_to_end_of_path();
let buffer = self.term_buffer.value_bytes_mut();
let buffer_len = buffer.len();
if self.path_stack.len() > 1 {
buffer[buffer_len - 1] = JSON_PATH_SEGMENT_SEP;
self.term_buffer.set_json_path_separator();
}
let appended_segment = self.term_buffer.append_bytes(segment.as_bytes());
if self.expand_dots_enabled {
// We need to replace `.` by JSON_PATH_SEGMENT_SEP.
replace_in_place(b'.', JSON_PATH_SEGMENT_SEP, appended_segment);
}
self.term_buffer.push_byte(JSON_PATH_SEGMENT_SEP);
self.term_buffer.add_json_path_separator();
self.path_stack.push(self.term_buffer.len_bytes());
}
@@ -389,14 +388,14 @@ impl<'a> JsonTermWriter<'a> {
#[cfg(test)]
pub(crate) fn path(&self) -> &[u8] {
let end_of_path = self.path_stack.last().cloned().unwrap_or(1);
&self.term().value_bytes()[..end_of_path - 1]
&self.term().serialized_value_bytes()[..end_of_path - 1]
}
pub(crate) fn set_fast_value<T: FastValue>(&mut self, val: T) {
self.close_path_and_set_type(T::to_type());
let value = if T::to_type() == Type::Date {
DateTime::from_u64(val.to_u64())
.truncate(DatePrecision::Seconds)
.truncate(DATE_TIME_PRECISION_INDEXED)
.to_u64()
} else {
val.to_u64()
@@ -405,8 +404,7 @@ impl<'a> JsonTermWriter<'a> {
.append_bytes(value.to_be_bytes().as_slice());
}
#[cfg(test)]
pub(crate) fn set_str(&mut self, text: &str) {
pub fn set_str(&mut self, text: &str) {
self.close_path_and_set_type(Type::Str);
self.term_buffer.append_bytes(text.as_bytes());
}
@@ -432,12 +430,12 @@ mod tests {
json_writer.set_str("red");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"red\")"
"Term(field=1, type=Json, path=attributes.color, type=Str, \"red\")"
);
json_writer.set_str("blue");
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.color, vtype=Str, \"blue\")"
"Term(field=1, type=Json, path=attributes.color, type=Str, \"blue\")"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("dimensions");
@@ -445,14 +443,14 @@ mod tests {
json_writer.set_fast_value(400i64);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.dimensions.width, vtype=I64, 400)"
"Term(field=1, type=Json, path=attributes.dimensions.width, type=I64, 400)"
);
json_writer.pop_path_segment();
json_writer.push_path_segment("height");
json_writer.set_fast_value(300i64);
assert_eq!(
format!("{:?}", json_writer.term()),
"Term(type=Json, field=1, path=attributes.dimensions.height, vtype=I64, 300)"
"Term(field=1, type=Json, path=attributes.dimensions.height, type=I64, 300)"
);
}
@@ -464,7 +462,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}
@@ -477,7 +475,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(-4i64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00i\x7f\xff\xff\xff\xff\xff\xff\xfc"
)
}
@@ -490,7 +488,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4u64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00u\x00\x00\x00\x00\x00\x00\x00\x04"
)
}
@@ -503,7 +501,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(4.0f64);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00f\xc0\x10\x00\x00\x00\x00\x00\x00"
)
}
@@ -516,7 +514,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_fast_value(true);
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00o\x00\x00\x00\x00\x00\x00\x00\x01"
)
}
@@ -531,7 +529,7 @@ mod tests {
json_writer.push_path_segment("color");
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jattribute\x01color\x00sred"
)
}
@@ -546,7 +544,7 @@ mod tests {
json_writer.pop_path_segment();
json_writer.set_str("red");
assert_eq!(
json_writer.term().as_slice(),
json_writer.term().serialized_term(),
b"\x00\x00\x00\x01jcolor\x00sred"
)
}

View File

@@ -2,6 +2,7 @@ mod executor;
pub mod index;
mod index_meta;
mod inverted_index_reader;
#[doc(hidden)]
pub mod json_utils;
pub mod searcher;
mod segment;

View File

@@ -296,6 +296,6 @@ impl fmt::Debug for Searcher {
.iter()
.map(SegmentReader::segment_id)
.collect::<Vec<_>>();
write!(f, "Searcher({:?})", segment_ids)
write!(f, "Searcher({segment_ids:?})")
}
}

View File

@@ -2,8 +2,6 @@ use std::collections::HashMap;
use std::sync::{Arc, RwLock};
use std::{fmt, io};
use fail::fail_point;
use crate::core::{InvertedIndexReader, Segment, SegmentComponent, SegmentId};
use crate::directory::{CompositeFile, FileSlice};
use crate::error::DataCorruption;
@@ -151,7 +149,7 @@ impl SegmentReader {
let store_file = segment.open_read(SegmentComponent::Store)?;
fail_point!("SegmentReader::open#middle");
crate::fail_point!("SegmentReader::open#middle");
let postings_file = segment.open_read(SegmentComponent::Postings)?;
let postings_composite = CompositeFile::open(&postings_file)?;

View File

@@ -269,9 +269,8 @@ fn garbage_collect_works_as_intended() -> crate::Result<()> {
assert_eq!(searcher.num_docs(), 8_000);
assert!(
mem_right_after_merge_finished < mem_right_after_commit,
"(mem after merge){} is expected < (mem before merge){}",
mem_right_after_merge_finished,
mem_right_after_commit
"(mem after merge){mem_right_after_merge_finished} is expected < (mem before \
merge){mem_right_after_commit}"
);
Ok(())
}

View File

@@ -116,14 +116,14 @@ impl fmt::Debug for Incompatibility {
index_compression_format,
} => {
let err = format!(
"Library was compiled with {:?} compression, index was compressed with {:?}",
library_compression_format, index_compression_format
"Library was compiled with {library_compression_format:?} compression, index \
was compressed with {index_compression_format:?}"
);
let advice = format!(
"Change the feature flag to {:?} and rebuild the library",
index_compression_format
"Change the feature flag to {index_compression_format:?} and rebuild the \
library"
);
write!(f, "{}. {}", err, advice)?;
write!(f, "{err}. {advice}")?;
}
Incompatibility::IndexMismatch {
library_version,
@@ -140,7 +140,7 @@ impl fmt::Debug for Incompatibility {
and rebuild your project.",
index_version.index_format_version, index_version.major, index_version.minor
);
write!(f, "{}. {}", err, advice)?;
write!(f, "{err}. {advice}")?;
}
}

View File

@@ -73,9 +73,9 @@ impl Footer {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"Footer seems invalid as it suggests a footer len of {}. File is corrupted, \
or the index was created with a different & old version of tantivy.",
footer_len
"Footer seems invalid as it suggests a footer len of {footer_len}. File is \
corrupted, or the index was created with a different & old version of \
tantivy."
),
));
}
@@ -84,8 +84,8 @@ impl Footer {
return Err(io::Error::new(
io::ErrorKind::UnexpectedEof,
format!(
"File corrupted. The file is smaller than it's footer bytes (len={}).",
total_footer_size
"File corrupted. The file is smaller than it's footer bytes \
(len={total_footer_size})."
),
));
}

View File

@@ -69,7 +69,7 @@ impl ManagedDirectory {
.map_err(|e| {
DataCorruption::new(
MANAGED_FILEPATH.to_path_buf(),
format!("Managed file cannot be deserialized: {:?}. ", e),
format!("Managed file cannot be deserialized: {e:?}. "),
)
})?;
Ok(ManagedDirectory {

View File

@@ -1,10 +1,10 @@
use std::collections::HashMap;
use std::fmt;
use std::fs::{self, File, OpenOptions};
use std::io::{self, BufWriter, Read, Seek, Write};
use std::ops::Deref;
use std::path::{Path, PathBuf};
use std::sync::{Arc, RwLock, Weak};
use std::{fmt, result};
use common::StableDeref;
use fs4::FileExt;
@@ -21,6 +21,8 @@ use crate::directory::{
AntiCallToken, Directory, DirectoryLock, FileHandle, Lock, OwnedBytes, TerminatingWrite,
WatchCallback, WatchHandle, WritePtr,
};
#[cfg(unix)]
use crate::Advice;
pub type ArcBytes = Arc<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
pub type WeakArcBytes = Weak<dyn Deref<Target = [u8]> + Send + Sync + 'static>;
@@ -32,7 +34,7 @@ pub(crate) fn make_io_err(msg: String) -> io::Error {
/// Returns `None` iff the file exists, can be read, but is empty (and hence
/// cannot be mmapped)
fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
fn open_mmap(full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
let file = File::open(full_path).map_err(|io_err| {
if io_err.kind() == io::ErrorKind::NotFound {
OpenReadError::FileDoesNotExist(full_path.to_path_buf())
@@ -50,11 +52,13 @@ fn open_mmap(full_path: &Path) -> result::Result<Option<Mmap>, OpenReadError> {
// instead.
return Ok(None);
}
unsafe {
let mmap_opt: Option<memmap2::Mmap> = unsafe {
memmap2::Mmap::map(&file)
.map(Some)
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, full_path.to_path_buf()))
}
}?;
Ok(mmap_opt)
}
#[derive(Default, Clone, Debug, Serialize, Deserialize)]
@@ -72,13 +76,28 @@ pub struct CacheInfo {
pub mmapped: Vec<PathBuf>,
}
#[derive(Default)]
struct MmapCache {
counters: CacheCounters,
cache: HashMap<PathBuf, WeakArcBytes>,
#[cfg(unix)]
madvice_opt: Option<Advice>,
}
impl MmapCache {
fn new() -> MmapCache {
MmapCache {
counters: CacheCounters::default(),
cache: HashMap::default(),
#[cfg(unix)]
madvice_opt: None,
}
}
#[cfg(unix)]
fn set_advice(&mut self, madvice: Advice) {
self.madvice_opt = Some(madvice);
}
fn get_info(&self) -> CacheInfo {
let paths: Vec<PathBuf> = self.cache.keys().cloned().collect();
CacheInfo {
@@ -99,6 +118,16 @@ impl MmapCache {
}
}
fn open_mmap_impl(&self, full_path: &Path) -> Result<Option<Mmap>, OpenReadError> {
let mmap_opt = open_mmap(full_path)?;
#[cfg(unix)]
if let (Some(mmap), Some(madvice)) = (mmap_opt.as_ref(), self.madvice_opt) {
// We ignore madvise errors.
let _ = mmap.advise(madvice);
}
Ok(mmap_opt)
}
// Returns None if the file exists but as a len of 0 (and hence is not mmappable).
fn get_mmap(&mut self, full_path: &Path) -> Result<Option<ArcBytes>, OpenReadError> {
if let Some(mmap_weak) = self.cache.get(full_path) {
@@ -109,7 +138,7 @@ impl MmapCache {
}
self.cache.remove(full_path);
self.counters.miss += 1;
let mmap_opt = open_mmap(full_path)?;
let mmap_opt = self.open_mmap_impl(full_path)?;
Ok(mmap_opt.map(|mmap| {
let mmap_arc: ArcBytes = Arc::new(mmap);
let mmap_weak = Arc::downgrade(&mmap_arc);
@@ -146,7 +175,7 @@ struct MmapDirectoryInner {
impl MmapDirectoryInner {
fn new(root_path: PathBuf, temp_directory: Option<TempDir>) -> MmapDirectoryInner {
MmapDirectoryInner {
mmap_cache: Default::default(),
mmap_cache: RwLock::new(MmapCache::new()),
_temp_directory: temp_directory,
watcher: FileWatcher::new(&root_path.join(*META_FILEPATH)),
root_path,
@@ -185,12 +214,31 @@ impl MmapDirectory {
))
}
/// Opens a MmapDirectory in a directory, with a given access pattern.
///
/// This is only supported on unix platforms.
#[cfg(unix)]
pub fn open_with_madvice(
directory_path: impl AsRef<Path>,
madvice: Advice,
) -> Result<MmapDirectory, OpenDirectoryError> {
let dir = Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())?;
dir.inner.mmap_cache.write().unwrap().set_advice(madvice);
Ok(dir)
}
/// Opens a MmapDirectory in a directory.
///
/// Returns an error if the `directory_path` does not
/// exist or if it is not a directory.
pub fn open<P: AsRef<Path>>(directory_path: P) -> Result<MmapDirectory, OpenDirectoryError> {
let directory_path: &Path = directory_path.as_ref();
pub fn open(directory_path: impl AsRef<Path>) -> Result<MmapDirectory, OpenDirectoryError> {
Self::open_impl_to_avoid_monomorphization(directory_path.as_ref())
}
#[inline(never)]
fn open_impl_to_avoid_monomorphization(
directory_path: &Path,
) -> Result<MmapDirectory, OpenDirectoryError> {
if !directory_path.exists() {
return Err(OpenDirectoryError::DoesNotExist(PathBuf::from(
directory_path,
@@ -326,15 +374,12 @@ pub(crate) fn atomic_write(path: &Path, content: &[u8]) -> io::Result<()> {
}
impl Directory for MmapDirectory {
fn get_file_handle(&self, path: &Path) -> result::Result<Arc<dyn FileHandle>, OpenReadError> {
fn get_file_handle(&self, path: &Path) -> Result<Arc<dyn FileHandle>, OpenReadError> {
debug!("Open Read {:?}", path);
let full_path = self.resolve_path(path);
let mut mmap_cache = self.inner.mmap_cache.write().map_err(|_| {
let msg = format!(
"Failed to acquired write lock on mmap cache while reading {:?}",
path
);
let msg = format!("Failed to acquired write lock on mmap cache while reading {path:?}");
let io_err = make_io_err(msg);
OpenReadError::wrap_io_error(io_err, path.to_path_buf())
})?;
@@ -352,7 +397,7 @@ impl Directory for MmapDirectory {
/// Any entry associated with the path in the mmap will be
/// removed before the file is deleted.
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
fn delete(&self, path: &Path) -> Result<(), DeleteError> {
let full_path = self.resolve_path(path);
fs::remove_file(full_path).map_err(|e| {
if e.kind() == io::ErrorKind::NotFound {
@@ -369,7 +414,9 @@ impl Directory for MmapDirectory {
fn exists(&self, path: &Path) -> Result<bool, OpenReadError> {
let full_path = self.resolve_path(path);
Ok(full_path.exists())
full_path
.try_exists()
.map_err(|io_err| OpenReadError::wrap_io_error(io_err, path.to_path_buf()))
}
fn open_write(&self, path: &Path) -> Result<WritePtr, OpenWriteError> {

View File

@@ -5,7 +5,6 @@ use std::sync::{Arc, RwLock};
use std::{fmt, result};
use common::HasLen;
use fail::fail_point;
use super::FileHandle;
use crate::core::META_FILEPATH;
@@ -184,7 +183,7 @@ impl Directory for RamDirectory {
}
fn delete(&self, path: &Path) -> result::Result<(), DeleteError> {
fail_point!("RamDirectory::delete", |_| {
crate::fail_point!("RamDirectory::delete", |_| {
Err(DeleteError::IoError {
io_error: Arc::new(io::Error::from(io::ErrorKind::Other)),
filepath: path.to_path_buf(),

View File

@@ -44,7 +44,7 @@ impl fmt::Debug for DataCorruption {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> Result<(), fmt::Error> {
write!(f, "Data corruption")?;
if let Some(ref filepath) = &self.filepath {
write!(f, " (in file `{:?}`)", filepath)?;
write!(f, " (in file `{filepath:?}`)")?;
}
write!(f, ": {}.", self.comment)?;
Ok(())
@@ -120,7 +120,7 @@ impl From<DataCorruption> for TantivyError {
}
impl From<FastFieldNotAvailableError> for TantivyError {
fn from(fastfield_error: FastFieldNotAvailableError) -> TantivyError {
TantivyError::SchemaError(format!("{}", fastfield_error))
TantivyError::SchemaError(format!("{fastfield_error}"))
}
}
impl From<LockError> for TantivyError {
@@ -131,7 +131,7 @@ impl From<LockError> for TantivyError {
impl From<query::QueryParserError> for TantivyError {
fn from(parsing_error: query::QueryParserError) -> TantivyError {
TantivyError::InvalidArgument(format!("Query is invalid. {:?}", parsing_error))
TantivyError::InvalidArgument(format!("Query is invalid. {parsing_error:?}"))
}
}
@@ -161,7 +161,7 @@ impl From<time::error::ComponentRange> for TantivyError {
impl From<schema::DocParsingError> for TantivyError {
fn from(error: schema::DocParsingError) -> TantivyError {
TantivyError::InvalidArgument(format!("Failed to parse document {:?}", error))
TantivyError::InvalidArgument(format!("Failed to parse document {error:?}"))
}
}

View File

@@ -14,7 +14,7 @@
//! Fields have to be declared as `FAST` in the schema.
//! Currently supported fields are: u64, i64, f64, bytes, ip and text.
//!
//! Fast fields are stored in with [different codecs](fastfield_codecs). The best codec is detected
//! Fast fields are stored in with [different codecs](columnar). The best codec is detected
//! automatically, when serializing.
//!
//! Read access performance is comparable to that of an array lookup.
@@ -90,11 +90,12 @@ mod tests {
use crate::directory::{Directory, RamDirectory, WritePtr};
use crate::merge_policy::NoMergePolicy;
use crate::schema::{
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder, FAST,
INDEXED, STORED, STRING, TEXT,
Document, Facet, FacetOptions, Field, JsonObjectOptions, Schema, SchemaBuilder,
TextOptions, FAST, INDEXED, STORED, STRING, TEXT,
};
use crate::time::OffsetDateTime;
use crate::{DateOptions, DatePrecision, Index, SegmentId, SegmentReader};
use crate::tokenizer::{LowerCaser, RawTokenizer, TextAnalyzer, TokenizerManager};
use crate::{DateOptions, DateTimePrecision, Index, SegmentId, SegmentReader};
pub static SCHEMA: Lazy<Schema> = Lazy::new(|| {
let mut schema_builder = Schema::builder();
@@ -130,7 +131,7 @@ mod tests {
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 95);
assert_eq!(file.len(), 93);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let column = fast_field_readers
.u64("field")
@@ -180,7 +181,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 123);
assert_eq!(file.len(), 121);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
.u64("field")
@@ -213,7 +214,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 96);
assert_eq!(file.len(), 94);
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let fast_field_reader = fast_field_readers
.u64("field")
@@ -245,7 +246,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 4491);
assert_eq!(file.len(), 4489);
{
let fast_field_readers = FastFieldReaders::open(file, SCHEMA.clone()).unwrap();
let col = fast_field_readers
@@ -278,7 +279,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 267);
assert_eq!(file.len(), 265);
{
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
@@ -685,12 +686,12 @@ mod tests {
let mut schema_builder = Schema::builder();
let date_field = schema_builder.add_date_field(
"date",
DateOptions::from(FAST).set_precision(DatePrecision::Microseconds),
DateOptions::from(FAST).set_precision(DateTimePrecision::Nanoseconds),
);
let multi_date_field = schema_builder.add_date_field(
"multi_date",
DateOptions::default()
.set_precision(DatePrecision::Microseconds)
.set_precision(DateTimePrecision::Nanoseconds)
.set_fast(),
);
let schema = schema_builder.build();
@@ -724,25 +725,25 @@ mod tests {
.column_opt::<DateTime>("multi_date")
.unwrap()
.unwrap();
let mut dates = vec![];
let mut dates = Vec::new();
{
assert_eq!(date_fast_field.get_val(0).into_timestamp_micros(), 1i64);
assert_eq!(date_fast_field.get_val(0).into_timestamp_nanos(), 1i64);
dates_fast_field.fill_vals(0u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 2i64);
assert_eq!(dates[1].into_timestamp_micros(), 3i64);
assert_eq!(dates[0].into_timestamp_nanos(), 2i64);
assert_eq!(dates[1].into_timestamp_nanos(), 3i64);
}
{
assert_eq!(date_fast_field.get_val(1).into_timestamp_micros(), 4i64);
assert_eq!(date_fast_field.get_val(1).into_timestamp_nanos(), 4i64);
dates_fast_field.fill_vals(1u32, &mut dates);
assert!(dates.is_empty());
}
{
assert_eq!(date_fast_field.get_val(2).into_timestamp_micros(), 0i64);
assert_eq!(date_fast_field.get_val(2).into_timestamp_nanos(), 0i64);
dates_fast_field.fill_vals(2u32, &mut dates);
assert_eq!(dates.len(), 2);
assert_eq!(dates[0].into_timestamp_micros(), 5i64);
assert_eq!(dates[1].into_timestamp_micros(), 6i64);
assert_eq!(dates[0].into_timestamp_nanos(), 5i64);
assert_eq!(dates[1].into_timestamp_nanos(), 6i64);
}
Ok(())
}
@@ -772,7 +773,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 104);
assert_eq!(file.len(), 102);
let fast_field_readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = fast_field_readers.bool("field_bool").unwrap();
assert_eq!(bool_col.first(0), Some(true));
@@ -804,7 +805,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 116);
assert_eq!(file.len(), 114);
let readers = FastFieldReaders::open(file, schema).unwrap();
let bool_col = readers.bool("field_bool").unwrap();
for i in 0..25 {
@@ -829,7 +830,7 @@ mod tests {
write.terminate().unwrap();
}
let file = directory.open_read(path).unwrap();
assert_eq!(file.len(), 106);
assert_eq!(file.len(), 104);
let fastfield_readers = FastFieldReaders::open(file, schema).unwrap();
let col = fastfield_readers.bool("field_bool").unwrap();
assert_eq!(col.first(0), None);
@@ -861,9 +862,9 @@ mod tests {
#[test]
pub fn test_gcd_date() {
let size_prec_sec = test_gcd_date_with_codec(DatePrecision::Seconds);
let size_prec_sec = test_gcd_date_with_codec(DateTimePrecision::Seconds);
assert!((1000 * 13 / 8..100 + 1000 * 13 / 8).contains(&size_prec_sec.get_bytes())); // 13 bits per val = ceil(log_2(number of seconds in 2hours);
let size_prec_micros = test_gcd_date_with_codec(DatePrecision::Microseconds);
let size_prec_micros = test_gcd_date_with_codec(DateTimePrecision::Microseconds);
assert!((1000 * 33 / 8..100 + 1000 * 33 / 8).contains(&size_prec_micros.get_bytes()));
// 33 bits per
// val = ceil(log_2(number
@@ -871,7 +872,7 @@ mod tests {
// in 2hours);
}
fn test_gcd_date_with_codec(precision: DatePrecision) -> ByteCount {
fn test_gcd_date_with_codec(precision: DateTimePrecision) -> ByteCount {
let mut rng = StdRng::seed_from_u64(2u64);
const T0: i64 = 1_662_345_825_012_529i64;
const ONE_HOUR_IN_MICROSECS: i64 = 3_600 * 1_000_000;
@@ -1081,7 +1082,7 @@ mod tests {
#[test]
fn test_fast_field_in_json_field_expand_dots_disabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast();
let json_option = JsonObjectOptions::default().set_fast(None);
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
@@ -1104,11 +1105,36 @@ mod tests {
assert_eq!(&vals, &[32])
}
#[test]
fn test_fast_field_in_json_field_with_tokenizer() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default().set_fast(Some("default"));
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(json => json!({"age": 32})))
.unwrap();
index_writer
.add_document(doc!(json => json!({"age": "NEW"})))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_fields = searcher.segment_reader(0u32).fast_fields();
let ff_str = fast_fields.str("json.age").unwrap().unwrap();
let mut output = String::new();
ff_str.ord_to_str(0, &mut output).unwrap();
assert_eq!(output, "new");
}
#[test]
fn test_fast_field_in_json_field_expand_dots_enabled() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled();
let json = schema_builder.add_json_field("json", json_option);
let schema = schema_builder.build();
@@ -1173,6 +1199,35 @@ mod tests {
assert_eq!(&vals, &[33]);
}
#[test]
fn test_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();
let opt = TextOptions::default().set_fast(Some("custom_lowercase"));
let text_field = schema_builder.add_text_field("text", opt);
let schema = schema_builder.build();
let ff_tokenizer_manager = TokenizerManager::default();
ff_tokenizer_manager.register(
"custom_lowercase",
TextAnalyzer::builder(RawTokenizer::default())
.filter(LowerCaser)
.build(),
);
let mut index = Index::create_in_ram(schema);
index.set_fast_field_tokenizers(ff_tokenizer_manager);
let mut index_writer = index.writer_for_tests().unwrap();
index_writer
.add_document(doc!(text_field => "Test1 test2"))
.unwrap();
index_writer.commit().unwrap();
let searcher = index.reader().unwrap().searcher();
let fast_field_reader = searcher.segment_reader(0u32).fast_fields();
let column = fast_field_reader.str("text").unwrap().unwrap();
let mut out = String::new();
column.ord_to_str(0u64, &mut out).unwrap();
assert_eq!(&out, "test1 test2");
}
#[test]
fn test_text_fast_field_tokenizer() {
let mut schema_builder = Schema::builder();
@@ -1216,7 +1271,7 @@ mod tests {
fn test_shadowing_fast_field_with_expand_dots() {
let mut schema_builder = Schema::builder();
let json_option = JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled();
let json_field = schema_builder.add_json_field("jsonfield", json_option.clone());
let shadowing_json_field = schema_builder.add_json_field("jsonfield.attr", json_option);

View File

@@ -87,9 +87,8 @@ impl FastFieldReaders {
) -> crate::Result<Option<String>> {
let Some((field, path)): Option<(Field, &str)> = self
.schema
.find_field(field_name)
.or_else(|| default_field_opt.map(|default_field| (default_field, field_name)))
else{
.find_field_with_default(field_name, default_field_opt)
else {
return Ok(None);
};
let field_entry: &FieldEntry = self.schema.get_field_entry(field);
@@ -98,22 +97,17 @@ impl FastFieldReaders {
"Field {field_name:?} is not configured as fast field"
)));
}
let field_name = self.schema.get_field_name(field);
if path.is_empty() {
return Ok(Some(field_name.to_string()));
}
let field_type = field_entry.field_type();
match (field_type, path) {
Ok(match (field_entry.field_type(), path) {
(FieldType::JsonObject(json_options), path) if !path.is_empty() => {
Ok(Some(encode_column_name(
Some(encode_column_name(
field_entry.name(),
path,
json_options.is_expand_dots_enabled(),
)))
))
}
(_, "") => Ok(Some(field_entry.name().to_string())),
_ => Ok(None),
}
(_, "") => Some(field_entry.name().to_string()),
_ => None,
})
}
/// Returns a typed column associated to a given field name.
@@ -126,7 +120,8 @@ impl FastFieldReaders {
T: HasAssociatedColumnType,
DynamicColumn: Into<Option<Column<T>>>,
{
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, T::column_type())?
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, T::column_type())?
else {
return Ok(None);
};
@@ -202,7 +197,8 @@ impl FastFieldReaders {
/// Returns a `str` column.
pub fn str(&self, field_name: &str) -> crate::Result<Option<StrColumn>> {
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Str)?
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Str)?
else {
return Ok(None);
};
@@ -212,7 +208,8 @@ impl FastFieldReaders {
/// Returns a `bytes` column.
pub fn bytes(&self, field_name: &str) -> crate::Result<Option<BytesColumn>> {
let Some(dynamic_column_handle) = self.dynamic_column_handle(field_name, ColumnType::Bytes)?
let Some(dynamic_column_handle) =
self.dynamic_column_handle(field_name, ColumnType::Bytes)?
else {
return Ok(None);
};
@@ -278,6 +275,34 @@ impl FastFieldReaders {
Ok(None)
}
/// Returns the all `u64` column used to represent any `u64`-mapped typed (String/Bytes term
/// ids, i64, u64, f64, DateTime).
///
/// In case of JSON, there may be two columns. One for term and one for numerical types. (This
/// may change later to 3 types if JSON handles DateTime)
#[doc(hidden)]
pub fn u64_lenient_for_type_all(
&self,
type_white_list_opt: Option<&[ColumnType]>,
field_name: &str,
) -> crate::Result<Vec<(Column<u64>, ColumnType)>> {
let mut columns_and_types = Vec::new();
let Some(resolved_field_name) = self.resolve_field(field_name)? else {
return Ok(columns_and_types);
};
for col in self.columnar.read_columns(&resolved_field_name)? {
if let Some(type_white_list) = type_white_list_opt {
if !type_white_list.contains(&col.column_type()) {
continue;
}
}
if let Some(col_u64) = col.open_u64_lenient()? {
columns_and_types.push((col_u64, col.column_type()));
}
}
Ok(columns_and_types)
}
/// Returns the `u64` column used to represent any `u64`-mapped typed (i64, u64, f64, DateTime).
///
/// Returns Ok(None) for empty columns
@@ -324,7 +349,7 @@ mod tests {
schema_builder.add_json_field(
"json_expand_dots_enabled",
JsonObjectOptions::default()
.set_fast()
.set_fast(None)
.set_expand_dots_enabled(),
);
let dynamic_field = schema_builder.add_json_field("_dyna", FAST);

View File

@@ -8,7 +8,7 @@ use crate::indexer::doc_id_mapping::DocIdMapping;
use crate::schema::term::{JSON_PATH_SEGMENT_SEP, JSON_PATH_SEGMENT_SEP_STR};
use crate::schema::{value_type_to_column_type, Document, FieldType, Schema, Type, Value};
use crate::tokenizer::{TextAnalyzer, TokenizerManager};
use crate::{DatePrecision, DocId, TantivyError};
use crate::{DateTimePrecision, DocId, TantivyError};
/// Only index JSON down to a depth of 20.
/// This is mostly to guard us from a stack overflow triggered by malicious input.
@@ -19,7 +19,7 @@ pub struct FastFieldsWriter {
columnar_writer: ColumnarWriter,
fast_field_names: Vec<Option<String>>, //< TODO see if we can hash the field name hash too.
per_field_tokenizer: Vec<Option<TextAnalyzer>>,
date_precisions: Vec<DatePrecision>,
date_precisions: Vec<DateTimePrecision>,
expand_dots: Vec<bool>,
num_docs: DocId,
// Buffer that we recycle to avoid allocation.
@@ -30,7 +30,7 @@ impl FastFieldsWriter {
/// Create all `FastFieldWriter` required by the schema.
#[cfg(test)]
pub fn from_schema(schema: &Schema) -> crate::Result<FastFieldsWriter> {
FastFieldsWriter::from_schema_and_tokenizer_manager(&schema, TokenizerManager::new())
FastFieldsWriter::from_schema_and_tokenizer_manager(schema, TokenizerManager::new())
}
/// Create all `FastFieldWriter` required by the schema.
@@ -41,12 +41,12 @@ impl FastFieldsWriter {
let mut columnar_writer = ColumnarWriter::default();
let mut fast_field_names: Vec<Option<String>> = vec![None; schema.num_fields()];
let mut date_precisions: Vec<DatePrecision> =
std::iter::repeat_with(DatePrecision::default)
let mut date_precisions: Vec<DateTimePrecision> =
std::iter::repeat_with(DateTimePrecision::default)
.take(schema.num_fields())
.collect();
let mut expand_dots = vec![false; schema.num_fields()];
let mut per_field_tokenizer = vec![None; schema.num_fields()];
let mut per_field_tokenizer: Vec<Option<TextAnalyzer>> = vec![None; schema.num_fields()];
// TODO see other types
for (field_id, field_entry) in schema.fields() {
if !field_entry.field_type().is_fast() {
@@ -58,6 +58,15 @@ impl FastFieldsWriter {
date_precisions[field_id.field_id() as usize] = date_options.get_precision();
}
if let FieldType::JsonObject(json_object_options) = field_entry.field_type() {
if let Some(tokenizer_name) = json_object_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
}
expand_dots[field_id.field_id() as usize] =
json_object_options.is_expand_dots_enabled();
}
@@ -65,8 +74,7 @@ impl FastFieldsWriter {
if let Some(tokenizer_name) = text_options.get_fast_field_tokenizer_name() {
let text_analyzer = tokenizer_manager.get(tokenizer_name).ok_or_else(|| {
TantivyError::InvalidArgument(format!(
"Tokenizer {:?} not found",
tokenizer_name
"Tokenizer {tokenizer_name:?} not found"
))
})?;
per_field_tokenizer[field_id.field_id() as usize] = Some(text_analyzer);
@@ -138,10 +146,10 @@ impl FastFieldsWriter {
);
}
Value::Str(text_val) => {
if let Some(text_analyzer) =
&self.per_field_tokenizer[field_value.field().field_id() as usize]
if let Some(tokenizer) =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize]
{
let mut token_stream = text_analyzer.token_stream(text_val);
let mut token_stream = tokenizer.token_stream(text_val);
token_stream.process(&mut |token: &Token| {
self.columnar_writer.record_str(
doc_id,
@@ -192,6 +200,10 @@ impl FastFieldsWriter {
let expand_dots = self.expand_dots[field_value.field().field_id() as usize];
self.json_path_buffer.clear();
self.json_path_buffer.push_str(field_name);
let text_analyzer =
&mut self.per_field_tokenizer[field_value.field().field_id() as usize];
record_json_obj_to_columnar_writer(
doc_id,
json_obj,
@@ -199,6 +211,7 @@ impl FastFieldsWriter {
JSON_DEPTH_LIMIT,
&mut self.json_path_buffer,
&mut self.columnar_writer,
text_analyzer,
);
}
Value::IpAddr(ip_addr) => {
@@ -250,6 +263,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit: usize,
json_path_buffer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
for (key, child) in json_obj {
let len_path = json_path_buffer.len();
@@ -274,6 +288,7 @@ fn record_json_obj_to_columnar_writer(
remaining_depth_limit,
json_path_buffer,
columnar_writer,
tokenizer,
);
// popping our sub path.
json_path_buffer.truncate(len_path);
@@ -287,6 +302,7 @@ fn record_json_value_to_columnar_writer(
mut remaining_depth_limit: usize,
json_path_writer: &mut String,
columnar_writer: &mut columnar::ColumnarWriter,
tokenizer: &mut Option<TextAnalyzer>,
) {
if remaining_depth_limit == 0 {
return;
@@ -305,7 +321,14 @@ fn record_json_value_to_columnar_writer(
}
}
serde_json::Value::String(text) => {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
if let Some(text_analyzer) = tokenizer.as_mut() {
let mut token_stream = text_analyzer.token_stream(text);
token_stream.process(&mut |token| {
columnar_writer.record_str(doc, json_path_writer.as_str(), &token.text);
})
} else {
columnar_writer.record_str(doc, json_path_writer.as_str(), text);
}
}
serde_json::Value::Array(arr) => {
for el in arr {
@@ -316,6 +339,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
);
}
}
@@ -327,6 +351,7 @@ fn record_json_value_to_columnar_writer(
remaining_depth_limit,
json_path_writer,
columnar_writer,
tokenizer,
);
}
}
@@ -354,6 +379,7 @@ mod tests {
JSON_DEPTH_LIMIT,
&mut json_path,
&mut columnar_writer,
&mut None,
);
}
let mut buffer = Vec::new();

Some files were not shown because too many files have changed in this diff Show More