Mirror of https://github.com/quickwit-oss/tantivy.git, synced 2025-12-28 13:02:55 +00:00.

Compare commits: update_exa ... 0.24.2 (149 commits)
Commit SHA1s in this comparison:
a3175f5341, 203b0eebf1, eb37dbee26, c6e77d27c6, db6587ed9b, 3fa90e70e2, 6ab4102253, 11c6329ca5, ab8bb93928, 2b668bd2bf,
97a7137ef8, ffa7cdf397, caf1275e60, fb12b7be28, 6f77083493, cd7745da7a, eb8304dee9, e5638112a9, 81110152fb, ae88a7ece5,
bdd5f80fd9, 3f62ef22e5, 8102e19e48, 175c853ea7, c992cf3f37, 83f6c2f265, 17bf8aa092, 6fc0e96ff8, 06d2dcf469, b681ec9335,
da2ff5712a, 18da402e27, 18ae3ffe94, 0a37b7acaa, 1a9fd885dd, 3e660905a7, 0c2b984cb4, a69b1c609c, 8d4a6fcaba, feced4762f,
0149317c5a, 3fcb6f9597, 388fcd763b, e488f9e6a2, 9426d5be7b, d5d2d41264, 80f5f1ecd4, 519e5d2ed1, df2d52a84e, 371dba9414,
0afabad494, 89b052cd42, c48c649436, 58c0739953, e7daf69de9, f060e86bc6, 0368162ef0, e843c71015, 5cea16ef9f, 4aa8cd2470,
4d4ee1b0ac, 43c89b4360, d281ca3e65, be17daf658, 6ca84a61fa, 037d12c9c9, 71cf19870b, 175a529c41, fe0c7c5408, 148594f0f9,
8edb439440, dfff5f3bcb, ebf4d84553, 42efc7f7c8, 192395c311, a1447cc9c2, c39d91f827, 32b6e9711b, 24c5dc2398, 9e2ddec4b3,
1f6a8e74bb, 7e901f523b, 3c30a41c14, 0f99d4f420, 6e02c5cb25, 876a579e5d, 4c52499622, 0bac391291, 52d4e81e70, c71ea7b2ef,
c35a782747, c66af2c0a9, f9ac055847, 21d057059e, dca508b4ca, aebae9965d, e7e3e3f44c, 2f2db16ec1, d152e29687, 285bcc25c9,
7b65ad922d, 99be20cedd, 5f026901b8, 6dfa2df06f, c17e513377, 2f5a269c70, 50532260e3, 8bd6eb06e6, 55b0b52457, 56fc56c5b9,
85395d942a, a206c3ccd3, dc5d31c116, 95a4ddea3e, ab5125d3dc, 9f81d59ecd, c71ec8086d, 27be6aed91, 3d1c4b313a, 0d4e319965,
75dc3eb298, 3f6d225086, d8843c608c, 7ebcc15b17, 1b4076691f, eab660873a, 232f37126e, 13e9885dfd, 56d79cb203, 0f4c2e27cf,
f9ae295507, d9db5302d9, e453848134, 59084143ef, 511b027350, 322f47eb47, 72f61ff89c, a141c3ec59, e90e7a25ae, c3b92a5412,
2f55511064, 08b9fc0b31, 714f363d43, 93ff7365b0, 8151925068, b960e40bc8, 1095c9b073, c0686515a9, 455156f51c
.github/workflows/coverage.yml (vendored): 4 changed lines
@@ -15,11 +15,11 @@ jobs:
    steps:
      - uses: actions/checkout@v4
      - name: Install Rust
        run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
        run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
      - uses: Swatinem/rust-cache@v2
      - uses: taiki-e/install-action@cargo-llvm-cov
      - name: Generate code coverage
        run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
        run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        continue-on-error: true
@@ -46,7 +46,7 @@ The file of a segment has the format

```segment-id . ext```

The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.

A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.

@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.

Tantivy's document follows a very strict schema, decided before building any index.

The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.

Depending on the type of the field, you can decide to
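The schema described in this hunk is defined programmatically before any indexing happens. As a minimal sketch of what such a schema definition looks like with tantivy's schema builder (the field names here are made up for illustration):

```rust
use tantivy::schema::{Schema, FAST, STORED, TEXT};

fn build_example_schema() -> Schema {
    let mut schema_builder = Schema::builder();
    // A tokenized, stored text field and a fast (columnar) u64 field.
    schema_builder.add_text_field("title", TEXT | STORED);
    schema_builder.add_u64_field("score", FAST);
    schema_builder.build()
}
```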
CHANGELOG.md: 102 changed lines
@@ -1,3 +1,85 @@
Tantivy 0.23 - Unreleased
================================
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75.

#### Bugfixes
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
- fix `OwnedBytes` debug panic [#2512](https://github.com/quickwit-oss/tantivy/pull/2512)(@b41sh)
- catch panics during merges [#2582](https://github.com/quickwit-oss/tantivy/pull/2582)(@rdettai)
- switch from u32 to usize in bitpacker. This enables multivalued columns larger than 4GB, which crashed during merge before. [#2581](https://github.com/quickwit-oss/tantivy/pull/2581) [#2586](https://github.com/quickwit-oss/tantivy/pull/2586)(@fulmicoton-dd @PSeitz)

#### Breaking API Changes
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)

#### Features/Improvements
- **Aggregation**
- Support for cardinality aggregation [#2337](https://github.com/quickwit-oss/tantivy/pull/2337) [#2446](https://github.com/quickwit-oss/tantivy/pull/2446) (@raphaelcoeffic @PSeitz)
- Support for extended stats aggregation [#2247](https://github.com/quickwit-oss/tantivy/pull/2247)(@giovannicuccu)
- Add Key::I64 and Key::U64 variants in aggregation to avoid f64 precision issues [#2468](https://github.com/quickwit-oss/tantivy/pull/2468)(@PSeitz)
- Faster term aggregation fetch terms [#2447](https://github.com/quickwit-oss/tantivy/pull/2447)(@PSeitz)
- Improve custom order deserialization [#2451](https://github.com/quickwit-oss/tantivy/pull/2451)(@PSeitz)
- Change AggregationLimits behavior [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- lower contention on AggregationLimits [#2394](https://github.com/quickwit-oss/tantivy/pull/2394)(@PSeitz)
- fix postcard compatibility for top_hits, add postcard test [#2346](https://github.com/quickwit-oss/tantivy/pull/2346)(@PSeitz)
- reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
- check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
- Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- add support for counting non integer in aggregation [#2547](https://github.com/quickwit-oss/tantivy/pull/2547)(@trinity-1686a)
- **Range Queries**
- Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
- Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
- modify fastfield range query heuristic [#2375](https://github.com/quickwit-oss/tantivy/pull/2375)(@trinity-1686a)
- add FastFieldRangeQuery for explicit range queries on fast field (for `RangeQuery` it is autodetected) [#2477](https://github.com/quickwit-oss/tantivy/pull/2477)(@PSeitz)

- add format backwards-compatibility tests [#2485](https://github.com/quickwit-oss/tantivy/pull/2485)(@PSeitz)
- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
- Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
- Make `NUM_MERGE_THREADS` configurable [#2535](https://github.com/quickwit-oss/tantivy/pull/2535)(@Barre)

- **RegexPhraseQuery**
  `RegexPhraseQuery` supports phrase queries with regex. E.g. query "b.* b.* wolf" matches "big bad wolf". Slop is supported as well: "b.* wolf"~2 matches "big bad wolf" [#2516](https://github.com/quickwit-oss/tantivy/pull/2516)(@PSeitz)

- **Optional Index in Multivalue Columnar Index**
  For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case).
  This is alleviated by placing an optional index in the multivalued index to mark documents that have values.
  This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)

- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)

- **Performace/Memory**
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
- Recycling buffer in PrefixPhraseScorer [#2443](https://github.com/quickwit-oss/tantivy/pull/2443)(@fulmicoton)

- **Json Type**
- JSON supports now all values on the root level. Previously an object was required. This enables support for flat mixed types. allow more JSON values, fix i64 special case [#2383](https://github.com/quickwit-oss/tantivy/pull/2383)(@PSeitz)
- add json path constructor to term [#2367](https://github.com/quickwit-oss/tantivy/pull/2367)(@PSeitz)

- **QueryParser**
- fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
- improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
- Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)
- allow term starting with wildcard [#2568](https://github.com/quickwit-oss/tantivy/pull/2568)(@trinity-1686a)

- Exist queries match subpath fields [#2558](https://github.com/quickwit-oss/tantivy/pull/2558)(@rdettai)
- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
- use bingang for agg and stacker benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)[#2492](https://github.com/quickwit-oss/tantivy/pull/2492)(@PSeitz)
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)

Tantivy 0.22
================================

@@ -8,7 +90,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
- Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
- Fix bug occuring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix bug occurring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
- Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)

@@ -26,7 +108,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
- Support ip adresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support ip addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)

@@ -116,7 +198,7 @@ Tantivy 0.20
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)

@@ -163,13 +245,13 @@ Tantivy 0.20
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
- Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)

@@ -208,7 +290,7 @@ Tantivy 0.19
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
- Aggregation
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
- Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
- Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
- Faster indexing
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)

@@ -651,7 +733,7 @@ Tantivy 0.4.0
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- Replacing rustc_serialize by serde. Kudos to benchmark@KodrAus and @lnicola
- Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton)
- Explicit error returned when searched for a term that is not indexed
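As a side note on the query-parser entries in the 0.23 section above (field grouping and phrase slop): a minimal sketch of how such query strings are parsed with tantivy's `QueryParser`. The schema, field name, and query strings below are invented for illustration and are not taken from the linked PRs:

```rust
use tantivy::query::{QueryParser, QueryParserError};
use tantivy::schema::{Schema, STORED, TEXT};
use tantivy::Index;

fn parse_examples() -> Result<(), QueryParserError> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT | STORED);
    let index = Index::create_in_ram(schema_builder.build());
    let parser = QueryParser::for_index(&index, vec![title]);

    // Field grouping: both clauses are restricted to the `title` field.
    let _grouped = parser.parse_query(r#"title:(return AND "pink panther")"#)?;
    // Phrase slop in the query language: up to two positions between the terms.
    let _sloppy = parser.parse_query(r#""pink panther"~2"#)?;
    Ok(())
}
```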
CITATION.cff (new file): 10 lines
@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- alias: Quickwit Inc.
  website: "https://quickwit.io"
title: "tantivy"
version: 0.22.0
doi: 10.5281/zenodo.13942948
date-released: 2024-10-17
url: "https://github.com/quickwit-oss/tantivy"
Cargo.toml: 48 changed lines
@@ -1,6 +1,6 @@
[package]
name = "tantivy"
version = "0.23.0"
version = "0.24.2"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"
categories = ["database-implementations", "data-structures"]

@@ -11,12 +11,11 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.63"
rust-version = "1.81"
exclude = ["benches/*.json", "benches/*.txt"]

[dependencies]
# Switch back to the non-forked oneshot crate once https://github.com/faern/oneshot/pull/35 is merged
oneshot = { git = "https://github.com/fulmicoton/oneshot.git", rev = "b208f49" }
oneshot = "0.1.7"
base64 = "0.22.0"
byteorder = "1.4.3"
crc32fast = "1.3.2"

@@ -30,49 +29,52 @@ tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"
serde = { version = "1.0.219", features = ["derive"] }
serde_json = "1.0.140"
fs4 = { version = "0.8.0", optional = true }
levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.0"
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
    "bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.10", features = ["serde-well-known"] }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.13.0"
measure_time = "0.8.2"
itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"

columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.3", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.22.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
common = { version = "0.7", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.2.1", features = ["use_serde"] }
columnar = { version = "0.5", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.5", path = "./sstable", package = "tantivy-sstable", optional = true }
stacker = { version = "0.5", path = "./stacker", package = "tantivy-stacker" }
query-grammar = { version = "0.24.0", path = "./query-grammar", package = "tantivy-query-grammar" }
tantivy-bitpacker = { version = "0.8", path = "./bitpacker" }
common = { version = "0.9", path = "./common/", package = "tantivy-common" }
tokenizer-api = { version = "0.5", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"

[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"

[dev-dependencies]
binggan = "0.8.0"
binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"

@@ -120,7 +122,7 @@ zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.

quickwit = ["sstable", "futures-util"]
quickwit = ["sstable", "futures-util", "futures-channel"]

# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.
@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.

## Benchmark

The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
performance for different types of queries/collections.

Your mileage WILL vary depending on the nature of queries and their load.

@@ -101,7 +101,8 @@ cargo test
## Companies Using Tantivy

<p align="left">
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
<img align="center" src="doc/assets/images/etsy.png" alt="Etsy" height="25" width="auto" />
<img align="center" src="doc/assets/images/paradedb.png" alt="ParadeDB" height="25" width="auto" />
<img align="center" src="doc/assets/images/Nuclia.png#gh-light-mode-only" alt="Nuclia" height="25" width="auto" />
<img align="center" src="doc/assets/images/humanfirst.png#gh-light-mode-only" alt="Humanfirst.ai" height="30" width="auto" />
<img align="center" src="doc/assets/images/element.io.svg#gh-light-mode-only" alt="Element.io" height="25" width="auto" />
TODO.txt: 2 changed lines
@@ -1,7 +1,7 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems

u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
u64 , etc. should return Result<Option> now that we support optional missing a column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load
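As a purely hypothetical illustration of the `Result<Option>` TODO item above (the trait name and signature below are assumptions, not the actual `FastFieldReaders` API):

```rust
// Sketch only: a fast-field accessor where a missing column is Ok(None),
// and Err(_) is reserved for genuine I/O errors.
use tantivy_columnar::Column;

trait FastFieldReadersSketch {
    fn u64(&self, field_name: &str) -> std::io::Result<Option<Column<u64>>>;
}
```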
@@ -1,3 +1,4 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;

@@ -17,7 +18,9 @@ pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
    ($runner:expr, $func:ident) => {
        $runner.register(stringify!($func), move |index| $func(index))
        $runner.register(stringify!($func), move |index| {
            $func(index);
        })
    };
}

@@ -42,18 +45,25 @@ fn main() {
}

fn bench_agg(mut group: InputGroup<Index>) {
    group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

    register!(group, average_u64);
    register!(group, average_f64);
    register!(group, average_f64_u64);
    register!(group, stats_f64);
    register!(group, extendedstats_f64);
    register!(group, percentiles_f64);
    register!(group, terms_few);
    register!(group, terms_many);
    register!(group, terms_many_top_1000);
    register!(group, terms_many_order_by_term);
    register!(group, terms_many_with_top_hits);
    register!(group, terms_many_with_avg_sub_agg);
    register!(group, terms_many_json_mixed_type_with_sub_agg_card);
    register!(group, terms_many_json_mixed_type_with_avg_sub_agg);

    register!(group, cardinality_agg);
    register!(group, terms_few_with_cardinality_agg);

    register!(group, range_agg);
    register!(group, range_agg_with_avg_sub_agg);
    register!(group, range_agg_with_term_agg_few);

@@ -105,7 +115,12 @@ fn stats_f64(index: &Index) {
    });
    exec_term_with_agg(index, agg_req)
}

fn extendedstats_f64(index: &Index) {
    let agg_req = json!({
        "extendedstats_f64": { "extended_stats": { "field": "score_f64", } }
    });
    exec_term_with_agg(index, agg_req)
}
fn percentiles_f64(index: &Index) {
    let agg_req = json!({
        "mypercentiles": {

@@ -117,6 +132,33 @@ fn percentiles_f64(index: &Index) {
    });
    execute_agg(index, agg_req);
}

fn cardinality_agg(index: &Index) {
    let agg_req = json!({
        "cardinality": {
            "cardinality": {
                "field": "text_many_terms"
            },
        }
    });
    execute_agg(index, agg_req);
}
fn terms_few_with_cardinality_agg(index: &Index) {
    let agg_req = json!({
        "my_texts": {
            "terms": { "field": "text_few_terms" },
            "aggs": {
                "cardinality": {
                    "cardinality": {
                        "field": "text_many_terms"
                    },
                }
            }
        },
    });
    execute_agg(index, agg_req);
}

fn terms_few(index: &Index) {
    let agg_req = json!({
        "my_texts": { "terms": { "field": "text_few_terms" } },

@@ -129,6 +171,12 @@ fn terms_many(index: &Index) {
    });
    execute_agg(index, agg_req);
}
fn terms_many_top_1000(index: &Index) {
    let agg_req = json!({
        "my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
    });
    execute_agg(index, agg_req);
}
fn terms_many_order_by_term(index: &Index) {
    let agg_req = json!({
        "my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },

@@ -165,7 +213,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
    });
    execute_agg(index, agg_req);
}
fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
    let agg_req = json!({
        "my_texts": {
            "terms": { "field": "json.mixed_type" },

@@ -262,6 +310,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
    });
    execute_agg(index, agg_req);
}

fn histogram(index: &Index) {
    let agg_req = json!({
        "rangef64": {
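The helpers `execute_agg` and `exec_term_with_agg` used in this benchmark are not shown in the diff. A rough sketch of how such a JSON aggregation request can be executed against an index (the field name and default limits below are assumptions for illustration):

```rust
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::agg_result::AggregationResults;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::Index;

fn execute_agg_sketch(index: &Index) -> Result<AggregationResults, Box<dyn std::error::Error>> {
    let agg_req = json!({
        "my_texts": { "terms": { "field": "text_few_terms" } }
    });
    // Parse the Elasticsearch-style JSON request into tantivy's aggregation request type.
    let aggs: Aggregations = serde_json::from_value(agg_req)?;
    let collector = AggregationCollector::from_aggs(aggs, Default::default());
    let searcher = index.reader()?.searcher();
    Ok(searcher.search(&AllQuery, &collector)?)
}
```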
@@ -1,6 +1,6 @@
[package]
name = "tantivy-bitpacker"
version = "0.6.0"
version = "0.8.0"
edition = "2021"
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
license = "MIT"

@@ -65,7 +65,7 @@ impl BitPacker {

#[derive(Clone, Debug, Default, Copy)]
pub struct BitUnpacker {
    num_bits: u32,
    num_bits: usize,
    mask: u64,
}

@@ -83,7 +83,7 @@ impl BitUnpacker {
            (1u64 << num_bits) - 1u64
        };
        BitUnpacker {
            num_bits: u32::from(num_bits),
            num_bits: usize::from(num_bits),
            mask,
        }
    }

@@ -94,14 +94,14 @@ impl BitUnpacker {

    #[inline]
    pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
        let addr_in_bits = idx * self.num_bits;
        let addr = (addr_in_bits >> 3) as usize;
        let addr_in_bits = idx as usize * self.num_bits;
        let addr = addr_in_bits >> 3;
        if addr + 8 > data.len() {
            if self.num_bits == 0 {
                return 0;
            }
            let bit_shift = addr_in_bits & 7;
            return self.get_slow_path(addr, bit_shift, data);
            return self.get_slow_path(addr, bit_shift as u32, data);
        }
        let bit_shift = addr_in_bits & 7;
        let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();

@@ -134,12 +134,13 @@ impl BitUnpacker {
            "Bitwidth must be <= 32 to use this method."
        );

        let end_idx = start_idx + output.len() as u32;
        let end_idx: u32 = start_idx + output.len() as u32;

        let end_bit_read = end_idx * self.num_bits;
        // We use `usize` here to avoid overflow issues.
        let end_bit_read = (end_idx as usize) * self.num_bits;
        let end_byte_read = (end_bit_read + 7) / 8;
        assert!(
            end_byte_read as usize <= data.len(),
            end_byte_read <= data.len(),
            "Requested index is out of bounds."
        );

@@ -159,24 +160,24 @@ impl BitUnpacker {
        // We want the start of the fast track to start align with bytes.
        // A sufficient condition is to start with an idx that is a multiple of 8,
        // so highway start is the closest multiple of 8 that is >= start_idx.
        let entrance_ramp_len = 8 - (start_idx % 8) % 8;
        let entrance_ramp_len: u32 = 8 - (start_idx % 8) % 8;

        let highway_start: u32 = start_idx + entrance_ramp_len;

        if highway_start + BitPacker1x::BLOCK_LEN as u32 > end_idx {
        if highway_start + (BitPacker1x::BLOCK_LEN as u32) > end_idx {
            // We don't have enough values to have even a single block of highway.
            // Let's just supply the values the simple way.
            get_batch_ramp(start_idx, output);
            return;
        }

        let num_blocks: u32 = (end_idx - highway_start) / BitPacker1x::BLOCK_LEN as u32;
        let num_blocks: usize = (end_idx - highway_start) as usize / BitPacker1x::BLOCK_LEN;

        // Entrance ramp
        get_batch_ramp(start_idx, &mut output[..entrance_ramp_len as usize]);

        // Highway
        let mut offset = (highway_start * self.num_bits) as usize / 8;
        let mut offset = (highway_start as usize * self.num_bits) / 8;
        let mut output_cursor = (highway_start - start_idx) as usize;
        for _ in 0..num_blocks {
            offset += BitPacker1x.decompress(

@@ -188,7 +189,7 @@ impl BitUnpacker {
        }

        // Exit ramp
        let highway_end = highway_start + num_blocks * BitPacker1x::BLOCK_LEN as u32;
        let highway_end: u32 = highway_start + (num_blocks * BitPacker1x::BLOCK_LEN) as u32;
        get_batch_ramp(highway_end, &mut output[output_cursor..]);
    }

@@ -368,9 +369,9 @@ mod test {
        for start_idx in 0u32..32u32 {
            output.resize(len, 0);
            bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
            for i in 0..len {
            for (i, output_byte) in output.iter().enumerate() {
                let expected = (start_idx + i as u32) & mask;
                assert_eq!(output[i], expected);
                assert_eq!(*output_byte, expected);
            }
        }
    }
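For readers following the `get` change above: a bitpacked value at index `i` starts at bit offset `i * num_bits` and is recovered with a byte-aligned load, a shift, and a mask. Below is a standalone sketch of that idea (not the tantivy-bitpacker implementation; it assumes at most 56 bits per value so that a single little-endian u64 load always suffices):

```rust
fn unpack(data: &[u8], idx: usize, num_bits: usize) -> u64 {
    assert!((1..=56).contains(&num_bits), "sketch handles 1..=56 bits");
    let mask = (1u64 << num_bits) - 1;
    let addr_in_bits = idx * num_bits;
    let addr = addr_in_bits >> 3; // first byte that contains the value
    let bit_shift = addr_in_bits & 7; // bit offset of the value inside that byte
    assert!(addr < data.len(), "idx out of range");
    // Load up to 8 bytes little-endian without reading past the end of the slice.
    let mut bytes = [0u8; 8];
    let available = (data.len() - addr).min(8);
    bytes[..available].copy_from_slice(&data[addr..addr + available]);
    (u64::from_le_bytes(bytes) >> bit_shift) & mask
}
```

For example, with `num_bits = 3`, `unpack(&data, 5, 3)` reads bits 15 through 17 of the packed buffer.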
@@ -34,7 +34,7 @@ struct BlockedBitpackerEntryMetaData {

impl BlockedBitpackerEntryMetaData {
    fn new(offset: u64, num_bits: u8, base_value: u64) -> Self {
        let encoded = offset | (num_bits as u64) << (64 - 8);
        let encoded = offset | (u64::from(num_bits) << (64 - 8));
        Self {
            encoded,
            base_value,
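The `encoded` field above stores the 8-bit `num_bits` in the top byte of a u64 and keeps the offset in the lower 56 bits, which implicitly requires `offset < 2^56`. A small illustrative sketch of that packing (not the actual tantivy-bitpacker code):

```rust
fn encode(offset: u64, num_bits: u8) -> u64 {
    debug_assert!(offset < (1u64 << 56), "offset must fit in 56 bits");
    offset | (u64::from(num_bits) << 56)
}

fn decode(encoded: u64) -> (u64, u8) {
    let offset = encoded & ((1u64 << 56) - 1);
    let num_bits = (encoded >> 56) as u8;
    (offset, num_bits)
}
```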
@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];

impl FilterImplPerInstructionSet {
    #[allow(unused_variables)]
    #[inline]
    #[allow(unused_variables)] // on non-x86_64, code is unused.
    fn from(code: u8) -> FilterImplPerInstructionSet {
        #[cfg(target_arch = "x86_64")]
        if code == FilterImplPerInstructionSet::AVX2 as u8 {

@@ -16,14 +16,14 @@ body = """

{%- if version %} in {{ version }}{%- endif -%}
{% for commit in commits %}
{% if commit.github.pr_title -%}
{%- set commit_message = commit.github.pr_title -%}
{% if commit.remote.pr_title -%}
{%- set commit_message = commit.remote.pr_title -%}
{%- else -%}
{%- set commit_message = commit.message -%}
{%- endif -%}
- {{ commit_message | split(pat="\n") | first | trim }}\
{% if commit.github.pr_number %} \
[#{{ commit.github.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.github.pr_number }}){% if commit.github.username %}(@{{ commit.github.username }}){%- endif -%} \
{% if commit.remote.pr_number %} \
[#{{ commit.remote.pr_number }}]({{ self::remote_url() }}/pull/{{ commit.remote.pr_number }}){% if commit.remote.username %}(@{{ commit.remote.username }}){%- endif -%} \
{%- endif %}
{%- endfor -%}
@@ -1,6 +1,6 @@
[package]
name = "tantivy-columnar"
version = "0.3.0"
version = "0.5.0"
edition = "2021"
license = "MIT"
homepage = "https://github.com/quickwit-oss/tantivy"

@@ -9,20 +9,30 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]

[dependencies]
itertools = "0.13.0"
itertools = "0.14.0"
fastdivide = "0.4.0"

stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.3", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.7", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.6", path = "../bitpacker/" }
stacker = { version= "0.5", path = "../stacker", package="tantivy-stacker"}
sstable = { version= "0.5", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.9", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.8", path = "../bitpacker/" }
serde = "1.0.152"
downcast-rs = "1.2.0"
downcast-rs = "2.0.1"

[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.14.0"

[[bench]]
name = "bench_merge"
harness = false

[[bench]]
name = "bench_access"
harness = false

[features]
unstable = []

@@ -31,7 +31,7 @@ restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot

# Columnar format

This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
The `(column_name, columne_type)` couple however uniquely identifies a column.
The `(column_name, column_type)` couple however uniquely identifies a column.
That couple is serialized as a column `column_key`. The format of that key is:
`[column_name][ZERO_BYTE][column_type_header: u8]`
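To make the `column_key` layout above concrete, here is a tiny hypothetical helper (the function is made up for illustration and is not part of tantivy-columnar's API):

```rust
/// Builds `[column_name][ZERO_BYTE][column_type_header]` as raw bytes.
fn column_key(column_name: &str, column_type_header: u8) -> Vec<u8> {
    let mut key = Vec::with_capacity(column_name.len() + 2);
    key.extend_from_slice(column_name.as_bytes());
    key.push(0u8); // ZERO_BYTE separator
    key.push(column_type_header);
    key
}
```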
columnar/benches/bench_access.rs (new file): 67 lines
@@ -0,0 +1,67 @@
use binggan::{black_box, InputGroup};
use common::*;
use tantivy_columnar::Column;

pub mod common;

const NUM_DOCS: u32 = 2_000_000;

pub fn generate_columnar_and_open(card: Card, num_docs: u32) -> Column {
    let reader = generate_columnar_with_name(card, num_docs, "price");
    reader.read_columns("price").unwrap()[0]
        .open_u64_lenient()
        .unwrap()
        .unwrap()
}

fn main() {
    let mut inputs = Vec::new();

    let mut add_card = |card1: Card| {
        inputs.push((
            format!("{card1}"),
            generate_columnar_and_open(card1, NUM_DOCS),
        ));
    };

    add_card(Card::MultiSparse);
    add_card(Card::Multi);
    add_card(Card::Sparse);
    add_card(Card::Dense);
    add_card(Card::Full);

    bench_group(InputGroup::new_with_inputs(inputs));
}

fn bench_group(mut runner: InputGroup<Column>) {
    runner.register("access_values_for_doc", |column| {
        let mut sum = 0;
        for i in 0..NUM_DOCS {
            for value in column.values_for_doc(i) {
                sum += value;
            }
        }
        black_box(sum);
    });
    runner.register("access_first_vals", |column| {
        let mut sum = 0;
        const BLOCK_SIZE: usize = 32;
        let mut docs = vec![0; BLOCK_SIZE];
        let mut buffer = vec![None; BLOCK_SIZE];
        for i in (0..NUM_DOCS).step_by(BLOCK_SIZE) {
            // fill docs
            for idx in 0..BLOCK_SIZE {
                docs[idx] = idx as u32 + i;
            }

            column.first_vals(&docs, &mut buffer);
            for val in buffer.iter() {
                let Some(val) = val else { continue };
                sum += *val;
            }
        }

        black_box(sum);
    });
    runner.run();
}

@@ -31,7 +31,7 @@ fn get_test_columns() -> Columns {
    }
    let mut buffer: Vec<u8> = Vec::new();
    dataframe_writer
        .serialize(data.len() as u32, None, &mut buffer)
        .serialize(data.len() as u32, &mut buffer)
        .unwrap();
    let columnar = ColumnarReader::open(buffer).unwrap();
columnar/benches/bench_merge.rs (new file): 49 lines
@@ -0,0 +1,49 @@
pub mod common;

use binggan::BenchRunner;
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;

const NUM_DOCS: u32 = 100_000;

fn main() {
    let mut inputs = Vec::new();

    let mut add_combo = |card1: Card, card2: Card| {
        inputs.push((
            format!("merge_{card1}_and_{card2}"),
            vec![
                generate_columnar_with_name(card1, NUM_DOCS, "price"),
                generate_columnar_with_name(card2, NUM_DOCS, "price"),
            ],
        ));
    };

    add_combo(Card::Multi, Card::Multi);
    add_combo(Card::MultiSparse, Card::MultiSparse);
    add_combo(Card::Dense, Card::Dense);
    add_combo(Card::Sparse, Card::Sparse);
    add_combo(Card::Sparse, Card::Dense);
    add_combo(Card::MultiSparse, Card::Dense);
    add_combo(Card::MultiSparse, Card::Sparse);
    add_combo(Card::Multi, Card::Dense);
    add_combo(Card::Multi, Card::Sparse);

    let mut runner: BenchRunner = BenchRunner::new();
    let mut group = runner.new_group();
    for (input_name, columnar_readers) in inputs.iter() {
        group.register_with_input(
            input_name,
            columnar_readers,
            move |columnar_readers: &Vec<ColumnarReader>| {
                let mut out = Vec::new();
                let columnar_readers = columnar_readers.iter().collect::<Vec<_>>();
                let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);

                merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
                Some(out.len() as u64)
            },
        );
    }
    group.run();
}
columnar/benches/common.rs (new file): 59 lines
@@ -0,0 +1,59 @@
extern crate tantivy_columnar;

use core::fmt;
use std::fmt::{Display, Formatter};

use tantivy_columnar::{ColumnarReader, ColumnarWriter};

pub enum Card {
    MultiSparse,
    Multi,
    Sparse,
    Dense,
    Full,
}
impl Display for Card {
    fn fmt(&self, f: &mut Formatter) -> fmt::Result {
        match self {
            Card::MultiSparse => write!(f, "multi sparse 1/13"),
            Card::Multi => write!(f, "multi 2x"),
            Card::Sparse => write!(f, "sparse 1/13"),
            Card::Dense => write!(f, "dense 1/12"),
            Card::Full => write!(f, "full"),
        }
    }
}
pub fn generate_columnar_with_name(card: Card, num_docs: u32, column_name: &str) -> ColumnarReader {
    let mut columnar_writer = ColumnarWriter::default();

    if let Card::MultiSparse = card {
        columnar_writer.record_numerical(0, column_name, 10u64);
        columnar_writer.record_numerical(0, column_name, 10u64);
    }

    for i in 0..num_docs {
        match card {
            Card::MultiSparse | Card::Sparse => {
                if i % 13 == 0 {
                    columnar_writer.record_numerical(i, column_name, i as u64);
                }
            }
            Card::Dense => {
                if i % 12 == 0 {
                    columnar_writer.record_numerical(i, column_name, i as u64);
                }
            }
            Card::Full => {
                columnar_writer.record_numerical(i, column_name, i as u64);
            }
            Card::Multi => {
                columnar_writer.record_numerical(i, column_name, i as u64);
                columnar_writer.record_numerical(i, column_name, i as u64);
            }
        }
    }

    let mut wrt: Vec<u8> = Vec::new();
    columnar_writer.serialize(num_docs, &mut wrt).unwrap();
    ColumnarReader::open(wrt).unwrap()
}
columnar/columnar-cli-inspect/Cargo.toml (new file): 18 lines
@@ -0,0 +1,18 @@
[package]
name = "tantivy-columnar-inspect"
version = "0.1.0"
edition = "2021"
license = "MIT"

[dependencies]
tantivy = {path="../..", package="tantivy"}
columnar = {path="../", package="tantivy-columnar"}
common = {path="../../common", package="tantivy-common"}

[workspace]
members = []

[profile.release]
debug = true
#debug-assertions = true
#overflow-checks = true
columnar/columnar-cli-inspect/src/main.rs (new file): 54 lines
@@ -0,0 +1,54 @@
use columnar::ColumnarReader;
use common::file_slice::{FileSlice, WrapFile};
use std::io;
use std::path::Path;
use tantivy::directory::footer::Footer;

fn main() -> io::Result<()> {
    println!("Opens a columnar file written by tantivy and validates it.");
    let path = std::env::args().nth(1).unwrap();

    let path = Path::new(&path);
    println!("Reading {:?}", path);
    let _reader = open_and_validate_columnar(path.to_str().unwrap())?;

    Ok(())
}

pub fn validate_columnar_reader(reader: &ColumnarReader) {
    let num_rows = reader.num_rows();
    println!("num_rows: {}", num_rows);
    let columns = reader.list_columns().unwrap();
    println!("num columns: {:?}", columns.len());
    for (col_name, dynamic_column_handle) in columns {
        let col = dynamic_column_handle.open().unwrap();
        match col {
            columnar::DynamicColumn::Bool(_)
            | columnar::DynamicColumn::I64(_)
            | columnar::DynamicColumn::U64(_)
            | columnar::DynamicColumn::F64(_)
            | columnar::DynamicColumn::IpAddr(_)
            | columnar::DynamicColumn::DateTime(_)
            | columnar::DynamicColumn::Bytes(_) => {}
            columnar::DynamicColumn::Str(str_column) => {
                let num_vals = str_column.ords().values.num_vals();
                let num_terms_dict = str_column.num_terms() as u64;
                let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
                println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
                for ord in str_column.ords().values.iter() {
                    assert!(ord < num_terms_dict);
                }
            }
        }
    }
}

/// Opens a columnar file that was written by tantivy and validates it.
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
    let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
    let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
    let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
    let reader = ColumnarReader::open(slice).unwrap();
    validate_columnar_reader(&reader);
    Ok(reader)
}
columnar/compat_tests_data/v1.columnar (new binary file, not shown)
columnar/compat_tests_data/v2.columnar (new binary file, not shown)
@@ -10,7 +10,7 @@

# Perf and Size
* remove alloc in `ord_to_term`
+ multivaued range queries restrat frm the beginning all of the time.
+ multivaued range queries restart from the beginning all of the time.
* re-add ZSTD compression for dictionaries
no systematic monotonic mapping
consider removing multilinear

@@ -30,7 +30,7 @@ investigate if should have better errors? io::Error is overused at the moment.
rename rank/select in unit tests
Review the public API via cargo doc
go through TODOs
remove all doc_id occurences -> row_id
remove all doc_id occurrences -> row_id
use the rank & select naming in unit tests branch.
multi-linear -> blockwise
linear codec -> simply a multiplication for the index column

@@ -43,5 +43,5 @@ isolate u128_based and uniform naming
# Other
fix enhance column-cli

# Santa claus
# Santa Claus
autodetect datetime ipaddr, plug customizable tokenizer.
@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
        &'a self,
        docs: &'a [u32],
        accessor: &Column<T>,
    ) -> impl Iterator<Item = (DocId, T)> + '_ {
    ) -> impl Iterator<Item = (DocId, T)> + 'a {
        if accessor.index.get_cardinality().is_full() {
            docs.iter().cloned().zip(self.val_cache.iter().cloned())
        } else {

@@ -139,7 +139,7 @@ mod tests {
            missing_docs.push(missing_doc);
        });

        assert_eq!(missing_docs, vec![]);
        assert_eq!(missing_docs, Vec::<u32>::new());
    }

    #[test]

@@ -136,7 +136,7 @@ impl<T: PartialOrd + Copy + Debug + Send + Sync + 'static> Column<T> {
            .map(|value_row_id: RowId| self.values.get_val(value_row_id))
    }

    /// Get the docids of values which are in the provided value range.
    /// Get the docids of values which are in the provided value and docid range.
    #[inline]
    pub fn get_docids_for_value_range(
        &self,

@@ -12,7 +12,7 @@ use crate::column_values::{
    CodecType, MonotonicallyMappableToU128, MonotonicallyMappableToU64,
};
use crate::iterable::Iterable;
use crate::StrColumn;
use crate::{StrColumn, Version};

pub fn serialize_column_mappable_to_u128<T: MonotonicallyMappableToU128>(
    column_index: SerializableColumnIndex<'_>,

@@ -40,25 +40,9 @@ pub fn serialize_column_mappable_to_u64<T: MonotonicallyMappableToU64>(
    Ok(())
}

pub fn open_column_u64<T: MonotonicallyMappableToU64>(bytes: OwnedBytes) -> io::Result<Column<T>> {
    let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
    let column_index_num_bytes = u32::from_le_bytes(
        column_index_num_bytes_payload
            .as_slice()
            .try_into()
            .unwrap(),
    );
    let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
    let column_index = crate::column_index::open_column_index(column_index_data)?;
    let column_values = load_u64_based_column_values(column_values_data)?;
    Ok(Column {
        index: column_index,
        values: column_values,
    })
}

pub fn open_column_u128<T: MonotonicallyMappableToU128>(
pub fn open_column_u64<T: MonotonicallyMappableToU64>(
    bytes: OwnedBytes,
    format_version: Version,
) -> io::Result<Column<T>> {
    let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
    let column_index_num_bytes = u32::from_le_bytes(

@@ -68,7 +52,27 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
            .unwrap(),
    );
    let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
    let column_index = crate::column_index::open_column_index(column_index_data)?;
    let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
    let column_values = load_u64_based_column_values(column_values_data)?;
    Ok(Column {
        index: column_index,
        values: column_values,
    })
}

pub fn open_column_u128<T: MonotonicallyMappableToU128>(
    bytes: OwnedBytes,
    format_version: Version,
) -> io::Result<Column<T>> {
    let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
    let column_index_num_bytes = u32::from_le_bytes(
        column_index_num_bytes_payload
            .as_slice()
            .try_into()
            .unwrap(),
    );
    let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
    let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
    let column_values = crate::column_values::open_u128_mapped(column_values_data)?;
    Ok(Column {
        index: column_index,

@@ -79,7 +83,10 @@ pub fn open_column_u128<T: MonotonicallyMappableToU128>(
/// Open the column as u64.
///
/// See [`open_u128_as_compact_u64`] for more details.
pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u64>> {
pub fn open_column_u128_as_compact_u64(
    bytes: OwnedBytes,
    format_version: Version,
) -> io::Result<Column<u64>> {
    let (body, column_index_num_bytes_payload) = bytes.rsplit(4);
    let column_index_num_bytes = u32::from_le_bytes(
        column_index_num_bytes_payload

@@ -88,7 +95,7 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
            .unwrap(),
    );
    let (column_index_data, column_values_data) = body.split(column_index_num_bytes as usize);
    let column_index = crate::column_index::open_column_index(column_index_data)?;
    let column_index = crate::column_index::open_column_index(column_index_data, format_version)?;
    let column_values = crate::column_values::open_u128_as_compact_u64(column_values_data)?;
    Ok(Column {
        index: column_index,

@@ -96,19 +103,19 @@ pub fn open_column_u128_as_compact_u64(bytes: OwnedBytes) -> io::Result<Column<u
    })
}

pub fn open_column_bytes(data: OwnedBytes) -> io::Result<BytesColumn> {
pub fn open_column_bytes(data: OwnedBytes, format_version: Version) -> io::Result<BytesColumn> {
    let (body, dictionary_len_bytes) = data.rsplit(4);
    let dictionary_len = u32::from_le_bytes(dictionary_len_bytes.as_slice().try_into().unwrap());
    let (dictionary_bytes, column_bytes) = body.split(dictionary_len as usize);
    let dictionary = Arc::new(Dictionary::from_bytes(dictionary_bytes)?);
    let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes)?;
    let term_ord_column = crate::column::open_column_u64::<u64>(column_bytes, format_version)?;
    Ok(BytesColumn {
        dictionary,
        term_ord_column,
    })
}

pub fn open_column_str(data: OwnedBytes) -> io::Result<StrColumn> {
    let bytes_column = open_column_bytes(data)?;
pub fn open_column_str(data: OwnedBytes, format_version: Version) -> io::Result<StrColumn> {
    let bytes_column = open_column_bytes(data, format_version)?;
    Ok(StrColumn::wrap(bytes_column))
}

@@ -95,8 +95,12 @@ pub fn merge_column_index<'a>(

#[cfg(test)]
mod tests {
    use common::OwnedBytes;

    use crate::column_index::merge::detect_cardinality;
    use crate::column_index::multivalued_index::MultiValueIndex;
    use crate::column_index::multivalued_index::{
        open_multivalued_index, serialize_multivalued_index, MultiValueIndex,
    };
    use crate::column_index::{merge_column_index, OptionalIndex, SerializableColumnIndex};
    use crate::{
        Cardinality, ColumnIndex, MergeRowOrder, RowAddr, RowId, ShuffleMergeOrder, StackMergeOrder,

@@ -169,9 +173,13 @@ mod tests {
        .into();
        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
            panic!("Excpected a multivalued index")
            panic!("Expected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
        let mut output = Vec::new();
        serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
        let multivalue =
            open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
        let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
        assert_eq!(&start_indexes, &[0, 3, 5]);
    }

@@ -200,11 +208,16 @@ mod tests {
            ],
        )
        .into();

        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
            panic!("Excpected a multivalued index")
            panic!("Expected a multivalued index")
        };
        let start_indexes: Vec<RowId> = start_index_iterable.boxed_iter().collect();
        let mut output = Vec::new();
        serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
        let multivalue =
            open_multivalued_index(OwnedBytes::new(output), crate::Version::V2).unwrap();
        let start_indexes: Vec<RowId> = multivalue.get_start_index_column().iter().collect();
        assert_eq!(&start_indexes, &[0, 3, 5, 6]);
    }
}

@@ -1,6 +1,8 @@
use std::iter;

use crate::column_index::{SerializableColumnIndex, Set};
use crate::column_index::{
    SerializableColumnIndex, SerializableMultivalueIndex, SerializableOptionalIndex, Set,
};
use crate::iterable::Iterable;
use crate::{Cardinality, ColumnIndex, RowId, ShuffleMergeOrder};

@@ -14,15 +16,24 @@ pub fn merge_column_index_shuffled<'a>(
        Cardinality::Optional => {
            let non_null_row_ids =
                merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
            SerializableColumnIndex::Optional {
            SerializableColumnIndex::Optional(SerializableOptionalIndex {
                non_null_row_ids,
                num_rows: shuffle_merge_order.num_rows(),
            }
            })
        }
        Cardinality::Multivalued => {
            let multivalue_start_index =
                merge_column_index_shuffled_multivalued(column_indexes, shuffle_merge_order);
            SerializableColumnIndex::Multivalued(multivalue_start_index)
|
||||
let non_null_row_ids =
|
||||
merge_column_index_shuffled_optional(column_indexes, shuffle_merge_order);
|
||||
SerializableColumnIndex::Multivalued(SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows: shuffle_merge_order.num_rows(),
|
||||
},
|
||||
start_offsets: merge_column_index_shuffled_multivalued(
|
||||
column_indexes,
|
||||
shuffle_merge_order,
|
||||
),
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -47,7 +58,7 @@ struct ShuffledIndex<'a> {
|
||||
merge_order: &'a ShuffleMergeOrder,
|
||||
}
|
||||
|
||||
impl<'a> Iterable<u32> for ShuffledIndex<'a> {
|
||||
impl Iterable<u32> for ShuffledIndex<'_> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(
|
||||
self.merge_order
|
||||
@@ -102,14 +113,21 @@ fn iter_num_values<'a>(
|
||||
|
||||
/// Transforms an iterator containing the number of vals per row (with `num_rows` elements)
|
||||
/// into a `start_offset` iterator starting at 0 and (with `num_rows + 1` element)
|
||||
///
|
||||
/// This will filter values with 0 values as these are covered by the optional index in the
|
||||
/// multivalue index.
|
||||
fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item = RowId> {
|
||||
iter::once(0u32).chain(num_vals.scan(0, |state, num_vals| {
|
||||
*state += num_vals;
|
||||
Some(*state)
|
||||
}))
|
||||
iter::once(0u32).chain(
|
||||
num_vals
|
||||
.filter(|num_vals| *num_vals != 0)
|
||||
.scan(0, |state, num_vals| {
|
||||
*state += num_vals;
|
||||
Some(*state)
|
||||
}),
|
||||
)
|
||||
}
|
||||
|
||||
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
|
||||
impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
|
||||
Box::new(integrate_num_vals(num_vals_per_row))
|
||||
@@ -134,7 +152,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_integrate_num_vals_several() {
|
||||
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 3, 13, 33].into_iter()));
|
||||
assert!(integrate_num_vals([3, 0, 10, 20].into_iter()).eq([0, 3, 13, 33].into_iter()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -157,10 +175,10 @@ mod tests {
|
||||
Cardinality::Optional,
|
||||
&shuffle_merge_order,
|
||||
);
|
||||
let SerializableColumnIndex::Optional {
|
||||
let SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} = serializable_index
|
||||
}) = serializable_index
|
||||
else {
|
||||
panic!()
|
||||
};
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
use std::iter;
|
||||
use std::ops::Range;
|
||||
|
||||
use crate::column_index::{SerializableColumnIndex, Set};
|
||||
use crate::column_index::multivalued_index::{MultiValueIndex, SerializableMultivalueIndex};
|
||||
use crate::column_index::serialize::SerializableOptionalIndex;
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, ColumnIndex, RowId, StackMergeOrder};
|
||||
|
||||
@@ -15,23 +17,149 @@ pub fn merge_column_index_stacked<'a>(
|
||||
) -> SerializableColumnIndex<'a> {
|
||||
match cardinality_after_merge {
|
||||
Cardinality::Full => SerializableColumnIndex::Full,
|
||||
Cardinality::Optional => SerializableColumnIndex::Optional {
|
||||
Cardinality::Optional => SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(StackedOptionalIndex {
|
||||
columns,
|
||||
stack_merge_order,
|
||||
}),
|
||||
num_rows: stack_merge_order.num_rows(),
|
||||
},
|
||||
}),
|
||||
Cardinality::Multivalued => {
|
||||
let stacked_multivalued_index = StackedMultivaluedIndex {
|
||||
columns,
|
||||
stack_merge_order,
|
||||
};
|
||||
SerializableColumnIndex::Multivalued(Box::new(stacked_multivalued_index))
|
||||
let serializable_multivalue_index =
|
||||
make_serializable_multivalued_index(columns, stack_merge_order);
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalue_index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedDocIdsWithValues<'a> {
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
impl Iterable<u32> for StackedDocIdsWithValues<'_> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new((0..self.column_indexes.len()).flat_map(|i| {
|
||||
let column_index = &self.column_indexes[i];
|
||||
let doc_range = self.stack_merge_order.columnar_range(i);
|
||||
get_doc_ids_with_values(column_index, doc_range)
|
||||
}))
|
||||
}
|
||||
}
|
||||
|
||||
fn get_doc_ids_with_values<'a>(
|
||||
column_index: &'a ColumnIndex,
|
||||
doc_range: Range<u32>,
|
||||
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||
match column_index {
|
||||
ColumnIndex::Empty { .. } => Box::new(0..0),
|
||||
ColumnIndex::Full => Box::new(doc_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
MultiValueIndex::MultiValueIndexV1(multivalued_index) => {
|
||||
Box::new((0..multivalued_index.num_docs()).filter_map(move |docid| {
|
||||
let range = multivalued_index.range(docid);
|
||||
if range.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(docid + doc_range.start)
|
||||
}
|
||||
}))
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.optional_index
|
||||
.iter_docs()
|
||||
.map(move |row| row + doc_range.start),
|
||||
),
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
fn stack_doc_ids_with_values<'a>(
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> SerializableOptionalIndex<'a> {
|
||||
let num_rows = stack_merge_order.num_rows();
|
||||
SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(StackedDocIdsWithValues {
|
||||
column_indexes,
|
||||
stack_merge_order,
|
||||
}),
|
||||
num_rows,
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedStartOffsets<'a> {
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
fn get_num_values_iterator<'a>(
|
||||
column_index: &'a ColumnIndex,
|
||||
num_docs: u32,
|
||||
) -> Box<dyn Iterator<Item = u32> + 'a> {
|
||||
match column_index {
|
||||
ColumnIndex::Empty { .. } => Box::new(std::iter::empty()),
|
||||
ColumnIndex::Full => Box::new(std::iter::repeat(1u32).take(num_docs as usize)),
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
Box::new(std::iter::repeat(1u32).take(optional_index.num_non_nulls() as usize))
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => Box::new(
|
||||
multivalued_index
|
||||
.get_start_index_column()
|
||||
.iter()
|
||||
.scan(0u32, |previous_start_offset, current_start_offset| {
|
||||
let num_vals = current_start_offset - *previous_start_offset;
|
||||
*previous_start_offset = current_start_offset;
|
||||
Some(num_vals)
|
||||
})
|
||||
.skip(1),
|
||||
),
|
||||
}
|
||||
}
|
||||
|
||||
impl Iterable<u32> for StackedStartOffsets<'_> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
|
||||
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
|
||||
let column_index = &self.column_indexes[columnar_id];
|
||||
get_num_values_iterator(column_index, num_docs)
|
||||
});
|
||||
Box::new(std::iter::once(0u32).chain(num_values_it.into_iter().scan(
|
||||
0u32,
|
||||
|cumulated, el| {
|
||||
*cumulated += el;
|
||||
Some(*cumulated)
|
||||
},
|
||||
)))
|
||||
}
|
||||
}
|
||||
|
||||
fn stack_start_offsets<'a>(
|
||||
column_indexes: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> Box<dyn Iterable<u32> + 'a> {
|
||||
Box::new(StackedStartOffsets {
|
||||
column_indexes,
|
||||
stack_merge_order,
|
||||
})
|
||||
}
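The three pieces above fit together as follows: `get_num_values_iterator` turns each input column index into a per-document value count, `StackedStartOffsets` chains those counts across columnars, and the leading `0` plus the running sum rebuild merged start offsets. A hand-worked illustration (values are illustrative, not taken from the diff): assume two stacked columnars, the first with start offsets [0, 3, 5] (two documents carrying 3 and 2 values), the second with [0, 4] (one document carrying 4 values).

// Worked example of StackedStartOffsets::boxed_iter for the inputs described above.
// get_num_values_iterator yields [3, 2] for the first columnar and [4] for the second;
// chaining them and re-integrating with a leading zero gives the merged start offsets:
//   once(0).chain(scan(+)) over [3, 2, 4]  ->  [0, 3, 5, 9]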
|
||||
|
||||
fn make_serializable_multivalued_index<'a>(
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
) -> SerializableMultivalueIndex<'a> {
|
||||
SerializableMultivalueIndex {
|
||||
doc_ids_with_values: stack_doc_ids_with_values(columns, stack_merge_order),
|
||||
start_offsets: stack_start_offsets(columns, stack_merge_order),
|
||||
}
|
||||
}
|
||||
|
||||
struct StackedOptionalIndex<'a> {
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
@@ -49,7 +177,7 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
||||
ColumnIndex::Full => Box::new(columnar_row_range),
|
||||
ColumnIndex::Optional(optional_index) => Box::new(
|
||||
optional_index
|
||||
.iter_rows()
|
||||
.iter_docs()
|
||||
.map(move |row_id: RowId| columnar_row_range.start + row_id),
|
||||
),
|
||||
ColumnIndex::Multivalued(_) => {
|
||||
@@ -62,87 +190,3 @@ impl<'a> Iterable<RowId> for StackedOptionalIndex<'a> {
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy)]
|
||||
struct StackedMultivaluedIndex<'a> {
|
||||
columns: &'a [ColumnIndex],
|
||||
stack_merge_order: &'a StackMergeOrder,
|
||||
}
|
||||
|
||||
fn convert_column_opt_to_multivalued_index<'a>(
|
||||
column_index_opt: &'a ColumnIndex,
|
||||
num_rows: RowId,
|
||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
match column_index_opt {
|
||||
ColumnIndex::Empty { .. } => Box::new(iter::repeat(0u32).take(num_rows as usize + 1)),
|
||||
ColumnIndex::Full => Box::new(0..num_rows + 1),
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
Box::new(
|
||||
(0..num_rows)
|
||||
// TODO optimize
|
||||
.map(|row_id| optional_index.rank(row_id))
|
||||
.chain(std::iter::once(optional_index.num_non_nulls())),
|
||||
)
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => multivalued_index.start_index_column.iter(),
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterable<RowId> for StackedMultivaluedIndex<'a> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = RowId> + '_> {
|
||||
let multivalued_indexes =
|
||||
self.columns
|
||||
.iter()
|
||||
.enumerate()
|
||||
.map(|(columnar_id, column_opt)| {
|
||||
let num_rows =
|
||||
self.stack_merge_order.columnar_range(columnar_id).len() as RowId;
|
||||
convert_column_opt_to_multivalued_index(column_opt, num_rows)
|
||||
});
|
||||
stack_multivalued_indexes(multivalued_indexes)
|
||||
}
|
||||
}
|
||||
|
||||
// Refactor me
|
||||
fn stack_multivalued_indexes<'a>(
|
||||
mut multivalued_indexes: impl Iterator<Item = Box<dyn Iterator<Item = RowId> + 'a>> + 'a,
|
||||
) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
let mut offset = 0;
|
||||
let mut last_row_id = 0;
|
||||
let mut current_it = multivalued_indexes.next();
|
||||
Box::new(std::iter::from_fn(move || loop {
|
||||
if let Some(row_id) = current_it.as_mut()?.next() {
|
||||
last_row_id = offset + row_id;
|
||||
return Some(last_row_id);
|
||||
}
|
||||
offset = last_row_id;
|
||||
loop {
|
||||
current_it = multivalued_indexes.next();
|
||||
if current_it.as_mut()?.next().is_some() {
|
||||
break;
|
||||
}
|
||||
}
|
||||
}))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::RowId;
|
||||
|
||||
fn it<'a>(row_ids: &'a [RowId]) -> Box<dyn Iterator<Item = RowId> + 'a> {
|
||||
Box::new(row_ids.iter().copied())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_stack() {
|
||||
let columns = [
|
||||
it(&[0u32, 0u32]),
|
||||
it(&[0u32, 1u32, 1u32, 4u32]),
|
||||
it(&[0u32, 3u32, 5u32]),
|
||||
it(&[0u32, 4u32]),
|
||||
]
|
||||
.into_iter();
|
||||
let start_offsets: Vec<RowId> = super::stack_multivalued_indexes(columns).collect();
|
||||
assert_eq!(start_offsets, &[0, 0, 1, 1, 4, 7, 9, 13]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -11,8 +11,11 @@ mod serialize;
|
||||
use std::ops::Range;
|
||||
|
||||
pub use merge::merge_column_index;
|
||||
pub(crate) use multivalued_index::SerializableMultivalueIndex;
|
||||
pub use optional_index::{OptionalIndex, Set};
|
||||
pub use serialize::{open_column_index, serialize_column_index, SerializableColumnIndex};
|
||||
pub use serialize::{
|
||||
open_column_index, serialize_column_index, SerializableColumnIndex, SerializableOptionalIndex,
|
||||
};
|
||||
|
||||
use crate::column_index::multivalued_index::MultiValueIndex;
|
||||
use crate::{Cardinality, DocId, RowId};
|
||||
@@ -25,7 +28,7 @@ pub enum ColumnIndex {
|
||||
Full,
|
||||
Optional(OptionalIndex),
|
||||
/// In addition, at index num_rows, an extra value is added
|
||||
/// containing the overal number of values.
|
||||
/// containing the overall number of values.
|
||||
Multivalued(MultiValueIndex),
|
||||
}
|
||||
|
||||
@@ -131,15 +134,41 @@ impl ColumnIndex {
|
||||
let row_end = optional_index.rank(doc_id_range.end);
|
||||
row_start..row_end
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => {
|
||||
let end_docid = doc_id_range.end.min(multivalued_index.num_docs() - 1) + 1;
|
||||
let start_docid = doc_id_range.start.min(end_docid);
|
||||
ColumnIndex::Multivalued(multivalued_index) => match multivalued_index {
|
||||
MultiValueIndex::MultiValueIndexV1(index) => {
|
||||
let row_start = index.start_index_column.get_val(doc_id_range.start);
|
||||
let row_end = index.start_index_column.get_val(doc_id_range.end);
|
||||
row_start..row_end
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(index) => {
|
||||
// In this case we will use the optional_index to select the next values
|
||||
// that are valid. There are different cases to consider:
|
||||
// "Not exists" below means the doc id does not exist in the optional
|
||||
// index, because it has no values.
|
||||
// * doc_id_range may cover a range of docids which are non existent
|
||||
// => rank
|
||||
// will give us the next document outside the range with a value. They both
|
||||
// get the same rank and therefore return a zero range
|
||||
//
|
||||
// * doc_id_range.start and doc_id_range.end may not exist, but docids in
|
||||
// between may have values
|
||||
// => rank will give us the next document outside the range with a value.
|
||||
//
|
||||
// * doc_id_range.start may be not existent but doc_id_range.end may exist
|
||||
// * doc_id_range.start may exist but doc_id_range.end may not exist
|
||||
// * doc_id_range.start and doc_id_range.end may exist
|
||||
// => rank on doc_id_range.end will give us the next value, which matches
|
||||
// how the `start_index_column` works, so we get the value start of the next
|
||||
// docid which we use to create the exclusive range.
|
||||
//
|
||||
let rank_start = index.optional_index.rank(doc_id_range.start);
|
||||
let row_start = index.start_index_column.get_val(rank_start);
|
||||
let rank_end = index.optional_index.rank(doc_id_range.end);
|
||||
let row_end = index.start_index_column.get_val(rank_end);
|
||||
|
||||
let row_start = multivalued_index.start_index_column.get_val(start_docid);
|
||||
let row_end = multivalued_index.start_index_column.get_val(end_docid);
|
||||
|
||||
row_start..row_end
|
||||
}
|
||||
row_start..row_end
|
||||
}
|
||||
},
|
||||
}
|
||||
}
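A hand-worked illustration of the rank arithmetic used in the V2 arm above (not part of the diff): assume a V2 multivalue index in which only documents 1 and 5 carry values and the compact start offsets are [0, 2, 4], i.e. doc 1 owns values 0..2 and doc 5 owns values 2..4.

// docid_range 0..3: rank(0) = 0, rank(3) = 1  ->  get_val(0)..get_val(1) = 0..2
// docid_range 2..4: rank(2) = 1, rank(4) = 1  ->  get_val(1)..get_val(1) = 2..2 (empty)
// docid_range 0..6: rank(0) = 0, rank(6) = 2  ->  get_val(0)..get_val(2) = 0..4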
|
||||
|
||||
|
||||
@@ -3,64 +3,98 @@ use std::io::Write;
|
||||
use std::ops::Range;
|
||||
use std::sync::Arc;
|
||||
|
||||
use common::OwnedBytes;
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use super::optional_index::{open_optional_index, serialize_optional_index};
|
||||
use super::{OptionalIndex, SerializableOptionalIndex, Set};
|
||||
use crate::column_values::{
|
||||
load_u64_based_column_values, serialize_u64_based_column_values, CodecType, ColumnValues,
|
||||
};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{DocId, RowId};
|
||||
use crate::{DocId, RowId, Version};
|
||||
|
||||
pub struct SerializableMultivalueIndex<'a> {
|
||||
pub doc_ids_with_values: SerializableOptionalIndex<'a>,
|
||||
pub start_offsets: Box<dyn Iterable<u32> + 'a>,
|
||||
}
|
||||
|
||||
pub fn serialize_multivalued_index(
|
||||
multivalued_index: &dyn Iterable<RowId>,
|
||||
multivalued_index: &SerializableMultivalueIndex,
|
||||
output: &mut impl Write,
|
||||
) -> io::Result<()> {
|
||||
let SerializableMultivalueIndex {
|
||||
doc_ids_with_values,
|
||||
start_offsets,
|
||||
} = multivalued_index;
|
||||
let mut count_writer = CountingWriter::wrap(output);
|
||||
let SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} = doc_ids_with_values;
|
||||
serialize_optional_index(&**non_null_row_ids, *num_rows, &mut count_writer)?;
|
||||
let optional_len = count_writer.written_bytes() as u32;
|
||||
let output = count_writer.finish();
|
||||
serialize_u64_based_column_values(
|
||||
multivalued_index,
|
||||
&**start_offsets,
|
||||
&[CodecType::Bitpacked, CodecType::Linear],
|
||||
output,
|
||||
)?;
|
||||
output.write_all(&optional_len.to_le_bytes())?;
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub fn open_multivalued_index(bytes: OwnedBytes) -> io::Result<MultiValueIndex> {
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> = load_u64_based_column_values(bytes)?;
|
||||
Ok(MultiValueIndex { start_index_column })
|
||||
pub fn open_multivalued_index(
|
||||
bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<MultiValueIndex> {
|
||||
match format_version {
|
||||
Version::V1 => {
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||
load_u64_based_column_values(bytes)?;
|
||||
Ok(MultiValueIndex::MultiValueIndexV1(MultiValueIndexV1 {
|
||||
start_index_column,
|
||||
}))
|
||||
}
|
||||
Version::V2 => {
|
||||
let (body_bytes, optional_index_len) = bytes.rsplit(4);
|
||||
let optional_index_len =
|
||||
u32::from_le_bytes(optional_index_len.as_slice().try_into().unwrap());
|
||||
let (optional_index_bytes, start_index_bytes) =
|
||||
body_bytes.split(optional_index_len as usize);
|
||||
let optional_index = open_optional_index(optional_index_bytes)?;
|
||||
let start_index_column: Arc<dyn ColumnValues<RowId>> =
|
||||
load_u64_based_column_values(start_index_bytes)?;
|
||||
Ok(MultiValueIndex::MultiValueIndexV2(MultiValueIndexV2 {
|
||||
optional_index,
|
||||
start_index_column,
|
||||
}))
|
||||
}
|
||||
}
|
||||
}
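Taken together, `serialize_multivalued_index` and the V2 branch of `open_multivalued_index` above define the layout [optional index over docs with values | start-offset column | optional index length as u32 LE]. A round-trip sketch closely mirroring the `for_test` constructor further down in this diff; the concrete values are illustrative, and `range` is crate-internal:

// Sketch: serialize a tiny V2 multivalue index and reopen it.
fn multivalue_round_trip_sketch() -> std::io::Result<()> {
    let docs_with_values: Vec<RowId> = vec![0, 2]; // docs 0 and 2 carry values
    let start_offsets: Vec<u32> = vec![0, 3, 5];   // doc 0 -> values 0..3, doc 2 -> values 3..5
    let serializable = SerializableMultivalueIndex {
        doc_ids_with_values: SerializableOptionalIndex {
            non_null_row_ids: Box::new(&docs_with_values[..]),
            num_rows: 3,
        },
        start_offsets: Box::new(&start_offsets[..]),
    };
    let mut buffer = Vec::new();
    serialize_multivalued_index(&serializable, &mut buffer)?;
    let index = open_multivalued_index(OwnedBytes::new(buffer), Version::V2)?;
    assert_eq!(index.range(2), 3..5);
    Ok(())
}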
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve the value range for a given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndex {
|
||||
pub enum MultiValueIndex {
|
||||
MultiValueIndexV1(MultiValueIndexV1),
|
||||
MultiValueIndexV2(MultiValueIndexV2),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve the value range for a given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndexV1 {
|
||||
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MultiValueIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("MultiValuedIndex")
|
||||
.field("num_rows", &self.start_index_column.num_vals())
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl From<Arc<dyn ColumnValues<RowId>>> for MultiValueIndex {
|
||||
fn from(start_index_column: Arc<dyn ColumnValues<RowId>>) -> Self {
|
||||
MultiValueIndex { start_index_column }
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueIndex {
|
||||
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
||||
let mut buffer = Vec::new();
|
||||
serialize_multivalued_index(&start_offsets, &mut buffer).unwrap();
|
||||
let bytes = OwnedBytes::new(buffer);
|
||||
open_multivalued_index(bytes).unwrap()
|
||||
}
|
||||
|
||||
impl MultiValueIndexV1 {
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
if doc_id >= self.num_docs() {
|
||||
return 0..0;
|
||||
}
|
||||
let start = self.start_index_column.get_val(doc_id);
|
||||
let end = self.start_index_column.get_val(doc_id + 1);
|
||||
start..end
|
||||
@@ -83,7 +117,6 @@ impl MultiValueIndex {
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search followed by a binary search to
|
||||
/// match a docid to its value position.
|
||||
#[allow(clippy::bool_to_int_with_if)]
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
if ranks.is_empty() {
|
||||
return;
|
||||
@@ -111,11 +144,170 @@ impl MultiValueIndex {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
/// Index to resolve the value range for a given doc_id.
|
||||
/// Starts at 0.
|
||||
pub struct MultiValueIndexV2 {
|
||||
pub optional_index: OptionalIndex,
|
||||
pub start_index_column: Arc<dyn crate::ColumnValues<RowId>>,
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for MultiValueIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
let index = match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||
};
|
||||
f.debug_struct("MultiValuedIndex")
|
||||
.field("num_rows", &index.num_vals())
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
|
||||
impl MultiValueIndex {
|
||||
pub fn for_test(start_offsets: &[RowId]) -> MultiValueIndex {
|
||||
assert!(!start_offsets.is_empty());
|
||||
assert_eq!(start_offsets[0], 0);
|
||||
let mut doc_with_values = Vec::new();
|
||||
let mut compact_start_offsets: Vec<u32> = vec![0];
|
||||
for doc in 0..start_offsets.len() - 1 {
|
||||
if start_offsets[doc] < start_offsets[doc + 1] {
|
||||
doc_with_values.push(doc as RowId);
|
||||
compact_start_offsets.push(start_offsets[doc + 1]);
|
||||
}
|
||||
}
|
||||
let serializable_multivalued_index = SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(&doc_with_values[..]),
|
||||
num_rows: start_offsets.len() as u32 - 1,
|
||||
},
|
||||
start_offsets: Box::new(&compact_start_offsets[..]),
|
||||
};
|
||||
let mut buffer = Vec::new();
|
||||
serialize_multivalued_index(&serializable_multivalued_index, &mut buffer).unwrap();
|
||||
let bytes = OwnedBytes::new(buffer);
|
||||
open_multivalued_index(bytes, Version::V2).unwrap()
|
||||
}
|
||||
|
||||
pub fn get_start_index_column(&self) -> &Arc<dyn crate::ColumnValues<RowId>> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => &idx.start_index_column,
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => &idx.start_index_column,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns `[start, end)` values range, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => idx.range(doc_id),
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => idx.range(doc_id),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the index.
|
||||
#[inline]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => idx.start_index_column.num_vals() - 1,
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => idx.optional_index.num_docs(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted in place to docids.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||
/// index.
|
||||
///
|
||||
/// Correctness: positions need to be sorted. idx_reader needs to contain monotonically
|
||||
/// increasing positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search followed by a binary search to
|
||||
/// match a docid to its value position.
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
match self {
|
||||
MultiValueIndex::MultiValueIndexV1(idx) => {
|
||||
idx.select_batch_in_place(docid_start, ranks)
|
||||
}
|
||||
MultiValueIndex::MultiValueIndexV2(idx) => {
|
||||
idx.select_batch_in_place(docid_start, ranks)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
impl MultiValueIndexV2 {
|
||||
/// Returns `[start, end)`, such that the values associated with
|
||||
/// the given document are `start..end`.
|
||||
#[inline]
|
||||
pub(crate) fn range(&self, doc_id: DocId) -> Range<RowId> {
|
||||
let Some(rank) = self.optional_index.rank_if_exists(doc_id) else {
|
||||
return 0..0;
|
||||
};
|
||||
let start = self.start_index_column.get_val(rank);
|
||||
let end = self.start_index_column.get_val(rank + 1);
|
||||
start..end
|
||||
}
|
||||
|
||||
/// Returns the number of documents in the index.
|
||||
#[inline]
|
||||
pub fn num_docs(&self) -> u32 {
|
||||
self.optional_index.num_docs()
|
||||
}
|
||||
|
||||
/// Converts a list of ranks (row ids of values) in a 1:n index to the corresponding list of
|
||||
/// docids. Positions are converted in place to docids.
|
||||
///
|
||||
/// Since there is no index for value pos -> docid, but docid -> value pos range, we scan the
|
||||
/// index.
|
||||
///
|
||||
/// Correctness: positions need to be sorted. idx_reader needs to contain monotonically
|
||||
/// increasing positions.
|
||||
///
|
||||
/// TODO: Instead of a linear scan we can employ an exponential search followed by a binary search to
|
||||
/// match a docid to its value position.
|
||||
pub(crate) fn select_batch_in_place(&self, docid_start: DocId, ranks: &mut Vec<u32>) {
|
||||
if ranks.is_empty() {
|
||||
return;
|
||||
}
|
||||
let mut cur_pos_in_idx = self.optional_index.rank(docid_start);
|
||||
let mut last_doc = None;
|
||||
|
||||
assert!(cur_pos_in_idx <= ranks[0]);
|
||||
|
||||
let mut write_doc_pos = 0;
|
||||
for i in 0..ranks.len() {
|
||||
let pos = ranks[i];
|
||||
loop {
|
||||
let end = self.start_index_column.get_val(cur_pos_in_idx + 1);
|
||||
if end > pos {
|
||||
ranks[write_doc_pos] = cur_pos_in_idx;
|
||||
write_doc_pos += if last_doc == Some(cur_pos_in_idx) {
|
||||
0
|
||||
} else {
|
||||
1
|
||||
};
|
||||
last_doc = Some(cur_pos_in_idx);
|
||||
break;
|
||||
}
|
||||
cur_pos_in_idx += 1;
|
||||
}
|
||||
}
|
||||
ranks.truncate(write_doc_pos);
|
||||
|
||||
for rank in ranks.iter_mut() {
|
||||
*rank = self.optional_index.select(*rank);
|
||||
}
|
||||
}
|
||||
}
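A hand-worked illustration of the rank-to-docid conversion above, reusing the toy V2 index from earlier (documents 1 and 5 carry values, compact start offsets [0, 2, 4]); not part of the diff:

// select_batch_in_place(docid_start = 0, ranks = [0, 1, 3]) on that index:
//   value ranks 0 and 1 fall in 0..2  ->  dense position 0  ->  select(0) = doc 1
//   value rank 3 falls in 2..4        ->  dense position 1  ->  select(1) = doc 5
//   ranks is rewritten in place (and deduplicated) to [1, 5]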
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::ops::Range;
|
||||
|
||||
use super::MultiValueIndex;
|
||||
use crate::{ColumnarReader, DynamicColumn};
|
||||
|
||||
fn index_to_pos_helper(
|
||||
index: &MultiValueIndex,
|
||||
@@ -134,6 +326,7 @@ mod tests {
|
||||
let positions = &[10u32, 11, 15, 20, 21, 22];
|
||||
assert_eq!(index_to_pos_helper(&index, 0..5, positions), vec![1, 3, 4]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, positions), vec![1, 3, 4]);
|
||||
|
||||
assert_eq!(index_to_pos_helper(&index, 0..5, &[9]), vec![0]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[10]), vec![1]);
|
||||
assert_eq!(index_to_pos_helper(&index, 1..5, &[11]), vec![1]);
|
||||
@@ -141,4 +334,67 @@ mod tests {
|
||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14]), vec![2]);
|
||||
assert_eq!(index_to_pos_helper(&index, 2..5, &[12, 14, 15]), vec![2, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_to_rowids() {
|
||||
use crate::ColumnarWriter;
|
||||
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
|
||||
// This column gets coerced to u64
|
||||
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||
columnar_writer.record_numerical(1, "full", u64::MAX);
|
||||
|
||||
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||
columnar_writer.record_numerical(5, "full", u64::MAX);
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(7, &mut wrt).unwrap();
|
||||
|
||||
let reader = ColumnarReader::open(wrt).unwrap();
|
||||
// Open the column as u64
|
||||
let column = reader.read_columns("full").unwrap()[0]
|
||||
.open()
|
||||
.unwrap()
|
||||
.coerce_numerical(crate::NumericalType::U64)
|
||||
.unwrap();
|
||||
let DynamicColumn::U64(column) = column else {
|
||||
panic!();
|
||||
};
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(1..2);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..2);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..4);
|
||||
assert_eq!(row_id_range, 0..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(3..4);
|
||||
assert_eq!(row_id_range, 2..2);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(1..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(3..6);
|
||||
assert_eq!(row_id_range, 2..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let row_id_range = column.index.docid_range_to_rowids(0..6);
|
||||
assert_eq!(row_id_range, 0..4);
|
||||
|
||||
let check = |range, expected| {
|
||||
let full_range = 0..=u64::MAX;
|
||||
let mut docids = Vec::new();
|
||||
column.get_docids_for_value_range(full_range, range, &mut docids);
|
||||
assert_eq!(docids, expected);
|
||||
};
|
||||
|
||||
// check(0..1, vec![]);
|
||||
// check(0..2, vec![1]);
|
||||
check(1..2, vec![1]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -80,17 +80,23 @@ impl BlockVariant {
|
||||
/// index is the block index. For each block `byte_start` and `offset` is computed.
|
||||
#[derive(Clone)]
|
||||
pub struct OptionalIndex {
|
||||
num_rows: RowId,
|
||||
num_non_null_rows: RowId,
|
||||
num_docs: RowId,
|
||||
num_non_null_docs: RowId,
|
||||
block_data: OwnedBytes,
|
||||
block_metas: Arc<[BlockMeta]>,
|
||||
}
|
||||
|
||||
impl Iterable<u32> for &OptionalIndex {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||
Box::new(self.iter_docs())
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Debug for OptionalIndex {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
|
||||
f.debug_struct("OptionalIndex")
|
||||
.field("num_rows", &self.num_rows)
|
||||
.field("num_non_null_rows", &self.num_non_null_rows)
|
||||
.field("num_docs", &self.num_docs)
|
||||
.field("num_non_null_docs", &self.num_non_null_docs)
|
||||
.finish_non_exhaustive()
|
||||
}
|
||||
}
|
||||
@@ -117,7 +123,7 @@ enum BlockSelectCursor<'a> {
|
||||
Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
|
||||
}
|
||||
|
||||
impl<'a> BlockSelectCursor<'a> {
|
||||
impl BlockSelectCursor<'_> {
|
||||
fn select(&mut self, rank: u16) -> u16 {
|
||||
match self {
|
||||
BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
|
||||
@@ -135,7 +141,7 @@ pub struct OptionalIndexSelectCursor<'a> {
|
||||
num_null_rows_before_block: RowId,
|
||||
}
|
||||
|
||||
impl<'a> OptionalIndexSelectCursor<'a> {
|
||||
impl OptionalIndexSelectCursor<'_> {
|
||||
fn search_and_load_block(&mut self, rank: RowId) {
|
||||
if rank < self.current_block_end_rank {
|
||||
// we are already in the right block
|
||||
@@ -159,7 +165,7 @@ impl<'a> OptionalIndexSelectCursor<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
|
||||
impl SelectCursor<RowId> for OptionalIndexSelectCursor<'_> {
|
||||
fn select(&mut self, rank: RowId) -> RowId {
|
||||
self.search_and_load_block(rank);
|
||||
let index_in_block = (rank - self.num_null_rows_before_block) as u16;
|
||||
@@ -168,7 +174,9 @@ impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
|
||||
}
|
||||
|
||||
impl Set<RowId> for OptionalIndex {
|
||||
type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
|
||||
type SelectCursor<'b>
|
||||
= OptionalIndexSelectCursor<'b>
|
||||
where Self: 'b;
|
||||
// Check if value at position is not null.
|
||||
#[inline]
|
||||
fn contains(&self, row_id: RowId) -> bool {
|
||||
@@ -196,6 +204,7 @@ impl Set<RowId> for OptionalIndex {
|
||||
} = row_addr_from_row_id(doc_id);
|
||||
let block_meta = self.block_metas[block_id as usize];
|
||||
let block = self.block(block_meta);
|
||||
|
||||
let block_offset_row_id = match block {
|
||||
Block::Dense(dense_block) => dense_block.rank(in_block_row_id),
|
||||
Block::Sparse(sparse_block) => sparse_block.rank(in_block_row_id),
|
||||
@@ -262,17 +271,17 @@ impl OptionalIndex {
|
||||
}
|
||||
|
||||
pub fn num_docs(&self) -> RowId {
|
||||
self.num_rows
|
||||
self.num_docs
|
||||
}
|
||||
|
||||
pub fn num_non_nulls(&self) -> RowId {
|
||||
self.num_non_null_rows
|
||||
self.num_non_null_docs
|
||||
}
|
||||
|
||||
pub fn iter_rows(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
pub fn iter_docs(&self) -> impl Iterator<Item = RowId> + '_ {
|
||||
// TODO optimize
|
||||
let mut select_batch = self.select_cursor();
|
||||
(0..self.num_non_null_rows).map(move |rank| select_batch.select(rank))
|
||||
(0..self.num_non_null_docs).map(move |rank| select_batch.select(rank))
|
||||
}
|
||||
pub fn select_batch(&self, ranks: &mut [RowId]) {
|
||||
let mut select_cursor = self.select_cursor();
|
||||
@@ -496,7 +505,7 @@ fn deserialize_optional_index_block_metadatas(
|
||||
non_null_rows_before_block += num_non_null_rows;
|
||||
}
|
||||
block_metas.resize(
|
||||
((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize,
|
||||
num_rows.div_ceil(ELEMENTS_PER_BLOCK) as usize,
|
||||
BlockMeta {
|
||||
non_null_rows_before_block,
|
||||
start_byte_offset,
|
||||
@@ -510,15 +519,15 @@ pub fn open_optional_index(bytes: OwnedBytes) -> io::Result<OptionalIndex> {
|
||||
let (mut bytes, num_non_empty_blocks_bytes) = bytes.rsplit(2);
|
||||
let num_non_empty_block_bytes =
|
||||
u16::from_le_bytes(num_non_empty_blocks_bytes.as_slice().try_into().unwrap());
|
||||
let num_rows = VInt::deserialize_u64(&mut bytes)? as u32;
|
||||
let num_docs = VInt::deserialize_u64(&mut bytes)? as u32;
|
||||
let block_metas_num_bytes =
|
||||
num_non_empty_block_bytes as usize * SERIALIZED_BLOCK_META_NUM_BYTES;
|
||||
let (block_data, block_metas) = bytes.rsplit(block_metas_num_bytes);
|
||||
let (block_metas, num_non_null_rows) =
|
||||
deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_rows);
|
||||
let (block_metas, num_non_null_docs) =
|
||||
deserialize_optional_index_block_metadatas(block_metas.as_slice(), num_docs);
|
||||
let optional_index = OptionalIndex {
|
||||
num_rows,
|
||||
num_non_null_rows,
|
||||
num_docs,
|
||||
num_non_null_docs,
|
||||
block_data,
|
||||
block_metas: block_metas.into(),
|
||||
};
|
||||
|
||||
@@ -28,10 +28,11 @@ pub trait Set<T> {
|
||||
/// Returns true if the elements is contained in the Set
|
||||
fn contains(&self, el: T) -> bool;
|
||||
|
||||
/// Returns the number of rows in the set that are < `el`
|
||||
/// Returns the element's rank (its position in the set).
|
||||
/// If the set does not contain the element, it will return the next existing element's rank.
|
||||
fn rank(&self, el: T) -> T;
|
||||
|
||||
/// If the set contains `el` returns the element rank.
|
||||
/// If the set contains `el`, returns the element's rank (its position in the set).
|
||||
/// If the set does not contain the element, it returns `None`.
|
||||
fn rank_if_exists(&self, el: T) -> Option<T>;
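The convention shared by `rank` and `rank_if_exists` is easiest to see on a tiny set; this sketch mirrors the block-codec tests added later in this diff, for the set {1, 10} over u16 elements:

// Sketch of the rank / rank_if_exists / select convention for the set {1, 10}.
fn rank_convention_sketch() {
    let mut buffer = Vec::new();
    SparseBlockCodec::serialize([1u16, 10].iter().copied(), &mut buffer).unwrap();
    let set = SparseBlockCodec::open(buffer.as_slice());
    assert_eq!(set.rank(1), 0);              // 1 is the first element of the set
    assert_eq!(set.rank(5), 1);              // 5 is absent: rank of the next existing element (10)
    assert_eq!(set.rank_if_exists(5), None); // absent elements report None
    assert_eq!(set.select(set.rank(5)), 10); // select maps a rank back to the element
}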
|
||||
|
||||
|
||||
@@ -23,7 +23,6 @@ fn set_bit_at(input: &mut u64, n: u16) {
|
||||
///
|
||||
/// When translating a dense index to the original index, we can use the offset to find the correct
|
||||
/// block. Direct computation is not possible, but we can employ a linear or binary search.
|
||||
|
||||
const ELEMENTS_PER_MINI_BLOCK: u16 = 64;
|
||||
const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8;
|
||||
const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2;
|
||||
@@ -109,7 +108,7 @@ pub struct DenseBlockSelectCursor<'a> {
|
||||
dense_block: DenseBlock<'a>,
|
||||
}
|
||||
|
||||
impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
|
||||
impl SelectCursor<u16> for DenseBlockSelectCursor<'_> {
|
||||
#[inline]
|
||||
fn select(&mut self, rank: u16) -> u16 {
|
||||
self.block_id = self
|
||||
@@ -123,7 +122,9 @@ impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
|
||||
}
|
||||
|
||||
impl<'a> Set<u16> for DenseBlock<'a> {
|
||||
type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
|
||||
type SelectCursor<'b>
|
||||
= DenseBlockSelectCursor<'a>
|
||||
where Self: 'b;
|
||||
|
||||
#[inline(always)]
|
||||
fn contains(&self, el: u16) -> bool {
|
||||
@@ -173,7 +174,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> DenseBlock<'a> {
|
||||
impl DenseBlock<'_> {
|
||||
#[inline]
|
||||
fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
|
||||
let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES;
|
||||
|
||||
@@ -31,8 +31,10 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Set<u16> for SparseBlock<'a> {
|
||||
type SelectCursor<'b> = Self where Self: 'b;
|
||||
impl Set<u16> for SparseBlock<'_> {
|
||||
type SelectCursor<'b>
|
||||
= Self
|
||||
where Self: 'b;
|
||||
|
||||
#[inline(always)]
|
||||
fn contains(&self, el: u16) -> bool {
|
||||
@@ -67,7 +69,7 @@ fn get_u16(data: &[u8], byte_position: usize) -> u16 {
|
||||
u16::from_le_bytes(bytes)
|
||||
}
|
||||
|
||||
impl<'a> SparseBlock<'a> {
|
||||
impl SparseBlock<'_> {
|
||||
#[inline(always)]
|
||||
fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 {
|
||||
let start_offset: usize = idx as usize * 2;
|
||||
@@ -80,7 +82,7 @@ impl<'a> SparseBlock<'a> {
|
||||
}
|
||||
|
||||
#[inline]
|
||||
#[allow(clippy::comparison_chain)]
|
||||
#[expect(clippy::comparison_chain)]
|
||||
// Looks for the element in the block. Returns the positions if found.
|
||||
fn binary_search(&self, target: u16) -> Result<u16, u16> {
|
||||
let data = &self.0;
|
||||
|
||||
@@ -22,8 +22,8 @@ fn test_set_helper<C: SetCodec<Item = u16>>(vals: &[u16]) -> usize {
|
||||
vals.iter().cloned().take_while(|v| *v < val).count() as u16
|
||||
);
|
||||
}
|
||||
for rank in 0..vals.len() {
|
||||
assert_eq!(tested_set.select(rank as u16), vals[rank]);
|
||||
for (rank, val) in vals.iter().enumerate() {
|
||||
assert_eq!(tested_set.select(rank as u16), *val);
|
||||
}
|
||||
buffer.len()
|
||||
}
|
||||
@@ -107,3 +107,41 @@ fn test_simple_translate_codec_idx_to_original_idx_dense() {
|
||||
assert_eq!(i, select_cursor.select(i));
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_translate_idx_to_value_idx_dense() {
|
||||
let mut buffer = Vec::new();
|
||||
DenseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||
let tested_set = DenseBlockCodec::open(buffer.as_slice());
|
||||
assert!(tested_set.contains(1));
|
||||
assert!(!tested_set.contains(2));
|
||||
assert_eq!(tested_set.rank(0), 0);
|
||||
assert_eq!(tested_set.rank(1), 0);
|
||||
for rank in 2..10 {
|
||||
// elements that are not in the set get the rank of the next existing element
|
||||
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||
assert_eq!(tested_set.rank(rank), 1);
|
||||
}
|
||||
assert_eq!(tested_set.rank(10), 1);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_simple_translate_idx_to_value_idx_sparse() {
|
||||
let mut buffer = Vec::new();
|
||||
SparseBlockCodec::serialize([1, 10].iter().copied(), &mut buffer).unwrap();
|
||||
let tested_set = SparseBlockCodec::open(buffer.as_slice());
|
||||
assert!(tested_set.contains(1));
|
||||
assert!(!tested_set.contains(2));
|
||||
assert_eq!(tested_set.rank(0), 0);
|
||||
assert_eq!(tested_set.select(tested_set.rank(0)), 1);
|
||||
assert_eq!(tested_set.rank(1), 0);
|
||||
assert_eq!(tested_set.select(tested_set.rank(1)), 1);
|
||||
for rank in 2..10 {
|
||||
// elements that are not in the set get the rank of the next existing element
|
||||
assert_eq!(tested_set.rank_if_exists(rank), None);
|
||||
assert_eq!(tested_set.rank(rank), 1);
|
||||
assert_eq!(tested_set.select(tested_set.rank(rank)), 10);
|
||||
}
|
||||
assert_eq!(tested_set.rank(10), 1);
|
||||
assert_eq!(tested_set.select(tested_set.rank(10)), 10);
|
||||
}
|
||||
|
||||
@@ -15,9 +15,7 @@ fn test_optional_index_with_num_docs(num_docs: u32) {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(100, "score", 80i64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_docs, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("score").unwrap();
|
||||
@@ -112,8 +110,8 @@ fn test_null_index(data: &[bool]) {
|
||||
.map(|(pos, _val)| pos as u32)
|
||||
.collect();
|
||||
let mut select_iter = null_index.select_cursor();
|
||||
for i in 0..orig_idx_with_value.len() {
|
||||
assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
|
||||
for (i, expected) in orig_idx_with_value.iter().enumerate() {
|
||||
assert_eq!(select_iter.select(i as u32), *expected);
|
||||
}
|
||||
|
||||
let step_size = (orig_idx_with_value.len() / 100).max(1);
|
||||
@@ -166,7 +164,7 @@ fn test_optional_index_large() {
|
||||
fn test_optional_index_iter_aux(row_ids: &[RowId], num_rows: RowId) {
|
||||
let optional_index = OptionalIndex::for_test(num_rows, row_ids);
|
||||
assert_eq!(optional_index.num_docs(), num_rows);
|
||||
assert!(optional_index.iter_rows().eq(row_ids.iter().copied()));
|
||||
assert!(optional_index.iter_docs().eq(row_ids.iter().copied()));
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -3,28 +3,39 @@ use std::io::Write;
|
||||
|
||||
use common::{CountingWriter, OwnedBytes};
|
||||
|
||||
use super::multivalued_index::SerializableMultivalueIndex;
|
||||
use super::OptionalIndex;
|
||||
use crate::column_index::multivalued_index::serialize_multivalued_index;
|
||||
use crate::column_index::optional_index::serialize_optional_index;
|
||||
use crate::column_index::ColumnIndex;
|
||||
use crate::iterable::Iterable;
|
||||
use crate::{Cardinality, RowId};
|
||||
use crate::{Cardinality, RowId, Version};
|
||||
|
||||
pub struct SerializableOptionalIndex<'a> {
|
||||
pub non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
||||
pub num_rows: RowId,
|
||||
}
|
||||
|
||||
impl<'a> From<&'a OptionalIndex> for SerializableOptionalIndex<'a> {
|
||||
fn from(optional_index: &'a OptionalIndex) -> Self {
|
||||
SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
num_rows: optional_index.num_docs(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub enum SerializableColumnIndex<'a> {
|
||||
Full,
|
||||
Optional {
|
||||
non_null_row_ids: Box<dyn Iterable<RowId> + 'a>,
|
||||
num_rows: RowId,
|
||||
},
|
||||
// TODO remove the Arc<dyn> apart from serialization this is not
|
||||
// dynamic at all.
|
||||
Multivalued(Box<dyn Iterable<RowId> + 'a>),
|
||||
Optional(SerializableOptionalIndex<'a>),
|
||||
Multivalued(SerializableMultivalueIndex<'a>),
|
||||
}
|
||||
|
||||
impl<'a> SerializableColumnIndex<'a> {
|
||||
impl SerializableColumnIndex<'_> {
|
||||
pub fn get_cardinality(&self) -> Cardinality {
|
||||
match self {
|
||||
SerializableColumnIndex::Full => Cardinality::Full,
|
||||
SerializableColumnIndex::Optional { .. } => Cardinality::Optional,
|
||||
SerializableColumnIndex::Optional(_) => Cardinality::Optional,
|
||||
SerializableColumnIndex::Multivalued(_) => Cardinality::Multivalued,
|
||||
}
|
||||
}
|
||||
@@ -40,12 +51,12 @@ pub fn serialize_column_index(
|
||||
output.write_all(&[cardinality])?;
|
||||
match column_index {
|
||||
SerializableColumnIndex::Full => {}
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows,
|
||||
} => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
||||
}) => serialize_optional_index(non_null_row_ids.as_ref(), num_rows, &mut output)?,
|
||||
SerializableColumnIndex::Multivalued(multivalued_index) => {
|
||||
serialize_multivalued_index(&*multivalued_index, &mut output)?
|
||||
serialize_multivalued_index(&multivalued_index, &mut output)?
|
||||
}
|
||||
}
|
||||
let column_index_num_bytes = output.written_bytes() as u32;
|
||||
@@ -53,7 +64,10 @@ pub fn serialize_column_index(
|
||||
}
|
||||
|
||||
/// Open a serialized column index.
|
||||
pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
||||
pub fn open_column_index(
|
||||
mut bytes: OwnedBytes,
|
||||
format_version: Version,
|
||||
) -> io::Result<ColumnIndex> {
|
||||
if bytes.is_empty() {
|
||||
return Err(io::Error::new(
|
||||
io::ErrorKind::UnexpectedEof,
|
||||
@@ -70,7 +84,8 @@ pub fn open_column_index(mut bytes: OwnedBytes) -> io::Result<ColumnIndex> {
|
||||
Ok(ColumnIndex::Optional(optional_index))
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalue_index = super::multivalued_index::open_multivalued_index(bytes)?;
|
||||
let multivalue_index =
|
||||
super::multivalued_index::open_multivalued_index(bytes, format_version)?;
|
||||
Ok(ColumnIndex::Multivalued(multivalue_index))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
|
||||
fn value_iter() -> impl Iterator<Item = u64> {
|
||||
0..20_000
|
||||
}
|
||||
|
||||
fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
|
||||
let mut bytes = Vec::new();
|
||||
let stats = compute_stats(data.iter().cloned());
|
||||
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
|
||||
for val in data {
|
||||
codec_serializer.collect(*val);
|
||||
}
|
||||
codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
|
||||
codec_serializer
|
||||
.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
|
||||
.unwrap();
|
||||
|
||||
Codec::load(OwnedBytes::new(bytes)).unwrap()
|
||||
}
|
||||
|
||||
fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
|
||||
let col = get_reader_for_bench::<Codec>(data);
|
||||
b.iter(|| {
|
||||
|
||||
@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
|
||||
pub(crate) merge_row_order: &'a MergeRowOrder,
|
||||
}
|
||||
|
||||
impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> {
|
||||
impl<T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'_, T> {
|
||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
||||
match self.merge_row_order {
|
||||
MergeRowOrder::Stack(_) => Box::new(
|
||||
|
||||
@@ -184,7 +184,7 @@ impl CompactSpaceBuilder {
|
||||
|
||||
let mut covered_space = Vec::with_capacity(self.blanks.len());
|
||||
|
||||
// begining of the blanks
|
||||
// beginning of the blanks
|
||||
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
|
||||
if *first_blank_start != 0 {
|
||||
covered_space.push(0..=first_blank_start - 1);
|
||||
|
||||
@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub(crate) mod tests {
|
||||
use super::*;
|
||||
use crate::column_values::u64_based::{
|
||||
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
|
||||
|
||||
@@ -39,7 +39,7 @@ impl BinarySerializable for Block {
|
||||
}
|
||||
|
||||
fn compute_num_blocks(num_vals: u32) -> u32 {
|
||||
(num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE
|
||||
num_vals.div_ceil(BLOCK_SIZE)
|
||||
}
|
||||
|
||||
pub struct BlockwiseLinearEstimator {
|
||||
|
||||
@@ -122,12 +122,11 @@ impl Line {
|
||||
line
|
||||
}
|
||||
|
||||
/// Returns a line that attemps to approximate a function
|
||||
/// Returns a line that attempts to approximate a function
|
||||
/// f: i in 0..[ys.num_vals()) -> ys[i].
|
||||
///
|
||||
/// - The approximation is always lower than the actual value.
|
||||
/// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
|
||||
/// for any i in [0..ys.len()).
|
||||
/// - The approximation is always lower than the actual value. Or more rigorously, formally
|
||||
/// `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
|
||||
/// - It computes without panicking for any value of it.
|
||||
///
|
||||
/// This function is only invariant under translation if all of the
|
||||
|
||||
@@ -1,3 +1,6 @@
|
||||
use core::fmt;
|
||||
use std::fmt::{Display, Formatter};
|
||||
|
||||
use crate::InvalidData;
|
||||
|
||||
pub const VERSION_FOOTER_NUM_BYTES: usize = MAGIC_BYTES.len() + std::mem::size_of::<u32>();
|
||||
@@ -8,7 +11,7 @@ const MAGIC_BYTES: [u8; 4] = [2, 113, 119, 66];
|
||||
|
||||
pub fn footer() -> [u8; VERSION_FOOTER_NUM_BYTES] {
|
||||
let mut footer_bytes = [0u8; VERSION_FOOTER_NUM_BYTES];
|
||||
footer_bytes[0..4].copy_from_slice(&Version::V1.to_bytes());
|
||||
footer_bytes[0..4].copy_from_slice(&CURRENT_VERSION.to_bytes());
|
||||
footer_bytes[4..8].copy_from_slice(&MAGIC_BYTES[..]);
|
||||
footer_bytes
|
||||
}
|
||||
@@ -20,10 +23,22 @@ pub fn parse_footer(footer_bytes: [u8; VERSION_FOOTER_NUM_BYTES]) -> Result<Vers
|
||||
Version::try_from_bytes(footer_bytes[0..4].try_into().unwrap())
|
||||
}
|
||||
|
||||
pub const CURRENT_VERSION: Version = Version::V2;
|
||||
|
||||
#[derive(Debug, Copy, Clone, Eq, PartialEq)]
|
||||
#[repr(u32)]
|
||||
pub enum Version {
|
||||
V1 = 1u32,
|
||||
V2 = 2u32,
|
||||
}
|
||||
|
||||
impl Display for Version {
|
||||
fn fmt(&self, f: &mut Formatter) -> fmt::Result {
|
||||
match self {
|
||||
Version::V1 => write!(f, "v1"),
|
||||
Version::V2 => write!(f, "v2"),
|
||||
}
|
||||
}
|
||||
}
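The footer written by `footer()` above is therefore eight bytes: the current version encoded as a little-endian u32 followed by the four magic bytes. A round-trip sketch mirroring `test_footer_deserialization` below:

// Sketch: the columnar footer layout is [version: u32 LE][MAGIC_BYTES], 8 bytes in total.
fn footer_round_trip_sketch() {
    let footer_bytes = footer();
    let version_bytes = CURRENT_VERSION.to_bytes();
    assert_eq!(&footer_bytes[0..4], &version_bytes[..]);
    assert_eq!(&footer_bytes[4..8], &MAGIC_BYTES[..]);
    assert_eq!(parse_footer(footer_bytes).unwrap(), Version::V2);
}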
|
||||
|
||||
impl Version {
|
||||
@@ -35,6 +50,7 @@ impl Version {
|
||||
let code = u32::from_le_bytes(bytes);
|
||||
match code {
|
||||
1u32 => Ok(Version::V1),
|
||||
2u32 => Ok(Version::V2),
|
||||
_ => Err(InvalidData),
|
||||
}
|
||||
}
|
||||
@@ -47,9 +63,9 @@ mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_footer_dserialization() {
|
||||
fn test_footer_deserialization() {
|
||||
let parsed_version: Version = parse_footer(footer()).unwrap();
|
||||
assert_eq!(Version::V1, parsed_version);
|
||||
assert_eq!(Version::V2, parsed_version);
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -63,11 +79,10 @@ mod tests {
|
||||
for &i in &version_to_tests {
|
||||
let version_res = Version::try_from_bytes(i.to_le_bytes());
|
||||
if let Ok(version) = version_res {
|
||||
assert_eq!(version, Version::V1);
|
||||
assert_eq!(version.to_bytes(), i.to_le_bytes());
|
||||
valid_versions.insert(i);
|
||||
}
|
||||
}
|
||||
assert_eq!(valid_versions.len(), 1);
|
||||
assert_eq!(valid_versions.len(), 2);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -3,7 +3,7 @@ use std::io::{self, Write};
use common::{BitSet, CountingWriter, ReadOnlyBitSet};
use sstable::{SSTable, Streamer, TermOrdinal, VoidSSTable};

use super::term_merger::TermMerger;
use super::term_merger::{TermMerger, TermsWithSegmentOrd};
use crate::column::serialize_column_mappable_to_u64;
use crate::column_index::SerializableColumnIndex;
use crate::iterable::Iterable;
@@ -39,7 +39,7 @@ struct RemappedTermOrdinalsValues<'a> {
    merge_row_order: &'a MergeRowOrder,
}

impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
impl Iterable for RemappedTermOrdinalsValues<'_> {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
        match self.merge_row_order {
            MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
@@ -50,7 +50,7 @@ impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
    }
}

impl<'a> RemappedTermOrdinalsValues<'a> {
impl RemappedTermOrdinalsValues<'_> {
    fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
        let iter = self
            .bytes_columns
@@ -126,14 +126,17 @@ fn serialize_merged_dict(
    let mut term_ord_mapping = TermOrdinalMapping::default();

    let mut field_term_streams = Vec::new();
    for column_opt in bytes_columns.iter() {
    for (segment_ord, column_opt) in bytes_columns.iter().enumerate() {
        if let Some(column) = column_opt {
            term_ord_mapping.add_segment(column.dictionary.num_terms());
            let terms: Streamer<VoidSSTable> = column.dictionary.stream()?;
            field_term_streams.push(terms);
            field_term_streams.push(TermsWithSegmentOrd { terms, segment_ord });
        } else {
            term_ord_mapping.add_segment(0);
            field_term_streams.push(Streamer::empty());
            field_term_streams.push(TermsWithSegmentOrd {
                terms: Streamer::empty(),
                segment_ord,
            });
        }
    }

@@ -191,6 +194,7 @@ fn serialize_merged_dict(

#[derive(Default, Debug)]
struct TermOrdinalMapping {
    /// Contains the new term ordinals for each segment.
    per_segment_new_term_ordinals: Vec<Vec<TermOrdinal>>,
}

@@ -205,6 +209,6 @@ impl TermOrdinalMapping {
    }

    fn get_segment(&self, segment_ord: u32) -> &[TermOrdinal] {
        &(self.per_segment_new_term_ordinals[segment_ord as usize])[..]
        &self.per_segment_new_term_ordinals[segment_ord as usize]
    }
}

@@ -26,7 +26,7 @@ impl StackMergeOrder {
        let mut cumulated_row_ids: Vec<RowId> = Vec::with_capacity(columnars.len());
        let mut cumulated_row_id = 0;
        for columnar in columnars {
            cumulated_row_id += columnar.num_rows();
            cumulated_row_id += columnar.num_docs();
            cumulated_row_ids.push(cumulated_row_id);
        }
        StackMergeOrder { cumulated_row_ids }

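The loop above just accumulates per-columnar document counts into end offsets. A minimal sketch of that bookkeeping, using two hypothetical stacked columnars of 3 and 2 documents (illustrative numbers, not taken from the commits):

let num_docs_per_columnar = [3u32, 2u32];
let mut cumulated_row_ids = Vec::new();
let mut cumulated_row_id = 0u32;
for num_docs in num_docs_per_columnar {
    cumulated_row_id += num_docs;
    cumulated_row_ids.push(cumulated_row_id);
}
assert_eq!(cumulated_row_ids, vec![3, 5]);
// Document 1 of the second columnar therefore lands at offset 3 + 1 = 4
// in the stacked (merged) columnar.
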
@@ -7,7 +7,6 @@ use std::io;
use std::net::Ipv6Addr;
use std::sync::Arc;

use itertools::Itertools;
pub use merge_mapping::{MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};

use super::writer::ColumnarSerializer;
@@ -26,7 +25,7 @@ use crate::{
/// After merge, all columns belonging to the same category are coerced to
/// the same column type.
///
/// In practise, today, only Numerical colummns are coerced into one type today.
/// In practise, today, only Numerical columns are coerced into one type today.
///
/// See also [README.md].
///
@@ -64,11 +63,10 @@ impl From<ColumnType> for ColumnTypeCategory {
/// `require_columns` makes it possible to ensure that some columns will be present in the
/// resulting columnar. When a required column is a numerical column type, one of two things can
/// happen:
/// - If the required column type is compatible with all of the input columnar, the resulsting
/// merged
/// columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged
/// will fail with an InvalidData error.
/// - If the required column type is compatible with all of the input columnar, the resulting merged
///   columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged will
///   fail with an InvalidData error.
///
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
/// `Columnar` table.
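A minimal usage sketch of the API documented above, mirroring the stacking pattern used by the tests further down. The two `ColumnarReader` arguments are assumed to come from elsewhere; this sketch is illustrative and not part of the diff:

use crate::columnar::{merge_columnar, ColumnarReader, MergeRowOrder, StackMergeOrder};

fn stack_two(left: &ColumnarReader, right: &ColumnarReader) -> std::io::Result<ColumnarReader> {
    let columnar_refs: Vec<&ColumnarReader> = vec![left, right];
    let merge_order = MergeRowOrder::Stack(StackMergeOrder::stack(&columnar_refs));
    let mut out: Vec<u8> = Vec::new();
    // No required columns, so numerical columns are simply coerced to a common type.
    merge_columnar(&columnar_refs, &[], merge_order, &mut out)?;
    ColumnarReader::open(out)
}
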
@@ -82,13 +80,12 @@ pub fn merge_columnar(
    output: &mut impl io::Write,
) -> io::Result<()> {
    let mut serializer = ColumnarSerializer::new(output);
    let num_rows_per_columnar = columnar_readers
    let num_docs_per_columnar = columnar_readers
        .iter()
        .map(|reader| reader.num_rows())
        .map(|reader| reader.num_docs())
        .collect::<Vec<u32>>();

    let columns_to_merge =
        group_columns_for_merge(columnar_readers, required_columns, &merge_row_order)?;
    let columns_to_merge = group_columns_for_merge(columnar_readers, required_columns)?;
    for res in columns_to_merge {
        let ((column_name, _column_type_category), grouped_columns) = res;
        let grouped_columns = grouped_columns.open(&merge_row_order)?;
@@ -96,15 +93,18 @@ pub fn merge_columnar(
            continue;
        }

        let column_type = grouped_columns.column_type_after_merge();
        let column_type_after_merge = grouped_columns.column_type_after_merge();
        let mut columns = grouped_columns.columns;
        coerce_columns(column_type, &mut columns)?;
        // Make sure the number of columns is the same as the number of columnar readers.
        // Or num_docs_per_columnar would be incorrect.
        assert_eq!(columns.len(), columnar_readers.len());
        coerce_columns(column_type_after_merge, &mut columns)?;

        let mut column_serializer =
            serializer.start_serialize_column(column_name.as_bytes(), column_type);
            serializer.start_serialize_column(column_name.as_bytes(), column_type_after_merge);
        merge_column(
            column_type,
            &num_rows_per_columnar,
            column_type_after_merge,
            &num_docs_per_columnar,
            columns,
            &merge_row_order,
            &mut column_serializer,
@@ -130,7 +130,7 @@ fn dynamic_column_to_u64_monotonic(dynamic_column: DynamicColumn) -> Option<Colu
|
||||
fn merge_column(
|
||||
column_type: ColumnType,
|
||||
num_docs_per_column: &[u32],
|
||||
columns: Vec<Option<DynamicColumn>>,
|
||||
columns_to_merge: Vec<Option<DynamicColumn>>,
|
||||
merge_row_order: &MergeRowOrder,
|
||||
wrt: &mut impl io::Write,
|
||||
) -> io::Result<()> {
|
||||
@@ -140,10 +140,10 @@ fn merge_column(
|
||||
| ColumnType::F64
|
||||
| ColumnType::DateTime
|
||||
| ColumnType::Bool => {
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
|
||||
let mut column_values: Vec<Option<Arc<dyn ColumnValues>>> =
|
||||
Vec::with_capacity(columns.len());
|
||||
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
|
||||
Vec::with_capacity(columns_to_merge.len());
|
||||
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
|
||||
if let Some(Column { index: idx, values }) =
|
||||
dynamic_column_opt.and_then(dynamic_column_to_u64_monotonic)
|
||||
{
|
||||
@@ -166,10 +166,10 @@ fn merge_column(
|
||||
serialize_column_mappable_to_u64(merged_column_index, &merge_column_values, wrt)?;
|
||||
}
|
||||
ColumnType::IpAddr => {
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
|
||||
let mut column_values: Vec<Option<Arc<dyn ColumnValues<Ipv6Addr>>>> =
|
||||
Vec::with_capacity(columns.len());
|
||||
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
|
||||
Vec::with_capacity(columns_to_merge.len());
|
||||
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
|
||||
if let Some(DynamicColumn::IpAddr(Column { index: idx, values })) =
|
||||
dynamic_column_opt
|
||||
{
|
||||
@@ -194,9 +194,10 @@ fn merge_column(
|
||||
serialize_column_mappable_to_u128(merged_column_index, &merge_column_values, wrt)?;
|
||||
}
|
||||
ColumnType::Bytes | ColumnType::Str => {
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns.len());
|
||||
let mut bytes_columns: Vec<Option<BytesColumn>> = Vec::with_capacity(columns.len());
|
||||
for (i, dynamic_column_opt) in columns.into_iter().enumerate() {
|
||||
let mut column_indexes: Vec<ColumnIndex> = Vec::with_capacity(columns_to_merge.len());
|
||||
let mut bytes_columns: Vec<Option<BytesColumn>> =
|
||||
Vec::with_capacity(columns_to_merge.len());
|
||||
for (i, dynamic_column_opt) in columns_to_merge.into_iter().enumerate() {
|
||||
match dynamic_column_opt {
|
||||
Some(DynamicColumn::Str(str_column)) => {
|
||||
column_indexes.push(str_column.term_ord_column.index.clone());
|
||||
@@ -250,7 +251,7 @@ impl GroupedColumns {
|
||||
if column_type.len() == 1 {
|
||||
return column_type.into_iter().next().unwrap();
|
||||
}
|
||||
// At the moment, only the numerical categorical column type has more than one possible
|
||||
// At the moment, only the numerical column type category has more than one possible
|
||||
// column type.
|
||||
assert!(self
|
||||
.columns
|
||||
@@ -363,7 +364,7 @@ fn is_empty_after_merge(
|
||||
ColumnIndex::Empty { .. } => true,
|
||||
ColumnIndex::Full => alive_bitset.len() == 0,
|
||||
ColumnIndex::Optional(optional_index) => {
|
||||
for doc in optional_index.iter_rows() {
|
||||
for doc in optional_index.iter_docs() {
|
||||
if alive_bitset.contains(doc) {
|
||||
return false;
|
||||
}
|
||||
@@ -371,20 +372,8 @@ fn is_empty_after_merge(
|
||||
true
|
||||
}
|
||||
ColumnIndex::Multivalued(multivalued_index) => {
|
||||
for (doc_id, (start_index, end_index)) in multivalued_index
|
||||
.start_index_column
|
||||
.iter()
|
||||
.tuple_windows()
|
||||
.enumerate()
|
||||
{
|
||||
let doc_id = doc_id as u32;
|
||||
if start_index == end_index {
|
||||
// There are no values in this document
|
||||
continue;
|
||||
}
|
||||
// The document contains values and is present in the alive bitset.
|
||||
// The column is therefore not empty.
|
||||
if alive_bitset.contains(doc_id) {
|
||||
for alive_docid in alive_bitset.iter() {
|
||||
if !multivalued_index.range(alive_docid).is_empty() {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
@@ -405,7 +394,6 @@ fn is_empty_after_merge(
|
||||
fn group_columns_for_merge<'a>(
|
||||
columnar_readers: &'a [&'a ColumnarReader],
|
||||
required_columns: &'a [(String, ColumnType)],
|
||||
_merge_row_order: &'a MergeRowOrder,
|
||||
) -> io::Result<BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle>> {
|
||||
let mut columns: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> = BTreeMap::new();
|
||||
|
||||
|
||||
@@ -5,28 +5,29 @@ use sstable::TermOrdinal;
|
||||
|
||||
use crate::Streamer;
|
||||
|
||||
pub struct HeapItem<'a> {
|
||||
pub streamer: Streamer<'a>,
|
||||
/// The terms of a column with the ordinal of the segment.
|
||||
pub struct TermsWithSegmentOrd<'a> {
|
||||
pub terms: Streamer<'a>,
|
||||
pub segment_ord: usize,
|
||||
}
|
||||
|
||||
impl<'a> PartialEq for HeapItem<'a> {
|
||||
impl PartialEq for TermsWithSegmentOrd<'_> {
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
self.segment_ord == other.segment_ord
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Eq for HeapItem<'a> {}
|
||||
impl Eq for TermsWithSegmentOrd<'_> {}
|
||||
|
||||
impl<'a> PartialOrd for HeapItem<'a> {
|
||||
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
||||
impl<'a> PartialOrd for TermsWithSegmentOrd<'a> {
|
||||
fn partial_cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Option<Ordering> {
|
||||
Some(self.cmp(other))
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for HeapItem<'a> {
|
||||
fn cmp(&self, other: &HeapItem<'a>) -> Ordering {
|
||||
(&other.streamer.key(), &other.segment_ord).cmp(&(&self.streamer.key(), &self.segment_ord))
|
||||
impl<'a> Ord for TermsWithSegmentOrd<'a> {
|
||||
fn cmp(&self, other: &TermsWithSegmentOrd<'a>) -> Ordering {
|
||||
(&other.terms.key(), &other.segment_ord).cmp(&(&self.terms.key(), &self.segment_ord))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,42 +36,34 @@ impl<'a> Ord for HeapItem<'a> {
///
/// The item yield is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
///   the terms.
/// - a slice with the ordinal of the segments containing the terms.
pub struct TermMerger<'a> {
    heap: BinaryHeap<HeapItem<'a>>,
    current_streamers: Vec<HeapItem<'a>>,
    heap: BinaryHeap<TermsWithSegmentOrd<'a>>,
    term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>,
}
|
||||
|
||||
impl<'a> TermMerger<'a> {
|
||||
/// Stream of merged term dictionary
|
||||
pub fn new(streams: Vec<Streamer<'a>>) -> TermMerger<'a> {
|
||||
pub fn new(term_streams_with_segment: Vec<TermsWithSegmentOrd<'a>>) -> TermMerger<'a> {
|
||||
TermMerger {
|
||||
heap: BinaryHeap::new(),
|
||||
current_streamers: streams
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.map(|(ord, streamer)| HeapItem {
|
||||
streamer,
|
||||
segment_ord: ord,
|
||||
})
|
||||
.collect(),
|
||||
term_streams_with_segment,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn matching_segments<'b: 'a>(
|
||||
&'b self,
|
||||
) -> impl 'b + Iterator<Item = (usize, TermOrdinal)> {
|
||||
self.current_streamers
|
||||
self.term_streams_with_segment
|
||||
.iter()
|
||||
.map(|heap_item| (heap_item.segment_ord, heap_item.streamer.term_ord()))
|
||||
.map(|heap_item| (heap_item.segment_ord, heap_item.terms.term_ord()))
|
||||
}
|
||||
|
||||
fn advance_segments(&mut self) {
|
||||
let streamers = &mut self.current_streamers;
|
||||
let streamers = &mut self.term_streams_with_segment;
|
||||
let heap = &mut self.heap;
|
||||
for mut heap_item in streamers.drain(..) {
|
||||
if heap_item.streamer.advance() {
|
||||
if heap_item.terms.advance() {
|
||||
heap.push(heap_item);
|
||||
}
|
||||
}
|
||||
@@ -82,13 +75,13 @@ impl<'a> TermMerger<'a> {
|
||||
pub fn advance(&mut self) -> bool {
|
||||
self.advance_segments();
|
||||
if let Some(head) = self.heap.pop() {
|
||||
self.current_streamers.push(head);
|
||||
self.term_streams_with_segment.push(head);
|
||||
while let Some(next_streamer) = self.heap.peek() {
|
||||
if self.current_streamers[0].streamer.key() != next_streamer.streamer.key() {
|
||||
if self.term_streams_with_segment[0].terms.key() != next_streamer.terms.key() {
|
||||
break;
|
||||
}
|
||||
let next_heap_it = self.heap.pop().unwrap(); // safe : we peeked beforehand
|
||||
self.current_streamers.push(next_heap_it);
|
||||
self.term_streams_with_segment.push(next_heap_it);
|
||||
}
|
||||
true
|
||||
} else {
|
||||
@@ -102,6 +95,6 @@ impl<'a> TermMerger<'a> {
|
||||
/// if and only if advance() has been called before
|
||||
/// and "true" was returned.
|
||||
pub fn key(&self) -> &[u8] {
|
||||
self.current_streamers[0].streamer.key()
|
||||
self.term_streams_with_segment[0].terms.key()
|
||||
}
|
||||
}
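A hedged sketch of how the renamed merger is driven. The `streams` argument is assumed to be built from per-segment term dictionaries, as in `serialize_merged_dict` above; this is illustrative and not part of the diff:

fn count_merged_terms(streams: Vec<TermsWithSegmentOrd<'_>>) -> usize {
    let mut merger = TermMerger::new(streams);
    let mut num_distinct_terms = 0;
    while merger.advance() {
        // `key()` is valid until the next call to `advance()`.
        let _term: &[u8] = merger.key();
        // Every entry here names a segment containing the current term,
        // together with the term's ordinal inside that segment.
        for (_segment_ord, _term_ord) in merger.matching_segments() {}
        num_distinct_terms += 1;
    }
    num_distinct_terms
}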
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
use itertools::Itertools;
|
||||
use proptest::collection::vec;
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::*;
|
||||
use crate::{Cardinality, ColumnarWriter, HasAssociatedColumnType, RowId};
|
||||
use crate::columnar::{merge_columnar, ColumnarReader, MergeRowOrder, StackMergeOrder};
|
||||
use crate::{Cardinality, ColumnarWriter, DynamicColumn, HasAssociatedColumnType, RowId};
|
||||
|
||||
fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
|
||||
column_name: &str,
|
||||
@@ -12,7 +17,7 @@ fn make_columnar<T: Into<NumericalValue> + HasAssociatedColumnType + Copy>(
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(vals.len() as RowId, None, &mut buffer)
|
||||
.serialize(vals.len() as RowId, &mut buffer)
|
||||
.unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
@@ -24,9 +29,8 @@ fn test_column_coercion_to_u64() {
|
||||
// u64 type
|
||||
let columnar2 = make_columnar("numbers", &[u64::MAX]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
|
||||
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
|
||||
group_columns_for_merge(columnars, &[]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
|
||||
}
|
||||
@@ -36,9 +40,8 @@ fn test_column_coercion_to_i64() {
|
||||
let columnar1 = make_columnar("numbers", &[-1i64]);
|
||||
let columnar2 = make_columnar("numbers", &[2u64]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
|
||||
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
|
||||
group_columns_for_merge(columnars, &[]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
|
||||
}
|
||||
@@ -61,14 +64,8 @@ fn test_group_columns_with_required_column() {
|
||||
let columnar1 = make_columnar("numbers", &[1i64]);
|
||||
let columnar2 = make_columnar("numbers", &[2u64]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
|
||||
group_columns_for_merge(
|
||||
&[&columnar1, &columnar2],
|
||||
&[("numbers".to_string(), ColumnType::U64)],
|
||||
&merge_order,
|
||||
)
|
||||
.unwrap();
|
||||
group_columns_for_merge(columnars, &[("numbers".to_string(), ColumnType::U64)]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
|
||||
}
|
||||
@@ -78,13 +75,9 @@ fn test_group_columns_required_column_with_no_existing_columns() {
|
||||
let columnar1 = make_columnar("numbers", &[2u64]);
|
||||
let columnar2 = make_columnar("numbers", &[2u64]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<_, _> = group_columns_for_merge(
|
||||
columnars,
|
||||
&[("required_col".to_string(), ColumnType::Str)],
|
||||
&merge_order,
|
||||
)
|
||||
.unwrap();
|
||||
let column_map: BTreeMap<_, _> =
|
||||
group_columns_for_merge(columnars, &[("required_col".to_string(), ColumnType::Str)])
|
||||
.unwrap();
|
||||
assert_eq!(column_map.len(), 2);
|
||||
let columns = &column_map
|
||||
.get(&("required_col".to_string(), ColumnTypeCategory::Str))
|
||||
@@ -100,14 +93,8 @@ fn test_group_columns_required_column_is_above_all_columns_have_the_same_type_ru
|
||||
let columnar1 = make_columnar("numbers", &[2i64]);
|
||||
let columnar2 = make_columnar("numbers", &[2i64]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
|
||||
group_columns_for_merge(
|
||||
columnars,
|
||||
&[("numbers".to_string(), ColumnType::U64)],
|
||||
&merge_order,
|
||||
)
|
||||
.unwrap();
|
||||
group_columns_for_merge(columnars, &[("numbers".to_string(), ColumnType::U64)]).unwrap();
|
||||
assert_eq!(column_map.len(), 1);
|
||||
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
|
||||
}
|
||||
@@ -117,9 +104,8 @@ fn test_missing_column() {
|
||||
let columnar1 = make_columnar("numbers", &[-1i64]);
|
||||
let columnar2 = make_columnar("numbers2", &[2u64]);
|
||||
let columnars = &[&columnar1, &columnar2];
|
||||
let merge_order = StackMergeOrder::stack(columnars).into();
|
||||
let column_map: BTreeMap<(String, ColumnTypeCategory), GroupedColumnsHandle> =
|
||||
group_columns_for_merge(columnars, &[], &merge_order).unwrap();
|
||||
group_columns_for_merge(columnars, &[]).unwrap();
|
||||
assert_eq!(column_map.len(), 2);
|
||||
assert!(column_map.contains_key(&("numbers".to_string(), ColumnTypeCategory::Numerical)));
|
||||
{
|
||||
@@ -157,9 +143,7 @@ fn make_numerical_columnar_multiple_columns(
|
||||
.max()
|
||||
.unwrap_or(0u32);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -182,9 +166,7 @@ fn make_byte_columnar_multiple_columns(
|
||||
}
|
||||
}
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -203,9 +185,7 @@ fn make_text_columnar_multiple_columns(columns: &[(&str, &[&[&str]])]) -> Column
|
||||
.max()
|
||||
.unwrap_or(0u32);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer
|
||||
.serialize(num_rows, None, &mut buffer)
|
||||
.unwrap();
|
||||
dataframe_writer.serialize(num_rows, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
@@ -228,7 +208,7 @@ fn test_merge_columnar_numbers() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 3);
|
||||
assert_eq!(columnar_reader.num_docs(), 3);
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("numbers").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
@@ -256,7 +236,7 @@ fn test_merge_columnar_texts() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 3);
|
||||
assert_eq!(columnar_reader.num_docs(), 3);
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("texts").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
@@ -305,7 +285,7 @@ fn test_merge_columnar_byte() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 4);
|
||||
assert_eq!(columnar_reader.num_docs(), 4);
|
||||
assert_eq!(columnar_reader.num_columns(), 1);
|
||||
let cols = columnar_reader.read_columns("bytes").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
@@ -361,7 +341,7 @@ fn test_merge_columnar_byte_with_missing() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 3 + 2 + 3);
|
||||
assert_eq!(columnar_reader.num_docs(), 3 + 2 + 3);
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let cols = columnar_reader.read_columns("col").unwrap();
|
||||
let dynamic_column = cols[0].open().unwrap();
|
||||
@@ -413,7 +393,7 @@ fn test_merge_columnar_different_types() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 4);
|
||||
assert_eq!(columnar_reader.num_docs(), 4);
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let cols = columnar_reader.read_columns("mixed").unwrap();
|
||||
|
||||
@@ -423,11 +403,11 @@ fn test_merge_columnar_different_types() {
|
||||
panic!()
|
||||
};
|
||||
assert_eq!(vals.get_cardinality(), Cardinality::Optional);
|
||||
assert_eq!(vals.values_for_doc(0).collect_vec(), vec![]);
|
||||
assert_eq!(vals.values_for_doc(1).collect_vec(), vec![]);
|
||||
assert_eq!(vals.values_for_doc(2).collect_vec(), vec![]);
|
||||
assert_eq!(vals.values_for_doc(0).collect_vec(), Vec::<i64>::new());
|
||||
assert_eq!(vals.values_for_doc(1).collect_vec(), Vec::<i64>::new());
|
||||
assert_eq!(vals.values_for_doc(2).collect_vec(), Vec::<i64>::new());
|
||||
assert_eq!(vals.values_for_doc(3).collect_vec(), vec![1]);
|
||||
assert_eq!(vals.values_for_doc(4).collect_vec(), vec![]);
|
||||
assert_eq!(vals.values_for_doc(4).collect_vec(), Vec::<i64>::new());
|
||||
|
||||
// text column
|
||||
let dynamic_column = cols[1].open().unwrap();
|
||||
@@ -478,7 +458,7 @@ fn test_merge_columnar_different_empty_cardinality() {
|
||||
)
|
||||
.unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_rows(), 2);
|
||||
assert_eq!(columnar_reader.num_docs(), 2);
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let cols = columnar_reader.read_columns("mixed").unwrap();
|
||||
|
||||
@@ -490,3 +470,119 @@ fn test_merge_columnar_different_empty_cardinality() {
|
||||
let dynamic_column = cols[1].open().unwrap();
|
||||
assert_eq!(dynamic_column.get_cardinality(), Cardinality::Optional);
|
||||
}
|
||||
|
||||
#[derive(Debug, Clone)]
|
||||
struct ColumnSpec {
|
||||
column_name: String,
|
||||
/// (row_id, term)
|
||||
terms: Vec<(RowId, Vec<u8>)>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
struct ColumnarSpec {
|
||||
columns: Vec<ColumnSpec>,
|
||||
}
|
||||
|
||||
/// Generate a random (row_id, term) pair:
|
||||
/// - row_id in [0..10]
|
||||
/// - term is either from POSSIBLE_TERMS or random bytes
|
||||
fn rowid_and_term_strategy() -> impl Strategy<Value = (RowId, Vec<u8>)> {
|
||||
const POSSIBLE_TERMS: &[&[u8]] = &[b"a", b"b", b"allo"];
|
||||
|
||||
let term_strat = prop_oneof![
|
||||
// pick from the fixed list
|
||||
(0..POSSIBLE_TERMS.len()).prop_map(|i| POSSIBLE_TERMS[i].to_vec()),
|
||||
// or random bytes (length 0..10)
|
||||
prop::collection::vec(any::<u8>(), 0..10),
|
||||
];
|
||||
|
||||
(0u32..11, term_strat)
|
||||
}
|
||||
|
||||
/// Generate one ColumnSpec, with a random name and a random list of (row_id, term).
|
||||
/// We sort it by row_id so that data is in ascending order.
|
||||
fn column_spec_strategy() -> impl Strategy<Value = ColumnSpec> {
|
||||
let column_name = prop_oneof![
|
||||
Just("col".to_string()),
|
||||
Just("col2".to_string()),
|
||||
"col.*".prop_map(|s| s),
|
||||
];
|
||||
|
||||
// We'll produce 0..8 (rowid,term) entries for this column
|
||||
let data_strat = vec(rowid_and_term_strategy(), 0..8).prop_map(|mut pairs| {
|
||||
// Sort by row_id
|
||||
pairs.sort_by_key(|(row_id, _)| *row_id);
|
||||
pairs
|
||||
});
|
||||
|
||||
(column_name, data_strat).prop_map(|(name, data)| ColumnSpec {
|
||||
column_name: name,
|
||||
terms: data,
|
||||
})
|
||||
}
|
||||
|
||||
/// Strategy to generate an ColumnarSpec
|
||||
fn columnar_strategy() -> impl Strategy<Value = ColumnarSpec> {
|
||||
vec(column_spec_strategy(), 0..3).prop_map(|columns| ColumnarSpec { columns })
|
||||
}
|
||||
|
||||
/// Strategy to generate multiple ColumnarSpecs, each of which we will treat
|
||||
/// as one "columnar" to be merged together.
|
||||
fn columnars_strategy() -> impl Strategy<Value = Vec<ColumnarSpec>> {
|
||||
vec(columnar_strategy(), 1..4)
|
||||
}
|
||||
|
||||
/// Build a `ColumnarReader` from a `ColumnarSpec`
|
||||
fn build_columnar(spec: &ColumnarSpec) -> ColumnarReader {
|
||||
let mut writer = ColumnarWriter::default();
|
||||
let mut max_row_id = 0;
|
||||
for col in &spec.columns {
|
||||
for &(row_id, ref term) in &col.terms {
|
||||
writer.record_bytes(row_id, &col.column_name, term);
|
||||
max_row_id = max_row_id.max(row_id);
|
||||
}
|
||||
}
|
||||
|
||||
let mut buffer = Vec::new();
|
||||
writer.serialize(max_row_id + 1, &mut buffer).unwrap();
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
proptest! {
|
||||
// We just test that the merge_columnar function doesn't crash.
|
||||
#![proptest_config(ProptestConfig::with_cases(256))]
|
||||
#[test]
|
||||
fn test_merge_columnar_bytes_no_crash(columnars in columnars_strategy(), second_merge_columnars in columnars_strategy()) {
|
||||
let columnars: Vec<ColumnarReader> = columnars.iter()
|
||||
.map(build_columnar)
|
||||
.collect();
|
||||
|
||||
let mut out = Vec::new();
|
||||
let columnar_refs: Vec<&ColumnarReader> = columnars.iter().collect();
|
||||
let stack_merge_order = StackMergeOrder::stack(&columnar_refs);
|
||||
merge_columnar(
|
||||
&columnar_refs,
|
||||
&[],
|
||||
MergeRowOrder::Stack(stack_merge_order),
|
||||
&mut out,
|
||||
).unwrap();
|
||||
|
||||
let merged_reader = ColumnarReader::open(out).unwrap();
|
||||
|
||||
// Merge the second set of columnars with the result of the first merge
|
||||
let mut columnars: Vec<ColumnarReader> = second_merge_columnars.iter()
|
||||
.map(build_columnar)
|
||||
.collect();
|
||||
columnars.push(merged_reader);
|
||||
let mut out = Vec::new();
|
||||
let columnar_refs: Vec<&ColumnarReader> = columnars.iter().collect();
|
||||
let stack_merge_order = StackMergeOrder::stack(&columnar_refs);
|
||||
merge_columnar(
|
||||
&columnar_refs,
|
||||
&[],
|
||||
MergeRowOrder::Stack(stack_merge_order),
|
||||
&mut out,
|
||||
).unwrap();
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -5,6 +5,7 @@ mod reader;
|
||||
mod writer;
|
||||
|
||||
pub use column_type::{ColumnType, HasAssociatedColumnType};
|
||||
pub use format_version::{Version, CURRENT_VERSION};
|
||||
#[cfg(test)]
|
||||
pub(crate) use merge::ColumnTypeCategory;
|
||||
pub use merge::{merge_columnar, MergeRowOrder, ShuffleMergeOrder, StackMergeOrder};
|
||||
|
||||
@@ -1,12 +1,13 @@
|
||||
use std::{fmt, io, mem};
|
||||
|
||||
use common::file_slice::FileSlice;
|
||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
|
||||
use common::BinarySerializable;
|
||||
use sstable::{Dictionary, RangeSSTable};
|
||||
|
||||
use crate::columnar::{format_version, ColumnType};
|
||||
use crate::dynamic_column::DynamicColumnHandle;
|
||||
use crate::RowId;
|
||||
use crate::{RowId, Version};
|
||||
|
||||
fn io_invalid_data(msg: String) -> io::Error {
|
||||
io::Error::new(io::ErrorKind::InvalidData, msg)
|
||||
@@ -18,12 +19,13 @@ fn io_invalid_data(msg: String) -> io::Error {
|
||||
pub struct ColumnarReader {
|
||||
column_dictionary: Dictionary<RangeSSTable>,
|
||||
column_data: FileSlice,
|
||||
num_rows: RowId,
|
||||
num_docs: RowId,
|
||||
format_version: Version,
|
||||
}
|
||||
|
||||
impl fmt::Debug for ColumnarReader {
|
||||
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
||||
let num_rows = self.num_rows();
|
||||
let num_rows = self.num_docs();
|
||||
let columns = self.list_columns().unwrap();
|
||||
let num_cols = columns.len();
|
||||
let mut debug_struct = f.debug_struct("Columnar");
|
||||
@@ -53,6 +55,7 @@ impl fmt::Debug for ColumnarReader {
|
||||
fn read_all_columns_in_stream(
|
||||
mut stream: sstable::Streamer<'_, RangeSSTable>,
|
||||
column_data: &FileSlice,
|
||||
format_version: Version,
|
||||
) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let mut results = Vec::new();
|
||||
while stream.advance() {
|
||||
@@ -67,12 +70,26 @@ fn read_all_columns_in_stream(
|
||||
let dynamic_column_handle = DynamicColumnHandle {
|
||||
file_slice,
|
||||
column_type,
|
||||
format_version,
|
||||
};
|
||||
results.push(dynamic_column_handle);
|
||||
}
|
||||
Ok(results)
|
||||
}
|
||||
|
||||
fn column_dictionary_prefix_for_column_name(column_name: &str) -> String {
    // Each column is a associated to a given `column_key`,
    // that starts by `column_name\0column_header`.
    //
    // Listing the columns associated to the given column name is therefore equivalent to
    // listing `column_key` with the prefix `column_name\0`.
    format!("{}{}", column_name, '\0')
}

fn column_dictionary_prefix_for_subpath(root_path: &str) -> String {
    format!("{}{}", root_path, JSON_PATH_SEGMENT_SEP as char)
}
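A small sketch of the prefixes these two helpers produce, for a hypothetical column named "price" (illustrative names only):

let prefix = column_dictionary_prefix_for_column_name("price");
assert_eq!(prefix.as_bytes(), b"price\0");
// Sub-columns of a JSON root share `root_path` followed by JSON_PATH_SEGMENT_SEP,
// so `read_subpath_columns` below can also be answered with a prefix scan.
let subpath_prefix = column_dictionary_prefix_for_subpath("price");
assert_eq!(subpath_prefix.len(), "price".len() + 1);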
|
||||
|
||||
impl ColumnarReader {
|
||||
/// Opens a new Columnar file.
|
||||
pub fn open<F>(file_slice: F) -> io::Result<ColumnarReader>
|
||||
@@ -88,19 +105,20 @@ impl ColumnarReader {
|
||||
let num_rows = u32::deserialize(&mut &footer_bytes[8..12])?;
|
||||
let version_footer_bytes: [u8; format_version::VERSION_FOOTER_NUM_BYTES] =
|
||||
footer_bytes[12..].try_into().unwrap();
|
||||
let _version = format_version::parse_footer(version_footer_bytes)?;
|
||||
let format_version = format_version::parse_footer(version_footer_bytes)?;
|
||||
let (column_data, sstable) =
|
||||
file_slice_without_sstable_len.split_from_end(sstable_len as usize);
|
||||
let column_dictionary = Dictionary::open(sstable)?;
|
||||
Ok(ColumnarReader {
|
||||
column_dictionary,
|
||||
column_data,
|
||||
num_rows,
|
||||
num_docs: num_rows,
|
||||
format_version,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn num_rows(&self) -> RowId {
|
||||
self.num_rows
|
||||
pub fn num_docs(&self) -> RowId {
|
||||
self.num_docs
|
||||
}
|
||||
// Iterate over the columns in a sorted way
|
||||
pub fn iter_columns(
|
||||
@@ -126,6 +144,7 @@ impl ColumnarReader {
|
||||
let column_handle = DynamicColumnHandle {
|
||||
file_slice,
|
||||
column_type,
|
||||
format_version: self.format_version,
|
||||
};
|
||||
Some((column_name, column_handle))
|
||||
} else {
|
||||
@@ -139,35 +158,17 @@ impl ColumnarReader {
|
||||
Ok(self.iter_columns()?.collect())
|
||||
}
|
||||
|
||||
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
|
||||
// Each column is a associated to a given `column_key`,
|
||||
// that starts by `column_name\0column_header`.
|
||||
//
|
||||
// Listing the columns associated to the given column name is therefore equivalent to
|
||||
// listing `column_key` with the prefix `column_name\0`.
|
||||
//
|
||||
// This is in turn equivalent to searching for the range
|
||||
// `[column_name,\0`..column_name\1)`.
|
||||
// TODO can we get some more generic `prefix(..)` logic in the dictionary.
|
||||
let mut start_key = column_name.to_string();
|
||||
start_key.push('\0');
|
||||
let mut end_key = column_name.to_string();
|
||||
end_key.push(1u8 as char);
|
||||
self.column_dictionary
|
||||
.range()
|
||||
.ge(start_key.as_bytes())
|
||||
.lt(end_key.as_bytes())
|
||||
}
|
||||
|
||||
pub async fn read_columns_async(
|
||||
&self,
|
||||
column_name: &str,
|
||||
) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let prefix = column_dictionary_prefix_for_column_name(column_name);
|
||||
let stream = self
|
||||
.stream_for_column_range(column_name)
|
||||
.column_dictionary
|
||||
.prefix_range(prefix)
|
||||
.into_stream_async()
|
||||
.await?;
|
||||
read_all_columns_in_stream(stream, &self.column_data)
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
/// Get all columns for the given column name.
|
||||
@@ -175,8 +176,36 @@ impl ColumnarReader {
|
||||
/// There can be more than one column associated to a given column name, provided they have
|
||||
/// different types.
|
||||
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let stream = self.stream_for_column_range(column_name).into_stream()?;
|
||||
read_all_columns_in_stream(stream, &self.column_data)
|
||||
let prefix = column_dictionary_prefix_for_column_name(column_name);
|
||||
let stream = self.column_dictionary.prefix_range(prefix).into_stream()?;
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
pub async fn read_subpath_columns_async(
|
||||
&self,
|
||||
root_path: &str,
|
||||
) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let prefix = column_dictionary_prefix_for_subpath(root_path);
|
||||
let stream = self
|
||||
.column_dictionary
|
||||
.prefix_range(prefix)
|
||||
.into_stream_async()
|
||||
.await?;
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
/// Get all inner columns for a given JSON prefix, i.e columns for which the name starts
|
||||
/// with the prefix then contain the [`JSON_PATH_SEGMENT_SEP`].
|
||||
///
|
||||
/// There can be more than one column associated to each path within the JSON structure,
|
||||
/// provided they have different types.
|
||||
pub fn read_subpath_columns(&self, root_path: &str) -> io::Result<Vec<DynamicColumnHandle>> {
|
||||
let prefix = column_dictionary_prefix_for_subpath(root_path);
|
||||
let stream = self
|
||||
.column_dictionary
|
||||
.prefix_range(prefix.as_bytes())
|
||||
.into_stream()?;
|
||||
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
|
||||
}
|
||||
|
||||
/// Return the number of columns in the columnar.
|
||||
@@ -187,6 +216,8 @@ impl ColumnarReader {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
|
||||
|
||||
use crate::{ColumnType, ColumnarReader, ColumnarWriter};
|
||||
|
||||
#[test]
|
||||
@@ -195,7 +226,7 @@ mod tests {
|
||||
columnar_writer.record_column_type("col1", ColumnType::Str, false);
|
||||
columnar_writer.record_column_type("col2", ColumnType::U64, false);
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(1, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(1, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
let columns = columnar.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 2);
|
||||
@@ -211,7 +242,7 @@ mod tests {
|
||||
columnar_writer.record_column_type("count", ColumnType::U64, false);
|
||||
columnar_writer.record_numerical(1, "count", 1u64);
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(2, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(2, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
let columns = columnar.list_columns().unwrap();
|
||||
assert_eq!(columns.len(), 1);
|
||||
@@ -219,6 +250,64 @@ mod tests {
|
||||
assert_eq!(columns[0].1.column_type(), ColumnType::U64);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_columns() {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
columnar_writer.record_column_type("col", ColumnType::U64, false);
|
||||
columnar_writer.record_numerical(1, "col", 1u64);
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(2, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
{
|
||||
let columns = columnar.read_columns("col").unwrap();
|
||||
assert_eq!(columns.len(), 1);
|
||||
assert_eq!(columns[0].column_type(), ColumnType::U64);
|
||||
}
|
||||
{
|
||||
let columns = columnar.read_columns("other").unwrap();
|
||||
assert_eq!(columns.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_read_subpath_columns() {
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
columnar_writer.record_str(
|
||||
0,
|
||||
&format!("col1{}subcol1", JSON_PATH_SEGMENT_SEP as char),
|
||||
"hello",
|
||||
);
|
||||
columnar_writer.record_numerical(
|
||||
0,
|
||||
&format!("col1{}subcol2", JSON_PATH_SEGMENT_SEP as char),
|
||||
1i64,
|
||||
);
|
||||
columnar_writer.record_str(1, "col1", "hello");
|
||||
columnar_writer.record_str(0, "col2", "hello");
|
||||
let mut buffer = Vec::new();
|
||||
columnar_writer.serialize(2, &mut buffer).unwrap();
|
||||
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
{
|
||||
let columns = columnar.read_subpath_columns("col1").unwrap();
|
||||
assert_eq!(columns.len(), 2);
|
||||
assert_eq!(columns[0].column_type(), ColumnType::Str);
|
||||
assert_eq!(columns[1].column_type(), ColumnType::I64);
|
||||
}
|
||||
{
|
||||
let columns = columnar.read_subpath_columns("col1.subcol1").unwrap();
|
||||
assert_eq!(columns.len(), 0);
|
||||
}
|
||||
{
|
||||
let columns = columnar.read_subpath_columns("col2").unwrap();
|
||||
assert_eq!(columns.len(), 0);
|
||||
}
|
||||
{
|
||||
let columns = columnar.read_subpath_columns("other").unwrap();
|
||||
assert_eq!(columns.len(), 0);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "Input type forbidden")]
|
||||
fn test_list_columns_strict_typing_panics_on_wrong_types() {
|
||||
|
||||
@@ -87,7 +87,7 @@ impl<V: SymbolValue> ColumnOperation<V> {
|
||||
minibuf
|
||||
}
|
||||
|
||||
/// Deserialize a colummn operation.
|
||||
/// Deserialize a column operation.
|
||||
/// Returns None if the buffer is empty.
|
||||
///
|
||||
/// Panics if the payload is invalid:
|
||||
@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
|
||||
// In order to limit memory usage, and in order
|
||||
// to benefit from the stacker, we do this by serialization our data
|
||||
// as "Symbols".
|
||||
#[allow(clippy::from_over_into)]
|
||||
pub(super) trait SymbolValue: Clone + Copy {
|
||||
// Serializes the symbol into the given buffer.
|
||||
// Returns the number of bytes written into the buffer.
|
||||
|
||||
@@ -41,31 +41,10 @@ impl ColumnWriter {
|
||||
pub(super) fn operation_iterator<'a, V: SymbolValue>(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids_opt: Option<&[RowId]>,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<V>> + 'a {
|
||||
buffer.clear();
|
||||
self.values.read_to_end(arena, buffer);
|
||||
if let Some(old_to_new_ids) = old_to_new_ids_opt {
|
||||
// TODO avoid the extra deserialization / serialization.
|
||||
let mut sorted_ops: Vec<(RowId, ColumnOperation<V>)> = Vec::new();
|
||||
let mut new_doc = 0u32;
|
||||
let mut cursor = &buffer[..];
|
||||
for op in std::iter::from_fn(|| ColumnOperation::<V>::deserialize(&mut cursor)) {
|
||||
if let ColumnOperation::NewDoc(doc) = &op {
|
||||
new_doc = old_to_new_ids[*doc as usize];
|
||||
sorted_ops.push((new_doc, ColumnOperation::NewDoc(new_doc)));
|
||||
} else {
|
||||
sorted_ops.push((new_doc, op));
|
||||
}
|
||||
}
|
||||
// stable sort is crucial here.
|
||||
sorted_ops.sort_by_key(|(new_doc_id, _)| *new_doc_id);
|
||||
buffer.clear();
|
||||
for (_, op) in sorted_ops {
|
||||
buffer.extend_from_slice(op.serialize().as_ref());
|
||||
}
|
||||
}
|
||||
let mut cursor: &[u8] = &buffer[..];
|
||||
std::iter::from_fn(move || ColumnOperation::deserialize(&mut cursor))
|
||||
}
|
||||
@@ -231,11 +210,9 @@ impl NumericalColumnWriter {
|
||||
pub(super) fn operation_iterator<'a>(
|
||||
self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids: Option<&[RowId]>,
|
||||
buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<NumericalValue>> + 'a {
|
||||
self.column_writer
|
||||
.operation_iterator(arena, old_to_new_ids, buffer)
|
||||
self.column_writer.operation_iterator(arena, buffer)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -277,11 +254,9 @@ impl StrOrBytesColumnWriter {
|
||||
pub(super) fn operation_iterator<'a>(
|
||||
&self,
|
||||
arena: &MemoryArena,
|
||||
old_to_new_ids: Option<&[RowId]>,
|
||||
byte_buffer: &'a mut Vec<u8>,
|
||||
) -> impl Iterator<Item = ColumnOperation<UnorderedId>> + 'a {
|
||||
self.column_writer
|
||||
.operation_iterator(arena, old_to_new_ids, byte_buffer)
|
||||
self.column_writer.operation_iterator(arena, byte_buffer)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -8,11 +8,12 @@ use std::net::Ipv6Addr;
|
||||
|
||||
use column_operation::ColumnOperation;
|
||||
pub(crate) use column_writers::CompatibleNumericalTypes;
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::CountingWriter;
|
||||
pub(crate) use serializer::ColumnarSerializer;
|
||||
use stacker::{Addr, ArenaHashMap, MemoryArena};
|
||||
|
||||
use crate::column_index::SerializableColumnIndex;
|
||||
use crate::column_index::{SerializableColumnIndex, SerializableOptionalIndex};
|
||||
use crate::column_values::{MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use crate::columnar::column_type::ColumnType;
|
||||
use crate::columnar::writer::column_writers::{
|
||||
@@ -43,7 +44,7 @@ struct SpareBuffers {
|
||||
/// columnar_writer.record_str(1u32 /* doc id */, "product_name", "Apple");
|
||||
/// columnar_writer.record_numerical(0u32 /* doc id */, "price", 10.5f64); //< uh oh we ended up mixing integer and floats.
|
||||
/// let mut wrt: Vec<u8> = Vec::new();
|
||||
/// columnar_writer.serialize(2u32, None, &mut wrt).unwrap();
|
||||
/// columnar_writer.serialize(2u32, &mut wrt).unwrap();
|
||||
/// ```
|
||||
#[derive(Default)]
|
||||
pub struct ColumnarWriter {
|
||||
@@ -75,63 +76,6 @@ impl ColumnarWriter {
|
||||
.sum::<usize>()
|
||||
}
|
||||
|
||||
/// Returns the list of doc ids from 0..num_docs sorted by the `sort_field`
|
||||
/// column.
|
||||
///
|
||||
/// If the column is multivalued, use the first value for scoring.
|
||||
/// If no value is associated to a specific row, the document is assigned
|
||||
/// the lowest possible score.
|
||||
///
|
||||
/// The sort applied is stable.
|
||||
pub fn sort_order(&self, sort_field: &str, num_docs: RowId, reversed: bool) -> Vec<u32> {
|
||||
let Some(numerical_col_writer) = self
|
||||
.numerical_field_hash_map
|
||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
||||
.or_else(|| {
|
||||
self.datetime_field_hash_map
|
||||
.get::<NumericalColumnWriter>(sort_field.as_bytes())
|
||||
})
|
||||
else {
|
||||
return Vec::new();
|
||||
};
|
||||
let mut symbols_buffer = Vec::new();
|
||||
let mut values = Vec::new();
|
||||
let mut start_doc_check_fill = 0;
|
||||
let mut current_doc_opt: Option<RowId> = None;
|
||||
// Assumption: NewDoc will never call the same doc twice and is strictly increasing between
|
||||
// calls
|
||||
for op in numerical_col_writer.operation_iterator(&self.arena, None, &mut symbols_buffer) {
|
||||
match op {
|
||||
ColumnOperation::NewDoc(doc) => {
|
||||
current_doc_opt = Some(doc);
|
||||
}
|
||||
ColumnOperation::Value(numerical_value) => {
|
||||
if let Some(current_doc) = current_doc_opt {
|
||||
// Fill up with 0.0 since last doc
|
||||
values.extend((start_doc_check_fill..current_doc).map(|doc| (0.0, doc)));
|
||||
start_doc_check_fill = current_doc + 1;
|
||||
// handle multi values
|
||||
current_doc_opt = None;
|
||||
|
||||
let score: f32 = f64::coerce(numerical_value) as f32;
|
||||
values.push((score, current_doc));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
for doc in values.len() as u32..num_docs {
|
||||
values.push((0.0f32, doc));
|
||||
}
|
||||
values.sort_by(|(left_score, _), (right_score, _)| {
|
||||
if reversed {
|
||||
right_score.total_cmp(left_score)
|
||||
} else {
|
||||
left_score.total_cmp(right_score)
|
||||
}
|
||||
});
|
||||
values.into_iter().map(|(_score, doc)| doc).collect()
|
||||
}
|
||||
|
||||
/// Records a column type. This is useful to bypass the coercion process,
|
||||
/// makes sure the empty is present in the resulting columnar, or set
|
||||
/// the `sort_values_within_row`.
|
||||
@@ -302,13 +246,9 @@ impl ColumnarWriter {
|
||||
},
|
||||
);
|
||||
}
|
||||
pub fn serialize(
|
||||
&mut self,
|
||||
num_docs: RowId,
|
||||
old_to_new_row_ids: Option<&[RowId]>,
|
||||
wrt: &mut dyn io::Write,
|
||||
) -> io::Result<()> {
|
||||
pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
|
||||
let mut serializer = ColumnarSerializer::new(wrt);
|
||||
|
||||
let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
|
||||
.numerical_field_hash_map
|
||||
.iter()
|
||||
@@ -322,7 +262,7 @@ impl ColumnarWriter {
|
||||
columns.extend(
|
||||
self.bytes_field_hash_map
|
||||
.iter()
|
||||
.map(|(term, addr)| (term, ColumnType::Bytes, addr)),
|
||||
.map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
|
||||
);
|
||||
columns.extend(
|
||||
self.str_field_hash_map
|
||||
@@ -345,10 +285,15 @@ impl ColumnarWriter {
|
||||
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
|
||||
);
|
||||
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
|
||||
|
||||
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
|
||||
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
|
||||
for (column_name, column_type, addr) in columns {
|
||||
if column_name.contains(&JSON_END_OF_PATH) {
|
||||
// Tantivy uses b'0' as a separator for nested fields in JSON.
|
||||
// Column names with a b'0' are not simply ignored by the columnar (and the inverted
|
||||
// index).
|
||||
continue;
|
||||
}
|
||||
match column_type {
|
||||
ColumnType::Bool => {
|
||||
let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
|
||||
@@ -358,11 +303,7 @@ impl ColumnarWriter {
|
||||
serialize_bool_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -376,11 +317,7 @@ impl ColumnarWriter {
|
||||
serialize_ip_addr_column(
|
||||
cardinality,
|
||||
num_docs,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -405,11 +342,8 @@ impl ColumnarWriter {
|
||||
num_docs,
|
||||
str_or_bytes_column_writer.sort_values_within_row,
|
||||
dictionary_builder,
|
||||
str_or_bytes_column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
str_or_bytes_column_writer
|
||||
.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&self.arena,
|
||||
&mut column_serializer,
|
||||
@@ -427,11 +361,7 @@ impl ColumnarWriter {
|
||||
cardinality,
|
||||
num_docs,
|
||||
numerical_type,
|
||||
numerical_column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
numerical_column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -446,11 +376,7 @@ impl ColumnarWriter {
|
||||
cardinality,
|
||||
num_docs,
|
||||
NumericalType::I64,
|
||||
column_writer.operation_iterator(
|
||||
arena,
|
||||
old_to_new_row_ids,
|
||||
&mut symbol_byte_buffer,
|
||||
),
|
||||
column_writer.operation_iterator(arena, &mut symbol_byte_buffer),
|
||||
buffers,
|
||||
&mut column_serializer,
|
||||
)?;
|
||||
@@ -465,7 +391,7 @@ impl ColumnarWriter {
|
||||
|
||||
// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
|
||||
// Column: [Column Index, Column Values, column index num bytes U32::LE]
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[expect(clippy::too_many_arguments)]
|
||||
fn serialize_bytes_or_str_column(
|
||||
cardinality: Cardinality,
|
||||
num_docs: RowId,
|
||||
@@ -635,16 +561,16 @@ fn send_to_serialize_column_mappable_to_u128<
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
num_rows,
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
}
|
||||
})
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_mappable_to_u128(
|
||||
@@ -655,15 +581,6 @@ fn send_to_serialize_column_mappable_to_u128<
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_values_within_row_in_place(multivalued_index: &[RowId], values: &mut [u64]) {
|
||||
let mut start_index: usize = 0;
|
||||
for end_index in multivalued_index.iter().copied() {
|
||||
let end_index = end_index as usize;
|
||||
values[start_index..end_index].sort_unstable();
|
||||
start_index = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
fn send_to_serialize_column_mappable_to_u64(
|
||||
op_iterator: impl Iterator<Item = ColumnOperation<u64>>,
|
||||
cardinality: Cardinality,
|
||||
@@ -687,19 +604,22 @@ fn send_to_serialize_column_mappable_to_u64(
|
||||
let optional_index_builder = value_index_builders.borrow_optional_index_builder();
|
||||
consume_operation_iterator(op_iterator, optional_index_builder, values);
|
||||
let optional_index = optional_index_builder.finish(num_rows);
|
||||
SerializableColumnIndex::Optional {
|
||||
SerializableColumnIndex::Optional(SerializableOptionalIndex {
|
||||
non_null_row_ids: Box::new(optional_index),
|
||||
num_rows,
|
||||
}
|
||||
})
|
||||
}
|
||||
Cardinality::Multivalued => {
|
||||
let multivalued_index_builder = value_index_builders.borrow_multivalued_index_builder();
|
||||
consume_operation_iterator(op_iterator, multivalued_index_builder, values);
|
||||
let multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
let serializable_multivalued_index = multivalued_index_builder.finish(num_rows);
|
||||
if sort_values_within_row {
|
||||
sort_values_within_row_in_place(multivalued_index, values);
|
||||
sort_values_within_row_in_place(
|
||||
serializable_multivalued_index.start_offsets.boxed_iter(),
|
||||
values,
|
||||
);
|
||||
}
|
||||
SerializableColumnIndex::Multivalued(Box::new(multivalued_index))
|
||||
SerializableColumnIndex::Multivalued(serializable_multivalued_index)
|
||||
}
|
||||
};
|
||||
crate::column::serialize_column_mappable_to_u64(
|
||||
@@ -710,6 +630,18 @@ fn send_to_serialize_column_mappable_to_u64(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn sort_values_within_row_in_place(
|
||||
multivalued_index: impl Iterator<Item = RowId>,
|
||||
values: &mut [u64],
|
||||
) {
|
||||
let mut start_index: usize = 0;
|
||||
for end_index in multivalued_index {
|
||||
let end_index = end_index as usize;
|
||||
values[start_index..end_index].sort_unstable();
|
||||
start_index = end_index;
|
||||
}
|
||||
}
|
||||
|
||||
fn coerce_numerical_symbol<T>(
|
||||
operation_iterator: impl Iterator<Item = ColumnOperation<NumericalValue>>,
|
||||
) -> impl Iterator<Item = ColumnOperation<u64>>
|
||||
@@ -757,7 +689,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Full);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 6);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
@@ -786,7 +718,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(3), Cardinality::Optional);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 4);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(1u32)));
|
||||
@@ -809,7 +741,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(2), Cardinality::Optional);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 2);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
@@ -828,7 +760,7 @@ mod tests {
|
||||
assert_eq!(column_writer.get_cardinality(1), Cardinality::Multivalued);
|
||||
let mut buffer = Vec::new();
|
||||
let symbols: Vec<ColumnOperation<NumericalValue>> = column_writer
|
||||
.operation_iterator(&arena, None, &mut buffer)
|
||||
.operation_iterator(&arena, &mut buffer)
|
||||
.collect();
|
||||
assert_eq!(symbols.len(), 3);
|
||||
assert!(matches!(symbols[0], ColumnOperation::NewDoc(0u32)));
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
use std::io;
|
||||
use std::io::Write;
|
||||
|
||||
use common::json_path_writer::JSON_END_OF_PATH;
|
||||
use common::{BinarySerializable, CountingWriter};
|
||||
use sstable::value::RangeValueWriter;
|
||||
use sstable::RangeSSTable;
|
||||
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
/// code.
fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
    buffer.clear();
    // Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
    if key.contains(&0u8) {
        buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
    } else {
        buffer.extend_from_slice(key);
    }
    buffer.push(0u8);
    buffer.extend_from_slice(key);
    buffer.push(JSON_END_OF_PATH);
    buffer.push(column_type.to_code());
}
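A minimal sketch of the key layout the new `prepare_key` writes, assuming `JSON_END_OF_PATH` is the single reserved separator byte imported above (illustrative only):

let mut buffer: Vec<u8> = Vec::new();
prepare_key(b"price", ColumnType::U64, &mut buffer);
// Layout: column name bytes, JSON_END_OF_PATH, then the column type code.
assert_eq!(&buffer[..5], b"price");
assert_eq!(buffer[5], JSON_END_OF_PATH);
assert_eq!(buffer[6], ColumnType::U64.to_code());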
|
||||
|
||||
@@ -71,7 +67,7 @@ pub struct ColumnSerializer<'a, W: io::Write> {
|
||||
start_offset: u64,
|
||||
}
|
||||
|
||||
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
|
||||
impl<W: io::Write> ColumnSerializer<'_, W> {
|
||||
pub fn finalize(self) -> io::Result<()> {
|
||||
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
|
||||
let byte_range = self.start_offset..end_offset;
|
||||
@@ -84,7 +80,7 @@ impl<'a, W: io::Write> ColumnSerializer<'a, W> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
|
||||
impl<W: io::Write> io::Write for ColumnSerializer<'_, W> {
|
||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||
self.columnar_serializer.wrt.write(buf)
|
||||
}
|
||||
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
|
||||
self.columnar_serializer.wrt.write_all(buf)
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_prepare_key_bytes() {
|
||||
let mut buffer: Vec<u8> = b"somegarbage".to_vec();
|
||||
prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
|
||||
assert_eq!(buffer.len(), 12);
|
||||
assert_eq!(&buffer[..10], b"root0child");
|
||||
assert_eq!(buffer[10], 0u8);
|
||||
assert_eq!(buffer[11], ColumnType::Str.to_code());
|
||||
}
|
||||
}
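The new `prepare_key` builds the sstable key for a column in three parts: the column name with any 0 bytes rewritten to the character `'0'`, then the path terminator, then the column type code, exactly the layout `test_prepare_key_bytes` checks. A standalone sketch of that layout is shown below; `JSON_END_OF_PATH` is assumed here to be the 0 byte (which the test's `buffer[10] == 0u8` assertion implies), while the real constant lives in `common::json_path_writer`.

```rust
// Standalone sketch of the key layout checked by `test_prepare_key_bytes`.
// The constant and the column-type code below are illustrative.
const JSON_END_OF_PATH: u8 = 0;

fn prepare_key_sketch(key: &[u8], column_type_code: u8, buffer: &mut Vec<u8>) {
    buffer.clear();
    // 0 bytes inside the column name would collide with the path terminator,
    // so they are rewritten to the ASCII character '0'.
    buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
    buffer.push(JSON_END_OF_PATH);
    buffer.push(column_type_code);
}

fn main() {
    let mut buffer = Vec::new();
    prepare_key_sketch(b"root\0child", 42u8, &mut buffer);
    assert_eq!(&buffer[..10], b"root0child");
    assert_eq!(buffer[10], 0u8);
    assert_eq!(buffer[11], 42u8);
}
```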
@@ -1,3 +1,4 @@
|
||||
use crate::column_index::{SerializableMultivalueIndex, SerializableOptionalIndex};
|
||||
use crate::iterable::Iterable;
|
||||
use crate::RowId;
|
||||
|
||||
@@ -59,31 +60,47 @@ impl IndexBuilder for OptionalIndexBuilder {
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct MultivaluedIndexBuilder {
|
||||
start_offsets: Vec<RowId>,
|
||||
doc_with_values: Vec<RowId>,
|
||||
start_offsets: Vec<u32>,
|
||||
total_num_vals_seen: u32,
|
||||
current_row: RowId,
|
||||
current_row_has_value: bool,
|
||||
}
|
||||
|
||||
impl MultivaluedIndexBuilder {
|
||||
pub fn finish(&mut self, num_docs: RowId) -> &[u32] {
|
||||
self.start_offsets
|
||||
.resize(num_docs as usize + 1, self.total_num_vals_seen);
|
||||
&self.start_offsets[..]
|
||||
pub fn finish(&mut self, num_docs: RowId) -> SerializableMultivalueIndex<'_> {
|
||||
self.start_offsets.push(self.total_num_vals_seen);
|
||||
let non_null_row_ids: Box<dyn Iterable<RowId>> = Box::new(&self.doc_with_values[..]);
|
||||
SerializableMultivalueIndex {
|
||||
doc_ids_with_values: SerializableOptionalIndex {
|
||||
non_null_row_ids,
|
||||
num_rows: num_docs,
|
||||
},
|
||||
start_offsets: Box::new(&self.start_offsets[..]),
|
||||
}
|
||||
}
|
||||
|
||||
fn reset(&mut self) {
|
||||
self.doc_with_values.clear();
|
||||
self.start_offsets.clear();
|
||||
self.start_offsets.push(0u32);
|
||||
self.total_num_vals_seen = 0;
|
||||
self.current_row = 0;
|
||||
self.current_row_has_value = false;
|
||||
}
|
||||
}
|
||||
|
||||
impl IndexBuilder for MultivaluedIndexBuilder {
|
||||
fn record_row(&mut self, row_id: RowId) {
|
||||
self.start_offsets
|
||||
.resize(row_id as usize + 1, self.total_num_vals_seen);
|
||||
self.current_row = row_id;
|
||||
self.current_row_has_value = false;
|
||||
}
|
||||
|
||||
fn record_value(&mut self) {
|
||||
if !self.current_row_has_value {
|
||||
self.current_row_has_value = true;
|
||||
self.doc_with_values.push(self.current_row);
|
||||
self.start_offsets.push(self.total_num_vals_seen);
|
||||
}
|
||||
self.total_num_vals_seen += 1;
|
||||
}
|
||||
}
|
||||
@@ -141,6 +158,32 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_value_index_builder_simple() {
|
||||
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
||||
{
|
||||
multivalued_value_index_builder.record_row(0u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_value();
|
||||
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
|
||||
let start_offsets: Vec<u32> = serialized_multivalue_index
|
||||
.start_offsets
|
||||
.boxed_iter()
|
||||
.collect();
|
||||
assert_eq!(&start_offsets, &[0, 2]);
|
||||
}
|
||||
multivalued_value_index_builder.reset();
|
||||
multivalued_value_index_builder.record_row(0u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_value();
|
||||
let serialized_multivalue_index = multivalued_value_index_builder.finish(1u32);
|
||||
let start_offsets: Vec<u32> = serialized_multivalue_index
|
||||
.start_offsets
|
||||
.boxed_iter()
|
||||
.collect();
|
||||
assert_eq!(&start_offsets, &[0, 2]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_multivalued_value_index_builder() {
|
||||
let mut multivalued_value_index_builder = MultivaluedIndexBuilder::default();
|
||||
@@ -149,17 +192,15 @@ mod tests {
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_row(2u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
assert_eq!(
|
||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
||||
vec![0, 0, 2, 3, 3]
|
||||
);
|
||||
multivalued_value_index_builder.reset();
|
||||
multivalued_value_index_builder.record_row(2u32);
|
||||
multivalued_value_index_builder.record_value();
|
||||
multivalued_value_index_builder.record_value();
|
||||
assert_eq!(
|
||||
multivalued_value_index_builder.finish(4u32).to_vec(),
|
||||
vec![0, 0, 0, 2, 2]
|
||||
);
|
||||
let SerializableMultivalueIndex {
|
||||
doc_ids_with_values,
|
||||
start_offsets,
|
||||
} = multivalued_value_index_builder.finish(4u32);
|
||||
assert_eq!(doc_ids_with_values.num_rows, 4u32);
|
||||
let doc_ids_with_values: Vec<u32> =
|
||||
doc_ids_with_values.non_null_row_ids.boxed_iter().collect();
|
||||
assert_eq!(&doc_ids_with_values, &[1u32, 2u32]);
|
||||
let start_offsets: Vec<u32> = start_offsets.boxed_iter().collect();
|
||||
assert_eq!(&start_offsets[..], &[0, 2, 3]);
|
||||
}
|
||||
}
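The tests above drive `MultivaluedIndexBuilder` through `record_row` / `record_value` / `finish` and check the resulting `start_offsets` plus, via `SerializableMultivalueIndex`, the list of documents that actually carry values. The sketch below reproduces that bookkeeping on plain `Vec`s under the same call sequence; the names are illustrative and the real builder additionally handles `reset` and padding up to `num_docs`.

```rust
// Minimal sketch of the bookkeeping behind `MultivaluedIndexBuilder`: a sorted
// list of doc ids with at least one value, and cumulative value counts
// (start offsets) for exactly those docs.
#[derive(Default)]
struct MultiIndexSketch {
    docs_with_values: Vec<u32>,
    start_offsets: Vec<u32>,
    total_vals: u32,
    current_doc: u32,
    current_doc_has_value: bool,
}

impl MultiIndexSketch {
    fn record_row(&mut self, doc: u32) {
        self.current_doc = doc;
        self.current_doc_has_value = false;
    }
    fn record_value(&mut self) {
        if !self.current_doc_has_value {
            self.current_doc_has_value = true;
            self.docs_with_values.push(self.current_doc);
            self.start_offsets.push(self.total_vals);
        }
        self.total_vals += 1;
    }
    fn finish(&mut self) -> (&[u32], Vec<u32>) {
        self.start_offsets.push(self.total_vals);
        (&self.docs_with_values, self.start_offsets.clone())
    }
}

fn main() {
    let mut builder = MultiIndexSketch::default();
    builder.record_row(1);
    builder.record_value();
    builder.record_value();
    builder.record_row(2);
    builder.record_value();
    let (docs, offsets) = builder.finish();
    assert_eq!(docs, &[1u32, 2u32]);
    assert_eq!(offsets, vec![0, 2, 3]); // same shape as the test above
}
```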
|
||||
|
||||
columnar/src/compat_tests.rs (new file, 183 lines)
@@ -0,0 +1,183 @@
|
||||
use std::path::PathBuf;
|
||||
|
||||
use itertools::Itertools;
|
||||
|
||||
use crate::{
|
||||
merge_columnar, Cardinality, Column, ColumnarReader, DynamicColumn, StackMergeOrder,
|
||||
CURRENT_VERSION,
|
||||
};
|
||||
|
||||
const NUM_DOCS: u32 = u16::MAX as u32;
|
||||
|
||||
fn generate_columnar(num_docs: u32, value_offset: u64) -> Vec<u8> {
|
||||
use crate::ColumnarWriter;
|
||||
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
|
||||
for i in 0..num_docs {
|
||||
if i % 100 == 0 {
|
||||
columnar_writer.record_numerical(i, "sparse", value_offset + i as u64);
|
||||
}
|
||||
if i % 5 == 0 {
|
||||
columnar_writer.record_numerical(i, "dense", value_offset + i as u64);
|
||||
}
|
||||
columnar_writer.record_numerical(i, "full", value_offset + i as u64);
|
||||
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||
columnar_writer.record_numerical(i, "multi", value_offset + i as u64);
|
||||
}
|
||||
|
||||
let mut wrt: Vec<u8> = Vec::new();
|
||||
columnar_writer.serialize(num_docs, &mut wrt).unwrap();
|
||||
|
||||
wrt
|
||||
}
|
||||
|
||||
#[test]
|
||||
/// Writes a columnar for the CURRENT_VERSION to disk.
|
||||
fn create_format() {
|
||||
let version = CURRENT_VERSION.to_string();
|
||||
let file_path = path_for_version(&version);
|
||||
if PathBuf::from(file_path.clone()).exists() {
|
||||
return;
|
||||
}
|
||||
let columnar = generate_columnar(NUM_DOCS, 0);
|
||||
std::fs::write(file_path, columnar).unwrap();
|
||||
}
|
||||
|
||||
fn path_for_version(version: &str) -> String {
|
||||
format!("./compat_tests_data/{}.columnar", version)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_v1() {
|
||||
let path = path_for_version("v1");
|
||||
test_format(&path);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_format_v2() {
|
||||
let path = path_for_version("v2");
|
||||
test_format(&path);
|
||||
}
|
||||
|
||||
fn test_format(path: &str) {
|
||||
let file_content = std::fs::read(path).unwrap();
|
||||
let reader = ColumnarReader::open(file_content).unwrap();
|
||||
|
||||
check_columns(&reader);
|
||||
|
||||
// Test merge
|
||||
let reader2 = ColumnarReader::open(generate_columnar(NUM_DOCS, NUM_DOCS as u64)).unwrap();
|
||||
let columnar_readers = vec![&reader, &reader2];
|
||||
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||
let mut out = Vec::new();
|
||||
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||
let reader = ColumnarReader::open(out).unwrap();
|
||||
check_columns(&reader);
|
||||
}
|
||||
|
||||
fn check_columns(reader: &ColumnarReader) {
|
||||
let column = open_column(reader, "full");
|
||||
check_column(&column, |doc_id| vec![(doc_id, doc_id as u64).into()]);
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Full);
|
||||
|
||||
let column = open_column(reader, "multi");
|
||||
check_column(&column, |doc_id| {
|
||||
vec![
|
||||
(doc_id * 2, doc_id as u64).into(),
|
||||
(doc_id * 2 + 1, doc_id as u64).into(),
|
||||
]
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Multivalued);
|
||||
|
||||
let column = open_column(reader, "sparse");
|
||||
check_column(&column, |doc_id| {
|
||||
if doc_id % 100 == 0 {
|
||||
vec![(doc_id / 100, doc_id as u64).into()]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||
|
||||
let column = open_column(reader, "dense");
|
||||
check_column(&column, |doc_id| {
|
||||
if doc_id % 5 == 0 {
|
||||
vec![(doc_id / 5, doc_id as u64).into()]
|
||||
} else {
|
||||
vec![]
|
||||
}
|
||||
});
|
||||
assert_eq!(column.get_cardinality(), Cardinality::Optional);
|
||||
}
|
||||
|
||||
struct RowIdAndValue {
|
||||
row_id: u32,
|
||||
value: u64,
|
||||
}
|
||||
impl From<(u32, u64)> for RowIdAndValue {
|
||||
fn from((row_id, value): (u32, u64)) -> Self {
|
||||
Self { row_id, value }
|
||||
}
|
||||
}
|
||||
|
||||
fn check_column<F: Fn(u32) -> Vec<RowIdAndValue>>(column: &Column<u64>, expected: F) {
|
||||
let num_docs = column.num_docs();
|
||||
let test_doc = |doc: u32| {
|
||||
if expected(doc).is_empty() {
|
||||
assert_eq!(column.first(doc), None);
|
||||
} else {
|
||||
assert_eq!(column.first(doc), Some(expected(doc)[0].value));
|
||||
}
|
||||
let values = column.values_for_doc(doc).collect_vec();
|
||||
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
||||
let mut row_ids = Vec::new();
|
||||
column.row_ids_for_docs(&[doc], &mut vec![], &mut row_ids);
|
||||
assert_eq!(
|
||||
row_ids,
|
||||
expected(doc).iter().map(|x| x.row_id).collect_vec()
|
||||
);
|
||||
let values = column.values_for_doc(doc).collect_vec();
|
||||
assert_eq!(values, expected(doc).iter().map(|x| x.value).collect_vec());
|
||||
|
||||
// Docid rowid conversion
|
||||
let mut row_ids = Vec::new();
|
||||
let safe_next_doc = |doc: u32| (doc + 1).min(num_docs - 1);
|
||||
column
|
||||
.index
|
||||
.docids_to_rowids(&[doc, safe_next_doc(doc)], &mut vec![], &mut row_ids);
|
||||
let expected_rowids = expected(doc)
|
||||
.iter()
|
||||
.map(|x| x.row_id)
|
||||
.chain(expected(safe_next_doc(doc)).iter().map(|x| x.row_id))
|
||||
.collect_vec();
|
||||
assert_eq!(row_ids, expected_rowids);
|
||||
let rowid_range = column
|
||||
.index
|
||||
.docid_range_to_rowids(doc..safe_next_doc(doc) + 1);
|
||||
if expected_rowids.is_empty() {
|
||||
assert!(rowid_range.is_empty());
|
||||
} else {
|
||||
assert_eq!(
|
||||
rowid_range,
|
||||
expected_rowids[0]..expected_rowids.last().unwrap() + 1
|
||||
);
|
||||
}
|
||||
};
|
||||
test_doc(0);
|
||||
test_doc(num_docs - 1);
|
||||
test_doc(num_docs - 2);
|
||||
test_doc(65000);
|
||||
}
|
||||
|
||||
fn open_column(reader: &ColumnarReader, name: &str) -> Column<u64> {
|
||||
let column = reader.read_columns(name).unwrap()[0]
|
||||
.open()
|
||||
.unwrap()
|
||||
.coerce_numerical(crate::NumericalType::U64)
|
||||
.unwrap();
|
||||
let DynamicColumn::U64(column) = column else {
|
||||
panic!();
|
||||
};
|
||||
column
|
||||
}
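The new `compat_tests` module pins the on-disk columnar format: `create_format` writes a fixture for `CURRENT_VERSION` once, and `test_format_v1` / `test_format_v2` re-open fixtures produced by older releases (including a merge pass) to make sure the current reader still understands them. Below is a hedged, format-agnostic sketch of the same pattern; the paths, the version constant and the validation step are illustrative, not tantivy's actual code.

```rust
// Generic sketch of the backward-compatibility pattern used by compat_tests.rs:
// keep one serialized fixture per format version under version control and
// make the test read every old fixture with the current code.
use std::path::PathBuf;

const CURRENT_VERSION: &str = "v2"; // illustrative

fn fixture_path(version: &str) -> PathBuf {
    PathBuf::from(format!("./compat_tests_data/{version}.bin"))
}

fn create_fixture(bytes: &[u8]) -> std::io::Result<()> {
    let path = fixture_path(CURRENT_VERSION);
    if path.exists() {
        return Ok(()); // never overwrite a frozen fixture
    }
    std::fs::write(path, bytes)
}

fn check_old_fixture(version: &str) -> std::io::Result<()> {
    let bytes = std::fs::read(fixture_path(version))?;
    // the real test opens the columnar here and checks every column
    assert!(!bytes.is_empty());
    Ok(())
}

fn main() -> std::io::Result<()> {
    create_fixture(b"serialized bytes for the current version")?;
    check_old_fixture(CURRENT_VERSION)
}
```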
|
||||
@@ -8,7 +8,7 @@ use common::{ByteCount, DateTime, HasLen, OwnedBytes};
|
||||
use crate::column::{BytesColumn, Column, StrColumn};
|
||||
use crate::column_values::{monotonic_map_column, StrictlyMonotonicFn};
|
||||
use crate::columnar::ColumnType;
|
||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType};
|
||||
use crate::{Cardinality, ColumnIndex, ColumnValues, NumericalType, Version};
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum DynamicColumn {
|
||||
@@ -232,6 +232,7 @@ static_dynamic_conversions!(Column<Ipv6Addr>, IpAddr);
|
||||
pub struct DynamicColumnHandle {
|
||||
pub(crate) file_slice: FileSlice,
|
||||
pub(crate) column_type: ColumnType,
|
||||
pub(crate) format_version: Version,
|
||||
}
|
||||
|
||||
impl DynamicColumnHandle {
|
||||
@@ -260,11 +261,15 @@ impl DynamicColumnHandle {
|
||||
let column_bytes = self.file_slice.read_bytes()?;
|
||||
match self.column_type {
|
||||
ColumnType::Str | ColumnType::Bytes => {
|
||||
let column: BytesColumn = crate::column::open_column_bytes(column_bytes)?;
|
||||
let column: BytesColumn =
|
||||
crate::column::open_column_bytes(column_bytes, self.format_version)?;
|
||||
Ok(Some(column.term_ord_column))
|
||||
}
|
||||
ColumnType::IpAddr => {
|
||||
let column = crate::column::open_column_u128_as_compact_u64(column_bytes)?;
|
||||
let column = crate::column::open_column_u128_as_compact_u64(
|
||||
column_bytes,
|
||||
self.format_version,
|
||||
)?;
|
||||
Ok(Some(column))
|
||||
}
|
||||
ColumnType::Bool
|
||||
@@ -272,7 +277,8 @@ impl DynamicColumnHandle {
|
||||
| ColumnType::U64
|
||||
| ColumnType::F64
|
||||
| ColumnType::DateTime => {
|
||||
let column = crate::column::open_column_u64::<u64>(column_bytes)?;
|
||||
let column =
|
||||
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?;
|
||||
Ok(Some(column))
|
||||
}
|
||||
}
|
||||
@@ -280,15 +286,31 @@ impl DynamicColumnHandle {
|
||||
|
||||
fn open_internal(&self, column_bytes: OwnedBytes) -> io::Result<DynamicColumn> {
|
||||
let dynamic_column: DynamicColumn = match self.column_type {
|
||||
ColumnType::Bytes => crate::column::open_column_bytes(column_bytes)?.into(),
|
||||
ColumnType::Str => crate::column::open_column_str(column_bytes)?.into(),
|
||||
ColumnType::I64 => crate::column::open_column_u64::<i64>(column_bytes)?.into(),
|
||||
ColumnType::U64 => crate::column::open_column_u64::<u64>(column_bytes)?.into(),
|
||||
ColumnType::F64 => crate::column::open_column_u64::<f64>(column_bytes)?.into(),
|
||||
ColumnType::Bool => crate::column::open_column_u64::<bool>(column_bytes)?.into(),
|
||||
ColumnType::IpAddr => crate::column::open_column_u128::<Ipv6Addr>(column_bytes)?.into(),
|
||||
ColumnType::Bytes => {
|
||||
crate::column::open_column_bytes(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::Str => {
|
||||
crate::column::open_column_str(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::I64 => {
|
||||
crate::column::open_column_u64::<i64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::U64 => {
|
||||
crate::column::open_column_u64::<u64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::F64 => {
|
||||
crate::column::open_column_u64::<f64>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::Bool => {
|
||||
crate::column::open_column_u64::<bool>(column_bytes, self.format_version)?.into()
|
||||
}
|
||||
ColumnType::IpAddr => {
|
||||
crate::column::open_column_u128::<Ipv6Addr>(column_bytes, self.format_version)?
|
||||
.into()
|
||||
}
|
||||
ColumnType::DateTime => {
|
||||
crate::column::open_column_u64::<DateTime>(column_bytes)?.into()
|
||||
crate::column::open_column_u64::<DateTime>(column_bytes, self.format_version)?
|
||||
.into()
|
||||
}
|
||||
};
|
||||
Ok(dynamic_column)
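The change above threads `self.format_version` into every `open_column_*` call, so a reader that found an older version in the file footer can still decode columns laid out the old way. The sketch below shows the general dispatch-on-version idea with a made-up two-variant `Version` enum and a fictional layout difference; it is not tantivy's actual decoding logic.

```rust
// Illustration only: branch the decoder on the format version discovered at
// open time. Both the enum and the byte layouts are hypothetical.
#[derive(Clone, Copy)]
enum Version {
    V1,
    V2,
}

fn open_column_u64_sketch(bytes: &[u8], version: Version) -> std::io::Result<Vec<u64>> {
    match version {
        // pretend V1 stored plain little-endian u64s
        Version::V1 => Ok(bytes
            .chunks_exact(8)
            .map(|c| u64::from_le_bytes(c.try_into().unwrap()))
            .collect()),
        // and V2 added a one-byte header before the same payload
        Version::V2 => open_column_u64_sketch(&bytes[1..], Version::V1),
    }
}
```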
|
||||
|
||||
@@ -1,10 +1,13 @@
use std::ops::Range;
use std::sync::Arc;

use crate::{ColumnValues, RowId};

pub trait Iterable<T = u64> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
}

impl<'a, T: Copy> Iterable<T> for &'a [T] {
impl<T: Copy> Iterable<T> for &[T] {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.iter().copied())
}
@@ -17,3 +20,9 @@ where Range<T>: Iterator<Item = T>
Box::new(self.clone())
}
}

impl Iterable for Arc<dyn crate::ColumnValues<RowId>> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
Box::new(self.iter().map(|row_id| row_id as u64))
}
}
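`Iterable` abstracts "something that can hand out a fresh boxed iterator any number of times", unlike `IntoIterator`, which consumes the value; the new impl above does this for `Arc<dyn ColumnValues<RowId>>`, widening row ids to `u64` on the fly. A standalone copy of the trait with a toy impl, for illustration only:

```rust
// Standalone sketch of the `Iterable` abstraction; the trait definition
// mirrors the one above, the `Vec<u32>` impl is just for demonstration.
trait Iterable<T = u64> {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
}

impl Iterable<u64> for Vec<u32> {
    fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
        Box::new(self.iter().map(|&v| v as u64))
    }
}

fn main() {
    let rows: Vec<u32> = vec![1, 2, 3];
    // the same value can be iterated more than once
    assert_eq!(rows.boxed_iter().sum::<u64>(), 6);
    assert_eq!(rows.boxed_iter().count(), 3);
}
```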
@@ -48,7 +48,7 @@ pub use column_values::{
};
pub use columnar::{
merge_columnar, ColumnType, ColumnarReader, ColumnarWriter, HasAssociatedColumnType,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder,
MergeRowOrder, ShuffleMergeOrder, StackMergeOrder, Version, CURRENT_VERSION,
};
use sstable::VoidSSTable;
pub use value::{NumericalType, NumericalValue};
@@ -131,3 +131,6 @@ impl Cardinality {

#[cfg(test)]
mod tests;

#[cfg(test)]
mod compat_tests;
@@ -21,7 +21,7 @@ fn test_dataframe_writer_str() {
|
||||
dataframe_writer.record_str(1u32, "my_string", "hello");
|
||||
dataframe_writer.record_str(3u32, "my_string", "helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
@@ -35,7 +35,7 @@ fn test_dataframe_writer_bytes() {
|
||||
dataframe_writer.record_bytes(1u32, "my_string", b"hello");
|
||||
dataframe_writer.record_bytes(3u32, "my_string", b"helloeee");
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("my_string").unwrap();
|
||||
@@ -49,7 +49,7 @@ fn test_dataframe_writer_bool() {
|
||||
dataframe_writer.record_bool(1u32, "bool.value", false);
|
||||
dataframe_writer.record_bool(3u32, "bool.value", true);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("bool.value").unwrap();
|
||||
@@ -74,12 +74,12 @@ fn test_dataframe_writer_u64_multivalued() {
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 2u64);
|
||||
dataframe_writer.record_numerical(6u32, "divisor", 3u64);
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(7, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(7, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("divisor").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
assert_eq!(cols[0].num_bytes(), 29);
|
||||
assert_eq!(cols[0].num_bytes(), 50);
|
||||
let dyn_i64_col = cols[0].open().unwrap();
|
||||
let DynamicColumn::I64(divisor_col) = dyn_i64_col else {
|
||||
panic!();
|
||||
@@ -97,7 +97,7 @@ fn test_dataframe_writer_ip_addr() {
|
||||
dataframe_writer.record_ip_addr(1, "ip_addr", Ipv6Addr::from_u128(1001));
|
||||
dataframe_writer.record_ip_addr(3, "ip_addr", Ipv6Addr::from_u128(1050));
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("ip_addr").unwrap();
|
||||
@@ -128,7 +128,7 @@ fn test_dataframe_writer_numerical() {
|
||||
dataframe_writer.record_numerical(2u32, "srical.value", NumericalValue::U64(13u64));
|
||||
dataframe_writer.record_numerical(4u32, "srical.value", NumericalValue::U64(15u64));
|
||||
let mut buffer: Vec<u8> = Vec::new();
|
||||
dataframe_writer.serialize(6, None, &mut buffer).unwrap();
|
||||
dataframe_writer.serialize(6, &mut buffer).unwrap();
|
||||
let columnar = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar.num_columns(), 1);
|
||||
let cols: Vec<DynamicColumnHandle> = columnar.read_columns("srical.value").unwrap();
|
||||
@@ -153,46 +153,6 @@ fn test_dataframe_writer_numerical() {
|
||||
assert_eq!(column_i64.first(6), None); //< we can change the spec for that one.
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_full() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(0u32, "value", NumericalValue::U64(1));
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
||||
let data = dataframe_writer.sort_order("value", 2, false);
|
||||
assert_eq!(data, vec![0, 1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_opt() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(3));
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(2));
|
||||
let data = dataframe_writer.sort_order("value", 5, false);
|
||||
// 0, 2, 4 is 0.0
|
||||
assert_eq!(data, vec![0, 2, 4, 3, 1]);
|
||||
let data = dataframe_writer.sort_order("value", 5, true);
|
||||
assert_eq!(
|
||||
data,
|
||||
vec![4, 2, 0, 3, 1].into_iter().rev().collect::<Vec<_>>()
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dataframe_sort_by_multi() {
|
||||
let mut dataframe_writer = ColumnarWriter::default();
|
||||
// valid for sort
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(2));
|
||||
// those are ignored for sort
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
||||
dataframe_writer.record_numerical(1u32, "value", NumericalValue::U64(4));
|
||||
// valid for sort
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(3));
|
||||
// ignored, would change sort order
|
||||
dataframe_writer.record_numerical(3u32, "value", NumericalValue::U64(1));
|
||||
let data = dataframe_writer.sort_order("value", 4, false);
|
||||
assert_eq!(data, vec![0, 2, 1, 3]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_dictionary_encoded_str() {
|
||||
let mut buffer = Vec::new();
|
||||
@@ -201,7 +161,7 @@ fn test_dictionary_encoded_str() {
|
||||
columnar_writer.record_str(3, "my.column", "c");
|
||||
columnar_writer.record_str(3, "my.column2", "different_column!");
|
||||
columnar_writer.record_str(4, "my.column", "b");
|
||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
@@ -235,7 +195,7 @@ fn test_dictionary_encoded_bytes() {
|
||||
columnar_writer.record_bytes(3, "my.column", b"c");
|
||||
columnar_writer.record_bytes(3, "my.column2", b"different_column!");
|
||||
columnar_writer.record_bytes(4, "my.column", b"b");
|
||||
columnar_writer.serialize(5, None, &mut buffer).unwrap();
|
||||
columnar_writer.serialize(5, &mut buffer).unwrap();
|
||||
let columnar_reader = ColumnarReader::open(buffer).unwrap();
|
||||
assert_eq!(columnar_reader.num_columns(), 2);
|
||||
let col_handles = columnar_reader.read_columns("my.column").unwrap();
|
||||
@@ -344,7 +304,7 @@ fn column_value_strategy() -> impl Strategy<Value = ColumnValue> {
|
||||
ip_addr_byte
|
||||
))),
|
||||
1 => any::<bool>().prop_map(ColumnValue::Bool),
|
||||
1 => (0_679_723_993i64..1_679_723_995i64)
|
||||
1 => (679_723_993i64..1_679_723_995i64)
|
||||
.prop_map(|val| { ColumnValue::DateTime(DateTime::from_timestamp_secs(val)) })
|
||||
]
|
||||
}
|
||||
@@ -369,26 +329,12 @@ fn columnar_docs_strategy() -> impl Strategy<Value = Vec<Vec<(&'static str, Colu
|
||||
.prop_flat_map(|num_docs| proptest::collection::vec(doc_strategy(), num_docs))
|
||||
}
|
||||
|
||||
fn columnar_docs_and_mapping_strategy(
|
||||
) -> impl Strategy<Value = (Vec<Vec<(&'static str, ColumnValue)>>, Vec<RowId>)> {
|
||||
columnar_docs_strategy().prop_flat_map(|docs| {
|
||||
permutation_strategy(docs.len()).prop_map(move |permutation| (docs.clone(), permutation))
|
||||
})
|
||||
}
|
||||
|
||||
fn permutation_strategy(n: usize) -> impl Strategy<Value = Vec<RowId>> {
|
||||
Just((0u32..n as RowId).collect()).prop_shuffle()
|
||||
}
|
||||
|
||||
fn permutation_and_subset_strategy(n: usize) -> impl Strategy<Value = Vec<usize>> {
|
||||
let vals: Vec<usize> = (0..n).collect();
|
||||
subsequence(vals, 0..=n).prop_shuffle()
|
||||
}
|
||||
|
||||
fn build_columnar_with_mapping(
|
||||
docs: &[Vec<(&'static str, ColumnValue)>],
|
||||
old_to_new_row_ids_opt: Option<&[RowId]>,
|
||||
) -> ColumnarReader {
|
||||
fn build_columnar_with_mapping(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||
let num_docs = docs.len() as u32;
|
||||
let mut buffer = Vec::new();
|
||||
let mut columnar_writer = ColumnarWriter::default();
|
||||
@@ -416,15 +362,13 @@ fn build_columnar_with_mapping(
|
||||
}
|
||||
}
|
||||
}
|
||||
columnar_writer
|
||||
.serialize(num_docs, old_to_new_row_ids_opt, &mut buffer)
|
||||
.unwrap();
|
||||
columnar_writer.serialize(num_docs, &mut buffer).unwrap();
|
||||
|
||||
ColumnarReader::open(buffer).unwrap()
|
||||
}
|
||||
|
||||
fn build_columnar(docs: &[Vec<(&'static str, ColumnValue)>]) -> ColumnarReader {
|
||||
build_columnar_with_mapping(docs, None)
|
||||
build_columnar_with_mapping(docs)
|
||||
}
|
||||
|
||||
fn assert_columnar_eq_strict(left: &ColumnarReader, right: &ColumnarReader) {
|
||||
@@ -436,7 +380,7 @@ fn assert_columnar_eq(
|
||||
right: &ColumnarReader,
|
||||
lenient_on_numerical_value: bool,
|
||||
) {
|
||||
assert_eq!(left.num_rows(), right.num_rows());
|
||||
assert_eq!(left.num_docs(), right.num_docs());
|
||||
let left_columns = left.list_columns().unwrap();
|
||||
let right_columns = right.list_columns().unwrap();
|
||||
assert_eq!(left_columns.len(), right_columns.len());
|
||||
@@ -448,6 +392,7 @@ fn assert_columnar_eq(
|
||||
}
|
||||
}
|
||||
|
||||
#[track_caller]
|
||||
fn assert_column_eq<T: Copy + PartialOrd + Debug + Send + Sync + 'static>(
|
||||
left: &Column<T>,
|
||||
right: &Column<T>,
|
||||
@@ -643,7 +588,7 @@ proptest! {
|
||||
#[test]
|
||||
fn test_single_columnar_builder_proptest(docs in columnar_docs_strategy()) {
|
||||
let columnar = build_columnar(&docs[..]);
|
||||
assert_eq!(columnar.num_rows() as usize, docs.len());
|
||||
assert_eq!(columnar.num_docs() as usize, docs.len());
|
||||
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
|
||||
for (doc_id, doc_vals) in docs.iter().enumerate() {
|
||||
for (col_name, col_val) in doc_vals {
|
||||
@@ -683,54 +628,6 @@ proptest! {
|
||||
}
|
||||
}
|
||||
|
||||
// Same as `test_single_columnar_builder_proptest` but with a shuffling mapping.
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(500))]
|
||||
#[test]
|
||||
fn test_single_columnar_builder_with_shuffle_proptest((docs, mapping) in columnar_docs_and_mapping_strategy()) {
|
||||
let columnar = build_columnar_with_mapping(&docs[..], Some(&mapping));
|
||||
assert_eq!(columnar.num_rows() as usize, docs.len());
|
||||
let mut expected_columns: HashMap<(&str, ColumnTypeCategory), HashMap<u32, Vec<&ColumnValue>> > = Default::default();
|
||||
for (doc_id, doc_vals) in docs.iter().enumerate() {
|
||||
for (col_name, col_val) in doc_vals {
|
||||
expected_columns
|
||||
.entry((col_name, col_val.column_type_category()))
|
||||
.or_default()
|
||||
.entry(mapping[doc_id])
|
||||
.or_default()
|
||||
.push(col_val);
|
||||
}
|
||||
}
|
||||
let column_list = columnar.list_columns().unwrap();
|
||||
assert_eq!(expected_columns.len(), column_list.len());
|
||||
for (column_name, column) in column_list {
|
||||
let dynamic_column = column.open().unwrap();
|
||||
let col_category: ColumnTypeCategory = dynamic_column.column_type().into();
|
||||
let expected_col_values: &HashMap<u32, Vec<&ColumnValue>> = expected_columns.get(&(column_name.as_str(), col_category)).unwrap();
|
||||
for _doc_id in 0..columnar.num_rows() {
|
||||
match &dynamic_column {
|
||||
DynamicColumn::Bool(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::I64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::U64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::F64(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::IpAddr(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::DateTime(col) =>
|
||||
assert_column_values(col, expected_col_values),
|
||||
DynamicColumn::Bytes(col) =>
|
||||
assert_bytes_column_values(col, expected_col_values, false),
|
||||
DynamicColumn::Str(col) =>
|
||||
assert_bytes_column_values(col, expected_col_values, true),
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// This tests create 2 or 3 random small columnar and attempts to merge them.
|
||||
// It compares the resulting merged dataframe with what would have been obtained by building the
|
||||
// dataframe from the concatenated rows to begin with.
|
||||
@@ -818,6 +715,7 @@ fn test_columnar_merging_number_columns() {
|
||||
// TODO test required_columns
|
||||
// TODO document edge case: required_columns incompatible with values.
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
fn columnar_docs_and_remap(
|
||||
) -> impl Strategy<Value = (Vec<Vec<Vec<(&'static str, ColumnValue)>>>, Vec<RowAddr>)> {
|
||||
proptest::collection::vec(columnar_docs_strategy(), 2..=3).prop_flat_map(
|
||||
@@ -844,24 +742,68 @@ fn columnar_docs_and_remap(
|
||||
proptest! {
|
||||
#![proptest_config(ProptestConfig::with_cases(1000))]
|
||||
#[test]
|
||||
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in columnar_docs_and_remap()) {
|
||||
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order.iter()
|
||||
.map(|row_addr| columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone())
|
||||
.collect();
|
||||
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
||||
let columnar_readers: Vec<ColumnarReader> = columnar_docs.iter()
|
||||
.map(|docs| build_columnar(&docs[..]))
|
||||
.collect::<Vec<_>>();
|
||||
let columnar_readers_arr: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let segment_num_rows: Vec<RowId> = columnar_docs.iter().map(|docs| docs.len() as RowId).collect();
|
||||
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
||||
crate::merge_columnar(&columnar_readers_arr[..], &[], shuffle_merge_order.into(), &mut output).unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
||||
fn test_columnar_merge_and_remap_proptest((columnar_docs, shuffle_merge_order) in
|
||||
columnar_docs_and_remap()) {
|
||||
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
|
||||
}
|
||||
}
|
||||
|
||||
fn test_columnar_merge_and_remap(
|
||||
columnar_docs: Vec<Vec<Vec<(&'static str, ColumnValue)>>>,
|
||||
shuffle_merge_order: Vec<RowAddr>,
|
||||
) {
|
||||
let shuffled_rows: Vec<Vec<(&'static str, ColumnValue)>> = shuffle_merge_order
|
||||
.iter()
|
||||
.map(|row_addr| {
|
||||
columnar_docs[row_addr.segment_ord as usize][row_addr.row_id as usize].clone()
|
||||
})
|
||||
.collect();
|
||||
let expected_merged_columnar = build_columnar(&shuffled_rows[..]);
|
||||
let columnar_readers: Vec<ColumnarReader> = columnar_docs
|
||||
.iter()
|
||||
.map(|docs| build_columnar(&docs[..]))
|
||||
.collect::<Vec<_>>();
|
||||
let columnar_readers_ref: Vec<&ColumnarReader> = columnar_readers.iter().collect();
|
||||
let mut output: Vec<u8> = Vec::new();
|
||||
let segment_num_rows: Vec<RowId> = columnar_docs
|
||||
.iter()
|
||||
.map(|docs| docs.len() as RowId)
|
||||
.collect();
|
||||
let shuffle_merge_order = ShuffleMergeOrder::for_test(&segment_num_rows, shuffle_merge_order);
|
||||
crate::merge_columnar(
|
||||
&columnar_readers_ref[..],
|
||||
&[],
|
||||
shuffle_merge_order.into(),
|
||||
&mut output,
|
||||
)
|
||||
.unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_columnar_eq(&merged_columnar, &expected_merged_columnar, true);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_columnar_merge_and_remap_bug_1() {
|
||||
let columnar_docs = vec![vec![
|
||||
vec![
|
||||
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
|
||||
("c1", ColumnValue::Numerical(NumericalValue::U64(0))),
|
||||
],
|
||||
vec![],
|
||||
]];
|
||||
let shuffle_merge_order: Vec<RowAddr> = vec![
|
||||
RowAddr {
|
||||
segment_ord: 0,
|
||||
row_id: 1,
|
||||
},
|
||||
RowAddr {
|
||||
segment_ord: 0,
|
||||
row_id: 0,
|
||||
},
|
||||
];
|
||||
|
||||
test_columnar_merge_and_remap(columnar_docs, shuffle_merge_order);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_columnar_merge_empty() {
|
||||
let columnar_reader_1 = build_columnar(&[]);
|
||||
@@ -878,7 +820,7 @@ fn test_columnar_merge_empty() {
|
||||
)
|
||||
.unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_eq!(merged_columnar.num_rows(), 0);
|
||||
assert_eq!(merged_columnar.num_docs(), 0);
|
||||
assert_eq!(merged_columnar.num_columns(), 0);
|
||||
}
|
||||
|
||||
@@ -904,7 +846,7 @@ fn test_columnar_merge_single_str_column() {
|
||||
)
|
||||
.unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_eq!(merged_columnar.num_rows(), 1);
|
||||
assert_eq!(merged_columnar.num_docs(), 1);
|
||||
assert_eq!(merged_columnar.num_columns(), 1);
|
||||
}
|
||||
|
||||
@@ -936,7 +878,7 @@ fn test_delete_decrease_cardinality() {
|
||||
)
|
||||
.unwrap();
|
||||
let merged_columnar = ColumnarReader::open(output).unwrap();
|
||||
assert_eq!(merged_columnar.num_rows(), 1);
|
||||
assert_eq!(merged_columnar.num_docs(), 1);
|
||||
assert_eq!(merged_columnar.num_columns(), 1);
|
||||
let cols = merged_columnar.read_columns("c").unwrap();
|
||||
assert_eq!(cols.len(), 1);
|
||||
|
||||
@@ -17,6 +31,31 @@ impl NumericalValue {
NumericalValue::F64(_) => NumericalType::F64,
}
}

/// Tries to normalize the numerical value in the following priorities:
/// i64, u64, f64
pub fn normalize(self) -> Self {
match self {
NumericalValue::U64(val) => {
if val <= i64::MAX as u64 {
NumericalValue::I64(val as i64)
} else {
NumericalValue::F64(val as f64)
}
}
NumericalValue::I64(val) => NumericalValue::I64(val),
NumericalValue::F64(val) => {
let fract = val.fract();
if fract == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 {
NumericalValue::I64(val as i64)
} else if fract == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 {
NumericalValue::U64(val as u64)
} else {
NumericalValue::F64(val)
}
}
}
}
}
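In short, `normalize` folds a value into the narrowest convenient representation: i64 first, then u64, then f64. The self-contained illustration below uses a local copy of the enum and mirrors those rules (the u64 fallback for huge integral floats is omitted for brevity); it is a sketch, not the library's type.

```rust
// Illustration of the normalization priorities (i64, then u64, then f64).
#[derive(Debug, PartialEq)]
enum Num {
    I64(i64),
    U64(u64),
    F64(f64),
}

fn normalize(v: Num) -> Num {
    match v {
        Num::U64(val) if val <= i64::MAX as u64 => Num::I64(val as i64),
        Num::U64(val) => Num::F64(val as f64),
        Num::I64(val) => Num::I64(val),
        // integral f64 values in i64 range collapse to I64
        Num::F64(val) if val.fract() == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 => {
            Num::I64(val as i64)
        }
        Num::F64(val) => Num::F64(val),
    }
}

fn main() {
    assert_eq!(normalize(Num::U64(5)), Num::I64(5));
    assert_eq!(normalize(Num::F64(3.0)), Num::I64(3));
    assert_eq!(normalize(Num::F64(3.5)), Num::F64(3.5));
}
```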
impl From<u64> for NumericalValue {
@@ -1,6 +1,6 @@
[package]
name = "tantivy-common"
version = "0.7.0"
version = "0.9.0"
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
license = "MIT"
edition = "2021"
@@ -9,16 +9,17 @@ documentation = "https://docs.rs/tantivy_common/"
homepage = "https://github.com/quickwit-oss/tantivy"
repository = "https://github.com/quickwit-oss/tantivy"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
byteorder = "1.4.3"
ownedbytes = { version= "0.7", path="../ownedbytes" }
ownedbytes = { version= "0.9", path="../ownedbytes" }
async-trait = "0.1"
time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

[dev-dependencies]
binggan = "0.14.0"
proptest = "1.0.0"
rand = "0.8.4"
@@ -1,39 +1,64 @@
|
||||
#![feature(test)]
|
||||
use binggan::{black_box, BenchRunner};
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
|
||||
|
||||
extern crate test;
|
||||
fn bench_vint() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use rand::seq::IteratorRandom;
|
||||
use rand::thread_rng;
|
||||
use tantivy_common::serialize_vint_u32;
|
||||
use test::Bencher;
|
||||
let vals: Vec<u32> = (0..20_000).collect();
|
||||
runner.bench_function("bench_vint", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
|
||||
#[bench]
|
||||
fn bench_vint(b: &mut Bencher) {
|
||||
let vals: Vec<u32> = (0..20_000).collect();
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_vint_rand(b: &mut Bencher) {
|
||||
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
||||
b.iter(|| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
out
|
||||
});
|
||||
}
|
||||
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
|
||||
runner.bench_function("bench_vint_rand", move |_| {
|
||||
let mut out = 0u64;
|
||||
for val in vals.iter().cloned() {
|
||||
let mut buf = [0u8; 8];
|
||||
serialize_vint_u32(val, &mut buf);
|
||||
out += u64::from(buf[0]);
|
||||
}
|
||||
black_box(out);
|
||||
});
|
||||
}
|
||||
|
||||
fn bench_bitset() {
|
||||
let mut runner = BenchRunner::new();
|
||||
|
||||
runner.bench_function("bench_tinyset_pop", move |_| {
|
||||
let mut tinyset = TinySet::singleton(black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
black_box(tinyset);
|
||||
});
|
||||
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
runner.bench_function("bench_tinyset_sum", move |_| {
|
||||
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
runner.bench_function("bench_tinyarr_sum", move |_| {
|
||||
black_box(v.iter().cloned().sum::<u32>());
|
||||
});
|
||||
|
||||
runner.bench_function("bench_bitset_initialize", move |_| {
|
||||
black_box(BitSet::with_max_value(1_000_000));
|
||||
});
|
||||
}
|
||||
|
||||
fn main() {
|
||||
bench_vint();
|
||||
bench_bitset();
|
||||
}
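The vint benchmarks above move from the nightly-only `#[bench]` harness to binggan, which runs as a plain `fn main()` binary driving a `BenchRunner`. The sketch below shows the minimal shape of such a binary, using only the calls that appear in the diff (`BenchRunner::new`, `bench_function`, `black_box`); the Cargo wiring (a `[[bench]]` target with `harness = false`) is assumed and not shown.

```rust
// Minimal binggan benchmark binary, assuming the crate is added as a
// dev-dependency and the bench target disables the default harness.
use binggan::{black_box, BenchRunner};

fn main() {
    let mut runner = BenchRunner::new();
    let vals: Vec<u32> = (0..10_000).collect();
    runner.bench_function("sum_u32", move |_| {
        // black_box keeps the optimizer from discarding the work
        black_box(vals.iter().copied().sum::<u32>());
    });
}
```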
|
||||
|
||||
@@ -696,43 +696,3 @@ mod tests {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(all(test, feature = "unstable"))]
|
||||
mod bench {
|
||||
|
||||
use test;
|
||||
|
||||
use super::{BitSet, TinySet};
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_pop(b: &mut test::Bencher) {
|
||||
b.iter(|| {
|
||||
let mut tinyset = TinySet::singleton(test::black_box(31u32));
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
tinyset.pop_lowest();
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyset_sum(b: &mut test::Bencher) {
|
||||
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
|
||||
b.iter(|| {
|
||||
assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
|
||||
});
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_tinyarr_sum(b: &mut test::Bencher) {
|
||||
let v = [10u32, 14u32, 21u32];
|
||||
b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
|
||||
}
|
||||
|
||||
#[bench]
|
||||
fn bench_bitset_initialize(b: &mut test::Bencher) {
|
||||
b.iter(|| BitSet::with_max_value(1_000_000));
|
||||
}
|
||||
}
|
||||
|
||||
common/src/bounds.rs (new file, 130 lines)
@@ -0,0 +1,130 @@
|
||||
use std::io;
|
||||
use std::ops::Bound;
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
pub struct BoundsRange<T> {
|
||||
pub lower_bound: Bound<T>,
|
||||
pub upper_bound: Bound<T>,
|
||||
}
|
||||
impl<T> BoundsRange<T> {
|
||||
pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
|
||||
BoundsRange {
|
||||
lower_bound,
|
||||
upper_bound,
|
||||
}
|
||||
}
|
||||
pub fn is_unbounded(&self) -> bool {
|
||||
matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
|
||||
}
|
||||
pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
|
||||
BoundsRange {
|
||||
lower_bound: map_bound(&self.lower_bound, &transform),
|
||||
upper_bound: map_bound(&self.upper_bound, &transform),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn map_bound_res<TTo, Err>(
|
||||
&self,
|
||||
transform: impl Fn(&T) -> Result<TTo, Err>,
|
||||
) -> Result<BoundsRange<TTo>, Err> {
|
||||
Ok(BoundsRange {
|
||||
lower_bound: map_bound_res(&self.lower_bound, &transform)?,
|
||||
upper_bound: map_bound_res(&self.upper_bound, &transform)?,
|
||||
})
|
||||
}
|
||||
|
||||
pub fn transform_inner<TTo>(
|
||||
&self,
|
||||
transform_lower: impl Fn(&T) -> TransformBound<TTo>,
|
||||
transform_upper: impl Fn(&T) -> TransformBound<TTo>,
|
||||
) -> BoundsRange<TTo> {
|
||||
BoundsRange {
|
||||
lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
|
||||
upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the first set inner value
|
||||
pub fn get_inner(&self) -> Option<&T> {
|
||||
inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
|
||||
}
|
||||
}
|
||||
|
||||
pub enum TransformBound<T> {
|
||||
/// Overwrite the bounds
|
||||
NewBound(Bound<T>),
|
||||
/// Use Existing bounds with new value
|
||||
Existing(T),
|
||||
}
|
||||
|
||||
/// Takes a bound and transforms the inner value into a new bound via a closure.
|
||||
/// The bound variant may change by the value returned value from the closure.
|
||||
pub fn transform_bound_inner_res<TFrom, TTo>(
|
||||
bound: &Bound<TFrom>,
|
||||
transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
|
||||
) -> io::Result<Bound<TTo>> {
|
||||
use self::Bound::*;
|
||||
Ok(match bound {
|
||||
Excluded(ref from_val) => match transform(from_val)? {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||
},
|
||||
Included(ref from_val) => match transform(from_val)? {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Included(new_val),
|
||||
},
|
||||
Unbounded => Unbounded,
|
||||
})
|
||||
}
|
||||
|
||||
/// Takes a bound and transforms the inner value into a new bound via a closure.
|
||||
/// The bound variant may change by the value returned value from the closure.
|
||||
pub fn transform_bound_inner<TFrom, TTo>(
|
||||
bound: &Bound<TFrom>,
|
||||
transform: impl Fn(&TFrom) -> TransformBound<TTo>,
|
||||
) -> Bound<TTo> {
|
||||
use self::Bound::*;
|
||||
match bound {
|
||||
Excluded(ref from_val) => match transform(from_val) {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||
},
|
||||
Included(ref from_val) => match transform(from_val) {
|
||||
TransformBound::NewBound(new_val) => new_val,
|
||||
TransformBound::Existing(new_val) => Included(new_val),
|
||||
},
|
||||
Unbounded => Unbounded,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the inner value of a `Bound`
|
||||
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
|
||||
match val {
|
||||
Bound::Included(term) | Bound::Excluded(term) => Some(term),
|
||||
Bound::Unbounded => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn map_bound<TFrom, TTo>(
|
||||
bound: &Bound<TFrom>,
|
||||
transform: impl Fn(&TFrom) -> TTo,
|
||||
) -> Bound<TTo> {
|
||||
use self::Bound::*;
|
||||
match bound {
|
||||
Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
|
||||
Included(ref from_val) => Bound::Included(transform(from_val)),
|
||||
Unbounded => Unbounded,
|
||||
}
|
||||
}
|
||||
|
||||
pub fn map_bound_res<TFrom, TTo, Err>(
|
||||
bound: &Bound<TFrom>,
|
||||
transform: impl Fn(&TFrom) -> Result<TTo, Err>,
|
||||
) -> Result<Bound<TTo>, Err> {
|
||||
use self::Bound::*;
|
||||
Ok(match bound {
|
||||
Excluded(ref from_val) => Excluded(transform(from_val)?),
|
||||
Included(ref from_val) => Included(transform(from_val)?),
|
||||
Unbounded => Unbounded,
|
||||
})
|
||||
}
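`BoundsRange` simply pairs a lower and an upper `std::ops::Bound` and offers helpers to map or transform the inner values while keeping the Included/Excluded/Unbounded shape. A trimmed, self-contained usage sketch (not the exact type above) follows:

```rust
// Trimmed copy of the BoundsRange idea, for illustration only.
use std::ops::Bound;

struct BoundsRange<T> {
    lower_bound: Bound<T>,
    upper_bound: Bound<T>,
}

impl<T> BoundsRange<T> {
    fn map_bound<U>(&self, f: impl Fn(&T) -> U) -> BoundsRange<U> {
        let map = |b: &Bound<T>| match b {
            Bound::Included(v) => Bound::Included(f(v)),
            Bound::Excluded(v) => Bound::Excluded(f(v)),
            Bound::Unbounded => Bound::Unbounded,
        };
        BoundsRange {
            lower_bound: map(&self.lower_bound),
            upper_bound: map(&self.upper_bound),
        }
    }
}

fn main() {
    // A term range on strings, mapped to a range over their lengths.
    let range = BoundsRange {
        lower_bound: Bound::Included("apple"),
        upper_bound: Bound::Unbounded,
    };
    let len_range = range.map_bound(|s| s.len());
    assert!(matches!(len_range.lower_bound, Bound::Included(5)));
    assert!(matches!(len_range.upper_bound, Bound::Unbounded));
}
```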
|
||||
@@ -1,5 +1,6 @@
|
||||
use std::fs::File;
|
||||
use std::ops::{Deref, Range, RangeBounds};
|
||||
use std::path::Path;
|
||||
use std::sync::Arc;
|
||||
use std::{fmt, io};
|
||||
|
||||
@@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
|
||||
}
|
||||
|
||||
impl FileSlice {
|
||||
/// Creates a FileSlice from a path.
|
||||
pub fn open(path: &Path) -> io::Result<FileSlice> {
|
||||
let wrap_file = WrapFile::new(File::open(path)?)?;
|
||||
Ok(FileSlice::new(Arc::new(wrap_file)))
|
||||
}
|
||||
|
||||
/// Wraps a FileHandle.
|
||||
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
|
||||
let num_bytes = file_handle.len();
|
||||
|
||||
@@ -5,6 +5,7 @@ use std::ops::Deref;
|
||||
pub use byteorder::LittleEndian as Endianness;
|
||||
|
||||
mod bitset;
|
||||
pub mod bounds;
|
||||
mod byte_count;
|
||||
mod datetime;
|
||||
pub mod file_slice;
|
||||
@@ -129,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod test {
|
||||
pub(crate) mod test {
|
||||
|
||||
use proptest::prelude::*;
|
||||
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
|
||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||
|
||||
fn test_i64_converter_helper(val: i64) {
|
||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||
@@ -143,12 +144,6 @@ pub mod test {
|
||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||
}
|
||||
|
||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
||||
let mut buffer = Vec::new();
|
||||
O::default().serialize(&mut buffer).unwrap();
|
||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
||||
}
|
||||
|
||||
proptest! {
|
||||
#[test]
|
||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||
|
||||
@@ -74,14 +74,14 @@ impl FixedSize for () {
|
||||
|
||||
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
VInt(self.len() as u64).serialize(writer)?;
|
||||
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
|
||||
for it in self {
|
||||
it.serialize(writer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
|
||||
let num_items = VInt::deserialize(reader)?.val();
|
||||
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
|
||||
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
|
||||
for _ in 0..num_items {
|
||||
let item = T::deserialize(reader)?;
|
||||
@@ -236,12 +236,12 @@ impl FixedSize for bool {
|
||||
impl BinarySerializable for String {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
VInt(data.len() as u64).serialize(writer)?;
|
||||
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
|
||||
writer.write_all(data)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
||||
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
reader
|
||||
.take(string_length as u64)
|
||||
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
|
||||
impl<'a> BinarySerializable for Cow<'a, str> {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
let data: &[u8] = self.as_bytes();
|
||||
VInt(data.len() as u64).serialize(writer)?;
|
||||
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
|
||||
writer.write_all(data)
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
|
||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
||||
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
|
||||
let mut result = String::with_capacity(string_length);
|
||||
reader
|
||||
.take(string_length as u64)
|
||||
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
|
||||
|
||||
impl<'a> BinarySerializable for Cow<'a, [u8]> {
|
||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||
VInt(self.len() as u64).serialize(writer)?;
|
||||
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
|
||||
for it in self.iter() {
|
||||
it.serialize(writer)?;
|
||||
BinarySerializable::serialize(it, writer)?;
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
|
||||
let num_items = VInt::deserialize(reader)?.val();
|
||||
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
|
||||
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
|
||||
for _ in 0..num_items {
|
||||
let item = u8::deserialize(reader)?;
|
||||
let item = <u8 as BinarySerializable>::deserialize(reader)?;
|
||||
items.push(item);
|
||||
}
|
||||
Ok(Cow::Owned(items))
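The rewrites above replace plain method calls like `VInt(..).serialize(writer)` with fully qualified `BinarySerializable::serialize(&VInt(..), writer)`, presumably to stay unambiguous when another trait with a `serialize` method (such as serde's) is in scope. The toy example below, unrelated to tantivy's real traits, shows why the qualified form resolves such ambiguity.

```rust
// Two unrelated traits that both define `serialize`: plain method syntax on
// VInt would be ambiguous, the fully qualified form picks one explicitly.
trait BinSer {
    fn serialize(&self) -> Vec<u8>;
}
trait TextSer {
    fn serialize(&self) -> String;
}

struct VInt(u64);

impl BinSer for VInt {
    fn serialize(&self) -> Vec<u8> {
        self.0.to_le_bytes().to_vec()
    }
}
impl TextSer for VInt {
    fn serialize(&self) -> String {
        self.0.to_string()
    }
}

fn main() {
    let v = VInt(42);
    // `v.serialize()` would be rejected as ambiguous here; qualify instead:
    let bytes = <VInt as BinSer>::serialize(&v);
    let text = <VInt as TextSer>::serialize(&v);
    assert_eq!(bytes.len(), 8);
    assert_eq!(text, "42");
}
```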
|
||||
|
||||
@@ -87,7 +87,7 @@ impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
||||
impl TerminatingWrite for &mut Vec<u8> {
|
||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||
self.flush()
|
||||
}
|
||||
|
||||
doc/assets/images/paradedb.png (new binary file, 30 KiB, not shown)
@@ -2,7 +2,7 @@
|
||||
|
||||
> Tantivy is a **search** engine **library** for Rust.
|
||||
|
||||
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
|
||||
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
|
||||
they both have the same scope and targeted use cases.
|
||||
|
||||
If you are not familiar with Lucene, let's break down our little tagline.
|
||||
@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
|
||||
experience. But keep in mind this is just a toolbox.
|
||||
Which bring us to the second keyword...
|
||||
|
||||
- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
|
||||
- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.
|
||||
|
||||
Sometimes a functionality will not be available in tantivy because it is too
|
||||
specific to your use case. By design, tantivy should make it possible to extend
|
||||
@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
|
||||
index from a different format.
|
||||
|
||||
Tantivy exposes a lot of low level API to do all of these things.
|
||||
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
|
||||
While this design has some downsides, this greatly simplifies the source code of
|
||||
tantivy. Caching is also entirely delegated to the OS.
|
||||
|
||||
`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
||||
Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
||||
|
||||
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
|
||||
|
||||
|
||||
@@ -7,6 +7,11 @@
|
||||
- [Other](#other)
|
||||
- [Usage](#usage)
|
||||
|
||||
# Index Sorting has been removed!
|
||||
More infos here:
|
||||
|
||||
https://github.com/quickwit-oss/tantivy/issues/2352
|
||||
|
||||
# Index Sorting
|
||||
|
||||
Tantivy allows you to sort the index according to a property.
|
||||
@@ -26,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
|
||||
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
|
||||
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
|
||||
|
||||
Note: Tantivy 0.16 does not do this optimization yet.
|
||||
Note: tantivy 0.16 does not do this optimization yet.
|
||||
|
||||
### Pruning
|
||||
|
||||
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
|
||||
|
||||
Note: Tantivy 0.16 does not do this optimization yet.
|
||||
Note: tantivy 0.16 does not do this optimization yet.
|
||||
|
||||
### Other?
|
||||
|
||||
@@ -40,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i
|
||||
|
||||
## Usage
|
||||
|
||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
|
||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.
|
||||
|
||||
```rust
|
||||
let settings = IndexSettings {
|
||||
|
||||
@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
|
||||
- `value`: The value representation is just the regular Value representation.
|
||||
|
||||
This representation is designed to align the natural sort of Terms with the lexicographical sort
|
||||
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
|
||||
of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
|
||||
|
||||
In the example above, the terms will be sorted as
|
||||
|
||||
|
||||
@@ -1,3 +1,5 @@
use std::ops::Bound;

// # Searching a range on an indexed int field.
//
// Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
use tantivy::collector::Count;
use tantivy::query::RangeQuery;
use tantivy::schema::{Schema, INDEXED};
use tantivy::{doc, Index, IndexWriter, Result};
use tantivy::{doc, Index, IndexWriter, Result, Term};

fn main() -> Result<()> {
// For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
reader.reload()?;
let searcher = reader.searcher();
// The end is excluded i.e. here we are searching up to 1969
let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
let docs_in_the_sixties = RangeQuery::new(
Bound::Included(Term::from_field_u64(year_field, 1960)),
Bound::Excluded(Term::from_field_u64(year_field, 1970)),
);
// Uses a Count collector to sum the total number of docs in the range
let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
assert_eq!(num_60s_books, 10);
|
||||
|
||||
@@ -28,7 +28,7 @@ fn main() -> tantivy::Result<()> {
|
||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
|
||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
|
||||
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
|
||||
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
||||
index_writer.add_document(doc!(title => "The modern Prometheus"))?;
|
||||
index_writer.commit()?;
|
||||
|
||||
let reader = index.reader()?;
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
[package]
|
||||
authors = ["Paul Masurel <paul@quickwit.io>", "Pascal Seitz <pascal@quickwit.io>"]
|
||||
name = "ownedbytes"
|
||||
version = "0.7.0"
|
||||
version = "0.9.0"
|
||||
edition = "2021"
|
||||
description = "Expose data as static slice"
|
||||
license = "MIT"
|
||||
|
||||
@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
|
||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||
// We truncate the bytes in order to make sure the debug string
|
||||
// is not too long.
|
||||
let bytes_truncated: &[u8] = if self.len() > 8 {
|
||||
let bytes_truncated: &[u8] = if self.len() > 10 {
|
||||
&self.as_slice()[..10]
|
||||
} else {
|
||||
self.as_slice()
|
||||
@@ -252,6 +252,11 @@ mod tests {
|
||||
format!("{short_bytes:?}"),
|
||||
"OwnedBytes([97, 98, 99, 100], len=4)"
|
||||
);
|
||||
let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||
assert_eq!(
|
||||
format!("{medium_bytes:?}"),
|
||||
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
|
||||
);
|
||||
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
|
||||
assert_eq!(
|
||||
format!("{long_bytes:?}"),
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
[package]
|
||||
name = "tantivy-query-grammar"
|
||||
version = "0.22.0"
|
||||
version = "0.24.0"
|
||||
authors = ["Paul Masurel <paul.masurel@gmail.com>"]
|
||||
license = "MIT"
|
||||
categories = ["database-implementations", "data-structures"]
|
||||
@@ -13,3 +13,5 @@ edition = "2021"
|
||||
|
||||
[dependencies]
|
||||
nom = "7"
|
||||
serde = { version = "1.0.219", features = ["derive"] }
|
||||
serde_json = "1.0.140"
|
||||
|
||||
@@ -3,6 +3,7 @@
|
||||
use std::convert::Infallible;
|
||||
|
||||
use nom::{AsChar, IResult, InputLength, InputTakeAtPosition};
|
||||
use serde::Serialize;
|
||||
|
||||
pub(crate) type ErrorList = Vec<LenientErrorInternal>;
|
||||
pub(crate) type JResult<I, O> = IResult<I, (O, ErrorList), Infallible>;
|
||||
@@ -15,7 +16,8 @@ pub(crate) struct LenientErrorInternal {
|
||||
}
|
||||
|
||||
/// A recoverable error and the position it happened at
|
||||
#[derive(Debug, PartialEq)]
|
||||
#[derive(Debug, PartialEq, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct LenientError {
|
||||
pub pos: usize,
|
||||
pub message: String,
|
||||
@@ -109,6 +111,8 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
|
||||
move |input: I| match f.parse(input) {
|
||||
Ok((input, (output, _err))) => Ok((input, output)),
|
||||
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
|
||||
// old versions don't understand this is uninhabited and need the empty match to help,
|
||||
// newer versions warn because this arm is unreachable (which it is indeed).
|
||||
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
|
||||
}
|
||||
}
|
||||
@@ -351,3 +355,21 @@ where
|
||||
{
|
||||
move |i: I| l.choice(i.clone()).unwrap_or_else(|| default.parse(i))
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_lenient_error_serialization() {
|
||||
let error = LenientError {
|
||||
pos: 42,
|
||||
message: "test error message".to_string(),
|
||||
};
|
||||
|
||||
assert_eq!(
|
||||
serde_json::to_string(&error).unwrap(),
|
||||
"{\"pos\":42,\"message\":\"test error message\"}"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
#![allow(clippy::derive_partial_eq_without_eq)]
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
mod infallible;
|
||||
mod occur;
|
||||
mod query_grammar;
|
||||
@@ -12,6 +14,8 @@ pub use crate::user_input_ast::{
|
||||
Delimiter, UserInputAst, UserInputBound, UserInputLeaf, UserInputLiteral,
|
||||
};
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct Error;
|
||||
|
||||
/// Parse a query
|
||||
@@ -24,3 +28,31 @@ pub fn parse_query(query: &str) -> Result<UserInputAst, Error> {
|
||||
pub fn parse_query_lenient(query: &str) -> (UserInputAst, Vec<LenientError>) {
|
||||
parse_to_ast_lenient(query)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::{parse_query, parse_query_lenient};
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_serialization() {
|
||||
let ast = parse_query("title:hello OR title:x").unwrap();
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"bool","clauses":[["should",{"type":"literal","field_name":"title","phrase":"hello","delimiter":"none","slop":0,"prefix":false}],["should",{"type":"literal","field_name":"title","phrase":"x","delimiter":"none","slop":0,"prefix":false}]]}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_wrong_query() {
|
||||
assert!(parse_query("title:").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_lenient_wrong_query() {
|
||||
let (_, errors) = parse_query_lenient("title:");
|
||||
assert!(errors.len() == 1);
|
||||
let json = serde_json::to_string(&errors).unwrap();
|
||||
assert_eq!(json, r#"[{"pos":6,"message":"expected word"}]"#);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,17 +1,20 @@
|
||||
use std::fmt;
|
||||
use std::fmt::Write;
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
/// Defines whether a term in a query must be present,
|
||||
/// should be present or must not be present.
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum Occur {
|
||||
/// For a given document to be considered for scoring,
|
||||
/// at least one of the terms with the Should or the Must
|
||||
/// at least one of the queries with the Should or the Must
|
||||
/// Occur constraint must be within the document.
|
||||
Should,
|
||||
/// Document without the term are excluded from the search.
|
||||
/// Document without the queries are excluded from the search.
|
||||
Must,
|
||||
/// Document that contain the term are excluded from the
|
||||
/// Document that contain the query are excluded from the
|
||||
/// search.
|
||||
MustNot,
|
||||
}
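As a reminder of how these three occur modes surface to users, the query grammar maps a leading `+` to `Must`, a leading `-` to `MustNot`, and a bare clause to `Should`. A minimal sketch using this crate's public parser (the query string is illustrative):

```rust
use tantivy_query_grammar::parse_query;

fn main() {
    // `+apples` becomes a Must clause, `-bananas` a MustNot clause,
    // and `cherries` a Should clause in the resulting UserInputAst.
    let ast = parse_query("+apples -bananas cherries").unwrap();
    println!("{ast:?}");
}
```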
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
use std::borrow::Cow;
|
||||
use std::iter::once;
|
||||
|
||||
use nom::branch::alt;
|
||||
@@ -19,7 +20,7 @@ use crate::Occur;
|
||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||
// special characters.
|
||||
const SPECIAL_CHARS: &[char] = &[
|
||||
'+', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
||||
];
|
||||
|
||||
/// consume a field name followed by colon. Return the field name with escape sequence
|
||||
@@ -41,36 +42,92 @@ fn field_name(inp: &str) -> IResult<&str, String> {
|
||||
)(inp)
|
||||
}
|
||||
|
||||
const ESCAPE_IN_WORD: &[char] = &['^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '\\'];
|
||||
|
||||
fn interpret_escape(source: &str) -> String {
|
||||
let mut res = String::with_capacity(source.len());
|
||||
let mut in_escape = false;
|
||||
let require_escape = |c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c) || c == '-';
|
||||
|
||||
for c in source.chars() {
|
||||
if in_escape {
|
||||
if !require_escape(c) {
|
||||
// we re-add the escape sequence
|
||||
res.push('\\');
|
||||
}
|
||||
res.push(c);
|
||||
in_escape = false;
|
||||
} else if c == '\\' {
|
||||
in_escape = true;
|
||||
} else {
|
||||
res.push(c);
|
||||
}
|
||||
}
|
||||
res
|
||||
}
|
||||
|
||||
/// Consume a word outside of any context.
|
||||
// TODO should support escape sequences
|
||||
fn word(inp: &str) -> IResult<&str, &str> {
|
||||
fn word(inp: &str) -> IResult<&str, Cow<str>> {
|
||||
map_res(
|
||||
recognize(tuple((
|
||||
satisfy(|c| {
|
||||
!c.is_whitespace()
|
||||
&& !['-', '^', '`', ':', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
}),
|
||||
many0(satisfy(|c: char| {
|
||||
!c.is_whitespace() && ![':', '^', '{', '}', '"', '[', ']', '(', ')'].contains(&c)
|
||||
})),
|
||||
alt((
|
||||
preceded(char('\\'), anychar),
|
||||
satisfy(|c| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c) && c != '-'),
|
||||
)),
|
||||
many0(alt((
|
||||
preceded(char('\\'), anychar),
|
||||
satisfy(|c: char| !c.is_whitespace() && !ESCAPE_IN_WORD.contains(&c)),
|
||||
))),
|
||||
))),
|
||||
|s| match s {
|
||||
"OR" | "AND" | "NOT" | "IN" => Err(Error::new(inp, ErrorKind::Tag)),
|
||||
_ => Ok(s),
|
||||
s if s.contains('\\') => Ok(Cow::Owned(interpret_escape(s))),
|
||||
s => Ok(Cow::Borrowed(s)),
|
||||
},
|
||||
)(inp)
|
||||
}
|
||||
|
||||
fn word_infallible(delimiter: &str) -> impl Fn(&str) -> JResult<&str, Option<&str>> + '_ {
|
||||
|inp| {
|
||||
opt_i_err(
|
||||
preceded(
|
||||
multispace0,
|
||||
recognize(many1(satisfy(|c| {
|
||||
!c.is_whitespace() && !delimiter.contains(c)
|
||||
}))),
|
||||
fn word_infallible(
|
||||
delimiter: &str,
|
||||
emit_error: bool,
|
||||
) -> impl Fn(&str) -> JResult<&str, Option<Cow<str>>> + '_ {
|
||||
// emit_error is set when an unescaped `:` should emit an error
|
||||
|
||||
move |inp| {
|
||||
map(
|
||||
opt_i_err(
|
||||
preceded(
|
||||
multispace0,
|
||||
recognize(many1(alt((
|
||||
preceded(char::<&str, _>('\\'), anychar),
|
||||
satisfy(|c| !c.is_whitespace() && !delimiter.contains(c)),
|
||||
)))),
|
||||
),
|
||||
"expected word",
|
||||
),
|
||||
"expected word",
|
||||
|(opt_s, mut errors)| match opt_s {
|
||||
Some(s) => {
|
||||
if emit_error
|
||||
&& (s
|
||||
.as_bytes()
|
||||
.windows(2)
|
||||
.any(|window| window[0] != b'\\' && window[1] == b':')
|
||||
|| s.starts_with(':'))
|
||||
{
|
||||
errors.push(LenientErrorInternal {
|
||||
pos: inp.len(),
|
||||
message: "parsed possible invalid field as term".to_string(),
|
||||
});
|
||||
}
|
||||
if s.contains('\\') {
|
||||
(Some(Cow::Owned(interpret_escape(s))), errors)
|
||||
} else {
|
||||
(Some(Cow::Borrowed(s)), errors)
|
||||
}
|
||||
}
|
||||
None => (None, errors),
|
||||
},
|
||||
)(inp)
|
||||
}
|
||||
}
|
||||
@@ -159,7 +216,7 @@ fn simple_term_infallible(
|
||||
(value((), char('\'')), simple_quotes),
|
||||
),
|
||||
// numbers are parsed with words in this case, as we allow string starting with a -
|
||||
map(word_infallible(delimiter), |(text, errors)| {
|
||||
map(word_infallible(delimiter, true), |(text, errors)| {
|
||||
(text.map(|text| (Delimiter::None, text.to_string())), errors)
|
||||
}),
|
||||
)(inp)
|
||||
@@ -264,7 +321,17 @@ fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
|
||||
UserInputLeaf::Exists {
|
||||
field: String::new(),
|
||||
},
|
||||
tuple((multispace0, char('*'))),
|
||||
tuple((
|
||||
multispace0,
|
||||
char('*'),
|
||||
peek(alt((
|
||||
value(
|
||||
"",
|
||||
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
|
||||
),
|
||||
eof,
|
||||
))),
|
||||
)),
|
||||
)(inp)
|
||||
}
|
||||
|
||||
@@ -274,7 +341,14 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
|
||||
peek(tuple((
|
||||
field_name,
|
||||
multispace0,
|
||||
char('*'), // when we are here, we know it can't be anything but a exists
|
||||
char('*'),
|
||||
peek(alt((
|
||||
value(
|
||||
"",
|
||||
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
|
||||
),
|
||||
eof,
|
||||
))), // we need to check this isn't a wildcard query
|
||||
))),
|
||||
)(inp)
|
||||
.map_err(|e| e.map(|_| ()))
|
||||
@@ -322,15 +396,6 @@ fn literal_no_group_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>>
|
||||
|((field_name, _, leaf), mut errors)| {
|
||||
(
|
||||
leaf.map(|leaf| {
|
||||
if matches!(&leaf, UserInputLeaf::Literal(literal)
|
||||
if literal.phrase.contains(':') && literal.delimiter == Delimiter::None)
|
||||
&& field_name.is_none()
|
||||
{
|
||||
errors.push(LenientErrorInternal {
|
||||
pos: inp.len(),
|
||||
message: "parsed possible invalid field as term".to_string(),
|
||||
});
|
||||
}
|
||||
if matches!(&leaf, UserInputLeaf::Literal(literal)
|
||||
if literal.phrase == "NOT" && literal.delimiter == Delimiter::None)
|
||||
&& field_name.is_none()
|
||||
@@ -449,20 +514,20 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
tuple_infallible((
|
||||
opt_i(anychar),
|
||||
space0_infallible,
|
||||
word_infallible("]}"),
|
||||
word_infallible("]}", false),
|
||||
space1_infallible,
|
||||
opt_i_err(
|
||||
terminated(tag("TO"), alt((value((), multispace1), value((), eof)))),
|
||||
"missing keyword TO",
|
||||
),
|
||||
word_infallible("]}"),
|
||||
word_infallible("]}", false),
|
||||
opt_i_err(one_of("]}"), "missing range delimiter"),
|
||||
)),
|
||||
|(
|
||||
(lower_bound_kind, _multispace0, lower, _multispace1, to, upper, upper_bound_kind),
|
||||
errs,
|
||||
)| {
|
||||
let lower_bound = match (lower_bound_kind, lower) {
|
||||
let lower_bound = match (lower_bound_kind, lower.as_deref()) {
|
||||
(_, Some("*")) => UserInputBound::Unbounded,
|
||||
(_, None) => UserInputBound::Unbounded,
|
||||
// if it is some, TO was actually the bound (i.e. [TO TO something])
|
||||
@@ -471,7 +536,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
(Some('{'), Some(bound)) => UserInputBound::Exclusive(bound.to_string()),
|
||||
_ => unreachable!("precondition failed, range did not start with [ or {{"),
|
||||
};
|
||||
let upper_bound = match (upper_bound_kind, upper) {
|
||||
let upper_bound = match (upper_bound_kind, upper.as_deref()) {
|
||||
(_, Some("*")) => UserInputBound::Unbounded,
|
||||
(_, None) => UserInputBound::Unbounded,
|
||||
(Some(']'), Some(bound)) => UserInputBound::Inclusive(bound.to_string()),
|
||||
@@ -488,7 +553,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
(
|
||||
(
|
||||
value((), tag(">=")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -502,7 +567,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<=")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -516,7 +581,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag(">")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
bound
|
||||
@@ -530,7 +595,7 @@ fn range_infallible(inp: &str) -> JResult<&str, UserInputLeaf> {
|
||||
),
|
||||
(
|
||||
value((), tag("<")),
|
||||
map(word_infallible(""), |(bound, err)| {
|
||||
map(word_infallible("", false), |(bound, err)| {
|
||||
(
|
||||
(
|
||||
UserInputBound::Unbounded,
|
||||
@@ -719,7 +784,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
|
||||
tuple((fallible(occur_symbol), boosted_leaf))(inp)
|
||||
}
|
||||
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[expect(clippy::type_complexity)]
|
||||
fn operand_occur_leaf_infallible(
|
||||
inp: &str,
|
||||
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
|
||||
@@ -785,7 +850,7 @@ fn aggregate_infallible_expressions(
|
||||
if early_operand {
|
||||
err.push(LenientErrorInternal {
|
||||
pos: 0,
|
||||
message: "Found unexpeted boolean operator before term".to_string(),
|
||||
message: "Found unexpected boolean operator before term".to_string(),
|
||||
});
|
||||
}
|
||||
|
||||
@@ -808,7 +873,7 @@ fn aggregate_infallible_expressions(
|
||||
_ => Some(Occur::Should),
|
||||
};
|
||||
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
||||
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||
clauses.push(vec![(
|
||||
Some(Occur::Should),
|
||||
ast.clone().unary(Occur::MustNot),
|
||||
@@ -824,7 +889,7 @@ fn aggregate_infallible_expressions(
|
||||
None => None,
|
||||
};
|
||||
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
||||
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||
clauses.push(vec![(
|
||||
Some(Occur::Should),
|
||||
ast.clone().unary(Occur::MustNot),
|
||||
@@ -849,7 +914,7 @@ fn aggregate_infallible_expressions(
|
||||
}
|
||||
Some(BinaryOperand::Or) => {
|
||||
if last_occur == Some(Occur::MustNot) {
|
||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
||||
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||
clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
|
||||
} else {
|
||||
clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
|
||||
@@ -1009,7 +1074,7 @@ mod test {
|
||||
valid_parse("1", 1.0, "");
|
||||
valid_parse("0.234234 aaa", 0.234234f64, " aaa");
|
||||
error_parse(".3332");
|
||||
// TODO trinity-1686a: I disagree that it should fail, I think it should succeeed,
|
||||
// TODO trinity-1686a: I disagree that it should fail, I think it should succeed,
|
||||
// consuming only "1", and leave "." for the next thing (which will likely fail then)
|
||||
// error_parse("1.");
|
||||
error_parse("-1.");
|
||||
@@ -1157,6 +1222,12 @@ mod test {
|
||||
test_parse_query_to_ast_helper("weight: <= 70", "\"weight\":{\"*\" TO \"70\"]");
|
||||
|
||||
test_parse_query_to_ast_helper("weight: <= 70.5", "\"weight\":{\"*\" TO \"70.5\"]");
|
||||
|
||||
test_parse_query_to_ast_helper(">a", "{\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper(">=a", "[\"a\" TO \"*\"}");
|
||||
test_parse_query_to_ast_helper("<a", "{\"*\" TO \"a\"}");
|
||||
test_parse_query_to_ast_helper("<=a", "{\"*\" TO \"a\"]");
|
||||
test_parse_query_to_ast_helper("<=bsd", "{\"*\" TO \"bsd\"]");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1413,7 +1484,7 @@ mod test {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_to_triming_spaces() {
|
||||
fn test_parse_query_to_trimming_spaces() {
|
||||
test_parse_query_to_ast_helper(" abc", "abc");
|
||||
test_parse_query_to_ast_helper("abc ", "abc");
|
||||
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
|
||||
@@ -1443,6 +1514,11 @@ mod test {
|
||||
test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn field_re_specification() {
|
||||
test_parse_query_to_ast_helper(r#"field:(abc AND b:cde)"#, r#"(+"field":abc +"b":cde)"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_parse_query_single_term() {
|
||||
test_parse_query_to_ast_helper("abc", "abc");
|
||||
@@ -1565,13 +1641,19 @@ mod test {
|
||||
|
||||
#[test]
|
||||
fn test_exist_query() {
|
||||
test_parse_query_to_ast_helper("a:*", "\"a\":*");
|
||||
test_parse_query_to_ast_helper("a: *", "\"a\":*");
|
||||
// an exist followed by default term being b
|
||||
test_is_parse_err("a:*b", "(*\"a\":* *b)");
|
||||
test_parse_query_to_ast_helper("a:*", "$exists(\"a\")");
|
||||
test_parse_query_to_ast_helper("a: *", "$exists(\"a\")");
|
||||
|
||||
// this is a term query (not a phrase prefix)
|
||||
test_parse_query_to_ast_helper(
|
||||
"(hello AND toto:*) OR happy",
|
||||
"(?(+hello +$exists(\"toto\")) ?happy)",
|
||||
);
|
||||
test_parse_query_to_ast_helper("(a:*)", "$exists(\"a\")");
|
||||
|
||||
// these are term/wildcard query (not a phrase prefix)
|
||||
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
|
||||
test_parse_query_to_ast_helper("a:*b", "\"a\":*b");
|
||||
test_parse_query_to_ast_helper(r#"a:*def*"#, "\"a\":*def*");
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -1590,5 +1672,21 @@ mod test {
|
||||
r#"myfield:'hello\"happy\'tax'"#,
|
||||
r#""myfield":'hello"happy'tax'"#,
|
||||
);
|
||||
// we don't process escape sequence for chars which don't require it
|
||||
test_parse_query_to_ast_helper(r#"abc\*"#, r#"abc\*"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_queries_with_colons() {
|
||||
test_parse_query_to_ast_helper(r#""abc:def""#, r#""abc:def""#);
|
||||
test_parse_query_to_ast_helper(r#"'abc:def'"#, r#"'abc:def'"#);
|
||||
test_parse_query_to_ast_helper(r#"abc\:def"#, r#"abc:def"#);
|
||||
test_parse_query_to_ast_helper(r#""abc\:def""#, r#""abc:def""#);
|
||||
test_parse_query_to_ast_helper(r#"'abc\:def'"#, r#"'abc:def'"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_invalid_field() {
|
||||
test_is_parse_err(r#"!bc:def"#, "!bc:def");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
use std::fmt;
|
||||
use std::fmt::{Debug, Formatter};
|
||||
|
||||
use serde::Serialize;
|
||||
|
||||
use crate::Occur;
|
||||
|
||||
#[derive(PartialEq, Clone)]
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[serde(tag = "type")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum UserInputLeaf {
|
||||
Literal(UserInputLiteral),
|
||||
All,
|
||||
@@ -101,20 +105,22 @@ impl Debug for UserInputLeaf {
|
||||
}
|
||||
UserInputLeaf::All => write!(formatter, "*"),
|
||||
UserInputLeaf::Exists { field } => {
|
||||
write!(formatter, "\"{field}\":*")
|
||||
write!(formatter, "$exists(\"{field}\")")
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug)]
|
||||
#[derive(Copy, Clone, Eq, PartialEq, Debug, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum Delimiter {
|
||||
SingleQuotes,
|
||||
DoubleQuotes,
|
||||
None,
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone)]
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub struct UserInputLiteral {
|
||||
pub field_name: Option<String>,
|
||||
pub phrase: String,
|
||||
@@ -152,7 +158,9 @@ impl fmt::Debug for UserInputLiteral {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Debug, Clone)]
|
||||
#[derive(PartialEq, Debug, Clone, Serialize)]
|
||||
#[serde(tag = "type", content = "value")]
|
||||
#[serde(rename_all = "snake_case")]
|
||||
pub enum UserInputBound {
|
||||
Inclusive(String),
|
||||
Exclusive(String),
|
||||
@@ -187,11 +195,38 @@ impl UserInputBound {
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(PartialEq, Clone)]
|
||||
#[derive(PartialEq, Clone, Serialize)]
|
||||
#[serde(into = "UserInputAstSerde")]
|
||||
pub enum UserInputAst {
|
||||
Clause(Vec<(Option<Occur>, UserInputAst)>),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
Boost(Box<UserInputAst>, f64),
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
}
|
||||
|
||||
#[derive(Serialize)]
|
||||
#[serde(tag = "type", rename_all = "snake_case")]
|
||||
enum UserInputAstSerde {
|
||||
Bool {
|
||||
clauses: Vec<(Option<Occur>, UserInputAst)>,
|
||||
},
|
||||
Boost {
|
||||
underlying: Box<UserInputAst>,
|
||||
boost: f64,
|
||||
},
|
||||
#[serde(untagged)]
|
||||
Leaf(Box<UserInputLeaf>),
|
||||
}
|
||||
|
||||
impl From<UserInputAst> for UserInputAstSerde {
|
||||
fn from(ast: UserInputAst) -> Self {
|
||||
match ast {
|
||||
UserInputAst::Clause(clause) => UserInputAstSerde::Bool { clauses: clause },
|
||||
UserInputAst::Boost(underlying, boost) => {
|
||||
UserInputAstSerde::Boost { underlying, boost }
|
||||
}
|
||||
UserInputAst::Leaf(leaf) => UserInputAstSerde::Leaf(leaf),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl UserInputAst {
|
||||
@@ -267,7 +302,7 @@ impl fmt::Debug for UserInputAst {
|
||||
match *self {
|
||||
UserInputAst::Clause(ref subqueries) => {
|
||||
if subqueries.is_empty() {
|
||||
// TODO this will break ast reserialization, is writing "( )" enought?
|
||||
// TODO this will break ast reserialization, is writing "( )" enough?
|
||||
write!(formatter, "<emptyclause>")?;
|
||||
} else {
|
||||
write!(formatter, "(")?;
|
||||
@@ -285,3 +320,126 @@ impl fmt::Debug for UserInputAst {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn test_all_leaf_serialization() {
|
||||
let ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(json, r#"{"type":"all"}"#);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_literal_leaf_serialization() {
|
||||
let literal = UserInputLiteral {
|
||||
field_name: Some("title".to_string()),
|
||||
phrase: "hello".to_string(),
|
||||
delimiter: Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
};
|
||||
let ast = UserInputAst::Leaf(Box::new(UserInputLeaf::Literal(literal)));
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"literal","field_name":"title","phrase":"hello","delimiter":"none","slop":0,"prefix":false}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_leaf_serialization() {
|
||||
let range = UserInputLeaf::Range {
|
||||
field: Some("price".to_string()),
|
||||
lower: UserInputBound::Inclusive("10".to_string()),
|
||||
upper: UserInputBound::Exclusive("100".to_string()),
|
||||
};
|
||||
let ast = UserInputAst::Leaf(Box::new(range));
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"range","field":"price","lower":{"type":"inclusive","value":"10"},"upper":{"type":"exclusive","value":"100"}}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_range_leaf_unbounded_serialization() {
|
||||
let range = UserInputLeaf::Range {
|
||||
field: Some("price".to_string()),
|
||||
lower: UserInputBound::Inclusive("10".to_string()),
|
||||
upper: UserInputBound::Unbounded,
|
||||
};
|
||||
let ast = UserInputAst::Leaf(Box::new(range));
|
||||
let json = serde_json::to_string(&ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"range","field":"price","lower":{"type":"inclusive","value":"10"},"upper":{"type":"unbounded"}}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_boost_serialization() {
|
||||
let inner_ast = UserInputAst::Leaf(Box::new(UserInputLeaf::All));
|
||||
let boost_ast = UserInputAst::Boost(Box::new(inner_ast), 2.5);
|
||||
let json = serde_json::to_string(&boost_ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"boost","underlying":{"type":"all"},"boost":2.5}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_boost_serialization2() {
|
||||
let boost_ast = UserInputAst::Boost(
|
||||
Box::new(UserInputAst::Clause(vec![
|
||||
(
|
||||
Some(Occur::Must),
|
||||
UserInputAst::Leaf(Box::new(UserInputLeaf::All)),
|
||||
),
|
||||
(
|
||||
Some(Occur::Should),
|
||||
UserInputAst::Leaf(Box::new(UserInputLeaf::Literal(UserInputLiteral {
|
||||
field_name: Some("title".to_string()),
|
||||
phrase: "hello".to_string(),
|
||||
delimiter: Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
}))),
|
||||
),
|
||||
])),
|
||||
2.5,
|
||||
);
|
||||
let json = serde_json::to_string(&boost_ast).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"boost","underlying":{"type":"bool","clauses":[["must",{"type":"all"}],["should",{"type":"literal","field_name":"title","phrase":"hello","delimiter":"none","slop":0,"prefix":false}]]},"boost":2.5}"#
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_clause_serialization() {
|
||||
let clause = UserInputAst::Clause(vec![
|
||||
(
|
||||
Some(Occur::Must),
|
||||
UserInputAst::Leaf(Box::new(UserInputLeaf::All)),
|
||||
),
|
||||
(
|
||||
Some(Occur::Should),
|
||||
UserInputAst::Leaf(Box::new(UserInputLeaf::Literal(UserInputLiteral {
|
||||
field_name: Some("title".to_string()),
|
||||
phrase: "hello".to_string(),
|
||||
delimiter: Delimiter::None,
|
||||
slop: 0,
|
||||
prefix: false,
|
||||
}))),
|
||||
),
|
||||
]);
|
||||
let json = serde_json::to_string(&clause).unwrap();
|
||||
assert_eq!(
|
||||
json,
|
||||
r#"{"type":"bool","clauses":[["must",{"type":"all"}],["should",{"type":"literal","field_name":"title","phrase":"hello","delimiter":"none","slop":0,"prefix":false}]]}"#
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,7 +21,10 @@ impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
|
||||
|
||||
/// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
|
||||
/// (500MB). The limit is shared by all SegmentCollectors
|
||||
pub struct AggregationLimits {
|
||||
///
|
||||
/// The memory limit is also a guard, which tracks how much it allocated and releases its memory
|
||||
/// on the shared counter. Cloning will create a new guard.
|
||||
pub struct AggregationLimitsGuard {
|
||||
/// The counter which is shared between the aggregations for one request.
|
||||
memory_consumption: Arc<AtomicU64>,
|
||||
/// The memory_limit in bytes
|
||||
@@ -29,28 +32,41 @@ pub struct AggregationLimits {
|
||||
/// The maximum number of buckets _returned_
|
||||
/// This is not counting intermediate buckets.
|
||||
bucket_limit: u32,
|
||||
/// Allocated memory with this guard.
|
||||
allocated_with_the_guard: u64,
|
||||
}
|
||||
impl Clone for AggregationLimits {
|
||||
impl Clone for AggregationLimitsGuard {
|
||||
fn clone(&self) -> Self {
|
||||
Self {
|
||||
memory_consumption: Arc::clone(&self.memory_consumption),
|
||||
memory_limit: self.memory_limit,
|
||||
bucket_limit: self.bucket_limit,
|
||||
allocated_with_the_guard: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AggregationLimits {
|
||||
impl Drop for AggregationLimitsGuard {
|
||||
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
|
||||
/// This is used to clear the segment specific memory consumption all at once.
|
||||
fn drop(&mut self) {
|
||||
self.memory_consumption
|
||||
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for AggregationLimitsGuard {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
memory_consumption: Default::default(),
|
||||
memory_limit: DEFAULT_MEMORY_LIMIT.into(),
|
||||
bucket_limit: DEFAULT_BUCKET_LIMIT,
|
||||
allocated_with_the_guard: 0,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl AggregationLimits {
|
||||
impl AggregationLimitsGuard {
|
||||
/// *memory_limit*
|
||||
/// memory_limit is defined in bytes.
|
||||
/// Aggregation fails when the estimated memory consumption of the aggregation is higher than
|
||||
@@ -67,24 +83,15 @@ impl AggregationLimits {
|
||||
memory_consumption: Default::default(),
|
||||
memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
|
||||
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a new ResourceLimitGuard, that will release the memory when dropped.
|
||||
pub fn new_guard(&self) -> ResourceLimitGuard {
|
||||
ResourceLimitGuard {
|
||||
// The counter which is shared between the aggregations for one request.
|
||||
memory_consumption: Arc::clone(&self.memory_consumption),
|
||||
// The memory_limit in bytes
|
||||
memory_limit: self.memory_limit,
|
||||
allocated_with_the_guard: 0,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
|
||||
pub(crate) fn add_memory_consumed(&mut self, add_num_bytes: u64) -> crate::Result<()> {
|
||||
let prev_value = self
|
||||
.memory_consumption
|
||||
.fetch_add(add_num_bytes, Ordering::Relaxed);
|
||||
self.allocated_with_the_guard += add_num_bytes;
|
||||
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
|
||||
Ok(())
|
||||
}
|
||||
@@ -109,34 +116,6 @@ fn validate_memory_consumption(
|
||||
Ok(())
|
||||
}
|
||||
|
||||
pub struct ResourceLimitGuard {
|
||||
/// The counter which is shared between the aggregations for one request.
|
||||
memory_consumption: Arc<AtomicU64>,
|
||||
/// The memory_limit in bytes
|
||||
memory_limit: ByteCount,
|
||||
/// Allocated memory with this guard.
|
||||
allocated_with_the_guard: u64,
|
||||
}
|
||||
|
||||
impl ResourceLimitGuard {
|
||||
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
|
||||
let prev_value = self
|
||||
.memory_consumption
|
||||
.fetch_add(add_num_bytes, Ordering::Relaxed);
|
||||
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
||||
impl Drop for ResourceLimitGuard {
|
||||
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
|
||||
/// This is used to clear the segment specific memory consumption all at once.
|
||||
fn drop(&mut self) {
|
||||
self.memory_consumption
|
||||
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use crate::aggregation::tests::exec_request_with_query;
|
||||
|
||||
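The shared-counter guard pattern that this refactor converges on can be sketched in isolation as follows (simplified types, not the actual tantivy implementation): all clones share one atomic counter, each instance remembers only what it added, and `Drop` returns that share.

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct LimitsGuard {
    shared: Arc<AtomicU64>, // consumption shared by all clones of one request
    limit: u64,             // memory limit in bytes
    allocated_here: u64,    // bytes tracked by this particular instance
}

impl LimitsGuard {
    fn add_memory_consumed(&mut self, bytes: u64) -> Result<(), String> {
        let prev = self.shared.fetch_add(bytes, Ordering::Relaxed);
        self.allocated_here += bytes;
        if prev + bytes > self.limit {
            return Err(format!("memory limit of {} bytes exceeded", self.limit));
        }
        Ok(())
    }
}

impl Clone for LimitsGuard {
    fn clone(&self) -> Self {
        // A clone shares the counter but starts without any allocation of its own.
        Self {
            shared: Arc::clone(&self.shared),
            limit: self.limit,
            allocated_here: 0,
        }
    }
}

impl Drop for LimitsGuard {
    fn drop(&mut self) {
        // Release only what this instance tracked, leaving other clones untouched.
        self.shared.fetch_sub(self.allocated_here, Ordering::Relaxed);
    }
}

fn main() {
    let mut guard = LimitsGuard {
        shared: Arc::new(AtomicU64::new(0)),
        limit: 1_000,
        allocated_here: 0,
    };
    let mut other = guard.clone();
    guard.add_memory_consumed(600).unwrap();
    // The shared counter now exceeds the limit, so the clone's allocation fails.
    assert!(other.add_memory_consumed(600).is_err());
}
```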
@@ -34,8 +34,9 @@ use super::bucket::{
|
||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation,
|
||||
PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
|
||||
TopHitsAggregationReq,
|
||||
};
|
||||
|
||||
/// The top-level aggregation request structure, which contains [`Aggregation`] and their user
|
||||
@@ -146,6 +147,11 @@ pub enum AggregationVariants {
|
||||
/// extracted values.
|
||||
#[serde(rename = "stats")]
|
||||
Stats(StatsAggregation),
|
||||
/// Computes a collection of extended statistics (`min`, `max`, `sum`, `count`, `avg`,
|
||||
/// `sum_of_squares`, `variance`, `variance_sampling`, `std_deviation`,
|
||||
/// `std_deviation_sampling`) over the extracted values.
|
||||
#[serde(rename = "extended_stats")]
|
||||
ExtendedStats(ExtendedStatsAggregation),
|
||||
/// Computes the sum of the extracted values.
|
||||
#[serde(rename = "sum")]
|
||||
Sum(SumAggregation),
|
||||
@@ -154,7 +160,10 @@ pub enum AggregationVariants {
|
||||
Percentiles(PercentilesAggregationReq),
|
||||
/// Finds the top k values matching some order
|
||||
#[serde(rename = "top_hits")]
|
||||
TopHits(TopHitsAggregation),
|
||||
TopHits(TopHitsAggregationReq),
|
||||
/// Computes an estimate of the number of unique values
|
||||
#[serde(rename = "cardinality")]
|
||||
Cardinality(CardinalityAggregationReq),
|
||||
}
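Both newly added metric variants deserialize from the usual Elasticsearch-style request JSON. A hedged sketch of such a request (`price` and `user_id` are illustrative fast field names):

```rust
use tantivy::aggregation::agg_req::Aggregations;

fn main() -> serde_json::Result<()> {
    // Hypothetical request exercising the extended_stats and cardinality variants.
    let agg_req: Aggregations = serde_json::from_str(
        r#"{
            "price_stats": { "extended_stats": { "field": "price" } },
            "unique_users": { "cardinality": { "field": "user_id" } }
        }"#,
    )?;
    let _ = agg_req;
    Ok(())
}
```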
|
||||
|
||||
impl AggregationVariants {
|
||||
@@ -170,9 +179,11 @@ impl AggregationVariants {
|
||||
AggregationVariants::Max(max) => vec![max.field_name()],
|
||||
AggregationVariants::Min(min) => vec![min.field_name()],
|
||||
AggregationVariants::Stats(stats) => vec![stats.field_name()],
|
||||
AggregationVariants::ExtendedStats(extended_stats) => vec![extended_stats.field_name()],
|
||||
AggregationVariants::Sum(sum) => vec![sum.field_name()],
|
||||
AggregationVariants::Percentiles(per) => vec![per.field_name()],
|
||||
AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
|
||||
AggregationVariants::Cardinality(per) => vec![per.field_name()],
|
||||
}
|
||||
}
|
||||
|
||||
@@ -197,6 +208,12 @@ impl AggregationVariants {
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
|
||||
match &self {
|
||||
AggregationVariants::TopHits(top_hits) => Some(top_hits),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
pub(crate) fn as_percentile(&self) -> Option<&PercentilesAggregationReq> {
|
||||
match &self {
|
||||
|
||||
@@ -5,16 +5,15 @@ use std::io;
|
||||
|
||||
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
||||
|
||||
use super::agg_limits::ResourceLimitGuard;
|
||||
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||
use super::bucket::{
|
||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||
};
|
||||
use super::metric::{
|
||||
AverageAggregation, CountAggregation, MaxAggregation, MinAggregation, StatsAggregation,
|
||||
SumAggregation,
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||
};
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::segment_agg_result::AggregationLimitsGuard;
|
||||
use super::VecWithNames;
|
||||
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
||||
use crate::index::SegmentReader;
|
||||
@@ -46,7 +45,7 @@ pub struct AggregationWithAccessor {
|
||||
pub(crate) str_dict_column: Option<StrColumn>,
|
||||
pub(crate) field_type: ColumnType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
pub(crate) limits: ResourceLimitGuard,
|
||||
pub(crate) limits: AggregationLimitsGuard,
|
||||
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
||||
/// Used for missing term aggregation, which checks all columns for existence.
|
||||
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
||||
@@ -69,7 +68,7 @@ impl AggregationWithAccessor {
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
limits: AggregationLimits,
|
||||
limits: AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
||||
let mut agg = agg.clone();
|
||||
|
||||
@@ -91,7 +90,7 @@ impl AggregationWithAccessor {
|
||||
&limits,
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits: limits.clone(),
|
||||
missing_value_for_accessor: None,
|
||||
str_dict_column: None,
|
||||
column_block_accessor: Default::default(),
|
||||
@@ -106,6 +105,7 @@ impl AggregationWithAccessor {
|
||||
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
||||
-> crate::Result<()> {
|
||||
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
||||
let limits = limits.clone();
|
||||
let res = AggregationWithAccessor {
|
||||
segment_ordinal,
|
||||
// TODO: We should do away with the `accessor` field altogether
|
||||
@@ -120,7 +120,7 @@ impl AggregationWithAccessor {
|
||||
&limits,
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits,
|
||||
missing_value_for_accessor: None,
|
||||
str_dict_column: None,
|
||||
column_block_accessor: Default::default(),
|
||||
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
|
||||
field: ref field_name,
|
||||
ref missing,
|
||||
..
|
||||
})
|
||||
| Cardinality(CardinalityAggregationReq {
|
||||
field: ref field_name,
|
||||
ref missing,
|
||||
..
|
||||
}) => {
|
||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||
let allowed_column_types = [
|
||||
@@ -181,6 +186,8 @@ impl AggregationWithAccessor {
|
||||
.map(|missing| match missing {
|
||||
Key::Str(_) => ColumnType::Str,
|
||||
Key::F64(_) => ColumnType::F64,
|
||||
Key::I64(_) => ColumnType::I64,
|
||||
Key::U64(_) => ColumnType::U64,
|
||||
})
|
||||
.unwrap_or(ColumnType::U64);
|
||||
let column_and_types = get_all_ff_reader_or_empty(
|
||||
@@ -227,14 +234,18 @@ impl AggregationWithAccessor {
|
||||
missing.clone()
|
||||
};
|
||||
|
||||
let missing_value_for_accessor = if let Some(missing) =
|
||||
missing_value_term_agg.as_ref()
|
||||
{
|
||||
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let missing_value_for_accessor =
|
||||
if let Some(missing) = missing_value_term_agg.as_ref() {
|
||||
get_missing_val_as_u64_lenient(
|
||||
column_type,
|
||||
missing,
|
||||
agg.agg.get_fast_field_names()[0],
|
||||
)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let limits = limits.clone();
|
||||
let agg = AggregationWithAccessor {
|
||||
segment_ordinal,
|
||||
missing_value_for_accessor,
|
||||
@@ -250,7 +261,7 @@ impl AggregationWithAccessor {
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
str_dict_column: str_dict_column.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits,
|
||||
column_block_accessor: Default::default(),
|
||||
};
|
||||
res.push(agg);
|
||||
@@ -260,10 +271,6 @@ impl AggregationWithAccessor {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| Count(CountAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| Max(MaxAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
@@ -276,6 +283,10 @@ impl AggregationWithAccessor {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| ExtendedStats(ExtendedStatsAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
})
|
||||
| Sum(SumAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
@@ -284,6 +295,24 @@ impl AggregationWithAccessor {
|
||||
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
|
||||
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
|
||||
}
|
||||
Count(CountAggregation {
|
||||
field: ref field_name,
|
||||
..
|
||||
}) => {
|
||||
let allowed_column_types = [
|
||||
ColumnType::I64,
|
||||
ColumnType::U64,
|
||||
ColumnType::F64,
|
||||
ColumnType::Str,
|
||||
ColumnType::DateTime,
|
||||
ColumnType::Bool,
|
||||
ColumnType::IpAddr,
|
||||
// ColumnType::Bytes Unsupported
|
||||
];
|
||||
let (accessor, column_type) =
|
||||
get_ff_reader(reader, field_name, Some(&allowed_column_types))?;
|
||||
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
|
||||
}
|
||||
Percentiles(ref percentiles) => {
|
||||
let (accessor, column_type) = get_ff_reader(
|
||||
reader,
|
||||
@@ -321,7 +350,14 @@ impl AggregationWithAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_missing_val(
|
||||
/// Get the missing value as internal u64 representation
|
||||
///
|
||||
/// For terms we use u64::MAX as sentinel value
|
||||
/// For numerical data we convert the value into the representation
|
||||
/// we would get from the fast field, when we open it as u64_lenient_for_type.
|
||||
///
|
||||
/// That way we can use it the same way as if it would come from the fastfield.
|
||||
fn get_missing_val_as_u64_lenient(
|
||||
column_type: ColumnType,
|
||||
missing: &Key,
|
||||
field_name: &str,
|
||||
@@ -330,9 +366,18 @@ fn get_missing_val(
|
||||
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
// Allow fallback to number on text fields
|
||||
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::F64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val, &column_type)
|
||||
}
|
||||
// NOTE: We may lose precision of the passed missing value by casting i64 and u64 to f64.
|
||||
Key::I64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||
}
|
||||
Key::U64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||
}
|
||||
_ => {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Missing value {missing:?} for field {field_name} is not supported for column \
|
||||
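To illustrate what this conversion serves: an aggregation request may declare a `missing` value, and documents lacking the field are bucketed under it once the value has been mapped to the fast field's internal u64 representation. A hedged example of the request shape (field name and missing value are illustrative):

```rust
use tantivy::aggregation::agg_req::Aggregations;

fn main() -> serde_json::Result<()> {
    // Hypothetical terms aggregation: documents without `category`
    // are counted under the "N/A" bucket.
    let agg_req: Aggregations = serde_json::from_str(
        r#"{ "categories": { "terms": { "field": "category", "missing": "N/A" } } }"#,
    )?;
    let _ = agg_req;
    Ok(())
}
```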
@@ -356,7 +401,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
|
||||
aggs: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
limits: &AggregationLimits,
|
||||
limits: &AggregationLimitsGuard,
|
||||
) -> crate::Result<AggregationsWithAccessor> {
|
||||
let mut aggss = Vec::new();
|
||||
for (key, agg) in aggs.iter() {
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
//! Contains the final aggregation tree.
|
||||
//!
|
||||
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
|
||||
//! This conversion computes the final result. For example: The intermediate result contains
|
||||
//! intermediate average results, which is the sum and the number of values. The actual average is
|
||||
@@ -8,7 +9,9 @@ use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::bucket::GetDocCount;
|
||||
use super::metric::{PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult};
|
||||
use super::metric::{
|
||||
ExtendedStats, PercentilesMetricResult, SingleMetricResult, Stats, TopHitsMetricResult,
|
||||
};
|
||||
use super::{AggregationError, Key};
|
||||
use crate::TantivyError;
|
||||
|
||||
@@ -88,12 +91,16 @@ pub enum MetricResult {
|
||||
Min(SingleMetricResult),
|
||||
/// Stats metric result.
|
||||
Stats(Stats),
|
||||
/// ExtendedStats metric result.
|
||||
ExtendedStats(Box<ExtendedStats>),
|
||||
/// Sum metric result.
|
||||
Sum(SingleMetricResult),
|
||||
/// Percentiles metric result.
|
||||
Percentiles(PercentilesMetricResult),
|
||||
/// Top hits metric result
|
||||
TopHits(TopHitsMetricResult),
|
||||
/// Cardinality metric result
|
||||
Cardinality(SingleMetricResult),
|
||||
}
|
||||
|
||||
impl MetricResult {
|
||||
@@ -104,6 +111,7 @@ impl MetricResult {
|
||||
MetricResult::Max(max) => Ok(max.value),
|
||||
MetricResult::Min(min) => Ok(min.value),
|
||||
MetricResult::Stats(stats) => stats.get_value(agg_property),
|
||||
MetricResult::ExtendedStats(extended_stats) => extended_stats.get_value(agg_property),
|
||||
MetricResult::Sum(sum) => Ok(sum.value),
|
||||
MetricResult::Percentiles(_) => Err(TantivyError::AggregationError(
|
||||
AggregationError::InvalidRequest("percentiles can't be used to order".to_string()),
|
||||
@@ -111,6 +119,7 @@ impl MetricResult {
|
||||
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
||||
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
||||
)),
|
||||
MetricResult::Cardinality(card) => Ok(card.value),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -179,7 +188,7 @@ pub enum BucketEntries<T> {
|
||||
}
|
||||
|
||||
impl<T> BucketEntries<T> {
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
|
||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
|
||||
match self {
|
||||
BucketEntries::Vec(vec) => Box::new(vec.iter()),
|
||||
BucketEntries::HashMap(map) => Box::new(map.values()),
|
||||
|
||||
@@ -5,7 +5,7 @@ use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
||||
use crate::aggregation::collector::AggregationCollector;
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use crate::aggregation::segment_agg_result::AggregationLimits;
|
||||
use crate::aggregation::segment_agg_result::AggregationLimitsGuard;
|
||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
|
||||
}
|
||||
}
|
||||
}
|
||||
},
|
||||
"cardinality_string_id":{
|
||||
"cardinality": {
|
||||
"field": "string_id"
|
||||
}
|
||||
},
|
||||
"cardinality_score":{
|
||||
"cardinality": {
|
||||
"field": "score"
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
@@ -120,7 +130,7 @@ fn test_aggregation_flushing(
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(
|
||||
agg_req.clone(),
|
||||
AggregationLimits::default(),
|
||||
AggregationLimitsGuard::default(),
|
||||
);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
@@ -136,7 +146,7 @@ fn test_aggregation_flushing(
|
||||
.expect("Post deserialization failed");
|
||||
|
||||
intermediate_agg_result
|
||||
.into_final_result(agg_req, &Default::default())
|
||||
.into_final_result(agg_req, Default::default())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = get_collector(agg_req);
|
||||
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
|
||||
)
|
||||
);
|
||||
|
||||
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
|
||||
assert_eq!(res["cardinality_score"]["value"], 80.0);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
@@ -447,7 +460,7 @@ fn test_aggregation_level2(
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let res = searcher.search(&term_query, &collector).unwrap();
|
||||
res.into_final_result(agg_req.clone(), &Default::default())
|
||||
res.into_final_result(agg_req.clone(), Default::default())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = get_collector(agg_req.clone());
|
||||
@@ -857,7 +870,7 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
// => Segment with all boolen
|
||||
// => Segment with all boolean
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
||||
.unwrap();
|
||||
@@ -926,11 +939,11 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
},
|
||||
"termagg": {
|
||||
"buckets": [
|
||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
||||
{ "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } },
|
||||
],
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
@@ -938,3 +951,60 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json = schema_builder.add_json_field("json", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
// => Segment with all values f64 numeric
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10.5})))
|
||||
.unwrap();
|
||||
// Gets converted to f64!
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
// => Segment with all values i64 numeric
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
// All bucket types
|
||||
let agg_req_str = r#"
|
||||
{
|
||||
"termagg": {
|
||||
"terms": {
|
||||
"field": "json.mixed_price"
|
||||
}
|
||||
}
|
||||
} "#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let aggregation_collector = get_collector(agg);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
|
||||
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
|
||||
use pretty_assertions::assert_eq;
|
||||
assert_eq!(
|
||||
&aggregation_res_json,
|
||||
&serde_json::json!({
|
||||
"termagg": {
|
||||
"buckets": [
|
||||
{ "doc_count": 2, "key": 10},
|
||||
{ "doc_count": 1, "key": 10.5},
|
||||
],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -34,10 +34,10 @@ use crate::aggregation::*;
|
||||
pub struct DateHistogramAggregationReq {
|
||||
#[doc(hidden)]
|
||||
/// Only for validation
|
||||
interval: Option<String>,
|
||||
pub interval: Option<String>,
|
||||
#[doc(hidden)]
|
||||
/// Only for validation
|
||||
calendar_interval: Option<String>,
|
||||
pub calendar_interval: Option<String>,
|
||||
/// The field to aggregate on.
|
||||
pub field: String,
|
||||
/// The format to format dates. Unsupported currently.
|
||||
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub(crate) mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -438,7 +438,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
// Generate the full list of buckets without gaps.
|
||||
//
|
||||
@@ -496,7 +496,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
||||
is_date_agg: bool,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
// Normalization is column type dependent.
|
||||
// The request used in the call to final is not yet normalized.
|
||||
@@ -750,7 +750,7 @@ mod tests {
|
||||
agg_req,
|
||||
&index,
|
||||
None,
|
||||
AggregationLimits::new(Some(5_000), None),
|
||||
AggregationLimitsGuard::new(Some(5_000), None),
|
||||
)
|
||||
.unwrap_err();
|
||||
assert!(res.to_string().starts_with(
|
||||
|
||||
@@ -112,18 +112,64 @@ impl Serialize for CustomOrder {
|
||||
impl<'de> Deserialize<'de> for CustomOrder {
|
||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
||||
if let Some((key, value)) = map.into_iter().next() {
|
||||
let value = serde_json::Value::deserialize(deserializer)?;
|
||||
let return_err = |message, val: serde_json::Value| {
|
||||
de::Error::custom(format!(
|
||||
"{}, but got {}",
|
||||
message,
|
||||
serde_json::to_string(&val).unwrap()
|
||||
))
|
||||
};
|
||||
|
||||
match value {
|
||||
serde_json::Value::Object(map) => {
|
||||
if map.len() != 1 {
|
||||
return Err(return_err(
|
||||
"expected exactly one key-value pair in the order map",
|
||||
map.into(),
|
||||
));
|
||||
}
|
||||
|
||||
let (key, value) = map.into_iter().next().unwrap();
|
||||
let order = serde_json::from_value(value).map_err(de::Error::custom)?;
|
||||
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order: value,
|
||||
order,
|
||||
})
|
||||
} else {
|
||||
Err(de::Error::custom(
|
||||
"unexpected empty map in order".to_string(),
|
||||
))
|
||||
}
|
||||
})
|
||||
serde_json::Value::Array(arr) => {
|
||||
if arr.is_empty() {
|
||||
return Err(return_err("unexpected empty array in order", arr.into()));
|
||||
}
|
||||
if arr.len() != 1 {
|
||||
return Err(return_err(
|
||||
"only one sort order supported currently",
|
||||
arr.into(),
|
||||
));
|
||||
}
|
||||
let entry = arr.into_iter().next().unwrap();
|
||||
let map = entry
|
||||
.as_object()
|
||||
.ok_or_else(|| return_err("expected object as sort order", entry.clone()))?;
|
||||
let (key, value) = map.into_iter().next().ok_or_else(|| {
|
||||
return_err(
|
||||
"expected exactly one key-value pair in the order map",
|
||||
entry.clone(),
|
||||
)
|
||||
})?;
|
||||
let order = serde_json::from_value(value.clone()).map_err(de::Error::custom)?;
|
||||
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order,
|
||||
})
|
||||
}
|
||||
_ => Err(return_err(
|
||||
"unexpected type, expected an object or array",
|
||||
value,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,11 +184,23 @@ fn custom_order_serde_test() {
|
||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||
|
||||
assert_eq!(order, order_deser);
|
||||
let order_deser: CustomOrder = serde_json::from_str("[{\"_key\":\"desc\"}]").unwrap();
|
||||
assert_eq!(order, order_deser);
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||
assert!(order_deser.is_err());
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||
assert!(order_deser.is_err());
|
||||
assert!(order_deser
|
||||
.unwrap_err()
|
||||
.to_string()
|
||||
.contains("unexpected empty array in order"));
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> =
|
||||
serde_json::from_str(r#"[{"_key":"desc"},{"_key":"desc"}]"#);
|
||||
assert_eq!(
|
||||
order_deser.unwrap_err().to_string(),
|
||||
r#"only one sort order supported currently, but got [{"_key":"desc"},{"_key":"desc"}]"#
|
||||
);
|
||||
}
|
||||
|
||||
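The CustomOrder change above switches from deserializing into a HashMap to inspecting a serde_json::Value first, so both {"_key": "desc"} and [{"_key": "desc"}] are accepted. Below is a minimal, self-contained sketch of that pattern; SortSpec, Direction, and the error messages are illustrative stand-ins, not tantivy's actual types.

use serde::de::{self, Deserialize, Deserializer};

#[derive(Debug, PartialEq, serde::Deserialize)]
#[serde(rename_all = "lowercase")]
enum Direction {
    Asc,
    Desc,
}

#[derive(Debug, PartialEq)]
struct SortSpec {
    target: String,
    direction: Direction,
}

impl<'de> Deserialize<'de> for SortSpec {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where D: Deserializer<'de> {
        // Buffer the input as a generic JSON value first.
        let value = serde_json::Value::deserialize(deserializer)?;
        // Unwrap a one-element array down to its inner object.
        let obj = match value {
            serde_json::Value::Array(arr) if arr.len() == 1 => arr.into_iter().next().unwrap(),
            serde_json::Value::Array(_) => {
                return Err(de::Error::custom("only one sort order supported"));
            }
            other => other,
        };
        let map = obj
            .as_object()
            .ok_or_else(|| de::Error::custom("expected an object as sort order"))?;
        if map.len() != 1 {
            return Err(de::Error::custom("expected exactly one key-value pair"));
        }
        let (key, val) = map.iter().next().unwrap();
        let direction = serde_json::from_value(val.clone()).map_err(de::Error::custom)?;
        Ok(SortSpec {
            target: key.clone(),
            direction,
        })
    }
}

fn main() {
    let a: SortSpec = serde_json::from_str(r#"{"_key":"desc"}"#).unwrap();
    let b: SortSpec = serde_json::from_str(r#"[{"_key":"desc"}]"#).unwrap();
    assert_eq!(a, b);
}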
@@ -4,7 +4,6 @@ use std::ops::Range;
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};

use crate::aggregation::agg_limits::ResourceLimitGuard;
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
use crate::aggregation::intermediate_agg_result::{
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
@@ -17,6 +16,7 @@ use crate::aggregation::*;
use crate::TantivyError;

/// Provide user-defined buckets to aggregate on.
///
/// Two special buckets will automatically be created to cover the whole range of values.
/// The provided buckets have to be continuous.
/// During the aggregation, the values extracted from the fast_field `field` will be checked
@@ -270,7 +270,7 @@ impl SegmentRangeCollector {
pub(crate) fn from_req_and_validate(
req: &RangeAggregation,
sub_aggregation: &mut AggregationsWithAccessor,
limits: &ResourceLimitGuard,
limits: &mut AggregationLimitsGuard,
field_type: ColumnType,
accessor_idx: usize,
) -> crate::Result<Self> {
@@ -471,7 +471,7 @@ mod tests {
SegmentRangeCollector::from_req_and_validate(
&req,
&mut Default::default(),
&AggregationLimits::default().new_guard(),
&mut AggregationLimitsGuard::default(),
field_type,
0,
)

@@ -1,9 +1,10 @@
use std::fmt::Debug;
use std::io;
use std::net::Ipv6Addr;

use columnar::column_values::CompactSpaceU64Accessor;
use columnar::{
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalValue,
};
use rustc_hash::FxHashMap;
use serde::{Deserialize, Serialize};
@@ -20,11 +21,11 @@ use crate::aggregation::intermediate_agg_result::{
use crate::aggregation::segment_agg_result::{
build_segment_agg_collector, SegmentAggregationCollector,
};
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
use crate::aggregation::{format_date, Key};
use crate::error::DataCorruption;
use crate::TantivyError;

/// Creates a bucket for every unique term and counts the number of occurences.
/// Creates a bucket for every unique term and counts the number of occurrences.
/// Note that doc_count in the response buckets equals term count here.
///
/// If the text is untokenized and single value, that means one term per document and therefore it
@@ -157,7 +158,7 @@ pub struct TermsAggregation {
/// when loading the text.
/// Special Case 1:
/// If we have multiple columns on one field, we need to have a union on the indices on both
/// columns, to find docids without a value. That requires a special missing aggreggation.
/// columns, to find docids without a value. That requires a special missing aggregation.
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
/// the special missing aggregation, since there is no mechanism in the numerical column to
/// add text.
@@ -363,7 +364,7 @@ impl SegmentTermCollector {
let term_buckets = TermBuckets::default();

if let Some(custom_order) = req.order.as_ref() {
// Validate sub aggregtion exists
// Validate sub aggregation exists
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);

@@ -466,49 +467,72 @@ impl SegmentTermCollector {
};

if self.column_type == ColumnType::Str {
let fallback_dict = Dictionary::empty();
let term_dict = agg_with_accessor
.str_dict_column
.as_ref()
.cloned()
.unwrap_or_else(|| {
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
});
let mut buffer = String::new();
for (term_id, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
// Special case for missing key
if term_id == u64::MAX {
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.push_str(missing);
dict.insert(
IntermediateKey::Str(buffer.to_string()),
intermediate_entry,
);
}
Key::F64(val) => {
buffer.push_str(&val.to_string());
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
.map(|el| el.dictionary())
.unwrap_or_else(|| &fallback_dict);
let mut buffer = Vec::new();

// special case for missing key
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
let entry = entries[index];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
let missing_key = self
.req
.missing
.as_ref()
.expect("Found placeholder term_id but `missing` is None");
match missing_key {
Key::Str(missing) => {
buffer.clear();
buffer.extend_from_slice(missing.as_bytes());
dict.insert(
IntermediateKey::Str(
String::from_utf8(buffer.to_vec())
.expect("could not convert to String"),
),
intermediate_entry,
);
}
} else {
if !term_dict.ord_to_str(term_id, &mut buffer)? {
return Err(TantivyError::InternalError(format!(
"Couldn't find term_id {term_id} in dict"
)));
Key::F64(val) => {
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
}
Key::U64(val) => {
dict.insert(IntermediateKey::U64(*val), intermediate_entry);
}
Key::I64(val) => {
dict.insert(IntermediateKey::I64(*val), intermediate_entry);
}
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
}

entries.swap_remove(index);
}

// Sort by term ord
entries.sort_unstable_by_key(|bucket| bucket.0);
let mut idx = 0;
term_dict.sorted_ords_to_term_cb(
entries.iter().map(|(term_id, _)| *term_id),
|term| {
let entry = entries[idx];
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
dict.insert(
IntermediateKey::Str(
String::from_utf8(term.to_vec()).expect("could not convert to String"),
),
intermediate_entry,
);
idx += 1;
Ok(())
},
)?;

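The block above pulls the placeholder bucket for missing values (term_id == u64::MAX) out of the entry list before the remaining ordinals are resolved against the term dictionary. A minimal, self-contained sketch of that position + swap_remove pattern, with plain tuples standing in for the real bucket structures:

fn main() {
    const MISSING: u64 = u64::MAX;
    // (term ordinal, doc count) pairs collected for one segment.
    let mut entries: Vec<(u64, u64)> = vec![(3, 7), (MISSING, 2), (1, 5)];

    if let Some(index) = entries.iter().position(|entry| entry.0 == MISSING) {
        let (_, doc_count) = entries[index];
        // Route the placeholder bucket to the user-provided `missing` key...
        println!("missing bucket doc_count = {doc_count}");
        // ...and drop it from the list so only real term ordinals remain.
        entries.swap_remove(index);
    }

    // Sort by term ordinal before resolving ordinals to terms.
    entries.sort_unstable_by_key(|entry| entry.0);
    assert_eq!(entries, vec![(1, 5), (3, 7)]);
}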
if self.req.min_doc_count == 0 {
// TODO: Handle rev streaming for descending sorting by keys
let mut stream = term_dict.dictionary().stream()?;
let mut stream = term_dict.stream()?;
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
agg_with_accessor.agg.sub_aggregation(),
);
@@ -567,8 +591,26 @@ impl SegmentTermCollector {
} else {
for (val, doc_count) in entries {
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
let val = f64_from_fastfield_u64(val, &self.column_type);
dict.insert(IntermediateKey::F64(val), intermediate_entry);
if self.column_type == ColumnType::U64 {
dict.insert(IntermediateKey::U64(val), intermediate_entry);
} else if self.column_type == ColumnType::I64 {
dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
} else {
let val = f64::from_u64(val);
let val: NumericalValue = val.into();

match val.normalize() {
NumericalValue::U64(val) => {
dict.insert(IntermediateKey::U64(val), intermediate_entry);
}
NumericalValue::I64(val) => {
dict.insert(IntermediateKey::I64(val), intermediate_entry);
}
NumericalValue::F64(val) => {
dict.insert(IntermediateKey::F64(val), intermediate_entry);
}
}
};
}
};

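The branch above stops forcing every numeric key through f64 (which silently loses precision for large u64 ids) and instead keeps u64/i64 keys as integers, normalizing only genuine floats. A small standalone sketch of that normalization idea; NormalizedKey and normalize are illustrative stand-ins, not tantivy's NumericalValue API:

#[derive(Debug, PartialEq)]
enum NormalizedKey {
    U64(u64),
    I64(i64),
    F64(f64),
}

// Map an f64 back to an integer key whenever it is exactly an integer value,
// so integer ids are reported as integers rather than lossy floats.
fn normalize(val: f64) -> NormalizedKey {
    if val.fract() == 0.0 {
        if val >= 0.0 && val <= u64::MAX as f64 {
            return NormalizedKey::U64(val as u64);
        }
        if val < 0.0 && val >= i64::MIN as f64 {
            return NormalizedKey::I64(val as i64);
        }
    }
    NormalizedKey::F64(val)
}

fn main() {
    assert_eq!(normalize(3.0), NormalizedKey::U64(3));
    assert_eq!(normalize(-7.0), NormalizedKey::I64(-7));
    assert_eq!(normalize(1.5), NormalizedKey::F64(1.5));
}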
@@ -627,7 +669,7 @@ mod tests {
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
get_test_index_from_terms, get_test_index_from_values_and_terms,
};
use crate::aggregation::AggregationLimits;
use crate::aggregation::AggregationLimitsGuard;
use crate::indexer::NoMergePolicy;
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
use crate::{Index, IndexWriter};
@@ -1190,8 +1232,8 @@ mod tests {
#[test]
fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
let terms_per_segment = vec![
vec!["terma", "terma", "termb", "termb", "termb", "termc"],
vec!["terma", "terma", "termb", "termc", "termc"],
vec!["terma", "terma", "termb", "termb", "termb"],
vec!["terma", "terma", "termb"],
];

let index = get_test_index_from_terms(false, &terms_per_segment)?;
@@ -1213,8 +1255,6 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

@@ -1382,7 +1422,7 @@ mod tests {
agg_req,
&index,
None,
AggregationLimits::new(Some(50_000), None),
AggregationLimitsGuard::new(Some(50_000), None),
)
.unwrap_err();
assert!(res
@@ -1643,7 +1683,7 @@ mod tests {
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
// text field with number as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
@@ -1703,6 +1743,54 @@ mod tests {
Ok(())
}

#[test]
fn terms_aggregation_u64_value() -> crate::Result<()> {
// Make sure that large u64 are not truncated
let mut schema_builder = Schema::builder();
let id_field = schema_builder.add_u64_field("id", FAST);
let index = Index::create_in_ram(schema_builder.build());
{
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
index_writer.set_merge_policy(Box::new(NoMergePolicy));
index_writer.add_document(doc!(
id_field => 9_223_372_036_854_775_807u64,
))?;
index_writer.add_document(doc!(
id_field => 1_769_070_189_829_214_202u64,
))?;
index_writer.add_document(doc!(
id_field => 1_769_070_189_829_214_202u64,
))?;
index_writer.commit()?;
}

let agg_req: Aggregations = serde_json::from_value(json!({
"my_ids": {
"terms": {
"field": "id"
},
}
}))
.unwrap();

let res = exec_request_with_query(agg_req, &index, None)?;

// id field
assert_eq!(
res["my_ids"]["buckets"][0]["key"],
1_769_070_189_829_214_202u64
);
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
assert_eq!(
res["my_ids"]["buckets"][1]["key"],
9_223_372_036_854_775_807u64
);
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);

Ok(())
}

#[test]
fn terms_aggregation_missing1() -> crate::Result<()> {
let mut schema_builder = Schema::builder();
@@ -1769,7 +1857,7 @@ mod tests {
res["my_texts"]["buckets"][2]["key"],
serde_json::Value::Null
);
// text field with numner as missing fallback
// text field with number as missing fallback
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);

@@ -70,7 +70,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
)?;
missing_entry.sub_aggregation = res;
}

entries.insert(missing.into(), missing_entry);

let bucket = IntermediateBucketResult::Terms {

@@ -4,7 +4,7 @@ use super::agg_result::AggregationResults;
use super::buf_collector::BufAggregationCollector;
use super::intermediate_agg_result::IntermediateAggregationResults;
use super::segment_agg_result::{
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
build_segment_agg_collector, AggregationLimitsGuard, SegmentAggregationCollector,
};
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
use crate::collector::{Collector, SegmentCollector};
@@ -22,7 +22,7 @@ pub const DEFAULT_MEMORY_LIMIT: u64 = 500_000_000;
/// The collector collects all aggregations by the underlying aggregation request.
pub struct AggregationCollector {
agg: Aggregations,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
}

impl AggregationCollector {
@@ -30,7 +30,7 @@ impl AggregationCollector {
///
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
/// bucket limit)
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
Self { agg, limits }
}
}
@@ -45,7 +45,7 @@ impl AggregationCollector {
/// into the final `AggregationResults` via the `into_final_result()` method.
pub struct DistributedAggregationCollector {
agg: Aggregations,
limits: AggregationLimits,
limits: AggregationLimitsGuard,
}

impl DistributedAggregationCollector {
@@ -53,7 +53,7 @@ impl DistributedAggregationCollector {
///
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
/// bucket limit)
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
Self { agg, limits }
}
}
@@ -115,7 +115,7 @@ impl Collector for AggregationCollector {
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
) -> crate::Result<Self::Fruit> {
let res = merge_fruits(segment_fruits)?;
res.into_final_result(self.agg.clone(), &self.limits)
res.into_final_result(self.agg.clone(), self.limits.clone())
}
}

@@ -147,7 +147,7 @@ impl AggregationSegmentCollector {
agg: &Aggregations,
reader: &SegmentReader,
segment_ordinal: SegmentOrdinal,
limits: &AggregationLimits,
limits: &AggregationLimitsGuard,
) -> crate::Result<Self> {
let mut aggs_with_accessor =
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;

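For context on how the collector API touched above is driven end to end, here is a hedged usage sketch: build an Aggregations request from JSON and run it through an AggregationCollector over an in-RAM index. The schema, field name, and query are illustrative, and the limits argument is passed as Default::default() so the sketch does not depend on the exact limits type; paths and signatures may differ slightly between tantivy versions.

use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::AggregationCollector;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    // Index a couple of documents with a fast u64 field.
    let mut schema_builder = Schema::builder();
    let id_field = schema_builder.add_u64_field("id", FAST);
    let index = Index::create_in_ram(schema_builder.build());
    let mut writer = index.writer_with_num_threads(1, 20_000_000)?;
    writer.add_document(doc!(id_field => 42u64))?;
    writer.add_document(doc!(id_field => 42u64))?;
    writer.commit()?;

    // A terms aggregation request, parsed from the same JSON shape used in the tests above.
    let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
        "ids": { "terms": { "field": "id" } }
    }))
    .expect("valid aggregation request");

    // The collector takes the request plus a limits value (memory / bucket budget).
    let collector = AggregationCollector::from_aggs(agg_req, Default::default());
    let reader = index.reader()?;
    let agg_res = reader.searcher().search(&AllQuery, &collector)?;
    println!("{}", serde_json::to_string_pretty(&agg_res).expect("serializable result"));
    Ok(())
}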
@@ -19,13 +19,14 @@ use super::bucket::{
GetDocCount, Order, OrderTarget, RangeAggregation, TermsAggregation,
};
use super::metric::{
IntermediateAverage, IntermediateCount, IntermediateMax, IntermediateMin, IntermediateStats,
IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
};
use super::segment_agg_result::AggregationLimits;
use super::segment_agg_result::AggregationLimitsGuard;
use super::{format_date, AggregationError, Key, SerializedKey};
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
use crate::aggregation::bucket::TermsAggregationInternal;
use crate::aggregation::metric::CardinalityCollector;
use crate::TantivyError;

/// Contains the intermediate aggregation result, which is optimized to be merged with other
@@ -50,12 +51,18 @@ pub enum IntermediateKey {
Str(String),
/// `f64` key
F64(f64),
/// `i64` key
I64(i64),
/// `u64` key
U64(u64),
}
impl From<Key> for IntermediateKey {
fn from(value: Key) -> Self {
match value {
Key::Str(s) => Self::Str(s),
Key::F64(f) => Self::F64(f),
Key::U64(f) => Self::U64(f),
Key::I64(f) => Self::I64(f),
}
}
}
@@ -72,7 +79,9 @@ impl From<IntermediateKey> for Key {
}
}
IntermediateKey::F64(f) => Self::F64(f),
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
IntermediateKey::Bool(f) => Self::U64(f as u64),
IntermediateKey::U64(f) => Self::U64(f),
IntermediateKey::I64(f) => Self::I64(f),
}
}
}
@@ -85,6 +94,8 @@ impl std::hash::Hash for IntermediateKey {
match self {
IntermediateKey::Str(text) => text.hash(state),
IntermediateKey::F64(val) => val.to_bits().hash(state),
IntermediateKey::U64(val) => val.hash(state),
IntermediateKey::I64(val) => val.hash(state),
IntermediateKey::Bool(val) => val.hash(state),
IntermediateKey::IpAddr(val) => val.hash(state),
}
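The Hash impl above routes f64 keys through to_bits() because f64 implements neither Hash nor Eq. A minimal standalone illustration of that trick; FloatKey is an illustrative wrapper, not a tantivy type:

use std::collections::HashMap;
use std::hash::{Hash, Hasher};

#[derive(Debug, Clone, Copy, PartialEq)]
struct FloatKey(f64);

impl Eq for FloatKey {}

impl Hash for FloatKey {
    fn hash<H: Hasher>(&self, state: &mut H) {
        // Hash the IEEE-754 bit pattern; equal bit patterns hash equally.
        self.0.to_bits().hash(state);
    }
}

fn main() {
    let mut counts: HashMap<FloatKey, u64> = HashMap::new();
    *counts.entry(FloatKey(1337.0)).or_insert(0) += 1;
    *counts.entry(FloatKey(1337.0)).or_insert(0) += 1;
    assert_eq!(counts[&FloatKey(1337.0)], 2);
}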
@@ -111,9 +122,9 @@ impl IntermediateAggregationResults {
pub fn into_final_result(
self,
req: Aggregations,
limits: &AggregationLimits,
mut limits: AggregationLimitsGuard,
) -> crate::Result<AggregationResults> {
let res = self.into_final_result_internal(&req, limits)?;
let res = self.into_final_result_internal(&req, &mut limits)?;
let bucket_count = res.get_bucket_count() as u32;
if bucket_count > limits.get_bucket_limit() {
return Err(TantivyError::AggregationError(
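These signature changes thread the limits value through as an owned or mutable guard rather than a shared reference. A hedged sketch of the general guard idea, a cloneable handle that charges allocations against a shared budget and fails once it is exceeded; LimitsGuard, its fields, and the error type are illustrative and not tantivy's actual AggregationLimitsGuard:

use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

#[derive(Clone)]
struct LimitsGuard {
    memory_limit: u64,
    // Shared counter: clones of the guard all charge the same budget.
    consumed: Arc<AtomicU64>,
}

impl LimitsGuard {
    fn new(memory_limit: u64) -> Self {
        Self {
            memory_limit,
            consumed: Arc::new(AtomicU64::new(0)),
        }
    }

    fn add_memory(&self, bytes: u64) -> Result<(), String> {
        let total = self.consumed.fetch_add(bytes, Ordering::Relaxed) + bytes;
        if total > self.memory_limit {
            return Err(format!(
                "aggregation memory limit exceeded: {total} > {}",
                self.memory_limit
            ));
        }
        Ok(())
    }
}

fn main() {
    let guard = LimitsGuard::new(1_000);
    assert!(guard.add_memory(600).is_ok());
    // A clone shares the same budget, so the next charge trips the limit.
    assert!(guard.clone().add_memory(600).is_err());
}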
@@ -130,7 +141,7 @@ impl IntermediateAggregationResults {
pub(crate) fn into_final_result_internal(
self,
req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<AggregationResults> {
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
for (key, agg_res) in self.aggs_res.into_iter() {
@@ -215,6 +226,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
Stats(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Stats(
IntermediateStats::default(),
)),
ExtendedStats(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::ExtendedStats(IntermediateExtendedStats::default()),
),
Sum(_) => IntermediateAggregationResult::Metric(IntermediateMetricResult::Sum(
IntermediateSum::default(),
)),
@@ -222,7 +236,10 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
IntermediateMetricResult::Percentiles(PercentilesCollector::default()),
),
TopHits(ref req) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req.clone())),
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
),
Cardinality(_) => IntermediateAggregationResult::Metric(
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
),
}
}
@@ -240,7 +257,7 @@ impl IntermediateAggregationResult {
pub(crate) fn into_final_result(
self,
req: &Aggregation,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<AggregationResult> {
let res = match self {
IntermediateAggregationResult::Bucket(bucket) => {
@@ -282,10 +299,14 @@ pub enum IntermediateMetricResult {
Min(IntermediateMin),
/// Intermediate stats result.
Stats(IntermediateStats),
/// Intermediate stats result.
ExtendedStats(IntermediateExtendedStats),
/// Intermediate sum result.
Sum(IntermediateSum),
/// Intermediate top_hits result
TopHits(TopHitsTopNComputer),
/// Intermediate cardinality result
Cardinality(CardinalityCollector),
}

impl IntermediateMetricResult {
@@ -306,6 +327,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::Stats(intermediate_stats) => {
MetricResult::Stats(intermediate_stats.finalize())
}
IntermediateMetricResult::ExtendedStats(intermediate_stats) => {
MetricResult::ExtendedStats(intermediate_stats.finalize())
}
IntermediateMetricResult::Sum(intermediate_sum) => {
MetricResult::Sum(intermediate_sum.finalize().into())
}
@@ -316,6 +340,9 @@ impl IntermediateMetricResult {
IntermediateMetricResult::TopHits(top_hits) => {
MetricResult::TopHits(top_hits.into_final_result())
}
IntermediateMetricResult::Cardinality(cardinality) => {
MetricResult::Cardinality(cardinality.finalize().into())
}
}
}

@@ -346,6 +373,12 @@ impl IntermediateMetricResult {
) => {
stats_left.merge_fruits(stats_right);
}
(
IntermediateMetricResult::ExtendedStats(extended_stats_left),
IntermediateMetricResult::ExtendedStats(extended_stats_right),
) => {
extended_stats_left.merge_fruits(extended_stats_right);
}
(IntermediateMetricResult::Sum(sum_left), IntermediateMetricResult::Sum(sum_right)) => {
sum_left.merge_fruits(sum_right);
}
@@ -358,6 +391,12 @@ impl IntermediateMetricResult {
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
left.merge_fruits(right)?;
}
(
IntermediateMetricResult::Cardinality(left),
IntermediateMetricResult::Cardinality(right),
) => {
left.merge_fruits(right)?;
}
_ => {
panic!("incompatible fruit types in tree or missing merge_fruits handler");
}
@@ -393,7 +432,7 @@ impl IntermediateBucketResult {
pub(crate) fn into_final_bucket_result(
self,
req: &Aggregation,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketResult> {
match self {
IntermediateBucketResult::Range(range_res) => {
@@ -557,7 +596,7 @@ impl IntermediateTermBucketResult {
self,
req: &TermsAggregation,
sub_aggregation_req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketResult> {
let req = TermsAggregationInternal::from_req(req);
let mut buckets: Vec<BucketEntry> = self
@@ -684,7 +723,7 @@ impl IntermediateHistogramBucketEntry {
pub(crate) fn into_final_bucket_entry(
self,
req: &Aggregations,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<BucketEntry> {
Ok(BucketEntry {
key_as_string: None,
@@ -719,7 +758,7 @@ impl IntermediateRangeBucketEntry {
req: &Aggregations,
_range_req: &RangeAggregation,
column_type: Option<ColumnType>,
limits: &AggregationLimits,
limits: &mut AggregationLimitsGuard,
) -> crate::Result<RangeBucketEntry> {
let mut range_bucket_entry = RangeBucketEntry {
key: self.key.into(),
@@ -821,7 +860,7 @@ mod tests {
}
}

fn get_intermediat_tree_with_ranges(
fn get_intermediate_tree_with_ranges(
data: &[(String, u64, String, u64)],
) -> IntermediateAggregationResults {
let mut map = HashMap::new();
@@ -857,18 +896,18 @@ mod tests {

#[test]
fn test_merge_fruits_tree_1() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_intermediat_tree_with_ranges(&[
let tree_right = get_intermediate_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("blue".to_string(), 25, "1900".to_string(), 50),
]);

tree_left.merge_fruits(tree_right).unwrap();

let tree_expected = get_intermediat_tree_with_ranges(&[
let tree_expected = get_intermediate_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 55, "1900".to_string(), 80),
]);
@@ -878,18 +917,18 @@ mod tests {

#[test]
fn test_merge_fruits_tree_2() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);
let tree_right = get_intermediat_tree_with_ranges(&[
let tree_right = get_intermediate_tree_with_ranges(&[
("red".to_string(), 60, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
]);

tree_left.merge_fruits(tree_right).unwrap();

let tree_expected = get_intermediat_tree_with_ranges(&[
let tree_expected = get_intermediate_tree_with_ranges(&[
("red".to_string(), 110, "1900".to_string(), 55),
("blue".to_string(), 30, "1900".to_string(), 30),
("green".to_string(), 25, "1900".to_string(), 50),
@@ -900,7 +939,7 @@ mod tests {

#[test]
fn test_merge_fruits_tree_empty() {
let mut tree_left = get_intermediat_tree_with_ranges(&[
let mut tree_left = get_intermediate_tree_with_ranges(&[
("red".to_string(), 50, "1900".to_string(), 25),
("blue".to_string(), 30, "1900".to_string(), 30),
]);

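The tests above exercise merging of per-segment intermediate results ("fruits"). A toy sketch of the underlying merge rule, summing doc counts per key, with a plain HashMap standing in for the intermediate bucket structures:

use std::collections::HashMap;

// Fold the right-hand partial result into the left-hand one, adding counts for shared keys
// and inserting keys that only appear on one side.
fn merge(left: &mut HashMap<String, u64>, right: HashMap<String, u64>) {
    for (key, count) in right {
        *left.entry(key).or_insert(0) += count;
    }
}

fn main() {
    let mut left = HashMap::from([("red".to_string(), 50), ("blue".to_string(), 30)]);
    let right = HashMap::from([("red".to_string(), 60), ("green".to_string(), 25)]);
    merge(&mut left, right);
    assert_eq!(left["red"], 110);
    assert_eq!(left["blue"], 30);
    assert_eq!(left["green"], 25);
}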
Some files were not shown because too many files have changed in this diff.