mirror of
https://github.com/quickwit-oss/tantivy.git
synced 2026-01-03 07:42:54 +00:00
Compare commits
38 Commits
agg_format
...
test_parse
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4a9262cd2c | ||
|
|
876a579e5d | ||
|
|
4c52499622 | ||
|
|
52d4e81e70 | ||
|
|
c71ea7b2ef | ||
|
|
c35a782747 | ||
|
|
c66af2c0a9 | ||
|
|
f9ac055847 | ||
|
|
21d057059e | ||
|
|
dca508b4ca | ||
|
|
aebae9965d | ||
|
|
e7e3e3f44c | ||
|
|
2f2db16ec1 | ||
|
|
d152e29687 | ||
|
|
285bcc25c9 | ||
|
|
7b65ad922d | ||
|
|
99be20cedd | ||
|
|
5f026901b8 | ||
|
|
6dfa2df06f | ||
|
|
c17e513377 | ||
|
|
2f5a269c70 | ||
|
|
50532260e3 | ||
|
|
8bd6eb06e6 | ||
|
|
55b0b52457 | ||
|
|
56fc56c5b9 | ||
|
|
85395d942a | ||
|
|
a206c3ccd3 | ||
|
|
dc5d31c116 | ||
|
|
95a4ddea3e | ||
|
|
ab5125d3dc | ||
|
|
9f81d59ecd | ||
|
|
c71ec8086d | ||
|
|
27be6aed91 | ||
|
|
3d1c4b313a | ||
|
|
0d4e319965 | ||
|
|
75dc3eb298 | ||
|
|
3f6d225086 | ||
|
|
d8843c608c |
@@ -46,7 +46,7 @@ The file of a segment has the format
|
|||||||
|
|
||||||
```segment-id . ext```
|
```segment-id . ext```
|
||||||
|
|
||||||
The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
|
The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.
|
||||||
|
|
||||||
A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.
|
A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.
|
||||||
|
|
||||||
@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.
|
|||||||
|
|
||||||
Tantivy's document follows a very strict schema, decided before building any index.
|
Tantivy's document follows a very strict schema, decided before building any index.
|
||||||
|
|
||||||
The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
|
The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
|
||||||
|
|
||||||
Depending on the type of the field, you can decide to
|
Depending on the type of the field, you can decide to
|
||||||
|
|
||||||
|
|||||||
96
CHANGELOG.md
96
CHANGELOG.md
@@ -1,3 +1,79 @@
|
|||||||
|
Tantivy 0.23 - Unreleased
|
||||||
|
================================
|
||||||
|
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75.
|
||||||
|
|
||||||
|
#### Bugfixes
|
||||||
|
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
|
||||||
|
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
|
||||||
|
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
|
||||||
|
- fix `OwnedBytes` debug panic [#2512](https://github.com/quickwit-oss/tantivy/pull/2512)(@b41sh)
|
||||||
|
|
||||||
|
#### Breaking API Changes
|
||||||
|
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)
|
||||||
|
|
||||||
|
#### Features/Improvements
|
||||||
|
- **Aggregation**
|
||||||
|
- Support for cardinality aggregation [#2337](https://github.com/quickwit-oss/tantivy/pull/2337) [#2446](https://github.com/quickwit-oss/tantivy/pull/2446) (@raphaelcoeffic @PSeitz)
|
||||||
|
- Support for extended stats aggregation [#2247](https://github.com/quickwit-oss/tantivy/pull/2247)(@giovannicuccu)
|
||||||
|
- Add Key::I64 and Key::U64 variants in aggregation to avoid f64 precision issues [#2468](https://github.com/quickwit-oss/tantivy/pull/2468)(@PSeitz)
|
||||||
|
- Faster term aggregation fetch terms [#2447](https://github.com/quickwit-oss/tantivy/pull/2447)(@PSeitz)
|
||||||
|
- Improve custom order deserialization [#2451](https://github.com/quickwit-oss/tantivy/pull/2451)(@PSeitz)
|
||||||
|
- Change AggregationLimits behavior [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
|
||||||
|
- lower contention on AggregationLimits [#2394](https://github.com/quickwit-oss/tantivy/pull/2394)(@PSeitz)
|
||||||
|
- fix postcard compatibility for top_hits, add postcard test [#2346](https://github.com/quickwit-oss/tantivy/pull/2346)(@PSeitz)
|
||||||
|
- reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
|
||||||
|
- check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
|
||||||
|
- Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
|
||||||
|
- **Range Queries**
|
||||||
|
- Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
|
||||||
|
- Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
|
||||||
|
- modify fastfield range query heuristic [#2375](https://github.com/quickwit-oss/tantivy/pull/2375)(@trinity-1686a)
|
||||||
|
- add FastFieldRangeQuery for explicit range queries on fast field (for `RangeQuery` it is autodetected) [#2477](https://github.com/quickwit-oss/tantivy/pull/2477)(@PSeitz)
|
||||||
|
|
||||||
|
- add format backwards-compatibility tests [#2485](https://github.com/quickwit-oss/tantivy/pull/2485)(@PSeitz)
|
||||||
|
- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
|
||||||
|
- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
|
||||||
|
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
|
||||||
|
- feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
|
||||||
|
|
||||||
|
- **RegexPhraseQuery**
|
||||||
|
`RegexPhraseQuery` supports phrase queries with regex. E.g. query "b.* b.* wolf" matches "big bad wolf". Slop is supported as well: "b.* wolf"~2 matches "big bad wolf" [#2516](https://github.com/quickwit-oss/tantivy/pull/2516)(@PSeitz)
|
||||||
|
|
||||||
|
- **Optional Index in Multivalue Columnar Index**
|
||||||
|
For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case).
|
||||||
|
This is alleviated by placing an optional index in the multivalued index to mark documents that have values.
|
||||||
|
This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
|
||||||
|
|
||||||
|
- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
|
||||||
|
|
||||||
|
- **Performace/Memory**
|
||||||
|
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
|
||||||
|
- Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
|
||||||
|
- Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
|
||||||
|
- Recycling buffer in PrefixPhraseScorer [#2443](https://github.com/quickwit-oss/tantivy/pull/2443)(@fulmicoton)
|
||||||
|
|
||||||
|
- **Json Type**
|
||||||
|
- JSON supports now all values on the root level. Previously an object was required. This enables support for flat mixed types. allow more JSON values, fix i64 special case [#2383](https://github.com/quickwit-oss/tantivy/pull/2383)(@PSeitz)
|
||||||
|
- add json path constructor to term [#2367](https://github.com/quickwit-oss/tantivy/pull/2367)(@PSeitz)
|
||||||
|
|
||||||
|
- **QueryParser**
|
||||||
|
- fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
|
||||||
|
- improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
|
||||||
|
- Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)
|
||||||
|
|
||||||
|
- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
|
||||||
|
- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
|
||||||
|
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
|
||||||
|
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
|
||||||
|
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
|
||||||
|
- use bingang for agg and stacker benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)[#2492](https://github.com/quickwit-oss/tantivy/pull/2492)(@PSeitz)
|
||||||
|
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
|
||||||
|
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
|
||||||
|
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
|
||||||
|
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
|
||||||
|
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
|
||||||
|
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
|
||||||
|
|
||||||
Tantivy 0.22
|
Tantivy 0.22
|
||||||
================================
|
================================
|
||||||
|
|
||||||
@@ -8,7 +84,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
|
|||||||
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
|
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
|
||||||
- Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
|
- Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
|
||||||
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
|
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
|
||||||
- Fix bug occuring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
|
- Fix bug occurring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
|
||||||
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
|
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
|
||||||
- Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
|
- Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
|
||||||
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)
|
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)
|
||||||
@@ -26,7 +102,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
|
|||||||
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
|
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
|
||||||
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
|
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
|
||||||
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
|
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
|
||||||
- Support ip adresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
|
- Support ip addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
|
||||||
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
|
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
|
||||||
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)
|
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)
|
||||||
|
|
||||||
@@ -116,7 +192,7 @@ Tantivy 0.20
|
|||||||
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
|
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
|
||||||
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
|
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
|
||||||
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
|
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
|
||||||
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
|
- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
|
||||||
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
|
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
|
||||||
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
|
- Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
|
||||||
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
|
- Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
|
||||||
@@ -163,13 +239,13 @@ Tantivy 0.20
|
|||||||
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
|
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
|
||||||
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
|
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
|
||||||
- sstable
|
- sstable
|
||||||
- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
|
- Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
|
||||||
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
|
- New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
|
||||||
- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
|
- Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
|
||||||
- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
|
- Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
|
||||||
- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
|
- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
|
||||||
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
|
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
|
||||||
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
|
- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
|
||||||
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
|
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
|
||||||
- Query Parser
|
- Query Parser
|
||||||
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
|
- Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
|
||||||
@@ -208,7 +284,7 @@ Tantivy 0.19
|
|||||||
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
|
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
|
||||||
- Aggregation
|
- Aggregation
|
||||||
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
|
- Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
|
||||||
- Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
|
- Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
|
||||||
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
|
- Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
|
||||||
- Faster indexing
|
- Faster indexing
|
||||||
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)
|
- [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)
|
||||||
@@ -651,7 +727,7 @@ Tantivy 0.4.0
|
|||||||
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
|
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
|
||||||
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
|
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
|
||||||
- Optimized skip in SegmentPostings (#130) (@lnicola)
|
- Optimized skip in SegmentPostings (#130) (@lnicola)
|
||||||
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
|
- Replacing rustc_serialize by serde. Kudos to benchmark@KodrAus and @lnicola
|
||||||
- Using error-chain (@KodrAus)
|
- Using error-chain (@KodrAus)
|
||||||
- QueryParser: (@fulmicoton)
|
- QueryParser: (@fulmicoton)
|
||||||
- Explicit error returned when searched for a term that is not indexed
|
- Explicit error returned when searched for a term that is not indexed
|
||||||
|
|||||||
10
CITATION.cff
Normal file
10
CITATION.cff
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
cff-version: 1.2.0
|
||||||
|
message: "If you use this software, please cite it as below."
|
||||||
|
authors:
|
||||||
|
- alias: Quickwit Inc.
|
||||||
|
website: "https://quickwit.io"
|
||||||
|
title: "tantivy"
|
||||||
|
version: 0.22.0
|
||||||
|
doi: 10.5281/zenodo.13942948
|
||||||
|
date-released: 2024-10-17
|
||||||
|
url: "https://github.com/quickwit-oss/tantivy"
|
||||||
12
Cargo.toml
12
Cargo.toml
@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
|
|||||||
readme = "README.md"
|
readme = "README.md"
|
||||||
keywords = ["search", "information", "retrieval"]
|
keywords = ["search", "information", "retrieval"]
|
||||||
edition = "2021"
|
edition = "2021"
|
||||||
rust-version = "1.66"
|
rust-version = "1.75"
|
||||||
exclude = ["benches/*.json", "benches/*.txt"]
|
exclude = ["benches/*.json", "benches/*.txt"]
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
@@ -29,7 +29,7 @@ tantivy-fst = "0.5"
|
|||||||
memmap2 = { version = "0.9.0", optional = true }
|
memmap2 = { version = "0.9.0", optional = true }
|
||||||
lz4_flex = { version = "0.11", default-features = false, optional = true }
|
lz4_flex = { version = "0.11", default-features = false, optional = true }
|
||||||
zstd = { version = "0.13", optional = true, default-features = false }
|
zstd = { version = "0.13", optional = true, default-features = false }
|
||||||
tempfile = { version = "3.3.0", optional = true }
|
tempfile = { version = "3.12.0", optional = true }
|
||||||
log = "0.4.16"
|
log = "0.4.16"
|
||||||
serde = { version = "1.0.136", features = ["derive"] }
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
serde_json = "1.0.79"
|
serde_json = "1.0.79"
|
||||||
@@ -43,11 +43,11 @@ bitpacking = { version = "0.9.2", default-features = false, features = [
|
|||||||
"bitpacker4x",
|
"bitpacker4x",
|
||||||
] }
|
] }
|
||||||
census = "0.4.2"
|
census = "0.4.2"
|
||||||
rustc-hash = "1.1.0"
|
rustc-hash = "2.0.0"
|
||||||
thiserror = "1.0.30"
|
thiserror = "2.0.1"
|
||||||
htmlescape = "0.3.1"
|
htmlescape = "0.3.1"
|
||||||
fail = { version = "0.5.0", optional = true }
|
fail = { version = "0.5.0", optional = true }
|
||||||
time = { version = "0.3.10", features = ["serde-well-known"] }
|
time = { version = "0.3.35", features = ["serde-well-known"] }
|
||||||
smallvec = "1.8.0"
|
smallvec = "1.8.0"
|
||||||
rayon = "1.5.2"
|
rayon = "1.5.2"
|
||||||
lru = "0.12.0"
|
lru = "0.12.0"
|
||||||
@@ -72,7 +72,7 @@ fnv = "1.0.7"
|
|||||||
winapi = "0.3.9"
|
winapi = "0.3.9"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
binggan = "0.8.0"
|
binggan = "0.14.0"
|
||||||
rand = "0.8.5"
|
rand = "0.8.5"
|
||||||
maplit = "1.0.2"
|
maplit = "1.0.2"
|
||||||
matches = "0.1.9"
|
matches = "0.1.9"
|
||||||
|
|||||||
2
TODO.txt
2
TODO.txt
@@ -1,7 +1,7 @@
|
|||||||
Make schema_builder API fluent.
|
Make schema_builder API fluent.
|
||||||
fix doc serialization and prevent compression problems
|
fix doc serialization and prevent compression problems
|
||||||
|
|
||||||
u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
|
u64 , etc. should return Result<Option> now that we support optional missing a column is really not an error
|
||||||
remove fastfield codecs
|
remove fastfield codecs
|
||||||
ditch the first_or_default trick. if it is still useful, improve its implementation.
|
ditch the first_or_default trick. if it is still useful, improve its implementation.
|
||||||
rename FastFieldReaders::open to load
|
rename FastFieldReaders::open to load
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
use binggan::plugins::PeakMemAllocPlugin;
|
||||||
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
|
||||||
use rand::prelude::SliceRandom;
|
use rand::prelude::SliceRandom;
|
||||||
use rand::rngs::StdRng;
|
use rand::rngs::StdRng;
|
||||||
@@ -17,7 +18,9 @@ pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
|
|||||||
/// runner.register("average_u64", move |index| average_u64(index));
|
/// runner.register("average_u64", move |index| average_u64(index));
|
||||||
macro_rules! register {
|
macro_rules! register {
|
||||||
($runner:expr, $func:ident) => {
|
($runner:expr, $func:ident) => {
|
||||||
$runner.register(stringify!($func), move |index| $func(index))
|
$runner.register(stringify!($func), move |index| {
|
||||||
|
$func(index);
|
||||||
|
})
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,7 +45,8 @@ fn main() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn bench_agg(mut group: InputGroup<Index>) {
|
fn bench_agg(mut group: InputGroup<Index>) {
|
||||||
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
|
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
|
||||||
|
|
||||||
register!(group, average_u64);
|
register!(group, average_u64);
|
||||||
register!(group, average_f64);
|
register!(group, average_f64);
|
||||||
register!(group, average_f64_u64);
|
register!(group, average_f64_u64);
|
||||||
|
|||||||
@@ -368,9 +368,9 @@ mod test {
|
|||||||
for start_idx in 0u32..32u32 {
|
for start_idx in 0u32..32u32 {
|
||||||
output.resize(len, 0);
|
output.resize(len, 0);
|
||||||
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
|
bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
|
||||||
for i in 0..len {
|
for (i, output_byte) in output.iter().enumerate() {
|
||||||
let expected = (start_idx + i as u32) & mask;
|
let expected = (start_idx + i as u32) & mask;
|
||||||
assert_eq!(output[i], expected);
|
assert_eq!(*output_byte, expected);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
|
|||||||
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
|
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
|
||||||
|
|
||||||
impl FilterImplPerInstructionSet {
|
impl FilterImplPerInstructionSet {
|
||||||
#[allow(unused_variables)]
|
|
||||||
#[inline]
|
#[inline]
|
||||||
|
#[allow(unused_variables)] // on non-x86_64, code is unused.
|
||||||
fn from(code: u8) -> FilterImplPerInstructionSet {
|
fn from(code: u8) -> FilterImplPerInstructionSet {
|
||||||
#[cfg(target_arch = "x86_64")]
|
#[cfg(target_arch = "x86_64")]
|
||||||
if code == FilterImplPerInstructionSet::AVX2 as u8 {
|
if code == FilterImplPerInstructionSet::AVX2 as u8 {
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
|
|||||||
proptest = "1"
|
proptest = "1"
|
||||||
more-asserts = "0.3.1"
|
more-asserts = "0.3.1"
|
||||||
rand = "0.8"
|
rand = "0.8"
|
||||||
binggan = "0.8.1"
|
binggan = "0.14.0"
|
||||||
|
|
||||||
[[bench]]
|
[[bench]]
|
||||||
name = "bench_merge"
|
name = "bench_merge"
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot
|
|||||||
# Columnar format
|
# Columnar format
|
||||||
|
|
||||||
This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
|
This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
|
||||||
The `(column_name, columne_type)` couple however uniquely identifies a column.
|
The `(column_name, column_type)` couple however uniquely identifies a column.
|
||||||
That couple is serialized as a column `column_key`. The format of that key is:
|
That couple is serialized as a column `column_key`. The format of that key is:
|
||||||
`[column_name][ZERO_BYTE][column_type_header: u8]`
|
`[column_name][ZERO_BYTE][column_type_header: u8]`
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
pub mod common;
|
pub mod common;
|
||||||
|
|
||||||
use binggan::{black_box, BenchRunner};
|
use binggan::BenchRunner;
|
||||||
use common::{generate_columnar_with_name, Card};
|
use common::{generate_columnar_with_name, Card};
|
||||||
use tantivy_columnar::*;
|
use tantivy_columnar::*;
|
||||||
|
|
||||||
@@ -29,7 +29,7 @@ fn main() {
|
|||||||
add_combo(Card::Multi, Card::Dense);
|
add_combo(Card::Multi, Card::Dense);
|
||||||
add_combo(Card::Multi, Card::Sparse);
|
add_combo(Card::Multi, Card::Sparse);
|
||||||
|
|
||||||
let runner: BenchRunner = BenchRunner::new();
|
let mut runner: BenchRunner = BenchRunner::new();
|
||||||
let mut group = runner.new_group();
|
let mut group = runner.new_group();
|
||||||
for (input_name, columnar_readers) in inputs.iter() {
|
for (input_name, columnar_readers) in inputs.iter() {
|
||||||
group.register_with_input(
|
group.register_with_input(
|
||||||
@@ -41,7 +41,7 @@ fn main() {
|
|||||||
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
|
||||||
|
|
||||||
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
|
||||||
black_box(out);
|
Some(out.len() as u64)
|
||||||
},
|
},
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -10,7 +10,7 @@
|
|||||||
|
|
||||||
# Perf and Size
|
# Perf and Size
|
||||||
* remove alloc in `ord_to_term`
|
* remove alloc in `ord_to_term`
|
||||||
+ multivaued range queries restrat frm the beginning all of the time.
|
+ multivaued range queries restart from the beginning all of the time.
|
||||||
* re-add ZSTD compression for dictionaries
|
* re-add ZSTD compression for dictionaries
|
||||||
no systematic monotonic mapping
|
no systematic monotonic mapping
|
||||||
consider removing multilinear
|
consider removing multilinear
|
||||||
@@ -30,7 +30,7 @@ investigate if should have better errors? io::Error is overused at the moment.
|
|||||||
rename rank/select in unit tests
|
rename rank/select in unit tests
|
||||||
Review the public API via cargo doc
|
Review the public API via cargo doc
|
||||||
go through TODOs
|
go through TODOs
|
||||||
remove all doc_id occurences -> row_id
|
remove all doc_id occurrences -> row_id
|
||||||
use the rank & select naming in unit tests branch.
|
use the rank & select naming in unit tests branch.
|
||||||
multi-linear -> blockwise
|
multi-linear -> blockwise
|
||||||
linear codec -> simply a multiplication for the index column
|
linear codec -> simply a multiplication for the index column
|
||||||
@@ -43,5 +43,5 @@ isolate u128_based and uniform naming
|
|||||||
# Other
|
# Other
|
||||||
fix enhance column-cli
|
fix enhance column-cli
|
||||||
|
|
||||||
# Santa claus
|
# Santa Claus
|
||||||
autodetect datetime ipaddr, plug customizable tokenizer.
|
autodetect datetime ipaddr, plug customizable tokenizer.
|
||||||
|
|||||||
@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
|
|||||||
&'a self,
|
&'a self,
|
||||||
docs: &'a [u32],
|
docs: &'a [u32],
|
||||||
accessor: &Column<T>,
|
accessor: &Column<T>,
|
||||||
) -> impl Iterator<Item = (DocId, T)> + '_ {
|
) -> impl Iterator<Item = (DocId, T)> + 'a {
|
||||||
if accessor.index.get_cardinality().is_full() {
|
if accessor.index.get_cardinality().is_full() {
|
||||||
docs.iter().cloned().zip(self.val_cache.iter().cloned())
|
docs.iter().cloned().zip(self.val_cache.iter().cloned())
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -173,7 +173,7 @@ mod tests {
|
|||||||
.into();
|
.into();
|
||||||
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
||||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||||
panic!("Excpected a multivalued index")
|
panic!("Expected a multivalued index")
|
||||||
};
|
};
|
||||||
let mut output = Vec::new();
|
let mut output = Vec::new();
|
||||||
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||||
@@ -211,7 +211,7 @@ mod tests {
|
|||||||
|
|
||||||
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
|
||||||
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
|
||||||
panic!("Excpected a multivalued index")
|
panic!("Expected a multivalued index")
|
||||||
};
|
};
|
||||||
let mut output = Vec::new();
|
let mut output = Vec::new();
|
||||||
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
|
||||||
|
|||||||
@@ -58,7 +58,7 @@ struct ShuffledIndex<'a> {
|
|||||||
merge_order: &'a ShuffleMergeOrder,
|
merge_order: &'a ShuffleMergeOrder,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable<u32> for ShuffledIndex<'a> {
|
impl Iterable<u32> for ShuffledIndex<'_> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
Box::new(
|
Box::new(
|
||||||
self.merge_order
|
self.merge_order
|
||||||
@@ -127,7 +127,7 @@ fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
|
impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
|
let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
|
||||||
Box::new(integrate_num_vals(num_vals_per_row))
|
Box::new(integrate_num_vals(num_vals_per_row))
|
||||||
|
|||||||
@@ -123,7 +123,7 @@ fn get_num_values_iterator<'a>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
|
impl Iterable<u32> for StackedStartOffsets<'_> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
|
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
|
||||||
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
|
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ pub enum ColumnIndex {
|
|||||||
Full,
|
Full,
|
||||||
Optional(OptionalIndex),
|
Optional(OptionalIndex),
|
||||||
/// In addition, at index num_rows, an extra value is added
|
/// In addition, at index num_rows, an extra value is added
|
||||||
/// containing the overal number of values.
|
/// containing the overall number of values.
|
||||||
Multivalued(MultiValueIndex),
|
Multivalued(MultiValueIndex),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -86,7 +86,7 @@ pub struct OptionalIndex {
|
|||||||
block_metas: Arc<[BlockMeta]>,
|
block_metas: Arc<[BlockMeta]>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable<u32> for &'a OptionalIndex {
|
impl Iterable<u32> for &OptionalIndex {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
|
||||||
Box::new(self.iter_rows())
|
Box::new(self.iter_rows())
|
||||||
}
|
}
|
||||||
@@ -123,7 +123,7 @@ enum BlockSelectCursor<'a> {
|
|||||||
Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
|
Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> BlockSelectCursor<'a> {
|
impl BlockSelectCursor<'_> {
|
||||||
fn select(&mut self, rank: u16) -> u16 {
|
fn select(&mut self, rank: u16) -> u16 {
|
||||||
match self {
|
match self {
|
||||||
BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
|
BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
|
||||||
@@ -141,7 +141,7 @@ pub struct OptionalIndexSelectCursor<'a> {
|
|||||||
num_null_rows_before_block: RowId,
|
num_null_rows_before_block: RowId,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> OptionalIndexSelectCursor<'a> {
|
impl OptionalIndexSelectCursor<'_> {
|
||||||
fn search_and_load_block(&mut self, rank: RowId) {
|
fn search_and_load_block(&mut self, rank: RowId) {
|
||||||
if rank < self.current_block_end_rank {
|
if rank < self.current_block_end_rank {
|
||||||
// we are already in the right block
|
// we are already in the right block
|
||||||
@@ -165,7 +165,7 @@ impl<'a> OptionalIndexSelectCursor<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
|
impl SelectCursor<RowId> for OptionalIndexSelectCursor<'_> {
|
||||||
fn select(&mut self, rank: RowId) -> RowId {
|
fn select(&mut self, rank: RowId) -> RowId {
|
||||||
self.search_and_load_block(rank);
|
self.search_and_load_block(rank);
|
||||||
let index_in_block = (rank - self.num_null_rows_before_block) as u16;
|
let index_in_block = (rank - self.num_null_rows_before_block) as u16;
|
||||||
@@ -174,7 +174,9 @@ impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl Set<RowId> for OptionalIndex {
|
impl Set<RowId> for OptionalIndex {
|
||||||
type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
|
type SelectCursor<'b>
|
||||||
|
= OptionalIndexSelectCursor<'b>
|
||||||
|
where Self: 'b;
|
||||||
// Check if value at position is not null.
|
// Check if value at position is not null.
|
||||||
#[inline]
|
#[inline]
|
||||||
fn contains(&self, row_id: RowId) -> bool {
|
fn contains(&self, row_id: RowId) -> bool {
|
||||||
@@ -503,7 +505,7 @@ fn deserialize_optional_index_block_metadatas(
|
|||||||
non_null_rows_before_block += num_non_null_rows;
|
non_null_rows_before_block += num_non_null_rows;
|
||||||
}
|
}
|
||||||
block_metas.resize(
|
block_metas.resize(
|
||||||
((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize,
|
num_rows.div_ceil(ELEMENTS_PER_BLOCK) as usize,
|
||||||
BlockMeta {
|
BlockMeta {
|
||||||
non_null_rows_before_block,
|
non_null_rows_before_block,
|
||||||
start_byte_offset,
|
start_byte_offset,
|
||||||
|
|||||||
@@ -23,7 +23,6 @@ fn set_bit_at(input: &mut u64, n: u16) {
|
|||||||
///
|
///
|
||||||
/// When translating a dense index to the original index, we can use the offset to find the correct
|
/// When translating a dense index to the original index, we can use the offset to find the correct
|
||||||
/// block. Direct computation is not possible, but we can employ a linear or binary search.
|
/// block. Direct computation is not possible, but we can employ a linear or binary search.
|
||||||
|
|
||||||
const ELEMENTS_PER_MINI_BLOCK: u16 = 64;
|
const ELEMENTS_PER_MINI_BLOCK: u16 = 64;
|
||||||
const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8;
|
const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8;
|
||||||
const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2;
|
const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2;
|
||||||
@@ -109,7 +108,7 @@ pub struct DenseBlockSelectCursor<'a> {
|
|||||||
dense_block: DenseBlock<'a>,
|
dense_block: DenseBlock<'a>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
|
impl SelectCursor<u16> for DenseBlockSelectCursor<'_> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn select(&mut self, rank: u16) -> u16 {
|
fn select(&mut self, rank: u16) -> u16 {
|
||||||
self.block_id = self
|
self.block_id = self
|
||||||
@@ -123,7 +122,9 @@ impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Set<u16> for DenseBlock<'a> {
|
impl<'a> Set<u16> for DenseBlock<'a> {
|
||||||
type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
|
type SelectCursor<'b>
|
||||||
|
= DenseBlockSelectCursor<'a>
|
||||||
|
where Self: 'b;
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn contains(&self, el: u16) -> bool {
|
fn contains(&self, el: u16) -> bool {
|
||||||
@@ -173,7 +174,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DenseBlock<'a> {
|
impl DenseBlock<'_> {
|
||||||
#[inline]
|
#[inline]
|
||||||
fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
|
fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
|
||||||
let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES;
|
let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES;
|
||||||
|
|||||||
@@ -31,8 +31,10 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Set<u16> for SparseBlock<'a> {
|
impl Set<u16> for SparseBlock<'_> {
|
||||||
type SelectCursor<'b> = Self where Self: 'b;
|
type SelectCursor<'b>
|
||||||
|
= Self
|
||||||
|
where Self: 'b;
|
||||||
|
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn contains(&self, el: u16) -> bool {
|
fn contains(&self, el: u16) -> bool {
|
||||||
@@ -67,7 +69,7 @@ fn get_u16(data: &[u8], byte_position: usize) -> u16 {
|
|||||||
u16::from_le_bytes(bytes)
|
u16::from_le_bytes(bytes)
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SparseBlock<'a> {
|
impl SparseBlock<'_> {
|
||||||
#[inline(always)]
|
#[inline(always)]
|
||||||
fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 {
|
fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 {
|
||||||
let start_offset: usize = idx as usize * 2;
|
let start_offset: usize = idx as usize * 2;
|
||||||
@@ -80,7 +82,7 @@ impl<'a> SparseBlock<'a> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
#[allow(clippy::comparison_chain)]
|
#[expect(clippy::comparison_chain)]
|
||||||
// Looks for the element in the block. Returns the positions if found.
|
// Looks for the element in the block. Returns the positions if found.
|
||||||
fn binary_search(&self, target: u16) -> Result<u16, u16> {
|
fn binary_search(&self, target: u16) -> Result<u16, u16> {
|
||||||
let data = &self.0;
|
let data = &self.0;
|
||||||
|
|||||||
@@ -110,8 +110,8 @@ fn test_null_index(data: &[bool]) {
|
|||||||
.map(|(pos, _val)| pos as u32)
|
.map(|(pos, _val)| pos as u32)
|
||||||
.collect();
|
.collect();
|
||||||
let mut select_iter = null_index.select_cursor();
|
let mut select_iter = null_index.select_cursor();
|
||||||
for i in 0..orig_idx_with_value.len() {
|
for (i, expected) in orig_idx_with_value.iter().enumerate() {
|
||||||
assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
|
assert_eq!(select_iter.select(i as u32), *expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
let step_size = (orig_idx_with_value.len() / 100).max(1);
|
let step_size = (orig_idx_with_value.len() / 100).max(1);
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ pub enum SerializableColumnIndex<'a> {
|
|||||||
Multivalued(SerializableMultivalueIndex<'a>),
|
Multivalued(SerializableMultivalueIndex<'a>),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> SerializableColumnIndex<'a> {
|
impl SerializableColumnIndex<'_> {
|
||||||
pub fn get_cardinality(&self) -> Cardinality {
|
pub fn get_cardinality(&self) -> Cardinality {
|
||||||
match self {
|
match self {
|
||||||
SerializableColumnIndex::Full => Cardinality::Full,
|
SerializableColumnIndex::Full => Cardinality::Full,
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
|
|||||||
pub(crate) merge_row_order: &'a MergeRowOrder,
|
pub(crate) merge_row_order: &'a MergeRowOrder,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> {
|
impl<T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'_, T> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
||||||
match self.merge_row_order {
|
match self.merge_row_order {
|
||||||
MergeRowOrder::Stack(_) => Box::new(
|
MergeRowOrder::Stack(_) => Box::new(
|
||||||
|
|||||||
@@ -184,7 +184,7 @@ impl CompactSpaceBuilder {
|
|||||||
|
|
||||||
let mut covered_space = Vec::with_capacity(self.blanks.len());
|
let mut covered_space = Vec::with_capacity(self.blanks.len());
|
||||||
|
|
||||||
// begining of the blanks
|
// beginning of the blanks
|
||||||
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
|
if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
|
||||||
if *first_blank_start != 0 {
|
if *first_blank_start != 0 {
|
||||||
covered_space.push(0..=first_blank_start - 1);
|
covered_space.push(0..=first_blank_start - 1);
|
||||||
|
|||||||
@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub(crate) mod tests {
|
||||||
use super::*;
|
use super::*;
|
||||||
use crate::column_values::u64_based::{
|
use crate::column_values::u64_based::{
|
||||||
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
|
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ impl BinarySerializable for Block {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn compute_num_blocks(num_vals: u32) -> u32 {
|
fn compute_num_blocks(num_vals: u32) -> u32 {
|
||||||
(num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE
|
num_vals.div_ceil(BLOCK_SIZE)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct BlockwiseLinearEstimator {
|
pub struct BlockwiseLinearEstimator {
|
||||||
|
|||||||
@@ -122,12 +122,11 @@ impl Line {
|
|||||||
line
|
line
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns a line that attemps to approximate a function
|
/// Returns a line that attempts to approximate a function
|
||||||
/// f: i in 0..[ys.num_vals()) -> ys[i].
|
/// f: i in 0..[ys.num_vals()) -> ys[i].
|
||||||
///
|
///
|
||||||
/// - The approximation is always lower than the actual value.
|
/// - The approximation is always lower than the actual value. Or more rigorously, formally
|
||||||
/// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
|
/// `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
|
||||||
/// for any i in [0..ys.len()).
|
|
||||||
/// - It computes without panicking for any value of it.
|
/// - It computes without panicking for any value of it.
|
||||||
///
|
///
|
||||||
/// This function is only invariable by translation if all of the
|
/// This function is only invariable by translation if all of the
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ struct RemappedTermOrdinalsValues<'a> {
|
|||||||
merge_row_order: &'a MergeRowOrder,
|
merge_row_order: &'a MergeRowOrder,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
|
impl Iterable for RemappedTermOrdinalsValues<'_> {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
||||||
match self.merge_row_order {
|
match self.merge_row_order {
|
||||||
MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
|
MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
|
||||||
@@ -50,7 +50,7 @@ impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> RemappedTermOrdinalsValues<'a> {
|
impl RemappedTermOrdinalsValues<'_> {
|
||||||
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
|
||||||
let iter = self
|
let iter = self
|
||||||
.bytes_columns
|
.bytes_columns
|
||||||
|
|||||||
@@ -25,7 +25,7 @@ use crate::{
|
|||||||
/// After merge, all columns belonging to the same category are coerced to
|
/// After merge, all columns belonging to the same category are coerced to
|
||||||
/// the same column type.
|
/// the same column type.
|
||||||
///
|
///
|
||||||
/// In practise, today, only Numerical colummns are coerced into one type today.
|
/// In practise, today, only Numerical columns are coerced into one type today.
|
||||||
///
|
///
|
||||||
/// See also [README.md].
|
/// See also [README.md].
|
||||||
///
|
///
|
||||||
@@ -63,11 +63,10 @@ impl From<ColumnType> for ColumnTypeCategory {
|
|||||||
/// `require_columns` makes it possible to ensure that some columns will be present in the
|
/// `require_columns` makes it possible to ensure that some columns will be present in the
|
||||||
/// resulting columnar. When a required column is a numerical column type, one of two things can
|
/// resulting columnar. When a required column is a numerical column type, one of two things can
|
||||||
/// happen:
|
/// happen:
|
||||||
/// - If the required column type is compatible with all of the input columnar, the resulsting
|
/// - If the required column type is compatible with all of the input columnar, the resulting merged
|
||||||
/// merged
|
/// columnar will simply coerce the input column and use the required column type.
|
||||||
/// columnar will simply coerce the input column and use the required column type.
|
/// - If the required column type is incompatible with one of the input columnar, the merged will
|
||||||
/// - If the required column type is incompatible with one of the input columnar, the merged
|
/// fail with an InvalidData error.
|
||||||
/// will fail with an InvalidData error.
|
|
||||||
///
|
///
|
||||||
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
|
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
|
||||||
/// `Columnar` table.
|
/// `Columnar` table.
|
||||||
|
|||||||
@@ -10,13 +10,13 @@ pub struct HeapItem<'a> {
|
|||||||
pub segment_ord: usize,
|
pub segment_ord: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> PartialEq for HeapItem<'a> {
|
impl PartialEq for HeapItem<'_> {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
self.segment_ord == other.segment_ord
|
self.segment_ord == other.segment_ord
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Eq for HeapItem<'a> {}
|
impl Eq for HeapItem<'_> {}
|
||||||
|
|
||||||
impl<'a> PartialOrd for HeapItem<'a> {
|
impl<'a> PartialOrd for HeapItem<'a> {
|
||||||
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {
|
||||||
@@ -35,8 +35,7 @@ impl<'a> Ord for HeapItem<'a> {
|
|||||||
///
|
///
|
||||||
/// The item yield is actually a pair with
|
/// The item yield is actually a pair with
|
||||||
/// - the term
|
/// - the term
|
||||||
/// - a slice with the ordinal of the segments containing
|
/// - a slice with the ordinal of the segments containing the terms.
|
||||||
/// the terms.
|
|
||||||
pub struct TermMerger<'a> {
|
pub struct TermMerger<'a> {
|
||||||
heap: BinaryHeap<HeapItem<'a>>,
|
heap: BinaryHeap<HeapItem<'a>>,
|
||||||
current_streamers: Vec<HeapItem<'a>>,
|
current_streamers: Vec<HeapItem<'a>>,
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ impl<V: SymbolValue> ColumnOperation<V> {
|
|||||||
minibuf
|
minibuf
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Deserialize a colummn operation.
|
/// Deserialize a column operation.
|
||||||
/// Returns None if the buffer is empty.
|
/// Returns None if the buffer is empty.
|
||||||
///
|
///
|
||||||
/// Panics if the payload is invalid:
|
/// Panics if the payload is invalid:
|
||||||
@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
|
|||||||
// In order to limit memory usage, and in order
|
// In order to limit memory usage, and in order
|
||||||
// to benefit from the stacker, we do this by serialization our data
|
// to benefit from the stacker, we do this by serialization our data
|
||||||
// as "Symbols".
|
// as "Symbols".
|
||||||
#[allow(clippy::from_over_into)]
|
|
||||||
pub(super) trait SymbolValue: Clone + Copy {
|
pub(super) trait SymbolValue: Clone + Copy {
|
||||||
// Serializes the symbol into the given buffer.
|
// Serializes the symbol into the given buffer.
|
||||||
// Returns the number of bytes written into the buffer.
|
// Returns the number of bytes written into the buffer.
|
||||||
|
|||||||
@@ -392,7 +392,7 @@ impl ColumnarWriter {
|
|||||||
|
|
||||||
// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
|
// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
|
||||||
// Column: [Column Index, Column Values, column index num bytes U32::LE]
|
// Column: [Column Index, Column Values, column index num bytes U32::LE]
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[expect(clippy::too_many_arguments)]
|
||||||
fn serialize_bytes_or_str_column(
|
fn serialize_bytes_or_str_column(
|
||||||
cardinality: Cardinality,
|
cardinality: Cardinality,
|
||||||
num_docs: RowId,
|
num_docs: RowId,
|
||||||
|
|||||||
@@ -67,7 +67,7 @@ pub struct ColumnSerializer<'a, W: io::Write> {
|
|||||||
start_offset: u64,
|
start_offset: u64,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
|
impl<W: io::Write> ColumnSerializer<'_, W> {
|
||||||
pub fn finalize(self) -> io::Result<()> {
|
pub fn finalize(self) -> io::Result<()> {
|
||||||
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
|
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
|
||||||
let byte_range = self.start_offset..end_offset;
|
let byte_range = self.start_offset..end_offset;
|
||||||
@@ -80,7 +80,7 @@ impl<'a, W: io::Write> ColumnSerializer<'a, W> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
|
impl<W: io::Write> io::Write for ColumnSerializer<'_, W> {
|
||||||
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
|
||||||
self.columnar_serializer.wrt.write(buf)
|
self.columnar_serializer.wrt.write(buf)
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ pub trait Iterable<T = u64> {
|
|||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, T: Copy> Iterable<T> for &'a [T] {
|
impl<T: Copy> Iterable<T> for &[T] {
|
||||||
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
|
||||||
Box::new(self.iter().copied())
|
Box::new(self.iter().copied())
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -17,6 +17,31 @@ impl NumericalValue {
|
|||||||
NumericalValue::F64(_) => NumericalType::F64,
|
NumericalValue::F64(_) => NumericalType::F64,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tries to normalize the numerical value in the following priorities:
|
||||||
|
/// i64, i64, f64
|
||||||
|
pub fn normalize(self) -> Self {
|
||||||
|
match self {
|
||||||
|
NumericalValue::U64(val) => {
|
||||||
|
if val <= i64::MAX as u64 {
|
||||||
|
NumericalValue::I64(val as i64)
|
||||||
|
} else {
|
||||||
|
NumericalValue::F64(val as f64)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
NumericalValue::I64(val) => NumericalValue::I64(val),
|
||||||
|
NumericalValue::F64(val) => {
|
||||||
|
let fract = val.fract();
|
||||||
|
if fract == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 {
|
||||||
|
NumericalValue::I64(val as i64)
|
||||||
|
} else if fract == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 {
|
||||||
|
NumericalValue::U64(val as u64)
|
||||||
|
} else {
|
||||||
|
NumericalValue::F64(val)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<u64> for NumericalValue {
|
impl From<u64> for NumericalValue {
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
|
|||||||
serde = { version = "1.0.136", features = ["derive"] }
|
serde = { version = "1.0.136", features = ["derive"] }
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
binggan = "0.8.1"
|
binggan = "0.14.0"
|
||||||
proptest = "1.0.0"
|
proptest = "1.0.0"
|
||||||
rand = "0.8.4"
|
rand = "0.8.4"
|
||||||
|
|
||||||
|
|||||||
130
common/src/bounds.rs
Normal file
130
common/src/bounds.rs
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
use std::io;
|
||||||
|
use std::ops::Bound;
|
||||||
|
|
||||||
|
#[derive(Clone, Debug)]
|
||||||
|
pub struct BoundsRange<T> {
|
||||||
|
pub lower_bound: Bound<T>,
|
||||||
|
pub upper_bound: Bound<T>,
|
||||||
|
}
|
||||||
|
impl<T> BoundsRange<T> {
|
||||||
|
pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
|
||||||
|
BoundsRange {
|
||||||
|
lower_bound,
|
||||||
|
upper_bound,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub fn is_unbounded(&self) -> bool {
|
||||||
|
matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
|
||||||
|
}
|
||||||
|
pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
|
||||||
|
BoundsRange {
|
||||||
|
lower_bound: map_bound(&self.lower_bound, &transform),
|
||||||
|
upper_bound: map_bound(&self.upper_bound, &transform),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn map_bound_res<TTo, Err>(
|
||||||
|
&self,
|
||||||
|
transform: impl Fn(&T) -> Result<TTo, Err>,
|
||||||
|
) -> Result<BoundsRange<TTo>, Err> {
|
||||||
|
Ok(BoundsRange {
|
||||||
|
lower_bound: map_bound_res(&self.lower_bound, &transform)?,
|
||||||
|
upper_bound: map_bound_res(&self.upper_bound, &transform)?,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn transform_inner<TTo>(
|
||||||
|
&self,
|
||||||
|
transform_lower: impl Fn(&T) -> TransformBound<TTo>,
|
||||||
|
transform_upper: impl Fn(&T) -> TransformBound<TTo>,
|
||||||
|
) -> BoundsRange<TTo> {
|
||||||
|
BoundsRange {
|
||||||
|
lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
|
||||||
|
upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the first set inner value
|
||||||
|
pub fn get_inner(&self) -> Option<&T> {
|
||||||
|
inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub enum TransformBound<T> {
|
||||||
|
/// Overwrite the bounds
|
||||||
|
NewBound(Bound<T>),
|
||||||
|
/// Use Existing bounds with new value
|
||||||
|
Existing(T),
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Takes a bound and transforms the inner value into a new bound via a closure.
|
||||||
|
/// The bound variant may change by the value returned value from the closure.
|
||||||
|
pub fn transform_bound_inner_res<TFrom, TTo>(
|
||||||
|
bound: &Bound<TFrom>,
|
||||||
|
transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
|
||||||
|
) -> io::Result<Bound<TTo>> {
|
||||||
|
use self::Bound::*;
|
||||||
|
Ok(match bound {
|
||||||
|
Excluded(ref from_val) => match transform(from_val)? {
|
||||||
|
TransformBound::NewBound(new_val) => new_val,
|
||||||
|
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||||
|
},
|
||||||
|
Included(ref from_val) => match transform(from_val)? {
|
||||||
|
TransformBound::NewBound(new_val) => new_val,
|
||||||
|
TransformBound::Existing(new_val) => Included(new_val),
|
||||||
|
},
|
||||||
|
Unbounded => Unbounded,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Takes a bound and transforms the inner value into a new bound via a closure.
|
||||||
|
/// The bound variant may change by the value returned value from the closure.
|
||||||
|
pub fn transform_bound_inner<TFrom, TTo>(
|
||||||
|
bound: &Bound<TFrom>,
|
||||||
|
transform: impl Fn(&TFrom) -> TransformBound<TTo>,
|
||||||
|
) -> Bound<TTo> {
|
||||||
|
use self::Bound::*;
|
||||||
|
match bound {
|
||||||
|
Excluded(ref from_val) => match transform(from_val) {
|
||||||
|
TransformBound::NewBound(new_val) => new_val,
|
||||||
|
TransformBound::Existing(new_val) => Excluded(new_val),
|
||||||
|
},
|
||||||
|
Included(ref from_val) => match transform(from_val) {
|
||||||
|
TransformBound::NewBound(new_val) => new_val,
|
||||||
|
TransformBound::Existing(new_val) => Included(new_val),
|
||||||
|
},
|
||||||
|
Unbounded => Unbounded,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Returns the inner value of a `Bound`
|
||||||
|
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
|
||||||
|
match val {
|
||||||
|
Bound::Included(term) | Bound::Excluded(term) => Some(term),
|
||||||
|
Bound::Unbounded => None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn map_bound<TFrom, TTo>(
|
||||||
|
bound: &Bound<TFrom>,
|
||||||
|
transform: impl Fn(&TFrom) -> TTo,
|
||||||
|
) -> Bound<TTo> {
|
||||||
|
use self::Bound::*;
|
||||||
|
match bound {
|
||||||
|
Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
|
||||||
|
Included(ref from_val) => Bound::Included(transform(from_val)),
|
||||||
|
Unbounded => Unbounded,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn map_bound_res<TFrom, TTo, Err>(
|
||||||
|
bound: &Bound<TFrom>,
|
||||||
|
transform: impl Fn(&TFrom) -> Result<TTo, Err>,
|
||||||
|
) -> Result<Bound<TTo>, Err> {
|
||||||
|
use self::Bound::*;
|
||||||
|
Ok(match bound {
|
||||||
|
Excluded(ref from_val) => Excluded(transform(from_val)?),
|
||||||
|
Included(ref from_val) => Included(transform(from_val)?),
|
||||||
|
Unbounded => Unbounded,
|
||||||
|
})
|
||||||
|
}
|
||||||
@@ -5,6 +5,7 @@ use std::ops::Deref;
|
|||||||
pub use byteorder::LittleEndian as Endianness;
|
pub use byteorder::LittleEndian as Endianness;
|
||||||
|
|
||||||
mod bitset;
|
mod bitset;
|
||||||
|
pub mod bounds;
|
||||||
mod byte_count;
|
mod byte_count;
|
||||||
mod datetime;
|
mod datetime;
|
||||||
pub mod file_slice;
|
pub mod file_slice;
|
||||||
@@ -129,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod test {
|
pub(crate) mod test {
|
||||||
|
|
||||||
use proptest::prelude::*;
|
use proptest::prelude::*;
|
||||||
|
|
||||||
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
|
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
|
||||||
|
|
||||||
fn test_i64_converter_helper(val: i64) {
|
fn test_i64_converter_helper(val: i64) {
|
||||||
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
|
||||||
@@ -143,12 +144,6 @@ pub mod test {
|
|||||||
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
|
|
||||||
let mut buffer = Vec::new();
|
|
||||||
O::default().serialize(&mut buffer).unwrap();
|
|
||||||
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
|
|
||||||
}
|
|
||||||
|
|
||||||
proptest! {
|
proptest! {
|
||||||
#[test]
|
#[test]
|
||||||
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {
|
||||||
|
|||||||
@@ -74,14 +74,14 @@ impl FixedSize for () {
|
|||||||
|
|
||||||
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
|
||||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||||
VInt(self.len() as u64).serialize(writer)?;
|
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
|
||||||
for it in self {
|
for it in self {
|
||||||
it.serialize(writer)?;
|
it.serialize(writer)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
|
||||||
let num_items = VInt::deserialize(reader)?.val();
|
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
|
||||||
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
|
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
|
||||||
for _ in 0..num_items {
|
for _ in 0..num_items {
|
||||||
let item = T::deserialize(reader)?;
|
let item = T::deserialize(reader)?;
|
||||||
@@ -236,12 +236,12 @@ impl FixedSize for bool {
|
|||||||
impl BinarySerializable for String {
|
impl BinarySerializable for String {
|
||||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||||
let data: &[u8] = self.as_bytes();
|
let data: &[u8] = self.as_bytes();
|
||||||
VInt(data.len() as u64).serialize(writer)?;
|
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
|
||||||
writer.write_all(data)
|
writer.write_all(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
|
||||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
|
||||||
let mut result = String::with_capacity(string_length);
|
let mut result = String::with_capacity(string_length);
|
||||||
reader
|
reader
|
||||||
.take(string_length as u64)
|
.take(string_length as u64)
|
||||||
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
|
|||||||
impl<'a> BinarySerializable for Cow<'a, str> {
|
impl<'a> BinarySerializable for Cow<'a, str> {
|
||||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||||
let data: &[u8] = self.as_bytes();
|
let data: &[u8] = self.as_bytes();
|
||||||
VInt(data.len() as u64).serialize(writer)?;
|
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
|
||||||
writer.write_all(data)
|
writer.write_all(data)
|
||||||
}
|
}
|
||||||
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
|
||||||
let string_length = VInt::deserialize(reader)?.val() as usize;
|
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
|
||||||
let mut result = String::with_capacity(string_length);
|
let mut result = String::with_capacity(string_length);
|
||||||
reader
|
reader
|
||||||
.take(string_length as u64)
|
.take(string_length as u64)
|
||||||
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
|
|||||||
|
|
||||||
impl<'a> BinarySerializable for Cow<'a, [u8]> {
|
impl<'a> BinarySerializable for Cow<'a, [u8]> {
|
||||||
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
|
||||||
VInt(self.len() as u64).serialize(writer)?;
|
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
|
||||||
for it in self.iter() {
|
for it in self.iter() {
|
||||||
it.serialize(writer)?;
|
BinarySerializable::serialize(it, writer)?;
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
|
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
|
||||||
let num_items = VInt::deserialize(reader)?.val();
|
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
|
||||||
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
|
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
|
||||||
for _ in 0..num_items {
|
for _ in 0..num_items {
|
||||||
let item = u8::deserialize(reader)?;
|
let item = <u8 as BinarySerializable>::deserialize(reader)?;
|
||||||
items.push(item);
|
items.push(item);
|
||||||
}
|
}
|
||||||
Ok(Cow::Owned(items))
|
Ok(Cow::Owned(items))
|
||||||
|
|||||||
@@ -87,7 +87,7 @@ impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
|
impl TerminatingWrite for &mut Vec<u8> {
|
||||||
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
|
||||||
self.flush()
|
self.flush()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
> Tantivy is a **search** engine **library** for Rust.
|
> Tantivy is a **search** engine **library** for Rust.
|
||||||
|
|
||||||
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
|
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
|
||||||
they both have the same scope and targeted use cases.
|
they both have the same scope and targeted use cases.
|
||||||
|
|
||||||
If you are not familiar with Lucene, let's break down our little tagline.
|
If you are not familiar with Lucene, let's break down our little tagline.
|
||||||
@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
|
|||||||
experience. But keep in mind this is just a toolbox.
|
experience. But keep in mind this is just a toolbox.
|
||||||
Which bring us to the second keyword...
|
Which bring us to the second keyword...
|
||||||
|
|
||||||
- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
|
- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.
|
||||||
|
|
||||||
Sometimes a functionality will not be available in tantivy because it is too
|
Sometimes a functionality will not be available in tantivy because it is too
|
||||||
specific to your use case. By design, tantivy should make it possible to extend
|
specific to your use case. By design, tantivy should make it possible to extend
|
||||||
@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
|
|||||||
index from a different format.
|
index from a different format.
|
||||||
|
|
||||||
Tantivy exposes a lot of low level API to do all of these things.
|
Tantivy exposes a lot of low level API to do all of these things.
|
||||||
|
|
||||||
|
|||||||
@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
|
|||||||
While this design has some downsides, this greatly simplifies the source code of
|
While this design has some downsides, this greatly simplifies the source code of
|
||||||
tantivy. Caching is also entirely delegated to the OS.
|
tantivy. Caching is also entirely delegated to the OS.
|
||||||
|
|
||||||
`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
|
||||||
|
|
||||||
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
|
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.
|
||||||
|
|
||||||
|
|||||||
@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
|
|||||||
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
|
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
|
||||||
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
|
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
|
||||||
|
|
||||||
Note: Tantivy 0.16 does not do this optimization yet.
|
Note: tantivy 0.16 does not do this optimization yet.
|
||||||
|
|
||||||
### Pruning
|
### Pruning
|
||||||
|
|
||||||
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
|
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
|
||||||
|
|
||||||
Note: Tantivy 0.16 does not do this optimization yet.
|
Note: tantivy 0.16 does not do this optimization yet.
|
||||||
|
|
||||||
### Other?
|
### Other?
|
||||||
|
|
||||||
@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i
|
|||||||
|
|
||||||
## Usage
|
## Usage
|
||||||
|
|
||||||
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
|
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.
|
||||||
|
|
||||||
```rust
|
```rust
|
||||||
let settings = IndexSettings {
|
let settings = IndexSettings {
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
|
|||||||
- `value`: The value representation is just the regular Value representation.
|
- `value`: The value representation is just the regular Value representation.
|
||||||
|
|
||||||
This representation is designed to align the natural sort of Terms with the lexicographical sort
|
This representation is designed to align the natural sort of Terms with the lexicographical sort
|
||||||
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
|
of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
|
||||||
|
|
||||||
In the example above, the terms will be sorted as
|
In the example above, the terms will be sorted as
|
||||||
|
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ fn main() -> tantivy::Result<()> {
|
|||||||
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
|
let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
|
||||||
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
|
index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
|
||||||
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
|
index_writer.add_document(doc!(title => "Of Mice and Men"))?;
|
||||||
index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
index_writer.add_document(doc!(title => "The modern Prometheus"))?;
|
||||||
index_writer.commit()?;
|
index_writer.commit()?;
|
||||||
|
|
||||||
let reader = index.reader()?;
|
let reader = index.reader()?;
|
||||||
|
|||||||
@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
|
|||||||
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
|
||||||
// We truncate the bytes in order to make sure the debug string
|
// We truncate the bytes in order to make sure the debug string
|
||||||
// is not too long.
|
// is not too long.
|
||||||
let bytes_truncated: &[u8] = if self.len() > 8 {
|
let bytes_truncated: &[u8] = if self.len() > 10 {
|
||||||
&self.as_slice()[..10]
|
&self.as_slice()[..10]
|
||||||
} else {
|
} else {
|
||||||
self.as_slice()
|
self.as_slice()
|
||||||
@@ -252,6 +252,11 @@ mod tests {
|
|||||||
format!("{short_bytes:?}"),
|
format!("{short_bytes:?}"),
|
||||||
"OwnedBytes([97, 98, 99, 100], len=4)"
|
"OwnedBytes([97, 98, 99, 100], len=4)"
|
||||||
);
|
);
|
||||||
|
let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
|
||||||
|
assert_eq!(
|
||||||
|
format!("{medium_bytes:?}"),
|
||||||
|
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
|
||||||
|
);
|
||||||
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
|
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
format!("{long_bytes:?}"),
|
format!("{long_bytes:?}"),
|
||||||
|
|||||||
@@ -109,6 +109,8 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
|
|||||||
move |input: I| match f.parse(input) {
|
move |input: I| match f.parse(input) {
|
||||||
Ok((input, (output, _err))) => Ok((input, output)),
|
Ok((input, (output, _err))) => Ok((input, output)),
|
||||||
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
|
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
|
||||||
|
// old versions don't understand this is uninhabited and need the empty match to help,
|
||||||
|
// newer versions warn because this arm is unreachable (which it is indeed).
|
||||||
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
|
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,12 +6,12 @@ use std::fmt::Write;
|
|||||||
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
|
||||||
pub enum Occur {
|
pub enum Occur {
|
||||||
/// For a given document to be considered for scoring,
|
/// For a given document to be considered for scoring,
|
||||||
/// at least one of the terms with the Should or the Must
|
/// at least one of the queries with the Should or the Must
|
||||||
/// Occur constraint must be within the document.
|
/// Occur constraint must be within the document.
|
||||||
Should,
|
Should,
|
||||||
/// Document without the term are excluded from the search.
|
/// Document without the queries are excluded from the search.
|
||||||
Must,
|
Must,
|
||||||
/// Document that contain the term are excluded from the
|
/// Document that contain the query are excluded from the
|
||||||
/// search.
|
/// search.
|
||||||
MustNot,
|
MustNot,
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -6,7 +6,7 @@ use nom::bytes::complete::tag;
|
|||||||
use nom::character::complete::{
|
use nom::character::complete::{
|
||||||
anychar, char, digit1, multispace0, multispace1, none_of, one_of, satisfy, u32,
|
anychar, char, digit1, multispace0, multispace1, none_of, one_of, satisfy, u32,
|
||||||
};
|
};
|
||||||
use nom::combinator::{eof, map, map_res, opt, peek, recognize, value, verify};
|
use nom::combinator::{eof, map, map_res, not, opt, peek, recognize, value, verify};
|
||||||
use nom::error::{Error, ErrorKind};
|
use nom::error::{Error, ErrorKind};
|
||||||
use nom::multi::{many0, many1, separated_list0};
|
use nom::multi::{many0, many1, separated_list0};
|
||||||
use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple};
|
use nom::sequence::{delimited, preceded, separated_pair, terminated, tuple};
|
||||||
@@ -20,7 +20,7 @@ use crate::Occur;
|
|||||||
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
// Note: '-' char is only forbidden at the beginning of a field name, would be clearer to add it to
|
||||||
// special characters.
|
// special characters.
|
||||||
const SPECIAL_CHARS: &[char] = &[
|
const SPECIAL_CHARS: &[char] = &[
|
||||||
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', '*', ' ',
|
'+', '^', '`', ':', '{', '}', '"', '\'', '[', ']', '(', ')', '!', '\\', ' ',
|
||||||
];
|
];
|
||||||
|
|
||||||
/// consume a field name followed by colon. Return the field name with escape sequence
|
/// consume a field name followed by colon. Return the field name with escape sequence
|
||||||
@@ -679,7 +679,10 @@ fn negate(expr: UserInputAst) -> UserInputAst {
|
|||||||
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
fn leaf(inp: &str) -> IResult<&str, UserInputAst> {
|
||||||
alt((
|
alt((
|
||||||
delimited(char('('), ast, char(')')),
|
delimited(char('('), ast, char(')')),
|
||||||
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
|
preceded(
|
||||||
|
peek(not(tag("*:"))),
|
||||||
|
map(char('*'), |_| UserInputAst::from(UserInputLeaf::All)),
|
||||||
|
),
|
||||||
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate),
|
map(preceded(tuple((tag("NOT"), multispace1)), leaf), negate),
|
||||||
literal,
|
literal,
|
||||||
))(inp)
|
))(inp)
|
||||||
@@ -700,7 +703,13 @@ fn leaf_infallible(inp: &str) -> JResult<&str, Option<UserInputAst>> {
|
|||||||
),
|
),
|
||||||
),
|
),
|
||||||
(
|
(
|
||||||
value((), char('*')),
|
value(
|
||||||
|
(),
|
||||||
|
preceded(
|
||||||
|
peek(not(tag("*:"))), // Fail if `*:` is detected
|
||||||
|
char('*'), // Match standalone `*`
|
||||||
|
),
|
||||||
|
),
|
||||||
map(nothing, |_| {
|
map(nothing, |_| {
|
||||||
(Some(UserInputAst::from(UserInputLeaf::All)), Vec::new())
|
(Some(UserInputAst::from(UserInputLeaf::All)), Vec::new())
|
||||||
}),
|
}),
|
||||||
@@ -767,7 +776,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
|
|||||||
tuple((fallible(occur_symbol), boosted_leaf))(inp)
|
tuple((fallible(occur_symbol), boosted_leaf))(inp)
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::type_complexity)]
|
#[expect(clippy::type_complexity)]
|
||||||
fn operand_occur_leaf_infallible(
|
fn operand_occur_leaf_infallible(
|
||||||
inp: &str,
|
inp: &str,
|
||||||
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
|
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
|
||||||
@@ -833,7 +842,7 @@ fn aggregate_infallible_expressions(
|
|||||||
if early_operand {
|
if early_operand {
|
||||||
err.push(LenientErrorInternal {
|
err.push(LenientErrorInternal {
|
||||||
pos: 0,
|
pos: 0,
|
||||||
message: "Found unexpeted boolean operator before term".to_string(),
|
message: "Found unexpected boolean operator before term".to_string(),
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -856,7 +865,7 @@ fn aggregate_infallible_expressions(
|
|||||||
_ => Some(Occur::Should),
|
_ => Some(Occur::Should),
|
||||||
};
|
};
|
||||||
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
||||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||||
clauses.push(vec![(
|
clauses.push(vec![(
|
||||||
Some(Occur::Should),
|
Some(Occur::Should),
|
||||||
ast.clone().unary(Occur::MustNot),
|
ast.clone().unary(Occur::MustNot),
|
||||||
@@ -872,7 +881,7 @@ fn aggregate_infallible_expressions(
|
|||||||
None => None,
|
None => None,
|
||||||
};
|
};
|
||||||
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
|
||||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||||
clauses.push(vec![(
|
clauses.push(vec![(
|
||||||
Some(Occur::Should),
|
Some(Occur::Should),
|
||||||
ast.clone().unary(Occur::MustNot),
|
ast.clone().unary(Occur::MustNot),
|
||||||
@@ -897,7 +906,7 @@ fn aggregate_infallible_expressions(
|
|||||||
}
|
}
|
||||||
Some(BinaryOperand::Or) => {
|
Some(BinaryOperand::Or) => {
|
||||||
if last_occur == Some(Occur::MustNot) {
|
if last_occur == Some(Occur::MustNot) {
|
||||||
// if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
|
// if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
|
||||||
clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
|
clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
|
||||||
} else {
|
} else {
|
||||||
clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
|
clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
|
||||||
@@ -1057,7 +1066,7 @@ mod test {
|
|||||||
valid_parse("1", 1.0, "");
|
valid_parse("1", 1.0, "");
|
||||||
valid_parse("0.234234 aaa", 0.234234f64, " aaa");
|
valid_parse("0.234234 aaa", 0.234234f64, " aaa");
|
||||||
error_parse(".3332");
|
error_parse(".3332");
|
||||||
// TODO trinity-1686a: I disagree that it should fail, I think it should succeeed,
|
// TODO trinity-1686a: I disagree that it should fail, I think it should succeed,
|
||||||
// consuming only "1", and leave "." for the next thing (which will likely fail then)
|
// consuming only "1", and leave "." for the next thing (which will likely fail then)
|
||||||
// error_parse("1.");
|
// error_parse("1.");
|
||||||
error_parse("-1.");
|
error_parse("-1.");
|
||||||
@@ -1222,6 +1231,7 @@ mod test {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_field_name() {
|
fn test_field_name() {
|
||||||
|
assert_eq!(super::field_name("*:a"), Ok(("a", "*".to_string())));
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
super::field_name(".my.field.name:a"),
|
super::field_name(".my.field.name:a"),
|
||||||
Ok(("a", ".my.field.name".to_string()))
|
Ok(("a", ".my.field.name".to_string()))
|
||||||
@@ -1467,7 +1477,7 @@ mod test {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse_query_to_triming_spaces() {
|
fn test_parse_query_to_trimming_spaces() {
|
||||||
test_parse_query_to_ast_helper(" abc", "abc");
|
test_parse_query_to_ast_helper(" abc", "abc");
|
||||||
test_parse_query_to_ast_helper("abc ", "abc");
|
test_parse_query_to_ast_helper("abc ", "abc");
|
||||||
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
|
test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
|
||||||
@@ -1497,6 +1507,11 @@ mod test {
|
|||||||
test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#);
|
test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn field_re_specification() {
|
||||||
|
test_parse_query_to_ast_helper(r#"field:(abc AND b:cde)"#, r#"(+"field":abc +"b":cde)"#);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_parse_query_single_term() {
|
fn test_parse_query_single_term() {
|
||||||
test_parse_query_to_ast_helper("abc", "abc");
|
test_parse_query_to_ast_helper("abc", "abc");
|
||||||
@@ -1522,6 +1537,11 @@ mod test {
|
|||||||
test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto");
|
test_parse_query_to_ast_helper("abc:toto", "\"abc\":toto");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn all_field_star() {
|
||||||
|
test_parse_query_to_ast_helper("*:toto", "\"*\":toto");
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_phrase_with_field() {
|
fn test_phrase_with_field() {
|
||||||
test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\"");
|
test_parse_query_to_ast_helper("abc:\"happy tax payer\"", "\"abc\":\"happy tax payer\"");
|
||||||
|
|||||||
@@ -267,7 +267,7 @@ impl fmt::Debug for UserInputAst {
|
|||||||
match *self {
|
match *self {
|
||||||
UserInputAst::Clause(ref subqueries) => {
|
UserInputAst::Clause(ref subqueries) => {
|
||||||
if subqueries.is_empty() {
|
if subqueries.is_empty() {
|
||||||
// TODO this will break ast reserialization, is writing "( )" enought?
|
// TODO this will break ast reserialization, is writing "( )" enough?
|
||||||
write!(formatter, "<emptyclause>")?;
|
write!(formatter, "<emptyclause>")?;
|
||||||
} else {
|
} else {
|
||||||
write!(formatter, "(")?;
|
write!(formatter, "(")?;
|
||||||
|
|||||||
@@ -21,7 +21,10 @@ impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
|
|||||||
|
|
||||||
/// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
|
/// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
|
||||||
/// (500MB). The limit is shared by all SegmentCollectors
|
/// (500MB). The limit is shared by all SegmentCollectors
|
||||||
pub struct AggregationLimits {
|
///
|
||||||
|
/// The memory limit is also a guard, which tracks how much it allocated and releases it's memory
|
||||||
|
/// on the shared counter. Cloning will create a new guard.
|
||||||
|
pub struct AggregationLimitsGuard {
|
||||||
/// The counter which is shared between the aggregations for one request.
|
/// The counter which is shared between the aggregations for one request.
|
||||||
memory_consumption: Arc<AtomicU64>,
|
memory_consumption: Arc<AtomicU64>,
|
||||||
/// The memory_limit in bytes
|
/// The memory_limit in bytes
|
||||||
@@ -29,28 +32,41 @@ pub struct AggregationLimits {
|
|||||||
/// The maximum number of buckets _returned_
|
/// The maximum number of buckets _returned_
|
||||||
/// This is not counting intermediate buckets.
|
/// This is not counting intermediate buckets.
|
||||||
bucket_limit: u32,
|
bucket_limit: u32,
|
||||||
|
/// Allocated memory with this guard.
|
||||||
|
allocated_with_the_guard: u64,
|
||||||
}
|
}
|
||||||
impl Clone for AggregationLimits {
|
impl Clone for AggregationLimitsGuard {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
Self {
|
Self {
|
||||||
memory_consumption: Arc::clone(&self.memory_consumption),
|
memory_consumption: Arc::clone(&self.memory_consumption),
|
||||||
memory_limit: self.memory_limit,
|
memory_limit: self.memory_limit,
|
||||||
bucket_limit: self.bucket_limit,
|
bucket_limit: self.bucket_limit,
|
||||||
|
allocated_with_the_guard: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for AggregationLimits {
|
impl Drop for AggregationLimitsGuard {
|
||||||
|
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
|
||||||
|
/// This is used to clear the segment specific memory consumption all at once.
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.memory_consumption
|
||||||
|
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Default for AggregationLimitsGuard {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
memory_consumption: Default::default(),
|
memory_consumption: Default::default(),
|
||||||
memory_limit: DEFAULT_MEMORY_LIMIT.into(),
|
memory_limit: DEFAULT_MEMORY_LIMIT.into(),
|
||||||
bucket_limit: DEFAULT_BUCKET_LIMIT,
|
bucket_limit: DEFAULT_BUCKET_LIMIT,
|
||||||
|
allocated_with_the_guard: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AggregationLimits {
|
impl AggregationLimitsGuard {
|
||||||
/// *memory_limit*
|
/// *memory_limit*
|
||||||
/// memory_limit is defined in bytes.
|
/// memory_limit is defined in bytes.
|
||||||
/// Aggregation fails when the estimated memory consumption of the aggregation is higher than
|
/// Aggregation fails when the estimated memory consumption of the aggregation is higher than
|
||||||
@@ -67,24 +83,15 @@ impl AggregationLimits {
|
|||||||
memory_consumption: Default::default(),
|
memory_consumption: Default::default(),
|
||||||
memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
|
memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
|
||||||
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
|
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Create a new ResourceLimitGuard, that will release the memory when dropped.
|
|
||||||
pub fn new_guard(&self) -> ResourceLimitGuard {
|
|
||||||
ResourceLimitGuard {
|
|
||||||
// The counter which is shared between the aggregations for one request.
|
|
||||||
memory_consumption: Arc::clone(&self.memory_consumption),
|
|
||||||
// The memory_limit in bytes
|
|
||||||
memory_limit: self.memory_limit,
|
|
||||||
allocated_with_the_guard: 0,
|
allocated_with_the_guard: 0,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
|
pub(crate) fn add_memory_consumed(&mut self, add_num_bytes: u64) -> crate::Result<()> {
|
||||||
let prev_value = self
|
let prev_value = self
|
||||||
.memory_consumption
|
.memory_consumption
|
||||||
.fetch_add(add_num_bytes, Ordering::Relaxed);
|
.fetch_add(add_num_bytes, Ordering::Relaxed);
|
||||||
|
self.allocated_with_the_guard += add_num_bytes;
|
||||||
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
|
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
@@ -109,34 +116,6 @@ fn validate_memory_consumption(
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
pub struct ResourceLimitGuard {
|
|
||||||
/// The counter which is shared between the aggregations for one request.
|
|
||||||
memory_consumption: Arc<AtomicU64>,
|
|
||||||
/// The memory_limit in bytes
|
|
||||||
memory_limit: ByteCount,
|
|
||||||
/// Allocated memory with this guard.
|
|
||||||
allocated_with_the_guard: u64,
|
|
||||||
}
|
|
||||||
|
|
||||||
impl ResourceLimitGuard {
|
|
||||||
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
|
|
||||||
let prev_value = self
|
|
||||||
.memory_consumption
|
|
||||||
.fetch_add(add_num_bytes, Ordering::Relaxed);
|
|
||||||
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
impl Drop for ResourceLimitGuard {
|
|
||||||
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
|
|
||||||
/// This is used to clear the segment specific memory consumption all at once.
|
|
||||||
fn drop(&mut self) {
|
|
||||||
self.memory_consumption
|
|
||||||
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use crate::aggregation::tests::exec_request_with_query;
|
use crate::aggregation::tests::exec_request_with_query;
|
||||||
|
|||||||
@@ -5,7 +5,6 @@ use std::io;
|
|||||||
|
|
||||||
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
||||||
|
|
||||||
use super::agg_limits::ResourceLimitGuard;
|
|
||||||
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||||
use super::bucket::{
|
use super::bucket::{
|
||||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||||
@@ -14,7 +13,7 @@ use super::metric::{
|
|||||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||||
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||||
};
|
};
|
||||||
use super::segment_agg_result::AggregationLimits;
|
use super::segment_agg_result::AggregationLimitsGuard;
|
||||||
use super::VecWithNames;
|
use super::VecWithNames;
|
||||||
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
||||||
use crate::index::SegmentReader;
|
use crate::index::SegmentReader;
|
||||||
@@ -46,7 +45,7 @@ pub struct AggregationWithAccessor {
|
|||||||
pub(crate) str_dict_column: Option<StrColumn>,
|
pub(crate) str_dict_column: Option<StrColumn>,
|
||||||
pub(crate) field_type: ColumnType,
|
pub(crate) field_type: ColumnType,
|
||||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||||
pub(crate) limits: ResourceLimitGuard,
|
pub(crate) limits: AggregationLimitsGuard,
|
||||||
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// Used for missing term aggregation, which checks all columns for existence.
|
/// Used for missing term aggregation, which checks all columns for existence.
|
||||||
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
||||||
@@ -69,7 +68,7 @@ impl AggregationWithAccessor {
|
|||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
||||||
let mut agg = agg.clone();
|
let mut agg = agg.clone();
|
||||||
|
|
||||||
@@ -91,7 +90,7 @@ impl AggregationWithAccessor {
|
|||||||
&limits,
|
&limits,
|
||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
limits: limits.new_guard(),
|
limits: limits.clone(),
|
||||||
missing_value_for_accessor: None,
|
missing_value_for_accessor: None,
|
||||||
str_dict_column: None,
|
str_dict_column: None,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
@@ -106,6 +105,7 @@ impl AggregationWithAccessor {
|
|||||||
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
||||||
-> crate::Result<()> {
|
-> crate::Result<()> {
|
||||||
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
||||||
|
let limits = limits.clone();
|
||||||
let res = AggregationWithAccessor {
|
let res = AggregationWithAccessor {
|
||||||
segment_ordinal,
|
segment_ordinal,
|
||||||
// TODO: We should do away with the `accessor` field altogether
|
// TODO: We should do away with the `accessor` field altogether
|
||||||
@@ -120,7 +120,7 @@ impl AggregationWithAccessor {
|
|||||||
&limits,
|
&limits,
|
||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
limits: limits.new_guard(),
|
limits,
|
||||||
missing_value_for_accessor: None,
|
missing_value_for_accessor: None,
|
||||||
str_dict_column: None,
|
str_dict_column: None,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
@@ -186,6 +186,8 @@ impl AggregationWithAccessor {
|
|||||||
.map(|missing| match missing {
|
.map(|missing| match missing {
|
||||||
Key::Str(_) => ColumnType::Str,
|
Key::Str(_) => ColumnType::Str,
|
||||||
Key::F64(_) => ColumnType::F64,
|
Key::F64(_) => ColumnType::F64,
|
||||||
|
Key::I64(_) => ColumnType::I64,
|
||||||
|
Key::U64(_) => ColumnType::U64,
|
||||||
})
|
})
|
||||||
.unwrap_or(ColumnType::U64);
|
.unwrap_or(ColumnType::U64);
|
||||||
let column_and_types = get_all_ff_reader_or_empty(
|
let column_and_types = get_all_ff_reader_or_empty(
|
||||||
@@ -232,14 +234,18 @@ impl AggregationWithAccessor {
|
|||||||
missing.clone()
|
missing.clone()
|
||||||
};
|
};
|
||||||
|
|
||||||
let missing_value_for_accessor = if let Some(missing) =
|
let missing_value_for_accessor =
|
||||||
missing_value_term_agg.as_ref()
|
if let Some(missing) = missing_value_term_agg.as_ref() {
|
||||||
{
|
get_missing_val_as_u64_lenient(
|
||||||
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
|
column_type,
|
||||||
} else {
|
missing,
|
||||||
None
|
agg.agg.get_fast_field_names()[0],
|
||||||
};
|
)?
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let limits = limits.clone();
|
||||||
let agg = AggregationWithAccessor {
|
let agg = AggregationWithAccessor {
|
||||||
segment_ordinal,
|
segment_ordinal,
|
||||||
missing_value_for_accessor,
|
missing_value_for_accessor,
|
||||||
@@ -255,7 +261,7 @@ impl AggregationWithAccessor {
|
|||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
str_dict_column: str_dict_column.clone(),
|
str_dict_column: str_dict_column.clone(),
|
||||||
limits: limits.new_guard(),
|
limits,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
};
|
};
|
||||||
res.push(agg);
|
res.push(agg);
|
||||||
@@ -330,7 +336,14 @@ impl AggregationWithAccessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_missing_val(
|
/// Get the missing value as internal u64 representation
|
||||||
|
///
|
||||||
|
/// For terms we use u64::MAX as sentinel value
|
||||||
|
/// For numerical data we convert the value into the representation
|
||||||
|
/// we would get from the fast field, when we open it as u64_lenient_for_type.
|
||||||
|
///
|
||||||
|
/// That way we can use it the same way as if it would come from the fastfield.
|
||||||
|
fn get_missing_val_as_u64_lenient(
|
||||||
column_type: ColumnType,
|
column_type: ColumnType,
|
||||||
missing: &Key,
|
missing: &Key,
|
||||||
field_name: &str,
|
field_name: &str,
|
||||||
@@ -339,9 +352,18 @@ fn get_missing_val(
|
|||||||
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
// Allow fallback to number on text fields
|
// Allow fallback to number on text fields
|
||||||
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
|
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
|
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
Key::F64(val) if column_type.numerical_type().is_some() => {
|
Key::F64(val) if column_type.numerical_type().is_some() => {
|
||||||
f64_to_fastfield_u64(*val, &column_type)
|
f64_to_fastfield_u64(*val, &column_type)
|
||||||
}
|
}
|
||||||
|
// NOTE: We may loose precision of the passed missing value by casting i64 and u64 to f64.
|
||||||
|
Key::I64(val) if column_type.numerical_type().is_some() => {
|
||||||
|
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||||
|
}
|
||||||
|
Key::U64(val) if column_type.numerical_type().is_some() => {
|
||||||
|
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||||
"Missing value {missing:?} for field {field_name} is not supported for column \
|
"Missing value {missing:?} for field {field_name} is not supported for column \
|
||||||
@@ -365,7 +387,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
|
|||||||
aggs: &Aggregations,
|
aggs: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: &AggregationLimits,
|
limits: &AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationsWithAccessor> {
|
) -> crate::Result<AggregationsWithAccessor> {
|
||||||
let mut aggss = Vec::new();
|
let mut aggss = Vec::new();
|
||||||
for (key, agg) in aggs.iter() {
|
for (key, agg) in aggs.iter() {
|
||||||
|
|||||||
@@ -1,4 +1,5 @@
|
|||||||
//! Contains the final aggregation tree.
|
//! Contains the final aggregation tree.
|
||||||
|
//!
|
||||||
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
|
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
|
||||||
//! This conversion computes the final result. For example: The intermediate result contains
|
//! This conversion computes the final result. For example: The intermediate result contains
|
||||||
//! intermediate average results, which is the sum and the number of values. The actual average is
|
//! intermediate average results, which is the sum and the number of values. The actual average is
|
||||||
@@ -187,7 +188,7 @@ pub enum BucketEntries<T> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl<T> BucketEntries<T> {
|
impl<T> BucketEntries<T> {
|
||||||
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
|
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
|
||||||
match self {
|
match self {
|
||||||
BucketEntries::Vec(vec) => Box::new(vec.iter()),
|
BucketEntries::Vec(vec) => Box::new(vec.iter()),
|
||||||
BucketEntries::HashMap(map) => Box::new(map.values()),
|
BucketEntries::HashMap(map) => Box::new(map.values()),
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use crate::aggregation::agg_result::AggregationResults;
|
|||||||
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
||||||
use crate::aggregation::collector::AggregationCollector;
|
use crate::aggregation::collector::AggregationCollector;
|
||||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use crate::aggregation::segment_agg_result::AggregationLimits;
|
use crate::aggregation::segment_agg_result::AggregationLimitsGuard;
|
||||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
||||||
use crate::aggregation::DistributedAggregationCollector;
|
use crate::aggregation::DistributedAggregationCollector;
|
||||||
use crate::query::{AllQuery, TermQuery};
|
use crate::query::{AllQuery, TermQuery};
|
||||||
@@ -130,7 +130,7 @@ fn test_aggregation_flushing(
|
|||||||
let agg_res: AggregationResults = if use_distributed_collector {
|
let agg_res: AggregationResults = if use_distributed_collector {
|
||||||
let collector = DistributedAggregationCollector::from_aggs(
|
let collector = DistributedAggregationCollector::from_aggs(
|
||||||
agg_req.clone(),
|
agg_req.clone(),
|
||||||
AggregationLimits::default(),
|
AggregationLimitsGuard::default(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
@@ -146,7 +146,7 @@ fn test_aggregation_flushing(
|
|||||||
.expect("Post deserialization failed");
|
.expect("Post deserialization failed");
|
||||||
|
|
||||||
intermediate_agg_result
|
intermediate_agg_result
|
||||||
.into_final_result(agg_req, &Default::default())
|
.into_final_result(agg_req, Default::default())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
} else {
|
} else {
|
||||||
let collector = get_collector(agg_req);
|
let collector = get_collector(agg_req);
|
||||||
@@ -460,7 +460,7 @@ fn test_aggregation_level2(
|
|||||||
|
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let res = searcher.search(&term_query, &collector).unwrap();
|
let res = searcher.search(&term_query, &collector).unwrap();
|
||||||
res.into_final_result(agg_req.clone(), &Default::default())
|
res.into_final_result(agg_req.clone(), Default::default())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
} else {
|
} else {
|
||||||
let collector = get_collector(agg_req.clone());
|
let collector = get_collector(agg_req.clone());
|
||||||
@@ -870,7 +870,7 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
// => Segment with all boolen
|
// => Segment with all boolean
|
||||||
index_writer
|
index_writer
|
||||||
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -939,11 +939,11 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
},
|
},
|
||||||
"termagg": {
|
"termagg": {
|
||||||
"buckets": [
|
"buckets": [
|
||||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
{ "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } },
|
||||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
||||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
{ "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } },
|
||||||
],
|
],
|
||||||
"sum_other_doc_count": 0
|
"sum_other_doc_count": 0
|
||||||
}
|
}
|
||||||
@@ -951,3 +951,60 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let json = schema_builder.add_json_field("json", FAST);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||||
|
// => Segment with all values f64 numeric
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10.5})))
|
||||||
|
.unwrap();
|
||||||
|
// Gets converted to f64!
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
// => Segment with all values i64 numeric
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
|
// All bucket types
|
||||||
|
let agg_req_str = r#"
|
||||||
|
{
|
||||||
|
"termagg": {
|
||||||
|
"terms": {
|
||||||
|
"field": "json.mixed_price"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} "#;
|
||||||
|
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||||
|
let aggregation_collector = get_collector(agg);
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
|
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
|
||||||
|
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
|
||||||
|
use pretty_assertions::assert_eq;
|
||||||
|
assert_eq!(
|
||||||
|
&aggregation_res_json,
|
||||||
|
&serde_json::json!({
|
||||||
|
"termagg": {
|
||||||
|
"buckets": [
|
||||||
|
{ "doc_count": 2, "key": 10},
|
||||||
|
{ "doc_count": 1, "key": 10.5},
|
||||||
|
],
|
||||||
|
"doc_count_error_upper_bound": 0,
|
||||||
|
"sum_other_doc_count": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|||||||
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub(crate) mod tests {
|
||||||
use pretty_assertions::assert_eq;
|
use pretty_assertions::assert_eq;
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|||||||
@@ -438,7 +438,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
|||||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||||
histogram_req: &HistogramAggregation,
|
histogram_req: &HistogramAggregation,
|
||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<BucketEntry>> {
|
) -> crate::Result<Vec<BucketEntry>> {
|
||||||
// Generate the full list of buckets without gaps.
|
// Generate the full list of buckets without gaps.
|
||||||
//
|
//
|
||||||
@@ -496,7 +496,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
|||||||
is_date_agg: bool,
|
is_date_agg: bool,
|
||||||
histogram_req: &HistogramAggregation,
|
histogram_req: &HistogramAggregation,
|
||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<BucketEntry>> {
|
) -> crate::Result<Vec<BucketEntry>> {
|
||||||
// Normalization is column type dependent.
|
// Normalization is column type dependent.
|
||||||
// The request used in the the call to final is not yet be normalized.
|
// The request used in the the call to final is not yet be normalized.
|
||||||
@@ -750,7 +750,7 @@ mod tests {
|
|||||||
agg_req,
|
agg_req,
|
||||||
&index,
|
&index,
|
||||||
None,
|
None,
|
||||||
AggregationLimits::new(Some(5_000), None),
|
AggregationLimitsGuard::new(Some(5_000), None),
|
||||||
)
|
)
|
||||||
.unwrap_err();
|
.unwrap_err();
|
||||||
assert!(res.to_string().starts_with(
|
assert!(res.to_string().starts_with(
|
||||||
|
|||||||
@@ -112,18 +112,64 @@ impl Serialize for CustomOrder {
|
|||||||
impl<'de> Deserialize<'de> for CustomOrder {
|
impl<'de> Deserialize<'de> for CustomOrder {
|
||||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||||
where D: Deserializer<'de> {
|
where D: Deserializer<'de> {
|
||||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
let value = serde_json::Value::deserialize(deserializer)?;
|
||||||
if let Some((key, value)) = map.into_iter().next() {
|
let return_err = |message, val: serde_json::Value| {
|
||||||
|
de::Error::custom(format!(
|
||||||
|
"{}, but got {}",
|
||||||
|
message,
|
||||||
|
serde_json::to_string(&val).unwrap()
|
||||||
|
))
|
||||||
|
};
|
||||||
|
|
||||||
|
match value {
|
||||||
|
serde_json::Value::Object(map) => {
|
||||||
|
if map.len() != 1 {
|
||||||
|
return Err(return_err(
|
||||||
|
"expected exactly one key-value pair in the order map",
|
||||||
|
map.into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (key, value) = map.into_iter().next().unwrap();
|
||||||
|
let order = serde_json::from_value(value).map_err(de::Error::custom)?;
|
||||||
|
|
||||||
Ok(CustomOrder {
|
Ok(CustomOrder {
|
||||||
target: key.as_str().into(),
|
target: key.as_str().into(),
|
||||||
order: value,
|
order,
|
||||||
})
|
})
|
||||||
} else {
|
|
||||||
Err(de::Error::custom(
|
|
||||||
"unexpected empty map in order".to_string(),
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
})
|
serde_json::Value::Array(arr) => {
|
||||||
|
if arr.is_empty() {
|
||||||
|
return Err(return_err("unexpected empty array in order", arr.into()));
|
||||||
|
}
|
||||||
|
if arr.len() != 1 {
|
||||||
|
return Err(return_err(
|
||||||
|
"only one sort order supported currently",
|
||||||
|
arr.into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
let entry = arr.into_iter().next().unwrap();
|
||||||
|
let map = entry
|
||||||
|
.as_object()
|
||||||
|
.ok_or_else(|| return_err("expected object as sort order", entry.clone()))?;
|
||||||
|
let (key, value) = map.into_iter().next().ok_or_else(|| {
|
||||||
|
return_err(
|
||||||
|
"expected exactly one key-value pair in the order map",
|
||||||
|
entry.clone(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let order = serde_json::from_value(value.clone()).map_err(de::Error::custom)?;
|
||||||
|
|
||||||
|
Ok(CustomOrder {
|
||||||
|
target: key.as_str().into(),
|
||||||
|
order,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
_ => Err(return_err(
|
||||||
|
"unexpected type, expected an object or array",
|
||||||
|
value,
|
||||||
|
)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,11 +184,23 @@ fn custom_order_serde_test() {
|
|||||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(order, order_deser);
|
||||||
|
let order_deser: CustomOrder = serde_json::from_str("[{\"_key\":\"desc\"}]").unwrap();
|
||||||
assert_eq!(order, order_deser);
|
assert_eq!(order, order_deser);
|
||||||
|
|
||||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||||
assert!(order_deser.is_err());
|
assert!(order_deser.is_err());
|
||||||
|
|
||||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||||
assert!(order_deser.is_err());
|
assert!(order_deser
|
||||||
|
.unwrap_err()
|
||||||
|
.to_string()
|
||||||
|
.contains("unexpected empty array in order"));
|
||||||
|
|
||||||
|
let order_deser: serde_json::Result<CustomOrder> =
|
||||||
|
serde_json::from_str(r#"[{"_key":"desc"},{"_key":"desc"}]"#);
|
||||||
|
assert_eq!(
|
||||||
|
order_deser.unwrap_err().to_string(),
|
||||||
|
r#"only one sort order supported currently, but got [{"_key":"desc"},{"_key":"desc"}]"#
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -4,7 +4,6 @@ use std::ops::Range;
|
|||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::aggregation::agg_limits::ResourceLimitGuard;
|
|
||||||
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
|
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
@@ -17,6 +16,7 @@ use crate::aggregation::*;
|
|||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
/// Provide user-defined buckets to aggregate on.
|
/// Provide user-defined buckets to aggregate on.
|
||||||
|
///
|
||||||
/// Two special buckets will automatically be created to cover the whole range of values.
|
/// Two special buckets will automatically be created to cover the whole range of values.
|
||||||
/// The provided buckets have to be continuous.
|
/// The provided buckets have to be continuous.
|
||||||
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
||||||
@@ -270,7 +270,7 @@ impl SegmentRangeCollector {
|
|||||||
pub(crate) fn from_req_and_validate(
|
pub(crate) fn from_req_and_validate(
|
||||||
req: &RangeAggregation,
|
req: &RangeAggregation,
|
||||||
sub_aggregation: &mut AggregationsWithAccessor,
|
sub_aggregation: &mut AggregationsWithAccessor,
|
||||||
limits: &ResourceLimitGuard,
|
limits: &mut AggregationLimitsGuard,
|
||||||
field_type: ColumnType,
|
field_type: ColumnType,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
@@ -471,7 +471,7 @@ mod tests {
|
|||||||
SegmentRangeCollector::from_req_and_validate(
|
SegmentRangeCollector::from_req_and_validate(
|
||||||
&req,
|
&req,
|
||||||
&mut Default::default(),
|
&mut Default::default(),
|
||||||
&AggregationLimits::default().new_guard(),
|
&mut AggregationLimitsGuard::default(),
|
||||||
field_type,
|
field_type,
|
||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,7 +3,9 @@ use std::io;
|
|||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
|
|
||||||
use columnar::column_values::CompactSpaceU64Accessor;
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
use columnar::{
|
||||||
|
ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalValue,
|
||||||
|
};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
@@ -19,11 +21,11 @@ use crate::aggregation::intermediate_agg_result::{
|
|||||||
use crate::aggregation::segment_agg_result::{
|
use crate::aggregation::segment_agg_result::{
|
||||||
build_segment_agg_collector, SegmentAggregationCollector,
|
build_segment_agg_collector, SegmentAggregationCollector,
|
||||||
};
|
};
|
||||||
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
|
use crate::aggregation::{format_date, Key};
|
||||||
use crate::error::DataCorruption;
|
use crate::error::DataCorruption;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
/// Creates a bucket for every unique term and counts the number of occurences.
|
/// Creates a bucket for every unique term and counts the number of occurrences.
|
||||||
/// Note that doc_count in the response buckets equals term count here.
|
/// Note that doc_count in the response buckets equals term count here.
|
||||||
///
|
///
|
||||||
/// If the text is untokenized and single value, that means one term per document and therefore it
|
/// If the text is untokenized and single value, that means one term per document and therefore it
|
||||||
@@ -156,7 +158,7 @@ pub struct TermsAggregation {
|
|||||||
/// when loading the text.
|
/// when loading the text.
|
||||||
/// Special Case 1:
|
/// Special Case 1:
|
||||||
/// If we have multiple columns on one field, we need to have a union on the indices on both
|
/// If we have multiple columns on one field, we need to have a union on the indices on both
|
||||||
/// columns, to find docids without a value. That requires a special missing aggreggation.
|
/// columns, to find docids without a value. That requires a special missing aggregation.
|
||||||
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
|
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
|
||||||
/// the special missing aggregation, since there is no mechanism in the numerical column to
|
/// the special missing aggregation, since there is no mechanism in the numerical column to
|
||||||
/// add text.
|
/// add text.
|
||||||
@@ -362,7 +364,7 @@ impl SegmentTermCollector {
|
|||||||
let term_buckets = TermBuckets::default();
|
let term_buckets = TermBuckets::default();
|
||||||
|
|
||||||
if let Some(custom_order) = req.order.as_ref() {
|
if let Some(custom_order) = req.order.as_ref() {
|
||||||
// Validate sub aggregtion exists
|
// Validate sub aggregation exists
|
||||||
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
|
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
|
||||||
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||||
|
|
||||||
@@ -497,6 +499,12 @@ impl SegmentTermCollector {
|
|||||||
Key::F64(val) => {
|
Key::F64(val) => {
|
||||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||||
}
|
}
|
||||||
|
Key::U64(val) => {
|
||||||
|
dict.insert(IntermediateKey::U64(*val), intermediate_entry);
|
||||||
|
}
|
||||||
|
Key::I64(val) => {
|
||||||
|
dict.insert(IntermediateKey::I64(*val), intermediate_entry);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
entries.swap_remove(index);
|
entries.swap_remove(index);
|
||||||
@@ -583,8 +591,26 @@ impl SegmentTermCollector {
|
|||||||
} else {
|
} else {
|
||||||
for (val, doc_count) in entries {
|
for (val, doc_count) in entries {
|
||||||
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
|
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
|
||||||
let val = f64_from_fastfield_u64(val, &self.column_type);
|
if self.column_type == ColumnType::U64 {
|
||||||
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||||
|
} else if self.column_type == ColumnType::I64 {
|
||||||
|
dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
|
||||||
|
} else {
|
||||||
|
let val = f64::from_u64(val);
|
||||||
|
let val: NumericalValue = val.into();
|
||||||
|
|
||||||
|
match val.normalize() {
|
||||||
|
NumericalValue::U64(val) => {
|
||||||
|
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
NumericalValue::I64(val) => {
|
||||||
|
dict.insert(IntermediateKey::I64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
NumericalValue::F64(val) => {
|
||||||
|
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -643,7 +669,7 @@ mod tests {
|
|||||||
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
||||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||||
};
|
};
|
||||||
use crate::aggregation::AggregationLimits;
|
use crate::aggregation::AggregationLimitsGuard;
|
||||||
use crate::indexer::NoMergePolicy;
|
use crate::indexer::NoMergePolicy;
|
||||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||||
use crate::{Index, IndexWriter};
|
use crate::{Index, IndexWriter};
|
||||||
@@ -1206,8 +1232,8 @@ mod tests {
|
|||||||
#[test]
|
#[test]
|
||||||
fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
|
fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
|
||||||
let terms_per_segment = vec![
|
let terms_per_segment = vec![
|
||||||
vec!["terma", "terma", "termb", "termb", "termb", "termc"],
|
vec!["terma", "terma", "termb", "termb", "termb"],
|
||||||
vec!["terma", "terma", "termb", "termc", "termc"],
|
vec!["terma", "terma", "termb"],
|
||||||
];
|
];
|
||||||
|
|
||||||
let index = get_test_index_from_terms(false, &terms_per_segment)?;
|
let index = get_test_index_from_terms(false, &terms_per_segment)?;
|
||||||
@@ -1229,8 +1255,6 @@ mod tests {
|
|||||||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
|
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
|
||||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||||||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
|
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
|
||||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
|
|
||||||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
|
|
||||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||||
|
|
||||||
@@ -1398,7 +1422,7 @@ mod tests {
|
|||||||
agg_req,
|
agg_req,
|
||||||
&index,
|
&index,
|
||||||
None,
|
None,
|
||||||
AggregationLimits::new(Some(50_000), None),
|
AggregationLimitsGuard::new(Some(50_000), None),
|
||||||
)
|
)
|
||||||
.unwrap_err();
|
.unwrap_err();
|
||||||
assert!(res
|
assert!(res
|
||||||
@@ -1659,7 +1683,7 @@ mod tests {
|
|||||||
res["my_texts"]["buckets"][2]["key"],
|
res["my_texts"]["buckets"][2]["key"],
|
||||||
serde_json::Value::Null
|
serde_json::Value::Null
|
||||||
);
|
);
|
||||||
// text field with numner as missing fallback
|
// text field with number as missing fallback
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
|
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
|
||||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||||
@@ -1719,6 +1743,54 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn terms_aggregation_u64_value() -> crate::Result<()> {
|
||||||
|
// Make sure that large u64 are not truncated
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let id_field = schema_builder.add_u64_field("id", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||||
|
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 9_223_372_036_854_775_807u64,
|
||||||
|
))?;
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 1_769_070_189_829_214_202u64,
|
||||||
|
))?;
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 1_769_070_189_829_214_202u64,
|
||||||
|
))?;
|
||||||
|
index_writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"my_ids": {
|
||||||
|
"terms": {
|
||||||
|
"field": "id"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||||
|
|
||||||
|
// id field
|
||||||
|
assert_eq!(
|
||||||
|
res["my_ids"]["buckets"][0]["key"],
|
||||||
|
1_769_070_189_829_214_202u64
|
||||||
|
);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
|
||||||
|
assert_eq!(
|
||||||
|
res["my_ids"]["buckets"][1]["key"],
|
||||||
|
9_223_372_036_854_775_807u64
|
||||||
|
);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn terms_aggregation_missing1() -> crate::Result<()> {
|
fn terms_aggregation_missing1() -> crate::Result<()> {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
@@ -1785,7 +1857,7 @@ mod tests {
|
|||||||
res["my_texts"]["buckets"][2]["key"],
|
res["my_texts"]["buckets"][2]["key"],
|
||||||
serde_json::Value::Null
|
serde_json::Value::Null
|
||||||
);
|
);
|
||||||
// text field with numner as missing fallback
|
// text field with number as missing fallback
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
|
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
|
||||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||||
|
|||||||
@@ -70,7 +70,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
|||||||
)?;
|
)?;
|
||||||
missing_entry.sub_aggregation = res;
|
missing_entry.sub_aggregation = res;
|
||||||
}
|
}
|
||||||
|
|
||||||
entries.insert(missing.into(), missing_entry);
|
entries.insert(missing.into(), missing_entry);
|
||||||
|
|
||||||
let bucket = IntermediateBucketResult::Terms {
|
let bucket = IntermediateBucketResult::Terms {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use super::agg_result::AggregationResults;
|
|||||||
use super::buf_collector::BufAggregationCollector;
|
use super::buf_collector::BufAggregationCollector;
|
||||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use super::segment_agg_result::{
|
use super::segment_agg_result::{
|
||||||
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
|
build_segment_agg_collector, AggregationLimitsGuard, SegmentAggregationCollector,
|
||||||
};
|
};
|
||||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
||||||
use crate::collector::{Collector, SegmentCollector};
|
use crate::collector::{Collector, SegmentCollector};
|
||||||
@@ -22,7 +22,7 @@ pub const DEFAULT_MEMORY_LIMIT: u64 = 500_000_000;
|
|||||||
/// The collector collects all aggregations by the underlying aggregation request.
|
/// The collector collects all aggregations by the underlying aggregation request.
|
||||||
pub struct AggregationCollector {
|
pub struct AggregationCollector {
|
||||||
agg: Aggregations,
|
agg: Aggregations,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AggregationCollector {
|
impl AggregationCollector {
|
||||||
@@ -30,7 +30,7 @@ impl AggregationCollector {
|
|||||||
///
|
///
|
||||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||||
/// bucket limit)
|
/// bucket limit)
|
||||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||||
Self { agg, limits }
|
Self { agg, limits }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -45,7 +45,7 @@ impl AggregationCollector {
|
|||||||
/// into the final `AggregationResults` via the `into_final_result()` method.
|
/// into the final `AggregationResults` via the `into_final_result()` method.
|
||||||
pub struct DistributedAggregationCollector {
|
pub struct DistributedAggregationCollector {
|
||||||
agg: Aggregations,
|
agg: Aggregations,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DistributedAggregationCollector {
|
impl DistributedAggregationCollector {
|
||||||
@@ -53,7 +53,7 @@ impl DistributedAggregationCollector {
|
|||||||
///
|
///
|
||||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||||
/// bucket limit)
|
/// bucket limit)
|
||||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||||
Self { agg, limits }
|
Self { agg, limits }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -115,7 +115,7 @@ impl Collector for AggregationCollector {
|
|||||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||||
) -> crate::Result<Self::Fruit> {
|
) -> crate::Result<Self::Fruit> {
|
||||||
let res = merge_fruits(segment_fruits)?;
|
let res = merge_fruits(segment_fruits)?;
|
||||||
res.into_final_result(self.agg.clone(), &self.limits)
|
res.into_final_result(self.agg.clone(), self.limits.clone())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -147,7 +147,7 @@ impl AggregationSegmentCollector {
|
|||||||
agg: &Aggregations,
|
agg: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: &AggregationLimits,
|
limits: &AggregationLimitsGuard,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
let mut aggs_with_accessor =
|
let mut aggs_with_accessor =
|
||||||
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
|
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ use super::metric::{
|
|||||||
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
||||||
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
||||||
};
|
};
|
||||||
use super::segment_agg_result::AggregationLimits;
|
use super::segment_agg_result::AggregationLimitsGuard;
|
||||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||||
@@ -51,12 +51,18 @@ pub enum IntermediateKey {
|
|||||||
Str(String),
|
Str(String),
|
||||||
/// `f64` key
|
/// `f64` key
|
||||||
F64(f64),
|
F64(f64),
|
||||||
|
/// `i64` key
|
||||||
|
I64(i64),
|
||||||
|
/// `u64` key
|
||||||
|
U64(u64),
|
||||||
}
|
}
|
||||||
impl From<Key> for IntermediateKey {
|
impl From<Key> for IntermediateKey {
|
||||||
fn from(value: Key) -> Self {
|
fn from(value: Key) -> Self {
|
||||||
match value {
|
match value {
|
||||||
Key::Str(s) => Self::Str(s),
|
Key::Str(s) => Self::Str(s),
|
||||||
Key::F64(f) => Self::F64(f),
|
Key::F64(f) => Self::F64(f),
|
||||||
|
Key::U64(f) => Self::U64(f),
|
||||||
|
Key::I64(f) => Self::I64(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -73,7 +79,9 @@ impl From<IntermediateKey> for Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
IntermediateKey::F64(f) => Self::F64(f),
|
IntermediateKey::F64(f) => Self::F64(f),
|
||||||
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
|
IntermediateKey::Bool(f) => Self::U64(f as u64),
|
||||||
|
IntermediateKey::U64(f) => Self::U64(f),
|
||||||
|
IntermediateKey::I64(f) => Self::I64(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -86,6 +94,8 @@ impl std::hash::Hash for IntermediateKey {
|
|||||||
match self {
|
match self {
|
||||||
IntermediateKey::Str(text) => text.hash(state),
|
IntermediateKey::Str(text) => text.hash(state),
|
||||||
IntermediateKey::F64(val) => val.to_bits().hash(state),
|
IntermediateKey::F64(val) => val.to_bits().hash(state),
|
||||||
|
IntermediateKey::U64(val) => val.hash(state),
|
||||||
|
IntermediateKey::I64(val) => val.hash(state),
|
||||||
IntermediateKey::Bool(val) => val.hash(state),
|
IntermediateKey::Bool(val) => val.hash(state),
|
||||||
IntermediateKey::IpAddr(val) => val.hash(state),
|
IntermediateKey::IpAddr(val) => val.hash(state),
|
||||||
}
|
}
|
||||||
@@ -112,9 +122,9 @@ impl IntermediateAggregationResults {
|
|||||||
pub fn into_final_result(
|
pub fn into_final_result(
|
||||||
self,
|
self,
|
||||||
req: Aggregations,
|
req: Aggregations,
|
||||||
limits: &AggregationLimits,
|
mut limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResults> {
|
) -> crate::Result<AggregationResults> {
|
||||||
let res = self.into_final_result_internal(&req, limits)?;
|
let res = self.into_final_result_internal(&req, &mut limits)?;
|
||||||
let bucket_count = res.get_bucket_count() as u32;
|
let bucket_count = res.get_bucket_count() as u32;
|
||||||
if bucket_count > limits.get_bucket_limit() {
|
if bucket_count > limits.get_bucket_limit() {
|
||||||
return Err(TantivyError::AggregationError(
|
return Err(TantivyError::AggregationError(
|
||||||
@@ -131,7 +141,7 @@ impl IntermediateAggregationResults {
|
|||||||
pub(crate) fn into_final_result_internal(
|
pub(crate) fn into_final_result_internal(
|
||||||
self,
|
self,
|
||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResults> {
|
) -> crate::Result<AggregationResults> {
|
||||||
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
||||||
for (key, agg_res) in self.aggs_res.into_iter() {
|
for (key, agg_res) in self.aggs_res.into_iter() {
|
||||||
@@ -247,7 +257,7 @@ impl IntermediateAggregationResult {
|
|||||||
pub(crate) fn into_final_result(
|
pub(crate) fn into_final_result(
|
||||||
self,
|
self,
|
||||||
req: &Aggregation,
|
req: &Aggregation,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResult> {
|
) -> crate::Result<AggregationResult> {
|
||||||
let res = match self {
|
let res = match self {
|
||||||
IntermediateAggregationResult::Bucket(bucket) => {
|
IntermediateAggregationResult::Bucket(bucket) => {
|
||||||
@@ -422,7 +432,7 @@ impl IntermediateBucketResult {
|
|||||||
pub(crate) fn into_final_bucket_result(
|
pub(crate) fn into_final_bucket_result(
|
||||||
self,
|
self,
|
||||||
req: &Aggregation,
|
req: &Aggregation,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketResult> {
|
) -> crate::Result<BucketResult> {
|
||||||
match self {
|
match self {
|
||||||
IntermediateBucketResult::Range(range_res) => {
|
IntermediateBucketResult::Range(range_res) => {
|
||||||
@@ -586,7 +596,7 @@ impl IntermediateTermBucketResult {
|
|||||||
self,
|
self,
|
||||||
req: &TermsAggregation,
|
req: &TermsAggregation,
|
||||||
sub_aggregation_req: &Aggregations,
|
sub_aggregation_req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketResult> {
|
) -> crate::Result<BucketResult> {
|
||||||
let req = TermsAggregationInternal::from_req(req);
|
let req = TermsAggregationInternal::from_req(req);
|
||||||
let mut buckets: Vec<BucketEntry> = self
|
let mut buckets: Vec<BucketEntry> = self
|
||||||
@@ -713,7 +723,7 @@ impl IntermediateHistogramBucketEntry {
|
|||||||
pub(crate) fn into_final_bucket_entry(
|
pub(crate) fn into_final_bucket_entry(
|
||||||
self,
|
self,
|
||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketEntry> {
|
) -> crate::Result<BucketEntry> {
|
||||||
Ok(BucketEntry {
|
Ok(BucketEntry {
|
||||||
key_as_string: None,
|
key_as_string: None,
|
||||||
@@ -748,7 +758,7 @@ impl IntermediateRangeBucketEntry {
|
|||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
_range_req: &RangeAggregation,
|
_range_req: &RangeAggregation,
|
||||||
column_type: Option<ColumnType>,
|
column_type: Option<ColumnType>,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<RangeBucketEntry> {
|
) -> crate::Result<RangeBucketEntry> {
|
||||||
let mut range_bucket_entry = RangeBucketEntry {
|
let mut range_bucket_entry = RangeBucketEntry {
|
||||||
key: self.key.into(),
|
key: self.key.into(),
|
||||||
@@ -850,7 +860,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_intermediat_tree_with_ranges(
|
fn get_intermediate_tree_with_ranges(
|
||||||
data: &[(String, u64, String, u64)],
|
data: &[(String, u64, String, u64)],
|
||||||
) -> IntermediateAggregationResults {
|
) -> IntermediateAggregationResults {
|
||||||
let mut map = HashMap::new();
|
let mut map = HashMap::new();
|
||||||
@@ -886,18 +896,18 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_1() {
|
fn test_merge_fruits_tree_1() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 60, "1900".to_string(), 30),
|
("red".to_string(), 60, "1900".to_string(), 30),
|
||||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
tree_left.merge_fruits(tree_right).unwrap();
|
tree_left.merge_fruits(tree_right).unwrap();
|
||||||
|
|
||||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 110, "1900".to_string(), 55),
|
("red".to_string(), 110, "1900".to_string(), 55),
|
||||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||||
]);
|
]);
|
||||||
@@ -907,18 +917,18 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_2() {
|
fn test_merge_fruits_tree_2() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 60, "1900".to_string(), 30),
|
("red".to_string(), 60, "1900".to_string(), 30),
|
||||||
("green".to_string(), 25, "1900".to_string(), 50),
|
("green".to_string(), 25, "1900".to_string(), 50),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
tree_left.merge_fruits(tree_right).unwrap();
|
tree_left.merge_fruits(tree_right).unwrap();
|
||||||
|
|
||||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 110, "1900".to_string(), 55),
|
("red".to_string(), 110, "1900".to_string(), 55),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
("green".to_string(), 25, "1900".to_string(), 50),
|
("green".to_string(), 25, "1900".to_string(), 50),
|
||||||
@@ -929,7 +939,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_empty() {
|
fn test_merge_fruits_tree_empty() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
|
|||||||
@@ -179,10 +179,11 @@ impl SegmentCardinalityCollector {
|
|||||||
Ok(())
|
Ok(())
|
||||||
})?;
|
})?;
|
||||||
if has_missing {
|
if has_missing {
|
||||||
|
// Replace missing with the actual value provided
|
||||||
let missing_key = self
|
let missing_key = self
|
||||||
.missing
|
.missing
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.expect("Found placeholder term_ord but `missing` is None");
|
.expect("Found sentinel value u64::MAX for term_ord but `missing` is not set");
|
||||||
match missing_key {
|
match missing_key {
|
||||||
Key::Str(missing) => {
|
Key::Str(missing) => {
|
||||||
self.cardinality.sketch.insert_any(&missing);
|
self.cardinality.sketch.insert_any(&missing);
|
||||||
@@ -191,6 +192,12 @@ impl SegmentCardinalityCollector {
|
|||||||
let val = f64_to_u64(*val);
|
let val = f64_to_u64(*val);
|
||||||
self.cardinality.sketch.insert_any(&val);
|
self.cardinality.sketch.insert_any(&val);
|
||||||
}
|
}
|
||||||
|
Key::U64(val) => {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
Key::I64(val) => {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -163,8 +163,8 @@ impl PartialEq for PercentilesCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_percentil(percentil: f64) -> String {
|
fn format_percentile(percentile: f64) -> String {
|
||||||
let mut out = percentil.to_string();
|
let mut out = percentile.to_string();
|
||||||
// Slightly silly way to format trailing decimals
|
// Slightly silly way to format trailing decimals
|
||||||
if !out.contains('.') {
|
if !out.contains('.') {
|
||||||
out.push_str(".0");
|
out.push_str(".0");
|
||||||
@@ -197,7 +197,7 @@ impl PercentilesCollector {
|
|||||||
let values = if req.keyed {
|
let values = if req.keyed {
|
||||||
PercentileValues::HashMap(
|
PercentileValues::HashMap(
|
||||||
iter_quantile_and_values
|
iter_quantile_and_values
|
||||||
.map(|(val, quantil)| (format_percentil(val), quantil))
|
.map(|(val, quantil)| (format_percentile(val), quantil))
|
||||||
.collect(),
|
.collect(),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -139,7 +139,7 @@ impl<'de> Deserialize<'de> for KeyOrder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tranform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
|
// Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
|
||||||
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
|
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
|
||||||
// Replace `*` glob with `.*` regex
|
// Replace `*` glob with `.*` regex
|
||||||
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
|
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
|
||||||
|
|||||||
@@ -148,7 +148,7 @@ mod agg_tests;
|
|||||||
|
|
||||||
use core::fmt;
|
use core::fmt;
|
||||||
|
|
||||||
pub use agg_limits::AggregationLimits;
|
pub use agg_limits::AggregationLimitsGuard;
|
||||||
pub use collector::{
|
pub use collector::{
|
||||||
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
||||||
DEFAULT_BUCKET_LIMIT,
|
DEFAULT_BUCKET_LIMIT,
|
||||||
@@ -180,7 +180,7 @@ pub(crate) fn deserialize_option_f64<'de, D>(deserializer: D) -> Result<Option<f
|
|||||||
where D: Deserializer<'de> {
|
where D: Deserializer<'de> {
|
||||||
struct StringOrFloatVisitor;
|
struct StringOrFloatVisitor;
|
||||||
|
|
||||||
impl<'de> Visitor<'de> for StringOrFloatVisitor {
|
impl Visitor<'_> for StringOrFloatVisitor {
|
||||||
type Value = Option<f64>;
|
type Value = Option<f64>;
|
||||||
|
|
||||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||||
@@ -226,7 +226,7 @@ pub(crate) fn deserialize_f64<'de, D>(deserializer: D) -> Result<f64, D::Error>
|
|||||||
where D: Deserializer<'de> {
|
where D: Deserializer<'de> {
|
||||||
struct StringOrFloatVisitor;
|
struct StringOrFloatVisitor;
|
||||||
|
|
||||||
impl<'de> Visitor<'de> for StringOrFloatVisitor {
|
impl Visitor<'_> for StringOrFloatVisitor {
|
||||||
type Value = f64;
|
type Value = f64;
|
||||||
|
|
||||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||||
@@ -336,10 +336,16 @@ pub type SerializedKey = String;
|
|||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
|
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
|
||||||
/// The key to identify a bucket.
|
/// The key to identify a bucket.
|
||||||
|
///
|
||||||
|
/// The order is important, with serde untagged, that we try to deserialize into i64 first.
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub enum Key {
|
pub enum Key {
|
||||||
/// String key
|
/// String key
|
||||||
Str(String),
|
Str(String),
|
||||||
|
/// `i64` key
|
||||||
|
I64(i64),
|
||||||
|
/// `u64` key
|
||||||
|
U64(u64),
|
||||||
/// `f64` key
|
/// `f64` key
|
||||||
F64(f64),
|
F64(f64),
|
||||||
}
|
}
|
||||||
@@ -350,6 +356,8 @@ impl std::hash::Hash for Key {
|
|||||||
match self {
|
match self {
|
||||||
Key::Str(text) => text.hash(state),
|
Key::Str(text) => text.hash(state),
|
||||||
Key::F64(val) => val.to_bits().hash(state),
|
Key::F64(val) => val.to_bits().hash(state),
|
||||||
|
Key::U64(val) => val.hash(state),
|
||||||
|
Key::I64(val) => val.hash(state),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -369,6 +377,8 @@ impl Display for Key {
|
|||||||
match self {
|
match self {
|
||||||
Key::Str(val) => f.write_str(val),
|
Key::Str(val) => f.write_str(val),
|
||||||
Key::F64(val) => f.write_str(&val.to_string()),
|
Key::F64(val) => f.write_str(&val.to_string()),
|
||||||
|
Key::U64(val) => f.write_str(&val.to_string()),
|
||||||
|
Key::I64(val) => f.write_str(&val.to_string()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -448,7 +458,7 @@ mod tests {
|
|||||||
agg_req: Aggregations,
|
agg_req: Aggregations,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
query: Option<(&str, &str)>,
|
query: Option<(&str, &str)>,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<Value> {
|
) -> crate::Result<Value> {
|
||||||
let collector = AggregationCollector::from_aggs(agg_req, limits);
|
let collector = AggregationCollector::from_aggs(agg_req, limits);
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
|
||||||
pub(crate) use super::agg_limits::AggregationLimits;
|
pub(crate) use super::agg_limits::AggregationLimitsGuard;
|
||||||
use super::agg_req::AggregationVariants;
|
use super::agg_req::AggregationVariants;
|
||||||
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
|
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
|
||||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||||
@@ -103,7 +103,7 @@ pub(crate) fn build_single_agg_segment_collector(
|
|||||||
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
||||||
range_req,
|
range_req,
|
||||||
&mut req.sub_aggregation,
|
&mut req.sub_aggregation,
|
||||||
&req.limits,
|
&mut req.limits,
|
||||||
req.field_type,
|
req.field_type,
|
||||||
accessor_idx,
|
accessor_idx,
|
||||||
)?)),
|
)?)),
|
||||||
|
|||||||
@@ -13,7 +13,7 @@ struct Hit<'a> {
|
|||||||
facet: &'a Facet,
|
facet: &'a Facet,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Eq for Hit<'a> {}
|
impl Eq for Hit<'_> {}
|
||||||
|
|
||||||
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
||||||
fn eq(&self, other: &Hit<'_>) -> bool {
|
fn eq(&self, other: &Hit<'_>) -> bool {
|
||||||
@@ -27,7 +27,7 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Ord for Hit<'a> {
|
impl Ord for Hit<'_> {
|
||||||
fn cmp(&self, other: &Self) -> Ordering {
|
fn cmp(&self, other: &Self) -> Ordering {
|
||||||
other
|
other
|
||||||
.count
|
.count
|
||||||
|
|||||||
@@ -182,6 +182,7 @@ where
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
|
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
|
||||||
|
///
|
||||||
/// it transparently wraps an inner [`Collector`] but filters documents
|
/// it transparently wraps an inner [`Collector`] but filters documents
|
||||||
/// based on the result of applying the predicate to the bytes fast field.
|
/// based on the result of applying the predicate to the bytes fast field.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -495,4 +495,4 @@ where
|
|||||||
impl_downcast!(Fruit);
|
impl_downcast!(Fruit);
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests;
|
pub(crate) mod tests;
|
||||||
|
|||||||
@@ -161,7 +161,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
|||||||
/// # Ok(())
|
/// # Ok(())
|
||||||
/// # }
|
/// # }
|
||||||
/// ```
|
/// ```
|
||||||
#[allow(clippy::type_complexity)]
|
#[expect(clippy::type_complexity)]
|
||||||
#[derive(Default)]
|
#[derive(Default)]
|
||||||
pub struct MultiCollector<'a> {
|
pub struct MultiCollector<'a> {
|
||||||
collector_wrappers: Vec<
|
collector_wrappers: Vec<
|
||||||
@@ -190,7 +190,7 @@ impl<'a> MultiCollector<'a> {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> Collector for MultiCollector<'a> {
|
impl Collector for MultiCollector<'_> {
|
||||||
type Fruit = MultiFruit;
|
type Fruit = MultiFruit;
|
||||||
type Child = MultiCollectorChild;
|
type Child = MultiCollectorChild;
|
||||||
|
|
||||||
|
|||||||
@@ -15,11 +15,6 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
|||||||
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
||||||
/// should be reversed, which is useful for achieving for example largest-first
|
/// should be reversed, which is useful for achieving for example largest-first
|
||||||
/// semantics without having to wrap the feature in a `Reverse`.
|
/// semantics without having to wrap the feature in a `Reverse`.
|
||||||
///
|
|
||||||
/// WARNING: equality is not what you would expect here.
|
|
||||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
|
||||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
|
||||||
/// struct is never public.
|
|
||||||
#[derive(Clone, Default, Serialize, Deserialize)]
|
#[derive(Clone, Default, Serialize, Deserialize)]
|
||||||
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
||||||
/// The feature of the document. In practice, this is
|
/// The feature of the document. In practice, this is
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::marker::PhantomData;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use columnar::ColumnValues;
|
use columnar::ColumnValues;
|
||||||
use serde::de::DeserializeOwned;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use super::Collector;
|
use super::Collector;
|
||||||
@@ -790,7 +789,7 @@ impl<Score, D, const R: bool> From<TopNComputerDeser<Score, D, R>> for TopNCompu
|
|||||||
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
|
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
|
||||||
where
|
where
|
||||||
Score: PartialOrd + Clone,
|
Score: PartialOrd + Clone,
|
||||||
D: Serialize + DeserializeOwned + Ord + Clone,
|
D: Ord,
|
||||||
{
|
{
|
||||||
/// Create a new `TopNComputer`.
|
/// Create a new `TopNComputer`.
|
||||||
/// Internally it will allocate a buffer of size `2 * top_n`.
|
/// Internally it will allocate a buffer of size `2 * top_n`.
|
||||||
|
|||||||
91
src/compat_tests.rs
Normal file
91
src/compat_tests.rs
Normal file
@@ -0,0 +1,91 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use schema::*;
|
||||||
|
|
||||||
|
use crate::*;
|
||||||
|
|
||||||
|
fn create_index(path: &str) {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let label = schema_builder.add_text_field("label", TEXT | STORED);
|
||||||
|
let date = schema_builder.add_date_field("date", INDEXED | STORED);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
std::fs::create_dir_all(path).unwrap();
|
||||||
|
let index = Index::create_in_dir(path, schema).unwrap();
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(label => "dateformat", date => DateTime::from_timestamp_nanos(123456)))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
/// Writes an Index for the current INDEX_FORMAT_VERSION to disk.
|
||||||
|
fn create_format() {
|
||||||
|
let version = INDEX_FORMAT_VERSION.to_string();
|
||||||
|
let file_path = path_for_version(&version);
|
||||||
|
if PathBuf::from(file_path.clone()).exists() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
create_index(&file_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn path_for_version(version: &str) -> String {
|
||||||
|
format!("./tests/compat_tests_data/index_v{}/", version)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// feature flag quickwit uses a different dictionary type
|
||||||
|
#[test]
|
||||||
|
#[cfg(not(feature = "quickwit"))]
|
||||||
|
fn test_format_6() {
|
||||||
|
let path = path_for_version("6");
|
||||||
|
|
||||||
|
let index = Index::open_in_dir(path).expect("Failed to open index");
|
||||||
|
// dates are truncated to Microseconds in v6
|
||||||
|
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
/// feature flag quickwit uses a different dictionary type
|
||||||
|
#[test]
|
||||||
|
#[cfg(not(feature = "quickwit"))]
|
||||||
|
fn test_format_7() {
|
||||||
|
let path = path_for_version("7");
|
||||||
|
|
||||||
|
let index = Index::open_in_dir(path).expect("Failed to open index");
|
||||||
|
// dates are not truncated in v7 in the docstore
|
||||||
|
assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "quickwit"))]
|
||||||
|
fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
|
||||||
|
use collector::TopDocs;
|
||||||
|
let reader = index.reader().expect("Failed to create reader");
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
|
let schema = index.schema();
|
||||||
|
let label_field = schema.get_field("label").expect("Field 'label' not found");
|
||||||
|
let query_parser = query::QueryParser::for_index(index, vec![label_field]);
|
||||||
|
|
||||||
|
let query = query_parser
|
||||||
|
.parse_query("dateformat")
|
||||||
|
.expect("Failed to parse query");
|
||||||
|
let top_docs = searcher
|
||||||
|
.search(&query, &TopDocs::with_limit(1))
|
||||||
|
.expect("Search failed");
|
||||||
|
|
||||||
|
assert_eq!(top_docs.len(), 1, "Expected 1 search result");
|
||||||
|
|
||||||
|
let doc_address = top_docs[0].1;
|
||||||
|
let retrieved_doc: TantivyDocument = searcher
|
||||||
|
.doc(doc_address)
|
||||||
|
.expect("Failed to retrieve document");
|
||||||
|
|
||||||
|
let date_field = schema.get_field("date").expect("Field 'date' not found");
|
||||||
|
let date_value = retrieved_doc
|
||||||
|
.get_first(date_field)
|
||||||
|
.expect("Date field not found in document")
|
||||||
|
.as_datetime()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
|
||||||
|
assert_eq!(date_value, expected,);
|
||||||
|
}
|
||||||
@@ -100,7 +100,7 @@ impl Executor {
|
|||||||
|
|
||||||
/// Spawn a task on the pool, returning a future completing on task success.
|
/// Spawn a task on the pool, returning a future completing on task success.
|
||||||
///
|
///
|
||||||
/// If the task panic, returns `Err(())`.
|
/// If the task panics, returns `Err(())`.
|
||||||
#[cfg(feature = "quickwit")]
|
#[cfg(feature = "quickwit")]
|
||||||
pub fn spawn_blocking<T: Send + 'static>(
|
pub fn spawn_blocking<T: Send + 'static>(
|
||||||
&self,
|
&self,
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;
|
|||||||
|
|
||||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||||
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
||||||
use crate::schema::Type;
|
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
|
||||||
use crate::time::format_description::well_known::Rfc3339;
|
use crate::time::format_description::well_known::Rfc3339;
|
||||||
use crate::time::{OffsetDateTime, UtcOffset};
|
use crate::time::{OffsetDateTime, UtcOffset};
|
||||||
use crate::tokenizer::TextAnalyzer;
|
use crate::tokenizer::TextAnalyzer;
|
||||||
@@ -71,7 +71,7 @@ pub fn json_path_sep_to_dot(path: &mut str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[expect(clippy::too_many_arguments)]
|
||||||
fn index_json_object<'a, V: Value<'a>>(
|
fn index_json_object<'a, V: Value<'a>>(
|
||||||
doc: DocId,
|
doc: DocId,
|
||||||
json_visitor: V::ObjectIter,
|
json_visitor: V::ObjectIter,
|
||||||
@@ -101,7 +101,7 @@ fn index_json_object<'a, V: Value<'a>>(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::too_many_arguments)]
|
#[expect(clippy::too_many_arguments)]
|
||||||
pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||||
doc: DocId,
|
doc: DocId,
|
||||||
json_value: V,
|
json_value: V,
|
||||||
@@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
|||||||
ctx.path_to_unordered_id
|
ctx.path_to_unordered_id
|
||||||
.get_or_allocate_unordered_id(json_path_writer.as_str()),
|
.get_or_allocate_unordered_id(json_path_writer.as_str()),
|
||||||
);
|
);
|
||||||
|
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||||
term_buffer.append_type_and_fast_value(val);
|
term_buffer.append_type_and_fast_value(val);
|
||||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||||
}
|
}
|
||||||
@@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
|||||||
/// Tries to infer a JSON type from a string and append it to the term.
|
/// Tries to infer a JSON type from a string and append it to the term.
|
||||||
///
|
///
|
||||||
/// The term must be json + JSON path.
|
/// The term must be json + JSON path.
|
||||||
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
|
pub fn convert_to_fast_value_and_append_to_json_term(
|
||||||
|
mut term: Term,
|
||||||
|
phrase: &str,
|
||||||
|
truncate_date_for_search: bool,
|
||||||
|
) -> Option<Term> {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
term.value()
|
term.value()
|
||||||
.as_json_value_bytes()
|
.as_json_value_bytes()
|
||||||
@@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
|
|||||||
"JSON value bytes should be empty"
|
"JSON value bytes should be empty"
|
||||||
);
|
);
|
||||||
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
||||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
|
||||||
term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
|
if truncate_date_for_search {
|
||||||
|
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||||
|
}
|
||||||
|
term.append_type_and_fast_value(dt);
|
||||||
return Some(term);
|
return Some(term);
|
||||||
}
|
}
|
||||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ impl RetryPolicy {
|
|||||||
/// The `DirectoryLock` is an object that represents a file lock.
|
/// The `DirectoryLock` is an object that represents a file lock.
|
||||||
///
|
///
|
||||||
/// It is associated with a lock file, that gets deleted on `Drop.`
|
/// It is associated with a lock file, that gets deleted on `Drop.`
|
||||||
#[allow(dead_code)]
|
#[expect(dead_code)]
|
||||||
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
|
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
|
||||||
|
|
||||||
struct DirectoryLockGuard {
|
struct DirectoryLockGuard {
|
||||||
@@ -102,10 +102,8 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
|||||||
///
|
///
|
||||||
/// There are currently two implementations of `Directory`
|
/// There are currently two implementations of `Directory`
|
||||||
///
|
///
|
||||||
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
|
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this should be your default choice.
|
||||||
/// should be your default choice.
|
/// - The [`RamDirectory`][crate::directory::RamDirectory], which should be used mostly for tests.
|
||||||
/// - The [`RamDirectory`][crate::directory::RamDirectory], which
|
|
||||||
/// should be used mostly for tests.
|
|
||||||
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||||
/// Opens a file and returns a boxed `FileHandle`.
|
/// Opens a file and returns a boxed `FileHandle`.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
|
|||||||
});
|
});
|
||||||
/// The meta lock file is here to protect the segment files being opened by
|
/// The meta lock file is here to protect the segment files being opened by
|
||||||
/// `IndexReader::reload()` from being garbage collected.
|
/// `IndexReader::reload()` from being garbage collected.
|
||||||
|
///
|
||||||
/// It makes it possible for another process to safely consume
|
/// It makes it possible for another process to safely consume
|
||||||
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
|
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
|
||||||
/// here, but it is difficult to achieve on Windows.
|
/// here, but it is difficult to achieve on Windows.
|
||||||
|
|||||||
@@ -244,7 +244,7 @@ impl MmapDirectory {
|
|||||||
directory_path,
|
directory_path,
|
||||||
)));
|
)));
|
||||||
}
|
}
|
||||||
#[allow(clippy::bind_instead_of_map)]
|
#[expect(clippy::bind_instead_of_map)]
|
||||||
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
|
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
|
||||||
let directory_path = directory_path.to_owned();
|
let directory_path = directory_path.to_owned();
|
||||||
|
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ pub struct WatchCallbackList {
|
|||||||
/// file change is detected.
|
/// file change is detected.
|
||||||
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
|
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
#[allow(dead_code)]
|
#[expect(dead_code)]
|
||||||
pub struct WatchHandle(Arc<WatchCallback>);
|
pub struct WatchHandle(Arc<WatchCallback>);
|
||||||
|
|
||||||
impl WatchHandle {
|
impl WatchHandle {
|
||||||
|
|||||||
@@ -117,7 +117,7 @@ pub trait DocSet: Send {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DocSet for &'a mut dyn DocSet {
|
impl DocSet for &mut dyn DocSet {
|
||||||
fn advance(&mut self) -> u32 {
|
fn advance(&mut self) -> u32 {
|
||||||
(**self).advance()
|
(**self).advance()
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -25,10 +25,9 @@ impl FacetReader {
|
|||||||
/// Creates a new `FacetReader`.
|
/// Creates a new `FacetReader`.
|
||||||
///
|
///
|
||||||
/// A facet reader just wraps :
|
/// A facet reader just wraps :
|
||||||
/// - a `MultiValuedFastFieldReader` that makes it possible to
|
/// - a `MultiValuedFastFieldReader` that makes it possible to access the list of facet ords for
|
||||||
/// access the list of facet ords for a given document.
|
/// a given document.
|
||||||
/// - a `TermDictionary` that helps associating a facet to
|
/// - a `TermDictionary` that helps associating a facet to an ordinal and vice versa.
|
||||||
/// an ordinal and vice versa.
|
|
||||||
pub fn new(facet_column: StrColumn) -> FacetReader {
|
pub fn new(facet_column: StrColumn) -> FacetReader {
|
||||||
FacetReader { facet_column }
|
FacetReader { facet_column }
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -942,10 +942,10 @@ mod tests {
|
|||||||
|
|
||||||
let numbers = [100, 200, 300];
|
let numbers = [100, 200, 300];
|
||||||
let test_range = |range: RangeInclusive<u64>| {
|
let test_range = |range: RangeInclusive<u64>| {
|
||||||
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
|
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
|
||||||
let mut vec = vec![];
|
let mut vec = vec![];
|
||||||
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
||||||
assert_eq!(vec.len(), expexted_count);
|
assert_eq!(vec.len(), expected_count);
|
||||||
};
|
};
|
||||||
test_range(50..=50);
|
test_range(50..=50);
|
||||||
test_range(150..=150);
|
test_range(150..=150);
|
||||||
@@ -1020,10 +1020,10 @@ mod tests {
|
|||||||
|
|
||||||
let numbers = [1000, 1001, 1003];
|
let numbers = [1000, 1001, 1003];
|
||||||
let test_range = |range: RangeInclusive<u64>| {
|
let test_range = |range: RangeInclusive<u64>| {
|
||||||
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
|
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
|
||||||
let mut vec = vec![];
|
let mut vec = vec![];
|
||||||
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
|
||||||
assert_eq!(vec.len(), expexted_count);
|
assert_eq!(vec.len(), expected_count);
|
||||||
};
|
};
|
||||||
let test_range_variant = |start, stop| {
|
let test_range_variant = |start, stop| {
|
||||||
let start_range = start..=stop;
|
let start_range = start..=stop;
|
||||||
|
|||||||
@@ -70,13 +70,13 @@ impl FastFieldReaders {
|
|||||||
///
|
///
|
||||||
/// This function transforms `attributes.color` into a column key to be used in the `columnar`.
|
/// This function transforms `attributes.color` into a column key to be used in the `columnar`.
|
||||||
///
|
///
|
||||||
/// The logic works as follows, first we identify which field is targetted by calling
|
/// The logic works as follows, first we identify which field is targeted by calling
|
||||||
/// `schema.find_field(..)`. This method will attempt to split the user splied fast field
|
/// `schema.find_field(..)`. This method will attempt to split the user splied fast field
|
||||||
/// name by non-escaped dots, and find the longest matching schema field name.
|
/// name by non-escaped dots, and find the longest matching schema field name.
|
||||||
/// In our case, it would return the (attribute_field, "color").
|
/// In our case, it would return the (attribute_field, "color").
|
||||||
///
|
///
|
||||||
/// If no field is found, but a dynamic field is supplied, then we
|
/// If no field is found, but a dynamic field is supplied, then we
|
||||||
/// will simply assuem the user is targetting the dynamic field. (This feature is used in
|
/// will simply assume the user is targeting the dynamic field. (This feature is used in
|
||||||
/// Quickwit.)
|
/// Quickwit.)
|
||||||
///
|
///
|
||||||
/// We then encode the `(field, path)` into the right `columnar_key`.
|
/// We then encode the `(field, path)` into the right `columnar_key`.
|
||||||
|
|||||||
@@ -149,7 +149,7 @@ impl FieldNormReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
|
pub(crate) fn for_test(field_norms: &[u32]) -> FieldNormReader {
|
||||||
let field_norms_id = field_norms
|
let field_norms_id = field_norms
|
||||||
.iter()
|
.iter()
|
||||||
.cloned()
|
.cloned()
|
||||||
|
|||||||
@@ -1,12 +1,9 @@
|
|||||||
#![allow(deprecated)] // Remove with index sorting
|
|
||||||
|
|
||||||
use std::collections::HashSet;
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use rand::{thread_rng, Rng};
|
use rand::{thread_rng, Rng};
|
||||||
|
|
||||||
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
|
||||||
use crate::schema::*;
|
use crate::schema::*;
|
||||||
#[allow(deprecated)]
|
|
||||||
use crate::{doc, schema, Index, IndexWriter, Searcher};
|
use crate::{doc, schema, Index, IndexWriter, Searcher};
|
||||||
|
|
||||||
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ use crate::TantivyError;
|
|||||||
/// progress. Dropping the `FutureResult` does not cancel the task being executed
|
/// progress. Dropping the `FutureResult` does not cancel the task being executed
|
||||||
/// either.
|
/// either.
|
||||||
///
|
///
|
||||||
/// - In a sync context, you can call `FutureResult::wait()`. The function
|
/// - In a sync context, you can call `FutureResult::wait()`. The function does not rely on
|
||||||
/// does not rely on `block_on`.
|
/// `block_on`.
|
||||||
/// - In an async context, you can call simply use `FutureResult` as a future.
|
/// - In an async context, you can call simply use `FutureResult` as a future.
|
||||||
pub struct FutureResult<T> {
|
pub struct FutureResult<T> {
|
||||||
inner: Inner<T>,
|
inner: Inner<T>,
|
||||||
|
|||||||
@@ -49,10 +49,8 @@ fn load_metas(
|
|||||||
/// Save the index meta file.
|
/// Save the index meta file.
|
||||||
/// This operation is atomic :
|
/// This operation is atomic :
|
||||||
/// Either
|
/// Either
|
||||||
/// - it fails, in which case an error is returned,
|
/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
|
||||||
/// and the `meta.json` remains untouched,
|
/// - it succeeds, and `meta.json` is written and flushed.
|
||||||
/// - it succeeds, and `meta.json` is written
|
|
||||||
/// and flushed.
|
|
||||||
///
|
///
|
||||||
/// This method is not part of tantivy's public API
|
/// This method is not part of tantivy's public API
|
||||||
fn save_new_metas(
|
fn save_new_metas(
|
||||||
@@ -529,12 +527,12 @@ impl Index {
|
|||||||
/// `IndexWriter` on the system is accessing the index directory,
|
/// `IndexWriter` on the system is accessing the index directory,
|
||||||
/// it is safe to manually delete the lockfile.
|
/// it is safe to manually delete the lockfile.
|
||||||
///
|
///
|
||||||
/// - `num_threads` defines the number of indexing workers that
|
/// - `num_threads` defines the number of indexing workers that should work at the same time.
|
||||||
/// should work at the same time.
|
|
||||||
///
|
///
|
||||||
/// - `overall_memory_budget_in_bytes` sets the amount of memory
|
/// - `overall_memory_budget_in_bytes` sets the amount of memory allocated for all indexing
|
||||||
/// allocated for all indexing thread.
|
/// thread.
|
||||||
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
|
///
|
||||||
|
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
|
||||||
///
|
///
|
||||||
/// # Errors
|
/// # Errors
|
||||||
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
|
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
|
||||||
|
|||||||
@@ -176,10 +176,7 @@ impl SegmentMeta {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Updates the max_doc value from the `SegmentMeta`.
|
/// Updates the max_doc value from the `SegmentMeta`.
|
||||||
///
|
pub fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
|
||||||
/// This method is only used when updating `max_doc` from 0
|
|
||||||
/// as we finalize a fresh new segment.
|
|
||||||
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
|
|
||||||
assert_eq!(self.tracked.max_doc, 0);
|
assert_eq!(self.tracked.max_doc, 0);
|
||||||
assert!(self.tracked.deletes.is_none());
|
assert!(self.tracked.deletes.is_none());
|
||||||
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
|
||||||
|
|||||||
@@ -31,7 +31,6 @@ pub struct InvertedIndexReader {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl InvertedIndexReader {
|
impl InvertedIndexReader {
|
||||||
#[allow(clippy::needless_pass_by_value)] // for symmetry
|
|
||||||
pub(crate) fn new(
|
pub(crate) fn new(
|
||||||
termdict: TermDictionary,
|
termdict: TermDictionary,
|
||||||
postings_file_slice: FileSlice,
|
postings_file_slice: FileSlice,
|
||||||
@@ -71,7 +70,7 @@ impl InvertedIndexReader {
|
|||||||
&self.termdict
|
&self.termdict
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return the fields and types encoded in the dictionary in lexicographic oder.
|
/// Return the fields and types encoded in the dictionary in lexicographic order.
|
||||||
/// Only valid on JSON fields.
|
/// Only valid on JSON fields.
|
||||||
///
|
///
|
||||||
/// Notice: This requires a full scan and therefore **very expensive**.
|
/// Notice: This requires a full scan and therefore **very expensive**.
|
||||||
@@ -205,16 +204,6 @@ impl InvertedIndexReader {
|
|||||||
.transpose()
|
.transpose()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(crate) fn read_postings_no_deletes(
|
|
||||||
&self,
|
|
||||||
term: &Term,
|
|
||||||
option: IndexRecordOption,
|
|
||||||
) -> io::Result<Option<SegmentPostings>> {
|
|
||||||
self.get_term_info(term)?
|
|
||||||
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
|
|
||||||
.transpose()
|
|
||||||
}
|
|
||||||
|
|
||||||
/// Returns the number of documents containing the term.
|
/// Returns the number of documents containing the term.
|
||||||
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
|
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
|
||||||
Ok(self
|
Ok(self
|
||||||
|
|||||||
@@ -1,6 +1,7 @@
|
|||||||
use std::slice;
|
use std::slice;
|
||||||
|
|
||||||
/// Enum describing each component of a tantivy segment.
|
/// Enum describing each component of a tantivy segment.
|
||||||
|
///
|
||||||
/// Each component is stored in its own file,
|
/// Each component is stored in its own file,
|
||||||
/// using the pattern `segment_uuid`.`component_extension`,
|
/// using the pattern `segment_uuid`.`component_extension`,
|
||||||
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
|
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`
|
||||||
|
|||||||
@@ -358,7 +358,7 @@ impl SegmentReader {
|
|||||||
.map(|(mut field_name, handle)| {
|
.map(|(mut field_name, handle)| {
|
||||||
json_path_sep_to_dot(&mut field_name);
|
json_path_sep_to_dot(&mut field_name);
|
||||||
// map to canonical path, to avoid similar but different entries.
|
// map to canonical path, to avoid similar but different entries.
|
||||||
// Eventually we should just accept '.' seperated for all cases.
|
// Eventually we should just accept '.' separated for all cases.
|
||||||
let field_name = map_to_canonical
|
let field_name = map_to_canonical
|
||||||
.get(&field_name)
|
.get(&field_name)
|
||||||
.unwrap_or(&field_name)
|
.unwrap_or(&field_name)
|
||||||
@@ -478,7 +478,7 @@ pub fn merge_field_meta_data(
|
|||||||
.into_iter()
|
.into_iter()
|
||||||
.kmerge_by(|left, right| left < right)
|
.kmerge_by(|left, right| left < right)
|
||||||
// TODO: Remove allocation
|
// TODO: Remove allocation
|
||||||
.group_by(|el| (el.field_name.to_string(), el.typ))
|
.chunk_by(|el| (el.field_name.to_string(), el.typ))
|
||||||
{
|
{
|
||||||
let mut merged: FieldMetadata = group.next().unwrap();
|
let mut merged: FieldMetadata = group.next().unwrap();
|
||||||
for el in group {
|
for el in group {
|
||||||
|
|||||||
@@ -179,8 +179,7 @@ impl DeleteCursor {
|
|||||||
/// Skips operations and position it so that
|
/// Skips operations and position it so that
|
||||||
/// - either all of the delete operation currently in the queue are consume and the next get
|
/// - either all of the delete operation currently in the queue are consume and the next get
|
||||||
/// will return `None`.
|
/// will return `None`.
|
||||||
/// - the next get will return the first operation with an
|
/// - the next get will return the first operation with an `opstamp >= target_opstamp`.
|
||||||
/// `opstamp >= target_opstamp`.
|
|
||||||
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
|
||||||
// TODO Can be optimize as we work with block.
|
// TODO Can be optimize as we work with block.
|
||||||
while self.is_behind_opstamp(target_opstamp) {
|
while self.is_behind_opstamp(target_opstamp) {
|
||||||
@@ -188,7 +187,6 @@ impl DeleteCursor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#[allow(clippy::wrong_self_convention)]
|
|
||||||
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
|
||||||
self.get()
|
self.get()
|
||||||
.map(|operation| operation.opstamp < target_opstamp)
|
.map(|operation| operation.opstamp < target_opstamp)
|
||||||
|
|||||||
@@ -21,7 +21,7 @@ pub enum DocToOpstampMapping<'a> {
|
|||||||
None,
|
None,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a> DocToOpstampMapping<'a> {
|
impl DocToOpstampMapping<'_> {
|
||||||
/// Assess whether a document should be considered deleted given that it contains
|
/// Assess whether a document should be considered deleted given that it contains
|
||||||
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
|
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
|
||||||
///
|
///
|
||||||
|
|||||||
@@ -482,7 +482,7 @@ impl<D: Document> IndexWriter<D> {
|
|||||||
/// let index = Index::create_in_ram(schema.clone());
|
/// let index = Index::create_in_ram(schema.clone());
|
||||||
///
|
///
|
||||||
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
|
||||||
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
|
/// index_writer.add_document(doc!(title => "The modern Prometheus"))?;
|
||||||
/// index_writer.commit()?;
|
/// index_writer.commit()?;
|
||||||
///
|
///
|
||||||
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
/// let clear_res = index_writer.delete_all_documents().unwrap();
|
||||||
@@ -491,7 +491,7 @@ impl<D: Document> IndexWriter<D> {
|
|||||||
///
|
///
|
||||||
/// let searcher = index.reader()?.searcher();
|
/// let searcher = index.reader()?.searcher();
|
||||||
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
/// let query_parser = QueryParser::for_index(&index, vec![title]);
|
||||||
/// let query_promo = query_parser.parse_query("Promotheus")?;
|
/// let query_promo = query_parser.parse_query("Prometheus")?;
|
||||||
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
|
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
|
||||||
///
|
///
|
||||||
/// assert!(top_docs_promo.is_empty());
|
/// assert!(top_docs_promo.is_empty());
|
||||||
@@ -2093,7 +2093,7 @@ mod tests {
|
|||||||
//
|
//
|
||||||
// Take half as sample
|
// Take half as sample
|
||||||
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
|
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
|
||||||
sample.sort_by_key(|(k, _num_occurences)| *k);
|
sample.sort_by_key(|(k, _num_occurrences)| *k);
|
||||||
// sample.truncate(sample.len() / 2);
|
// sample.truncate(sample.len() / 2);
|
||||||
if !sample.is_empty() {
|
if !sample.is_empty() {
|
||||||
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
|
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
|
||||||
@@ -2102,7 +2102,7 @@ mod tests {
|
|||||||
sample
|
sample
|
||||||
.iter()
|
.iter()
|
||||||
.filter(|(id, _)| id_is_full_doc(**id))
|
.filter(|(id, _)| id_is_full_doc(**id))
|
||||||
.map(|(_id, num_occurences)| **num_occurences)
|
.map(|(_id, num_occurrences)| **num_occurrences)
|
||||||
.sum::<u64>()
|
.sum::<u64>()
|
||||||
};
|
};
|
||||||
fn gen_query_inclusive<T1: ToString, T2: ToString>(
|
fn gen_query_inclusive<T1: ToString, T2: ToString>(
|
||||||
|
|||||||
@@ -104,7 +104,7 @@ impl MergePolicy for LogMergePolicy {
|
|||||||
|
|
||||||
let mut current_max_log_size = f64::MAX;
|
let mut current_max_log_size = f64::MAX;
|
||||||
let mut levels = vec![];
|
let mut levels = vec![];
|
||||||
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
|
for (_, merge_group) in &size_sorted_segments.into_iter().chunk_by(|segment| {
|
||||||
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
|
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
|
||||||
if segment_log_size < (current_max_log_size - self.level_log_size) {
|
if segment_log_size < (current_max_log_size - self.level_log_size) {
|
||||||
// update current_max_log_size to create a new group
|
// update current_max_log_size to create a new group
|
||||||
|
|||||||
@@ -29,8 +29,8 @@ impl MergeOperationInventory {
|
|||||||
|
|
||||||
/// A `MergeOperation` has two roles.
|
/// A `MergeOperation` has two roles.
|
||||||
/// It carries all of the information required to describe a merge:
|
/// It carries all of the information required to describe a merge:
|
||||||
/// - `target_opstamp` is the opstamp up to which we want to consume the
|
/// - `target_opstamp` is the opstamp up to which we want to consume the delete queue and reflect
|
||||||
/// delete queue and reflect their deletes.
|
/// their deletes.
|
||||||
/// - `segment_ids` is the list of segment to be merged.
|
/// - `segment_ids` is the list of segment to be merged.
|
||||||
///
|
///
|
||||||
/// The second role is to ensure keep track of the fact that these
|
/// The second role is to ensure keep track of the fact that these
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ impl MergePolicy for NoMergePolicy {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
pub mod tests {
|
pub(crate) mod tests {
|
||||||
|
|
||||||
use super::*;
|
use super::*;
|
||||||
|
|
||||||
|
|||||||
@@ -673,7 +673,7 @@ mod tests {
|
|||||||
]
|
]
|
||||||
);
|
);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
get_doc_ids(vec![Term::from_field_date(
|
get_doc_ids(vec![Term::from_field_date_for_search(
|
||||||
date_field,
|
date_field,
|
||||||
DateTime::from_utc(curr_time)
|
DateTime::from_utc(curr_time)
|
||||||
)])?,
|
)])?,
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ impl PathToUnorderedId {
|
|||||||
next_id
|
next_id
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Retuns ids which reflect the lexical order of the paths.
|
/// Returns ids which reflect the lexical order of the paths.
|
||||||
///
|
///
|
||||||
/// The returned vec can be indexed with the unordered id to get the ordered id.
|
/// The returned vec can be indexed with the unordered id to get the ordered id.
|
||||||
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
|
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
|
||||||
@@ -57,7 +57,7 @@ impl PathToUnorderedId {
|
|||||||
result
|
result
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Retuns the paths so they can be queried by the ordered id (which is the index).
|
/// Returns the paths so they can be queried by the ordered id (which is the index).
|
||||||
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
|
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
|
||||||
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
|
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
|
||||||
paths.sort_unstable();
|
paths.sort_unstable();
|
||||||
|
|||||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user