Mirror of https://github.com/quickwit-oss/tantivy.git
Synced 2026-01-06 09:12:55 +00:00

Compare commits: test_order ... binggan-0.
31 Commits
| SHA1 |
|---|
| 80538175e8 |
| 8dc942e8e7 |
| c17e513377 |
| 2f5a269c70 |
| 50532260e3 |
| 8bd6eb06e6 |
| 55b0b52457 |
| 56fc56c5b9 |
| 85395d942a |
| a206c3ccd3 |
| dc5d31c116 |
| 95a4ddea3e |
| ab5125d3dc |
| 9f81d59ecd |
| c71ec8086d |
| 27be6aed91 |
| 3d1c4b313a |
| 0d4e319965 |
| 75dc3eb298 |
| 3f6d225086 |
| d8843c608c |
| 7ebcc15b17 |
| 1b4076691f |
| eab660873a |
| 232f37126e |
| 13e9885dfd |
| 56d79cb203 |
| 0f4c2e27cf |
| f9ae295507 |
| d9db5302d9 |
| e453848134 |
.github/workflows/coverage.yml (4 changes)

@@ -15,11 +15,11 @@ jobs:
 steps:
 - uses: actions/checkout@v4
 - name: Install Rust
-  run: rustup toolchain install nightly-2024-04-10 --profile minimal --component llvm-tools-preview
+  run: rustup toolchain install nightly-2024-07-01 --profile minimal --component llvm-tools-preview
 - uses: Swatinem/rust-cache@v2
 - uses: taiki-e/install-action@cargo-llvm-cov
 - name: Generate code coverage
-  run: cargo +nightly-2024-04-10 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
+  run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
 - name: Upload coverage to Codecov
   uses: codecov/codecov-action@v3
   continue-on-error: true
CHANGELOG.md (84 changes)

@@ -1,3 +1,69 @@
+Tantivy 0.23 - Unreleased
+================================
+Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21.
+
+#### Bugfixes
+- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
+- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
+- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
+
+#### Breaking API Changes
+- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)
+
+#### Features/Improvements
+- **Aggregation**
+  - Support for cardinality aggregation [#2337](https://github.com/quickwit-oss/tantivy/pull/2337) [#2446](https://github.com/quickwit-oss/tantivy/pull/2446) (@raphaelcoeffic @PSeitz)
+  - Support for extended stats aggregation [#2247](https://github.com/quickwit-oss/tantivy/pull/2247)(@giovannicuccu)
+  - Add Key::I64 and Key::U64 variants in aggregation to avoid f64 precision issues [#2468](https://github.com/quickwit-oss/tantivy/pull/2468)(@PSeitz)
+  - Faster term aggregation fetch terms [#2447](https://github.com/quickwit-oss/tantivy/pull/2447)(@PSeitz)
+  - Improve custom order deserialization [#2451](https://github.com/quickwit-oss/tantivy/pull/2451)(@PSeitz)
+  - Change AggregationLimits behavior [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
+  - lower contention on AggregationLimits [#2394](https://github.com/quickwit-oss/tantivy/pull/2394)(@PSeitz)
+  - fix postcard compatibility for top_hits, add postcard test [#2346](https://github.com/quickwit-oss/tantivy/pull/2346)(@PSeitz)
+  - reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
+  - check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
+  - Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
+- **Range Queries**
+  - Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
+  - Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
+  - modify fastfield range query heuristic [#2375](https://github.com/quickwit-oss/tantivy/pull/2375)(@trinity-1686a)
+  - add FastFieldRangeQuery for explicit range queries on fast field (for `RangeQuery` it is autodetected) [#2477](https://github.com/quickwit-oss/tantivy/pull/2477)(@PSeitz)
+
+- add format backwards-compatibility tests [#2485](https://github.com/quickwit-oss/tantivy/pull/2485)(@PSeitz)
+- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
+- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
+- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
+- feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
+
+- **Optional Index in Multivalue Columnar Index** For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case). This is alleviated by placing an optional index in the multivalued index to mark documents that have values. This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
+
+- **Performace/Memory**
+  - lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
+  - Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
+  - Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
+  - Recycling buffer in PrefixPhraseScorer [#2443](https://github.com/quickwit-oss/tantivy/pull/2443)(@fulmicoton)
+
+- **Json Type**
+  - JSON supports now all values on the root level. Previously an object was required. This enables support for flat mixed types. allow more JSON values, fix i64 special case [#2383](https://github.com/quickwit-oss/tantivy/pull/2383)(@PSeitz)
+  - add json path constructor to term [#2367](https://github.com/quickwit-oss/tantivy/pull/2367)(@PSeitz)
+
+- **QueryParser**
+  - fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
+  - improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
+  - Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)
+
+- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
+- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
+- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
+- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
+- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
+- use bingang for agg benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)(@PSeitz)
+- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
+- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
+- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
+- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
+- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
+
 Tantivy 0.22
 ================================
 
@@ -8,7 +74,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
 - Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
 - Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
 - Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
-- Fix bug occuring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
+- Fix bug occurring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
 - Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
 - Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
 - Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)
@@ -26,7 +92,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
 - Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
 - Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
 - Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
-- Support ip adresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
+- Support ip addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
 - Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
 - Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)
 
@@ -116,7 +182,7 @@ Tantivy 0.20
 - Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
 - Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
 - Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
-- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
+- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
 - **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
 - Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
 - Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)
@@ -163,13 +229,13 @@ Tantivy 0.20
 - Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
 - Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
 - sstable
-- Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
+- Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
 - New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
-- Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
+- Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
-- Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
+- Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
-- Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
+- Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
 - Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
-- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
+- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
 - Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
 - Query Parser
 - Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)
@@ -208,7 +274,7 @@ Tantivy 0.19
 - Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
 - Aggregation
 - Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
-- Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
+- Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
 - Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
 - Faster indexing
 - [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)
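The cardinality aggregation listed under 0.23 above uses the same JSON request format as the other aggregations. A minimal sketch of such a request, mirroring the `cardinality_agg` benchmark added further down in this compare (the aggregation name and field name are illustrative placeholders, not part of the changelog):

```rust
use serde_json::json;

/// Builds an illustrative cardinality aggregation request; "unique_terms" and
/// "text_many_terms" are placeholder names taken from the benchmarks in this compare.
fn cardinality_request() -> serde_json::Value {
    json!({
        "unique_terms": {
            "cardinality": { "field": "text_many_terms" }
        }
    })
}
```

The resulting value is deserialized and executed like any other aggregation request (see the `execute_agg` helper used by the benchmarks below).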
Cargo.toml (11 changes)

@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
 readme = "README.md"
 keywords = ["search", "information", "retrieval"]
 edition = "2021"
-rust-version = "1.63"
+rust-version = "1.66"
 exclude = ["benches/*.json", "benches/*.txt"]
 
 [dependencies]
@@ -29,7 +29,7 @@ tantivy-fst = "0.5"
 memmap2 = { version = "0.9.0", optional = true }
 lz4_flex = { version = "0.11", default-features = false, optional = true }
 zstd = { version = "0.13", optional = true, default-features = false }
-tempfile = { version = "3.3.0", optional = true }
+tempfile = { version = "3.12.0", optional = true }
 log = "0.4.16"
 serde = { version = "1.0.136", features = ["derive"] }
 serde_json = "1.0.79"
@@ -38,7 +38,7 @@ levenshtein_automata = "0.2.1"
 uuid = { version = "1.0.0", features = ["v4", "serde"] }
 crossbeam-channel = "0.5.4"
 rust-stemmers = "1.2.0"
-downcast-rs = "1.2.0"
+downcast-rs = "1.2.1"
 bitpacking = { version = "0.9.2", default-features = false, features = [
 "bitpacker4x",
 ] }
@@ -47,7 +47,7 @@ rustc-hash = "1.1.0"
 thiserror = "1.0.30"
 htmlescape = "0.3.1"
 fail = { version = "0.5.0", optional = true }
-time = { version = "0.3.10", features = ["serde-well-known"] }
+time = { version = "0.3.35", features = ["serde-well-known"] }
 smallvec = "1.8.0"
 rayon = "1.5.2"
 lru = "0.12.0"
@@ -64,6 +64,7 @@ tantivy-bitpacker = { version = "0.6", path = "./bitpacker" }
 common = { version = "0.7", path = "./common/", package = "tantivy-common" }
 tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-tokenizer-api" }
 sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
+hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
 futures-util = { version = "0.3.28", optional = true }
 fnv = "1.0.7"
 
@@ -71,7 +72,7 @@ fnv = "1.0.7"
 winapi = "0.3.9"
 
 [dev-dependencies]
-binggan = "0.8.0"
+binggan = "0.12.0"
 rand = "0.8.5"
 maplit = "1.0.2"
 matches = "0.1.9"
@@ -18,7 +18,7 @@ Tantivy is, in fact, strongly inspired by Lucene's design.
 
 ## Benchmark
 
-The following [benchmark](https://tantivy-search.github.io/bench/) breakdowns
+The following [benchmark](https://tantivy-search.github.io/bench/) breaks down the
 performance for different types of queries/collections.
 
 Your mileage WILL vary depending on the nature of queries and their load.
TODO.txt (2 changes)

@@ -1,7 +1,7 @@
 Make schema_builder API fluent.
 fix doc serialization and prevent compression problems
 
-u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
+u64 , etc. should return Result<Option> now that we support optional missing a column is really not an error
 remove fastfield codecs
 ditch the first_or_default trick. if it is still useful, improve its implementation.
 rename FastFieldReaders::open to load
@@ -1,3 +1,4 @@
+use binggan::plugins::PeakMemAllocPlugin;
 use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
 use rand::prelude::SliceRandom;
 use rand::rngs::StdRng;
@@ -17,7 +18,10 @@ pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
 /// runner.register("average_u64", move |index| average_u64(index));
 macro_rules! register {
     ($runner:expr, $func:ident) => {
-        $runner.register(stringify!($func), move |index| $func(index))
+        $runner.register(stringify!($func), move |index| {
+            $func(index);
+            None
+        })
     };
 }
 
@@ -42,7 +46,8 @@ fn main() {
 }
 
 fn bench_agg(mut group: InputGroup<Index>) {
-    group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
+    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
+
     register!(group, average_u64);
     register!(group, average_f64);
    register!(group, average_f64_u64);
@@ -51,10 +56,15 @@ fn bench_agg(mut group: InputGroup<Index>) {
     register!(group, percentiles_f64);
     register!(group, terms_few);
     register!(group, terms_many);
+    register!(group, terms_many_top_1000);
     register!(group, terms_many_order_by_term);
     register!(group, terms_many_with_top_hits);
     register!(group, terms_many_with_avg_sub_agg);
-    register!(group, terms_many_json_mixed_type_with_sub_agg_card);
+    register!(group, terms_many_json_mixed_type_with_avg_sub_agg);
+
+    register!(group, cardinality_agg);
+    register!(group, terms_few_with_cardinality_agg);
+
     register!(group, range_agg);
     register!(group, range_agg_with_avg_sub_agg);
     register!(group, range_agg_with_term_agg_few);
@@ -123,6 +133,33 @@ fn percentiles_f64(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
 
+fn cardinality_agg(index: &Index) {
+    let agg_req = json!({
+        "cardinality": {
+            "cardinality": {
+                "field": "text_many_terms"
+            },
+        }
+    });
+    execute_agg(index, agg_req);
+}
+fn terms_few_with_cardinality_agg(index: &Index) {
+    let agg_req = json!({
+        "my_texts": {
+            "terms": { "field": "text_few_terms" },
+            "aggs": {
+                "cardinality": {
+                    "cardinality": {
+                        "field": "text_many_terms"
+                    },
+                }
+            }
+        },
+    });
+    execute_agg(index, agg_req);
+}
+
 fn terms_few(index: &Index) {
     let agg_req = json!({
         "my_texts": { "terms": { "field": "text_few_terms" } },
@@ -135,6 +172,12 @@ fn terms_many(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
+fn terms_many_top_1000(index: &Index) {
+    let agg_req = json!({
+        "my_texts": { "terms": { "field": "text_many_terms", "size": 1000 } },
+    });
+    execute_agg(index, agg_req);
+}
 fn terms_many_order_by_term(index: &Index) {
     let agg_req = json!({
         "my_texts": { "terms": { "field": "text_many_terms", "order": { "_key": "desc" } } },
@@ -171,7 +214,7 @@ fn terms_many_with_avg_sub_agg(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
-fn terms_many_json_mixed_type_with_sub_agg_card(index: &Index) {
+fn terms_many_json_mixed_type_with_avg_sub_agg(index: &Index) {
     let agg_req = json!({
         "my_texts": {
             "terms": { "field": "json.mixed_type" },
@@ -268,6 +311,7 @@ fn range_agg_with_term_agg_many(index: &Index) {
     });
     execute_agg(index, agg_req);
 }
+
 fn histogram(index: &Index) {
     let agg_req = json!({
         "rangef64": {
@@ -368,9 +368,9 @@ mod test {
         for start_idx in 0u32..32u32 {
             output.resize(len, 0);
             bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
-            for i in 0..len {
+            for (i, output_byte) in output.iter().enumerate() {
                 let expected = (start_idx + i as u32) & mask;
-                assert_eq!(output[i], expected);
+                assert_eq!(*output_byte, expected);
             }
         }
     }
@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
 proptest = "1"
 more-asserts = "0.3.1"
 rand = "0.8"
-binggan = "0.8.1"
+binggan = "0.12.0"
 
 [[bench]]
 name = "bench_merge"
@@ -31,7 +31,7 @@ restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot
 # Columnar format
 
 This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
-The `(column_name, columne_type)` couple however uniquely identifies a column.
+The `(column_name, column_type)` couple however uniquely identifies a column.
 That couple is serialized as a column `column_key`. The format of that key is:
 `[column_name][ZERO_BYTE][column_type_header: u8]`
 
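For illustration, a column key following the layout above can be assembled as sketched below. This helper is not part of the crate's API; it only restates the format description (the crate's own implementation is the `prepare_key` function changed later in this compare):

```rust
/// Sketch only: builds a serialized column key following the
/// `[column_name][ZERO_BYTE][column_type_header: u8]` layout described above.
fn column_key(column_name: &[u8], column_type_header: u8) -> Vec<u8> {
    let mut key = Vec::with_capacity(column_name.len() + 2);
    key.extend_from_slice(column_name); // column name bytes
    key.push(0u8); // ZERO_BYTE separator
    key.push(column_type_header); // one-byte column type code
    key
}
```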
@@ -42,6 +42,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
             }
         }
         black_box(sum);
+        None
     });
     runner.register("access_first_vals", |column| {
         let mut sum = 0;
@@ -62,6 +63,7 @@ fn bench_group(mut runner: InputGroup<Column>) {
         }
 
         black_box(sum);
+        None
     });
     runner.run();
 }
@@ -41,7 +41,7 @@ fn main() {
             let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);
 
             merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
-            black_box(out);
+            Some(out.len() as u64)
         },
     );
 }
@@ -10,7 +10,7 @@
 
 # Perf and Size
 * remove alloc in `ord_to_term`
-+ multivaued range queries restrat frm the beginning all of the time.
++ multivaued range queries restart from the beginning all of the time.
 * re-add ZSTD compression for dictionaries
 no systematic monotonic mapping
 consider removing multilinear
@@ -30,7 +30,7 @@ investigate if should have better errors? io::Error is overused at the moment.
 rename rank/select in unit tests
 Review the public API via cargo doc
 go through TODOs
-remove all doc_id occurences -> row_id
+remove all doc_id occurrences -> row_id
 use the rank & select naming in unit tests branch.
 multi-linear -> blockwise
 linear codec -> simply a multiplication for the index column
@@ -43,5 +43,5 @@ isolate u128_based and uniform naming
 # Other
 fix enhance column-cli
 
-# Santa claus
+# Santa Claus
 autodetect datetime ipaddr, plug customizable tokenizer.
@@ -173,7 +173,7 @@ mod tests {
         .into();
         let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
         let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
-            panic!("Excpected a multivalued index")
+            panic!("Expected a multivalued index")
         };
         let mut output = Vec::new();
         serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
@@ -211,7 +211,7 @@ mod tests {
 
         let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
         let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
-            panic!("Excpected a multivalued index")
+            panic!("Expected a multivalued index")
         };
         let mut output = Vec::new();
         serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();
@@ -28,7 +28,7 @@ pub enum ColumnIndex {
     Full,
     Optional(OptionalIndex),
     /// In addition, at index num_rows, an extra value is added
-    /// containing the overal number of values.
+    /// containing the overall number of values.
     Multivalued(MultiValueIndex),
 }
 
@@ -174,7 +174,9 @@ impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
 }
 
 impl Set<RowId> for OptionalIndex {
-    type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
+    type SelectCursor<'b>
+        = OptionalIndexSelectCursor<'b>
+    where Self: 'b;
     // Check if value at position is not null.
     #[inline]
     fn contains(&self, row_id: RowId) -> bool {
@@ -123,7 +123,9 @@ impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
 }
 
 impl<'a> Set<u16> for DenseBlock<'a> {
-    type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
+    type SelectCursor<'b>
+        = DenseBlockSelectCursor<'a>
+    where Self: 'b;
 
     #[inline(always)]
     fn contains(&self, el: u16) -> bool {
@@ -32,7 +32,9 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
 }
 
 impl<'a> Set<u16> for SparseBlock<'a> {
-    type SelectCursor<'b> = Self where Self: 'b;
+    type SelectCursor<'b>
+        = Self
+    where Self: 'b;
 
     #[inline(always)]
     fn contains(&self, el: u16) -> bool {
@@ -110,8 +110,8 @@ fn test_null_index(data: &[bool]) {
         .map(|(pos, _val)| pos as u32)
         .collect();
     let mut select_iter = null_index.select_cursor();
-    for i in 0..orig_idx_with_value.len() {
-        assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
+    for (i, expected) in orig_idx_with_value.iter().enumerate() {
+        assert_eq!(select_iter.select(i as u32), *expected);
     }
 
     let step_size = (orig_idx_with_value.len() / 100).max(1);
@@ -34,6 +34,7 @@ fn compute_stats(vals: impl Iterator<Item = u64>) -> ColumnStats {
 fn value_iter() -> impl Iterator<Item = u64> {
     0..20_000
 }
+
 fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues {
     let mut bytes = Vec::new();
     let stats = compute_stats(data.iter().cloned());
@@ -41,10 +42,13 @@ fn get_reader_for_bench<Codec: ColumnCodec>(data: &[u64]) -> Codec::ColumnValues
     for val in data {
         codec_serializer.collect(*val);
     }
-    codec_serializer.serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes);
+    codec_serializer
+        .serialize(&stats, Box::new(data.iter().copied()).as_mut(), &mut bytes)
+        .unwrap();
+
     Codec::load(OwnedBytes::new(bytes)).unwrap()
 }
 
 fn bench_get<Codec: ColumnCodec>(b: &mut Bencher, data: &[u64]) {
     let col = get_reader_for_bench::<Codec>(data);
     b.iter(|| {
@@ -184,7 +184,7 @@ impl CompactSpaceBuilder {
 
         let mut covered_space = Vec::with_capacity(self.blanks.len());
 
-        // begining of the blanks
+        // beginning of the blanks
         if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
             if *first_blank_start != 0 {
                 covered_space.push(0..=first_blank_start - 1);
@@ -122,12 +122,11 @@ impl Line {
         line
     }
 
-    /// Returns a line that attemps to approximate a function
+    /// Returns a line that attempts to approximate a function
     /// f: i in 0..[ys.num_vals()) -> ys[i].
     ///
-    /// - The approximation is always lower than the actual value.
-    /// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
-    /// for any i in [0..ys.len()).
+    /// - The approximation is always lower than the actual value. Or more rigorously, formally
+    /// `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
     /// - It computes without panicking for any value of it.
     ///
     /// This function is only invariable by translation if all of the
@@ -25,7 +25,7 @@ use crate::{
 /// After merge, all columns belonging to the same category are coerced to
 /// the same column type.
 ///
-/// In practise, today, only Numerical colummns are coerced into one type today.
+/// In practise, today, only Numerical columns are coerced into one type today.
 ///
 /// See also [README.md].
 ///
@@ -63,11 +63,10 @@ impl From<ColumnType> for ColumnTypeCategory {
 /// `require_columns` makes it possible to ensure that some columns will be present in the
 /// resulting columnar. When a required column is a numerical column type, one of two things can
 /// happen:
-/// - If the required column type is compatible with all of the input columnar, the resulsting
-/// merged
-/// columnar will simply coerce the input column and use the required column type.
-/// - If the required column type is incompatible with one of the input columnar, the merged
-/// will fail with an InvalidData error.
+/// - If the required column type is compatible with all of the input columnar, the resulting merged
+/// columnar will simply coerce the input column and use the required column type.
+/// - If the required column type is incompatible with one of the input columnar, the merged will
+/// fail with an InvalidData error.
 ///
 /// `merge_row_order` makes it possible to remove or reorder row in the resulting
 /// `Columnar` table.
@@ -35,8 +35,7 @@ impl<'a> Ord for HeapItem<'a> {
 ///
 /// The item yield is actually a pair with
 /// - the term
-/// - a slice with the ordinal of the segments containing
-/// the terms.
+/// - a slice with the ordinal of the segments containing the terms.
 pub struct TermMerger<'a> {
     heap: BinaryHeap<HeapItem<'a>>,
     current_streamers: Vec<HeapItem<'a>>,
@@ -87,7 +87,7 @@ impl<V: SymbolValue> ColumnOperation<V> {
         minibuf
     }
 
-    /// Deserialize a colummn operation.
+    /// Deserialize a column operation.
     /// Returns None if the buffer is empty.
     ///
     /// Panics if the payload is invalid:
@@ -8,6 +8,7 @@ use std::net::Ipv6Addr;
 
 use column_operation::ColumnOperation;
 pub(crate) use column_writers::CompatibleNumericalTypes;
+use common::json_path_writer::JSON_END_OF_PATH;
 use common::CountingWriter;
 pub(crate) use serializer::ColumnarSerializer;
 use stacker::{Addr, ArenaHashMap, MemoryArena};
@@ -247,6 +248,7 @@ impl ColumnarWriter {
     }
     pub fn serialize(&mut self, num_docs: RowId, wrt: &mut dyn io::Write) -> io::Result<()> {
         let mut serializer = ColumnarSerializer::new(wrt);
+
         let mut columns: Vec<(&[u8], ColumnType, Addr)> = self
             .numerical_field_hash_map
             .iter()
@@ -260,7 +262,7 @@ impl ColumnarWriter {
         columns.extend(
             self.bytes_field_hash_map
                 .iter()
-                .map(|(term, addr)| (term, ColumnType::Bytes, addr)),
+                .map(|(column_name, addr)| (column_name, ColumnType::Bytes, addr)),
         );
         columns.extend(
             self.str_field_hash_map
@@ -287,6 +289,12 @@ impl ColumnarWriter {
         let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
         let mut symbol_byte_buffer: Vec<u8> = Vec::new();
         for (column_name, column_type, addr) in columns {
+            if column_name.contains(&JSON_END_OF_PATH) {
+                // Tantivy uses b'0' as a separator for nested fields in JSON.
+                // Column names with a b'0' are not simply ignored by the columnar (and the inverted
+                // index).
+                continue;
+            }
             match column_type {
                 ColumnType::Bool => {
                     let column_writer: ColumnWriter = self.bool_field_hash_map.read(addr);
@@ -1,6 +1,7 @@
 use std::io;
 use std::io::Write;
 
+use common::json_path_writer::JSON_END_OF_PATH;
 use common::{BinarySerializable, CountingWriter};
 use sstable::value::RangeValueWriter;
 use sstable::RangeSSTable;
@@ -18,13 +19,8 @@ pub struct ColumnarSerializer<W: io::Write> {
 /// code.
 fn prepare_key(key: &[u8], column_type: ColumnType, buffer: &mut Vec<u8>) {
     buffer.clear();
-    // Convert 0 bytes to '0' string, as 0 bytes are reserved for the end of the path.
-    if key.contains(&0u8) {
-        buffer.extend(key.iter().map(|&b| if b == 0 { b'0' } else { b }));
-    } else {
-        buffer.extend_from_slice(key);
-    }
-    buffer.push(0u8);
+    buffer.extend_from_slice(key);
+    buffer.push(JSON_END_OF_PATH);
     buffer.push(column_type.to_code());
 }
 
@@ -97,18 +93,3 @@ impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
         self.columnar_serializer.wrt.write_all(buf)
     }
 }
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    #[test]
-    fn test_prepare_key_bytes() {
-        let mut buffer: Vec<u8> = b"somegarbage".to_vec();
-        prepare_key(b"root\0child", ColumnType::Str, &mut buffer);
-        assert_eq!(buffer.len(), 12);
-        assert_eq!(&buffer[..10], b"root0child");
-        assert_eq!(buffer[10], 0u8);
-        assert_eq!(buffer[11], ColumnType::Str.to_code());
-    }
-}
@@ -17,6 +17,31 @@ impl NumericalValue {
             NumericalValue::F64(_) => NumericalType::F64,
         }
     }
+
+    /// Tries to normalize the numerical value in the following priorities:
+    /// i64, i64, f64
+    pub fn normalize(self) -> Self {
+        match self {
+            NumericalValue::U64(val) => {
+                if val <= i64::MAX as u64 {
+                    NumericalValue::I64(val as i64)
+                } else {
+                    NumericalValue::F64(val as f64)
+                }
+            }
+            NumericalValue::I64(val) => NumericalValue::I64(val),
+            NumericalValue::F64(val) => {
+                let fract = val.fract();
+                if fract == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 {
+                    NumericalValue::I64(val as i64)
+                } else if fract == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 {
+                    NumericalValue::U64(val as u64)
+                } else {
+                    NumericalValue::F64(val)
+                }
+            }
+        }
+    }
 }
 
 impl From<u64> for NumericalValue {
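A sketch of a unit test spelling out the behaviour of the `normalize` match arms above; it assumes `NumericalValue` implements `PartialEq` and `Debug`, which this compare does not show:

```rust
#[test]
fn normalize_prefers_i64() {
    // u64 values that fit in i64 are normalized to I64, otherwise they fall back to F64.
    assert_eq!(NumericalValue::U64(5).normalize(), NumericalValue::I64(5));
    assert_eq!(
        NumericalValue::U64(u64::MAX).normalize(),
        NumericalValue::F64(u64::MAX as f64)
    );
    // f64 values without a fractional part that fit in i64 become I64.
    assert_eq!(NumericalValue::F64(2.0).normalize(), NumericalValue::I64(2));
    // Everything else stays F64.
    assert_eq!(NumericalValue::F64(2.5).normalize(), NumericalValue::F64(2.5));
}
```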
@@ -9,7 +9,6 @@ documentation = "https://docs.rs/tantivy_common/"
 homepage = "https://github.com/quickwit-oss/tantivy"
 repository = "https://github.com/quickwit-oss/tantivy"
 
-
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
@@ -20,8 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
 serde = { version = "1.0.136", features = ["derive"] }
 
 [dev-dependencies]
+binggan = "0.12.0"
 proptest = "1.0.0"
 rand = "0.8.4"
 
-[features]
-unstable = [] # useful for benches.
@@ -1,39 +1,70 @@
-#![feature(test)]
-
-extern crate test;
-
-#[cfg(test)]
-mod tests {
-    use rand::seq::IteratorRandom;
-    use rand::thread_rng;
-    use tantivy_common::serialize_vint_u32;
-    use test::Bencher;
-
-    #[bench]
-    fn bench_vint(b: &mut Bencher) {
-        let vals: Vec<u32> = (0..20_000).collect();
-        b.iter(|| {
-            let mut out = 0u64;
-            for val in vals.iter().cloned() {
-                let mut buf = [0u8; 8];
-                serialize_vint_u32(val, &mut buf);
-                out += u64::from(buf[0]);
-            }
-            out
-        });
-    }
-
-    #[bench]
-    fn bench_vint_rand(b: &mut Bencher) {
-        let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
-        b.iter(|| {
-            let mut out = 0u64;
-            for val in vals.iter().cloned() {
-                let mut buf = [0u8; 8];
-                serialize_vint_u32(val, &mut buf);
-                out += u64::from(buf[0]);
-            }
-            out
-        });
-    }
-}
+use binggan::{black_box, BenchRunner};
+use rand::seq::IteratorRandom;
+use rand::thread_rng;
+use tantivy_common::{serialize_vint_u32, BitSet, TinySet};
+
+fn bench_vint() {
+    let mut runner = BenchRunner::new();
+
+    let vals: Vec<u32> = (0..20_000).collect();
+    runner.bench_function("bench_vint", move |_| {
+        let mut out = 0u64;
+        for val in vals.iter().cloned() {
+            let mut buf = [0u8; 8];
+            serialize_vint_u32(val, &mut buf);
+            out += u64::from(buf[0]);
+        }
+        black_box(out);
+        None
+    });
+
+    let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
+    runner.bench_function("bench_vint_rand", move |_| {
+        let mut out = 0u64;
+        for val in vals.iter().cloned() {
+            let mut buf = [0u8; 8];
+            serialize_vint_u32(val, &mut buf);
+            out += u64::from(buf[0]);
+        }
+        black_box(out);
+        None
+    });
+}
+
+fn bench_bitset() {
+    let mut runner = BenchRunner::new();
+
+    runner.bench_function("bench_tinyset_pop", move |_| {
+        let mut tinyset = TinySet::singleton(black_box(31u32));
+        tinyset.pop_lowest();
+        tinyset.pop_lowest();
+        tinyset.pop_lowest();
+        tinyset.pop_lowest();
+        tinyset.pop_lowest();
+        tinyset.pop_lowest();
+        black_box(tinyset);
+        None
+    });
+
+    let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
+    runner.bench_function("bench_tinyset_sum", move |_| {
+        assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
+        None
+    });
+
+    let v = [10u32, 14u32, 21u32];
+    runner.bench_function("bench_tinyarr_sum", move |_| {
+        black_box(v.iter().cloned().sum::<u32>());
+        None
+    });
+
+    runner.bench_function("bench_bitset_initialize", move |_| {
+        black_box(BitSet::with_max_value(1_000_000));
+        None
+    });
+}
+
+fn main() {
+    bench_vint();
+    bench_bitset();
+}
@@ -696,43 +696,3 @@ mod tests {
         }
     }
 }
-
-#[cfg(all(test, feature = "unstable"))]
-mod bench {
-
-    use test;
-
-    use super::{BitSet, TinySet};
-
-    #[bench]
-    fn bench_tinyset_pop(b: &mut test::Bencher) {
-        b.iter(|| {
-            let mut tinyset = TinySet::singleton(test::black_box(31u32));
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-            tinyset.pop_lowest();
-        });
-    }
-
-    #[bench]
-    fn bench_tinyset_sum(b: &mut test::Bencher) {
-        let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
-        b.iter(|| {
-            assert_eq!(test::black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
-        });
-    }
-
-    #[bench]
-    fn bench_tinyarr_sum(b: &mut test::Bencher) {
-        let v = [10u32, 14u32, 21u32];
-        b.iter(|| test::black_box(v).iter().cloned().sum::<u32>());
-    }
-
-    #[bench]
-    fn bench_bitset_initialize(b: &mut test::Bencher) {
-        b.iter(|| BitSet::with_max_value(1_000_000));
-    }
-}
common/src/bounds.rs (new file, +130 lines)
@@ -0,0 +1,130 @@
+use std::io;
+use std::ops::Bound;
+
+#[derive(Clone, Debug)]
+pub struct BoundsRange<T> {
+    pub lower_bound: Bound<T>,
+    pub upper_bound: Bound<T>,
+}
+impl<T> BoundsRange<T> {
+    pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
+        BoundsRange {
+            lower_bound,
+            upper_bound,
+        }
+    }
+    pub fn is_unbounded(&self) -> bool {
+        matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
+    }
+    pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
+        BoundsRange {
+            lower_bound: map_bound(&self.lower_bound, &transform),
+            upper_bound: map_bound(&self.upper_bound, &transform),
+        }
+    }
+
+    pub fn map_bound_res<TTo, Err>(
+        &self,
+        transform: impl Fn(&T) -> Result<TTo, Err>,
+    ) -> Result<BoundsRange<TTo>, Err> {
+        Ok(BoundsRange {
+            lower_bound: map_bound_res(&self.lower_bound, &transform)?,
+            upper_bound: map_bound_res(&self.upper_bound, &transform)?,
+        })
+    }
+
+    pub fn transform_inner<TTo>(
+        &self,
+        transform_lower: impl Fn(&T) -> TransformBound<TTo>,
+        transform_upper: impl Fn(&T) -> TransformBound<TTo>,
+    ) -> BoundsRange<TTo> {
+        BoundsRange {
+            lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
+            upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
+        }
+    }
+
+    /// Returns the first set inner value
+    pub fn get_inner(&self) -> Option<&T> {
+        inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
+    }
+}
+
+pub enum TransformBound<T> {
+    /// Overwrite the bounds
+    NewBound(Bound<T>),
+    /// Use Existing bounds with new value
+    Existing(T),
+}
+
+/// Takes a bound and transforms the inner value into a new bound via a closure.
+/// The bound variant may change by the value returned value from the closure.
+pub fn transform_bound_inner_res<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
+) -> io::Result<Bound<TTo>> {
+    use self::Bound::*;
+    Ok(match bound {
+        Excluded(ref from_val) => match transform(from_val)? {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Excluded(new_val),
+        },
+        Included(ref from_val) => match transform(from_val)? {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Included(new_val),
+        },
+        Unbounded => Unbounded,
+    })
+}
+
+/// Takes a bound and transforms the inner value into a new bound via a closure.
+/// The bound variant may change by the value returned value from the closure.
+pub fn transform_bound_inner<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> TransformBound<TTo>,
+) -> Bound<TTo> {
+    use self::Bound::*;
+    match bound {
+        Excluded(ref from_val) => match transform(from_val) {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Excluded(new_val),
+        },
+        Included(ref from_val) => match transform(from_val) {
+            TransformBound::NewBound(new_val) => new_val,
+            TransformBound::Existing(new_val) => Included(new_val),
+        },
+        Unbounded => Unbounded,
+    }
+}
+
+/// Returns the inner value of a `Bound`
+pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
+    match val {
+        Bound::Included(term) | Bound::Excluded(term) => Some(term),
+        Bound::Unbounded => None,
+    }
+}
+
+pub fn map_bound<TFrom, TTo>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> TTo,
+) -> Bound<TTo> {
+    use self::Bound::*;
+    match bound {
+        Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
+        Included(ref from_val) => Bound::Included(transform(from_val)),
+        Unbounded => Unbounded,
+    }
+}
+
+pub fn map_bound_res<TFrom, TTo, Err>(
+    bound: &Bound<TFrom>,
+    transform: impl Fn(&TFrom) -> Result<TTo, Err>,
+) -> Result<Bound<TTo>, Err> {
+    use self::Bound::*;
+    Ok(match bound {
+        Excluded(ref from_val) => Excluded(transform(from_val)?),
+        Included(ref from_val) => Included(transform(from_val)?),
+        Unbounded => Unbounded,
+    })
+}
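A small usage sketch for the new `BoundsRange` helpers (hypothetical test code, not part of the patch; it only uses the functions defined in the file above):

```rust
#[cfg(test)]
mod bounds_usage_example {
    use std::ops::Bound;

    use super::*;

    #[test]
    fn map_and_inspect_bounds() {
        let range = BoundsRange::new(Bound::Included(10u64), Bound::Excluded(20u64));
        assert!(!range.is_unbounded());
        // `get_inner` returns the first bound that carries a value.
        assert_eq!(range.get_inner(), Some(&10));

        // Transform the inner values while keeping the bound variants.
        let as_strings = range.map_bound(|val| val.to_string());
        assert!(matches!(as_strings.lower_bound, Bound::Included(ref s) if s == "10"));
        assert!(matches!(as_strings.upper_bound, Bound::Excluded(ref s) if s == "20"));
    }
}
```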
@@ -5,6 +5,7 @@ use std::ops::Deref;
 pub use byteorder::LittleEndian as Endianness;
 
 mod bitset;
+pub mod bounds;
 mod byte_count;
 mod datetime;
 pub mod file_slice;
@@ -1,3 +1,5 @@
+use std::ops::Bound;
+
 // # Searching a range on an indexed int field.
 //
 // Below is an example of creating an indexed integer field in your schema
@@ -5,7 +7,7 @@
 use tantivy::collector::Count;
 use tantivy::query::RangeQuery;
 use tantivy::schema::{Schema, INDEXED};
-use tantivy::{doc, Index, IndexWriter, Result};
+use tantivy::{doc, Index, IndexWriter, Result, Term};
 
 fn main() -> Result<()> {
     // For the sake of simplicity, this schema will only have 1 field
@@ -27,7 +29,10 @@ fn main() -> Result<()> {
     reader.reload()?;
     let searcher = reader.searcher();
     // The end is excluded i.e. here we are searching up to 1969
-    let docs_in_the_sixties = RangeQuery::new_u64("year".to_string(), 1960..1970);
+    let docs_in_the_sixties = RangeQuery::new(
+        Bound::Included(Term::from_field_u64(year_field, 1960)),
+        Bound::Excluded(Term::from_field_u64(year_field, 1970)),
+    );
     // Uses a Count collector to sum the total number of docs in the range
     let num_60s_books = searcher.search(&docs_in_the_sixties, &Count)?;
     assert_eq!(num_60s_books, 10);
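For comparison, the same `RangeQuery::new` API shown above can express a half-open range by leaving one side unbounded. This is an illustrative fragment only, reusing the `year_field`, `searcher`, and `Count` collector from the example above:

```rust
// Every book from 1970 onwards: no upper bound.
let books_since_1970 = RangeQuery::new(
    Bound::Included(Term::from_field_u64(year_field, 1970)),
    Bound::Unbounded,
);
let num_books_since_1970 = searcher.search(&books_since_1970, &Count)?;
```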
@@ -28,7 +28,7 @@ fn main() -> tantivy::Result<()> {
     let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
     index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
     index_writer.add_document(doc!(title => "Of Mice and Men"))?;
-    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
+    index_writer.add_document(doc!(title => "The modern Prometheus"))?;
     index_writer.commit()?;
 
     let reader = index.reader()?;
@@ -109,6 +109,9 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
     move |input: I| match f.parse(input) {
         Ok((input, (output, _err))) => Ok((input, output)),
         Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
+        // old versions don't understand this is uninhabited and need the empty match to help,
+        // newer versions warn because this arm is unreachable (which it is indeed).
+        #[allow(unreachable_patterns)]
         Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
     }
 }
@@ -6,12 +6,12 @@ use std::fmt::Write;
 #[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
 pub enum Occur {
     /// For a given document to be considered for scoring,
-    /// at least one of the terms with the Should or the Must
+    /// at least one of the queries with the Should or the Must
     /// Occur constraint must be within the document.
     Should,
-    /// Document without the term are excluded from the search.
+    /// Document without the queries are excluded from the search.
     Must,
-    /// Document that contain the term are excluded from the
+    /// Document that contain the query are excluded from the
     /// search.
     MustNot,
 }
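The `Occur` variants documented above are combined through `BooleanQuery`. A hedged sketch, not part of the patch; the `title` field is assumed to be an indexed text field:

```rust
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

fn diary_but_no_cocoa(title: Field) -> BooleanQuery {
    let term = |text: &str| {
        Box::new(TermQuery::new(
            Term::from_field_text(title, text),
            IndexRecordOption::Basic,
        )) as Box<dyn Query>
    };
    BooleanQuery::new(vec![
        (Occur::Must, term("diary")),    // must match "diary"
        (Occur::Should, term("girl")),   // "girl" only improves the score
        (Occur::MustNot, term("cocoa")), // documents with "cocoa" are excluded
    ])
}
```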
@@ -833,7 +833,7 @@ fn aggregate_infallible_expressions(
     if early_operand {
         err.push(LenientErrorInternal {
             pos: 0,
-            message: "Found unexpeted boolean operator before term".to_string(),
+            message: "Found unexpected boolean operator before term".to_string(),
         });
     }
 
@@ -856,7 +856,7 @@ fn aggregate_infallible_expressions(
         _ => Some(Occur::Should),
     };
     if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
-        // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
+        // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
         clauses.push(vec![(
             Some(Occur::Should),
             ast.clone().unary(Occur::MustNot),
@@ -872,7 +872,7 @@ fn aggregate_infallible_expressions(
         None => None,
     };
     if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
-        // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
+        // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
         clauses.push(vec![(
             Some(Occur::Should),
             ast.clone().unary(Occur::MustNot),
@@ -897,7 +897,7 @@ fn aggregate_infallible_expressions(
     }
     Some(BinaryOperand::Or) => {
         if last_occur == Some(Occur::MustNot) {
-            // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
+            // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
             clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
         } else {
             clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);
@@ -1057,7 +1057,7 @@ mod test {
     valid_parse("1", 1.0, "");
     valid_parse("0.234234 aaa", 0.234234f64, " aaa");
     error_parse(".3332");
-    // TODO trinity-1686a: I disagree that it should fail, I think it should succeeed,
+    // TODO trinity-1686a: I disagree that it should fail, I think it should succeed,
     // consuming only "1", and leave "." for the next thing (which will likely fail then)
     // error_parse("1.");
     error_parse("-1.");
@@ -1467,7 +1467,7 @@ mod test {
 }
 
 #[test]
-fn test_parse_query_to_triming_spaces() {
+fn test_parse_query_to_trimming_spaces() {
     test_parse_query_to_ast_helper(" abc", "abc");
     test_parse_query_to_ast_helper("abc ", "abc");
     test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");
@@ -267,7 +267,7 @@ impl fmt::Debug for UserInputAst {
     match *self {
         UserInputAst::Clause(ref subqueries) => {
             if subqueries.is_empty() {
-                // TODO this will break ast reserialization, is writing "( )" enought?
+                // TODO this will break ast reserialization, is writing "( )" enough?
                 write!(formatter, "<emptyclause>")?;
             } else {
                 write!(formatter, "(")?;
@@ -21,7 +21,10 @@ impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
 
 /// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
 /// (500MB). The limit is shared by all SegmentCollectors
-pub struct AggregationLimits {
+///
+/// The memory limit is also a guard, which tracks how much it allocated and releases it's memory
+/// on the shared counter. Cloning will create a new guard.
+pub struct AggregationLimitsGuard {
     /// The counter which is shared between the aggregations for one request.
     memory_consumption: Arc<AtomicU64>,
     /// The memory_limit in bytes
@@ -29,28 +32,41 @@ pub struct AggregationLimits {
     /// The maximum number of buckets _returned_
     /// This is not counting intermediate buckets.
     bucket_limit: u32,
+    /// Allocated memory with this guard.
+    allocated_with_the_guard: u64,
 }
-impl Clone for AggregationLimits {
+impl Clone for AggregationLimitsGuard {
     fn clone(&self) -> Self {
         Self {
             memory_consumption: Arc::clone(&self.memory_consumption),
             memory_limit: self.memory_limit,
             bucket_limit: self.bucket_limit,
+            allocated_with_the_guard: 0,
         }
     }
 }
 
-impl Default for AggregationLimits {
+impl Drop for AggregationLimitsGuard {
+    /// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
+    /// This is used to clear the segment specific memory consumption all at once.
+    fn drop(&mut self) {
+        self.memory_consumption
+            .fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
+    }
+}
+
+impl Default for AggregationLimitsGuard {
     fn default() -> Self {
         Self {
             memory_consumption: Default::default(),
             memory_limit: DEFAULT_MEMORY_LIMIT.into(),
             bucket_limit: DEFAULT_BUCKET_LIMIT,
+            allocated_with_the_guard: 0,
         }
     }
 }
 
-impl AggregationLimits {
+impl AggregationLimitsGuard {
     /// *memory_limit*
     /// memory_limit is defined in bytes.
     /// Aggregation fails when the estimated memory consumption of the aggregation is higher than
@@ -67,24 +83,15 @@ impl AggregationLimits {
             memory_consumption: Default::default(),
             memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
             bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
-        }
-    }
-
-    /// Create a new ResourceLimitGuard, that will release the memory when dropped.
-    pub fn new_guard(&self) -> ResourceLimitGuard {
-        ResourceLimitGuard {
-            // The counter which is shared between the aggregations for one request.
-            memory_consumption: Arc::clone(&self.memory_consumption),
-            // The memory_limit in bytes
-            memory_limit: self.memory_limit,
             allocated_with_the_guard: 0,
         }
     }
 
-    pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
+    pub(crate) fn add_memory_consumed(&mut self, add_num_bytes: u64) -> crate::Result<()> {
         let prev_value = self
             .memory_consumption
             .fetch_add(add_num_bytes, Ordering::Relaxed);
+        self.allocated_with_the_guard += add_num_bytes;
         validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
         Ok(())
     }
@@ -109,34 +116,6 @@ fn validate_memory_consumption(
     Ok(())
 }
-
-pub struct ResourceLimitGuard {
-    /// The counter which is shared between the aggregations for one request.
-    memory_consumption: Arc<AtomicU64>,
-    /// The memory_limit in bytes
-    memory_limit: ByteCount,
-    /// Allocated memory with this guard.
-    allocated_with_the_guard: u64,
-}
-
-impl ResourceLimitGuard {
-    pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
-        let prev_value = self
-            .memory_consumption
-            .fetch_add(add_num_bytes, Ordering::Relaxed);
-        validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
-        Ok(())
-    }
-}
-
-impl Drop for ResourceLimitGuard {
-    /// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
-    /// This is used to clear the segment specific memory consumption all at once.
-    fn drop(&mut self) {
-        self.memory_consumption
-            .fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
-    }
-}
 
 #[cfg(test)]
 mod tests {
     use crate::aggregation::tests::exec_request_with_query;
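The refactor above folds the old `ResourceLimitGuard` into `AggregationLimitsGuard`: every clone tracks what it added to the shared counter and returns exactly that amount on `Drop`. A standalone sketch of this accounting pattern, for illustration only and not the tantivy types themselves:

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

struct MemoryGuard {
    shared: Arc<AtomicU64>,
    limit: u64,
    allocated_by_this_guard: u64,
}

impl MemoryGuard {
    fn add(&mut self, bytes: u64) -> Result<(), String> {
        let prev = self.shared.fetch_add(bytes, Ordering::Relaxed);
        // Remember our own share so Drop can give back exactly this amount.
        self.allocated_by_this_guard += bytes;
        if prev + bytes > self.limit {
            return Err(format!("memory limit exceeded: {} > {}", prev + bytes, self.limit));
        }
        Ok(())
    }
}

impl Drop for MemoryGuard {
    fn drop(&mut self) {
        // Other clones keep their share; only this guard's contribution is released.
        self.shared
            .fetch_sub(self.allocated_by_this_guard, Ordering::Relaxed);
    }
}
```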
@@ -34,8 +34,9 @@ use super::bucket::{
     DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
 };
 use super::metric::{
-    AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
-    PercentilesAggregationReq, StatsAggregation, SumAggregation, TopHitsAggregation,
+    AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
+    MaxAggregation, MinAggregation, PercentilesAggregationReq, StatsAggregation, SumAggregation,
+    TopHitsAggregationReq,
 };
 
 /// The top-level aggregation request structure, which contains [`Aggregation`] and their user
@@ -159,7 +160,10 @@ pub enum AggregationVariants {
     Percentiles(PercentilesAggregationReq),
     /// Finds the top k values matching some order
     #[serde(rename = "top_hits")]
-    TopHits(TopHitsAggregation),
+    TopHits(TopHitsAggregationReq),
+    /// Computes an estimate of the number of unique values
+    #[serde(rename = "cardinality")]
+    Cardinality(CardinalityAggregationReq),
 }
 
 impl AggregationVariants {
@@ -179,6 +183,7 @@ impl AggregationVariants {
         AggregationVariants::Sum(sum) => vec![sum.field_name()],
         AggregationVariants::Percentiles(per) => vec![per.field_name()],
         AggregationVariants::TopHits(top_hits) => top_hits.field_names(),
+        AggregationVariants::Cardinality(per) => vec![per.field_name()],
     }
 }
 
@@ -203,7 +208,7 @@ impl AggregationVariants {
         _ => None,
     }
 }
-pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregation> {
+pub(crate) fn as_top_hits(&self) -> Option<&TopHitsAggregationReq> {
     match &self {
         AggregationVariants::TopHits(top_hits) => Some(top_hits),
         _ => None,
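With the new `Cardinality` variant, a request for an estimated unique-value count can be written like the other metric aggregations. A sketch only: the aggregation name and field name are made up, and it assumes the public `Aggregations` request type; the JSON shape follows the `#[serde(rename = "cardinality")]` added above.

```rust
use tantivy::aggregation::agg_req::Aggregations;

fn cardinality_request() -> Aggregations {
    serde_json::from_value(serde_json::json!({
        "unique_ids": {
            "cardinality": { "field": "string_id" }
        }
    }))
    .unwrap()
}
```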
@@ -5,16 +5,15 @@ use std::io;
|
|||||||
|
|
||||||
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
||||||
|
|
||||||
use super::agg_limits::ResourceLimitGuard;
|
|
||||||
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||||
use super::bucket::{
|
use super::bucket::{
|
||||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||||
};
|
};
|
||||||
use super::metric::{
|
use super::metric::{
|
||||||
AverageAggregation, CountAggregation, ExtendedStatsAggregation, MaxAggregation, MinAggregation,
|
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||||
StatsAggregation, SumAggregation,
|
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||||
};
|
};
|
||||||
use super::segment_agg_result::AggregationLimits;
|
use super::segment_agg_result::AggregationLimitsGuard;
|
||||||
use super::VecWithNames;
|
use super::VecWithNames;
|
||||||
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
||||||
use crate::index::SegmentReader;
|
use crate::index::SegmentReader;
|
||||||
@@ -46,7 +45,7 @@ pub struct AggregationWithAccessor {
|
|||||||
pub(crate) str_dict_column: Option<StrColumn>,
|
pub(crate) str_dict_column: Option<StrColumn>,
|
||||||
pub(crate) field_type: ColumnType,
|
pub(crate) field_type: ColumnType,
|
||||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||||
pub(crate) limits: ResourceLimitGuard,
|
pub(crate) limits: AggregationLimitsGuard,
|
||||||
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
||||||
/// Used for missing term aggregation, which checks all columns for existence.
|
/// Used for missing term aggregation, which checks all columns for existence.
|
||||||
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
||||||
@@ -69,7 +68,7 @@ impl AggregationWithAccessor {
|
|||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
||||||
let mut agg = agg.clone();
|
let mut agg = agg.clone();
|
||||||
|
|
||||||
@@ -91,7 +90,7 @@ impl AggregationWithAccessor {
|
|||||||
&limits,
|
&limits,
|
||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
limits: limits.new_guard(),
|
limits: limits.clone(),
|
||||||
missing_value_for_accessor: None,
|
missing_value_for_accessor: None,
|
||||||
str_dict_column: None,
|
str_dict_column: None,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
@@ -106,6 +105,7 @@ impl AggregationWithAccessor {
|
|||||||
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
||||||
-> crate::Result<()> {
|
-> crate::Result<()> {
|
||||||
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
||||||
|
let limits = limits.clone();
|
||||||
let res = AggregationWithAccessor {
|
let res = AggregationWithAccessor {
|
||||||
segment_ordinal,
|
segment_ordinal,
|
||||||
// TODO: We should do away with the `accessor` field altogether
|
// TODO: We should do away with the `accessor` field altogether
|
||||||
@@ -120,7 +120,7 @@ impl AggregationWithAccessor {
|
|||||||
&limits,
|
&limits,
|
||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
limits: limits.new_guard(),
|
limits,
|
||||||
missing_value_for_accessor: None,
|
missing_value_for_accessor: None,
|
||||||
str_dict_column: None,
|
str_dict_column: None,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
@@ -162,6 +162,11 @@ impl AggregationWithAccessor {
|
|||||||
field: ref field_name,
|
field: ref field_name,
|
||||||
ref missing,
|
ref missing,
|
||||||
..
|
..
|
||||||
|
})
|
||||||
|
| Cardinality(CardinalityAggregationReq {
|
||||||
|
field: ref field_name,
|
||||||
|
ref missing,
|
||||||
|
..
|
||||||
}) => {
|
}) => {
|
||||||
let str_dict_column = reader.fast_fields().str(field_name)?;
|
let str_dict_column = reader.fast_fields().str(field_name)?;
|
||||||
let allowed_column_types = [
|
let allowed_column_types = [
|
||||||
@@ -181,6 +186,8 @@ impl AggregationWithAccessor {
|
|||||||
.map(|missing| match missing {
|
.map(|missing| match missing {
|
||||||
Key::Str(_) => ColumnType::Str,
|
Key::Str(_) => ColumnType::Str,
|
||||||
Key::F64(_) => ColumnType::F64,
|
Key::F64(_) => ColumnType::F64,
|
||||||
|
Key::I64(_) => ColumnType::I64,
|
||||||
|
Key::U64(_) => ColumnType::U64,
|
||||||
})
|
})
|
||||||
.unwrap_or(ColumnType::U64);
|
.unwrap_or(ColumnType::U64);
|
||||||
let column_and_types = get_all_ff_reader_or_empty(
|
let column_and_types = get_all_ff_reader_or_empty(
|
||||||
@@ -227,14 +234,18 @@ impl AggregationWithAccessor {
|
|||||||
missing.clone()
|
missing.clone()
|
||||||
};
|
};
|
||||||
|
|
||||||
let missing_value_for_accessor = if let Some(missing) =
|
let missing_value_for_accessor =
|
||||||
missing_value_term_agg.as_ref()
|
if let Some(missing) = missing_value_term_agg.as_ref() {
|
||||||
{
|
get_missing_val_as_u64_lenient(
|
||||||
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
|
column_type,
|
||||||
} else {
|
missing,
|
||||||
None
|
agg.agg.get_fast_field_names()[0],
|
||||||
};
|
)?
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
};
|
||||||
|
|
||||||
|
let limits = limits.clone();
|
||||||
let agg = AggregationWithAccessor {
|
let agg = AggregationWithAccessor {
|
||||||
segment_ordinal,
|
segment_ordinal,
|
||||||
missing_value_for_accessor,
|
missing_value_for_accessor,
|
||||||
@@ -250,7 +261,7 @@ impl AggregationWithAccessor {
|
|||||||
)?,
|
)?,
|
||||||
agg: agg.clone(),
|
agg: agg.clone(),
|
||||||
str_dict_column: str_dict_column.clone(),
|
str_dict_column: str_dict_column.clone(),
|
||||||
limits: limits.new_guard(),
|
limits,
|
||||||
column_block_accessor: Default::default(),
|
column_block_accessor: Default::default(),
|
||||||
};
|
};
|
||||||
res.push(agg);
|
res.push(agg);
|
||||||
@@ -325,7 +336,14 @@ impl AggregationWithAccessor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_missing_val(
|
/// Get the missing value as internal u64 representation
|
||||||
|
///
|
||||||
|
/// For terms we use u64::MAX as sentinel value
|
||||||
|
/// For numerical data we convert the value into the representation
|
||||||
|
/// we would get from the fast field, when we open it as u64_lenient_for_type.
|
||||||
|
///
|
||||||
|
/// That way we can use it the same way as if it would come from the fastfield.
|
||||||
|
fn get_missing_val_as_u64_lenient(
|
||||||
column_type: ColumnType,
|
column_type: ColumnType,
|
||||||
missing: &Key,
|
missing: &Key,
|
||||||
field_name: &str,
|
field_name: &str,
|
||||||
@@ -334,9 +352,18 @@ fn get_missing_val(
|
|||||||
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
// Allow fallback to number on text fields
|
// Allow fallback to number on text fields
|
||||||
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
|
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
|
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||||
Key::F64(val) if column_type.numerical_type().is_some() => {
|
Key::F64(val) if column_type.numerical_type().is_some() => {
|
||||||
f64_to_fastfield_u64(*val, &column_type)
|
f64_to_fastfield_u64(*val, &column_type)
|
||||||
}
|
}
|
||||||
|
// NOTE: We may loose precision of the passed missing value by casting i64 and u64 to f64.
|
||||||
|
Key::I64(val) if column_type.numerical_type().is_some() => {
|
||||||
|
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||||
|
}
|
||||||
|
Key::U64(val) if column_type.numerical_type().is_some() => {
|
||||||
|
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||||
"Missing value {missing:?} for field {field_name} is not supported for column \
|
"Missing value {missing:?} for field {field_name} is not supported for column \
|
||||||
@@ -360,7 +387,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
|
|||||||
aggs: &Aggregations,
|
aggs: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: &AggregationLimits,
|
limits: &AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationsWithAccessor> {
|
) -> crate::Result<AggregationsWithAccessor> {
|
||||||
let mut aggss = Vec::new();
|
let mut aggss = Vec::new();
|
||||||
for (key, agg) in aggs.iter() {
|
for (key, agg) in aggs.iter() {
|
||||||
|
|||||||
@@ -98,6 +98,8 @@ pub enum MetricResult {
|
|||||||
Percentiles(PercentilesMetricResult),
|
Percentiles(PercentilesMetricResult),
|
||||||
/// Top hits metric result
|
/// Top hits metric result
|
||||||
TopHits(TopHitsMetricResult),
|
TopHits(TopHitsMetricResult),
|
||||||
|
/// Cardinality metric result
|
||||||
|
Cardinality(SingleMetricResult),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl MetricResult {
|
impl MetricResult {
|
||||||
@@ -116,6 +118,7 @@ impl MetricResult {
|
|||||||
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
MetricResult::TopHits(_) => Err(TantivyError::AggregationError(
|
||||||
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
AggregationError::InvalidRequest("top_hits can't be used to order".to_string()),
|
||||||
)),
|
)),
|
||||||
|
MetricResult::Cardinality(card) => Ok(card.value),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ use crate::aggregation::agg_result::AggregationResults;
|
|||||||
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
||||||
use crate::aggregation::collector::AggregationCollector;
|
use crate::aggregation::collector::AggregationCollector;
|
||||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use crate::aggregation::segment_agg_result::AggregationLimits;
|
use crate::aggregation::segment_agg_result::AggregationLimitsGuard;
|
||||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
||||||
use crate::aggregation::DistributedAggregationCollector;
|
use crate::aggregation::DistributedAggregationCollector;
|
||||||
use crate::query::{AllQuery, TermQuery};
|
use crate::query::{AllQuery, TermQuery};
|
||||||
@@ -110,6 +110,16 @@ fn test_aggregation_flushing(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
},
|
||||||
|
"cardinality_string_id":{
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id"
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"cardinality_score":{
|
||||||
|
"cardinality": {
|
||||||
|
"field": "score"
|
||||||
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
@@ -120,7 +130,7 @@ fn test_aggregation_flushing(
|
|||||||
let agg_res: AggregationResults = if use_distributed_collector {
|
let agg_res: AggregationResults = if use_distributed_collector {
|
||||||
let collector = DistributedAggregationCollector::from_aggs(
|
let collector = DistributedAggregationCollector::from_aggs(
|
||||||
agg_req.clone(),
|
agg_req.clone(),
|
||||||
AggregationLimits::default(),
|
AggregationLimitsGuard::default(),
|
||||||
);
|
);
|
||||||
|
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
@@ -136,7 +146,7 @@ fn test_aggregation_flushing(
|
|||||||
.expect("Post deserialization failed");
|
.expect("Post deserialization failed");
|
||||||
|
|
||||||
intermediate_agg_result
|
intermediate_agg_result
|
||||||
.into_final_result(agg_req, &Default::default())
|
.into_final_result(agg_req, Default::default())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
} else {
|
} else {
|
||||||
let collector = get_collector(agg_req);
|
let collector = get_collector(agg_req);
|
||||||
@@ -212,6 +222,9 @@ fn test_aggregation_flushing(
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
|
|
||||||
|
assert_eq!(res["cardinality_string_id"]["value"], 2.0);
|
||||||
|
assert_eq!(res["cardinality_score"]["value"], 80.0);
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -447,7 +460,7 @@ fn test_aggregation_level2(
|
|||||||
|
|
||||||
let searcher = reader.searcher();
|
let searcher = reader.searcher();
|
||||||
let res = searcher.search(&term_query, &collector).unwrap();
|
let res = searcher.search(&term_query, &collector).unwrap();
|
||||||
res.into_final_result(agg_req.clone(), &Default::default())
|
res.into_final_result(agg_req.clone(), Default::default())
|
||||||
.unwrap()
|
.unwrap()
|
||||||
} else {
|
} else {
|
||||||
let collector = get_collector(agg_req.clone());
|
let collector = get_collector(agg_req.clone());
|
||||||
@@ -857,7 +870,7 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
index_writer.commit().unwrap();
|
index_writer.commit().unwrap();
|
||||||
// => Segment with all boolen
|
// => Segment with all boolean
|
||||||
index_writer
|
index_writer
|
||||||
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
||||||
.unwrap();
|
.unwrap();
|
||||||
@@ -926,11 +939,11 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
},
|
},
|
||||||
"termagg": {
|
"termagg": {
|
||||||
"buckets": [
|
"buckets": [
|
||||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
{ "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } },
|
||||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
||||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
{ "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } },
|
||||||
],
|
],
|
||||||
"sum_other_doc_count": 0
|
"sum_other_doc_count": 0
|
||||||
}
|
}
|
||||||
@@ -938,3 +951,60 @@ fn test_aggregation_on_json_object_mixed_types() {
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let json = schema_builder.add_json_field("json", FAST);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||||
|
// => Segment with all values f64 numeric
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10.5})))
|
||||||
|
.unwrap();
|
||||||
|
// Gets converted to f64!
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
// => Segment with all values i64 numeric
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
|
||||||
|
// All bucket types
|
||||||
|
let agg_req_str = r#"
|
||||||
|
{
|
||||||
|
"termagg": {
|
||||||
|
"terms": {
|
||||||
|
"field": "json.mixed_price"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} "#;
|
||||||
|
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||||
|
let aggregation_collector = get_collector(agg);
|
||||||
|
let reader = index.reader().unwrap();
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
|
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
|
||||||
|
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
|
||||||
|
use pretty_assertions::assert_eq;
|
||||||
|
assert_eq!(
|
||||||
|
&aggregation_res_json,
|
||||||
|
&serde_json::json!({
|
||||||
|
"termagg": {
|
||||||
|
"buckets": [
|
||||||
|
{ "doc_count": 2, "key": 10},
|
||||||
|
{ "doc_count": 1, "key": 10.5},
|
||||||
|
],
|
||||||
|
"doc_count_error_upper_bound": 0,
|
||||||
|
"sum_other_doc_count": 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|||||||
@@ -438,7 +438,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
|||||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||||
histogram_req: &HistogramAggregation,
|
histogram_req: &HistogramAggregation,
|
||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<BucketEntry>> {
|
) -> crate::Result<Vec<BucketEntry>> {
|
||||||
// Generate the full list of buckets without gaps.
|
// Generate the full list of buckets without gaps.
|
||||||
//
|
//
|
||||||
@@ -496,7 +496,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
|||||||
is_date_agg: bool,
|
is_date_agg: bool,
|
||||||
histogram_req: &HistogramAggregation,
|
histogram_req: &HistogramAggregation,
|
||||||
sub_aggregation: &Aggregations,
|
sub_aggregation: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<Vec<BucketEntry>> {
|
) -> crate::Result<Vec<BucketEntry>> {
|
||||||
// Normalization is column type dependent.
|
// Normalization is column type dependent.
|
||||||
// The request used in the the call to final is not yet be normalized.
|
// The request used in the the call to final is not yet be normalized.
|
||||||
@@ -750,7 +750,7 @@ mod tests {
|
|||||||
agg_req,
|
agg_req,
|
||||||
&index,
|
&index,
|
||||||
None,
|
None,
|
||||||
AggregationLimits::new(Some(5_000), None),
|
AggregationLimitsGuard::new(Some(5_000), None),
|
||||||
)
|
)
|
||||||
.unwrap_err();
|
.unwrap_err();
|
||||||
assert!(res.to_string().starts_with(
|
assert!(res.to_string().starts_with(
|
||||||
|
|||||||
@@ -112,18 +112,64 @@ impl Serialize for CustomOrder {
|
|||||||
impl<'de> Deserialize<'de> for CustomOrder {
|
impl<'de> Deserialize<'de> for CustomOrder {
|
||||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||||
where D: Deserializer<'de> {
|
where D: Deserializer<'de> {
|
||||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
let value = serde_json::Value::deserialize(deserializer)?;
|
||||||
if let Some((key, value)) = map.into_iter().next() {
|
let return_err = |message, val: serde_json::Value| {
|
||||||
|
de::Error::custom(format!(
|
||||||
|
"{}, but got {}",
|
||||||
|
message,
|
||||||
|
serde_json::to_string(&val).unwrap()
|
||||||
|
))
|
||||||
|
};
|
||||||
|
|
||||||
|
match value {
|
||||||
|
serde_json::Value::Object(map) => {
|
||||||
|
if map.len() != 1 {
|
||||||
|
return Err(return_err(
|
||||||
|
"expected exactly one key-value pair in the order map",
|
||||||
|
map.into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
|
||||||
|
let (key, value) = map.into_iter().next().unwrap();
|
||||||
|
let order = serde_json::from_value(value).map_err(de::Error::custom)?;
|
||||||
|
|
||||||
Ok(CustomOrder {
|
Ok(CustomOrder {
|
||||||
target: key.as_str().into(),
|
target: key.as_str().into(),
|
||||||
order: value,
|
order,
|
||||||
})
|
})
|
||||||
} else {
|
|
||||||
Err(de::Error::custom(
|
|
||||||
"unexpected empty map in order".to_string(),
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
})
|
serde_json::Value::Array(arr) => {
|
||||||
|
if arr.is_empty() {
|
||||||
|
return Err(return_err("unexpected empty array in order", arr.into()));
|
||||||
|
}
|
||||||
|
if arr.len() != 1 {
|
||||||
|
return Err(return_err(
|
||||||
|
"only one sort order supported currently",
|
||||||
|
arr.into(),
|
||||||
|
));
|
||||||
|
}
|
||||||
|
let entry = arr.into_iter().next().unwrap();
|
||||||
|
let map = entry
|
||||||
|
.as_object()
|
||||||
|
.ok_or_else(|| return_err("expected object as sort order", entry.clone()))?;
|
||||||
|
let (key, value) = map.into_iter().next().ok_or_else(|| {
|
||||||
|
return_err(
|
||||||
|
"expected exactly one key-value pair in the order map",
|
||||||
|
entry.clone(),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let order = serde_json::from_value(value.clone()).map_err(de::Error::custom)?;
|
||||||
|
|
||||||
|
Ok(CustomOrder {
|
||||||
|
target: key.as_str().into(),
|
||||||
|
order,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
_ => Err(return_err(
|
||||||
|
"unexpected type, expected an object or array",
|
||||||
|
value,
|
||||||
|
)),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -138,11 +184,23 @@ fn custom_order_serde_test() {
|
|||||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||||
|
|
||||||
|
assert_eq!(order, order_deser);
|
||||||
|
let order_deser: CustomOrder = serde_json::from_str("[{\"_key\":\"desc\"}]").unwrap();
|
||||||
assert_eq!(order, order_deser);
|
assert_eq!(order, order_deser);
|
||||||
|
|
||||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||||
assert!(order_deser.is_err());
|
assert!(order_deser.is_err());
|
||||||
|
|
||||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||||
assert!(order_deser.is_err());
|
assert!(order_deser
|
||||||
|
.unwrap_err()
|
||||||
|
.to_string()
|
||||||
|
.contains("unexpected empty array in order"));
|
||||||
|
|
||||||
|
let order_deser: serde_json::Result<CustomOrder> =
|
||||||
|
serde_json::from_str(r#"[{"_key":"desc"},{"_key":"desc"}]"#);
|
||||||
|
assert_eq!(
|
||||||
|
order_deser.unwrap_err().to_string(),
|
||||||
|
r#"only one sort order supported currently, but got [{"_key":"desc"},{"_key":"desc"}]"#
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|||||||
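The `CustomOrder` deserializer change above means a terms-aggregation `order` is accepted both as an object and as a single-element array, while empty or multi-entry arrays are rejected with a descriptive error. A hypothetical test mirroring the assertions in that hunk:

```rust
#[test]
fn custom_order_accepts_object_and_single_element_array() {
    // Both spellings deserialize to the same order: sort by key, descending.
    let from_object: CustomOrder = serde_json::from_str(r#"{"_key": "desc"}"#).unwrap();
    let from_array: CustomOrder = serde_json::from_str(r#"[{"_key": "desc"}]"#).unwrap();
    assert_eq!(from_object, from_array);

    // Empty or multi-entry arrays are rejected.
    assert!(serde_json::from_str::<CustomOrder>("[]").is_err());
    assert!(serde_json::from_str::<CustomOrder>(r#"[{"_key":"desc"},{"_count":"asc"}]"#).is_err());
}
```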
@@ -4,7 +4,6 @@ use std::ops::Range;
|
|||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use crate::aggregation::agg_limits::ResourceLimitGuard;
|
|
||||||
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
|
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
|
||||||
use crate::aggregation::intermediate_agg_result::{
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||||
@@ -270,7 +269,7 @@ impl SegmentRangeCollector {
|
|||||||
pub(crate) fn from_req_and_validate(
|
pub(crate) fn from_req_and_validate(
|
||||||
req: &RangeAggregation,
|
req: &RangeAggregation,
|
||||||
sub_aggregation: &mut AggregationsWithAccessor,
|
sub_aggregation: &mut AggregationsWithAccessor,
|
||||||
limits: &ResourceLimitGuard,
|
limits: &mut AggregationLimitsGuard,
|
||||||
field_type: ColumnType,
|
field_type: ColumnType,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
@@ -471,7 +470,7 @@ mod tests {
|
|||||||
SegmentRangeCollector::from_req_and_validate(
|
SegmentRangeCollector::from_req_and_validate(
|
||||||
&req,
|
&req,
|
||||||
&mut Default::default(),
|
&mut Default::default(),
|
||||||
&AggregationLimits::default().new_guard(),
|
&mut AggregationLimitsGuard::default(),
|
||||||
field_type,
|
field_type,
|
||||||
0,
|
0,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -1,9 +1,10 @@
|
|||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
use std::io;
|
||||||
use std::net::Ipv6Addr;
|
use std::net::Ipv6Addr;
|
||||||
|
|
||||||
use columnar::column_values::CompactSpaceU64Accessor;
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
use columnar::{
|
use columnar::{
|
||||||
BytesColumn, ColumnType, MonotonicallyMappableToU128, MonotonicallyMappableToU64, StrColumn,
|
ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalValue,
|
||||||
};
|
};
|
||||||
use rustc_hash::FxHashMap;
|
use rustc_hash::FxHashMap;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
@@ -20,11 +21,11 @@ use crate::aggregation::intermediate_agg_result::{
|
|||||||
use crate::aggregation::segment_agg_result::{
|
use crate::aggregation::segment_agg_result::{
|
||||||
build_segment_agg_collector, SegmentAggregationCollector,
|
build_segment_agg_collector, SegmentAggregationCollector,
|
||||||
};
|
};
|
||||||
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
|
use crate::aggregation::{format_date, Key};
|
||||||
use crate::error::DataCorruption;
|
use crate::error::DataCorruption;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
/// Creates a bucket for every unique term and counts the number of occurences.
|
/// Creates a bucket for every unique term and counts the number of occurrences.
|
||||||
/// Note that doc_count in the response buckets equals term count here.
|
/// Note that doc_count in the response buckets equals term count here.
|
||||||
///
|
///
|
||||||
/// If the text is untokenized and single value, that means one term per document and therefore it
|
/// If the text is untokenized and single value, that means one term per document and therefore it
|
||||||
@@ -157,7 +158,7 @@ pub struct TermsAggregation {
|
|||||||
/// when loading the text.
|
/// when loading the text.
|
||||||
/// Special Case 1:
|
/// Special Case 1:
|
||||||
/// If we have multiple columns on one field, we need to have a union on the indices on both
|
/// If we have multiple columns on one field, we need to have a union on the indices on both
|
||||||
/// columns, to find docids without a value. That requires a special missing aggreggation.
|
/// columns, to find docids without a value. That requires a special missing aggregation.
|
||||||
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
|
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
|
||||||
/// the special missing aggregation, since there is no mechanism in the numerical column to
|
/// the special missing aggregation, since there is no mechanism in the numerical column to
|
||||||
/// add text.
|
/// add text.
|
||||||
@@ -363,7 +364,7 @@ impl SegmentTermCollector {
|
|||||||
let term_buckets = TermBuckets::default();
|
let term_buckets = TermBuckets::default();
|
||||||
|
|
||||||
if let Some(custom_order) = req.order.as_ref() {
|
if let Some(custom_order) = req.order.as_ref() {
|
||||||
// Validate sub aggregtion exists
|
// Validate sub aggregation exists
|
||||||
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
|
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
|
||||||
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||||
|
|
||||||
@@ -466,49 +467,72 @@ impl SegmentTermCollector {
|
|||||||
};
|
};
|
||||||
|
|
||||||
if self.column_type == ColumnType::Str {
|
if self.column_type == ColumnType::Str {
|
||||||
|
let fallback_dict = Dictionary::empty();
|
||||||
let term_dict = agg_with_accessor
|
let term_dict = agg_with_accessor
|
||||||
.str_dict_column
|
.str_dict_column
|
||||||
.as_ref()
|
.as_ref()
|
||||||
.cloned()
|
.map(|el| el.dictionary())
|
||||||
.unwrap_or_else(|| {
|
.unwrap_or_else(|| &fallback_dict);
|
||||||
StrColumn::wrap(BytesColumn::empty(agg_with_accessor.accessor.num_docs()))
|
let mut buffer = Vec::new();
|
||||||
});
|
|
||||||
let mut buffer = String::new();
|
// special case for missing key
|
||||||
for (term_id, doc_count) in entries {
|
if let Some(index) = entries.iter().position(|value| value.0 == u64::MAX) {
|
||||||
let intermediate_entry = into_intermediate_bucket_entry(term_id, doc_count)?;
|
let entry = entries[index];
|
||||||
// Special case for missing key
|
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)?;
|
||||||
if term_id == u64::MAX {
|
let missing_key = self
|
||||||
let missing_key = self
|
.req
|
||||||
.req
|
.missing
|
||||||
.missing
|
.as_ref()
|
||||||
.as_ref()
|
.expect("Found placeholder term_id but `missing` is None");
|
||||||
.expect("Found placeholder term_id but `missing` is None");
|
match missing_key {
|
||||||
match missing_key {
|
Key::Str(missing) => {
|
||||||
Key::Str(missing) => {
|
buffer.clear();
|
||||||
buffer.clear();
|
buffer.extend_from_slice(missing.as_bytes());
|
||||||
buffer.push_str(missing);
|
dict.insert(
|
||||||
dict.insert(
|
IntermediateKey::Str(
|
||||||
IntermediateKey::Str(buffer.to_string()),
|
String::from_utf8(buffer.to_vec())
|
||||||
intermediate_entry,
|
.expect("could not convert to String"),
|
||||||
);
|
),
|
||||||
}
|
intermediate_entry,
|
||||||
Key::F64(val) => {
|
);
|
||||||
buffer.push_str(&val.to_string());
|
|
||||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
} else {
|
Key::F64(val) => {
|
||||||
if !term_dict.ord_to_str(term_id, &mut buffer)? {
|
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||||
return Err(TantivyError::InternalError(format!(
|
}
|
||||||
"Couldn't find term_id {term_id} in dict"
|
Key::U64(val) => {
|
||||||
)));
|
dict.insert(IntermediateKey::U64(*val), intermediate_entry);
|
||||||
|
}
|
||||||
|
Key::I64(val) => {
|
||||||
|
dict.insert(IntermediateKey::I64(*val), intermediate_entry);
|
||||||
}
|
}
|
||||||
dict.insert(IntermediateKey::Str(buffer.to_string()), intermediate_entry);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
entries.swap_remove(index);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Sort by term ord
|
||||||
|
entries.sort_unstable_by_key(|bucket| bucket.0);
|
||||||
|
let mut idx = 0;
|
||||||
|
term_dict.sorted_ords_to_term_cb(
|
||||||
|
entries.iter().map(|(term_id, _)| *term_id),
|
||||||
|
|term| {
|
||||||
|
let entry = entries[idx];
|
||||||
|
let intermediate_entry = into_intermediate_bucket_entry(entry.0, entry.1)
|
||||||
|
.map_err(|err| io::Error::new(io::ErrorKind::Other, err))?;
|
||||||
|
dict.insert(
|
||||||
|
IntermediateKey::Str(
|
||||||
|
String::from_utf8(term.to_vec()).expect("could not convert to String"),
|
||||||
|
),
|
||||||
|
intermediate_entry,
|
||||||
|
);
|
||||||
|
idx += 1;
|
||||||
|
Ok(())
|
||||||
|
},
|
||||||
|
)?;
|
||||||
|
|
||||||
if self.req.min_doc_count == 0 {
|
if self.req.min_doc_count == 0 {
|
||||||
// TODO: Handle rev streaming for descending sorting by keys
|
// TODO: Handle rev streaming for descending sorting by keys
|
||||||
let mut stream = term_dict.dictionary().stream()?;
|
let mut stream = term_dict.stream()?;
|
||||||
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
|
let empty_sub_aggregation = IntermediateAggregationResults::empty_from_req(
|
||||||
agg_with_accessor.agg.sub_aggregation(),
|
agg_with_accessor.agg.sub_aggregation(),
|
||||||
);
|
);
|
||||||
@@ -567,8 +591,26 @@ impl SegmentTermCollector {
|
|||||||
} else {
|
} else {
|
||||||
for (val, doc_count) in entries {
|
for (val, doc_count) in entries {
|
||||||
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
|
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
|
||||||
let val = f64_from_fastfield_u64(val, &self.column_type);
|
if self.column_type == ColumnType::U64 {
|
||||||
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||||
|
} else if self.column_type == ColumnType::I64 {
|
||||||
|
dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
|
||||||
|
} else {
|
||||||
|
let val = f64::from_u64(val);
|
||||||
|
let val: NumericalValue = val.into();
|
||||||
|
|
||||||
|
match val.normalize() {
|
||||||
|
NumericalValue::U64(val) => {
|
||||||
|
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
NumericalValue::I64(val) => {
|
||||||
|
dict.insert(IntermediateKey::I64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
NumericalValue::F64(val) => {
|
||||||
|
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
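The new branch above keeps the column's native numeric type instead of forcing every bucket key through `f64`: `u64` columns keep the raw fast-field value, `i64` columns decode it with `i64::from_u64`, and only genuinely floating-point columns go through `f64::from_u64` and `NumericalValue::normalize`. A rough sketch of that mapping, not the exact function body, using the crate-internal `columnar` imports already seen in this diff:

use columnar::{ColumnType, MonotonicallyMappableToU64};

use crate::aggregation::Key;

// Sketch: decode a raw fast-field u64 into a key of the column's native type,
// so large u64 ids are no longer rounded by an f64 round-trip.
fn bucket_key(raw: u64, column_type: ColumnType) -> Key {
    match column_type {
        ColumnType::U64 => Key::U64(raw),
        ColumnType::I64 => Key::I64(i64::from_u64(raw)),
        // dates, bools and f64 columns keep the f64 representation
        _ => Key::F64(f64::from_u64(raw)),
    }
}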
@@ -627,7 +669,7 @@ mod tests {
|
|||||||
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
||||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||||
};
|
};
|
||||||
use crate::aggregation::AggregationLimits;
|
use crate::aggregation::AggregationLimitsGuard;
|
||||||
use crate::indexer::NoMergePolicy;
|
use crate::indexer::NoMergePolicy;
|
||||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||||
use crate::{Index, IndexWriter};
|
use crate::{Index, IndexWriter};
|
||||||
@@ -1382,7 +1424,7 @@ mod tests {
|
|||||||
agg_req,
|
agg_req,
|
||||||
&index,
|
&index,
|
||||||
None,
|
None,
|
||||||
AggregationLimits::new(Some(50_000), None),
|
AggregationLimitsGuard::new(Some(50_000), None),
|
||||||
)
|
)
|
||||||
.unwrap_err();
|
.unwrap_err();
|
||||||
assert!(res
|
assert!(res
|
||||||
@@ -1643,7 +1685,7 @@ mod tests {
|
|||||||
res["my_texts"]["buckets"][2]["key"],
|
res["my_texts"]["buckets"][2]["key"],
|
||||||
serde_json::Value::Null
|
serde_json::Value::Null
|
||||||
);
|
);
|
||||||
// text field with numner as missing fallback
|
// text field with number as missing fallback
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
|
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
|
||||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||||
@@ -1703,6 +1745,54 @@ mod tests {
|
|||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn terms_aggregation_u64_value() -> crate::Result<()> {
|
||||||
|
// Make sure that large u64 are not truncated
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let id_field = schema_builder.add_u64_field("id", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||||
|
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 9_223_372_036_854_775_807u64,
|
||||||
|
))?;
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 1_769_070_189_829_214_202u64,
|
||||||
|
))?;
|
||||||
|
index_writer.add_document(doc!(
|
||||||
|
id_field => 1_769_070_189_829_214_202u64,
|
||||||
|
))?;
|
||||||
|
index_writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"my_ids": {
|
||||||
|
"terms": {
|
||||||
|
"field": "id"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||||
|
|
||||||
|
// id field
|
||||||
|
assert_eq!(
|
||||||
|
res["my_ids"]["buckets"][0]["key"],
|
||||||
|
1_769_070_189_829_214_202u64
|
||||||
|
);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
|
||||||
|
assert_eq!(
|
||||||
|
res["my_ids"]["buckets"][1]["key"],
|
||||||
|
9_223_372_036_854_775_807u64
|
||||||
|
);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
|
||||||
|
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn terms_aggregation_missing1() -> crate::Result<()> {
|
fn terms_aggregation_missing1() -> crate::Result<()> {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
@@ -1769,7 +1859,7 @@ mod tests {
|
|||||||
res["my_texts"]["buckets"][2]["key"],
|
res["my_texts"]["buckets"][2]["key"],
|
||||||
serde_json::Value::Null
|
serde_json::Value::Null
|
||||||
);
|
);
|
||||||
// text field with numner as missing fallback
|
// text field with number as missing fallback
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
|
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
|
||||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||||
|
|||||||
@@ -70,7 +70,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
|||||||
)?;
|
)?;
|
||||||
missing_entry.sub_aggregation = res;
|
missing_entry.sub_aggregation = res;
|
||||||
}
|
}
|
||||||
|
|
||||||
entries.insert(missing.into(), missing_entry);
|
entries.insert(missing.into(), missing_entry);
|
||||||
|
|
||||||
let bucket = IntermediateBucketResult::Terms {
|
let bucket = IntermediateBucketResult::Terms {
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ use super::agg_result::AggregationResults;
|
|||||||
use super::buf_collector::BufAggregationCollector;
|
use super::buf_collector::BufAggregationCollector;
|
||||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||||
use super::segment_agg_result::{
|
use super::segment_agg_result::{
|
||||||
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
|
build_segment_agg_collector, AggregationLimitsGuard, SegmentAggregationCollector,
|
||||||
};
|
};
|
||||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
||||||
use crate::collector::{Collector, SegmentCollector};
|
use crate::collector::{Collector, SegmentCollector};
|
||||||
@@ -22,7 +22,7 @@ pub const DEFAULT_MEMORY_LIMIT: u64 = 500_000_000;
|
|||||||
/// The collector collects all aggregations by the underlying aggregation request.
|
/// The collector collects all aggregations by the underlying aggregation request.
|
||||||
pub struct AggregationCollector {
|
pub struct AggregationCollector {
|
||||||
agg: Aggregations,
|
agg: Aggregations,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl AggregationCollector {
|
impl AggregationCollector {
|
||||||
@@ -30,7 +30,7 @@ impl AggregationCollector {
|
|||||||
///
|
///
|
||||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||||
/// bucket limit)
|
/// bucket limit)
|
||||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||||
Self { agg, limits }
|
Self { agg, limits }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
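With the rename, limits are handed to the collector as an `AggregationLimitsGuard`. A short usage sketch, mirroring the test further up that builds `AggregationLimitsGuard::new(Some(50_000), None)`; the index, `searcher`, `query`, and the `string_id` field are assumptions for illustration:

use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::{AggregationCollector, AggregationLimitsGuard};

// Sketch: ~50 MB memory limit, default bucket limit.
let limits = AggregationLimitsGuard::new(Some(50_000_000), None);
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "my_texts": { "terms": { "field": "string_id" } }
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req, limits);
let agg_res = searcher.search(&query, &collector)?;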
@@ -45,7 +45,7 @@ impl AggregationCollector {
|
|||||||
/// into the final `AggregationResults` via the `into_final_result()` method.
|
/// into the final `AggregationResults` via the `into_final_result()` method.
|
||||||
pub struct DistributedAggregationCollector {
|
pub struct DistributedAggregationCollector {
|
||||||
agg: Aggregations,
|
agg: Aggregations,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl DistributedAggregationCollector {
|
impl DistributedAggregationCollector {
|
||||||
@@ -53,7 +53,7 @@ impl DistributedAggregationCollector {
|
|||||||
///
|
///
|
||||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||||
/// bucket limit)
|
/// bucket limit)
|
||||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||||
Self { agg, limits }
|
Self { agg, limits }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -115,7 +115,7 @@ impl Collector for AggregationCollector {
|
|||||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||||
) -> crate::Result<Self::Fruit> {
|
) -> crate::Result<Self::Fruit> {
|
||||||
let res = merge_fruits(segment_fruits)?;
|
let res = merge_fruits(segment_fruits)?;
|
||||||
res.into_final_result(self.agg.clone(), &self.limits)
|
res.into_final_result(self.agg.clone(), self.limits.clone())
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
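For the distributed collector the merge itself is unchanged; only the final conversion now takes the guard by value instead of by reference. A hedged sketch of merging per-node intermediate results, assuming `node_results: Vec<IntermediateAggregationResults>` and the original `agg_request` are already in hand:

use tantivy::aggregation::intermediate_agg_result::IntermediateAggregationResults;
use tantivy::aggregation::AggregationLimitsGuard;

// Sketch: fold the per-node intermediate trees into one, then resolve it once.
let mut parts = node_results.into_iter();
let mut merged = parts.next().expect("at least one node result");
for partial in parts {
    merged.merge_fruits(partial)?;
}
let final_res = merged.into_final_result(agg_request, AggregationLimitsGuard::new(None, None))?;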
@@ -147,7 +147,7 @@ impl AggregationSegmentCollector {
|
|||||||
agg: &Aggregations,
|
agg: &Aggregations,
|
||||||
reader: &SegmentReader,
|
reader: &SegmentReader,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
limits: &AggregationLimits,
|
limits: &AggregationLimitsGuard,
|
||||||
) -> crate::Result<Self> {
|
) -> crate::Result<Self> {
|
||||||
let mut aggs_with_accessor =
|
let mut aggs_with_accessor =
|
||||||
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
|
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
|
||||||
|
|||||||
@@ -22,10 +22,11 @@ use super::metric::{
|
|||||||
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
||||||
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
||||||
};
|
};
|
||||||
use super::segment_agg_result::AggregationLimits;
|
use super::segment_agg_result::AggregationLimitsGuard;
|
||||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||||
|
use crate::aggregation::metric::CardinalityCollector;
|
||||||
use crate::TantivyError;
|
use crate::TantivyError;
|
||||||
|
|
||||||
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
/// Contains the intermediate aggregation result, which is optimized to be merged with other
|
||||||
@@ -50,12 +51,18 @@ pub enum IntermediateKey {
|
|||||||
Str(String),
|
Str(String),
|
||||||
/// `f64` key
|
/// `f64` key
|
||||||
F64(f64),
|
F64(f64),
|
||||||
|
/// `i64` key
|
||||||
|
I64(i64),
|
||||||
|
/// `u64` key
|
||||||
|
U64(u64),
|
||||||
}
|
}
|
||||||
impl From<Key> for IntermediateKey {
|
impl From<Key> for IntermediateKey {
|
||||||
fn from(value: Key) -> Self {
|
fn from(value: Key) -> Self {
|
||||||
match value {
|
match value {
|
||||||
Key::Str(s) => Self::Str(s),
|
Key::Str(s) => Self::Str(s),
|
||||||
Key::F64(f) => Self::F64(f),
|
Key::F64(f) => Self::F64(f),
|
||||||
|
Key::U64(f) => Self::U64(f),
|
||||||
|
Key::I64(f) => Self::I64(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -72,7 +79,9 @@ impl From<IntermediateKey> for Key {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
IntermediateKey::F64(f) => Self::F64(f),
|
IntermediateKey::F64(f) => Self::F64(f),
|
||||||
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
|
IntermediateKey::Bool(f) => Self::U64(f as u64),
|
||||||
|
IntermediateKey::U64(f) => Self::U64(f),
|
||||||
|
IntermediateKey::I64(f) => Self::I64(f),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -85,6 +94,8 @@ impl std::hash::Hash for IntermediateKey {
|
|||||||
match self {
|
match self {
|
||||||
IntermediateKey::Str(text) => text.hash(state),
|
IntermediateKey::Str(text) => text.hash(state),
|
||||||
IntermediateKey::F64(val) => val.to_bits().hash(state),
|
IntermediateKey::F64(val) => val.to_bits().hash(state),
|
||||||
|
IntermediateKey::U64(val) => val.hash(state),
|
||||||
|
IntermediateKey::I64(val) => val.hash(state),
|
||||||
IntermediateKey::Bool(val) => val.hash(state),
|
IntermediateKey::Bool(val) => val.hash(state),
|
||||||
IntermediateKey::IpAddr(val) => val.hash(state),
|
IntermediateKey::IpAddr(val) => val.hash(state),
|
||||||
}
|
}
|
||||||
@@ -111,9 +122,9 @@ impl IntermediateAggregationResults {
|
|||||||
pub fn into_final_result(
|
pub fn into_final_result(
|
||||||
self,
|
self,
|
||||||
req: Aggregations,
|
req: Aggregations,
|
||||||
limits: &AggregationLimits,
|
mut limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResults> {
|
) -> crate::Result<AggregationResults> {
|
||||||
let res = self.into_final_result_internal(&req, limits)?;
|
let res = self.into_final_result_internal(&req, &mut limits)?;
|
||||||
let bucket_count = res.get_bucket_count() as u32;
|
let bucket_count = res.get_bucket_count() as u32;
|
||||||
if bucket_count > limits.get_bucket_limit() {
|
if bucket_count > limits.get_bucket_limit() {
|
||||||
return Err(TantivyError::AggregationError(
|
return Err(TantivyError::AggregationError(
|
||||||
@@ -130,7 +141,7 @@ impl IntermediateAggregationResults {
|
|||||||
pub(crate) fn into_final_result_internal(
|
pub(crate) fn into_final_result_internal(
|
||||||
self,
|
self,
|
||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResults> {
|
) -> crate::Result<AggregationResults> {
|
||||||
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
||||||
for (key, agg_res) in self.aggs_res.into_iter() {
|
for (key, agg_res) in self.aggs_res.into_iter() {
|
||||||
@@ -227,6 +238,9 @@ pub(crate) fn empty_from_req(req: &Aggregation) -> IntermediateAggregationResult
|
|||||||
TopHits(ref req) => IntermediateAggregationResult::Metric(
|
TopHits(ref req) => IntermediateAggregationResult::Metric(
|
||||||
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
|
IntermediateMetricResult::TopHits(TopHitsTopNComputer::new(req)),
|
||||||
),
|
),
|
||||||
|
Cardinality(_) => IntermediateAggregationResult::Metric(
|
||||||
|
IntermediateMetricResult::Cardinality(CardinalityCollector::default()),
|
||||||
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -243,7 +257,7 @@ impl IntermediateAggregationResult {
|
|||||||
pub(crate) fn into_final_result(
|
pub(crate) fn into_final_result(
|
||||||
self,
|
self,
|
||||||
req: &Aggregation,
|
req: &Aggregation,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<AggregationResult> {
|
) -> crate::Result<AggregationResult> {
|
||||||
let res = match self {
|
let res = match self {
|
||||||
IntermediateAggregationResult::Bucket(bucket) => {
|
IntermediateAggregationResult::Bucket(bucket) => {
|
||||||
@@ -291,6 +305,8 @@ pub enum IntermediateMetricResult {
|
|||||||
Sum(IntermediateSum),
|
Sum(IntermediateSum),
|
||||||
/// Intermediate top_hits result
|
/// Intermediate top_hits result
|
||||||
TopHits(TopHitsTopNComputer),
|
TopHits(TopHitsTopNComputer),
|
||||||
|
/// Intermediate cardinality result
|
||||||
|
Cardinality(CardinalityCollector),
|
||||||
}
|
}
|
||||||
|
|
||||||
impl IntermediateMetricResult {
|
impl IntermediateMetricResult {
|
||||||
@@ -324,6 +340,9 @@ impl IntermediateMetricResult {
|
|||||||
IntermediateMetricResult::TopHits(top_hits) => {
|
IntermediateMetricResult::TopHits(top_hits) => {
|
||||||
MetricResult::TopHits(top_hits.into_final_result())
|
MetricResult::TopHits(top_hits.into_final_result())
|
||||||
}
|
}
|
||||||
|
IntermediateMetricResult::Cardinality(cardinality) => {
|
||||||
|
MetricResult::Cardinality(cardinality.finalize().into())
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -372,6 +391,12 @@ impl IntermediateMetricResult {
|
|||||||
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
|
(IntermediateMetricResult::TopHits(left), IntermediateMetricResult::TopHits(right)) => {
|
||||||
left.merge_fruits(right)?;
|
left.merge_fruits(right)?;
|
||||||
}
|
}
|
||||||
|
(
|
||||||
|
IntermediateMetricResult::Cardinality(left),
|
||||||
|
IntermediateMetricResult::Cardinality(right),
|
||||||
|
) => {
|
||||||
|
left.merge_fruits(right)?;
|
||||||
|
}
|
||||||
_ => {
|
_ => {
|
||||||
panic!("incompatible fruit types in tree or missing merge_fruits handler");
|
panic!("incompatible fruit types in tree or missing merge_fruits handler");
|
||||||
}
|
}
|
||||||
@@ -407,7 +432,7 @@ impl IntermediateBucketResult {
|
|||||||
pub(crate) fn into_final_bucket_result(
|
pub(crate) fn into_final_bucket_result(
|
||||||
self,
|
self,
|
||||||
req: &Aggregation,
|
req: &Aggregation,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketResult> {
|
) -> crate::Result<BucketResult> {
|
||||||
match self {
|
match self {
|
||||||
IntermediateBucketResult::Range(range_res) => {
|
IntermediateBucketResult::Range(range_res) => {
|
||||||
@@ -571,7 +596,7 @@ impl IntermediateTermBucketResult {
|
|||||||
self,
|
self,
|
||||||
req: &TermsAggregation,
|
req: &TermsAggregation,
|
||||||
sub_aggregation_req: &Aggregations,
|
sub_aggregation_req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketResult> {
|
) -> crate::Result<BucketResult> {
|
||||||
let req = TermsAggregationInternal::from_req(req);
|
let req = TermsAggregationInternal::from_req(req);
|
||||||
let mut buckets: Vec<BucketEntry> = self
|
let mut buckets: Vec<BucketEntry> = self
|
||||||
@@ -698,7 +723,7 @@ impl IntermediateHistogramBucketEntry {
|
|||||||
pub(crate) fn into_final_bucket_entry(
|
pub(crate) fn into_final_bucket_entry(
|
||||||
self,
|
self,
|
||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<BucketEntry> {
|
) -> crate::Result<BucketEntry> {
|
||||||
Ok(BucketEntry {
|
Ok(BucketEntry {
|
||||||
key_as_string: None,
|
key_as_string: None,
|
||||||
@@ -733,7 +758,7 @@ impl IntermediateRangeBucketEntry {
|
|||||||
req: &Aggregations,
|
req: &Aggregations,
|
||||||
_range_req: &RangeAggregation,
|
_range_req: &RangeAggregation,
|
||||||
column_type: Option<ColumnType>,
|
column_type: Option<ColumnType>,
|
||||||
limits: &AggregationLimits,
|
limits: &mut AggregationLimitsGuard,
|
||||||
) -> crate::Result<RangeBucketEntry> {
|
) -> crate::Result<RangeBucketEntry> {
|
||||||
let mut range_bucket_entry = RangeBucketEntry {
|
let mut range_bucket_entry = RangeBucketEntry {
|
||||||
key: self.key.into(),
|
key: self.key.into(),
|
||||||
@@ -835,7 +860,7 @@ mod tests {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_intermediat_tree_with_ranges(
|
fn get_intermediate_tree_with_ranges(
|
||||||
data: &[(String, u64, String, u64)],
|
data: &[(String, u64, String, u64)],
|
||||||
) -> IntermediateAggregationResults {
|
) -> IntermediateAggregationResults {
|
||||||
let mut map = HashMap::new();
|
let mut map = HashMap::new();
|
||||||
@@ -871,18 +896,18 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_1() {
|
fn test_merge_fruits_tree_1() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 60, "1900".to_string(), 30),
|
("red".to_string(), 60, "1900".to_string(), 30),
|
||||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
tree_left.merge_fruits(tree_right).unwrap();
|
tree_left.merge_fruits(tree_right).unwrap();
|
||||||
|
|
||||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 110, "1900".to_string(), 55),
|
("red".to_string(), 110, "1900".to_string(), 55),
|
||||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||||
]);
|
]);
|
||||||
@@ -892,18 +917,18 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_2() {
|
fn test_merge_fruits_tree_2() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 60, "1900".to_string(), 30),
|
("red".to_string(), 60, "1900".to_string(), 30),
|
||||||
("green".to_string(), 25, "1900".to_string(), 50),
|
("green".to_string(), 25, "1900".to_string(), 50),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
tree_left.merge_fruits(tree_right).unwrap();
|
tree_left.merge_fruits(tree_right).unwrap();
|
||||||
|
|
||||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 110, "1900".to_string(), 55),
|
("red".to_string(), 110, "1900".to_string(), 55),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
("green".to_string(), 25, "1900".to_string(), 50),
|
("green".to_string(), 25, "1900".to_string(), 50),
|
||||||
@@ -914,7 +939,7 @@ mod tests {
|
|||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_merge_fruits_tree_empty() {
|
fn test_merge_fruits_tree_empty() {
|
||||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||||
("red".to_string(), 50, "1900".to_string(), 25),
|
("red".to_string(), 50, "1900".to_string(), 25),
|
||||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||||
]);
|
]);
|
||||||
|
|||||||
473 src/aggregation/metric/cardinality.rs (new file)
@@ -0,0 +1,473 @@
|
|||||||
|
use std::collections::hash_map::DefaultHasher;
|
||||||
|
use std::hash::{BuildHasher, Hasher};
|
||||||
|
|
||||||
|
use columnar::column_values::CompactSpaceU64Accessor;
|
||||||
|
use columnar::Dictionary;
|
||||||
|
use common::f64_to_u64;
|
||||||
|
use hyperloglogplus::{HyperLogLog, HyperLogLogPlus};
|
||||||
|
use rustc_hash::FxHashSet;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
|
use crate::aggregation::agg_req_with_accessor::{
|
||||||
|
AggregationWithAccessor, AggregationsWithAccessor,
|
||||||
|
};
|
||||||
|
use crate::aggregation::intermediate_agg_result::{
|
||||||
|
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateMetricResult,
|
||||||
|
};
|
||||||
|
use crate::aggregation::segment_agg_result::SegmentAggregationCollector;
|
||||||
|
use crate::aggregation::*;
|
||||||
|
use crate::TantivyError;
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
struct BuildSaltedHasher {
|
||||||
|
salt: u8,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl BuildHasher for BuildSaltedHasher {
|
||||||
|
type Hasher = DefaultHasher;
|
||||||
|
|
||||||
|
fn build_hasher(&self) -> Self::Hasher {
|
||||||
|
let mut hasher = DefaultHasher::new();
|
||||||
|
hasher.write_u8(self.salt);
|
||||||
|
|
||||||
|
hasher
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// # Cardinality
|
||||||
|
///
|
||||||
|
/// The cardinality aggregation allows for computing an estimate
|
||||||
|
/// of the number of different values in a data set based on the
|
||||||
|
/// HyperLogLog++ algorithm. This is particularly useful for understanding the
|
||||||
|
/// uniqueness of values in a large dataset where counting each unique value
|
||||||
|
/// individually would be computationally expensive.
|
||||||
|
///
|
||||||
|
/// For example, you might use a cardinality aggregation to estimate the number
|
||||||
|
/// of unique visitors to a website by aggregating on a field that contains
|
||||||
|
/// user IDs or session IDs.
|
||||||
|
///
|
||||||
|
/// To use the cardinality aggregation, you'll need to provide a field to
|
||||||
|
/// aggregate on. The following example demonstrates a request for the cardinality
|
||||||
|
/// of the "user_id" field:
|
||||||
|
///
|
||||||
|
/// ```JSON
|
||||||
|
/// {
|
||||||
|
/// "cardinality": {
|
||||||
|
/// "field": "user_id"
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// This request will return an estimate of the number of unique values in the
|
||||||
|
/// "user_id" field.
|
||||||
|
///
|
||||||
|
/// ## Missing Values
|
||||||
|
///
|
||||||
|
/// The `missing` parameter defines how documents that are missing a value should be treated.
|
||||||
|
/// By default, documents without a value for the specified field are ignored. However, you can
|
||||||
|
/// specify a default value for these documents using the `missing` parameter. This can be useful
|
||||||
|
/// when you want to include documents with missing values in the aggregation.
|
||||||
|
///
|
||||||
|
/// For example, the following request treats documents with missing values in the "user_id"
|
||||||
|
/// field as if they had a value of "unknown":
|
||||||
|
///
|
||||||
|
/// ```JSON
|
||||||
|
/// {
|
||||||
|
/// "cardinality": {
|
||||||
|
/// "field": "user_id",
|
||||||
|
/// "missing": "unknown"
|
||||||
|
/// }
|
||||||
|
/// }
|
||||||
|
/// ```
|
||||||
|
///
|
||||||
|
/// # Estimation Accuracy
|
||||||
|
///
|
||||||
|
/// The cardinality aggregation provides an approximate count, which is usually
|
||||||
|
/// accurate within a small error range. This trade-off allows for efficient
|
||||||
|
/// computation even on very large datasets.
|
||||||
|
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
|
||||||
|
pub struct CardinalityAggregationReq {
|
||||||
|
/// The field name to compute the cardinality on.
|
||||||
|
pub field: String,
|
||||||
|
/// The missing parameter defines how documents that are missing a value should be treated.
|
||||||
|
/// By default they will be ignored but it is also possible to treat them as if they had a
|
||||||
|
/// value. Examples in JSON format:
|
||||||
|
/// { "field": "my_numbers", "missing": "10.0" }
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none", default)]
|
||||||
|
pub missing: Option<Key>,
|
||||||
|
}
|
||||||
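The same request expressed from Rust rather than raw JSON might look roughly like the sketch below; the `user_id` fast field, the `searcher`, and the `query` are assumptions for illustration, and the returned value is an estimate, not an exact count:

use tantivy::aggregation::agg_req::Aggregations;
use tantivy::aggregation::{AggregationCollector, AggregationLimitsGuard};

// Sketch: estimate the number of distinct user_id values (HyperLogLog++ based),
// counting documents without the field under the "unknown" bucket.
let agg_req: Aggregations = serde_json::from_value(serde_json::json!({
    "unique_users": { "cardinality": { "field": "user_id", "missing": "unknown" } }
}))
.unwrap();
let collector = AggregationCollector::from_aggs(agg_req, AggregationLimitsGuard::new(None, None));
let res = searcher.search(&query, &collector)?;
// After serializing `res` to JSON, the estimate is at res["unique_users"]["value"].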
|
|
||||||
|
impl CardinalityAggregationReq {
|
||||||
|
/// Creates a new [`CardinalityAggregationReq`] instance from a field name.
|
||||||
|
pub fn from_field_name(field_name: String) -> Self {
|
||||||
|
Self {
|
||||||
|
field: field_name,
|
||||||
|
missing: None,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
/// Returns the field name the aggregation is computed on.
|
||||||
|
pub fn field_name(&self) -> &str {
|
||||||
|
&self.field
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, PartialEq)]
|
||||||
|
pub(crate) struct SegmentCardinalityCollector {
|
||||||
|
cardinality: CardinalityCollector,
|
||||||
|
entries: FxHashSet<u64>,
|
||||||
|
column_type: ColumnType,
|
||||||
|
accessor_idx: usize,
|
||||||
|
missing: Option<Key>,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentCardinalityCollector {
|
||||||
|
pub fn from_req(column_type: ColumnType, accessor_idx: usize, missing: &Option<Key>) -> Self {
|
||||||
|
Self {
|
||||||
|
cardinality: CardinalityCollector::new(column_type as u8),
|
||||||
|
entries: Default::default(),
|
||||||
|
column_type,
|
||||||
|
accessor_idx,
|
||||||
|
missing: missing.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn fetch_block_with_field(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_accessor: &mut AggregationWithAccessor,
|
||||||
|
) {
|
||||||
|
if let Some(missing) = agg_accessor.missing_value_for_accessor {
|
||||||
|
agg_accessor.column_block_accessor.fetch_block_with_missing(
|
||||||
|
docs,
|
||||||
|
&agg_accessor.accessor,
|
||||||
|
missing,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
agg_accessor
|
||||||
|
.column_block_accessor
|
||||||
|
.fetch_block(docs, &agg_accessor.accessor);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn into_intermediate_metric_result(
|
||||||
|
mut self,
|
||||||
|
agg_with_accessor: &AggregationWithAccessor,
|
||||||
|
) -> crate::Result<IntermediateMetricResult> {
|
||||||
|
if self.column_type == ColumnType::Str {
|
||||||
|
let fallback_dict = Dictionary::empty();
|
||||||
|
let dict = agg_with_accessor
|
||||||
|
.str_dict_column
|
||||||
|
.as_ref()
|
||||||
|
.map(|el| el.dictionary())
|
||||||
|
.unwrap_or_else(|| &fallback_dict);
|
||||||
|
let mut has_missing = false;
|
||||||
|
|
||||||
|
// TODO: replace FxHashSet with something that allows iterating in order
|
||||||
|
// (e.g. sparse bitvec)
|
||||||
|
let mut term_ids = Vec::new();
|
||||||
|
for term_ord in self.entries.into_iter() {
|
||||||
|
if term_ord == u64::MAX {
|
||||||
|
has_missing = true;
|
||||||
|
} else {
|
||||||
|
// we can reasonably exclude values above u32::MAX
|
||||||
|
term_ids.push(term_ord as u32);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
term_ids.sort_unstable();
|
||||||
|
dict.sorted_ords_to_term_cb(term_ids.iter().map(|term| *term as u64), |term| {
|
||||||
|
self.cardinality.sketch.insert_any(&term);
|
||||||
|
Ok(())
|
||||||
|
})?;
|
||||||
|
if has_missing {
|
||||||
|
// Replace missing with the actual value provided
|
||||||
|
let missing_key = self
|
||||||
|
.missing
|
||||||
|
.as_ref()
|
||||||
|
.expect("Found sentinel value u64::MAX for term_ord but `missing` is not set");
|
||||||
|
match missing_key {
|
||||||
|
Key::Str(missing) => {
|
||||||
|
self.cardinality.sketch.insert_any(&missing);
|
||||||
|
}
|
||||||
|
Key::F64(val) => {
|
||||||
|
let val = f64_to_u64(*val);
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
Key::U64(val) => {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
Key::I64(val) => {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(IntermediateMetricResult::Cardinality(self.cardinality))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl SegmentAggregationCollector for SegmentCardinalityCollector {
|
||||||
|
fn add_intermediate_aggregation_result(
|
||||||
|
self: Box<Self>,
|
||||||
|
agg_with_accessor: &AggregationsWithAccessor,
|
||||||
|
results: &mut IntermediateAggregationResults,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let name = agg_with_accessor.aggs.keys[self.accessor_idx].to_string();
|
||||||
|
let agg_with_accessor = &agg_with_accessor.aggs.values[self.accessor_idx];
|
||||||
|
|
||||||
|
let intermediate_result = self.into_intermediate_metric_result(agg_with_accessor)?;
|
||||||
|
results.push(
|
||||||
|
name,
|
||||||
|
IntermediateAggregationResult::Metric(intermediate_result),
|
||||||
|
)?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect(
|
||||||
|
&mut self,
|
||||||
|
doc: crate::DocId,
|
||||||
|
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
self.collect_block(&[doc], agg_with_accessor)
|
||||||
|
}
|
||||||
|
|
||||||
|
fn collect_block(
|
||||||
|
&mut self,
|
||||||
|
docs: &[crate::DocId],
|
||||||
|
agg_with_accessor: &mut AggregationsWithAccessor,
|
||||||
|
) -> crate::Result<()> {
|
||||||
|
let bucket_agg_accessor = &mut agg_with_accessor.aggs.values[self.accessor_idx];
|
||||||
|
self.fetch_block_with_field(docs, bucket_agg_accessor);
|
||||||
|
|
||||||
|
let col_block_accessor = &bucket_agg_accessor.column_block_accessor;
|
||||||
|
if self.column_type == ColumnType::Str {
|
||||||
|
for term_ord in col_block_accessor.iter_vals() {
|
||||||
|
self.entries.insert(term_ord);
|
||||||
|
}
|
||||||
|
} else if self.column_type == ColumnType::IpAddr {
|
||||||
|
let compact_space_accessor = bucket_agg_accessor
|
||||||
|
.accessor
|
||||||
|
.values
|
||||||
|
.clone()
|
||||||
|
.downcast_arc::<CompactSpaceU64Accessor>()
|
||||||
|
.map_err(|_| {
|
||||||
|
TantivyError::AggregationError(
|
||||||
|
crate::aggregation::AggregationError::InternalError(
|
||||||
|
"Type mismatch: Could not downcast to CompactSpaceU64Accessor"
|
||||||
|
.to_string(),
|
||||||
|
),
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
for val in col_block_accessor.iter_vals() {
|
||||||
|
let val: u128 = compact_space_accessor.compact_to_u128(val as u32);
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
for val in col_block_accessor.iter_vals() {
|
||||||
|
self.cardinality.sketch.insert_any(&val);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Clone, Debug, Serialize, Deserialize)]
|
||||||
|
/// The cardinality collector used during segment collection and for merging results.
|
||||||
|
pub struct CardinalityCollector {
|
||||||
|
sketch: HyperLogLogPlus<u64, BuildSaltedHasher>,
|
||||||
|
}
|
||||||
|
impl Default for CardinalityCollector {
|
||||||
|
fn default() -> Self {
|
||||||
|
Self::new(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl PartialEq for CardinalityCollector {
|
||||||
|
fn eq(&self, _other: &Self) -> bool {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl CardinalityCollector {
|
||||||
|
/// Compute the final cardinality estimate.
|
||||||
|
pub fn finalize(self) -> Option<f64> {
|
||||||
|
Some(self.sketch.clone().count().trunc())
|
||||||
|
}
|
||||||
|
|
||||||
|
fn new(salt: u8) -> Self {
|
||||||
|
Self {
|
||||||
|
sketch: HyperLogLogPlus::new(16, BuildSaltedHasher { salt }).unwrap(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) fn merge_fruits(&mut self, right: CardinalityCollector) -> crate::Result<()> {
|
||||||
|
self.sketch.merge(&right.sketch).map_err(|err| {
|
||||||
|
TantivyError::AggregationError(AggregationError::InternalError(format!(
|
||||||
|
"Error while merging cardinality {err:?}"
|
||||||
|
)))
|
||||||
|
})?;
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
|
||||||
|
mod tests {
|
||||||
|
|
||||||
|
use std::net::IpAddr;
|
||||||
|
use std::str::FromStr;
|
||||||
|
|
||||||
|
use columnar::MonotonicallyMappableToU64;
|
||||||
|
|
||||||
|
use crate::aggregation::agg_req::Aggregations;
|
||||||
|
use crate::aggregation::tests::{exec_request, get_test_index_from_terms};
|
||||||
|
use crate::schema::{IntoIpv6Addr, Schema, FAST};
|
||||||
|
use crate::Index;
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test_empty_index() -> crate::Result<()> {
|
||||||
|
let values = vec![];
|
||||||
|
let index = get_test_index_from_terms(false, &values)?;
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 0.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test_single_segment() -> crate::Result<()> {
|
||||||
|
cardinality_aggregation_test_merge_segment(true)
|
||||||
|
}
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_test() -> crate::Result<()> {
|
||||||
|
cardinality_aggregation_test_merge_segment(false)
|
||||||
|
}
|
||||||
|
fn cardinality_aggregation_test_merge_segment(merge_segments: bool) -> crate::Result<()> {
|
||||||
|
let segment_and_terms = vec![
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["termb"],
|
||||||
|
vec!["termc"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["terma"],
|
||||||
|
vec!["termb"],
|
||||||
|
vec!["terma"],
|
||||||
|
];
|
||||||
|
let index = get_test_index_from_terms(merge_segments, &segment_and_terms)?;
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "string_id",
|
||||||
|
}
|
||||||
|
},
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 3.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_u64() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let id_field = schema_builder.add_u64_field("id", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
writer.add_document(doc!(id_field => 1u64))?;
|
||||||
|
writer.add_document(doc!(id_field => 2u64))?;
|
||||||
|
writer.add_document(doc!(id_field => 3u64))?;
|
||||||
|
writer.add_document(doc!())?;
|
||||||
|
writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "id",
|
||||||
|
"missing": 0u64
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 4.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_ip_addr() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let field = schema_builder.add_ip_addr_field("ip_field", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
// IpV6 loopback
|
||||||
|
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
|
||||||
|
writer.add_document(doc!(field=>IpAddr::from_str("::1").unwrap().into_ipv6_addr()))?;
|
||||||
|
// IpV4
|
||||||
|
writer.add_document(
|
||||||
|
doc!(field=>IpAddr::from_str("127.0.0.1").unwrap().into_ipv6_addr()),
|
||||||
|
)?;
|
||||||
|
writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "ip_field"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 2.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn cardinality_aggregation_json() -> crate::Result<()> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let field = schema_builder.add_json_field("json", FAST);
|
||||||
|
let index = Index::create_in_ram(schema_builder.build());
|
||||||
|
{
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
writer.add_document(doc!(field => json!({"value": false})))?;
|
||||||
|
writer.add_document(doc!(field => json!({"value": true})))?;
|
||||||
|
writer.add_document(doc!(field => json!({"value": i64::from_u64(0u64)})))?;
|
||||||
|
writer.add_document(doc!(field => json!({"value": i64::from_u64(1u64)})))?;
|
||||||
|
writer.commit()?;
|
||||||
|
}
|
||||||
|
|
||||||
|
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||||
|
"cardinality": {
|
||||||
|
"cardinality": {
|
||||||
|
"field": "json.value"
|
||||||
|
},
|
||||||
|
}
|
||||||
|
}))
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let res = exec_request(agg_req, &index)?;
|
||||||
|
assert_eq!(res["cardinality"]["value"], 4.0);
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -17,6 +17,7 @@
|
|||||||
//! - [Percentiles](PercentilesAggregationReq)
|
//! - [Percentiles](PercentilesAggregationReq)
|
||||||
|
|
||||||
mod average;
|
mod average;
|
||||||
|
mod cardinality;
|
||||||
mod count;
|
mod count;
|
||||||
mod extended_stats;
|
mod extended_stats;
|
||||||
mod max;
|
mod max;
|
||||||
@@ -29,6 +30,7 @@ mod top_hits;
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
|
|
||||||
pub use average::*;
|
pub use average::*;
|
||||||
|
pub use cardinality::*;
|
||||||
pub use count::*;
|
pub use count::*;
|
||||||
pub use extended_stats::*;
|
pub use extended_stats::*;
|
||||||
pub use max::*;
|
pub use max::*;
|
||||||
|
|||||||
@@ -163,8 +163,8 @@ impl PartialEq for PercentilesCollector {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn format_percentil(percentil: f64) -> String {
|
fn format_percentile(percentile: f64) -> String {
|
||||||
let mut out = percentil.to_string();
|
let mut out = percentile.to_string();
|
||||||
// Slightly silly way to format trailing decimals
|
// Slightly silly way to format trailing decimals
|
||||||
if !out.contains('.') {
|
if !out.contains('.') {
|
||||||
out.push_str(".0");
|
out.push_str(".0");
|
||||||
@@ -197,7 +197,7 @@ impl PercentilesCollector {
|
|||||||
let values = if req.keyed {
|
let values = if req.keyed {
|
||||||
PercentileValues::HashMap(
|
PercentileValues::HashMap(
|
||||||
iter_quantile_and_values
|
iter_quantile_and_values
|
||||||
.map(|(val, quantil)| (format_percentil(val), quantil))
|
.map(|(val, quantil)| (format_percentile(val), quantil))
|
||||||
.collect(),
|
.collect(),
|
||||||
)
|
)
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -89,7 +89,7 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
|||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
|
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Default)]
|
||||||
pub struct TopHitsAggregation {
|
pub struct TopHitsAggregationReq {
|
||||||
sort: Vec<KeyOrder>,
|
sort: Vec<KeyOrder>,
|
||||||
size: usize,
|
size: usize,
|
||||||
from: Option<usize>,
|
from: Option<usize>,
|
||||||
@@ -139,7 +139,7 @@ impl<'de> Deserialize<'de> for KeyOrder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Tranform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
|
// Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
|
||||||
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
|
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
|
||||||
// Replace `*` glob with `.*` regex
|
// Replace `*` glob with `.*` regex
|
||||||
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
|
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
|
||||||
@@ -164,7 +164,7 @@ fn unsupported_err(parameter: &str) -> crate::Result<()> {
|
|||||||
))
|
))
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TopHitsAggregation {
|
impl TopHitsAggregationReq {
|
||||||
/// Validate and resolve field retrieval parameters
|
/// Validate and resolve field retrieval parameters
|
||||||
pub fn validate_and_resolve_field_names(
|
pub fn validate_and_resolve_field_names(
|
||||||
&mut self,
|
&mut self,
|
||||||
@@ -431,7 +431,7 @@ impl Eq for DocSortValuesAndFields {}
|
|||||||
/// The TopHitsCollector used for collecting over segments and merging results.
|
/// The TopHitsCollector used for collecting over segments and merging results.
|
||||||
#[derive(Clone, Serialize, Deserialize, Debug)]
|
#[derive(Clone, Serialize, Deserialize, Debug)]
|
||||||
pub struct TopHitsTopNComputer {
|
pub struct TopHitsTopNComputer {
|
||||||
req: TopHitsAggregation,
|
req: TopHitsAggregationReq,
|
||||||
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
top_n: TopNComputer<DocSortValuesAndFields, DocAddress, false>,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -443,7 +443,7 @@ impl std::cmp::PartialEq for TopHitsTopNComputer {
|
|||||||
|
|
||||||
impl TopHitsTopNComputer {
|
impl TopHitsTopNComputer {
|
||||||
/// Create a new TopHitsCollector
|
/// Create a new TopHitsCollector
|
||||||
pub fn new(req: &TopHitsAggregation) -> Self {
|
pub fn new(req: &TopHitsAggregationReq) -> Self {
|
||||||
Self {
|
Self {
|
||||||
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
top_n: TopNComputer::new(req.size + req.from.unwrap_or(0)),
|
||||||
req: req.clone(),
|
req: req.clone(),
|
||||||
@@ -496,7 +496,7 @@ pub(crate) struct TopHitsSegmentCollector {
|
|||||||
|
|
||||||
impl TopHitsSegmentCollector {
|
impl TopHitsSegmentCollector {
|
||||||
pub fn from_req(
|
pub fn from_req(
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
accessor_idx: usize,
|
accessor_idx: usize,
|
||||||
segment_ordinal: SegmentOrdinal,
|
segment_ordinal: SegmentOrdinal,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
@@ -509,7 +509,7 @@ impl TopHitsSegmentCollector {
|
|||||||
fn into_top_hits_collector(
|
fn into_top_hits_collector(
|
||||||
self,
|
self,
|
||||||
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
value_accessors: &HashMap<String, Vec<DynamicColumn>>,
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
) -> TopHitsTopNComputer {
|
) -> TopHitsTopNComputer {
|
||||||
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
let mut top_hits_computer = TopHitsTopNComputer::new(req);
|
||||||
let top_results = self.top_n.into_vec();
|
let top_results = self.top_n.into_vec();
|
||||||
@@ -532,7 +532,7 @@ impl TopHitsSegmentCollector {
|
|||||||
fn collect_with(
|
fn collect_with(
|
||||||
&mut self,
|
&mut self,
|
||||||
doc_id: crate::DocId,
|
doc_id: crate::DocId,
|
||||||
req: &TopHitsAggregation,
|
req: &TopHitsAggregationReq,
|
||||||
accessors: &[(Column<u64>, ColumnType)],
|
accessors: &[(Column<u64>, ColumnType)],
|
||||||
) -> crate::Result<()> {
|
) -> crate::Result<()> {
|
||||||
let sorts: Vec<DocValueAndOrder> = req
|
let sorts: Vec<DocValueAndOrder> = req
|
||||||
|
|||||||
@@ -44,11 +44,14 @@
|
|||||||
//! - [Metric](metric)
|
//! - [Metric](metric)
|
||||||
//! - [Average](metric::AverageAggregation)
|
//! - [Average](metric::AverageAggregation)
|
||||||
//! - [Stats](metric::StatsAggregation)
|
//! - [Stats](metric::StatsAggregation)
|
||||||
|
//! - [ExtendedStats](metric::ExtendedStatsAggregation)
|
||||||
//! - [Min](metric::MinAggregation)
|
//! - [Min](metric::MinAggregation)
|
||||||
//! - [Max](metric::MaxAggregation)
|
//! - [Max](metric::MaxAggregation)
|
||||||
//! - [Sum](metric::SumAggregation)
|
//! - [Sum](metric::SumAggregation)
|
||||||
//! - [Count](metric::CountAggregation)
|
//! - [Count](metric::CountAggregation)
|
||||||
//! - [Percentiles](metric::PercentilesAggregationReq)
|
//! - [Percentiles](metric::PercentilesAggregationReq)
|
||||||
|
//! - [Cardinality](metric::CardinalityAggregationReq)
|
||||||
|
//! - [TopHits](metric::TopHitsAggregationReq)
|
||||||
//!
|
//!
|
||||||
//! # Example
|
//! # Example
|
||||||
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
//! Compute the average metric, by building [`agg_req::Aggregations`], which is built from an
|
||||||
@@ -145,7 +148,7 @@ mod agg_tests;
|
|||||||
|
|
||||||
use core::fmt;
|
use core::fmt;
|
||||||
|
|
||||||
pub use agg_limits::AggregationLimits;
|
pub use agg_limits::AggregationLimitsGuard;
|
||||||
pub use collector::{
|
pub use collector::{
|
||||||
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
||||||
DEFAULT_BUCKET_LIMIT,
|
DEFAULT_BUCKET_LIMIT,
|
||||||
@@ -333,10 +336,16 @@ pub type SerializedKey = String;
|
|||||||
|
|
||||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
|
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
|
||||||
/// The key to identify a bucket.
|
/// The key to identify a bucket.
|
||||||
|
///
|
||||||
|
/// The order is important, with serde untagged, that we try to deserialize into i64 first.
|
||||||
#[serde(untagged)]
|
#[serde(untagged)]
|
||||||
pub enum Key {
|
pub enum Key {
|
||||||
/// String key
|
/// String key
|
||||||
Str(String),
|
Str(String),
|
||||||
|
/// `i64` key
|
||||||
|
I64(i64),
|
||||||
|
/// `u64` key
|
||||||
|
U64(u64),
|
||||||
/// `f64` key
|
/// `f64` key
|
||||||
F64(f64),
|
F64(f64),
|
||||||
}
|
}
|
||||||
@@ -347,6 +356,8 @@ impl std::hash::Hash for Key {
|
|||||||
match self {
|
match self {
|
||||||
Key::Str(text) => text.hash(state),
|
Key::Str(text) => text.hash(state),
|
||||||
Key::F64(val) => val.to_bits().hash(state),
|
Key::F64(val) => val.to_bits().hash(state),
|
||||||
|
Key::U64(val) => val.hash(state),
|
||||||
|
Key::I64(val) => val.hash(state),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -366,6 +377,8 @@ impl Display for Key {
|
|||||||
match self {
|
match self {
|
||||||
Key::Str(val) => f.write_str(val),
|
Key::Str(val) => f.write_str(val),
|
||||||
Key::F64(val) => f.write_str(&val.to_string()),
|
Key::F64(val) => f.write_str(&val.to_string()),
|
||||||
|
Key::U64(val) => f.write_str(&val.to_string()),
|
||||||
|
Key::I64(val) => f.write_str(&val.to_string()),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
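The note on ordering above matters because `Key` is `#[serde(untagged)]`: serde tries the variants top to bottom, so the integer variants have to come before `F64` or whole numbers would be parsed as floats and large ids would lose precision. A small illustrative sketch of the expected behaviour (assumed, not taken from the test suite):

use tantivy::aggregation::Key;

// Sketch: untagged deserialization picks the first variant that fits.
let neg: Key = serde_json::from_value(serde_json::json!(-2)).unwrap();
assert!(matches!(neg, Key::I64(-2)));

// Too large for i64, so it falls through to the u64 variant unchanged.
let big: Key = serde_json::from_value(serde_json::json!(18_446_744_073_709_551_615u64)).unwrap();
assert!(matches!(big, Key::U64(_)));

// Fractional values still end up as f64.
let frac: Key = serde_json::from_value(serde_json::json!(1.5)).unwrap();
assert!(matches!(frac, Key::F64(_)));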
@@ -445,7 +458,7 @@ mod tests {
|
|||||||
agg_req: Aggregations,
|
agg_req: Aggregations,
|
||||||
index: &Index,
|
index: &Index,
|
||||||
query: Option<(&str, &str)>,
|
query: Option<(&str, &str)>,
|
||||||
limits: AggregationLimits,
|
limits: AggregationLimitsGuard,
|
||||||
) -> crate::Result<Value> {
|
) -> crate::Result<Value> {
|
||||||
let collector = AggregationCollector::from_aggs(agg_req, limits);
|
let collector = AggregationCollector::from_aggs(agg_req, limits);
|
||||||
|
|
||||||
|
|||||||
@@ -5,7 +5,7 @@
|
|||||||
|
|
||||||
use std::fmt::Debug;
|
use std::fmt::Debug;
|
||||||
|
|
||||||
pub(crate) use super::agg_limits::AggregationLimits;
|
pub(crate) use super::agg_limits::AggregationLimitsGuard;
|
||||||
use super::agg_req::AggregationVariants;
|
use super::agg_req::AggregationVariants;
|
||||||
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
|
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
|
||||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||||
@@ -16,7 +16,10 @@ use super::metric::{
|
|||||||
SumAggregation,
|
SumAggregation,
|
||||||
};
|
};
|
||||||
use crate::aggregation::bucket::TermMissingAgg;
|
use crate::aggregation::bucket::TermMissingAgg;
|
||||||
use crate::aggregation::metric::{SegmentExtendedStatsCollector, TopHitsSegmentCollector};
|
use crate::aggregation::metric::{
|
||||||
|
CardinalityAggregationReq, SegmentCardinalityCollector, SegmentExtendedStatsCollector,
|
||||||
|
TopHitsSegmentCollector,
|
||||||
|
};
|
||||||
|
|
||||||
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
|
pub(crate) trait SegmentAggregationCollector: CollectorClone + Debug {
|
||||||
fn add_intermediate_aggregation_result(
|
fn add_intermediate_aggregation_result(
|
||||||
@@ -100,7 +103,7 @@ pub(crate) fn build_single_agg_segment_collector(
|
|||||||
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
||||||
range_req,
|
range_req,
|
||||||
&mut req.sub_aggregation,
|
&mut req.sub_aggregation,
|
||||||
&req.limits,
|
&mut req.limits,
|
||||||
req.field_type,
|
req.field_type,
|
||||||
accessor_idx,
|
accessor_idx,
|
||||||
)?)),
|
)?)),
|
||||||
@@ -169,6 +172,9 @@ pub(crate) fn build_single_agg_segment_collector(
|
|||||||
accessor_idx,
|
accessor_idx,
|
||||||
req.segment_ordinal,
|
req.segment_ordinal,
|
||||||
))),
|
))),
|
||||||
|
Cardinality(CardinalityAggregationReq { missing, .. }) => Ok(Box::new(
|
||||||
|
SegmentCardinalityCollector::from_req(req.field_type, accessor_idx, missing),
|
||||||
|
)),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -15,11 +15,6 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
|||||||
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
||||||
/// should be reversed, which is useful for achieving for example largest-first
|
/// should be reversed, which is useful for achieving for example largest-first
|
||||||
/// semantics without having to wrap the feature in a `Reverse`.
|
/// semantics without having to wrap the feature in a `Reverse`.
|
||||||
///
|
|
||||||
/// WARNING: equality is not what you would expect here.
|
|
||||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
|
||||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
|
||||||
/// struct is never public.
|
|
||||||
#[derive(Clone, Default, Serialize, Deserialize)]
|
#[derive(Clone, Default, Serialize, Deserialize)]
|
||||||
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
||||||
/// The feature of the document. In practice, this is
|
/// The feature of the document. In practice, this is
|
||||||
|
|||||||
@@ -3,7 +3,6 @@ use std::marker::PhantomData;
|
|||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
|
||||||
use columnar::ColumnValues;
|
use columnar::ColumnValues;
|
||||||
use serde::de::DeserializeOwned;
|
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
|
||||||
use super::Collector;
|
use super::Collector;
|
||||||
@@ -790,7 +789,7 @@ impl<Score, D, const R: bool> From<TopNComputerDeser<Score, D, R>> for TopNCompu
|
|||||||
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
|
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
|
||||||
where
|
where
|
||||||
Score: PartialOrd + Clone,
|
Score: PartialOrd + Clone,
|
||||||
D: Serialize + DeserializeOwned + Ord + Clone,
|
D: Ord,
|
||||||
{
|
{
|
||||||
/// Create a new `TopNComputer`.
|
/// Create a new `TopNComputer`.
|
||||||
/// Internally it will allocate a buffer of size `2 * top_n`.
|
/// Internally it will allocate a buffer of size `2 * top_n`.
|
||||||
|
|||||||
80 src/compat_tests.rs (new file)
@@ -0,0 +1,80 @@
|
|||||||
|
use std::path::PathBuf;
|
||||||
|
|
||||||
|
use schema::*;
|
||||||
|
|
||||||
|
use crate::*;
|
||||||
|
|
||||||
|
fn create_index(path: &str) {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let label = schema_builder.add_text_field("label", TEXT | STORED);
|
||||||
|
let date = schema_builder.add_date_field("date", INDEXED | STORED);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
std::fs::create_dir_all(path).unwrap();
|
||||||
|
let index = Index::create_in_dir(path, schema).unwrap();
|
||||||
|
let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap();
|
||||||
|
index_writer
|
||||||
|
.add_document(doc!(label => "dateformat", date => DateTime::from_timestamp_nanos(123456)))
|
||||||
|
.unwrap();
|
||||||
|
index_writer.commit().unwrap();
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
/// Writes an Index for the current INDEX_FORMAT_VERSION to disk.
|
||||||
|
fn create_format() {
|
||||||
|
let version = INDEX_FORMAT_VERSION.to_string();
|
||||||
|
let file_path = path_for_version(&version);
|
||||||
|
if PathBuf::from(file_path.clone()).exists() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
create_index(&file_path);
|
||||||
|
}
|
||||||
|
|
||||||
|
fn path_for_version(version: &str) -> String {
|
||||||
|
format!("./tests/compat_tests_data/index_v{}/", version)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// feature flag quickwit uses a different dictionary type
|
||||||
|
#[test]
|
||||||
|
#[cfg(not(feature = "quickwit"))]
|
||||||
|
fn test_format_6() {
|
||||||
|
let path = path_for_version("6");
|
||||||
|
|
||||||
|
let index = Index::open_in_dir(path).expect("Failed to open index");
|
||||||
|
// dates are truncated to Microseconds in v6
|
||||||
|
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(feature = "quickwit"))]
|
||||||
|
fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
|
||||||
|
use collector::TopDocs;
|
||||||
|
let reader = index.reader().expect("Failed to create reader");
|
||||||
|
let searcher = reader.searcher();
|
||||||
|
|
||||||
|
let schema = index.schema();
|
||||||
|
let label_field = schema.get_field("label").expect("Field 'label' not found");
|
||||||
|
let query_parser = query::QueryParser::for_index(index, vec![label_field]);
|
||||||
|
|
||||||
|
let query = query_parser
|
||||||
|
.parse_query("dateformat")
|
||||||
|
.expect("Failed to parse query");
|
||||||
|
let top_docs = searcher
|
||||||
|
.search(&query, &TopDocs::with_limit(1))
|
||||||
|
.expect("Search failed");
|
||||||
|
|
||||||
|
assert_eq!(top_docs.len(), 1, "Expected 1 search result");
|
||||||
|
|
||||||
|
let doc_address = top_docs[0].1;
|
||||||
|
let retrieved_doc: TantivyDocument = searcher
|
||||||
|
.doc(doc_address)
|
||||||
|
.expect("Failed to retrieve document");
|
||||||
|
|
||||||
|
let date_field = schema.get_field("date").expect("Field 'date' not found");
|
||||||
|
let date_value = retrieved_doc
|
||||||
|
.get_first(date_field)
|
||||||
|
.expect("Date field not found in document")
|
||||||
|
.as_datetime()
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
|
||||||
|
assert_eq!(date_value, expected,);
|
||||||
|
}
|
||||||
@@ -100,7 +100,7 @@ impl Executor {
 
     /// Spawn a task on the pool, returning a future completing on task success.
     ///
-    /// If the task panic, returns `Err(())`.
+    /// If the task panics, returns `Err(())`.
     #[cfg(feature = "quickwit")]
     pub fn spawn_blocking<T: Send + 'static>(
         &self,
@@ -1,10 +1,10 @@
-use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
+use common::json_path_writer::{JSON_END_OF_PATH, JSON_PATH_SEGMENT_SEP};
 use common::{replace_in_place, JsonPathWriter};
 use rustc_hash::FxHashMap;
 
 use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
 use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
-use crate::schema::Type;
+use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
 use crate::time::format_description::well_known::Rfc3339;
 use crate::time::{OffsetDateTime, UtcOffset};
 use crate::tokenizer::TextAnalyzer;
@@ -83,6 +83,9 @@ fn index_json_object<'a, V: Value<'a>>(
     positions_per_path: &mut IndexingPositionsPerPath,
 ) {
     for (json_path_segment, json_value_visitor) in json_visitor {
+        if json_path_segment.as_bytes().contains(&JSON_END_OF_PATH) {
+            continue;
+        }
         json_path_writer.push(json_path_segment);
         index_json_value(
             doc,
@@ -186,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
                 ctx.path_to_unordered_id
                     .get_or_allocate_unordered_id(json_path_writer.as_str()),
             );
+            let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
             term_buffer.append_type_and_fast_value(val);
             postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
         }
@@ -236,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
 /// Tries to infer a JSON type from a string and append it to the term.
 ///
 /// The term must be json + JSON path.
-pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
+pub fn convert_to_fast_value_and_append_to_json_term(
+    mut term: Term,
+    phrase: &str,
+    truncate_date_for_search: bool,
+) -> Option<Term> {
     assert_eq!(
         term.value()
             .as_json_value_bytes()
@@ -247,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
         "JSON value bytes should be empty"
     );
     if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
-        let dt_utc = dt.to_offset(UtcOffset::UTC);
-        term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
+        let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
+        if truncate_date_for_search {
+            dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
+        }
+        term.append_type_and_fast_value(dt);
         return Some(term);
     }
     if let Ok(i64_val) = str::parse::<i64>(phrase) {
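The hunks above route JSON date values through `truncate(DATE_TIME_PRECISION_INDEXED)` so that indexed terms and query terms agree on precision. A small sketch of the truncation itself using the public `DateTime` / `DateTimePrecision` API; the crate-internal precision constant is not accessible, so a concrete precision is assumed here:

```rust
use tantivy::{DateTime, DateTimePrecision};

fn main() {
    // Nanosecond-precision input...
    let dt = DateTime::from_timestamp_nanos(1_700_000_000_123_456_789);
    // ...truncated the same way the JSON indexing path now truncates values
    // before writing the term (Milliseconds is only an illustrative choice).
    let truncated = dt.truncate(DateTimePrecision::Milliseconds);
    println!("{truncated:?}");
}
```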
@@ -102,10 +102,8 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
 ///
 /// There are currently two implementations of `Directory`
 ///
-/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
-/// should be your default choice.
-/// - The [`RamDirectory`][crate::directory::RamDirectory], which
-/// should be used mostly for tests.
+/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this should be your default choice.
+/// - The [`RamDirectory`][crate::directory::RamDirectory], which should be used mostly for tests.
 pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
     /// Opens a file and returns a boxed `FileHandle`.
     ///
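A short sketch of the two `Directory` choices mentioned in the reworded doc, assuming the usual `MmapDirectory::open` / `Index::create_in_ram` entry points; the on-disk path is an arbitrary example:

```rust
use tantivy::directory::MmapDirectory;
use tantivy::schema::{Schema, TEXT};
use tantivy::Index;

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("body", TEXT);
    let schema = schema_builder.build();

    // Default choice: an MmapDirectory backed by an existing directory on disk.
    std::fs::create_dir_all("/tmp/tantivy-example")?;
    let dir = MmapDirectory::open("/tmp/tantivy-example")?;
    let _on_disk = Index::open_or_create(dir, schema.clone())?;

    // Mostly for tests: an in-memory RamDirectory, via the convenience constructor.
    let _in_ram = Index::create_in_ram(schema);
    Ok(())
}
```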
@@ -25,10 +25,9 @@ impl FacetReader {
     /// Creates a new `FacetReader`.
     ///
     /// A facet reader just wraps :
-    /// - a `MultiValuedFastFieldReader` that makes it possible to
-    /// access the list of facet ords for a given document.
-    /// - a `TermDictionary` that helps associating a facet to
-    /// an ordinal and vice versa.
+    /// - a `MultiValuedFastFieldReader` that makes it possible to access the list of facet ords for
+    /// a given document.
+    /// - a `TermDictionary` that helps associating a facet to an ordinal and vice versa.
     pub fn new(facet_column: StrColumn) -> FacetReader {
         FacetReader { facet_column }
     }
@@ -942,10 +942,10 @@ mod tests {
 
         let numbers = [100, 200, 300];
         let test_range = |range: RangeInclusive<u64>| {
-            let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
+            let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
             let mut vec = vec![];
             field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
-            assert_eq!(vec.len(), expexted_count);
+            assert_eq!(vec.len(), expected_count);
         };
         test_range(50..=50);
         test_range(150..=150);
@@ -1020,10 +1020,10 @@ mod tests {
 
         let numbers = [1000, 1001, 1003];
         let test_range = |range: RangeInclusive<u64>| {
-            let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
+            let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
             let mut vec = vec![];
             field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
-            assert_eq!(vec.len(), expexted_count);
+            assert_eq!(vec.len(), expected_count);
         };
         let test_range_variant = |start, stop| {
             let start_range = start..=stop;
@@ -70,13 +70,13 @@ impl FastFieldReaders {
     ///
     /// This function transforms `attributes.color` into a column key to be used in the `columnar`.
     ///
-    /// The logic works as follows, first we identify which field is targetted by calling
+    /// The logic works as follows, first we identify which field is targeted by calling
     /// `schema.find_field(..)`. This method will attempt to split the user splied fast field
     /// name by non-escaped dots, and find the longest matching schema field name.
     /// In our case, it would return the (attribute_field, "color").
     ///
     /// If no field is found, but a dynamic field is supplied, then we
-    /// will simply assuem the user is targetting the dynamic field. (This feature is used in
+    /// will simply assume the user is targeting the dynamic field. (This feature is used in
     /// Quickwit.)
     ///
    /// We then encode the `(field, path)` into the right `columnar_key`.
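A hedged sketch of the dotted-path lookup described above, assuming `FastFieldReaders::str` accepts the user-supplied `attributes.color` path and that `doc!` accepts a `serde_json` value for a JSON field (as the tests in this diff do); `serde_json` is assumed to be available as a dependency:

```rust
use tantivy::schema::{Schema, FAST};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let attributes = schema_builder.add_json_field("attributes", FAST);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 20_000_000)?;
    writer.add_document(doc!(attributes => serde_json::json!({"color": "red"})))?;
    writer.commit()?;

    let searcher = index.reader()?.searcher();
    let fast_fields = searcher.segment_reader(0).fast_fields();
    // "attributes.color" is resolved via schema.find_field(..) to (attributes, "color")
    // and then mapped to the columnar key described above.
    let column_opt = fast_fields.str("attributes.color")?;
    assert!(column_opt.is_some());
    Ok(())
}
```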
@@ -11,8 +11,8 @@ use crate::TantivyError;
 /// progress. Dropping the `FutureResult` does not cancel the task being executed
 /// either.
 ///
-/// - In a sync context, you can call `FutureResult::wait()`. The function
-/// does not rely on `block_on`.
+/// - In a sync context, you can call `FutureResult::wait()`. The function does not rely on
+/// `block_on`.
 /// - In an async context, you can call simply use `FutureResult` as a future.
 pub struct FutureResult<T> {
     inner: Inner<T>,
@@ -49,10 +49,8 @@ fn load_metas(
 /// Save the index meta file.
 /// This operation is atomic :
 /// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it succeeds, and `meta.json` is written
-/// and flushed.
+/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
+/// - it succeeds, and `meta.json` is written and flushed.
 ///
 /// This method is not part of tantivy's public API
 fn save_new_metas(
@@ -529,12 +527,12 @@ impl Index {
     /// `IndexWriter` on the system is accessing the index directory,
     /// it is safe to manually delete the lockfile.
     ///
-    /// - `num_threads` defines the number of indexing workers that
-    /// should work at the same time.
+    /// - `num_threads` defines the number of indexing workers that should work at the same time.
     ///
-    /// - `overall_memory_budget_in_bytes` sets the amount of memory
-    /// allocated for all indexing thread.
-    /// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
+    /// - `overall_memory_budget_in_bytes` sets the amount of memory allocated for all indexing
+    /// thread.
+    ///
+    /// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
     ///
     /// # Errors
     /// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
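A small illustration of the two knobs documented above; the numbers are arbitrary and only show how the overall budget is split per thread:

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 4 indexing workers sharing a 240 MB budget: each thread gets
    // overall_memory_budget_in_bytes / num_threads = 60 MB before it flushes a segment.
    let mut index_writer = index.writer_with_num_threads(4, 240_000_000)?;
    index_writer.add_document(doc!(title => "budget example"))?;
    index_writer.commit()?;
    Ok(())
}
```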
@@ -176,10 +176,7 @@ impl SegmentMeta {
     }
 
     /// Updates the max_doc value from the `SegmentMeta`.
-    ///
-    /// This method is only used when updating `max_doc` from 0
-    /// as we finalize a fresh new segment.
-    pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
+    pub fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
         assert_eq!(self.tracked.max_doc, 0);
         assert!(self.tracked.deletes.is_none());
         let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {
@@ -71,7 +71,7 @@ impl InvertedIndexReader {
         &self.termdict
     }
 
-    /// Return the fields and types encoded in the dictionary in lexicographic oder.
+    /// Return the fields and types encoded in the dictionary in lexicographic order.
     /// Only valid on JSON fields.
     ///
     /// Notice: This requires a full scan and therefore **very expensive**.
@@ -358,7 +358,7 @@ impl SegmentReader {
             .map(|(mut field_name, handle)| {
                 json_path_sep_to_dot(&mut field_name);
                 // map to canonical path, to avoid similar but different entries.
-                // Eventually we should just accept '.' seperated for all cases.
+                // Eventually we should just accept '.' separated for all cases.
                 let field_name = map_to_canonical
                     .get(&field_name)
                     .unwrap_or(&field_name)
@@ -179,8 +179,7 @@ impl DeleteCursor {
     /// Skips operations and position it so that
     /// - either all of the delete operation currently in the queue are consume and the next get
     /// will return `None`.
-    /// - the next get will return the first operation with an
-    /// `opstamp >= target_opstamp`.
+    /// - the next get will return the first operation with an `opstamp >= target_opstamp`.
     pub fn skip_to(&mut self, target_opstamp: Opstamp) {
         // TODO Can be optimize as we work with block.
         while self.is_behind_opstamp(target_opstamp) {
@@ -482,7 +482,7 @@ impl<D: Document> IndexWriter<D> {
     /// let index = Index::create_in_ram(schema.clone());
     ///
     /// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
-    /// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
+    /// index_writer.add_document(doc!(title => "The modern Prometheus"))?;
     /// index_writer.commit()?;
     ///
     /// let clear_res = index_writer.delete_all_documents().unwrap();
@@ -491,7 +491,7 @@ impl<D: Document> IndexWriter<D> {
     ///
     /// let searcher = index.reader()?.searcher();
     /// let query_parser = QueryParser::for_index(&index, vec![title]);
-    /// let query_promo = query_parser.parse_query("Promotheus")?;
+    /// let query_promo = query_parser.parse_query("Prometheus")?;
     /// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
     ///
     /// assert!(top_docs_promo.is_empty());
@@ -815,8 +815,9 @@ mod tests {
     use crate::indexer::NoMergePolicy;
     use crate::query::{QueryParser, TermQuery};
     use crate::schema::{
-        self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, NumericOptions,
-        TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED, STRING, TEXT,
+        self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, JsonObjectOptions,
+        NumericOptions, Schema, TextFieldIndexing, TextOptions, Value, FAST, INDEXED, STORED,
+        STRING, TEXT,
     };
     use crate::store::DOCSTORE_CACHE_CAPACITY;
     use crate::{
@@ -1573,11 +1574,11 @@ mod tests {
                     deleted_ids.remove(id);
                 }
                 IndexingOp::DeleteDoc { id } => {
-                    existing_ids.remove(&id);
+                    existing_ids.remove(id);
                     deleted_ids.insert(*id);
                 }
                 IndexingOp::DeleteDocQuery { id } => {
-                    existing_ids.remove(&id);
+                    existing_ids.remove(id);
                     deleted_ids.insert(*id);
                 }
                 _ => {}
@@ -2092,7 +2093,7 @@ mod tests {
         //
         // Take half as sample
         let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
-        sample.sort_by_key(|(k, _num_occurences)| *k);
+        sample.sort_by_key(|(k, _num_occurrences)| *k);
         // sample.truncate(sample.len() / 2);
         if !sample.is_empty() {
             let (left_sample, right_sample) = sample.split_at(sample.len() / 2);
@@ -2101,7 +2102,7 @@ mod tests {
             sample
                 .iter()
                 .filter(|(id, _)| id_is_full_doc(**id))
-                .map(|(_id, num_occurences)| **num_occurences)
+                .map(|(_id, num_occurrences)| **num_occurrences)
                 .sum::<u64>()
         };
         fn gen_query_inclusive<T1: ToString, T2: ToString>(
@@ -2378,11 +2379,11 @@ mod tests {
 
     #[test]
     fn test_bug_1617_2() {
-        assert!(test_operation_strategy(
+        test_operation_strategy(
             &[
                 IndexingOp::AddDoc {
                     id: 13,
-                    value: Default::default()
+                    value: Default::default(),
                 },
                 IndexingOp::DeleteDoc { id: 13 },
                 IndexingOp::Commit,
@@ -2390,9 +2391,9 @@ mod tests {
                 IndexingOp::Commit,
                 IndexingOp::Merge,
             ],
-            true
+            true,
         )
-        .is_ok());
+        .unwrap();
     }
 
     #[test]
@@ -2490,4 +2491,46 @@ mod tests {
 
         Ok(())
     }
+
+    #[test]
+    fn test_bug_2442_reserved_character_fast_field() -> crate::Result<()> {
+        let mut schema_builder = schema::Schema::builder();
+        let json_field = schema_builder.add_json_field("json", FAST | TEXT);
+
+        let schema = schema_builder.build();
+        let index = Index::builder().schema(schema).create_in_ram()?;
+        let mut index_writer = index.writer_for_tests()?;
+        index_writer.set_merge_policy(Box::new(NoMergePolicy));
+
+        index_writer
+            .add_document(doc!(
+                json_field=>json!({"\u{0000}B":"1"})
+            ))
+            .unwrap();
+        index_writer
+            .add_document(doc!(
+                json_field=>json!({" A":"1"})
+            ))
+            .unwrap();
+        index_writer.commit()?;
+
+        Ok(())
+    }
+
+    #[test]
+    fn test_bug_2442_reserved_character_columnar() -> crate::Result<()> {
+        let mut schema_builder = Schema::builder();
+        let options = JsonObjectOptions::from(FAST).set_expand_dots_enabled();
+        let field = schema_builder.add_json_field("json", options);
+        let index = Index::create_in_ram(schema_builder.build());
+        let mut index_writer = index.writer_for_tests().unwrap();
+        index_writer
+            .add_document(doc!(field=>json!({"\u{0000}": "A"})))
+            .unwrap();
+        index_writer
+            .add_document(doc!(field=>json!({format!("\u{0000}\u{0000}"): "A"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        Ok(())
+    }
 }
@@ -29,8 +29,8 @@ impl MergeOperationInventory {
 
 /// A `MergeOperation` has two roles.
 /// It carries all of the information required to describe a merge:
-/// - `target_opstamp` is the opstamp up to which we want to consume the
-/// delete queue and reflect their deletes.
+/// - `target_opstamp` is the opstamp up to which we want to consume the delete queue and reflect
+/// their deletes.
 /// - `segment_ids` is the list of segment to be merged.
 ///
 /// The second role is to ensure keep track of the fact that these
@@ -673,7 +673,7 @@ mod tests {
             ]
         );
         assert_eq!(
-            get_doc_ids(vec![Term::from_field_date(
+            get_doc_ids(vec![Term::from_field_date_for_search(
                 date_field,
                 DateTime::from_utc(curr_time)
             )])?,
@@ -145,15 +145,27 @@ mod tests_mmap {
         }
     }
     #[test]
-    fn test_json_field_null_byte() {
-        // Test when field name contains a zero byte, which has special meaning in tantivy.
-        // As a workaround, we convert the zero byte to the ASCII character '0'.
-        // https://github.com/quickwit-oss/tantivy/issues/2340
-        // https://github.com/quickwit-oss/tantivy/issues/2193
-        let field_name_in = "\u{0000}";
-        let field_name_out = "0";
-        test_json_field_name(field_name_in, field_name_out);
+    fn test_json_field_null_byte_is_ignored() {
+        let mut schema_builder = Schema::builder();
+        let options = JsonObjectOptions::from(TEXT | FAST).set_expand_dots_enabled();
+        let field = schema_builder.add_json_field("json", options);
+        let index = Index::create_in_ram(schema_builder.build());
+        let mut index_writer = index.writer_for_tests().unwrap();
+        index_writer
+            .add_document(doc!(field=>json!({"key": "test1", "invalidkey\u{0000}": "test2"})))
+            .unwrap();
+        index_writer.commit().unwrap();
+        let reader = index.reader().unwrap();
+        let searcher = reader.searcher();
+        let segment_reader = searcher.segment_reader(0);
+        let inv_indexer = segment_reader.inverted_index(field).unwrap();
+        let term_dict = inv_indexer.terms();
+        assert_eq!(term_dict.num_terms(), 1);
+        let mut term_bytes = Vec::new();
+        term_dict.ord_to_term(0, &mut term_bytes).unwrap();
+        assert_eq!(term_bytes, b"key\0stest1");
     }
 
     #[test]
     fn test_json_field_1byte() {
         // Test when field name contains a '1' byte, which has special meaning in tantivy.
@@ -291,7 +303,7 @@ mod tests_mmap {
                 Type::Str,
             ),
             (format!("{field_name_out_internal}a"), Type::Str),
-            (format!("{field_name_out_internal}"), Type::Str),
+            (field_name_out_internal.to_string(), Type::Str),
             (format!("num{field_name_out_internal}"), Type::I64),
         ];
         expected_fields.sort();
@@ -38,11 +38,12 @@ impl PathToUnorderedId {
     #[cold]
     fn insert_new_path(&mut self, path: &str) -> u32 {
         let next_id = self.map.len() as u32;
-        self.map.insert(path.to_string(), next_id);
+        let new_path = path.to_string();
+        self.map.insert(new_path, next_id);
         next_id
     }
 
-    /// Retuns ids which reflect the lexical order of the paths.
+    /// Returns ids which reflect the lexical order of the paths.
     ///
     /// The returned vec can be indexed with the unordered id to get the ordered id.
     pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {
@@ -56,7 +57,7 @@ impl PathToUnorderedId {
         result
     }
 
-    /// Retuns the paths so they can be queried by the ordered id (which is the index).
+    /// Returns the paths so they can be queried by the ordered id (which is the index).
     pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
         let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
         paths.sort_unstable();
@@ -10,12 +10,9 @@ use crate::indexer::delete_queue::DeleteCursor;
 ///
 /// In addition to segment `meta`,
 /// it contains a few transient states
-/// - `alive_bitset` is a bitset describing
-/// documents that were alive during the commit
-/// itself.
-/// - `delete_cursor` is the position in the delete queue.
-/// Deletes happening before the cursor are reflected either
-/// in the .del file or in the `alive_bitset`.
+/// - `alive_bitset` is a bitset describing documents that were alive during the commit itself.
+/// - `delete_cursor` is the position in the delete queue. Deletes happening before the cursor are
+/// reflected either in the .del file or in the `alive_bitset`.
 #[derive(Clone)]
 pub struct SegmentEntry {
     meta: SegmentMeta,
@@ -30,10 +30,8 @@ const NUM_MERGE_THREADS: usize = 4;
 /// Save the index meta file.
 /// This operation is atomic:
 /// Either
-/// - it fails, in which case an error is returned,
-/// and the `meta.json` remains untouched,
-/// - it success, and `meta.json` is written
-/// and flushed.
+/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
+/// - it success, and `meta.json` is written and flushed.
 ///
 /// This method is not part of tantivy's public API
 pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {
@@ -379,7 +377,7 @@ impl SegmentUpdater {
         if self.is_alive() {
             let index = &self.index;
             let directory = index.directory();
-            let mut commited_segment_metas = self.segment_manager.committed_segment_metas();
+            let mut committed_segment_metas = self.segment_manager.committed_segment_metas();
 
             // We sort segment_readers by number of documents.
             // This is an heuristic to make multithreading more efficient.
@@ -394,10 +392,10 @@ impl SegmentUpdater {
             // from the different drives.
             //
             // Segment 1 from disk 1, Segment 1 from disk 2, etc.
-            commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
+            committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
             let index_meta = IndexMeta {
                 index_settings: index.settings().clone(),
-                segments: commited_segment_metas,
+                segments: committed_segment_metas,
                 schema: index.schema(),
                 opstamp,
                 payload: commit_message,
@@ -542,7 +540,13 @@ impl SegmentUpdater {
     }
 
     fn consider_merge_options(&self) {
-        let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
+        let (mut committed_segments, mut uncommitted_segments) = self.get_mergeable_segments();
+        if committed_segments.len() == 1 && committed_segments[0].num_deleted_docs() == 0 {
+            committed_segments.clear();
+        }
+        if uncommitted_segments.len() == 1 && uncommitted_segments[0].num_deleted_docs() == 0 {
+            uncommitted_segments.clear();
+        }
 
         // Committed segments cannot be merged with uncommitted_segments.
         // We therefore consider merges using these two sets of segments independently.
@@ -64,9 +64,9 @@ impl SegmentWriter {
     ///
     /// The arguments are defined as follows
     ///
-    /// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
-    /// is stored in a memory arena. This makes it possible for the user to define
-    /// the flushing behavior as a memory limit.
+    /// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
+    /// stored in a memory arena. This makes it possible for the user to define the flushing
+    /// behavior as a memory limit.
     /// - segment: The segment being written
     /// - schema
     pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {
@@ -431,7 +431,7 @@ mod tests {
     use crate::query::{PhraseQuery, QueryParser};
     use crate::schema::{
         Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
-        STORED, STRING, TEXT,
+        DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
     };
     use crate::store::{Compressor, StoreReader, StoreWriter};
     use crate::time::format_description::well_known::Rfc3339;
@@ -651,7 +651,8 @@ mod tests {
             set_fast_val(
                 DateTime::from_utc(
                     OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
-                ),
+                )
+                .truncate(DATE_TIME_PRECISION_INDEXED),
                 term
             )
             .serialized_value_bytes()
 src/lib.rs (13 lines changed)
@@ -125,8 +125,8 @@
 //!
 //! - **Searching**: [Searcher] searches the segments with anything that implements
 //! [Query](query::Query) and merges the results. The list of [supported
 //! queries](query::Query#implementors). Custom Queries are supported by implementing the
 //! [Query](query::Query) trait.
 //!
 //! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
 //!
@@ -202,12 +202,15 @@ pub mod space_usage;
 pub mod store;
 pub mod termdict;
 
+mod docset;
 mod reader;
 
+#[cfg(test)]
+mod compat_tests;
+
 pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
 pub mod snippet;
 
-mod docset;
 use std::fmt;
 
 pub use census::{Inventory, TrackedObject};
@@ -229,9 +232,9 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
 pub use crate::schema::{Document, TantivyDocument, Term};
 
 /// Index format version.
-const INDEX_FORMAT_VERSION: u32 = 6;
+pub const INDEX_FORMAT_VERSION: u32 = 6;
 /// Oldest index format version this tantivy version can read.
-const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
+pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
 
 /// Structure version for the index.
 #[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]
@@ -3,7 +3,7 @@
 //! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
 //! This information is useful to run phrase queries.
 //!
-//! The [position](crate::SegmentComponent::Positions) file contains all of the
+//! The [position](crate::index::SegmentComponent::Positions) file contains all of the
 //! bitpacked positions delta, for all terms of a given field, one term after the other.
 //!
 //! Each term is encoded independently.
@@ -18,7 +18,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
 /// # Assumption
 ///
 /// - The block is sorted. Some elements may appear several times. This is the case at the
 /// end of the last block for instance.
 /// - The target is assumed smaller or equal to the last element of the block.
 pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
     let mut start = 0;
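An illustrative (non-tantivy) sketch of the branchless lower-bound idea behind `branchless_binary_search`, under the same assumption that the target is not greater than the last element of the block:

```rust
// Branch-free lower bound over a fixed-size sorted block: each step narrows the
// search range with an arithmetic select instead of an unpredictable branch.
fn branchless_lower_bound(block: &[u32; 128], target: u32) -> usize {
    let mut start = 0usize;
    let mut len = block.len();
    while len > 1 {
        let half = len / 2;
        // Advance `start` only when the probed element is still below the target.
        let advance = (block[start + half - 1] < target) as usize;
        start += advance * half;
        len -= half;
    }
    start
}

fn main() {
    let mut block = [u32::MAX; 128];
    for i in 0..64 {
        block[i] = (i * 3) as u32; // 0, 3, 6, 9, 12, ...
    }
    // First element >= 10 is 12, at index 4.
    assert_eq!(branchless_lower_bound(&block, 10), 4);
    println!("ok");
}
```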
@@ -59,7 +59,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
     /// The actual serialization format is handled by the `PostingsSerializer`.
     fn serialize(
         &self,
-        term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
+        ordered_term_addrs: &[(Field, OrderedPathId, &[u8], Addr)],
         ordered_id_to_path: &[&str],
         ctx: &IndexingContext,
         serializer: &mut FieldSerializer,
@@ -69,7 +69,7 @@ impl<Rec: Recorder> PostingsWriter for JsonPostingsWriter<Rec> {
         term_buffer.clear_with_field_and_type(Type::Json, Field::from_field_id(0));
         let mut prev_term_id = u32::MAX;
         let mut term_path_len = 0; // this will be set in the first iteration
-        for (_field, path_id, term, addr) in term_addrs {
+        for (_field, path_id, term, addr) in ordered_term_addrs {
             if prev_term_id != path_id.path_id() {
                 term_buffer.truncate_value_bytes(0);
                 term_buffer.append_path(ordered_id_to_path[path_id.path_id() as usize].as_bytes());
@@ -15,6 +15,7 @@ pub trait Postings: DocSet + 'static {
     fn term_freq(&self) -> u32;
 
     /// Returns the positions offsetted with a given value.
+    /// It is not necessary to clear the `output` before calling this method.
     /// The output vector will be resized to the `term_freq`.
     fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
 
@@ -7,7 +7,7 @@ use crate::{DocId, Score};
 
 /// Query that matches all of the documents.
 ///
-/// All of the document get the score 1.0.
+/// All of the documents get the score 1.0.
 #[derive(Clone, Debug)]
 pub struct AllQuery;
 
@@ -22,10 +22,7 @@ pub struct AllWeight;
 
 impl Weight for AllWeight {
     fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
-        let all_scorer = AllScorer {
-            doc: 0u32,
-            max_doc: reader.max_doc(),
-        };
+        let all_scorer = AllScorer::new(reader.max_doc());
         Ok(Box::new(BoostScorer::new(all_scorer, boost)))
     }
 
@@ -43,6 +40,13 @@ pub struct AllScorer {
     max_doc: DocId,
 }
 
+impl AllScorer {
+    /// Creates a new AllScorer with `max_doc` docs.
+    pub fn new(max_doc: DocId) -> AllScorer {
+        AllScorer { doc: 0u32, max_doc }
+    }
+}
+
 impl DocSet for AllScorer {
     #[inline(always)]
     fn advance(&mut self) -> DocId {
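A minimal usage sketch for `AllQuery` together with the `Count` collector; every document matches and gets score 1.0:

```rust
use tantivy::collector::Count;
use tantivy::query::AllQuery;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer = index.writer_with_num_threads(1, 20_000_000)?;
    writer.add_document(doc!(title => "a"))?;
    writer.add_document(doc!(title => "b"))?;
    writer.commit()?;

    // AllQuery matches every document in the index.
    let searcher = index.reader()?.searcher();
    let count = searcher.search(&AllQuery, &Count)?;
    assert_eq!(count, 2);
    Ok(())
}
```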
@@ -1,19 +1,14 @@
 use super::boolean_weight::BooleanWeight;
-use crate::query::{EnableScoring, Occur, Query, SumWithCoordsCombiner, TermQuery, Weight};
+use crate::query::{EnableScoring, Occur, Query, SumCombiner, TermQuery, Weight};
 use crate::schema::{IndexRecordOption, Term};
 
 /// The boolean query returns a set of documents
 /// that matches the Boolean combination of constituent subqueries.
 ///
-/// The documents matched by the boolean query are
-/// those which
-/// * match all of the sub queries associated with the
-/// `Must` occurrence
-/// * match none of the sub queries associated with the
-/// `MustNot` occurrence.
-/// * match at least one of the sub queries associated
-/// with the `Must` or `Should` occurrence.
-///
+/// The documents matched by the boolean query are those which
+/// - match all of the sub queries associated with the `Must` occurrence
+/// - match none of the sub queries associated with the `MustNot` occurrence.
+/// - match at least one of the sub queries associated with the `Must` or `Should` occurrence.
 ///
 /// You can combine other query types and their `Occur`ances into one `BooleanQuery`
 ///
|
|||||||
/// Term::from_field_text(title, "diary"),
|
/// Term::from_field_text(title, "diary"),
|
||||||
/// IndexRecordOption::Basic,
|
/// IndexRecordOption::Basic,
|
||||||
/// ));
|
/// ));
|
||||||
|
/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
|
/// Term::from_field_text(title, "cow"),
|
||||||
|
/// IndexRecordOption::Basic
|
||||||
|
/// ));
|
||||||
/// // A TermQuery with "found" in the body
|
/// // A TermQuery with "found" in the body
|
||||||
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
/// let body_term_query: Box<dyn Query> = Box::new(TermQuery::new(
|
||||||
/// Term::from_field_text(body, "found"),
|
/// Term::from_field_text(body, "found"),
|
||||||
@@ -74,7 +73,7 @@ use crate::schema::{IndexRecordOption, Term};
|
|||||||
/// // TermQuery "diary" must and "girl" must not be present
|
/// // TermQuery "diary" must and "girl" must not be present
|
||||||
/// let queries_with_occurs1 = vec![
|
/// let queries_with_occurs1 = vec![
|
||||||
/// (Occur::Must, diary_term_query.box_clone()),
|
/// (Occur::Must, diary_term_query.box_clone()),
|
||||||
/// (Occur::MustNot, girl_term_query),
|
/// (Occur::MustNot, girl_term_query.box_clone()),
|
||||||
/// ];
|
/// ];
|
||||||
/// // Make a BooleanQuery equivalent to
|
/// // Make a BooleanQuery equivalent to
|
||||||
/// // title:+diary title:-girl
|
/// // title:+diary title:-girl
|
||||||
@@ -82,15 +81,10 @@ use crate::schema::{IndexRecordOption, Term};
 /// let count1 = searcher.search(&diary_must_and_girl_mustnot, &Count)?;
 /// assert_eq!(count1, 1);
 ///
-/// // TermQuery for "cow" in the title
-/// let cow_term_query: Box<dyn Query> = Box::new(TermQuery::new(
-///     Term::from_field_text(title, "cow"),
-///     IndexRecordOption::Basic,
-/// ));
 /// // "title:diary OR title:cow"
 /// let title_diary_or_cow = BooleanQuery::new(vec![
 ///     (Occur::Should, diary_term_query.box_clone()),
-///     (Occur::Should, cow_term_query),
+///     (Occur::Should, cow_term_query.box_clone()),
 /// ]);
 /// let count2 = searcher.search(&title_diary_or_cow, &Count)?;
 /// assert_eq!(count2, 4);
|
|||||||
/// ]);
|
/// ]);
|
||||||
/// let count4 = searcher.search(&nested_query, &Count)?;
|
/// let count4 = searcher.search(&nested_query, &Count)?;
|
||||||
/// assert_eq!(count4, 1);
|
/// assert_eq!(count4, 1);
|
||||||
|
///
|
||||||
|
/// // You may call `with_minimum_required_clauses` to
|
||||||
|
/// // specify the number of should clauses the returned documents must match.
|
||||||
|
/// let minimum_required_query = BooleanQuery::with_minimum_required_clauses(vec![
|
||||||
|
/// (Occur::Should, cow_term_query.box_clone()),
|
||||||
|
/// (Occur::Should, girl_term_query.box_clone()),
|
||||||
|
/// (Occur::Should, diary_term_query.box_clone()),
|
||||||
|
/// ], 2);
|
||||||
|
/// // Return documents contains "Diary Cow", "Diary Girl" or "Cow Girl"
|
||||||
|
/// // Notice: "Diary" isn't "Dairy". ;-)
|
||||||
|
/// let count5 = searcher.search(&minimum_required_query, &Count)?;
|
||||||
|
/// assert_eq!(count5, 1);
|
||||||
/// Ok(())
|
/// Ok(())
|
||||||
/// }
|
/// }
|
||||||
/// ```
|
/// ```
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct BooleanQuery {
|
pub struct BooleanQuery {
|
||||||
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Clone for BooleanQuery {
|
impl Clone for BooleanQuery {
|
||||||
fn clone(&self) -> Self {
|
fn clone(&self) -> Self {
|
||||||
self.subqueries
|
let subqueries = self
|
||||||
|
.subqueries
|
||||||
.iter()
|
.iter()
|
||||||
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
|
.map(|(occur, subquery)| (*occur, subquery.box_clone()))
|
||||||
.collect::<Vec<_>>()
|
.collect::<Vec<_>>();
|
||||||
.into()
|
Self {
|
||||||
|
subqueries,
|
||||||
|
minimum_number_should_match: self.minimum_number_should_match,
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -149,10 +160,11 @@ impl Query for BooleanQuery {
|
|||||||
.iter()
|
.iter()
|
||||||
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
|
.map(|(occur, subquery)| Ok((*occur, subquery.weight(enable_scoring)?)))
|
||||||
.collect::<crate::Result<_>>()?;
|
.collect::<crate::Result<_>>()?;
|
||||||
Ok(Box::new(BooleanWeight::new(
|
Ok(Box::new(BooleanWeight::with_minimum_number_should_match(
|
||||||
sub_weights,
|
sub_weights,
|
||||||
|
self.minimum_number_should_match,
|
||||||
enable_scoring.is_scoring_enabled(),
|
enable_scoring.is_scoring_enabled(),
|
||||||
Box::new(SumWithCoordsCombiner::default),
|
Box::new(SumCombiner::default),
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -166,7 +178,41 @@ impl Query for BooleanQuery {
|
|||||||
impl BooleanQuery {
|
impl BooleanQuery {
|
||||||
/// Creates a new boolean query.
|
/// Creates a new boolean query.
|
||||||
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
|
pub fn new(subqueries: Vec<(Occur, Box<dyn Query>)>) -> BooleanQuery {
|
||||||
BooleanQuery { subqueries }
|
// If the bool query includes at least one should clause
|
||||||
|
// and no Must or MustNot clauses, the default value is 1. Otherwise, the default value is
|
||||||
|
// 0. Keep pace with Elasticsearch.
|
||||||
|
let mut minimum_required = 0;
|
||||||
|
for (occur, _) in &subqueries {
|
||||||
|
match occur {
|
||||||
|
Occur::Should => minimum_required = 1,
|
||||||
|
Occur::Must | Occur::MustNot => {
|
||||||
|
minimum_required = 0;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Self::with_minimum_required_clauses(subqueries, minimum_required)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Create a new boolean query with minimum number of required should clauses specified.
|
||||||
|
pub fn with_minimum_required_clauses(
|
||||||
|
subqueries: Vec<(Occur, Box<dyn Query>)>,
|
||||||
|
minimum_number_should_match: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
BooleanQuery {
|
||||||
|
subqueries,
|
||||||
|
minimum_number_should_match,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Getter for `minimum_number_should_match`
|
||||||
|
pub fn get_minimum_number_should_match(&self) -> usize {
|
||||||
|
self.minimum_number_should_match
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Setter for `minimum_number_should_match`
|
||||||
|
pub fn set_minimum_number_should_match(&mut self, minimum_number_should_match: usize) {
|
||||||
|
self.minimum_number_should_match = minimum_number_should_match;
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns the intersection of the queries.
|
/// Returns the intersection of the queries.
|
||||||
@@ -181,6 +227,18 @@ impl BooleanQuery {
|
|||||||
BooleanQuery::new(subqueries)
|
BooleanQuery::new(subqueries)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Returns the union of the queries with minimum required clause.
|
||||||
|
pub fn union_with_minimum_required_clauses(
|
||||||
|
queries: Vec<Box<dyn Query>>,
|
||||||
|
minimum_required_clauses: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
let subqueries = queries
|
||||||
|
.into_iter()
|
||||||
|
.map(|sub_query| (Occur::Should, sub_query))
|
||||||
|
.collect();
|
||||||
|
BooleanQuery::with_minimum_required_clauses(subqueries, minimum_required_clauses)
|
||||||
|
}
|
||||||
|
|
||||||
/// Helper method to create a boolean query matching a given list of terms.
|
/// Helper method to create a boolean query matching a given list of terms.
|
||||||
/// The resulting query is a disjunction of the terms.
|
/// The resulting query is a disjunction of the terms.
|
||||||
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
pub fn new_multiterms_query(terms: Vec<Term>) -> BooleanQuery {
|
||||||
@@ -203,11 +261,13 @@ impl BooleanQuery {
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
|
use std::collections::HashSet;
|
||||||
|
|
||||||
use super::BooleanQuery;
|
use super::BooleanQuery;
|
||||||
use crate::collector::{Count, DocSetCollector};
|
use crate::collector::{Count, DocSetCollector};
|
||||||
use crate::query::{QueryClone, QueryParser, TermQuery};
|
use crate::query::{Query, QueryClone, QueryParser, TermQuery};
|
||||||
use crate::schema::{IndexRecordOption, Schema, TEXT};
|
use crate::schema::{Field, IndexRecordOption, Schema, TEXT};
|
||||||
use crate::{DocAddress, Index, Term};
|
use crate::{DocAddress, DocId, Index, Term};
|
||||||
|
|
||||||
fn create_test_index() -> crate::Result<Index> {
|
fn create_test_index() -> crate::Result<Index> {
|
||||||
let mut schema_builder = Schema::builder();
|
let mut schema_builder = Schema::builder();
|
||||||
@@ -223,6 +283,73 @@ mod tests {
|
|||||||
Ok(index)
|
Ok(index)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_minimum_required() -> crate::Result<()> {
|
||||||
|
fn create_test_index_with<T: IntoIterator<Item = &'static str>>(
|
||||||
|
docs: T,
|
||||||
|
) -> crate::Result<Index> {
|
||||||
|
let mut schema_builder = Schema::builder();
|
||||||
|
let text = schema_builder.add_text_field("text", TEXT);
|
||||||
|
let schema = schema_builder.build();
|
||||||
|
let index = Index::create_in_ram(schema);
|
||||||
|
let mut writer = index.writer_for_tests()?;
|
||||||
|
for doc in docs {
|
||||||
|
writer.add_document(doc!(text => doc))?;
|
||||||
|
}
|
||||||
|
writer.commit()?;
|
||||||
|
Ok(index)
|
||||||
|
}
|
||||||
|
fn create_boolean_query_with_mr<T: IntoIterator<Item = &'static str>>(
|
||||||
|
queries: T,
|
||||||
|
field: Field,
|
||||||
|
mr: usize,
|
||||||
|
) -> BooleanQuery {
|
||||||
|
let terms = queries
|
||||||
|
.into_iter()
|
||||||
|
.map(|t| Term::from_field_text(field, t))
|
||||||
|
.map(|t| TermQuery::new(t, IndexRecordOption::Basic))
|
||||||
|
.map(|q| -> Box<dyn Query> { Box::new(q) })
|
||||||
|
.collect();
|
||||||
|
BooleanQuery::union_with_minimum_required_clauses(terms, mr)
|
||||||
|
}
|
||||||
|
fn check_doc_id<T: IntoIterator<Item = DocId>>(
|
||||||
|
expected: T,
|
||||||
|
actually: HashSet<DocAddress>,
|
||||||
|
seg: u32,
|
||||||
|
) {
|
||||||
|
assert_eq!(
|
||||||
|
actually,
|
||||||
|
expected
|
||||||
|
.into_iter()
|
||||||
|
.map(|id| DocAddress::new(seg, id))
|
||||||
|
.collect()
|
||||||
|
);
|
||||||
|
}
|
||||||
|
let index = create_test_index_with(["a b c", "a c e", "d f g", "z z z", "c i b"])?;
|
||||||
|
let searcher = index.reader()?.searcher();
|
||||||
|
let text = index.schema().get_field("text").unwrap();
|
||||||
|
// Documents contains 'a c' 'a z' 'a i' 'c z' 'c i' or 'z i' shall be return.
|
||||||
|
let q1 = create_boolean_query_with_mr(["a", "c", "z", "i"], text, 2);
|
||||||
|
let docs = searcher.search(&q1, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 4], docs, 0);
|
||||||
|
// Documents contains 'a b c', 'a b e', 'a c e' or 'b c e' shall be return.
|
||||||
|
let q2 = create_boolean_query_with_mr(["a", "b", "c", "e"], text, 3);
|
||||||
|
let docs = searcher.search(&q2, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1], docs, 0);
|
||||||
|
// Nothing queried since minimum_required is too large.
|
||||||
|
let q3 = create_boolean_query_with_mr(["a", "b"], text, 3);
|
||||||
|
let docs = searcher.search(&q3, &DocSetCollector)?;
|
||||||
|
assert!(docs.is_empty());
|
||||||
|
// When mr is set to zero or one, there are no difference with `Boolean::Union`.
|
||||||
|
let q4 = create_boolean_query_with_mr(["a", "z"], text, 1);
|
||||||
|
let docs = searcher.search(&q4, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 3], docs, 0);
|
||||||
|
let q5 = create_boolean_query_with_mr(["a", "b"], text, 0);
|
||||||
|
let docs = searcher.search(&q5, &DocSetCollector)?;
|
||||||
|
check_doc_id([0, 1, 4], docs, 0);
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_union() -> crate::Result<()> {
|
fn test_union() -> crate::Result<()> {
|
||||||
let index = create_test_index()?;
|
let index = create_test_index()?;
|
||||||
|
@@ -3,6 +3,7 @@ use std::collections::HashMap;
 use crate::docset::COLLECT_BLOCK_BUFFER_LEN;
 use crate::index::SegmentReader;
 use crate::postings::FreqReadingOption;
+use crate::query::disjunction::Disjunction;
 use crate::query::explanation::does_not_match;
 use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
 use crate::query::term_query::TermScorer;
@@ -18,6 +19,26 @@ enum SpecializedScorer {
     Other(Box<dyn Scorer>),
 }
 
+fn scorer_disjunction<TScoreCombiner>(
+    scorers: Vec<Box<dyn Scorer>>,
+    score_combiner: TScoreCombiner,
+    minimum_match_required: usize,
+) -> Box<dyn Scorer>
+where
+    TScoreCombiner: ScoreCombiner,
+{
+    debug_assert!(!scorers.is_empty());
+    debug_assert!(minimum_match_required > 1);
+    if scorers.len() == 1 {
+        return scorers.into_iter().next().unwrap(); // Safe unwrap.
+    }
+    Box::new(Disjunction::new(
+        scorers,
+        score_combiner,
+        minimum_match_required,
+    ))
+}
+
 fn scorer_union<TScoreCombiner>(
     scorers: Vec<Box<dyn Scorer>>,
     score_combiner_fn: impl Fn() -> TScoreCombiner,
@@ -70,6 +91,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
 /// Weight associated to the `BoolQuery`.
 pub struct BooleanWeight<TScoreCombiner: ScoreCombiner> {
     weights: Vec<(Occur, Box<dyn Weight>)>,
+    minimum_number_should_match: usize,
     scoring_enabled: bool,
     score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send>,
 }
@@ -85,6 +107,22 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
             weights,
             scoring_enabled,
             score_combiner_fn,
+            minimum_number_should_match: 1,
+        }
+    }
+
+    /// Create a new boolean weight with minimum number of required should clauses specified.
+    pub fn with_minimum_number_should_match(
+        weights: Vec<(Occur, Box<dyn Weight>)>,
+        minimum_number_should_match: usize,
+        scoring_enabled: bool,
+        score_combiner_fn: Box<dyn Fn() -> TScoreCombiner + Sync + Send + 'static>,
+    ) -> BooleanWeight<TScoreCombiner> {
+        BooleanWeight {
+            weights,
+            minimum_number_should_match,
+            scoring_enabled,
+            score_combiner_fn,
         }
     }
 
@@ -111,43 +149,89 @@ impl<TScoreCombiner: ScoreCombiner> BooleanWeight<TScoreCombiner> {
         score_combiner_fn: impl Fn() -> TComplexScoreCombiner,
     ) -> crate::Result<SpecializedScorer> {
         let mut per_occur_scorers = self.per_occur_scorers(reader, boost)?;
-        let should_scorer_opt: Option<SpecializedScorer> = per_occur_scorers
-            .remove(&Occur::Should)
-            .map(|scorers| scorer_union(scorers, &score_combiner_fn));
+        // Indicates how should clauses are combined with other clauses.
+        enum CombinationMethod {
+            Ignored,
+            // Only contributes to the final score.
+            Optional(SpecializedScorer),
+            // Must be fitted.
+            Required(Box<dyn Scorer>),
+        }
+        let mut must_scorers = per_occur_scorers.remove(&Occur::Must);
+        let should_opt = if let Some(mut should_scorers) = per_occur_scorers.remove(&Occur::Should)
+        {
+            let num_of_should_scorers = should_scorers.len();
+            if self.minimum_number_should_match > num_of_should_scorers {
+                return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
+            }
+            match self.minimum_number_should_match {
+                0 => CombinationMethod::Optional(scorer_union(should_scorers, &score_combiner_fn)),
+                1 => CombinationMethod::Required(into_box_scorer(
+                    scorer_union(should_scorers, &score_combiner_fn),
+                    &score_combiner_fn,
+                )),
+                n if num_of_should_scorers == n => {
+                    // When num_of_should_scorers equals the number of should clauses,
+                    // they are no different from must clauses.
+                    must_scorers = match must_scorers.take() {
+                        Some(mut must_scorers) => {
+                            must_scorers.append(&mut should_scorers);
+                            Some(must_scorers)
+                        }
+                        None => Some(should_scorers),
+                    };
+                    CombinationMethod::Ignored
+                }
+                _ => CombinationMethod::Required(scorer_disjunction(
+                    should_scorers,
+                    score_combiner_fn(),
+                    self.minimum_number_should_match,
+                )),
+            }
+        } else {
+            // No should clauses were provided.
+            if self.minimum_number_should_match > 0 {
+                return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
+            } else {
+                CombinationMethod::Ignored
+            }
+        };
         let exclude_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
             .remove(&Occur::MustNot)
             .map(|scorers| scorer_union(scorers, DoNothingCombiner::default))
-            .map(|specialized_scorer| {
+            .map(|specialized_scorer: SpecializedScorer| {
                 into_box_scorer(specialized_scorer, DoNothingCombiner::default)
             });
-        let must_scorer_opt: Option<Box<dyn Scorer>> = per_occur_scorers
-            .remove(&Occur::Must)
-            .map(intersect_scorers);
-        let positive_scorer: SpecializedScorer = match (should_scorer_opt, must_scorer_opt) {
-            (Some(should_scorer), Some(must_scorer)) => {
+        let positive_scorer = match (should_opt, must_scorers) {
+            (CombinationMethod::Ignored, Some(must_scorers)) => {
+                SpecializedScorer::Other(intersect_scorers(must_scorers))
+            }
+            (CombinationMethod::Optional(should_scorer), Some(must_scorers)) => {
+                let must_scorer = intersect_scorers(must_scorers);
                 if self.scoring_enabled {
-                    SpecializedScorer::Other(Box::new(RequiredOptionalScorer::<
-                        Box<dyn Scorer>,
-                        Box<dyn Scorer>,
-                        TComplexScoreCombiner,
-                    >::new(
-                        must_scorer,
-                        into_box_scorer(should_scorer, &score_combiner_fn),
-                    )))
+                    SpecializedScorer::Other(Box::new(
+                        RequiredOptionalScorer::<_, _, TScoreCombiner>::new(
+                            must_scorer,
+                            into_box_scorer(should_scorer, &score_combiner_fn),
+                        ),
+                    ))
                 } else {
                     SpecializedScorer::Other(must_scorer)
                 }
             }
-            (None, Some(must_scorer)) => SpecializedScorer::Other(must_scorer),
-            (Some(should_scorer), None) => should_scorer,
-            (None, None) => {
-                return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)));
-            }
+            (CombinationMethod::Required(should_scorer), Some(mut must_scorers)) => {
+                must_scorers.push(should_scorer);
+                SpecializedScorer::Other(intersect_scorers(must_scorers))
+            }
+            (CombinationMethod::Ignored, None) => {
+                return Ok(SpecializedScorer::Other(Box::new(EmptyScorer)))
+            }
+            (CombinationMethod::Required(should_scorer), None) => {
+                SpecializedScorer::Other(should_scorer)
+            }
+            // Optional scorers are promoted to required if no must scorers exist.
+            (CombinationMethod::Optional(should_scorer), None) => should_scorer,
         };
 
         if let Some(exclude_scorer) = exclude_scorer_opt {
            let positive_scorer_boxed = into_box_scorer(positive_scorer, &score_combiner_fn);
            Ok(SpecializedScorer::Other(Box::new(Exclude::new(
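To make the `minimum_number_should_match` semantics above concrete, here is a hedged sketch (hypothetical field and terms; it assumes `BooleanQuery::with_minimum_required_clauses` is public, as the helper constructor earlier in this diff suggests): one must clause plus three should clauses, of which at least two have to match.

```rust
use tantivy::query::{BooleanQuery, Occur, Query, TermQuery};
use tantivy::schema::{Field, IndexRecordOption};
use tantivy::Term;

fn must_plus_two_should(text: Field) -> BooleanQuery {
    let term_query = |t: &str| -> Box<dyn Query> {
        Box::new(TermQuery::new(
            Term::from_field_text(text, t),
            IndexRecordOption::Basic,
        ))
    };
    let subqueries = vec![
        (Occur::Must, term_query("tantivy")),
        (Occur::Should, term_query("fast")),
        (Occur::Should, term_query("search")),
        (Occur::Should, term_query("engine")),
    ];
    // Documents must contain "tantivy" and at least two of the three should terms.
    BooleanQuery::with_minimum_required_clauses(subqueries, 2)
}
```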
@@ -12,11 +12,10 @@ mod tests {
     use super::*;
     use crate::collector::tests::TEST_COLLECTOR_WITH_SCORE;
     use crate::collector::TopDocs;
-    use crate::query::score_combiner::SumWithCoordsCombiner;
     use crate::query::term_query::TermScorer;
     use crate::query::{
         EnableScoring, Intersection, Occur, Query, QueryParser, RequiredOptionalScorer, Scorer,
-        TermQuery,
+        SumCombiner, TermQuery,
     };
     use crate::schema::*;
     use crate::{assert_nearly_equals, DocAddress, DocId, Index, IndexWriter, Score};
@@ -90,11 +89,8 @@ mod tests {
         let query = query_parser.parse_query("+a b")?;
         let weight = query.weight(EnableScoring::enabled_from_searcher(&searcher))?;
         let scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
-        assert!(scorer.is::<RequiredOptionalScorer<
-            Box<dyn Scorer>,
-            Box<dyn Scorer>,
-            SumWithCoordsCombiner,
-        >>());
+        assert!(scorer
+            .is::<RequiredOptionalScorer<Box<dyn Scorer>, Box<dyn Scorer>, SumCombiner>>());
     }
     {
         let query = query_parser.parse_query("+a b")?;
src/query/disjunction.rs (new file, 327 lines)
@@ -0,0 +1,327 @@
+use std::cmp::Ordering;
+use std::collections::BinaryHeap;
+
+use crate::query::score_combiner::DoNothingCombiner;
+use crate::query::{ScoreCombiner, Scorer};
+use crate::{DocId, DocSet, Score, TERMINATED};
+
+/// `Disjunction` is responsible for merging `DocSet`s from multiple
+/// sources. Specifically, it takes the union of two or more `DocSet`s,
+/// then filters out elements that appear fewer times than a
+/// specified threshold.
+pub struct Disjunction<TScorer, TScoreCombiner = DoNothingCombiner> {
+    chains: BinaryHeap<ScorerWrapper<TScorer>>,
+    minimum_matches_required: usize,
+    score_combiner: TScoreCombiner,
+
+    current_doc: DocId,
+    current_score: Score,
+}
+
+/// A wrapper around a `Scorer` that caches the current `doc_id` and implements the `DocSet` trait.
+/// Also, the `Ord` trait and its family are implemented in reverse, so that
+/// `std::BinaryHeap<ScorerWrapper<T>>` acts as a min-heap keyed by the current doc id.
+struct ScorerWrapper<T> {
+    scorer: T,
+    current_doc: DocId,
+}
+
+impl<T: Scorer> ScorerWrapper<T> {
+    fn new(scorer: T) -> Self {
+        let current_doc = scorer.doc();
+        Self {
+            scorer,
+            current_doc,
+        }
+    }
+}
+
+impl<T: Scorer> PartialEq for ScorerWrapper<T> {
+    fn eq(&self, other: &Self) -> bool {
+        self.doc() == other.doc()
+    }
+}
+
+impl<T: Scorer> Eq for ScorerWrapper<T> {}
+
+impl<T: Scorer> PartialOrd for ScorerWrapper<T> {
+    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
+        Some(self.cmp(other))
+    }
+}
+
+impl<T: Scorer> Ord for ScorerWrapper<T> {
+    fn cmp(&self, other: &Self) -> Ordering {
+        self.doc().cmp(&other.doc()).reverse()
+    }
+}
+
+impl<T: Scorer> DocSet for ScorerWrapper<T> {
+    fn advance(&mut self) -> DocId {
+        let doc_id = self.scorer.advance();
+        self.current_doc = doc_id;
+        doc_id
+    }
+
+    fn doc(&self) -> DocId {
+        self.current_doc
+    }
+
+    fn size_hint(&self) -> u32 {
+        self.scorer.size_hint()
+    }
+}
+
+impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Disjunction<TScorer, TScoreCombiner> {
+    pub fn new<T: IntoIterator<Item = TScorer>>(
+        docsets: T,
+        score_combiner: TScoreCombiner,
+        minimum_matches_required: usize,
+    ) -> Self {
+        debug_assert!(
+            minimum_matches_required > 1,
+            "union scorer works better if just one matches required"
+        );
+        let chains = docsets
+            .into_iter()
+            .map(|doc| ScorerWrapper::new(doc))
+            .collect();
+        let mut disjunction = Self {
+            chains,
+            score_combiner,
+            current_doc: TERMINATED,
+            minimum_matches_required,
+            current_score: 0.0,
+        };
+        if minimum_matches_required > disjunction.chains.len() {
+            return disjunction;
+        }
+        disjunction.advance();
+        disjunction
+    }
+}
+
+impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> DocSet
+    for Disjunction<TScorer, TScoreCombiner>
+{
+    fn advance(&mut self) -> DocId {
+        let mut current_num_matches = 0;
+        while let Some(mut candidate) = self.chains.pop() {
+            let next = candidate.doc();
+            if next != TERMINATED {
+                // Peek next doc.
+                if self.current_doc != next {
+                    if current_num_matches >= self.minimum_matches_required {
+                        self.chains.push(candidate);
+                        self.current_score = self.score_combiner.score();
+                        return self.current_doc;
+                    }
+                    // Reset current_num_matches and scores.
+                    current_num_matches = 0;
+                    self.current_doc = next;
+                    self.score_combiner.clear();
+                }
+                current_num_matches += 1;
+                self.score_combiner.update(&mut candidate.scorer);
+                candidate.advance();
+                self.chains.push(candidate);
+            }
+        }
+        if current_num_matches < self.minimum_matches_required {
+            self.current_doc = TERMINATED;
+        }
+        self.current_score = self.score_combiner.score();
+        self.current_doc
+    }
+
+    #[inline]
+    fn doc(&self) -> DocId {
+        self.current_doc
+    }
+
+    fn size_hint(&self) -> u32 {
+        self.chains
+            .iter()
+            .map(|docset| docset.size_hint())
+            .max()
+            .unwrap_or(0u32)
+    }
+}
+
+impl<TScorer: Scorer, TScoreCombiner: ScoreCombiner> Scorer
+    for Disjunction<TScorer, TScoreCombiner>
+{
+    fn score(&mut self) -> Score {
+        self.current_score
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::collections::BTreeMap;
+
+    use super::Disjunction;
+    use crate::query::score_combiner::DoNothingCombiner;
+    use crate::query::{ConstScorer, Scorer, SumCombiner, VecDocSet};
+    use crate::{DocId, DocSet, Score, TERMINATED};
+
+    fn conjunct<T: Ord + Copy>(arrays: &[Vec<T>], pass_line: usize) -> Vec<T> {
+        let mut counts = BTreeMap::new();
+        for array in arrays {
+            for &element in array {
+                *counts.entry(element).or_insert(0) += 1;
+            }
+        }
+        counts
+            .iter()
+            .filter_map(|(&element, &count)| {
+                if count >= pass_line {
+                    Some(element)
+                } else {
+                    None
+                }
+            })
+            .collect()
+    }
+
+    fn aux_test_conjunction(vals: Vec<Vec<u32>>, min_match: usize) {
+        let mut union_expected = VecDocSet::from(conjunct(&vals, min_match));
+        let make_scorer = || {
+            Disjunction::new(
+                vals.iter()
+                    .cloned()
+                    .map(VecDocSet::from)
+                    .map(|d| ConstScorer::new(d, 1.0)),
+                DoNothingCombiner,
+                min_match,
+            )
+        };
+        let mut scorer: Disjunction<_, DoNothingCombiner> = make_scorer();
+        let mut count = 0;
+        while scorer.doc() != TERMINATED {
+            assert_eq!(union_expected.doc(), scorer.doc());
+            assert_eq!(union_expected.advance(), scorer.advance());
+            count += 1;
+        }
+        assert_eq!(union_expected.advance(), TERMINATED);
+        assert_eq!(count, make_scorer().count_including_deleted());
+    }
+
+    #[should_panic]
+    #[test]
+    fn test_arg_check1() {
+        aux_test_conjunction(vec![], 0);
+    }
+
+    #[should_panic]
+    #[test]
+    fn test_arg_check2() {
+        aux_test_conjunction(vec![], 1);
+    }
+
+    #[test]
+    fn test_corner_case() {
+        aux_test_conjunction(vec![], 2);
+        aux_test_conjunction(vec![vec![]; 1000], 2);
+        aux_test_conjunction(vec![vec![]; 100], usize::MAX);
+        aux_test_conjunction(vec![vec![0xC0FFEE]; 10000], usize::MAX);
+        aux_test_conjunction((1..10000u32).map(|i| vec![i]).collect::<Vec<_>>(), 2);
+    }
+
+    #[test]
+    fn test_conjunction() {
+        aux_test_conjunction(
+            vec![
+                vec![1, 3333, 100000000u32],
+                vec![1, 2, 100000000u32],
+                vec![1, 2, 100000000u32],
+            ],
+            2,
+        );
+        aux_test_conjunction(
+            vec![vec![8], vec![3, 4, 0xC0FFEEu32], vec![1, 2, 100000000u32]],
+            2,
+        );
+        aux_test_conjunction(
+            vec![
+                vec![1, 3333, 100000000u32],
+                vec![1, 2, 100000000u32],
+                vec![1, 2, 100000000u32],
+            ],
+            3,
+        )
+    }
+
+    // This dummy scorer does nothing but yield doc ids in increasing order,
+    // with constant score 1.0.
+    #[derive(Clone)]
+    struct DummyScorer {
+        cursor: usize,
+        foo: Vec<(DocId, f32)>,
+    }
+
+    impl DummyScorer {
+        fn new(doc_score: Vec<(DocId, f32)>) -> Self {
+            Self {
+                cursor: 0,
+                foo: doc_score,
+            }
+        }
+    }
+
+    impl DocSet for DummyScorer {
+        fn advance(&mut self) -> DocId {
+            self.cursor += 1;
+            self.doc()
+        }
+
+        fn doc(&self) -> DocId {
+            self.foo.get(self.cursor).map(|x| x.0).unwrap_or(TERMINATED)
+        }
+
+        fn size_hint(&self) -> u32 {
+            self.foo.len() as u32
+        }
+    }
+
+    impl Scorer for DummyScorer {
+        fn score(&mut self) -> Score {
+            self.foo.get(self.cursor).map(|x| x.1).unwrap_or(0.0)
+        }
+    }
+
+    #[test]
+    fn test_score_calculate() {
+        let mut scorer = Disjunction::new(
+            vec![
+                DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (4, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
+            ],
+            SumCombiner::default(),
+            3,
+        );
+        assert_eq!(scorer.score(), 5.0);
+        assert_eq!(scorer.advance(), 2);
+        assert_eq!(scorer.score(), 3.0);
+    }
+
+    #[test]
+    fn test_score_calculate_corner_case() {
+        let mut scorer = Disjunction::new(
+            vec![
+                DummyScorer::new(vec![(1, 1f32), (2, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
+                DummyScorer::new(vec![(1, 1f32), (3, 1f32)]),
+            ],
+            SumCombiner::default(),
+            2,
+        );
+        assert_eq!(scorer.doc(), 1);
+        assert_eq!(scorer.score(), 3.0);
+        assert_eq!(scorer.advance(), 3);
+        assert_eq!(scorer.score(), 2.0);
+    }
+}
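The `ScorerWrapper` above gets min-heap behavior out of `std::collections::BinaryHeap` (a max-heap) by reversing its `Ord` implementation. A small self-contained sketch of the same trick, independent of tantivy:

```rust
use std::cmp::Ordering;
use std::collections::BinaryHeap;

struct MinByDoc {
    doc: u32,
}

impl PartialEq for MinByDoc {
    fn eq(&self, other: &Self) -> bool {
        self.doc == other.doc
    }
}
impl Eq for MinByDoc {}
impl PartialOrd for MinByDoc {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
    }
}
impl Ord for MinByDoc {
    // Reversed comparison: the smallest doc id ends up at the top of the max-heap.
    fn cmp(&self, other: &Self) -> Ordering {
        self.doc.cmp(&other.doc).reverse()
    }
}

fn main() {
    let mut heap: BinaryHeap<MinByDoc> =
        [3, 1, 2].into_iter().map(|doc| MinByDoc { doc }).collect();
    // Popping yields the smallest doc id first.
    assert_eq!(heap.pop().unwrap().doc, 1);
}
```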
@@ -149,7 +149,7 @@ mod tests {
     use crate::query::exist_query::ExistsQuery;
     use crate::query::{BooleanQuery, RangeQuery};
     use crate::schema::{Facet, FacetOptions, Schema, FAST, INDEXED, STRING, TEXT};
-    use crate::{Index, Searcher};
+    use crate::{Index, Searcher, Term};
 
     #[test]
     fn test_exists_query_simple() -> crate::Result<()> {
@@ -188,9 +188,8 @@ mod tests {
 
         // exercise seek
         let query = BooleanQuery::intersection(vec![
-            Box::new(RangeQuery::new_u64_bounds(
-                "all".to_string(),
-                Bound::Included(50),
+            Box::new(RangeQuery::new(
+                Bound::Included(Term::from_field_u64(all_field, 50)),
                 Bound::Unbounded,
             )),
             Box::new(ExistsQuery::new_exists_query("even".to_string())),
@@ -198,10 +197,9 @@ mod tests {
         assert_eq!(searcher.search(&query, &Count)?, 25);
 
         let query = BooleanQuery::intersection(vec![
-            Box::new(RangeQuery::new_u64_bounds(
-                "all".to_string(),
-                Bound::Included(0),
-                Bound::Excluded(50),
+            Box::new(RangeQuery::new(
+                Bound::Included(Term::from_field_u64(all_field, 0)),
+                Bound::Included(Term::from_field_u64(all_field, 50)),
             )),
             Box::new(ExistsQuery::new_exists_query("odd".to_string())),
         ]);
@@ -5,6 +5,7 @@ mod bm25;
 mod boolean_query;
 mod boost_query;
 mod const_score_query;
+mod disjunction;
 mod disjunction_max_query;
 mod empty_query;
 mod exclude;
@@ -53,12 +54,10 @@ pub use self::phrase_prefix_query::PhrasePrefixQuery;
 pub use self::phrase_query::PhraseQuery;
 pub use self::query::{EnableScoring, Query, QueryClone};
 pub use self::query_parser::{QueryParser, QueryParserError};
-pub use self::range_query::{FastFieldRangeWeight, IPFastFieldRangeWeight, RangeQuery};
+pub use self::range_query::*;
 pub use self::regex_query::RegexQuery;
 pub use self::reqopt_scorer::RequiredOptionalScorer;
-pub use self::score_combiner::{
-    DisjunctionMaxCombiner, ScoreCombiner, SumCombiner, SumWithCoordsCombiner,
-};
+pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombiner};
 pub use self::scorer::Scorer;
 pub use self::set_query::TermSetQuery;
 pub use self::term_query::TermQuery;
@@ -209,7 +209,7 @@ impl MoreLikeThis {
                 }
             };
 
-            // TOOD: Validate these changed align with the HEAD branch.
+            // TODO: Validate these changed align with the HEAD branch.
             for value in values {
                 if let Some(text) = value.as_str() {
                     let tokenizer = match &mut tokenizer_opt {
@@ -241,7 +241,7 @@ impl MoreLikeThis {
                     let timestamp = value.as_datetime().ok_or_else(|| {
                         TantivyError::InvalidArgument("invalid value".to_string())
                     })?;
-                    let term = Term::from_field_date(field, timestamp);
+                    let term = Term::from_field_date_for_search(field, timestamp);
                     *term_frequencies.entry(term).or_insert(0) += 1;
                 }
             }
@@ -295,7 +295,7 @@ impl MoreLikeThis {
         self.stop_words.contains(&word)
     }
 
-    /// Couputes the score for each term while ignoring not useful terms
+    /// Computes the score for each term while ignoring not useful terms
     fn create_score_term(
         &self,
         searcher: &Searcher,
@@ -2,7 +2,7 @@ use std::ops::Bound;
 
 use super::{prefix_end, PhrasePrefixWeight};
 use crate::query::bm25::Bm25Weight;
-use crate::query::{EnableScoring, Query, RangeQuery, Weight};
+use crate::query::{EnableScoring, InvertedIndexRangeWeight, Query, Weight};
 use crate::schema::{Field, IndexRecordOption, Term};
 
 const DEFAULT_MAX_EXPANSIONS: u32 = 50;
@@ -86,7 +86,7 @@ impl PhrasePrefixQuery {
     ///
     /// This function is the same as [`Query::weight()`] except it returns
     /// a specialized type [`PhraseQueryWeight`] instead of a Boxed trait.
-    /// If the query was only one term long, this returns `None` wherease [`Query::weight`]
+    /// If the query was only one term long, this returns `None` whereas [`Query::weight`]
     /// returns a boxed [`RangeWeight`]
     pub(crate) fn phrase_prefix_query_weight(
         &self,
@@ -145,17 +145,15 @@ impl Query for PhrasePrefixQuery {
                 Bound::Unbounded
             };
 
-            let mut range_query = RangeQuery::new_term_bounds(
-                enable_scoring
-                    .schema()
-                    .get_field_name(self.field)
-                    .to_owned(),
-                self.prefix.1.typ(),
-                &Bound::Included(self.prefix.1.clone()),
-                &end_term,
-            );
-            range_query.limit(self.max_expansions as u64);
-            range_query.weight(enable_scoring)
+            let lower_bound = Bound::Included(self.prefix.1.clone());
+            let upper_bound = end_term;
+
+            Ok(Box::new(InvertedIndexRangeWeight::new(
+                self.field,
+                &lower_bound,
+                &upper_bound,
+                Some(self.max_expansions as u64),
+            )))
         }
     }
 }
@@ -6,6 +6,9 @@ use crate::query::phrase_query::{intersection_count, PhraseScorer};
 use crate::query::Scorer;
 use crate::{DocId, Score};
 
+// MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
+// though, it would be interesting to slim it down if possible.
+#[allow(clippy::large_enum_variant)]
 enum PhraseKind<TPostings: Postings> {
     SinglePrefix {
         position_offset: u32,
@@ -97,6 +100,7 @@ pub struct PhrasePrefixScorer<TPostings: Postings> {
     suffixes: Vec<TPostings>,
     suffix_offset: u32,
     phrase_count: u32,
+    suffix_position_buffer: Vec<u32>,
 }
 
 impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
@@ -140,6 +144,7 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
             suffixes,
             suffix_offset: (max_offset - suffix_pos) as u32,
             phrase_count: 0,
+            suffix_position_buffer: Vec::with_capacity(100),
         };
         if phrase_prefix_scorer.doc() != TERMINATED && !phrase_prefix_scorer.matches_prefix() {
             phrase_prefix_scorer.advance();
@@ -153,7 +158,6 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
 
     fn matches_prefix(&mut self) -> bool {
         let mut count = 0;
-        let mut positions = Vec::new();
         let current_doc = self.doc();
         let pos_matching = self.phrase_scorer.get_intersection();
         for suffix in &mut self.suffixes {
@@ -162,8 +166,8 @@ impl<TPostings: Postings> PhrasePrefixScorer<TPostings> {
             }
             let doc = suffix.seek(current_doc);
             if doc == current_doc {
-                suffix.positions_with_offset(self.suffix_offset, &mut positions);
-                count += intersection_count(pos_matching, &positions);
+                suffix.positions_with_offset(self.suffix_offset, &mut self.suffix_position_buffer);
+                count += intersection_count(pos_matching, &self.suffix_position_buffer);
             }
         }
         self.phrase_count = count as u32;
@@ -5,8 +5,8 @@ use crate::schema::{Field, IndexRecordOption, Term};
 
 /// `PhraseQuery` matches a specific sequence of words.
 ///
-/// For instance the phrase query for `"part time"` will match
-/// the sentence
+/// For instance, the phrase query for `"part time"` will match
+/// the sentence:
 ///
 /// **Alan just got a part time job.**
 ///
@@ -15,7 +15,7 @@ use crate::schema::{Field, IndexRecordOption, Term};
 /// **This is my favorite part of the job.**
 ///
 /// [Slop](PhraseQuery::set_slop) allows leniency in term proximity
-/// for some performance tradeof.
+/// for some performance trade-off.
 ///
 /// Using a `PhraseQuery` on a field requires positions
 /// to be indexed for this field.
@@ -219,8 +219,8 @@ fn intersection_exists_with_slop(
 /// In contrast to the regular algorithm this solves some issues:
 /// - Keep track of the slop so far. Slop is a budget that is spent on the distance between terms.
 /// - When encountering a match between two positions, which position is the best match is unclear
 ///   and depends on intersections afterwards, therefore this algorithm keeps left and right as
 ///   matches, but only counts one.
 ///
 /// This algorithm may return an incorrect count in some cases (e.g. left, right expansion and is
 /// then matches both on the following term.)
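A hedged sketch of the slop setting described in the doc comment above (hypothetical field name; `PhraseQuery::new` and `set_slop` are used as that doc comment implies):

```rust
use tantivy::query::PhraseQuery;
use tantivy::schema::Field;
use tantivy::Term;

fn part_time_phrase(body: Field) -> PhraseQuery {
    let mut query = PhraseQuery::new(vec![
        Term::from_field_text(body, "part"),
        Term::from_field_text(body, "time"),
    ]);
    // Spend a slop budget of 1, tolerating one extra position between the two terms.
    query.set_slop(1);
    query
}
```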
@@ -115,10 +115,10 @@ impl<'a> EnableScoring<'a> {
 ///
 /// So to sum it up :
 /// - a `Query` is a recipe to define a set of documents as well the way to score them.
-/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance
-///   hold statistics about the different term of the query. It is created by the query.
-/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific
-///   [`SegmentReader`]. It is created by the [`Weight`].
+/// - a [`Weight`] is this recipe tied to a specific [`Searcher`]. It may for instance hold
+///   statistics about the different term of the query. It is created by the query.
+/// - a [`Scorer`] is a cursor over the set of matching documents, for a specific [`SegmentReader`].
+///   It is created by the [`Weight`].
 ///
 /// When implementing a new type of `Query`, it is normal to implement a
 /// dedicated `Query`, [`Weight`] and [`Scorer`].
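A hedged sketch of the `Query` -> `Weight` -> `Scorer` pipeline summed up above, using only calls that appear elsewhere in this diff; it assumes a single-segment index and walks the matching docs of segment 0 by hand:

```rust
use tantivy::query::{EnableScoring, Query};
use tantivy::{DocId, DocSet, Searcher, TERMINATED};

fn collect_segment0(query: &dyn Query, searcher: &Searcher) -> tantivy::Result<Vec<DocId>> {
    // Query -> Weight: the recipe gets tied to this searcher (e.g. term statistics).
    let weight = query.weight(EnableScoring::enabled_from_searcher(searcher))?;
    // Weight -> Scorer: a cursor over the matching documents of one segment.
    let mut scorer = weight.scorer(searcher.segment_reader(0u32), 1.0)?;
    let mut docs = Vec::new();
    while scorer.doc() != TERMINATED {
        docs.push(scorer.doc());
        scorer.advance();
    }
    Ok(docs)
}
```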
@@ -2,7 +2,7 @@ use std::fmt;
 use std::ops::Bound;
 
 use crate::query::Occur;
-use crate::schema::{Term, Type};
+use crate::schema::Term;
 use crate::Score;
 
 #[derive(Clone)]
@@ -14,8 +14,6 @@ pub enum LogicalLiteral {
         prefix: bool,
     },
     Range {
-        field: String,
-        value_type: Type,
         lower: Bound<Term>,
         upper: Bound<Term>,
     },
@@ -39,6 +37,34 @@ impl LogicalAst {
         LogicalAst::Boost(Box::new(self), boost)
     }
+
+    pub fn simplify(self) -> LogicalAst {
+        match self {
+            LogicalAst::Clause(clauses) => {
+                let mut new_clauses: Vec<(Occur, LogicalAst)> = Vec::new();
+
+                for (occur, sub_ast) in clauses {
+                    let simplified_sub_ast = sub_ast.simplify();
+
+                    // If clauses below have the same `Occur`, we can pull them up
+                    match simplified_sub_ast {
+                        LogicalAst::Clause(sub_clauses)
+                            if (occur == Occur::Should || occur == Occur::Must)
+                                && sub_clauses.iter().all(|(o, _)| *o == occur) =>
+                        {
+                            for sub_clause in sub_clauses {
+                                new_clauses.push(sub_clause);
+                            }
+                        }
+                        _ => new_clauses.push((occur, simplified_sub_ast)),
+                    }
+                }
+
+                LogicalAst::Clause(new_clauses)
+            }
+            LogicalAst::Leaf(_) | LogicalAst::Boost(_, _) => self,
+        }
+    }
 }
 
 fn occur_letter(occur: Occur) -> &'static str {
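A deliberately simplified, standalone miniature of the flattening idea behind `simplify` (toy types, not tantivy's `LogicalAst`; the real rule only pulls a sub-clause up when all of its children carry the same `Occur` as the parent slot):

```rust
#[derive(Debug, Clone, PartialEq)]
enum Ast {
    Term(&'static str),
    // (is_must, children): a nested clause with the same occur can be pulled up.
    Clause(bool, Vec<Ast>),
}

fn simplify(ast: Ast) -> Ast {
    match ast {
        Ast::Clause(occur, children) => {
            let mut flat = Vec::new();
            for child in children.into_iter().map(simplify) {
                match child {
                    Ast::Clause(child_occur, grandchildren) if child_occur == occur => {
                        flat.extend(grandchildren)
                    }
                    other => flat.push(other),
                }
            }
            Ast::Clause(occur, flat)
        }
        leaf => leaf,
    }
}

fn main() {
    // (+a +(+b +c))  ==>  (+a +b +c), mirroring the query parser test updated below.
    let nested = Ast::Clause(
        true,
        vec![
            Ast::Term("a"),
            Ast::Clause(true, vec![Ast::Term("b"), Ast::Term("c")]),
        ],
    );
    let flat = Ast::Clause(true, vec![Ast::Term("a"), Ast::Term("b"), Ast::Term("c")]);
    assert_eq!(simplify(nested), flat);
}
```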
@@ -58,7 +58,7 @@ pub enum QueryParserError {
     #[error("Invalid query: Only excluding terms given")]
     AllButQueryForbidden,
     /// If no default field is declared, running a query without any
-    /// field specified is forbbidden.
+    /// field specified is forbidden.
     #[error("No default field declared and no field specified in query")]
     NoDefaultFieldDeclared,
     /// The field searched for is not declared
@@ -137,7 +137,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 /// so-called default fields (as set up in the constructor).
 ///
 /// Assuming that the default fields are `body` and `title`, and the query parser is set with
 /// conjunction as a default, our query will be interpreted as.
 /// `(body:Barack OR title:Barack) AND (title:Obama OR body:Obama)`.
 /// By default, all tokenized and indexed fields are default fields.
 ///
@@ -148,8 +148,7 @@ fn trim_ast(logical_ast: LogicalAst) -> Option<LogicalAst> {
 /// `body:Barack OR (body:Barack OR text:Obama)` .
 ///
 /// * boolean operators `AND`, `OR`. `AND` takes precedence over `OR`, so that `a AND b OR c` is
-///   interpreted
-///   as `(a AND b) OR c`.
+///   interpreted as `(a AND b) OR c`.
 ///
 /// * In addition to the boolean operators, the `-`, `+` can help define. These operators are
 ///   sufficient to express all queries using boolean operators. For instance `x AND y OR z` can be
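A hedged end-to-end sketch of the parser behavior documented above (hypothetical `title`/`body` schema; the `TopDocs` collector is assumed to be available from `tantivy::collector`):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::{Index, Searcher};

fn search_top10(index: &Index, searcher: &Searcher) -> tantivy::Result<()> {
    let title = index.schema().get_field("title").unwrap();
    let body = index.schema().get_field("body").unwrap();
    // Both fields are default fields, so `Barack Obama` expands over title and body.
    let query_parser = QueryParser::for_index(index, vec![title, body]);
    let query = query_parser.parse_query("Barack Obama").expect("valid query");
    let top_docs = searcher.search(&query, &TopDocs::with_limit(10))?;
    println!("{} hits", top_docs.len());
    Ok(())
}
```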
@@ -272,8 +271,7 @@ impl QueryParser {
 
     /// Creates a `QueryParser`, given
     /// * an index
-    /// * a set of default fields used to search if no field is specifically defined
-    ///   in the query.
+    /// * a set of default fields used to search if no field is specifically defined in the query.
     pub fn for_index(index: &Index, default_fields: Vec<Field>) -> QueryParser {
         QueryParser::new(index.schema(), default_fields, index.tokenizers().clone())
     }
@@ -379,7 +377,7 @@ impl QueryParser {
         if !err.is_empty() {
             return Err(err.swap_remove(0));
         }
-        Ok(ast)
+        Ok(ast.simplify())
     }
 
     /// Parse the user query into an AST.
@@ -482,16 +480,33 @@ impl QueryParser {
                 });
                 if terms.len() != 1 {
                     return Err(QueryParserError::UnsupportedQuery(format!(
-                        "Range query boundary cannot have multiple tokens: {phrase:?}."
+                        "Range query boundary cannot have multiple tokens: {phrase:?} [{terms:?}]."
                     )));
                 }
                 Ok(terms.into_iter().next().unwrap())
             }
-            FieldType::JsonObject(_) => {
-                // Json range are not supported.
-                Err(QueryParserError::UnsupportedQuery(
-                    "Range query are not supported on json field.".to_string(),
-                ))
+            FieldType::JsonObject(ref json_options) => {
+                let get_term_with_path = || {
+                    Term::from_field_json_path(
+                        field,
+                        json_path,
+                        json_options.is_expand_dots_enabled(),
+                    )
+                };
+                if let Some(term) =
+                    // Try to convert the phrase to a fast value
+                    convert_to_fast_value_and_append_to_json_term(
+                        get_term_with_path(),
+                        phrase,
+                        false,
+                    )
+                {
+                    Ok(term)
+                } else {
+                    let mut term = get_term_with_path();
+                    term.append_type_and_str(phrase);
+                    Ok(term)
+                }
             }
             FieldType::Facet(_) => match Facet::from_text(phrase) {
                 Ok(facet) => Ok(Term::from_facet(field, &facet)),
@@ -553,7 +568,7 @@ impl QueryParser {
             }
             FieldType::Date(_) => {
                 let dt = OffsetDateTime::parse(phrase, &Rfc3339)?;
-                let dt_term = Term::from_field_date(field, DateTime::from_utc(dt));
+                let dt_term = Term::from_field_date_for_search(field, DateTime::from_utc(dt));
                 Ok(vec![LogicalLiteral::Term(dt_term)])
             }
             FieldType::Str(ref str_options) => {
@@ -685,8 +700,8 @@ impl QueryParser {
     ///
     /// The terms are identified by a triplet:
     /// - tantivy field
-    /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON
-    ///   object by naturally extending the json field name with a "." separated field_path
+    /// - field_path: tantivy has JSON fields. It is possible to target a member of a JSON object by
+    ///   naturally extending the json field name with a "." separated field_path
     /// - field_phrase: the phrase that is being searched.
     ///
     /// The literal identifies the targeted field by a so-called *full field path*,
@@ -790,8 +805,6 @@ impl QueryParser {
                 let (field, json_path) = try_tuple!(self
                     .split_full_path(&full_path)
                     .ok_or_else(|| QueryParserError::FieldDoesNotExist(full_path.clone())));
-                let field_entry = self.schema.get_field_entry(field);
-                let value_type = field_entry.field_type().value_type();
                 let mut errors = Vec::new();
                 let lower = match self.resolve_bound(field, json_path, &lower) {
                     Ok(bound) => bound,
@@ -809,15 +822,11 @@ impl QueryParser {
                 };
                 if lower == Bound::Unbounded && upper == Bound::Unbounded {
                     // this range is useless, either because a user requested [* TO *], or because
-                    // we failed to parse something. Either way, there is no point emiting it
+                    // we failed to parse something. Either way, there is no point emitting it
                     return (None, errors);
                 }
-                let logical_ast = LogicalAst::Leaf(Box::new(LogicalLiteral::Range {
-                    field: self.schema.get_field_name(field).to_string(),
-                    value_type,
-                    lower,
-                    upper,
-                }));
+                let logical_ast =
+                    LogicalAst::Leaf(Box::new(LogicalLiteral::Range { lower, upper }));
                 (Some(logical_ast), errors)
             }
             UserInputLeaf::Set {
@@ -884,14 +893,7 @@ fn convert_literal_to_query(
                 Box::new(PhraseQuery::new_with_offset_and_slop(terms, slop))
             }
         }
-        LogicalLiteral::Range {
-            field,
-            value_type,
-            lower,
-            upper,
-        } => Box::new(RangeQuery::new_term_bounds(
-            field, value_type, &lower, &upper,
-        )),
+        LogicalLiteral::Range { lower, upper } => Box::new(RangeQuery::new(lower, upper)),
         LogicalLiteral::Set { elements, .. } => Box::new(TermSetQuery::new(elements)),
         LogicalLiteral::All => Box::new(AllQuery),
     }
@@ -962,7 +964,8 @@ fn generate_literals_for_json_object(
         || Term::from_field_json_path(field, json_path, json_options.is_expand_dots_enabled());
 
     // Try to convert the phrase to a fast value
-    if let Some(term) = convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase)
+    if let Some(term) =
+        convert_to_fast_value_and_append_to_json_term(get_term_with_path(), phrase, true)
     {
         logical_literals.push(LogicalLiteral::Term(term));
     }
@@ -1136,8 +1139,8 @@ mod test {
         let query = make_query_parser().parse_query("title:[A TO B]").unwrap();
         assert_eq!(
             format!("{query:?}"),
-            "RangeQuery { field: \"title\", value_type: Str, lower_bound: Included([97]), \
-             upper_bound: Included([98]), limit: None }"
+            "RangeQuery { bounds: BoundsRange { lower_bound: Included(Term(field=0, type=Str, \
+             \"a\")), upper_bound: Included(Term(field=0, type=Str, \"b\")) } }"
         );
     }
 
@@ -1304,7 +1307,7 @@ mod test {
     }
 
     #[test]
-    fn test_json_field_query_with_espaced_dot() {
+    fn test_json_field_query_with_escaped_dot() {
        assert_eq!(
            extract_query_term_json_path(r#"json.k8s.node.name:hello"#),
            "k8s\u{1}node\u{1}name\0shello"
@@ -1434,7 +1437,7 @@ mod test {
        );
        test_parse_query_to_logical_ast_helper(
            "(+title:a +title:b) title:c",
-            r#"(+(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b")) +Term(field=0, type=Str, "c"))"#,
+            r#"(+Term(field=0, type=Str, "a") +Term(field=0, type=Str, "b") +Term(field=0, type=Str, "c"))"#,
            true,
        );
    }
@@ -1470,7 +1473,7 @@ mod test {
    pub fn test_parse_query_to_ast_two_terms() {
        test_parse_query_to_logical_ast_helper(
            "title:a b",
-            r#"(Term(field=0, type=Str, "a") (Term(field=0, type=Str, "b") Term(field=1, type=Str, "b")))"#,
+            r#"(Term(field=0, type=Str, "a") Term(field=0, type=Str, "b") Term(field=1, type=Str, "b"))"#,
            false,
        );
        test_parse_query_to_logical_ast_helper(
@@ -1702,6 +1705,21 @@ mod test {
        );
    }
 
+    #[test]
+    pub fn test_parse_query_negative() {
+        test_parse_query_to_logical_ast_helper(
+            "title:b -title:a",
+            r#"(+Term(field=0, type=Str, "b") -Term(field=0, type=Str, "a"))"#,
+            true,
+        );
+
+        test_parse_query_to_logical_ast_helper(
+            "title:b -(-title:a -title:c)",
+            r#"(+Term(field=0, type=Str, "b") -(-Term(field=0, type=Str, "a") -Term(field=0, type=Str, "c")))"#,
+            true,
+        );
+    }
+
    #[test]
    pub fn test_query_parser_hyphen() {
        test_parse_query_to_logical_ast_helper(
@@ -1815,7 +1833,8 @@ mod test {
            \"bad\"))], prefix: (2, Term(field=0, type=Str, \"wo\")), max_expansions: 50 }), \
            (Should, PhrasePrefixQuery { field: Field(1), phrase_terms: [(0, Term(field=1, \
            type=Str, \"big\")), (1, Term(field=1, type=Str, \"bad\"))], prefix: (2, \
-            Term(field=1, type=Str, \"wo\")), max_expansions: 50 })] }"
+            Term(field=1, type=Str, \"wo\")), max_expansions: 50 })], \
+            minimum_number_should_match: 1 }"
        );
    }
 
@@ -1880,7 +1899,8 @@ mod test {
            format!("{query:?}"),
            "BooleanQuery { subqueries: [(Should, FuzzyTermQuery { term: Term(field=0, \
            type=Str, \"abc\"), distance: 1, transposition_cost_one: true, prefix: false }), \
-            (Should, TermQuery(Term(field=1, type=Str, \"abc\")))] }"
+            (Should, TermQuery(Term(field=1, type=Str, \"abc\")))], \
+            minimum_number_should_match: 1 }"
        );
    }
 
@@ -1897,7 +1917,8 @@ mod test {
            format!("{query:?}"),
            "BooleanQuery { subqueries: [(Should, TermQuery(Term(field=0, type=Str, \
            \"abc\"))), (Should, FuzzyTermQuery { term: Term(field=1, type=Str, \"abc\"), \
-            distance: 2, transposition_cost_one: false, prefix: true })] }"
+            distance: 2, transposition_cost_one: false, prefix: true })], \
+            minimum_number_should_match: 1 }"
        );
    }
}
@@ -49,10 +49,10 @@ pub(crate) struct RangeDocSet<T> {
     ///
     /// There are two patterns.
     /// - We do a full scan. => We can load large chunks. We don't know in advance if seek call
     ///   will come, so we start with small chunks
     /// - We load docs, interspersed with seek calls. When there are big jumps in the seek, we
-    ///   should load small chunks. When the seeks are small, we can employ the same strategy as on a
-    ///   full scan.
+    ///   should load small chunks. When the seeks are small, we can employ the same strategy as on
+    ///   a full scan.
     fetch_horizon: u32,
     /// Current batch of loaded docs.
     loaded_docs: VecCursor,
@@ -180,10 +180,12 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
 
 #[cfg(test)]
 mod tests {
+    use std::ops::Bound;
+
     use crate::collector::Count;
     use crate::directory::RamDirectory;
     use crate::query::RangeQuery;
-    use crate::{schema, IndexBuilder, TantivyDocument};
+    use crate::{schema, IndexBuilder, TantivyDocument, Term};
 
     #[test]
     fn range_query_fast_optional_field_minimum() {
@@ -218,10 +220,9 @@ mod tests {
         let reader = index.reader().unwrap();
         let searcher = reader.searcher();
 
-        let query = RangeQuery::new_u64_bounds(
-            "score".to_string(),
-            std::ops::Bound::Included(70),
-            std::ops::Bound::Unbounded,
+        let query = RangeQuery::new(
+            Bound::Included(Term::from_field_u64(score_field, 70)),
+            Bound::Unbounded,
        );
 
        let count = searcher.search(&query, &Count).unwrap();
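For reference, the Term-bound API used in the updated test above can express arbitrary half-open ranges as well; a hedged sketch with a hypothetical u64 field:

```rust
use std::ops::Bound;

use tantivy::query::RangeQuery;
use tantivy::schema::Field;
use tantivy::Term;

/// Matches documents whose `score` value lies in [10, 70).
fn score_between(score_field: Field) -> RangeQuery {
    RangeQuery::new(
        Bound::Included(Term::from_field_u64(score_field, 10)),
        Bound::Excluded(Term::from_field_u64(score_field, 70)),
    )
}
```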
Some files were not shown because too many files have changed in this diff.