Compare commits

...

53 Commits

Author SHA1 Message Date
trinity Pointard
c73d4a7166 reorder parsing of aggregations results 2025-02-26 18:06:09 +01:00
SteveLauC
c48c649436 refactor: use std AtomicU64 and remove wrapper (#2585) 2025-02-24 03:56:15 +01:00
Paul Masurel
58c0739953 Merge pull request #2581 from quickwit-oss/merge_dict_column_repro
use usize in bitpacker
2025-02-21 10:53:07 +09:00
Pascal Seitz
e7daf69de9 use usize in bitpacker
use usize in bitpacker to enable larger columns in the columnar store

Godbolt comparison with u32 vs u64 for get access: https://godbolt.org/z/cjf7nenYP

Add a mini-tool to inspect columnar files created by tantivy. (very basic functionality which can be extended later)
2025-02-20 15:39:10 +01:00
trinity-1686a
f060e86bc6 Merge pull request #2578 from quickwit-oss/1686a/buildable-histo-agg
make DateHistogramAggregationReq buildable
2025-02-18 15:30:54 +01:00
trinity Pointard
0368162ef0 make DateHistogramAggregationReq buildable 2025-02-18 11:45:24 +01:00
trinity-1686a
e843c71015 Merge pull request #2568 from quickwit-oss/trinity/wildcard-query-parser
allow term starting with wildcard in query parser
2025-02-12 16:47:25 +01:00
trinity Pointard
5cea16ef9f improve handling of spcial char after exist query 2025-01-22 16:04:31 +01:00
dependabot[bot]
4aa8cd2470 Update downcast-rs requirement from 1.2.1 to 2.0.1 (#2566)
Updates the requirements on [downcast-rs](https://github.com/marcianx/downcast-rs) to permit the latest version.
- [Changelog](https://github.com/marcianx/downcast-rs/blob/master/CHANGELOG.md)
- [Commits](https://github.com/marcianx/downcast-rs/compare/v1.2.1...v2.0.1)

---
updated-dependencies:
- dependency-name: downcast-rs
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-22 10:32:24 +01:00
trinity Pointard
4d4ee1b0ac allow term starting with wildcard in query parser 2025-01-15 10:27:48 +01:00
dependabot[bot]
43c89b4360 Update itertools requirement from 0.13.0 to 0.14.0 (#2563)
Updates the requirements on [itertools](https://github.com/rust-itertools/itertools) to permit the latest version.
- [Changelog](https://github.com/rust-itertools/itertools/blob/master/CHANGELOG.md)
- [Commits](https://github.com/rust-itertools/itertools/compare/v0.13.0...v0.14.0)

---
updated-dependencies:
- dependency-name: itertools
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-01-08 17:11:46 +01:00
trinity-1686a
d281ca3e65 Merge pull request #2559 from quickwit-oss/trinity/sstable-partial-automaton
allow warming partially an sstable for an automaton
2025-01-08 16:35:35 +01:00
trinity Pointard
be17daf658 split iterator 2025-01-08 16:24:34 +01:00
trinity Pointard
6ca84a61fa make termdict always clone 2025-01-08 16:19:54 +01:00
trinity Pointard
037d12c9c9 fix deadlocking on automaton warmup 2025-01-06 11:58:58 +01:00
Remi Dettai
71cf19870b Exist queries match subpath fields (#2558)
* Exist queries match subpath fields

* Make subpath check optional

* Add async subpath listing
2025-01-06 10:17:39 +01:00
trinity Pointard
175a529c41 use executor for cpu-heavy sstable decompression for automaton 2025-01-03 19:14:07 +01:00
trinity Pointard
fe0c7c5408 change rangebound style 2025-01-02 11:56:05 +01:00
Harrison Burt
148594f0f9 Improve IndexWriter customisation via builder (#2562)
* Improve `IndexWriter` customisation via builder

* Remove change noise from PR

* Correct documentation

* Resolve comments and add test
2025-01-02 09:43:22 +01:00
dependabot[bot]
8edb439440 Update rustc-hash requirement from 1.1.0 to 2.1.0 (#2551)
---
updated-dependencies:
- dependency-name: rustc-hash
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-26 10:25:05 +01:00
trinity Pointard
dfff5f3bcb rename merge_holes_under => merge_holes_under_bytes 2024-12-23 16:17:44 +01:00
trinity-1686a
ebf4d84553 add comment about cpu-intensive operation in async context 2024-12-20 12:23:49 +01:00
trinity-1686a
42efc7f7c8 clippy 2024-12-20 11:00:11 +01:00
trinity-1686a
192395c311 attempt at simplifying can_block_match_automaton 2024-12-20 10:25:38 +01:00
trinity-1686a
a1447cc9c2 remove breaking change in sstable public api 2024-12-19 17:30:05 +01:00
trinity-1686a
c39d91f827 Merge pull request #2547 from quickwit-oss/trinity/count-str
add support for counting non integer in aggregation
2024-12-17 15:27:30 +01:00
trinity Pointard
32b6e9711b add tests 2024-12-13 16:06:24 +01:00
trinity-1686a
24c5dc2398 allow warming up automaton 2024-12-10 13:32:12 +01:00
trinity-1686a
9e2ddec4b3 merge adjacent block when building delta for automaton 2024-12-10 13:32:12 +01:00
trinity-1686a
1f6a8e74bb support iterating over partially loaded sstable 2024-12-10 13:32:12 +01:00
trinity-1686a
7e901f523b get iter for blocks of sstable matching automaton 2024-12-10 13:32:12 +01:00
trinity-1686a
3c30a41c14 add helper to figure if block can match automaton 2024-12-10 13:32:12 +01:00
dependabot[bot]
0f99d4f420 Update measure_time requirement from 0.8.2 to 0.9.0 (#2557)
---
updated-dependencies:
- dependency-name: measure_time
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-12-09 21:39:01 +01:00
Pierre Barre
6e02c5cb25 Make NUM_MERGE_THREADS configurable (#2535)
* Make `NUM_MERGE_THREADS` configurable

* Remove unused import

* Reword comment src/index/index.rs

Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>

---------

Co-authored-by: PSeitz <PSeitz@users.noreply.github.com>
2024-12-09 16:53:11 +08:00
PSeitz
876a579e5d queryparser: add field respecification test (#2550) 2024-12-02 14:17:12 +01:00
PSeitz
4c52499622 clippy (#2549) 2024-11-29 16:08:21 +08:00
trinity-1686a
0bac391291 add support for counting non integer in aggregation 2024-11-28 19:52:47 +01:00
PSeitz
52d4e81e70 update CHANGELOG (#2546) 2024-11-27 20:49:35 +08:00
dependabot[bot]
c71ea7b2ef Update thiserror requirement from 1.0.30 to 2.0.1 (#2542)
Updates the requirements on [thiserror](https://github.com/dtolnay/thiserror) to permit the latest version.
- [Release notes](https://github.com/dtolnay/thiserror/releases)
- [Commits](https://github.com/dtolnay/thiserror/compare/1.0.30...2.0.1)

---
updated-dependencies:
- dependency-name: thiserror
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2024-11-09 08:08:34 +08:00
Paul Masurel
c35a782747 Updating rustc-hash and clippy fixes (#2532)
* Updating rustc-hash and clippy fixes

* fix terms_aggregation_min_doc_count_special_case

---------

Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2024-11-01 13:46:26 +08:00
dependabot[bot]
c66af2c0a9 Update binggan requirement from 0.12.0 to 0.14.0 (#2530)
* Update binggan requirement from 0.12.0 to 0.14.0

---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

* fix build

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2024-10-24 09:41:35 +08:00
Joan Antoni RE
f9ac055847 Fix some links in architecture docs (#2528) 2024-10-23 21:06:54 +09:00
PSeitz
21d057059e clippy (#2527)
* clippy

* clippy

* clippy

* clippy

* convert allow to expect and remove unused

* cargo fmt

* cleanup

* export sample

* clippy
2024-10-22 09:26:54 +08:00
PSeitz
dca508b4ca remove read_postings_no_deletes (#2526)
closes #2525
2024-10-22 09:52:43 +09:00
PSeitz
aebae9965d add RegexPhraseQuery (#2516)
* add RegexPhraseQuery

RegexPhraseQuery supports phrase queries with regex. It supports regex
and wildcards. E.g. a query with wildcards:
"b* b* wolf" matches "big bad wolf"
Slop is supported as well:
"b* wolf"~2 matches "big bad wolf"

Regex queries may match a lot of terms where we still need to
keep track which term hit to load the positions.
The phrase query algorithm groups terms by their frequency
together in the union to prefilter groups early.

This PR comes with some new datastructures:

SimpleUnion - A union docset for a list of docsets. It doesn't do any
caching and is therefore well suited for datasets with lots of skipping.
(phrase search, but intersections in general)

LoadedPostings - Like SegmentPostings, but all docs and positions are loaded in
memory. SegmentPostings uses 1840 bytes per instance with its caches,
which is equivalent to 460 docids.
LoadedPostings is used for terms which have less than 100 docs.
LoadedPostings is only used to reduce memory consumption.

BitSetPostingUnion - Creates a `Posting` that uses the bitset for docid
hits and the docsets for positions. The BitSet is the precalculated
union of the docsets
In the RegexPhraseQuery there is a size limit of 512 docsets per PreAggregatedUnion,
before creating a new one.

Renamed Union to BufferedUnionScorer
Added proptests to test different union types.

* cleanup

* use Box instead of Vec

* use RefCell instead of term_freq(&mut)

* remove wildcard mode

* move RefCell to outer

* clippy
2024-10-21 18:29:17 +08:00
Marvin
e7e3e3f44c make casing in docs more consistent (#2524)
* make casing in docs more consistent

* more

* lowercase tantivy
2024-10-21 17:59:41 +09:00
PSeitz
2f2db16ec1 store DateTime as nanoseconds in doc store (#2486)
* store DateTime as nanoseconds in doc store

The doc store DateTime was truncated to microseconds previously. This
removes this truncation, while still keeping backwards compatibility.

This is done by adding the trait `ConfigurableBinarySerializable`, which
works like `BinarySerializable`, but with a config that allows de/serialize
as different date time precision currently.

bump version format to 7.
add compat test to check the date time truncation.

* remove configurable binary serialize, add enum for doc store version

* test doc store version ord
2024-10-18 10:50:20 +08:00
Paul Masurel
d152e29687 Fixed citation (#2523) 2024-10-17 10:19:50 +09:00
Paul Masurel
285bcc25c9 Added citation.cff (#2522) 2024-10-17 09:43:35 +09:00
PSeitz
7b65ad922d use binggan for stacker bench (#2492)
* use binggan for stacker bench

```
alice (num terms: 174693)
hashmap                    Memory: 1.3 MB     Avg: 367.19 MiB/s (-1.34%)    Median: 368.10 MiB/s (-1.34%)    [378.75 MiB/s .. 352.81 MiB/s]
hasmap with postings       Memory: 2.4 MB     Avg: 237.29 MiB/s (-2.19%)    Median: 240.22 MiB/s (-1.61%)    [248.26 MiB/s .. 210.66 MiB/s]
fxhashmap ref postings     Memory: 2.9 MB     Avg: 171.94 MiB/s (-3.22%)    Median: 174.13 MiB/s (-2.69%)    [185.94 MiB/s .. 152.43 MiB/s]
fxhasmap owned postings    Memory: 3.5 MB     Avg: 96.993 MiB/s (-4.20%)    Median: 97.410 MiB/s (-4.48%)    [102.78 MiB/s .. 82.745 MiB/s]
numbers unique 100k
hashmap                 Memory: 5.2 MB     Avg: 334.17 MiB/s (-3.06%)    Median: 352.61 MiB/s (+0.77%)    [362.60 MiB/s .. 213.03 MiB/s]
hasmap with postings    Memory: 6.3 MB     Avg: 316.96 MiB/s (-0.02%)    Median: 325.16 MiB/s (-0.04%)    [338.36 MiB/s .. 218.60 MiB/s]
zipfs numbers 100k
hashmap                 Memory: 1.3 MB     Avg: 1.2342 GiB/s (+2.87%)    Median: 1.2677 GiB/s (+4.66%)    [1.3130 GiB/s .. 915.93 MiB/s]
hasmap with postings    Memory: 2.4 MB     Avg: 485.16 MiB/s (+2.68%)    Median: 494.70 MiB/s (+4.42%)    [505.31 MiB/s .. 413.14 MiB/s]
numbers unique 1mio
hashmap                 Memory: 35.7 MB     Avg: 169.68 MiB/s (-1.08%)    Median: 166.80 MiB/s (-3.87%)    [201.33 MiB/s .. 154.26 MiB/s]
hasmap with postings    Memory: 39.8 MB     Avg: 149.49 MiB/s (-3.07%)    Median: 150.85 MiB/s (-1.45%)    [160.76 MiB/s .. 130.94 MiB/s]
zipfs numbers 1mio
hashmap                 Memory: 1.3 MB     Avg: 1.2185 GiB/s (-2.33%)     Median: 1.2291 GiB/s (-2.33%)     [1.2905 GiB/s .. 1.0742 GiB/s]
hasmap with postings    Memory: 5.5 MB     Avg: 358.43 MiB/s (-11.63%)    Median: 356.95 MiB/s (-12.85%)    [444.94 MiB/s .. 302.46 MiB/s]
numbers unique 2mio
hashmap                 Memory: 70.3 MB     Avg: 163.65 MiB/s (+8.37%)    Median: 162.83 MiB/s (+8.80%)    [190.20 MiB/s .. 144.70 MiB/s]
hasmap with postings    Memory: 78.6 MB     Avg: 148.00 MiB/s (+7.75%)    Median: 151.53 MiB/s (+9.11%)    [166.92 MiB/s .. 120.09 MiB/s]
zipfs numbers 2mio
hashmap                 Memory: 1.3 MB     Avg: 1.2535 GiB/s (+2.59%)    Median: 1.2654 GiB/s (+0.36%)    [1.2938 GiB/s .. 1.0592 GiB/s]
hasmap with postings    Memory: 9.7 MB     Avg: 377.96 MiB/s (-4.94%)    Median: 381.82 MiB/s (-3.67%)    [426.14 MiB/s .. 335.66 MiB/s]
numbers unique 5mio
hashmap                 Memory: 277.9 MB     Avg: 121.30 MiB/s (+2.00%)    Median: 121.99 MiB/s (+2.99%)    [132.51 MiB/s .. 110.32 MiB/s]
hasmap with postings    Memory: 295.7 MB     Avg: 114.23 MiB/s (+2.13%)    Median: 115.26 MiB/s (+2.94%)    [124.08 MiB/s .. 103.38 MiB/s]
zipfs numbers 5mio
hashmap                 Memory: 1.3 MB      Avg: 1.2326 GiB/s (+0.63%)    Median: 1.2400 GiB/s (+0.71%)    [1.2755 GiB/s .. 1.0923 GiB/s]
hasmap with postings    Memory: 25.4 MB     Avg: 360.49 MiB/s (+1.07%)    Median: 363.44 MiB/s (+1.27%)    [404.88 MiB/s .. 300.38 MiB/s]
```

* rename bench

* update binggan

* rename to HASHMAP_CAPACITY
2024-10-16 11:41:33 +08:00
dependabot[bot]
99be20cedd Update binggan requirement from 0.10.0 to 0.12.0 (#2519)
* Update binggan requirement from 0.10.0 to 0.12.0

---
updated-dependencies:
- dependency-name: binggan
  dependency-type: direct:production
...

Signed-off-by: dependabot[bot] <support@github.com>

* fix build

---------

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
Co-authored-by: Pascal Seitz <pascal.seitz@gmail.com>
2024-10-16 11:36:04 +08:00
Bruce Mitchener
5f026901b8 Update MSRV to 1.75 (#2515)
This is required by the `fs4` dependency. There are other
things that need something later than 1.66.

Both quickwit and the Python binding already require something
newer.
2024-10-16 10:32:16 +08:00
baishen
6dfa2df06f fix OwnedBytes debug panic (#2512) 2024-10-16 10:31:40 +08:00
163 changed files with 3596 additions and 991 deletions

View File

@@ -46,7 +46,7 @@ The file of a segment has the format
```segment-id . ext```
The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.
A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.
@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.
Tantivy's document follows a very strict schema, decided before building any index.
The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
Depending on the type of the field, you can decide to

View File

@@ -1,11 +1,12 @@
Tantivy 0.23 - Unreleased
================================
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21.
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21. The new minimum rust version will be 1.75.
#### Bugfixes
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)
- fix `OwnedBytes` debug panic [#2512](https://github.com/quickwit-oss/tantivy/pull/2512)(@b41sh)
#### Breaking API Changes
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)
@@ -35,7 +36,15 @@ Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
- feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)
- **Optional Index in Multivalue Columnar Index** For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case). This is alleviated by placing an optional index in the multivalued index to mark documents that have values. This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
- **RegexPhraseQuery**
`RegexPhraseQuery` supports phrase queries with regex. E.g. query "b.* b.* wolf" matches "big bad wolf". Slop is supported as well: "b.* wolf"~2 matches "big bad wolf" [#2516](https://github.com/quickwit-oss/tantivy/pull/2516)(@PSeitz)
- **Optional Index in Multivalue Columnar Index**
For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case).
This is alleviated by placing an optional index in the multivalued index to mark documents that have values.
This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)
- **Store DateTime as nanoseconds in doc store** DateTime in the doc store was truncated to microseconds previously. This removes this truncation, while still keeping backwards compatibility. [#2486](https://github.com/quickwit-oss/tantivy/pull/2486)(@PSeitz)
- **Performace/Memory**
- lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
@@ -57,12 +66,13 @@ Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
- use bingang for agg benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)(@PSeitz)
- use bingang for agg and stacker benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)[#2492](https://github.com/quickwit-oss/tantivy/pull/2492)(@PSeitz)
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
- remove read_postings_no_deletes [#2526](https://github.com/quickwit-oss/tantivy/pull/2526)(@PSeitz)
Tantivy 0.22
================================
@@ -717,7 +727,7 @@ Tantivy 0.4.0
- Raise the limit of number of fields (previously 256 fields) (@fulmicoton)
- Removed u32 fields. They are replaced by u64 and i64 fields (#65) (@fulmicoton)
- Optimized skip in SegmentPostings (#130) (@lnicola)
- Replacing rustc_serialize by serde. Kudos to @KodrAus and @lnicola
- Replacing rustc_serialize by serde. Kudos to benchmark@KodrAus and @lnicola
- Using error-chain (@KodrAus)
- QueryParser: (@fulmicoton)
- Explicit error returned when searched for a term that is not indexed

10
CITATION.cff Normal file
View File

@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- alias: Quickwit Inc.
website: "https://quickwit.io"
title: "tantivy"
version: 0.22.0
doi: 10.5281/zenodo.13942948
date-released: 2024-10-17
url: "https://github.com/quickwit-oss/tantivy"

View File

@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.66"
rust-version = "1.75"
exclude = ["benches/*.json", "benches/*.txt"]
[dependencies]
@@ -38,13 +38,13 @@ levenshtein_automata = "0.2.1"
uuid = { version = "1.0.0", features = ["v4", "serde"] }
crossbeam-channel = "0.5.4"
rust-stemmers = "1.2.0"
downcast-rs = "1.2.1"
downcast-rs = "2.0.1"
bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.35", features = ["serde-well-known"] }
@@ -52,9 +52,10 @@ smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"
fastdivide = "0.4.0"
itertools = "0.13.0"
measure_time = "0.8.2"
itertools = "0.14.0"
measure_time = "0.9.0"
arc-swap = "1.5.0"
bon = "3.3.1"
columnar = { version = "0.3", path = "./columnar", package = "tantivy-columnar" }
sstable = { version = "0.3", path = "./sstable", package = "tantivy-sstable", optional = true }
@@ -66,13 +67,14 @@ tokenizer-api = { version = "0.3", path = "./tokenizer-api", package = "tantivy-
sketches-ddsketch = { version = "0.3.0", features = ["use_serde"] }
hyperloglogplus = { version = "0.4.1", features = ["const-loop"] }
futures-util = { version = "0.3.28", optional = true }
futures-channel = { version = "0.3.28", optional = true }
fnv = "1.0.7"
[target.'cfg(windows)'.dependencies]
winapi = "0.3.9"
[dev-dependencies]
binggan = "0.10.0"
binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
@@ -120,7 +122,7 @@ zstd-compression = ["zstd"]
failpoints = ["fail", "fail/failpoints"]
unstable = [] # useful for benches.
quickwit = ["sstable", "futures-util"]
quickwit = ["sstable", "futures-util", "futures-channel"]
# Compares only the hash of a string when indexing data.
# Increases indexing speed, but may lead to extremely rare missing terms, when there's a hash collision.

View File

@@ -1,3 +1,4 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;
@@ -19,7 +20,6 @@ macro_rules! register {
($runner:expr, $func:ident) => {
$runner.register(stringify!($func), move |index| {
$func(index);
None
})
};
}
@@ -45,7 +45,8 @@ fn main() {
}
fn bench_agg(mut group: InputGroup<Index>) {
group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
register!(group, average_u64);
register!(group, average_f64);
register!(group, average_f64_u64);

View File

@@ -94,14 +94,14 @@ impl BitUnpacker {
#[inline]
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
let addr_in_bits = idx * self.num_bits;
let addr = (addr_in_bits >> 3) as usize;
let addr_in_bits = idx as usize * self.num_bits as usize;
let addr = addr_in_bits >> 3;
if addr + 8 > data.len() {
if self.num_bits == 0 {
return 0;
}
let bit_shift = addr_in_bits & 7;
return self.get_slow_path(addr, bit_shift, data);
return self.get_slow_path(addr, bit_shift as u32, data);
}
let bit_shift = addr_in_bits & 7;
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();

View File

@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];
impl FilterImplPerInstructionSet {
#[allow(unused_variables)]
#[inline]
#[allow(unused_variables)] // on non-x86_64, code is unused.
fn from(code: u8) -> FilterImplPerInstructionSet {
#[cfg(target_arch = "x86_64")]
if code == FilterImplPerInstructionSet::AVX2 as u8 {

View File

@@ -9,7 +9,7 @@ description = "column oriented storage for tantivy"
categories = ["database-implementations", "data-structures", "compression"]
[dependencies]
itertools = "0.13.0"
itertools = "0.14.0"
fastdivide = "0.4.0"
stacker = { version= "0.3", path = "../stacker", package="tantivy-stacker"}
@@ -17,13 +17,13 @@ sstable = { version= "0.3", path = "../sstable", package = "tantivy-sstable" }
common = { version= "0.7", path = "../common", package = "tantivy-common" }
tantivy-bitpacker = { version= "0.6", path = "../bitpacker/" }
serde = "1.0.152"
downcast-rs = "1.2.0"
downcast-rs = "2.0.1"
[dev-dependencies]
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.10.0"
binggan = "0.14.0"
[[bench]]
name = "bench_merge"

View File

@@ -42,7 +42,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
}
}
black_box(sum);
None
});
runner.register("access_first_vals", |column| {
let mut sum = 0;
@@ -63,7 +62,6 @@ fn bench_group(mut runner: InputGroup<Column>) {
}
black_box(sum);
None
});
runner.run();
}

View File

@@ -1,6 +1,6 @@
pub mod common;
use binggan::{black_box, BenchRunner};
use binggan::BenchRunner;
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;
@@ -29,7 +29,7 @@ fn main() {
add_combo(Card::Multi, Card::Dense);
add_combo(Card::Multi, Card::Sparse);
let runner: BenchRunner = BenchRunner::new();
let mut runner: BenchRunner = BenchRunner::new();
let mut group = runner.new_group();
for (input_name, columnar_readers) in inputs.iter() {
group.register_with_input(

View File

@@ -0,0 +1,18 @@
[package]
name = "tantivy-columnar-inspect"
version = "0.1.0"
edition = "2021"
license = "MIT"
[dependencies]
tantivy = {path="../..", package="tantivy"}
columnar = {path="../", package="tantivy-columnar"}
common = {path="../../common", package="tantivy-common"}
[workspace]
members = []
[profile.release]
debug = true
#debug-assertions = true
#overflow-checks = true

View File

@@ -0,0 +1,54 @@
use columnar::ColumnarReader;
use common::file_slice::{FileSlice, WrapFile};
use std::io;
use std::path::Path;
use tantivy::directory::footer::Footer;
fn main() -> io::Result<()> {
println!("Opens a columnar file written by tantivy and validates it.");
let path = std::env::args().nth(1).unwrap();
let path = Path::new(&path);
println!("Reading {:?}", path);
let _reader = open_and_validate_columnar(path.to_str().unwrap())?;
Ok(())
}
pub fn validate_columnar_reader(reader: &ColumnarReader) {
let num_rows = reader.num_rows();
println!("num_rows: {}", num_rows);
let columns = reader.list_columns().unwrap();
println!("num columns: {:?}", columns.len());
for (col_name, dynamic_column_handle) in columns {
let col = dynamic_column_handle.open().unwrap();
match col {
columnar::DynamicColumn::Bool(_)
| columnar::DynamicColumn::I64(_)
| columnar::DynamicColumn::U64(_)
| columnar::DynamicColumn::F64(_)
| columnar::DynamicColumn::IpAddr(_)
| columnar::DynamicColumn::DateTime(_)
| columnar::DynamicColumn::Bytes(_) => {}
columnar::DynamicColumn::Str(str_column) => {
let num_vals = str_column.ords().values.num_vals();
let num_terms_dict = str_column.num_terms() as u64;
let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
for ord in str_column.ords().values.iter() {
assert!(ord < num_terms_dict);
}
}
}
}
}
/// Opens a columnar file that was written by tantivy and validates it.
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
let reader = ColumnarReader::open(slice).unwrap();
validate_columnar_reader(&reader);
Ok(reader)
}

View File

@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
&'a self,
docs: &'a [u32],
accessor: &Column<T>,
) -> impl Iterator<Item = (DocId, T)> + '_ {
) -> impl Iterator<Item = (DocId, T)> + 'a {
if accessor.index.get_cardinality().is_full() {
docs.iter().cloned().zip(self.val_cache.iter().cloned())
} else {

View File

@@ -58,7 +58,7 @@ struct ShuffledIndex<'a> {
merge_order: &'a ShuffleMergeOrder,
}
impl<'a> Iterable<u32> for ShuffledIndex<'a> {
impl Iterable<u32> for ShuffledIndex<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(
self.merge_order
@@ -127,7 +127,7 @@ fn integrate_num_vals(num_vals: impl Iterator<Item = u32>) -> impl Iterator<Item
)
}
impl<'a> Iterable<u32> for ShuffledMultivaluedIndex<'a> {
impl Iterable<u32> for ShuffledMultivaluedIndex<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_vals_per_row = iter_num_values(self.column_indexes, self.merge_order);
Box::new(integrate_num_vals(num_vals_per_row))

View File

@@ -123,7 +123,7 @@ fn get_num_values_iterator<'a>(
}
}
impl<'a> Iterable<u32> for StackedStartOffsets<'a> {
impl Iterable<u32> for StackedStartOffsets<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
let num_values_it = (0..self.column_indexes.len()).flat_map(|columnar_id| {
let num_docs = self.stack_merge_order.columnar_range(columnar_id).len() as u32;

View File

@@ -86,7 +86,7 @@ pub struct OptionalIndex {
block_metas: Arc<[BlockMeta]>,
}
impl<'a> Iterable<u32> for &'a OptionalIndex {
impl Iterable<u32> for &OptionalIndex {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u32> + '_> {
Box::new(self.iter_rows())
}
@@ -123,7 +123,7 @@ enum BlockSelectCursor<'a> {
Sparse(<SparseBlock<'a> as Set<u16>>::SelectCursor<'a>),
}
impl<'a> BlockSelectCursor<'a> {
impl BlockSelectCursor<'_> {
fn select(&mut self, rank: u16) -> u16 {
match self {
BlockSelectCursor::Dense(dense_select_cursor) => dense_select_cursor.select(rank),
@@ -141,7 +141,7 @@ pub struct OptionalIndexSelectCursor<'a> {
num_null_rows_before_block: RowId,
}
impl<'a> OptionalIndexSelectCursor<'a> {
impl OptionalIndexSelectCursor<'_> {
fn search_and_load_block(&mut self, rank: RowId) {
if rank < self.current_block_end_rank {
// we are already in the right block
@@ -165,7 +165,7 @@ impl<'a> OptionalIndexSelectCursor<'a> {
}
}
impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
impl SelectCursor<RowId> for OptionalIndexSelectCursor<'_> {
fn select(&mut self, rank: RowId) -> RowId {
self.search_and_load_block(rank);
let index_in_block = (rank - self.num_null_rows_before_block) as u16;
@@ -505,7 +505,7 @@ fn deserialize_optional_index_block_metadatas(
non_null_rows_before_block += num_non_null_rows;
}
block_metas.resize(
((num_rows + ELEMENTS_PER_BLOCK - 1) / ELEMENTS_PER_BLOCK) as usize,
num_rows.div_ceil(ELEMENTS_PER_BLOCK) as usize,
BlockMeta {
non_null_rows_before_block,
start_byte_offset,

View File

@@ -23,7 +23,6 @@ fn set_bit_at(input: &mut u64, n: u16) {
///
/// When translating a dense index to the original index, we can use the offset to find the correct
/// block. Direct computation is not possible, but we can employ a linear or binary search.
const ELEMENTS_PER_MINI_BLOCK: u16 = 64;
const MINI_BLOCK_BITVEC_NUM_BYTES: usize = 8;
const MINI_BLOCK_OFFSET_NUM_BYTES: usize = 2;
@@ -109,7 +108,7 @@ pub struct DenseBlockSelectCursor<'a> {
dense_block: DenseBlock<'a>,
}
impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
impl SelectCursor<u16> for DenseBlockSelectCursor<'_> {
#[inline]
fn select(&mut self, rank: u16) -> u16 {
self.block_id = self
@@ -175,7 +174,7 @@ impl<'a> Set<u16> for DenseBlock<'a> {
}
}
impl<'a> DenseBlock<'a> {
impl DenseBlock<'_> {
#[inline]
fn mini_block(&self, mini_block_id: u16) -> DenseMiniBlock {
let data_start_pos = mini_block_id as usize * MINI_BLOCK_NUM_BYTES;

View File

@@ -31,7 +31,7 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
}
}
impl<'a> Set<u16> for SparseBlock<'a> {
impl Set<u16> for SparseBlock<'_> {
type SelectCursor<'b>
= Self
where Self: 'b;
@@ -69,7 +69,7 @@ fn get_u16(data: &[u8], byte_position: usize) -> u16 {
u16::from_le_bytes(bytes)
}
impl<'a> SparseBlock<'a> {
impl SparseBlock<'_> {
#[inline(always)]
fn value_at_idx(&self, data: &[u8], idx: u16) -> u16 {
let start_offset: usize = idx as usize * 2;
@@ -82,7 +82,7 @@ impl<'a> SparseBlock<'a> {
}
#[inline]
#[allow(clippy::comparison_chain)]
#[expect(clippy::comparison_chain)]
// Looks for the element in the block. Returns the positions if found.
fn binary_search(&self, target: u16) -> Result<u16, u16> {
let data = &self.0;

View File

@@ -31,7 +31,7 @@ pub enum SerializableColumnIndex<'a> {
Multivalued(SerializableMultivalueIndex<'a>),
}
impl<'a> SerializableColumnIndex<'a> {
impl SerializableColumnIndex<'_> {
pub fn get_cardinality(&self) -> Cardinality {
match self {
SerializableColumnIndex::Full => Cardinality::Full,

View File

@@ -10,7 +10,7 @@ pub(crate) struct MergedColumnValues<'a, T> {
pub(crate) merge_row_order: &'a MergeRowOrder,
}
impl<'a, T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'a, T> {
impl<T: Copy + PartialOrd + Debug + 'static> Iterable<T> for MergedColumnValues<'_, T> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
match self.merge_row_order {
MergeRowOrder::Stack(_) => Box::new(

View File

@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
}
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::*;
use crate::column_values::u64_based::{
serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,

View File

@@ -39,7 +39,7 @@ impl BinarySerializable for Block {
}
fn compute_num_blocks(num_vals: u32) -> u32 {
(num_vals + BLOCK_SIZE - 1) / BLOCK_SIZE
num_vals.div_ceil(BLOCK_SIZE)
}
pub struct BlockwiseLinearEstimator {

View File

@@ -39,7 +39,7 @@ struct RemappedTermOrdinalsValues<'a> {
merge_row_order: &'a MergeRowOrder,
}
impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
impl Iterable for RemappedTermOrdinalsValues<'_> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = u64> + '_> {
match self.merge_row_order {
MergeRowOrder::Stack(_) => self.boxed_iter_stacked(),
@@ -50,7 +50,7 @@ impl<'a> Iterable for RemappedTermOrdinalsValues<'a> {
}
}
impl<'a> RemappedTermOrdinalsValues<'a> {
impl RemappedTermOrdinalsValues<'_> {
fn boxed_iter_stacked(&self) -> Box<dyn Iterator<Item = u64> + '_> {
let iter = self
.bytes_columns

View File

@@ -10,13 +10,13 @@ pub struct HeapItem<'a> {
pub segment_ord: usize,
}
impl<'a> PartialEq for HeapItem<'a> {
impl PartialEq for HeapItem<'_> {
fn eq(&self, other: &Self) -> bool {
self.segment_ord == other.segment_ord
}
}
impl<'a> Eq for HeapItem<'a> {}
impl Eq for HeapItem<'_> {}
impl<'a> PartialOrd for HeapItem<'a> {
fn partial_cmp(&self, other: &HeapItem<'a>) -> Option<Ordering> {

View File

@@ -1,6 +1,7 @@
use std::{fmt, io, mem};
use common::file_slice::FileSlice;
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use common::BinarySerializable;
use sstable::{Dictionary, RangeSSTable};
@@ -76,6 +77,19 @@ fn read_all_columns_in_stream(
Ok(results)
}
fn column_dictionary_prefix_for_column_name(column_name: &str) -> String {
// Each column is a associated to a given `column_key`,
// that starts by `column_name\0column_header`.
//
// Listing the columns associated to the given column name is therefore equivalent to
// listing `column_key` with the prefix `column_name\0`.
format!("{}{}", column_name, '\0')
}
fn column_dictionary_prefix_for_subpath(root_path: &str) -> String {
format!("{}{}", root_path, JSON_PATH_SEGMENT_SEP as char)
}
impl ColumnarReader {
/// Opens a new Columnar file.
pub fn open<F>(file_slice: F) -> io::Result<ColumnarReader>
@@ -144,32 +158,14 @@ impl ColumnarReader {
Ok(self.iter_columns()?.collect())
}
fn stream_for_column_range(&self, column_name: &str) -> sstable::StreamerBuilder<RangeSSTable> {
// Each column is a associated to a given `column_key`,
// that starts by `column_name\0column_header`.
//
// Listing the columns associated to the given column name is therefore equivalent to
// listing `column_key` with the prefix `column_name\0`.
//
// This is in turn equivalent to searching for the range
// `[column_name,\0`..column_name\1)`.
// TODO can we get some more generic `prefix(..)` logic in the dictionary.
let mut start_key = column_name.to_string();
start_key.push('\0');
let mut end_key = column_name.to_string();
end_key.push(1u8 as char);
self.column_dictionary
.range()
.ge(start_key.as_bytes())
.lt(end_key.as_bytes())
}
pub async fn read_columns_async(
&self,
column_name: &str,
) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_column_name(column_name);
let stream = self
.stream_for_column_range(column_name)
.column_dictionary
.prefix_range(prefix)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
@@ -180,7 +176,35 @@ impl ColumnarReader {
/// There can be more than one column associated to a given column name, provided they have
/// different types.
pub fn read_columns(&self, column_name: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let stream = self.stream_for_column_range(column_name).into_stream()?;
let prefix = column_dictionary_prefix_for_column_name(column_name);
let stream = self.column_dictionary.prefix_range(prefix).into_stream()?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
pub async fn read_subpath_columns_async(
&self,
root_path: &str,
) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_subpath(root_path);
let stream = self
.column_dictionary
.prefix_range(prefix)
.into_stream_async()
.await?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
/// Get all inner columns for a given JSON prefix, i.e columns for which the name starts
/// with the prefix then contain the [`JSON_PATH_SEGMENT_SEP`].
///
/// There can be more than one column associated to each path within the JSON structure,
/// provided they have different types.
pub fn read_subpath_columns(&self, root_path: &str) -> io::Result<Vec<DynamicColumnHandle>> {
let prefix = column_dictionary_prefix_for_subpath(root_path);
let stream = self
.column_dictionary
.prefix_range(prefix.as_bytes())
.into_stream()?;
read_all_columns_in_stream(stream, &self.column_data, self.format_version)
}
@@ -192,6 +216,8 @@ impl ColumnarReader {
#[cfg(test)]
mod tests {
use common::json_path_writer::JSON_PATH_SEGMENT_SEP;
use crate::{ColumnType, ColumnarReader, ColumnarWriter};
#[test]
@@ -224,6 +250,64 @@ mod tests {
assert_eq!(columns[0].1.column_type(), ColumnType::U64);
}
#[test]
fn test_read_columns() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_column_type("col", ColumnType::U64, false);
columnar_writer.record_numerical(1, "col", 1u64);
let mut buffer = Vec::new();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
{
let columns = columnar.read_columns("col").unwrap();
assert_eq!(columns.len(), 1);
assert_eq!(columns[0].column_type(), ColumnType::U64);
}
{
let columns = columnar.read_columns("other").unwrap();
assert_eq!(columns.len(), 0);
}
}
#[test]
fn test_read_subpath_columns() {
let mut columnar_writer = ColumnarWriter::default();
columnar_writer.record_str(
0,
&format!("col1{}subcol1", JSON_PATH_SEGMENT_SEP as char),
"hello",
);
columnar_writer.record_numerical(
0,
&format!("col1{}subcol2", JSON_PATH_SEGMENT_SEP as char),
1i64,
);
columnar_writer.record_str(1, "col1", "hello");
columnar_writer.record_str(0, "col2", "hello");
let mut buffer = Vec::new();
columnar_writer.serialize(2, &mut buffer).unwrap();
let columnar = ColumnarReader::open(buffer).unwrap();
{
let columns = columnar.read_subpath_columns("col1").unwrap();
assert_eq!(columns.len(), 2);
assert_eq!(columns[0].column_type(), ColumnType::Str);
assert_eq!(columns[1].column_type(), ColumnType::I64);
}
{
let columns = columnar.read_subpath_columns("col1.subcol1").unwrap();
assert_eq!(columns.len(), 0);
}
{
let columns = columnar.read_subpath_columns("col2").unwrap();
assert_eq!(columns.len(), 0);
}
{
let columns = columnar.read_subpath_columns("other").unwrap();
assert_eq!(columns.len(), 0);
}
}
#[test]
#[should_panic(expected = "Input type forbidden")]
fn test_list_columns_strict_typing_panics_on_wrong_types() {

View File

@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
// In order to limit memory usage, and in order
// to benefit from the stacker, we do this by serialization our data
// as "Symbols".
#[allow(clippy::from_over_into)]
pub(super) trait SymbolValue: Clone + Copy {
// Serializes the symbol into the given buffer.
// Returns the number of bytes written into the buffer.

View File

@@ -285,7 +285,6 @@ impl ColumnarWriter {
.map(|(column_name, addr)| (column_name, ColumnType::DateTime, addr)),
);
columns.sort_unstable_by_key(|(column_name, col_type, _)| (*column_name, *col_type));
let (arena, buffers, dictionaries) = (&self.arena, &mut self.buffers, &self.dictionaries);
let mut symbol_byte_buffer: Vec<u8> = Vec::new();
for (column_name, column_type, addr) in columns {
@@ -392,7 +391,7 @@ impl ColumnarWriter {
// Serialize [Dictionary, Column, dictionary num bytes U32::LE]
// Column: [Column Index, Column Values, column index num bytes U32::LE]
#[allow(clippy::too_many_arguments)]
#[expect(clippy::too_many_arguments)]
fn serialize_bytes_or_str_column(
cardinality: Cardinality,
num_docs: RowId,

View File

@@ -67,7 +67,7 @@ pub struct ColumnSerializer<'a, W: io::Write> {
start_offset: u64,
}
impl<'a, W: io::Write> ColumnSerializer<'a, W> {
impl<W: io::Write> ColumnSerializer<'_, W> {
pub fn finalize(self) -> io::Result<()> {
let end_offset: u64 = self.columnar_serializer.wrt.written_bytes();
let byte_range = self.start_offset..end_offset;
@@ -80,7 +80,7 @@ impl<'a, W: io::Write> ColumnSerializer<'a, W> {
}
}
impl<'a, W: io::Write> io::Write for ColumnSerializer<'a, W> {
impl<W: io::Write> io::Write for ColumnSerializer<'_, W> {
fn write(&mut self, buf: &[u8]) -> io::Result<usize> {
self.columnar_serializer.wrt.write(buf)
}

View File

@@ -7,7 +7,7 @@ pub trait Iterable<T = u64> {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_>;
}
impl<'a, T: Copy> Iterable<T> for &'a [T] {
impl<T: Copy> Iterable<T> for &[T] {
fn boxed_iter(&self) -> Box<dyn Iterator<Item = T> + '_> {
Box::new(self.iter().copied())
}

View File

@@ -19,7 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }
[dev-dependencies]
binggan = "0.10.0"
binggan = "0.14.0"
proptest = "1.0.0"
rand = "0.8.4"

View File

@@ -15,7 +15,6 @@ fn bench_vint() {
out += u64::from(buf[0]);
}
black_box(out);
None
});
let vals: Vec<u32> = (0..20_000).choose_multiple(&mut thread_rng(), 100_000);
@@ -27,7 +26,6 @@ fn bench_vint() {
out += u64::from(buf[0]);
}
black_box(out);
None
});
}
@@ -43,24 +41,20 @@ fn bench_bitset() {
tinyset.pop_lowest();
tinyset.pop_lowest();
black_box(tinyset);
None
});
let tiny_set = TinySet::empty().insert(10u32).insert(14u32).insert(21u32);
runner.bench_function("bench_tinyset_sum", move |_| {
assert_eq!(black_box(tiny_set).into_iter().sum::<u32>(), 45u32);
None
});
let v = [10u32, 14u32, 21u32];
runner.bench_function("bench_tinyarr_sum", move |_| {
black_box(v.iter().cloned().sum::<u32>());
None
});
runner.bench_function("bench_bitset_initialize", move |_| {
black_box(BitSet::with_max_value(1_000_000));
None
});
}

View File

@@ -1,5 +1,6 @@
use std::fs::File;
use std::ops::{Deref, Range, RangeBounds};
use std::path::Path;
use std::sync::Arc;
use std::{fmt, io};
@@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
}
impl FileSlice {
/// Creates a FileSlice from a path.
pub fn open(path: &Path) -> io::Result<FileSlice> {
let wrap_file = WrapFile::new(File::open(path)?)?;
Ok(FileSlice::new(Arc::new(wrap_file)))
}
/// Wraps a FileHandle.
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
let num_bytes = file_handle.len();

View File

@@ -130,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
}
#[cfg(test)]
pub mod test {
pub(crate) mod test {
use proptest::prelude::*;
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};
fn test_i64_converter_helper(val: i64) {
assert_eq!(u64_to_i64(i64_to_u64(val)), val);
@@ -144,12 +144,6 @@ pub mod test {
assert_eq!(u64_to_f64(f64_to_u64(val)), val);
}
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
}
proptest! {
#[test]
fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {

View File

@@ -74,14 +74,14 @@ impl FixedSize for () {
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
for it in self {
it.serialize(writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
let num_items = VInt::deserialize(reader)?.val();
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = T::deserialize(reader)?;
@@ -236,12 +236,12 @@ impl FixedSize for bool {
impl BinarySerializable for String {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
impl<'a> BinarySerializable for Cow<'a, str> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
let data: &[u8] = self.as_bytes();
VInt(data.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
writer.write_all(data)
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
let string_length = VInt::deserialize(reader)?.val() as usize;
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
let mut result = String::with_capacity(string_length);
reader
.take(string_length as u64)
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
impl<'a> BinarySerializable for Cow<'a, [u8]> {
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
VInt(self.len() as u64).serialize(writer)?;
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
for it in self.iter() {
it.serialize(writer)?;
BinarySerializable::serialize(it, writer)?;
}
Ok(())
}
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
let num_items = VInt::deserialize(reader)?.val();
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
for _ in 0..num_items {
let item = u8::deserialize(reader)?;
let item = <u8 as BinarySerializable>::deserialize(reader)?;
items.push(item);
}
Ok(Cow::Owned(items))

View File

@@ -87,7 +87,7 @@ impl<W: TerminatingWrite> TerminatingWrite for BufWriter<W> {
}
}
impl<'a> TerminatingWrite for &'a mut Vec<u8> {
impl TerminatingWrite for &mut Vec<u8> {
fn terminate_ref(&mut self, _a: AntiCallToken) -> io::Result<()> {
self.flush()
}

View File

@@ -2,7 +2,7 @@
> Tantivy is a **search** engine **library** for Rust.
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
they both have the same scope and targeted use cases.
If you are not familiar with Lucene, let's break down our little tagline.
@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
experience. But keep in mind this is just a toolbox.
Which bring us to the second keyword...
- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.
Sometimes a functionality will not be available in tantivy because it is too
specific to your use case. By design, tantivy should make it possible to extend
@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
index from a different format.
Tantivy exposes a lot of low level API to do all of these things.

View File

@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
While this design has some downsides, this greatly simplifies the source code of
tantivy. Caching is also entirely delegated to the OS.
`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.

View File

@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.
Note: Tantivy 0.16 does not do this optimization yet.
Note: tantivy 0.16 does not do this optimization yet.
### Pruning
Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.
Note: Tantivy 0.16 does not do this optimization yet.
Note: tantivy 0.16 does not do this optimization yet.
### Other?
@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i
## Usage
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.
```rust
let settings = IndexSettings {

View File

@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
- `value`: The value representation is just the regular Value representation.
This representation is designed to align the natural sort of Terms with the lexicographical sort
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
In the example above, the terms will be sorted as

View File

@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
// We truncate the bytes in order to make sure the debug string
// is not too long.
let bytes_truncated: &[u8] = if self.len() > 8 {
let bytes_truncated: &[u8] = if self.len() > 10 {
&self.as_slice()[..10]
} else {
self.as_slice()
@@ -252,6 +252,11 @@ mod tests {
format!("{short_bytes:?}"),
"OwnedBytes([97, 98, 99, 100], len=4)"
);
let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
assert_eq!(
format!("{medium_bytes:?}"),
"OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
);
let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
assert_eq!(
format!("{long_bytes:?}"),

View File

@@ -111,7 +111,6 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
// old versions don't understand this is uninhabited and need the empty match to help,
// newer versions warn because this arm is unreachable (which it is indeed).
#[allow(unreachable_patterns)]
Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
}
}

View File

@@ -321,7 +321,17 @@ fn exists(inp: &str) -> IResult<&str, UserInputLeaf> {
UserInputLeaf::Exists {
field: String::new(),
},
tuple((multispace0, char('*'))),
tuple((
multispace0,
char('*'),
peek(alt((
value(
"",
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
),
eof,
))),
)),
)(inp)
}
@@ -331,7 +341,14 @@ fn exists_precond(inp: &str) -> IResult<&str, (), ()> {
peek(tuple((
field_name,
multispace0,
char('*'), // when we are here, we know it can't be anything but a exists
char('*'),
peek(alt((
value(
"",
satisfy(|c: char| c.is_whitespace() || ESCAPE_IN_WORD.contains(&c)),
),
eof,
))), // we need to check this isn't a wildcard query
))),
)(inp)
.map_err(|e| e.map(|_| ()))
@@ -767,7 +784,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
tuple((fallible(occur_symbol), boosted_leaf))(inp)
}
#[allow(clippy::type_complexity)]
#[expect(clippy::type_complexity)]
fn operand_occur_leaf_infallible(
inp: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {
@@ -1497,6 +1514,11 @@ mod test {
test_is_parse_err(r#"field:(+a -"b c""#, r#"(+"field":a -"field":"b c")"#);
}
#[test]
fn field_re_specification() {
test_parse_query_to_ast_helper(r#"field:(abc AND b:cde)"#, r#"(+"field":abc +"b":cde)"#);
}
#[test]
fn test_parse_query_single_term() {
test_parse_query_to_ast_helper("abc", "abc");
@@ -1619,13 +1641,19 @@ mod test {
#[test]
fn test_exist_query() {
test_parse_query_to_ast_helper("a:*", "\"a\":*");
test_parse_query_to_ast_helper("a: *", "\"a\":*");
// an exist followed by default term being b
test_is_parse_err("a:*b", "(*\"a\":* *b)");
test_parse_query_to_ast_helper("a:*", "$exists(\"a\")");
test_parse_query_to_ast_helper("a: *", "$exists(\"a\")");
// this is a term query (not a phrase prefix)
test_parse_query_to_ast_helper(
"(hello AND toto:*) OR happy",
"(?(+hello +$exists(\"toto\")) ?happy)",
);
test_parse_query_to_ast_helper("(a:*)", "$exists(\"a\")");
// these are term/wildcard query (not a phrase prefix)
test_parse_query_to_ast_helper("a:b*", "\"a\":b*");
test_parse_query_to_ast_helper("a:*b", "\"a\":*b");
test_parse_query_to_ast_helper(r#"a:*def*"#, "\"a\":*def*");
}
#[test]

View File

@@ -101,7 +101,7 @@ impl Debug for UserInputLeaf {
}
UserInputLeaf::All => write!(formatter, "*"),
UserInputLeaf::Exists { field } => {
write!(formatter, "\"{field}\":*")
write!(formatter, "$exists(\"{field}\")")
}
}
}

View File

@@ -271,10 +271,6 @@ impl AggregationWithAccessor {
field: ref field_name,
..
})
| Count(CountAggregation {
field: ref field_name,
..
})
| Max(MaxAggregation {
field: ref field_name,
..
@@ -299,6 +295,24 @@ impl AggregationWithAccessor {
get_ff_reader(reader, field_name, Some(get_numeric_or_date_column_types()))?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
Count(CountAggregation {
field: ref field_name,
..
}) => {
let allowed_column_types = [
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::Str,
ColumnType::DateTime,
ColumnType::Bool,
ColumnType::IpAddr,
// ColumnType::Bytes Unsupported
];
let (accessor, column_type) =
get_ff_reader(reader, field_name, Some(&allowed_column_types))?;
add_agg_with_accessor(&agg, accessor, column_type, &mut res)?;
}
Percentiles(ref percentiles) => {
let (accessor, column_type) = get_ff_reader(
reader,

View File

@@ -1,4 +1,5 @@
//! Contains the final aggregation tree.
//!
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
//! This conversion computes the final result. For example: The intermediate result contains
//! intermediate average results, which is the sum and the number of values. The actual average is
@@ -124,9 +125,26 @@ impl MetricResult {
}
/// BucketEntry holds bucket aggregation result types.
// the order of fields is important to deserialize properly
// Terms must be first because all Terms are valid Range (we ignore unknown fields)
// Range and Histogram are always ambiguous, they contain the same 3 required fields, and all else
// is optional Having Range is usually more useful (contains more fields, missing field from
// Histogram can be obtained by key.to_string())
#[derive(Clone, Debug, PartialEq, Serialize, Deserialize)]
#[serde(untagged)]
pub enum BucketResult {
/// This is the term result
Terms {
/// The buckets.
///
/// See [`TermsAggregation`](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didnt make it into to TOP N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
/// This is the range entry for a bucket, which contains a key, count, from, to, and optionally
/// sub-aggregations.
Range {
@@ -143,18 +161,6 @@ pub enum BucketResult {
/// See [`HistogramAggregation`](super::bucket::HistogramAggregation)
buckets: BucketEntries<BucketEntry>,
},
/// This is the term result
Terms {
/// The buckets.
///
/// See [`TermsAggregation`](super::bucket::TermsAggregation)
buckets: Vec<BucketEntry>,
/// The number of documents that didnt make it into to TOP N due to shard_size or size
sum_other_doc_count: u64,
#[serde(skip_serializing_if = "Option::is_none")]
/// The upper bound error for the doc count of each term.
doc_count_error_upper_bound: Option<u64>,
},
}
impl BucketResult {
@@ -187,7 +193,7 @@ pub enum BucketEntries<T> {
}
impl<T> BucketEntries<T> {
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
match self {
BucketEntries::Vec(vec) => Box::new(vec.iter()),
BucketEntries::HashMap(map) => Box::new(map.values()),

View File

@@ -34,10 +34,10 @@ use crate::aggregation::*;
pub struct DateHistogramAggregationReq {
#[doc(hidden)]
/// Only for validation
interval: Option<String>,
pub interval: Option<String>,
#[doc(hidden)]
/// Only for validation
calendar_interval: Option<String>,
pub calendar_interval: Option<String>,
/// The field to aggregate on.
pub field: String,
/// The format to format dates. Unsupported currently.
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
}
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use pretty_assertions::assert_eq;
use super::*;

View File

@@ -16,6 +16,7 @@ use crate::aggregation::*;
use crate::TantivyError;
/// Provide user-defined buckets to aggregate on.
///
/// Two special buckets will automatically be created to cover the whole range of values.
/// The provided buckets have to be continuous.
/// During the aggregation, the values extracted from the fast_field `field` will be checked

View File

@@ -1232,8 +1232,8 @@ mod tests {
#[test]
fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
let terms_per_segment = vec![
vec!["terma", "terma", "termb", "termb", "termb", "termc"],
vec!["terma", "terma", "termb", "termc", "termc"],
vec!["terma", "terma", "termb", "termb", "termb"],
vec!["terma", "terma", "termb"],
];
let index = get_test_index_from_terms(false, &terms_per_segment)?;
@@ -1255,8 +1255,6 @@ mod tests {
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);

View File

@@ -220,9 +220,23 @@ impl SegmentStatsCollector {
.column_block_accessor
.fetch_block(docs, &agg_accessor.accessor);
}
for val in agg_accessor.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
if [
ColumnType::I64,
ColumnType::U64,
ColumnType::F64,
ColumnType::DateTime,
]
.contains(&self.field_type)
{
for val in agg_accessor.column_block_accessor.iter_vals() {
let val1 = f64_from_fastfield_u64(val, &self.field_type);
self.stats.collect(val1);
}
} else {
for _val in agg_accessor.column_block_accessor.iter_vals() {
// we ignore the value and simply record that we got something
self.stats.collect(0.0);
}
}
}
}
@@ -435,6 +449,11 @@ mod tests {
"field": "score",
},
},
"count_str": {
"value_count": {
"field": "text",
},
},
"range": range_agg
}))
.unwrap();
@@ -500,6 +519,13 @@ mod tests {
})
);
assert_eq!(
res["count_str"],
json!({
"value": 7.0,
})
);
Ok(())
}

View File

@@ -180,7 +180,7 @@ pub(crate) fn deserialize_option_f64<'de, D>(deserializer: D) -> Result<Option<f
where D: Deserializer<'de> {
struct StringOrFloatVisitor;
impl<'de> Visitor<'de> for StringOrFloatVisitor {
impl Visitor<'_> for StringOrFloatVisitor {
type Value = Option<f64>;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
@@ -226,7 +226,7 @@ pub(crate) fn deserialize_f64<'de, D>(deserializer: D) -> Result<f64, D::Error>
where D: Deserializer<'de> {
struct StringOrFloatVisitor;
impl<'de> Visitor<'de> for StringOrFloatVisitor {
impl Visitor<'_> for StringOrFloatVisitor {
type Value = f64;
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
@@ -578,7 +578,7 @@ mod tests {
.set_indexing_options(
TextFieldIndexing::default().set_index_option(IndexRecordOption::WithFreqs),
)
.set_fast(None)
.set_fast(Some("raw"))
.set_stored();
let text_field = schema_builder.add_text_field("text", text_fieldtype);
let date_field = schema_builder.add_date_field("date", FAST);

View File

@@ -13,7 +13,7 @@ struct Hit<'a> {
facet: &'a Facet,
}
impl<'a> Eq for Hit<'a> {}
impl Eq for Hit<'_> {}
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
fn eq(&self, other: &Hit<'_>) -> bool {
@@ -27,7 +27,7 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
}
}
impl<'a> Ord for Hit<'a> {
impl Ord for Hit<'_> {
fn cmp(&self, other: &Self) -> Ordering {
other
.count

View File

@@ -182,6 +182,7 @@ where
}
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
///
/// it transparently wraps an inner [`Collector`] but filters documents
/// based on the result of applying the predicate to the bytes fast field.
///

View File

@@ -495,4 +495,4 @@ where
impl_downcast!(Fruit);
#[cfg(test)]
pub mod tests;
pub(crate) mod tests;

View File

@@ -161,7 +161,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
/// # Ok(())
/// # }
/// ```
#[allow(clippy::type_complexity)]
#[expect(clippy::type_complexity)]
#[derive(Default)]
pub struct MultiCollector<'a> {
collector_wrappers: Vec<
@@ -190,7 +190,7 @@ impl<'a> MultiCollector<'a> {
}
}
impl<'a> Collector for MultiCollector<'a> {
impl Collector for MultiCollector<'_> {
type Fruit = MultiFruit;
type Child = MultiCollectorChild;

View File

@@ -44,8 +44,19 @@ fn test_format_6() {
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
}
/// feature flag quickwit uses a different dictionary type
#[test]
#[cfg(not(feature = "quickwit"))]
fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
fn test_format_7() {
let path = path_for_version("7");
let index = Index::open_in_dir(path).expect("Failed to open index");
// dates are not truncated in v7 in the docstore
assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
}
#[cfg(not(feature = "quickwit"))]
fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
use collector::TopDocs;
let reader = index.reader().expect("Failed to create reader");
let searcher = reader.searcher();
@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
.as_datetime()
.unwrap();
let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
assert_eq!(date_value, expected,);
}

View File

@@ -71,7 +71,7 @@ pub fn json_path_sep_to_dot(path: &mut str) {
}
}
#[allow(clippy::too_many_arguments)]
#[expect(clippy::too_many_arguments)]
fn index_json_object<'a, V: Value<'a>>(
doc: DocId,
json_visitor: V::ObjectIter,
@@ -101,7 +101,7 @@ fn index_json_object<'a, V: Value<'a>>(
}
}
#[allow(clippy::too_many_arguments)]
#[expect(clippy::too_many_arguments)]
pub(crate) fn index_json_value<'a, V: Value<'a>>(
doc: DocId,
json_value: V,

View File

@@ -39,7 +39,7 @@ impl RetryPolicy {
/// The `DirectoryLock` is an object that represents a file lock.
///
/// It is associated with a lock file, that gets deleted on `Drop.`
#[allow(dead_code)]
#[expect(dead_code)]
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
struct DirectoryLockGuard {

View File

@@ -48,6 +48,7 @@ pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
});
/// The meta lock file is here to protect the segment files being opened by
/// `IndexReader::reload()` from being garbage collected.
///
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
/// here, but it is difficult to achieve on Windows.

View File

@@ -1,3 +1,9 @@
//! The footer is a small metadata structure that is appended at the end of every file.
//!
//! The footer is used to store a checksum of the file content.
//! The footer also stores the version of the index format.
//! This version is used to detect incompatibility between the index and the library version.
use std::io;
use std::io::Write;
@@ -20,20 +26,22 @@ type CrcHashU32 = u32;
/// A Footer is appended to every file
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Footer {
/// The version of the index format
pub version: Version,
/// The crc32 hash of the body
pub crc: CrcHashU32,
}
impl Footer {
pub fn new(crc: CrcHashU32) -> Self {
pub(crate) fn new(crc: CrcHashU32) -> Self {
let version = crate::VERSION.clone();
Footer { version, crc }
}
pub fn crc(&self) -> CrcHashU32 {
pub(crate) fn crc(&self) -> CrcHashU32 {
self.crc
}
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
let mut counting_write = CountingWriter::wrap(&mut write);
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
let footer_payload_len = counting_write.written_bytes();
@@ -42,6 +50,7 @@ impl Footer {
Ok(())
}
/// Extracts the tantivy Footer from the file and returns the footer and the rest of the file
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
if file.len() < 4 {
return Err(io::Error::new(

View File

@@ -244,7 +244,7 @@ impl MmapDirectory {
directory_path,
)));
}
#[allow(clippy::bind_instead_of_map)]
#[expect(clippy::bind_instead_of_map)]
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
let directory_path = directory_path.to_owned();

View File

@@ -6,7 +6,7 @@ mod mmap_directory;
mod directory;
mod directory_lock;
mod file_watcher;
mod footer;
pub mod footer;
mod managed_directory;
mod ram_directory;
mod watch_event_router;

View File

@@ -32,7 +32,7 @@ pub struct WatchCallbackList {
/// file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
#[allow(dead_code)]
#[expect(dead_code)]
pub struct WatchHandle(Arc<WatchCallback>);
impl WatchHandle {

View File

@@ -117,7 +117,7 @@ pub trait DocSet: Send {
}
}
impl<'a> DocSet for &'a mut dyn DocSet {
impl DocSet for &mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()
}

View File

@@ -217,7 +217,7 @@ impl FastFieldReaders {
Ok(dynamic_column.into())
}
/// Returning a `dynamic_column_handle`.
/// Returns a `dynamic_column_handle`.
pub fn dynamic_column_handle(
&self,
field_name: &str,
@@ -234,7 +234,7 @@ impl FastFieldReaders {
Ok(dynamic_column_handle_opt)
}
/// Returning all `dynamic_column_handle`.
/// Returns all `dynamic_column_handle` that match the given field name.
pub fn dynamic_column_handles(
&self,
field_name: &str,
@@ -250,6 +250,22 @@ impl FastFieldReaders {
Ok(dynamic_column_handles)
}
/// Returns all `dynamic_column_handle` that are inner fields of the provided JSON path.
pub fn dynamic_subpath_column_handles(
&self,
root_path: &str,
) -> crate::Result<Vec<DynamicColumnHandle>> {
let Some(resolved_field_name) = self.resolve_field(root_path)? else {
return Ok(Vec::new());
};
let dynamic_column_handles = self
.columnar
.read_subpath_columns(&resolved_field_name)?
.into_iter()
.collect();
Ok(dynamic_column_handles)
}
#[doc(hidden)]
pub async fn list_dynamic_column_handles(
&self,
@@ -265,6 +281,21 @@ impl FastFieldReaders {
Ok(columns)
}
#[doc(hidden)]
pub async fn list_subpath_dynamic_column_handles(
&self,
root_path: &str,
) -> crate::Result<Vec<DynamicColumnHandle>> {
let Some(resolved_field_name) = self.resolve_field(root_path)? else {
return Ok(Vec::new());
};
let columns = self
.columnar
.read_subpath_columns_async(&resolved_field_name)
.await?;
Ok(columns)
}
/// Returns the `u64` column used to represent any `u64`-mapped typed (String/Bytes term ids,
/// i64, u64, f64, DateTime).
///
@@ -476,6 +507,15 @@ mod tests {
.iter()
.any(|column| column.column_type() == ColumnType::Str));
println!("*** {:?}", fast_fields.columnar().list_columns());
let json_columns = fast_fields.dynamic_column_handles("json").unwrap();
assert_eq!(json_columns.len(), 0);
let json_subcolumns = fast_fields.dynamic_subpath_column_handles("json").unwrap();
assert_eq!(json_subcolumns.len(), 3);
let foo_subcolumns = fast_fields
.dynamic_subpath_column_handles("json.foo")
.unwrap();
assert_eq!(foo_subcolumns.len(), 0);
}
}

View File

@@ -149,7 +149,7 @@ impl FieldNormReader {
}
#[cfg(test)]
pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
pub(crate) fn for_test(field_norms: &[u32]) -> FieldNormReader {
let field_norms_id = field_norms
.iter()
.cloned()

View File

@@ -1,12 +1,9 @@
#![allow(deprecated)] // Remove with index sorting
use std::collections::HashSet;
use rand::{thread_rng, Rng};
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
#[allow(deprecated)]
use crate::{doc, schema, Index, IndexWriter, Searcher};
fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {

View File

@@ -15,7 +15,9 @@ use crate::directory::MmapDirectory;
use crate::directory::{Directory, ManagedDirectory, RamDirectory, INDEX_WRITER_LOCK};
use crate::error::{DataCorruption, TantivyError};
use crate::index::{IndexMeta, SegmentId, SegmentMeta, SegmentMetaInventory};
use crate::indexer::index_writer::{MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN};
use crate::indexer::index_writer::{
IndexWriterOptions, MAX_NUM_THREAD, MEMORY_BUDGET_NUM_BYTES_MIN,
};
use crate::indexer::segment_updater::save_metas;
use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
use crate::reader::{IndexReader, IndexReaderBuilder};
@@ -519,6 +521,43 @@ impl Index {
load_metas(self.directory(), &self.inventory)
}
/// Open a new index writer with the given options. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
/// that due to a panic or other error, a stale lockfile will be
/// left in the index directory. If you are sure that no other
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// - `options` defines the writer configuration which includes things like buffer sizes,
/// indexer threads, etc...
///
/// # Errors
/// If the lockfile already exists, returns `TantivyError::LockFailure`.
/// If the memory arena per thread is too small or too big, returns
/// `TantivyError::InvalidArgument`
pub fn writer_with_options<D: Document>(
&self,
options: IndexWriterOptions,
) -> crate::Result<IndexWriter<D>> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using a regular directory, this \
means there is already an `IndexWriter` working on this `Directory`, in \
this process or in a different process."
.to_string(),
),
)
})?;
IndexWriter::new(self, options, directory_lock)
}
/// Open a new index writer. Attempts to acquire a lockfile.
///
/// The lockfile should be deleted on drop, but it is possible
@@ -543,27 +582,12 @@ impl Index {
num_threads: usize,
overall_memory_budget_in_bytes: usize,
) -> crate::Result<IndexWriter<D>> {
let directory_lock = self
.directory
.acquire_lock(&INDEX_WRITER_LOCK)
.map_err(|err| {
TantivyError::LockFailure(
err,
Some(
"Failed to acquire index lock. If you are using a regular directory, this \
means there is already an `IndexWriter` working on this `Directory`, in \
this process or in a different process."
.to_string(),
),
)
})?;
let memory_arena_in_bytes_per_thread = overall_memory_budget_in_bytes / num_threads;
IndexWriter::new(
self,
num_threads,
memory_arena_in_bytes_per_thread,
directory_lock,
)
let options = IndexWriterOptions::builder()
.num_worker_threads(num_threads)
.memory_budget_per_thread(memory_arena_in_bytes_per_thread)
.build();
self.writer_with_options(options)
}
/// Helper to create an index writer for tests.

View File

@@ -3,6 +3,12 @@ use std::io;
use common::json_path_writer::JSON_END_OF_PATH;
use common::BinarySerializable;
use fnv::FnvHashSet;
#[cfg(feature = "quickwit")]
use futures_util::{FutureExt, StreamExt, TryStreamExt};
#[cfg(feature = "quickwit")]
use itertools::Itertools;
#[cfg(feature = "quickwit")]
use tantivy_fst::automaton::{AlwaysMatch, Automaton};
use crate::directory::FileSlice;
use crate::positions::PositionReader;
@@ -31,7 +37,6 @@ pub struct InvertedIndexReader {
}
impl InvertedIndexReader {
#[allow(clippy::needless_pass_by_value)] // for symmetry
pub(crate) fn new(
termdict: TermDictionary,
postings_file_slice: FileSlice,
@@ -205,16 +210,6 @@ impl InvertedIndexReader {
.transpose()
}
pub(crate) fn read_postings_no_deletes(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)?
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}
/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
Ok(self
@@ -230,13 +225,18 @@ impl InvertedIndexReader {
self.termdict.get_async(term.serialized_value_bytes()).await
}
async fn get_term_range_async(
&self,
async fn get_term_range_async<'a, A: Automaton + 'a>(
&'a self,
terms: impl std::ops::RangeBounds<Term>,
automaton: A,
limit: Option<u64>,
) -> io::Result<impl Iterator<Item = TermInfo> + '_> {
merge_holes_under_bytes: usize,
) -> io::Result<impl Iterator<Item = TermInfo> + 'a>
where
A::State: Clone,
{
use std::ops::Bound;
let range_builder = self.termdict.range();
let range_builder = self.termdict.search(automaton);
let range_builder = match terms.start_bound() {
Bound::Included(bound) => range_builder.ge(bound.serialized_value_bytes()),
Bound::Excluded(bound) => range_builder.gt(bound.serialized_value_bytes()),
@@ -253,7 +253,9 @@ impl InvertedIndexReader {
range_builder
};
let mut stream = range_builder.into_stream_async().await?;
let mut stream = range_builder
.into_stream_async_merging_holes(merge_holes_under_bytes)
.await?;
let iter = std::iter::from_fn(move || stream.next().map(|(_k, v)| v.clone()));
@@ -299,7 +301,9 @@ impl InvertedIndexReader {
limit: Option<u64>,
with_positions: bool,
) -> io::Result<bool> {
let mut term_info = self.get_term_range_async(terms, limit).await?;
let mut term_info = self
.get_term_range_async(terms, AlwaysMatch, limit, 0)
.await?;
let Some(first_terminfo) = term_info.next() else {
// no key matches, nothing more to load
@@ -326,6 +330,84 @@ impl InvertedIndexReader {
Ok(true)
}
/// Warmup a block postings given a range of `Term`s.
/// This method is for an advanced usage only.
///
/// returns a boolean, whether a term matching the range was found in the dictionary
pub async fn warm_postings_automaton<
A: Automaton + Clone + Send + 'static,
E: FnOnce(Box<dyn FnOnce() -> io::Result<()> + Send>) -> F,
F: std::future::Future<Output = io::Result<()>>,
>(
&self,
automaton: A,
// with_positions: bool, at the moment we have no use for it, and supporting it would add
// complexity to the coalesce
executor: E,
) -> io::Result<bool>
where
A::State: Clone,
{
// merge holes under 4MiB, that's how many bytes we can hope to receive during a TTFB from
// S3 (~80MiB/s, and 50ms latency)
const MERGE_HOLES_UNDER_BYTES: usize = (80 * 1024 * 1024 * 50) / 1000;
// we build a first iterator to download everything. Simply calling the function already
// download everything we need from the sstable, but doesn't start iterating over it.
let _term_info_iter = self
.get_term_range_async(.., automaton.clone(), None, MERGE_HOLES_UNDER_BYTES)
.await?;
let (sender, posting_ranges_to_load_stream) = futures_channel::mpsc::unbounded();
let termdict = self.termdict.clone();
let cpu_bound_task = move || {
// then we build a 2nd iterator, this one with no holes, so we don't go through blocks
// we can't match.
// This makes the assumption there is a caching layer below us, which gives sync read
// for free after the initial async access. This might not always be true, but is in
// Quickwit.
// We build things from this closure otherwise we get into lifetime issues that can only
// be solved with self referential strucs. Returning an io::Result from here is a bit
// more leaky abstraction-wise, but a lot better than the alternative
let mut stream = termdict.search(automaton).into_stream()?;
// we could do without an iterator, but this allows us access to coalesce which simplify
// things
let posting_ranges_iter =
std::iter::from_fn(move || stream.next().map(|(_k, v)| v.postings_range.clone()));
let merged_posting_ranges_iter = posting_ranges_iter.coalesce(|range1, range2| {
if range1.end + MERGE_HOLES_UNDER_BYTES >= range2.start {
Ok(range1.start..range2.end)
} else {
Err((range1, range2))
}
});
for posting_range in merged_posting_ranges_iter {
if let Err(_) = sender.unbounded_send(posting_range) {
// this should happen only when search is cancelled
return Err(io::Error::other("failed to send posting range back"));
}
}
Ok(())
};
let task_handle = executor(Box::new(cpu_bound_task));
let posting_downloader = posting_ranges_to_load_stream
.map(|posting_slice| {
self.postings_file_slice
.read_bytes_slice_async(posting_slice)
.map(|result| result.map(|_slice| ()))
})
.buffer_unordered(5)
.try_collect::<Vec<()>>();
let (_, slices_downloaded) =
futures_util::future::try_join(task_handle, posting_downloader).await?;
Ok(!slices_downloaded.is_empty())
}
/// Warmup the block postings for all terms.
/// This method is for an advanced usage only.
///

View File

@@ -1,6 +1,7 @@
use std::slice;
/// Enum describing each component of a tantivy segment.
///
/// Each component is stored in its own file,
/// using the pattern `segment_uuid`.`component_extension`,
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`

View File

@@ -478,7 +478,7 @@ pub fn merge_field_meta_data(
.into_iter()
.kmerge_by(|left, right| left < right)
// TODO: Remove allocation
.group_by(|el| (el.field_name.to_string(), el.typ))
.chunk_by(|el| (el.field_name.to_string(), el.typ))
{
let mut merged: FieldMetadata = group.next().unwrap();
for el in group {

View File

@@ -187,7 +187,6 @@ impl DeleteCursor {
}
}
#[allow(clippy::wrong_self_convention)]
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

View File

@@ -21,7 +21,7 @@ pub enum DocToOpstampMapping<'a> {
None,
}
impl<'a> DocToOpstampMapping<'a> {
impl DocToOpstampMapping<'_> {
/// Assess whether a document should be considered deleted given that it contains
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
///

View File

@@ -45,6 +45,23 @@ fn error_in_index_worker_thread(context: &str) -> TantivyError {
))
}
#[derive(Clone, bon::Builder)]
/// A builder for creating a new [IndexWriter] for an index.
pub struct IndexWriterOptions {
#[builder(default = MEMORY_BUDGET_NUM_BYTES_MIN)]
/// The memory budget per indexer thread.
///
/// When an indexer thread has buffered this much data in memory
/// it will flush the segment to disk (although this is not searchable until commit is called.)
memory_budget_per_thread: usize,
#[builder(default = 1)]
/// The number of indexer worker threads to use.
num_worker_threads: usize,
#[builder(default = 4)]
/// Defines the number of merger threads to use.
num_merge_threads: usize,
}
/// `IndexWriter` is the user entry-point to add document to an index.
///
/// It manages a small number of indexing thread, as well as a shared
@@ -58,8 +75,7 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
index: Index,
// The memory budget per thread, after which a commit is triggered.
memory_budget_in_bytes_per_thread: usize,
options: IndexWriterOptions,
workers_join_handle: Vec<JoinHandle<crate::Result<()>>>,
@@ -70,8 +86,6 @@ pub struct IndexWriter<D: Document = TantivyDocument> {
worker_id: usize,
num_threads: usize,
delete_queue: DeleteQueue,
stamper: Stamper,
@@ -265,23 +279,27 @@ impl<D: Document> IndexWriter<D> {
/// `TantivyError::InvalidArgument`
pub(crate) fn new(
index: &Index,
num_threads: usize,
memory_budget_in_bytes_per_thread: usize,
options: IndexWriterOptions,
directory_lock: DirectoryLock,
) -> crate::Result<Self> {
if memory_budget_in_bytes_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
if options.memory_budget_per_thread < MEMORY_BUDGET_NUM_BYTES_MIN {
let err_msg = format!(
"The memory arena in bytes per thread needs to be at least \
{MEMORY_BUDGET_NUM_BYTES_MIN}."
);
return Err(TantivyError::InvalidArgument(err_msg));
}
if memory_budget_in_bytes_per_thread >= MEMORY_BUDGET_NUM_BYTES_MAX {
if options.memory_budget_per_thread >= MEMORY_BUDGET_NUM_BYTES_MAX {
let err_msg = format!(
"The memory arena in bytes per thread cannot exceed {MEMORY_BUDGET_NUM_BYTES_MAX}"
);
return Err(TantivyError::InvalidArgument(err_msg));
}
if options.num_worker_threads == 0 {
let err_msg = "At least one worker thread is required, got 0".to_string();
return Err(TantivyError::InvalidArgument(err_msg));
}
let (document_sender, document_receiver) =
crossbeam_channel::bounded(PIPELINE_MAX_SIZE_IN_DOCS);
@@ -291,13 +309,17 @@ impl<D: Document> IndexWriter<D> {
let stamper = Stamper::new(current_opstamp);
let segment_updater =
SegmentUpdater::create(index.clone(), stamper.clone(), &delete_queue.cursor())?;
let segment_updater = SegmentUpdater::create(
index.clone(),
stamper.clone(),
&delete_queue.cursor(),
options.num_merge_threads,
)?;
let mut index_writer = Self {
_directory_lock: Some(directory_lock),
memory_budget_in_bytes_per_thread,
options: options.clone(),
index: index.clone(),
index_writer_status: IndexWriterStatus::from(document_receiver),
operation_sender: document_sender,
@@ -305,7 +327,6 @@ impl<D: Document> IndexWriter<D> {
segment_updater,
workers_join_handle: vec![],
num_threads,
delete_queue,
@@ -398,7 +419,7 @@ impl<D: Document> IndexWriter<D> {
let mut delete_cursor = self.delete_queue.cursor();
let mem_budget = self.memory_budget_in_bytes_per_thread;
let mem_budget = self.options.memory_budget_per_thread;
let index = self.index.clone();
let join_handle: JoinHandle<crate::Result<()>> = thread::Builder::new()
.name(format!("thrd-tantivy-index{}", self.worker_id))
@@ -451,7 +472,7 @@ impl<D: Document> IndexWriter<D> {
}
fn start_workers(&mut self) -> crate::Result<()> {
for _ in 0..self.num_threads {
for _ in 0..self.options.num_worker_threads {
self.add_indexing_worker()?;
}
Ok(())
@@ -553,12 +574,7 @@ impl<D: Document> IndexWriter<D> {
.take()
.expect("The IndexWriter does not have any lock. This is a bug, please report.");
let new_index_writer = IndexWriter::new(
&self.index,
self.num_threads,
self.memory_budget_in_bytes_per_thread,
directory_lock,
)?;
let new_index_writer = IndexWriter::new(&self.index, self.options.clone(), directory_lock)?;
// the current `self` is dropped right away because of this call.
//
@@ -812,7 +828,7 @@ mod tests {
use crate::directory::error::LockError;
use crate::error::*;
use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::indexer::NoMergePolicy;
use crate::indexer::{IndexWriterOptions, NoMergePolicy};
use crate::query::{QueryParser, TermQuery};
use crate::schema::{
self, Facet, FacetOptions, IndexRecordOption, IpAddrOptions, JsonObjectOptions,
@@ -2533,4 +2549,36 @@ mod tests {
index_writer.commit().unwrap();
Ok(())
}
#[test]
fn test_writer_options_validation() {
let mut schema_builder = Schema::builder();
let field = schema_builder.add_bool_field("example", STORED);
let index = Index::create_in_ram(schema_builder.build());
let opt_wo_threads = IndexWriterOptions::builder().num_worker_threads(0).build();
let result = index.writer_with_options::<TantivyDocument>(opt_wo_threads);
assert!(result.is_err(), "Writer should reject 0 thread count");
assert!(matches!(result, Err(TantivyError::InvalidArgument(_))));
let opt_with_low_memory = IndexWriterOptions::builder()
.memory_budget_per_thread(10 << 10)
.build();
let result = index.writer_with_options::<TantivyDocument>(opt_with_low_memory);
assert!(
result.is_err(),
"Writer should reject options with too low memory size"
);
assert!(matches!(result, Err(TantivyError::InvalidArgument(_))));
let opt_with_low_memory = IndexWriterOptions::builder()
.memory_budget_per_thread(5 << 30)
.build();
let result = index.writer_with_options::<TantivyDocument>(opt_with_low_memory);
assert!(
result.is_err(),
"Writer should reject options with too high memory size"
);
assert!(matches!(result, Err(TantivyError::InvalidArgument(_))));
}
}

View File

@@ -104,7 +104,7 @@ impl MergePolicy for LogMergePolicy {
let mut current_max_log_size = f64::MAX;
let mut levels = vec![];
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
for (_, merge_group) in &size_sorted_segments.into_iter().chunk_by(|segment| {
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
if segment_log_size < (current_max_log_size - self.level_log_size) {
// update current_max_log_size to create a new group

View File

@@ -36,7 +36,7 @@ impl MergePolicy for NoMergePolicy {
}
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::*;

View File

@@ -31,7 +31,7 @@ mod stamper;
use crossbeam_channel as channel;
use smallvec::SmallVec;
pub use self::index_writer::IndexWriter;
pub use self::index_writer::{IndexWriter, IndexWriterOptions};
pub use self::log_merge_policy::LogMergePolicy;
pub use self::merge_operation::MergeOperation;
pub use self::merge_policy::{MergeCandidate, MergePolicy, NoMergePolicy};

View File

@@ -25,8 +25,6 @@ use crate::indexer::{
};
use crate::{FutureResult, Opstamp};
const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
@@ -273,6 +271,7 @@ impl SegmentUpdater {
index: Index,
stamper: Stamper,
delete_cursor: &DeleteCursor,
num_merge_threads: usize,
) -> crate::Result<SegmentUpdater> {
let segments = index.searchable_segment_metas()?;
let segment_manager = SegmentManager::from_segments(segments, delete_cursor);
@@ -287,7 +286,7 @@ impl SegmentUpdater {
})?;
let merge_thread_pool = ThreadPoolBuilder::new()
.thread_name(|i| format!("merge_thread_{i}"))
.num_threads(NUM_MERGE_THREADS)
.num_threads(num_merge_threads)
.build()
.map_err(|_| {
crate::TantivyError::SystemError(

View File

@@ -150,7 +150,7 @@ impl SegmentWriter {
let vals_grouped_by_field = doc
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);
.chunk_by(|(field, _)| *field);
for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|el| el.1);
@@ -422,6 +422,7 @@ mod tests {
use std::collections::BTreeMap;
use std::path::{Path, PathBuf};
use columnar::ColumnType;
use tempfile::TempDir;
use crate::collector::{Count, TopDocs};
@@ -431,15 +432,15 @@ mod tests {
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
DATE_TIME_PRECISION_INDEXED, FAST, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;
use crate::time::OffsetDateTime;
use crate::tokenizer::{PreTokenizedString, Token};
use crate::{
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, TantivyDocument, Term,
TERMINATED,
DateTime, Directory, DocAddress, DocSet, Index, IndexWriter, SegmentReader,
TantivyDocument, Term, TERMINATED,
};
#[test]
@@ -841,6 +842,75 @@ mod tests {
assert_eq!(searcher.search(&phrase_query, &Count).unwrap(), 0);
}
#[test]
fn test_json_fast() {
let mut schema_builder = Schema::builder();
let json_field = schema_builder.add_json_field("json", FAST);
let schema = schema_builder.build();
let json_val: serde_json::Value = serde_json::from_str(
r#"{
"toto": "titi",
"float": -0.2,
"bool": true,
"unsigned": 1,
"signed": -2,
"complexobject": {
"field.with.dot": 1
},
"date": "1985-04-12T23:20:50.52Z",
"my_arr": [2, 3, {"my_key": "two tokens"}, 4]
}"#,
)
.unwrap();
let doc = doc!(json_field=>json_val.clone());
let index = Index::create_in_ram(schema.clone());
let mut writer = index.writer_for_tests().unwrap();
writer.add_document(doc).unwrap();
writer.commit().unwrap();
let reader = index.reader().unwrap();
let searcher = reader.searcher();
let segment_reader = searcher.segment_reader(0u32);
fn assert_type(reader: &SegmentReader, field: &str, typ: ColumnType) {
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
assert_eq!(cols.len(), 1, "{}", field);
assert_eq!(cols[0].column_type(), typ, "{}", field);
}
assert_type(segment_reader, "json.toto", ColumnType::Str);
assert_type(segment_reader, "json.float", ColumnType::F64);
assert_type(segment_reader, "json.bool", ColumnType::Bool);
assert_type(segment_reader, "json.unsigned", ColumnType::I64);
assert_type(segment_reader, "json.signed", ColumnType::I64);
assert_type(
segment_reader,
"json.complexobject.field\\.with\\.dot",
ColumnType::I64,
);
assert_type(segment_reader, "json.date", ColumnType::DateTime);
assert_type(segment_reader, "json.my_arr", ColumnType::I64);
assert_type(segment_reader, "json.my_arr.my_key", ColumnType::Str);
fn assert_empty(reader: &SegmentReader, field: &str) {
let cols = reader.fast_fields().dynamic_column_handles(field).unwrap();
assert_eq!(cols.len(), 0);
}
assert_empty(segment_reader, "unknown");
assert_empty(segment_reader, "json");
assert_empty(segment_reader, "json.toto.titi");
let sub_columns = segment_reader
.fast_fields()
.dynamic_subpath_column_handles("json")
.unwrap();
assert_eq!(sub_columns.len(), 9);
let subsub_columns = segment_reader
.fast_fields()
.dynamic_subpath_column_handles("json.complexobject")
.unwrap();
assert_eq!(subsub_columns.len(), 1);
}
#[test]
fn test_json_term_with_numeric_merge_panic_regression_bug_2283() {
// https://github.com/quickwit-oss/tantivy/issues/2283

View File

@@ -1,79 +1,19 @@
use std::ops::Range;
use std::sync::atomic::Ordering;
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;
use crate::Opstamp;
#[cfg(not(target_arch = "arm"))]
mod atomic_impl {
use std::sync::atomic::{AtomicU64, Ordering};
use crate::Opstamp;
#[derive(Default)]
pub struct AtomicU64Wrapper(AtomicU64);
impl AtomicU64Wrapper {
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
AtomicU64Wrapper(AtomicU64::new(first_opstamp))
}
pub fn fetch_add(&self, val: u64, order: Ordering) -> u64 {
self.0.fetch_add(val, order)
}
pub fn revert(&self, val: u64, order: Ordering) -> u64 {
self.0.store(val, order);
val
}
}
}
#[cfg(target_arch = "arm")]
mod atomic_impl {
/// Under other architecture, we rely on a mutex.
use std::sync::atomic::Ordering;
use std::sync::RwLock;
use crate::Opstamp;
#[derive(Default)]
pub struct AtomicU64Wrapper(RwLock<u64>);
impl AtomicU64Wrapper {
pub fn new(first_opstamp: Opstamp) -> AtomicU64Wrapper {
AtomicU64Wrapper(RwLock::new(first_opstamp))
}
pub fn fetch_add(&self, incr: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
let previous_val = *lock;
*lock = previous_val + incr;
previous_val
}
pub fn revert(&self, val: u64, _order: Ordering) -> u64 {
let mut lock = self.0.write().unwrap();
*lock = val;
val
}
}
}
use self::atomic_impl::AtomicU64Wrapper;
/// Stamper provides Opstamps, which is just an auto-increment id to label
/// an operation.
///
/// Cloning does not "fork" the stamp generation. The stamper actually wraps an `Arc`.
#[derive(Clone, Default)]
pub struct Stamper(Arc<AtomicU64Wrapper>);
pub struct Stamper(Arc<AtomicU64>);
impl Stamper {
pub fn new(first_opstamp: Opstamp) -> Stamper {
Stamper(Arc::new(AtomicU64Wrapper::new(first_opstamp)))
Stamper(Arc::new(AtomicU64::new(first_opstamp)))
}
pub fn stamp(&self) -> Opstamp {
@@ -92,7 +32,8 @@ impl Stamper {
/// Reverts the stamper to a given `Opstamp` value and returns it
pub fn revert(&self, to_opstamp: Opstamp) -> Opstamp {
self.0.revert(to_opstamp, Ordering::SeqCst)
self.0.store(to_opstamp, Ordering::SeqCst);
to_opstamp
}
}
@@ -101,7 +42,7 @@ mod test {
use super::Stamper;
#[allow(clippy::redundant_clone)]
#[expect(clippy::redundant_clone)]
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);
@@ -117,7 +58,7 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}
#[allow(clippy::redundant_clone)]
#[expect(clippy::redundant_clone)]
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);

View File

@@ -178,10 +178,8 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;
mod core;
#[allow(deprecated)] // Remove with index sorting
pub mod indexer;
#[allow(unused_doc_comments)]
pub mod error;
pub mod tokenizer;
@@ -190,7 +188,6 @@ pub mod collector;
pub mod directory;
pub mod fastfield;
pub mod fieldnorm;
#[allow(deprecated)] // Remove with index sorting
pub mod index;
pub mod positions;
pub mod postings;
@@ -223,7 +220,6 @@ pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
pub use crate::core::json_utils;
pub use crate::core::{Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
#[allow(deprecated)] // Remove with index sorting
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader,
@@ -232,7 +228,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};
/// Index format version.
pub const INDEX_FORMAT_VERSION: u32 = 6;
pub const INDEX_FORMAT_VERSION: u32 = 7;
/// Oldest index format version this tantivy version can read.
pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
@@ -371,6 +367,7 @@ macro_rules! fail_point {
}};
}
/// Common test utilities.
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};
@@ -389,6 +386,7 @@ pub mod tests {
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};
/// Asserts that the serialized value is the value in the trait.
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();
@@ -421,6 +419,7 @@ pub mod tests {
}};
}
/// Generates random numbers
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
let seed: [u8; 32] = [1; 32];
StdRng::from_seed(seed)
@@ -429,6 +428,7 @@ pub mod tests {
.collect::<Vec<u32>>()
}
/// Sample `n` elements with Bernoulli distribution.
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
StdRng::from_seed([seed_val; 32])
.sample_iter(&Bernoulli::new(ratio).unwrap())
@@ -438,6 +438,7 @@ pub mod tests {
.collect()
}
/// Sample `n` elements with Bernoulli distribution.
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}

View File

@@ -41,7 +41,6 @@
/// );
/// # }
/// ```
#[macro_export]
macro_rules! doc(
() => {

View File

@@ -1,4 +1,5 @@
//! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
//!
//! This position is expressed as token ordinal. For instance,
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
//! This information is useful to run phrase queries.
@@ -38,7 +39,7 @@ pub use self::serializer::PositionSerializer;
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use std::iter;

View File

@@ -264,7 +264,7 @@ impl VIntDecoder for BlockDecoder {
}
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use super::*;
use crate::TERMINATED;

View File

@@ -0,0 +1,155 @@
use crate::docset::{DocSet, TERMINATED};
use crate::postings::{Postings, SegmentPostings};
use crate::DocId;
/// `LoadedPostings` is a `DocSet` and `Postings` implementation.
/// It is used to represent the postings of a term in memory.
/// It is suitable if there are few documents for a term.
///
/// It exists mainly to reduce memory usage.
/// `SegmentPostings` uses 1840 bytes per instance due to its caches.
/// If you need to keep many terms around with few docs, it's cheaper to load all the
/// postings in memory.
///
/// This is relevant for `RegexPhraseQuery`, which may have a lot of
/// terms.
/// E.g. 100_000 terms would need 184MB due to SegmentPostings.
pub struct LoadedPostings {
doc_ids: Box<[DocId]>,
position_offsets: Box<[u32]>,
positions: Box<[u32]>,
cursor: usize,
}
impl LoadedPostings {
/// Creates a new `LoadedPostings` from a `SegmentPostings`.
///
/// It will also preload positions, if positions are available in the SegmentPostings.
pub fn load(segment_postings: &mut SegmentPostings) -> LoadedPostings {
let num_docs = segment_postings.doc_freq() as usize;
let mut doc_ids = Vec::with_capacity(num_docs);
let mut positions = Vec::with_capacity(num_docs);
let mut position_offsets = Vec::with_capacity(num_docs);
while segment_postings.doc() != TERMINATED {
position_offsets.push(positions.len() as u32);
doc_ids.push(segment_postings.doc());
segment_postings.append_positions_with_offset(0, &mut positions);
segment_postings.advance();
}
position_offsets.push(positions.len() as u32);
LoadedPostings {
doc_ids: doc_ids.into_boxed_slice(),
positions: positions.into_boxed_slice(),
position_offsets: position_offsets.into_boxed_slice(),
cursor: 0,
}
}
}
#[cfg(test)]
impl From<(Vec<DocId>, Vec<Vec<u32>>)> for LoadedPostings {
fn from(doc_ids_and_positions: (Vec<DocId>, Vec<Vec<u32>>)) -> LoadedPostings {
let mut position_offsets = Vec::new();
let mut all_positions = Vec::new();
let (doc_ids, docid_positions) = doc_ids_and_positions;
for positions in docid_positions {
position_offsets.push(all_positions.len() as u32);
all_positions.extend_from_slice(&positions);
}
position_offsets.push(all_positions.len() as u32);
LoadedPostings {
doc_ids: doc_ids.into_boxed_slice(),
positions: all_positions.into_boxed_slice(),
position_offsets: position_offsets.into_boxed_slice(),
cursor: 0,
}
}
}
impl DocSet for LoadedPostings {
fn advance(&mut self) -> DocId {
self.cursor += 1;
if self.cursor >= self.doc_ids.len() {
self.cursor = self.doc_ids.len();
return TERMINATED;
}
self.doc()
}
fn doc(&self) -> DocId {
if self.cursor >= self.doc_ids.len() {
return TERMINATED;
}
self.doc_ids[self.cursor]
}
fn size_hint(&self) -> u32 {
self.doc_ids.len() as u32
}
}
impl Postings for LoadedPostings {
fn term_freq(&self) -> u32 {
let start = self.position_offsets[self.cursor] as usize;
let end = self.position_offsets[self.cursor + 1] as usize;
(end - start) as u32
}
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let start = self.position_offsets[self.cursor] as usize;
let end = self.position_offsets[self.cursor + 1] as usize;
for pos in &self.positions[start..end] {
output.push(*pos + offset);
}
}
}
#[cfg(test)]
pub(crate) mod tests {
use super::*;
#[test]
pub fn test_vec_postings() {
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
let mut postings = LoadedPostings::from((doc_ids, vec![]));
assert_eq!(postings.doc(), 0u32);
assert_eq!(postings.advance(), 3u32);
assert_eq!(postings.doc(), 3u32);
assert_eq!(postings.seek(14u32), 15u32);
assert_eq!(postings.doc(), 15u32);
assert_eq!(postings.seek(300u32), 300u32);
assert_eq!(postings.doc(), 300u32);
assert_eq!(postings.seek(6000u32), TERMINATED);
}
#[test]
pub fn test_vec_postings2() {
let doc_ids: Vec<DocId> = (0u32..1024u32).map(|e| e * 3).collect();
let mut positions = Vec::new();
positions.resize(1024, Vec::new());
positions[0] = vec![1u32, 2u32, 3u32];
positions[1] = vec![30u32];
positions[2] = vec![10u32];
positions[4] = vec![50u32];
let mut postings = LoadedPostings::from((doc_ids, positions));
let load = |postings: &mut LoadedPostings| {
let mut loaded_positions = Vec::new();
postings.positions(loaded_positions.as_mut());
loaded_positions
};
assert_eq!(postings.doc(), 0u32);
assert_eq!(load(&mut postings), vec![1u32, 2u32, 3u32]);
assert_eq!(postings.advance(), 3u32);
assert_eq!(postings.doc(), 3u32);
assert_eq!(load(&mut postings), vec![30u32]);
assert_eq!(postings.seek(14u32), 15u32);
assert_eq!(postings.doc(), 15u32);
assert_eq!(postings.seek(300u32), 300u32);
assert_eq!(postings.doc(), 300u32);
assert_eq!(postings.seek(6000u32), TERMINATED);
}
}

View File

@@ -8,6 +8,7 @@ mod block_segment_postings;
pub(crate) mod compression;
mod indexing_context;
mod json_postings_writer;
mod loaded_postings;
mod per_field_postings_writer;
mod postings;
mod postings_writer;
@@ -17,6 +18,7 @@ mod serializer;
mod skip;
mod term_info;
pub(crate) use loaded_postings::LoadedPostings;
pub(crate) use stacker::compute_table_memory_size;
pub use self::block_segment_postings::BlockSegmentPostings;
@@ -29,7 +31,7 @@ pub use self::serializer::{FieldSerializer, InvertedIndexSerializer};
pub(crate) use self::skip::{BlockInfo, SkipReader};
pub use self::term_info::TermInfo;
#[allow(clippy::enum_variant_names)]
#[expect(clippy::enum_variant_names)]
#[derive(Debug, PartialEq, Clone, Copy, Eq)]
pub(crate) enum FreqReadingOption {
NoFreq,
@@ -38,7 +40,7 @@ pub(crate) enum FreqReadingOption {
}
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use std::mem;
use super::{InvertedIndexSerializer, Postings};

View File

@@ -17,7 +17,14 @@ pub trait Postings: DocSet + 'static {
/// Returns the positions offsetted with a given value.
/// It is not necessary to clear the `output` before calling this method.
/// The output vector will be resized to the `term_freq`.
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
output.clear();
self.append_positions_with_offset(offset, output);
}
/// Returns the positions offsetted with a given value.
/// Data will be appended to the output.
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>);
/// Returns the positions of the term in the given document.
/// The output vector will be resized to the `term_freq`.
@@ -25,3 +32,13 @@ pub trait Postings: DocSet + 'static {
self.positions_with_offset(0u32, output);
}
}
impl Postings for Box<dyn Postings> {
fn term_freq(&self) -> u32 {
(**self).term_freq()
}
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
(**self).append_positions_with_offset(offset, output);
}
}

View File

@@ -34,7 +34,7 @@ impl<'a> VInt32Reader<'a> {
}
}
impl<'a> Iterator for VInt32Reader<'a> {
impl Iterator for VInt32Reader<'_> {
type Item = u32;
fn next(&mut self) -> Option<u32> {

View File

@@ -237,8 +237,9 @@ impl Postings for SegmentPostings {
self.block_cursor.freq(self.cur)
}
fn positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
fn append_positions_with_offset(&mut self, offset: u32, output: &mut Vec<u32>) {
let term_freq = self.term_freq();
let prev_len = output.len();
if let Some(position_reader) = self.position_reader.as_mut() {
debug_assert!(
!self.block_cursor.freqs().is_empty(),
@@ -249,15 +250,14 @@ impl Postings for SegmentPostings {
.iter()
.cloned()
.sum::<u32>() as u64);
output.resize(term_freq as usize, 0u32);
position_reader.read(read_offset, &mut output[..]);
// TODO: instead of zeroing the output, we could use MaybeUninit or similar.
output.resize(prev_len + term_freq as usize, 0u32);
position_reader.read(read_offset, &mut output[prev_len..]);
let mut cum = offset;
for output_mut in output.iter_mut() {
for output_mut in output[prev_len..].iter_mut() {
cum += *output_mut;
*output_mut = cum;
}
} else {
output.clear();
}
}
}

View File

@@ -6,6 +6,7 @@ use tantivy_fst::Automaton;
use super::phrase_prefix_query::prefix_end;
use crate::index::SegmentReader;
use crate::postings::TermInfo;
use crate::query::{BitSetDocSet, ConstScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::termdict::{TermDictionary, TermStreamer};
@@ -64,6 +65,18 @@ where
term_stream_builder.into_stream()
}
/// Returns the term infos that match the automaton
pub fn get_match_term_infos(&self, reader: &SegmentReader) -> crate::Result<Vec<TermInfo>> {
let inverted_index = reader.inverted_index(self.field)?;
let term_dict = inverted_index.terms();
let mut term_stream = self.automaton_stream(term_dict)?;
let mut term_infos = Vec::new();
while term_stream.advance() {
term_infos.push(term_stream.value().clone());
}
Ok(term_infos)
}
}
impl<A> Weight for AutomatonWeight<A>

View File

@@ -272,7 +272,7 @@ impl<'a> From<&'a mut TermScorer> for TermScorerWithMaxScore<'a> {
}
}
impl<'a> Deref for TermScorerWithMaxScore<'a> {
impl Deref for TermScorerWithMaxScore<'_> {
type Target = TermScorer;
fn deref(&self) -> &Self::Target {
@@ -280,7 +280,7 @@ impl<'a> Deref for TermScorerWithMaxScore<'a> {
}
}
impl<'a> DerefMut for TermScorerWithMaxScore<'a> {
impl DerefMut for TermScorerWithMaxScore<'_> {
fn deref_mut(&mut self) -> &mut Self::Target {
self.scorer
}
@@ -308,7 +308,7 @@ mod tests {
use crate::query::score_combiner::SumCombiner;
use crate::query::term_query::TermScorer;
use crate::query::{Bm25Weight, Scorer, Union};
use crate::query::{Bm25Weight, BufferedUnionScorer, Scorer};
use crate::{DocId, DocSet, Score, TERMINATED};
struct Float(Score);
@@ -371,7 +371,7 @@ mod tests {
fn compute_checkpoints_manual(term_scorers: Vec<TermScorer>, n: usize) -> Vec<(DocId, Score)> {
let mut heap: BinaryHeap<Float> = BinaryHeap::with_capacity(n);
let mut checkpoints: Vec<(DocId, Score)> = Vec::new();
let mut scorer = Union::build(term_scorers, SumCombiner::default);
let mut scorer = BufferedUnionScorer::build(term_scorers, SumCombiner::default);
let mut limit = Score::MIN;
loop {
@@ -417,7 +417,7 @@ mod tests {
.boxed()
}
#[allow(clippy::type_complexity)]
#[expect(clippy::type_complexity)]
fn gen_term_scorers(num_scorers: usize) -> BoxedStrategy<(Vec<Vec<(DocId, u32)>>, Vec<u32>)> {
(1u32..100u32)
.prop_flat_map(move |max_doc: u32| {

View File

@@ -9,8 +9,8 @@ use crate::query::score_combiner::{DoNothingCombiner, ScoreCombiner};
use crate::query::term_query::TermScorer;
use crate::query::weight::{for_each_docset_buffered, for_each_pruning_scorer, for_each_scorer};
use crate::query::{
intersect_scorers, EmptyScorer, Exclude, Explanation, Occur, RequiredOptionalScorer, Scorer,
Union, Weight,
intersect_scorers, BufferedUnionScorer, EmptyScorer, Exclude, Explanation, Occur,
RequiredOptionalScorer, Scorer, Weight,
};
use crate::{DocId, Score};
@@ -65,14 +65,17 @@ where
// Block wand is only available if we read frequencies.
return SpecializedScorer::TermUnion(scorers);
} else {
return SpecializedScorer::Other(Box::new(Union::build(
return SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
)));
}
}
}
SpecializedScorer::Other(Box::new(Union::build(scorers, score_combiner_fn)))
SpecializedScorer::Other(Box::new(BufferedUnionScorer::build(
scorers,
score_combiner_fn,
)))
}
fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
@@ -81,7 +84,7 @@ fn into_box_scorer<TScoreCombiner: ScoreCombiner>(
) -> Box<dyn Scorer> {
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let union_scorer = Union::build(term_scorers, score_combiner_fn);
let union_scorer = BufferedUnionScorer::build(term_scorers, score_combiner_fn);
Box::new(union_scorer)
}
SpecializedScorer::Other(scorer) => scorer,
@@ -296,7 +299,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
let scorer = self.complex_scorer(reader, 1.0, &self.score_combiner_fn)?;
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
let mut union_scorer =
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
for_each_scorer(&mut union_scorer, callback);
}
SpecializedScorer::Other(mut scorer) => {
@@ -316,7 +320,8 @@ impl<TScoreCombiner: ScoreCombiner + Sync> Weight for BooleanWeight<TScoreCombin
match scorer {
SpecializedScorer::TermUnion(term_scorers) => {
let mut union_scorer = Union::build(term_scorers, &self.score_combiner_fn);
let mut union_scorer =
BufferedUnionScorer::build(term_scorers, &self.score_combiner_fn);
for_each_docset_buffered(&mut union_scorer, &mut buffer, callback);
}
SpecializedScorer::Other(mut scorer) => {

View File

@@ -7,14 +7,32 @@ use crate::docset::{DocSet, TERMINATED};
use crate::index::SegmentReader;
use crate::query::explanation::does_not_match;
use crate::query::{EnableScoring, Explanation, Query, Scorer, Weight};
use crate::schema::Type;
use crate::{DocId, Score, TantivyError};
/// Query that matches all documents with a non-null value in the specified field.
/// Query that matches all documents with a non-null value in the specified
/// field.
///
/// When querying inside a JSON field, "exists" queries can be executed strictly
/// on the field name or check all the subpaths. In that second case a document
/// will be matched if a non-null value exists in any subpath. For example,
/// assuming the following document where `myfield` is a JSON fast field:
/// ```json
/// {
/// "myfield": {
/// "mysubfield": "hello"
/// }
/// }
/// ```
/// With `json_subpaths` enabled queries on either `myfield` or
/// `myfield.mysubfield` will match the document. If it is set to false, only
/// `myfield.mysubfield` will match it.
///
/// All of the matched documents get the score 1.0.
#[derive(Clone, Debug)]
pub struct ExistsQuery {
field_name: String,
json_subpaths: bool,
}
impl ExistsQuery {
@@ -23,8 +41,28 @@ impl ExistsQuery {
/// This query matches all documents with at least one non-null value in the specified field.
/// This constructor never fails, but executing the search with this query will return an
/// error if the specified field doesn't exists or is not a fast field.
#[deprecated]
pub fn new_exists_query(field: String) -> ExistsQuery {
ExistsQuery { field_name: field }
ExistsQuery {
field_name: field,
json_subpaths: false,
}
}
/// Creates a new `ExistQuery` from the given field.
///
/// This query matches all documents with at least one non-null value in the
/// specified field. If `json_subpaths` is set to true, documents with
/// non-null values in any JSON subpath will also be matched.
///
/// This constructor never fails, but executing the search with this query will
/// return an error if the specified field doesn't exists or is not a fast
/// field.
pub fn new(field: String, json_subpaths: bool) -> Self {
Self {
field_name: field,
json_subpaths,
}
}
}
@@ -43,6 +81,8 @@ impl Query for ExistsQuery {
}
Ok(Box::new(ExistsWeight {
field_name: self.field_name.clone(),
field_type: field_type.value_type(),
json_subpaths: self.json_subpaths,
}))
}
}
@@ -50,13 +90,20 @@ impl Query for ExistsQuery {
/// Weight associated with the `ExistsQuery` query.
pub struct ExistsWeight {
field_name: String,
field_type: Type,
json_subpaths: bool,
}
impl Weight for ExistsWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
let fast_field_reader = reader.fast_fields();
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = fast_field_reader
.dynamic_column_handles(&self.field_name)?
let mut column_handles = fast_field_reader.dynamic_column_handles(&self.field_name)?;
if self.field_type == Type::Json && self.json_subpaths {
let mut sub_columns =
fast_field_reader.dynamic_subpath_column_handles(&self.field_name)?;
column_handles.append(&mut sub_columns);
}
let dynamic_columns: crate::Result<Vec<DynamicColumn>> = column_handles
.into_iter()
.map(|handle| handle.open().map_err(|io_error| io_error.into()))
.collect();
@@ -180,11 +227,12 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "all")?, 100);
assert_eq!(count_existing_fields(&searcher, "odd")?, 50);
assert_eq!(count_existing_fields(&searcher, "even")?, 50);
assert_eq!(count_existing_fields(&searcher, "multi")?, 10);
assert_eq!(count_existing_fields(&searcher, "never")?, 0);
assert_eq!(count_existing_fields(&searcher, "all", false)?, 100);
assert_eq!(count_existing_fields(&searcher, "odd", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "even", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "multi", false)?, 10);
assert_eq!(count_existing_fields(&searcher, "multi", true)?, 10);
assert_eq!(count_existing_fields(&searcher, "never", false)?, 0);
// exercise seek
let query = BooleanQuery::intersection(vec![
@@ -192,7 +240,7 @@ mod tests {
Bound::Included(Term::from_field_u64(all_field, 50)),
Bound::Unbounded,
)),
Box::new(ExistsQuery::new_exists_query("even".to_string())),
Box::new(ExistsQuery::new("even".to_string(), false)),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
@@ -201,7 +249,7 @@ mod tests {
Bound::Included(Term::from_field_u64(all_field, 0)),
Bound::Included(Term::from_field_u64(all_field, 50)),
)),
Box::new(ExistsQuery::new_exists_query("odd".to_string())),
Box::new(ExistsQuery::new("odd".to_string(), false)),
]);
assert_eq!(searcher.search(&query, &Count)?, 25);
@@ -230,22 +278,18 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "json.all")?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even")?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd")?, 50);
assert_eq!(count_existing_fields(&searcher, "json.all", false)?, 100);
assert_eq!(count_existing_fields(&searcher, "json.even", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.even", true)?, 50);
assert_eq!(count_existing_fields(&searcher, "json.odd", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "json", false)?, 0);
assert_eq!(count_existing_fields(&searcher, "json", true)?, 100);
// Handling of non-existing fields:
assert_eq!(count_existing_fields(&searcher, "json.absent")?, 0);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists.absent".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists.absent'"
);
assert_eq!(count_existing_fields(&searcher, "json.absent", false)?, 0);
assert_eq!(count_existing_fields(&searcher, "json.absent", true)?, 0);
assert_does_not_exist(&searcher, "does_not_exists.absent", true);
assert_does_not_exist(&searcher, "does_not_exists.absent", false);
Ok(())
}
@@ -284,12 +328,13 @@ mod tests {
let reader = index.reader()?;
let searcher = reader.searcher();
assert_eq!(count_existing_fields(&searcher, "bool")?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes")?, 50);
assert_eq!(count_existing_fields(&searcher, "date")?, 50);
assert_eq!(count_existing_fields(&searcher, "f64")?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr")?, 50);
assert_eq!(count_existing_fields(&searcher, "facet")?, 50);
assert_eq!(count_existing_fields(&searcher, "bool", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "bool", true)?, 50);
assert_eq!(count_existing_fields(&searcher, "bytes", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "date", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "f64", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "ip_addr", false)?, 50);
assert_eq!(count_existing_fields(&searcher, "facet", false)?, 50);
Ok(())
}
@@ -313,31 +358,33 @@ mod tests {
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("not_fast".to_string()),
&Count
)
.search(&ExistsQuery::new("not_fast".to_string(), false), &Count)
.unwrap_err()
.to_string(),
"Schema error: 'Field not_fast is not a fast field.'"
);
assert_eq!(
searcher
.search(
&ExistsQuery::new_exists_query("does_not_exists".to_string()),
&Count
)
.unwrap_err()
.to_string(),
"The field does not exist: 'does_not_exists'"
);
assert_does_not_exist(&searcher, "does_not_exists", false);
Ok(())
}
fn count_existing_fields(searcher: &Searcher, field: &str) -> crate::Result<usize> {
let query = ExistsQuery::new_exists_query(field.to_string());
fn count_existing_fields(
searcher: &Searcher,
field: &str,
json_subpaths: bool,
) -> crate::Result<usize> {
let query = ExistsQuery::new(field.to_string(), json_subpaths);
searcher.search(&query, &Count)
}
fn assert_does_not_exist(searcher: &Searcher, field: &str, json_subpaths: bool) {
assert_eq!(
searcher
.search(&ExistsQuery::new(field.to_string(), json_subpaths), &Count)
.unwrap_err()
.to_string(),
format!("The field does not exist: '{}'", field)
);
}
}

View File

@@ -51,6 +51,7 @@ pub use self::fuzzy_query::FuzzyTermQuery;
pub use self::intersection::{intersect_scorers, Intersection};
pub use self::more_like_this::{MoreLikeThisQuery, MoreLikeThisQueryBuilder};
pub use self::phrase_prefix_query::PhrasePrefixQuery;
pub use self::phrase_query::regex_phrase_query::{wildcard_query_to_regex_str, RegexPhraseQuery};
pub use self::phrase_query::PhraseQuery;
pub use self::query::{EnableScoring, Query, QueryClone};
pub use self::query_parser::{QueryParser, QueryParserError};
@@ -61,7 +62,7 @@ pub use self::score_combiner::{DisjunctionMaxCombiner, ScoreCombiner, SumCombine
pub use self::scorer::Scorer;
pub use self::set_query::TermSetQuery;
pub use self::term_query::TermQuery;
pub use self::union::Union;
pub use self::union::BufferedUnionScorer;
#[cfg(test)]
pub use self::vec_docset::VecDocSet;
pub use self::weight::Weight;

View File

@@ -8,7 +8,7 @@ use crate::{DocId, Score};
// MultiPrefix is the larger variant, and also the one we expect most often. PhraseScorer is > 1kB
// though, it would be interesting to slim it down if possible.
#[allow(clippy::large_enum_variant)]
#[expect(clippy::large_enum_variant)]
enum PhraseKind<TPostings: Postings> {
SinglePrefix {
position_offset: u32,

View File

@@ -53,27 +53,14 @@ impl PhrasePrefixWeight {
.map(|similarity_weight| similarity_weight.boost_by(boost));
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut term_postings_list = Vec::new();
if reader.has_deletes() {
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
} else {
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
@@ -109,8 +96,8 @@ impl PhrasePrefixWeight {
{
suffixes.push(postings);
}
} else if let Some(postings) = inv_index
.read_postings_no_deletes(&new_term, IndexRecordOption::WithFreqsAndPositions)?
} else if let Some(postings) =
inv_index.read_postings(&new_term, IndexRecordOption::WithFreqsAndPositions)?
{
suffixes.push(postings);
}

View File

@@ -1,6 +1,8 @@
mod phrase_query;
mod phrase_scorer;
mod phrase_weight;
pub mod regex_phrase_query;
mod regex_phrase_weight;
pub use self::phrase_query::PhraseQuery;
pub(crate) use self::phrase_scorer::intersection_count;
@@ -8,7 +10,7 @@ pub use self::phrase_scorer::PhraseScorer;
pub use self::phrase_weight::PhraseWeight;
#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
use serde_json::json;
@@ -19,15 +21,15 @@ pub mod tests {
use crate::schema::{Schema, Term, TEXT};
use crate::{assert_nearly_equals, DocAddress, DocId, IndexWriter, TERMINATED};
pub fn create_index(texts: &[&'static str]) -> crate::Result<Index> {
pub fn create_index<S: AsRef<str>>(texts: &[S]) -> crate::Result<Index> {
let mut schema_builder = Schema::builder();
let text_field = schema_builder.add_text_field("text", TEXT);
let schema = schema_builder.build();
let index = Index::create_in_ram(schema);
{
let mut index_writer: IndexWriter = index.writer_for_tests()?;
for &text in texts {
let doc = doc!(text_field=>text);
for text in texts {
let doc = doc!(text_field=>text.as_ref());
index_writer.add_document(doc)?;
}
index_writer.commit()?;

View File

@@ -50,27 +50,14 @@ impl PhraseWeight {
.map(|similarity_weight| similarity_weight.boost_by(boost));
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut term_postings_list = Vec::new();
if reader.has_deletes() {
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
} else {
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings_no_deletes(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
for &(offset, ref term) in &self.phrase_terms {
if let Some(postings) = reader
.inverted_index(term.field())?
.read_postings(term, IndexRecordOption::WithFreqsAndPositions)?
{
term_postings_list.push((offset, postings));
} else {
return Ok(None);
}
}
Ok(Some(PhraseScorer::new(

View File

@@ -0,0 +1,172 @@
use super::regex_phrase_weight::RegexPhraseWeight;
use crate::query::bm25::Bm25Weight;
use crate::query::{EnableScoring, Query, Weight};
use crate::schema::{Field, IndexRecordOption, Term, Type};
/// `RegexPhraseQuery` matches a specific sequence of regex queries.
///
/// For instance, the phrase query for `"pa.* time"` will match
/// the sentence:
///
/// **Alan just got a part time job.**
///
/// On the other hand it will not match the sentence.
///
/// **This is my favorite part of the job.**
///
/// [Slop](RegexPhraseQuery::set_slop) allows leniency in term proximity
/// for some performance trade-off.
///
/// Using a `RegexPhraseQuery` on a field requires positions
/// to be indexed for this field.
#[derive(Clone, Debug)]
pub struct RegexPhraseQuery {
field: Field,
phrase_terms: Vec<(usize, String)>,
slop: u32,
max_expansions: u32,
}
/// Transform a wildcard query to a regex string.
///
/// `AB*CD` for example is converted to `AB.*CD`
///
/// All other chars are regex escaped.
pub fn wildcard_query_to_regex_str(term: &str) -> String {
regex::escape(term).replace(r"\*", ".*")
}
impl RegexPhraseQuery {
/// Creates a new `RegexPhraseQuery` given a list of terms.
///
/// There must be at least two terms, and all terms
/// must belong to the same field.
///
/// Offset for each term will be same as index in the Vector
pub fn new(field: Field, terms: Vec<String>) -> RegexPhraseQuery {
let terms_with_offset = terms.into_iter().enumerate().collect();
RegexPhraseQuery::new_with_offset(field, terms_with_offset)
}
/// Creates a new `RegexPhraseQuery` given a list of terms and their offsets.
///
/// Can be used to provide custom offset for each term.
pub fn new_with_offset(field: Field, terms: Vec<(usize, String)>) -> RegexPhraseQuery {
RegexPhraseQuery::new_with_offset_and_slop(field, terms, 0)
}
/// Creates a new `RegexPhraseQuery` given a list of terms, their offsets and a slop
pub fn new_with_offset_and_slop(
field: Field,
mut terms: Vec<(usize, String)>,
slop: u32,
) -> RegexPhraseQuery {
assert!(
terms.len() > 1,
"A phrase query is required to have strictly more than one term."
);
terms.sort_by_key(|&(offset, _)| offset);
RegexPhraseQuery {
field,
phrase_terms: terms,
slop,
max_expansions: 1 << 14,
}
}
/// Slop allowed for the phrase.
///
/// The query will match if its terms are separated by `slop` terms at most.
/// The slop can be considered a budget between all terms.
/// E.g. "A B C" with slop 1 allows "A X B C", "A B X C", but not "A X B X C".
///
/// Transposition costs 2, e.g. "A B" with slop 1 will not match "B A" but it would with slop 2
/// Transposition is not a special case, in the example above A is moved 1 position and B is
/// moved 1 position, so the slop is 2.
///
/// As a result slop works in both directions, so the order of the terms may changed as long as
/// they respect the slop.
///
/// By default the slop is 0 meaning query terms need to be adjacent.
pub fn set_slop(&mut self, value: u32) {
self.slop = value;
}
/// Sets the max expansions a regex term can match. The limit will be over all terms.
/// After the limit is hit an error will be returned.
pub fn set_max_expansions(&mut self, value: u32) {
self.max_expansions = value;
}
/// The [`Field`] this `RegexPhraseQuery` is targeting.
pub fn field(&self) -> Field {
self.field
}
/// `Term`s in the phrase without the associated offsets.
pub fn phrase_terms(&self) -> Vec<Term> {
self.phrase_terms
.iter()
.map(|(_, term)| Term::from_field_text(self.field, term))
.collect::<Vec<Term>>()
}
/// Returns the [`RegexPhraseWeight`] for the given phrase query given a specific `searcher`.
///
/// This function is the same as [`Query::weight()`] except it returns
/// a specialized type [`RegexPhraseWeight`] instead of a Boxed trait.
pub(crate) fn regex_phrase_weight(
&self,
enable_scoring: EnableScoring<'_>,
) -> crate::Result<RegexPhraseWeight> {
let schema = enable_scoring.schema();
let field_type = schema.get_field_entry(self.field).field_type().value_type();
if field_type != Type::Str {
return Err(crate::TantivyError::SchemaError(format!(
"RegexPhraseQuery can only be used with a field of type text currently, but got \
{:?}",
field_type
)));
}
let field_entry = schema.get_field_entry(self.field);
let has_positions = field_entry
.field_type()
.get_index_record_option()
.map(IndexRecordOption::has_positions)
.unwrap_or(false);
if !has_positions {
let field_name = field_entry.name();
return Err(crate::TantivyError::SchemaError(format!(
"Applied phrase query on field {field_name:?}, which does not have positions \
indexed"
)));
}
let terms = self.phrase_terms();
let bm25_weight_opt = match enable_scoring {
EnableScoring::Enabled {
statistics_provider,
..
} => Some(Bm25Weight::for_terms(statistics_provider, &terms)?),
EnableScoring::Disabled { .. } => None,
};
let weight = RegexPhraseWeight::new(
self.field,
self.phrase_terms.clone(),
bm25_weight_opt,
self.max_expansions,
self.slop,
);
Ok(weight)
}
}
impl Query for RegexPhraseQuery {
/// Create the weight associated with a query.
///
/// See [`Weight`].
fn weight(&self, enable_scoring: EnableScoring<'_>) -> crate::Result<Box<dyn Weight>> {
let phrase_weight = self.regex_phrase_weight(enable_scoring)?;
Ok(Box::new(phrase_weight))
}
}

View File

@@ -0,0 +1,475 @@
use std::sync::Arc;
use common::BitSet;
use tantivy_fst::Regex;
use super::PhraseScorer;
use crate::fieldnorm::FieldNormReader;
use crate::index::SegmentReader;
use crate::postings::{LoadedPostings, Postings, SegmentPostings, TermInfo};
use crate::query::bm25::Bm25Weight;
use crate::query::explanation::does_not_match;
use crate::query::union::{BitSetPostingUnion, SimpleUnion};
use crate::query::{AutomatonWeight, BitSetDocSet, EmptyScorer, Explanation, Scorer, Weight};
use crate::schema::{Field, IndexRecordOption};
use crate::{DocId, DocSet, InvertedIndexReader, Score};
type UnionType = SimpleUnion<Box<dyn Postings + 'static>>;
/// The `RegexPhraseWeight` is the weight associated to a regex phrase query.
/// See RegexPhraseWeight::get_union_from_term_infos for some design decisions.
pub struct RegexPhraseWeight {
field: Field,
phrase_terms: Vec<(usize, String)>,
similarity_weight_opt: Option<Bm25Weight>,
slop: u32,
max_expansions: u32,
}
impl RegexPhraseWeight {
/// Creates a new phrase weight.
/// If `similarity_weight_opt` is None, then scoring is disabled
pub fn new(
field: Field,
phrase_terms: Vec<(usize, String)>,
similarity_weight_opt: Option<Bm25Weight>,
max_expansions: u32,
slop: u32,
) -> RegexPhraseWeight {
RegexPhraseWeight {
field,
phrase_terms,
similarity_weight_opt,
slop,
max_expansions,
}
}
fn fieldnorm_reader(&self, reader: &SegmentReader) -> crate::Result<FieldNormReader> {
if self.similarity_weight_opt.is_some() {
if let Some(fieldnorm_reader) = reader.fieldnorms_readers().get_field(self.field)? {
return Ok(fieldnorm_reader);
}
}
Ok(FieldNormReader::constant(reader.max_doc(), 1))
}
pub(crate) fn phrase_scorer(
&self,
reader: &SegmentReader,
boost: Score,
) -> crate::Result<Option<PhraseScorer<UnionType>>> {
let similarity_weight_opt = self
.similarity_weight_opt
.as_ref()
.map(|similarity_weight| similarity_weight.boost_by(boost));
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let mut posting_lists = Vec::new();
let inverted_index = reader.inverted_index(self.field)?;
let mut num_terms = 0;
for &(offset, ref term) in &self.phrase_terms {
let regex = Regex::new(term)
.map_err(|e| crate::TantivyError::InvalidArgument(format!("Invalid regex: {e}")))?;
let automaton: AutomatonWeight<Regex> =
AutomatonWeight::new(self.field, Arc::new(regex));
let term_infos = automaton.get_match_term_infos(reader)?;
// If term_infos is empty, the phrase can not match any documents.
if term_infos.is_empty() {
return Ok(None);
}
num_terms += term_infos.len();
if num_terms > self.max_expansions as usize {
return Err(crate::TantivyError::InvalidArgument(format!(
"Phrase query exceeded max expansions {}",
num_terms
)));
}
let union = Self::get_union_from_term_infos(&term_infos, reader, &inverted_index)?;
posting_lists.push((offset, union));
}
Ok(Some(PhraseScorer::new(
posting_lists,
similarity_weight_opt,
fieldnorm_reader,
self.slop,
)))
}
/// Add all docs of the term to the docset
fn add_to_bitset(
inverted_index: &InvertedIndexReader,
term_info: &TermInfo,
doc_bitset: &mut BitSet,
) -> crate::Result<()> {
let mut block_segment_postings = inverted_index
.read_block_postings_from_terminfo(term_info, IndexRecordOption::Basic)?;
loop {
let docs = block_segment_postings.docs();
if docs.is_empty() {
break;
}
for &doc in docs {
doc_bitset.insert(doc);
}
block_segment_postings.advance();
}
Ok(())
}
/// This function generates a union of document sets from multiple term information
/// (`TermInfo`).
///
/// It uses bucketing based on term frequency to optimize query performance and memory usage.
/// The terms are divided into buckets based on their document frequency (the number of
/// documents they appear in).
///
/// ### Bucketing Strategy:
/// Once a bucket contains more than 512 terms, it is moved to the end of the list and replaced
/// with a new empty bucket.
///
/// - **Sparse Term Buckets**: Terms with document frequency `< 100`.
///
/// Each sparse bucket contains:
/// - A `BitSet` to efficiently track which document IDs are present in the bucket, which is
/// used to drive the `DocSet`.
/// - A `Vec<LoadedPostings>` to store the postings for each term in that bucket.
///
/// - **Other Term Buckets**:
/// - **Bucket 0**: Terms appearing in less than `0.1%` of documents.
/// - **Bucket 1**: Terms appearing in `0.1%` to `1%` of documents.
/// - **Bucket 2**: Terms appearing in `1%` to `10%` of documents.
/// - **Bucket 3**: Terms appearing in more than `10%` of documents.
///
/// Each bucket contains:
/// - A `BitSet` to efficiently track which document IDs are present in the bucket.
/// - A `Vec<SegmentPostings>` to store the postings for each term in that bucket.
///
/// ### Design Choices:
/// The main cost for a _unbucketed_ regex phrase query with a medium/high amount of terms is
/// the `append_positions_with_offset` from `Postings`.
/// We don't know which docsets hit, so we need to scan all of them to check if they contain the
/// docid.
/// The bucketing strategy groups less common DocSets together, so we can rule out the
/// whole docset group in many cases.
///
/// E.g. consider the phrase "th* world"
/// It contains the term "the", which may occur in almost all documents.
/// It may also contain 10_000s very rare terms like "theologian".
///
/// For very low-frequency terms (sparse terms), we use `LoadedPostings` and aggregate
/// their document IDs into a `BitSet`, which is more memory-efficient than using
/// `SegmentPostings`. E.g. 100_000 terms with SegmentPostings would consume 184MB.
/// `SegmentPostings` uses memory equivalent to 460 docids. The 100 docs limit should be
/// fine as long as a term doesn't have too many positions per doc.
///
/// ### Future Optimization:
/// A larger performance improvement would be an additional partitioning of the space
/// vertically of u16::MAX blocks, where we mark which docset ord has values in each block.
/// E.g. partitioning in a index with 5 million documents this would reduce the number of
/// docsets to scan to around 1/20 in the sparse term bucket where the terms only have a few
/// docs. For higher cardinality buckets this is irrelevant as they are in most blocks.
///
/// Use Roaring Bitmaps for sparse terms. The full bitvec is main memory consumer currently.
pub(crate) fn get_union_from_term_infos(
term_infos: &[TermInfo],
reader: &SegmentReader,
inverted_index: &InvertedIndexReader,
) -> crate::Result<UnionType> {
let max_doc = reader.max_doc();
// Buckets for sparse terms
let mut sparse_buckets: Vec<(BitSet, Vec<LoadedPostings>)> =
vec![(BitSet::with_max_value(max_doc), Vec::new())];
// Buckets for other terms based on document frequency percentages:
// - Bucket 0: Terms appearing in less than 0.1% of documents
// - Bucket 1: Terms appearing in 0.1% to 1% of documents
// - Bucket 2: Terms appearing in 1% to 10% of documents
// - Bucket 3: Terms appearing in more than 10% of documents
let mut buckets: Vec<(BitSet, Vec<SegmentPostings>)> = (0..4)
.map(|_| (BitSet::with_max_value(max_doc), Vec::new()))
.collect();
const SPARSE_TERM_DOC_THRESHOLD: u32 = 100;
for term_info in term_infos {
let mut term_posting = inverted_index
.read_postings_from_terminfo(term_info, IndexRecordOption::WithFreqsAndPositions)?;
let num_docs = term_posting.doc_freq();
if num_docs < SPARSE_TERM_DOC_THRESHOLD {
let current_bucket = &mut sparse_buckets[0];
Self::add_to_bitset(inverted_index, term_info, &mut current_bucket.0)?;
let docset = LoadedPostings::load(&mut term_posting);
current_bucket.1.push(docset);
// Move the bucket to the end if the term limit is reached
if current_bucket.1.len() == 512 {
sparse_buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
let end_index = sparse_buckets.len() - 1;
sparse_buckets.swap(0, end_index);
}
} else {
// Calculate the percentage of documents the term appears in
let doc_freq_percentage = (num_docs as f32) / (max_doc as f32) * 100.0;
// Determine the appropriate bucket based on percentage thresholds
let bucket_index = if doc_freq_percentage < 0.1 {
0
} else if doc_freq_percentage < 1.0 {
1
} else if doc_freq_percentage < 10.0 {
2
} else {
3
};
let bucket = &mut buckets[bucket_index];
// Add term postings to the appropriate bucket
Self::add_to_bitset(inverted_index, term_info, &mut bucket.0)?;
bucket.1.push(term_posting);
// Move the bucket to the end if the term limit is reached
if bucket.1.len() == 512 {
buckets.push((BitSet::with_max_value(max_doc), Vec::new()));
let end_index = buckets.len() - 1;
buckets.swap(bucket_index, end_index);
}
}
}
// Build unions for sparse term buckets
let sparse_term_docsets: Vec<_> = sparse_buckets
.into_iter()
.filter(|(_, postings)| !postings.is_empty())
.map(|(bitset, postings)| {
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
})
.collect();
let sparse_term_unions = SimpleUnion::build(sparse_term_docsets);
// Build unions for other term buckets
let bitset_unions_per_bucket: Vec<_> = buckets
.into_iter()
.filter(|(_, postings)| !postings.is_empty())
.map(|(bitset, postings)| {
BitSetPostingUnion::build(postings, BitSetDocSet::from(bitset))
})
.collect();
let other_union = SimpleUnion::build(bitset_unions_per_bucket);
let union: SimpleUnion<Box<dyn Postings + 'static>> =
SimpleUnion::build(vec![Box::new(sparse_term_unions), Box::new(other_union)]);
// Return a union of sparse term unions and other term unions
Ok(union)
}
}
impl Weight for RegexPhraseWeight {
fn scorer(&self, reader: &SegmentReader, boost: Score) -> crate::Result<Box<dyn Scorer>> {
if let Some(scorer) = self.phrase_scorer(reader, boost)? {
Ok(Box::new(scorer))
} else {
Ok(Box::new(EmptyScorer))
}
}
fn explain(&self, reader: &SegmentReader, doc: DocId) -> crate::Result<Explanation> {
let scorer_opt = self.phrase_scorer(reader, 1.0)?;
if scorer_opt.is_none() {
return Err(does_not_match(doc));
}
let mut scorer = scorer_opt.unwrap();
if scorer.seek(doc) != doc {
return Err(does_not_match(doc));
}
let fieldnorm_reader = self.fieldnorm_reader(reader)?;
let fieldnorm_id = fieldnorm_reader.fieldnorm_id(doc);
let phrase_count = scorer.phrase_count();
let mut explanation = Explanation::new("Phrase Scorer", scorer.score());
if let Some(similarity_weight) = self.similarity_weight_opt.as_ref() {
explanation.add_detail(similarity_weight.explain(fieldnorm_id, phrase_count));
}
Ok(explanation)
}
}
#[cfg(test)]
mod tests {
use proptest::prelude::*;
use rand::seq::SliceRandom;
use super::super::tests::create_index;
use crate::docset::TERMINATED;
use crate::query::{wildcard_query_to_regex_str, EnableScoring, RegexPhraseQuery};
use crate::DocSet;
proptest! {
#![proptest_config(ProptestConfig::with_cases(50))]
#[test]
fn test_phrase_regex_with_random_strings(mut random_strings in proptest::collection::vec("[c-z ]{0,10}", 1..100), num_occurrences in 1..150_usize) {
let mut rng = rand::thread_rng();
// Insert "aaa ccc" the specified number of times into the list
for _ in 0..num_occurrences {
random_strings.push("aaa ccc".to_string());
}
// Shuffle the list, which now contains random strings and the inserted "aaa ccc"
random_strings.shuffle(&mut rng);
// Compute the positions of "aaa ccc" after the shuffle
let aaa_ccc_positions: Vec<usize> = random_strings
.iter()
.enumerate()
.filter_map(|(idx, s)| if s == "aaa ccc" { Some(idx) } else { None })
.collect();
// Create the index with random strings and the fixed string "aaa ccc"
let index = create_index(&random_strings.iter().map(AsRef::as_ref).collect::<Vec<&str>>())?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let phrase_query = RegexPhraseQuery::new(text_field, vec![wildcard_query_to_regex_str("a*"), wildcard_query_to_regex_str("c*")]);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
// Check if the scorer returns the correct document positions for "aaa ccc"
for expected_doc in aaa_ccc_positions {
prop_assert_eq!(phrase_scorer.doc(), expected_doc as u32);
prop_assert_eq!(phrase_scorer.phrase_count(), 1);
phrase_scorer.advance();
}
prop_assert_eq!(phrase_scorer.advance(), TERMINATED);
}
}
#[test]
pub fn test_phrase_count() -> crate::Result<()> {
let index = create_index(&["a c", "a a b d a b c", " a b"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a".into(), "b".into()]);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_wildcard() -> crate::Result<()> {
let index = create_index(&["a c", "a aa b d ad b c", " ac b", "bac b"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "b".into()]);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_regex() -> crate::Result<()> {
let index = create_index(&["ba b", "a aa b d ad b c", "bac b"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let phrase_query = RegexPhraseQuery::new(text_field, vec!["b?a.*".into(), "b".into()]);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 0);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), 1);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), 2);
assert_eq!(phrase_scorer.doc(), 2);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_regex_with_slop() -> crate::Result<()> {
let index = create_index(&["aaa bbb ccc ___ abc ddd bbb ccc"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let mut phrase_query = RegexPhraseQuery::new(text_field, vec!["a.*".into(), "c.*".into()]);
phrase_query.set_slop(1);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 0);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
phrase_query.set_slop(2);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 0);
assert_eq!(phrase_scorer.phrase_count(), 2);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
#[test]
pub fn test_phrase_regex_double_wildcard() -> crate::Result<()> {
let index = create_index(&["baaab bccccb"])?;
let schema = index.schema();
let text_field = schema.get_field("text").unwrap();
let searcher = index.reader()?.searcher();
let phrase_query = RegexPhraseQuery::new(
text_field,
vec![
wildcard_query_to_regex_str("*a*"),
wildcard_query_to_regex_str("*c*"),
],
);
let enable_scoring = EnableScoring::enabled_from_searcher(&searcher);
let phrase_weight = phrase_query.regex_phrase_weight(enable_scoring).unwrap();
let mut phrase_scorer = phrase_weight
.phrase_scorer(searcher.segment_reader(0u32), 1.0)?
.unwrap();
assert_eq!(phrase_scorer.doc(), 0);
assert_eq!(phrase_scorer.phrase_count(), 1);
assert_eq!(phrase_scorer.advance(), TERMINATED);
Ok(())
}
}

View File

@@ -402,7 +402,7 @@ fn search_on_u64_ff(
boost: Score,
bounds: BoundsRange<u64>,
) -> crate::Result<Box<dyn Scorer>> {
#[allow(clippy::reversed_empty_ranges)]
#[expect(clippy::reversed_empty_ranges)]
let value_range = bound_to_value_range(
&bounds.lower_bound,
&bounds.upper_bound,
@@ -1386,7 +1386,7 @@ mod tests {
}
#[cfg(test)]
pub mod ip_range_tests {
pub(crate) mod ip_range_tests {
use proptest::prelude::ProptestConfig;
use proptest::strategy::Strategy;
use proptest::{prop_oneof, proptest};

Some files were not shown because too many files have changed in this diff Show More