Mirror of https://github.com/quickwit-oss/tantivy.git (synced 2026-01-02 23:32:54 +00:00)

Compare commits: agg_format ... dependabot (35 commits)
| Author | SHA1 | Date |
|---|---|---|
| | ec5748d795 | |
| | c71ea7b2ef | |
| | c35a782747 | |
| | c66af2c0a9 | |
| | f9ac055847 | |
| | 21d057059e | |
| | dca508b4ca | |
| | aebae9965d | |
| | e7e3e3f44c | |
| | 2f2db16ec1 | |
| | d152e29687 | |
| | 285bcc25c9 | |
| | 7b65ad922d | |
| | 99be20cedd | |
| | 5f026901b8 | |
| | 6dfa2df06f | |
| | c17e513377 | |
| | 2f5a269c70 | |
| | 50532260e3 | |
| | 8bd6eb06e6 | |
| | 55b0b52457 | |
| | 56fc56c5b9 | |
| | 85395d942a | |
| | a206c3ccd3 | |
| | dc5d31c116 | |
| | 95a4ddea3e | |
| | ab5125d3dc | |
| | 9f81d59ecd | |
| | c71ec8086d | |
| | 27be6aed91 | |
| | 3d1c4b313a | |
| | 0d4e319965 | |
| | 75dc3eb298 | |
| | 3f6d225086 | |
| | d8843c608c | |
.github/workflows/coverage.yml (2 changes, vendored)

@@ -21,7 +21,7 @@ jobs:
      - name: Generate code coverage
        run: cargo +nightly-2024-07-01 llvm-cov --all-features --workspace --doctests --lcov --output-path lcov.info
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v3
        uses: codecov/codecov-action@v5
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }} # not required for public repos

@@ -46,7 +46,7 @@ The file of a segment has the format

```segment-id . ext```

The extension signals which data structure (or [`SegmentComponent`](src/core/segment_component.rs)) is stored in the file.
The extension signals which data structure (or [`SegmentComponent`](src/index/segment_component.rs)) is stored in the file.

A small `meta.json` file is in charge of keeping track of the list of segments, as well as the schema.

@@ -102,7 +102,7 @@ but users can extend tantivy with their own implementation.

Tantivy's document follows a very strict schema, decided before building any index.

The schema defines all of the fields that the indexes [`Document`](src/schema/document.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.
The schema defines all of the fields that the indexes [`Document`](src/schema/document/mod.rs) may and should contain, their types (`text`, `i64`, `u64`, `Date`, ...) as well as how it should be indexed / represented in tantivy.

Depending on the type of the field, you can decide to
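To illustrate the strict-schema paragraph in the hunk above, a minimal schema declaration with tantivy's builder API could look like the sketch below; the field names are made up for illustration and are not taken from the diff.

```rust
use tantivy::schema::{Schema, FAST, STORED, TEXT};

fn main() {
    // Every field, its type, and how it is indexed/stored is declared up front,
    // before any document is added to the index.
    let mut schema_builder = Schema::builder();
    schema_builder.add_text_field("title", TEXT | STORED); // tokenized and retrievable
    schema_builder.add_u64_field("likes", FAST);           // columnar fast field
    let _schema = schema_builder.build();
}
```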
CHANGELOG.md (84 changes)

@@ -1,3 +1,69 @@
Tantivy 0.23 - Unreleased
================================
Tantivy 0.23 will be backwards compatible with indices created with v0.22 and v0.21.

#### Bugfixes
- fix potential endless loop in merge [#2457](https://github.com/quickwit-oss/tantivy/pull/2457)(@PSeitz)
- fix bug that causes out-of-order sstable key. [#2445](https://github.com/quickwit-oss/tantivy/pull/2445)(@fulmicoton)
- fix ReferenceValue API flaw [#2372](https://github.com/quickwit-oss/tantivy/pull/2372)(@PSeitz)

#### Breaking API Changes
- remove index sorting [#2434](https://github.com/quickwit-oss/tantivy/pull/2434)(@PSeitz)

#### Features/Improvements
- **Aggregation**
  - Support for cardinality aggregation [#2337](https://github.com/quickwit-oss/tantivy/pull/2337) [#2446](https://github.com/quickwit-oss/tantivy/pull/2446) (@raphaelcoeffic @PSeitz)
  - Support for extended stats aggregation [#2247](https://github.com/quickwit-oss/tantivy/pull/2247)(@giovannicuccu)
  - Add Key::I64 and Key::U64 variants in aggregation to avoid f64 precision issues [#2468](https://github.com/quickwit-oss/tantivy/pull/2468)(@PSeitz)
  - Faster term aggregation fetch terms [#2447](https://github.com/quickwit-oss/tantivy/pull/2447)(@PSeitz)
  - Improve custom order deserialization [#2451](https://github.com/quickwit-oss/tantivy/pull/2451)(@PSeitz)
  - Change AggregationLimits behavior [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
  - lower contention on AggregationLimits [#2394](https://github.com/quickwit-oss/tantivy/pull/2394)(@PSeitz)
  - fix postcard compatibility for top_hits, add postcard test [#2346](https://github.com/quickwit-oss/tantivy/pull/2346)(@PSeitz)
  - reduce top hits memory consumption [#2426](https://github.com/quickwit-oss/tantivy/pull/2426)(@PSeitz)
  - check unsupported parameters top_hits [#2351](https://github.com/quickwit-oss/tantivy/pull/2351)(@PSeitz)
  - Change AggregationLimits to AggregationLimitsGuard [#2495](https://github.com/quickwit-oss/tantivy/pull/2495)(@PSeitz)
- **Range Queries**
  - Support fast field range queries on json fields [#2456](https://github.com/quickwit-oss/tantivy/pull/2456)(@PSeitz)
  - Add support for str fast field range query [#2460](https://github.com/quickwit-oss/tantivy/pull/2460) [#2452](https://github.com/quickwit-oss/tantivy/pull/2452) [#2453](https://github.com/quickwit-oss/tantivy/pull/2453)(@PSeitz)
  - modify fastfield range query heuristic [#2375](https://github.com/quickwit-oss/tantivy/pull/2375)(@trinity-1686a)
  - add FastFieldRangeQuery for explicit range queries on fast field (for `RangeQuery` it is autodetected) [#2477](https://github.com/quickwit-oss/tantivy/pull/2477)(@PSeitz)

- add format backwards-compatibility tests [#2485](https://github.com/quickwit-oss/tantivy/pull/2485)(@PSeitz)
- add columnar format compatibility tests [#2433](https://github.com/quickwit-oss/tantivy/pull/2433)(@PSeitz)
- Improved snippet ranges algorithm [#2474](https://github.com/quickwit-oss/tantivy/pull/2474)(@gezihuzi)
- make find_field_with_default return json fields without path [#2476](https://github.com/quickwit-oss/tantivy/pull/2476)(@trinity-1686a)
- feat(query): Make `BooleanQuery` support `minimum_number_should_match` [#2405](https://github.com/quickwit-oss/tantivy/pull/2405)(@LebranceBW)

- **Optional Index in Multivalue Columnar Index** For mostly empty multivalued indices there was a large overhead during creation when iterating all docids (merge case). This is alleviated by placing an optional index in the multivalued index to mark documents that have values. This will slightly increase space and access time. [#2439](https://github.com/quickwit-oss/tantivy/pull/2439)(@PSeitz)

- **Performace/Memory**
  - lift clauses in LogicalAst for optimized ast during execution [#2449](https://github.com/quickwit-oss/tantivy/pull/2449)(@PSeitz)
  - Use Vec instead of BTreeMap to back OwnedValue object [#2364](https://github.com/quickwit-oss/tantivy/pull/2364)(@fulmicoton)
  - Replace TantivyDocument with CompactDoc. CompactDoc is much smaller and provides similar performance. [#2402](https://github.com/quickwit-oss/tantivy/pull/2402)(@PSeitz)
  - Recycling buffer in PrefixPhraseScorer [#2443](https://github.com/quickwit-oss/tantivy/pull/2443)(@fulmicoton)

- **Json Type**
  - JSON supports now all values on the root level. Previously an object was required. This enables support for flat mixed types. allow more JSON values, fix i64 special case [#2383](https://github.com/quickwit-oss/tantivy/pull/2383)(@PSeitz)
  - add json path constructor to term [#2367](https://github.com/quickwit-oss/tantivy/pull/2367)(@PSeitz)

- **QueryParser**
  - fix de-escaping too much in query parser [#2427](https://github.com/quickwit-oss/tantivy/pull/2427)(@trinity-1686a)
  - improve query parser [#2416](https://github.com/quickwit-oss/tantivy/pull/2416)(@trinity-1686a)
  - Support field grouping `title:(return AND "pink panther")` [#2333](https://github.com/quickwit-oss/tantivy/pull/2333)(@trinity-1686a)

- add access benchmark for columnar [#2432](https://github.com/quickwit-oss/tantivy/pull/2432)(@PSeitz)
- extend indexwriter proptests [#2342](https://github.com/quickwit-oss/tantivy/pull/2342)(@PSeitz)
- add bench & test for columnar merging [#2428](https://github.com/quickwit-oss/tantivy/pull/2428)(@PSeitz)
- Change in Executor API [#2391](https://github.com/quickwit-oss/tantivy/pull/2391)(@fulmicoton)
- Removed usage of num_cpus [#2387](https://github.com/quickwit-oss/tantivy/pull/2387)(@fulmicoton)
- use bingang for agg benchmark [#2378](https://github.com/quickwit-oss/tantivy/pull/2378)(@PSeitz)
- cleanup top level exports [#2382](https://github.com/quickwit-oss/tantivy/pull/2382)(@PSeitz)
- make convert_to_fast_value_and_append_to_json_term pub [#2370](https://github.com/quickwit-oss/tantivy/pull/2370)(@PSeitz)
- remove JsonTermWriter [#2238](https://github.com/quickwit-oss/tantivy/pull/2238)(@PSeitz)
- validate sort by field type [#2336](https://github.com/quickwit-oss/tantivy/pull/2336)(@PSeitz)
- Fix trait bound of StoreReader::iter [#2360](https://github.com/quickwit-oss/tantivy/pull/2360)(@adamreichold)
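As a usage note on the cardinality aggregation listed above ([#2337]): tantivy's aggregations are deserialized from Elasticsearch-style JSON requests, so a request could be sketched as follows. The field name and the exact request shape here are illustrative assumptions, not taken from the diff.

```rust
use serde_json::json;
use tantivy::aggregation::agg_req::Aggregations;

fn main() -> serde_json::Result<()> {
    // Hypothetical request: approximate the number of distinct values in the
    // "author" fast field, using the Elasticsearch-compatible syntax the
    // aggregation module already uses for other aggregations.
    let _agg_req: Aggregations = serde_json::from_value(json!({
        "unique_authors": {
            "cardinality": { "field": "author" }
        }
    }))?;
    Ok(())
}
```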
Tantivy 0.22
================================

@@ -8,7 +74,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Fix bug that can cause `get_docids_for_value_range` to panic. [#2295](https://github.com/quickwit-oss/tantivy/pull/2295)(@fulmicoton)
- Avoid 1 document indices by increase min memory to 15MB for indexing [#2176](https://github.com/quickwit-oss/tantivy/pull/2176)(@PSeitz)
- Fix merge panic for JSON fields [#2284](https://github.com/quickwit-oss/tantivy/pull/2284)(@PSeitz)
- Fix bug occuring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix bug occurring when merging JSON object indexed with positions. [#2253](https://github.com/quickwit-oss/tantivy/pull/2253)(@fulmicoton)
- Fix empty DateHistogram gap bug [#2183](https://github.com/quickwit-oss/tantivy/pull/2183)(@PSeitz)
- Fix range query end check (fields with less than 1 value per doc are affected) [#2226](https://github.com/quickwit-oss/tantivy/pull/2226)(@PSeitz)
- Handle exclusive out of bounds ranges on fastfield range queries [#2174](https://github.com/quickwit-oss/tantivy/pull/2174)(@PSeitz)

@@ -26,7 +92,7 @@ Tantivy 0.22 will be able to read indices created with Tantivy 0.21.
- Support to deserialize f64 from string [#2311](https://github.com/quickwit-oss/tantivy/pull/2311)(@PSeitz)
- Add a top_hits aggregator [#2198](https://github.com/quickwit-oss/tantivy/pull/2198)(@ditsuke)
- Support bool type in term aggregation [#2318](https://github.com/quickwit-oss/tantivy/pull/2318)(@PSeitz)
- Support ip adresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support ip addresses in term aggregation [#2319](https://github.com/quickwit-oss/tantivy/pull/2319)(@PSeitz)
- Support date type in term aggregation [#2172](https://github.com/quickwit-oss/tantivy/pull/2172)(@PSeitz)
- Support escaped dot when addressing field [#2250](https://github.com/quickwit-oss/tantivy/pull/2250)(@PSeitz)

@@ -116,7 +182,7 @@ Tantivy 0.20
- Add PhrasePrefixQuery [#1842](https://github.com/quickwit-oss/tantivy/issues/1842) (@trinity-1686a)
- Add `coerce` option for text and numbers types (convert the value instead of returning an error during indexing) [#1904](https://github.com/quickwit-oss/tantivy/issues/1904) (@PSeitz)
- Add regex tokenizer [#1759](https://github.com/quickwit-oss/tantivy/issues/1759)(@mkleen)
- Move tokenizer API to seperate crate. Having a seperate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- Move tokenizer API to separate crate. Having a separate crate with a stable API will allow us to use tokenizers with different tantivy versions. [#1767](https://github.com/quickwit-oss/tantivy/issues/1767) (@PSeitz)
- **Columnar crate**: New fast field handling (@fulmicoton @PSeitz) [#1806](https://github.com/quickwit-oss/tantivy/issues/1806)[#1809](https://github.com/quickwit-oss/tantivy/issues/1809)
  - Support for fast fields with optional values. Previously tantivy supported only single-valued and multi-value fast fields. The encoding of optional fast fields is now very compact.
  - Fast field Support for JSON (schemaless fast fields). Support multiple types on the same column. [#1876](https://github.com/quickwit-oss/tantivy/issues/1876) (@fulmicoton)

@@ -163,13 +229,13 @@ Tantivy 0.20
- Auto downgrade index record option, instead of vint error [#1857](https://github.com/quickwit-oss/tantivy/issues/1857) (@PSeitz)
- Enable range query on fast field for u64 compatible types [#1762](https://github.com/quickwit-oss/tantivy/issues/1762) (@PSeitz) [#1876]
- sstable
  - Isolating sstable and stacker in independant crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
  - Isolating sstable and stacker in independent crates. [#1718](https://github.com/quickwit-oss/tantivy/issues/1718) (@fulmicoton)
  - New sstable format [#1943](https://github.com/quickwit-oss/tantivy/issues/1943)[#1953](https://github.com/quickwit-oss/tantivy/issues/1953) (@trinity-1686a)
  - Use DeltaReader directly to implement Dictionnary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
  - Use DeltaReader directly to implement Dictionnary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
  - Add seperate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
  - Use DeltaReader directly to implement Dictionary::ord_to_term [#1928](https://github.com/quickwit-oss/tantivy/issues/1928) (@trinity-1686a)
  - Use DeltaReader directly to implement Dictionary::term_ord [#1925](https://github.com/quickwit-oss/tantivy/issues/1925) (@trinity-1686a)
  - Add separate tokenizer manager for fast fields [#2019](https://github.com/quickwit-oss/tantivy/issues/2019) (@PSeitz)
- Make construction of LevenshteinAutomatonBuilder for FuzzyTermQuery instances lazy. [#1756](https://github.com/quickwit-oss/tantivy/issues/1756) (@adamreichold)
- Added support for madvise when opening an mmaped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Added support for madvise when opening an mmapped Index [#2036](https://github.com/quickwit-oss/tantivy/issues/2036) (@fulmicoton)
- Rename `DatePrecision` to `DateTimePrecision` [#2051](https://github.com/quickwit-oss/tantivy/issues/2051) (@guilload)
- Query Parser
  - Quotation mark can now be used for phrase queries. [#2050](https://github.com/quickwit-oss/tantivy/issues/2050) (@fulmicoton)

@@ -208,7 +274,7 @@ Tantivy 0.19
- Add support for phrase slop in query language [#1393](https://github.com/quickwit-oss/tantivy/pull/1393) (@saroh)
- Aggregation
  - Add aggregation support for date type [#1693](https://github.com/quickwit-oss/tantivy/pull/1693)(@PSeitz)
  - Add support for keyed parameter in range and histgram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
  - Add support for keyed parameter in range and histogram aggregations [#1424](https://github.com/quickwit-oss/tantivy/pull/1424) (@k-yomo)
  - Add aggregation bucket limit [#1363](https://github.com/quickwit-oss/tantivy/pull/1363) (@PSeitz)
- Faster indexing
  - [#1610](https://github.com/quickwit-oss/tantivy/pull/1610) (@PSeitz)
CITATION.cff (10 changes, new file)

@@ -0,0 +1,10 @@
cff-version: 1.2.0
message: "If you use this software, please cite it as below."
authors:
- alias: Quickwit Inc.
  website: "https://quickwit.io"
title: "tantivy"
version: 0.22.0
doi: 10.5281/zenodo.13942948
date-released: 2024-10-17
url: "https://github.com/quickwit-oss/tantivy"
Cargo.toml (12 changes)

@@ -11,7 +11,7 @@ repository = "https://github.com/quickwit-oss/tantivy"
readme = "README.md"
keywords = ["search", "information", "retrieval"]
edition = "2021"
rust-version = "1.66"
rust-version = "1.75"
exclude = ["benches/*.json", "benches/*.txt"]

[dependencies]

@@ -29,7 +29,7 @@ tantivy-fst = "0.5"
memmap2 = { version = "0.9.0", optional = true }
lz4_flex = { version = "0.11", default-features = false, optional = true }
zstd = { version = "0.13", optional = true, default-features = false }
tempfile = { version = "3.3.0", optional = true }
tempfile = { version = "3.12.0", optional = true }
log = "0.4.16"
serde = { version = "1.0.136", features = ["derive"] }
serde_json = "1.0.79"

@@ -43,11 +43,11 @@ bitpacking = { version = "0.9.2", default-features = false, features = [
"bitpacker4x",
] }
census = "0.4.2"
rustc-hash = "1.1.0"
thiserror = "1.0.30"
rustc-hash = "2.0.0"
thiserror = "2.0.1"
htmlescape = "0.3.1"
fail = { version = "0.5.0", optional = true }
time = { version = "0.3.10", features = ["serde-well-known"] }
time = { version = "0.3.35", features = ["serde-well-known"] }
smallvec = "1.8.0"
rayon = "1.5.2"
lru = "0.12.0"

@@ -72,7 +72,7 @@ fnv = "1.0.7"
winapi = "0.3.9"

[dev-dependencies]
binggan = "0.8.0"
binggan = "0.14.0"
rand = "0.8.5"
maplit = "1.0.2"
matches = "0.1.9"
TODO.txt (2 changes)

@@ -1,7 +1,7 @@
Make schema_builder API fluent.
fix doc serialization and prevent compression problems

u64 , etc. shoudl return Resutl<Option> now that we support optional missing a column is really not an error
u64 , etc. should return Result<Option> now that we support optional missing a column is really not an error
remove fastfield codecs
ditch the first_or_default trick. if it is still useful, improve its implementation.
rename FastFieldReaders::open to load
@@ -1,3 +1,4 @@
use binggan::plugins::PeakMemAllocPlugin;
use binggan::{black_box, InputGroup, PeakMemAlloc, INSTRUMENTED_SYSTEM};
use rand::prelude::SliceRandom;
use rand::rngs::StdRng;

@@ -17,7 +18,9 @@ pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
/// runner.register("average_u64", move |index| average_u64(index));
macro_rules! register {
    ($runner:expr, $func:ident) => {
        $runner.register(stringify!($func), move |index| $func(index))
        $runner.register(stringify!($func), move |index| {
            $func(index);
        })
    };
}

@@ -42,7 +45,8 @@ fn main() {
}

fn bench_agg(mut group: InputGroup<Index>) {
    group.set_alloc(GLOBAL); // Set the peak mem allocator. This will enable peak memory reporting.
    group.add_plugin(PeakMemAllocPlugin::new(GLOBAL));

    register!(group, average_u64);
    register!(group, average_f64);
    register!(group, average_f64_u64);

@@ -368,9 +368,9 @@ mod test {
        for start_idx in 0u32..32u32 {
            output.resize(len, 0);
            bitunpacker.get_batch_u32s(start_idx, &buffer, &mut output);
            for i in 0..len {
            for (i, output_byte) in output.iter().enumerate() {
                let expected = (start_idx + i as u32) & mask;
                assert_eq!(output[i], expected);
                assert_eq!(*output_byte, expected);
            }
        }
    }

@@ -35,8 +35,8 @@ const IMPLS: [FilterImplPerInstructionSet; 2] = [
const IMPLS: [FilterImplPerInstructionSet; 1] = [FilterImplPerInstructionSet::Scalar];

impl FilterImplPerInstructionSet {
    #[allow(unused_variables)]
    #[inline]
    #[allow(unused_variables)] // on non-x86_64, code is unused.
    fn from(code: u8) -> FilterImplPerInstructionSet {
        #[cfg(target_arch = "x86_64")]
        if code == FilterImplPerInstructionSet::AVX2 as u8 {

@@ -23,7 +23,7 @@ downcast-rs = "1.2.0"
proptest = "1"
more-asserts = "0.3.1"
rand = "0.8"
binggan = "0.8.1"
binggan = "0.14.0"

[[bench]]
name = "bench_merge"

@@ -31,7 +31,7 @@ restriction on 50% of the values (e.g. a 64-bit hash). On the other hand, a lot
# Columnar format

This columnar format may have more than one column (with different types) associated to the same `column_name` (see [Coercion rules](#coercion-rules) above).
The `(column_name, columne_type)` couple however uniquely identifies a column.
The `(column_name, column_type)` couple however uniquely identifies a column.
That couple is serialized as a column `column_key`. The format of that key is:
`[column_name][ZERO_BYTE][column_type_header: u8]`
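A small sketch of the `column_key` layout described in that hunk; the numeric type-header value used below is made up purely for illustration.

```rust
// Builds the key described above: [column_name][ZERO_BYTE][column_type_header: u8]
fn column_key(column_name: &str, column_type_header: u8) -> Vec<u8> {
    let mut key = Vec::with_capacity(column_name.len() + 2);
    key.extend_from_slice(column_name.as_bytes());
    key.push(0u8); // ZERO_BYTE separates the column name from the type header
    key.push(column_type_header);
    key
}

fn main() {
    // "price" with a hypothetical type header of 3
    assert_eq!(column_key("price", 3), vec![b'p', b'r', b'i', b'c', b'e', 0, 3]);
}
```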
@@ -1,6 +1,6 @@
pub mod common;

use binggan::{black_box, BenchRunner};
use binggan::BenchRunner;
use common::{generate_columnar_with_name, Card};
use tantivy_columnar::*;

@@ -29,7 +29,7 @@ fn main() {
    add_combo(Card::Multi, Card::Dense);
    add_combo(Card::Multi, Card::Sparse);

    let runner: BenchRunner = BenchRunner::new();
    let mut runner: BenchRunner = BenchRunner::new();
    let mut group = runner.new_group();
    for (input_name, columnar_readers) in inputs.iter() {
        group.register_with_input(

@@ -41,7 +41,7 @@ fn main() {
            let merge_row_order = StackMergeOrder::stack(&columnar_readers[..]);

            merge_columnar(&columnar_readers, &[], merge_row_order.into(), &mut out).unwrap();
            black_box(out);
            Some(out.len() as u64)
        },
    );
}

@@ -10,7 +10,7 @@

# Perf and Size
* remove alloc in `ord_to_term`
+ multivaued range queries restrat frm the beginning all of the time.
+ multivaued range queries restart from the beginning all of the time.
* re-add ZSTD compression for dictionaries
no systematic monotonic mapping
consider removing multilinear

@@ -30,7 +30,7 @@ investigate if should have better errors? io::Error is overused at the moment.
rename rank/select in unit tests
Review the public API via cargo doc
go through TODOs
remove all doc_id occurences -> row_id
remove all doc_id occurrences -> row_id
use the rank & select naming in unit tests branch.
multi-linear -> blockwise
linear codec -> simply a multiplication for the index column

@@ -43,5 +43,5 @@ isolate u128_based and uniform naming
# Other
fix enhance column-cli

# Santa claus
# Santa Claus
autodetect datetime ipaddr, plug customizable tokenizer.

@@ -66,7 +66,7 @@ impl<T: PartialOrd + Copy + std::fmt::Debug + Send + Sync + 'static + Default>
        &'a self,
        docs: &'a [u32],
        accessor: &Column<T>,
    ) -> impl Iterator<Item = (DocId, T)> + '_ {
    ) -> impl Iterator<Item = (DocId, T)> + 'a {
        if accessor.index.get_cardinality().is_full() {
            docs.iter().cloned().zip(self.val_cache.iter().cloned())
        } else {

@@ -173,7 +173,7 @@ mod tests {
        .into();
        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
            panic!("Excpected a multivalued index")
            panic!("Expected a multivalued index")
        };
        let mut output = Vec::new();
        serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();

@@ -211,7 +211,7 @@ mod tests {

        let merged_column_index = merge_column_index(&column_indexes[..], &merge_row_order);
        let SerializableColumnIndex::Multivalued(start_index_iterable) = merged_column_index else {
            panic!("Excpected a multivalued index")
            panic!("Expected a multivalued index")
        };
        let mut output = Vec::new();
        serialize_multivalued_index(&start_index_iterable, &mut output).unwrap();

@@ -28,7 +28,7 @@ pub enum ColumnIndex {
    Full,
    Optional(OptionalIndex),
    /// In addition, at index num_rows, an extra value is added
    /// containing the overal number of values.
    /// containing the overall number of values.
    Multivalued(MultiValueIndex),
}

@@ -174,7 +174,9 @@ impl<'a> SelectCursor<RowId> for OptionalIndexSelectCursor<'a> {
}

impl Set<RowId> for OptionalIndex {
    type SelectCursor<'b> = OptionalIndexSelectCursor<'b> where Self: 'b;
    type SelectCursor<'b>
        = OptionalIndexSelectCursor<'b>
    where Self: 'b;
    // Check if value at position is not null.
    #[inline]
    fn contains(&self, row_id: RowId) -> bool {

@@ -123,7 +123,9 @@ impl<'a> SelectCursor<u16> for DenseBlockSelectCursor<'a> {
}

impl<'a> Set<u16> for DenseBlock<'a> {
    type SelectCursor<'b> = DenseBlockSelectCursor<'a> where Self: 'b;
    type SelectCursor<'b>
        = DenseBlockSelectCursor<'a>
    where Self: 'b;

    #[inline(always)]
    fn contains(&self, el: u16) -> bool {

@@ -32,7 +32,9 @@ impl<'a> SelectCursor<u16> for SparseBlock<'a> {
}

impl<'a> Set<u16> for SparseBlock<'a> {
    type SelectCursor<'b> = Self where Self: 'b;
    type SelectCursor<'b>
        = Self
    where Self: 'b;

    #[inline(always)]
    fn contains(&self, el: u16) -> bool {

@@ -80,7 +82,7 @@ impl<'a> SparseBlock<'a> {
    }

    #[inline]
    #[allow(clippy::comparison_chain)]
    #[expect(clippy::comparison_chain)]
    // Looks for the element in the block. Returns the positions if found.
    fn binary_search(&self, target: u16) -> Result<u16, u16> {
        let data = &self.0;

@@ -110,8 +110,8 @@ fn test_null_index(data: &[bool]) {
        .map(|(pos, _val)| pos as u32)
        .collect();
    let mut select_iter = null_index.select_cursor();
    for i in 0..orig_idx_with_value.len() {
        assert_eq!(select_iter.select(i as u32), orig_idx_with_value[i]);
    for (i, expected) in orig_idx_with_value.iter().enumerate() {
        assert_eq!(select_iter.select(i as u32), *expected);
    }

    let step_size = (orig_idx_with_value.len() / 100).max(1);

@@ -184,7 +184,7 @@ impl CompactSpaceBuilder {

        let mut covered_space = Vec::with_capacity(self.blanks.len());

        // begining of the blanks
        // beginning of the blanks
        if let Some(first_blank_start) = self.blanks.first().map(RangeInclusive::start) {
            if *first_blank_start != 0 {
                covered_space.push(0..=first_blank_start - 1);

@@ -128,7 +128,7 @@ pub fn open_u128_as_compact_u64(mut bytes: OwnedBytes) -> io::Result<Arc<dyn Col
}

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {
    use super::*;
    use crate::column_values::u64_based::{
        serialize_and_load_u64_based_column_values, serialize_u64_based_column_values,

@@ -122,12 +122,11 @@ impl Line {
        line
    }

    /// Returns a line that attemps to approximate a function
    /// Returns a line that attempts to approximate a function
    /// f: i in 0..[ys.num_vals()) -> ys[i].
    ///
    /// - The approximation is always lower than the actual value.
    /// Or more rigorously, formally `f(i).wrapping_sub(ys[i])` is small
    /// for any i in [0..ys.len()).
    /// - The approximation is always lower than the actual value. Or more rigorously, formally
    ///   `f(i).wrapping_sub(ys[i])` is small for any i in [0..ys.len()).
    /// - It computes without panicking for any value of it.
    ///
    /// This function is only invariable by translation if all of the

@@ -25,7 +25,7 @@ use crate::{
/// After merge, all columns belonging to the same category are coerced to
/// the same column type.
///
/// In practise, today, only Numerical colummns are coerced into one type today.
/// In practise, today, only Numerical columns are coerced into one type today.
///
/// See also [README.md].
///

@@ -63,11 +63,10 @@ impl From<ColumnType> for ColumnTypeCategory {
/// `require_columns` makes it possible to ensure that some columns will be present in the
/// resulting columnar. When a required column is a numerical column type, one of two things can
/// happen:
/// - If the required column type is compatible with all of the input columnar, the resulsting
/// merged
/// columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged
/// will fail with an InvalidData error.
/// - If the required column type is compatible with all of the input columnar, the resulting merged
///   columnar will simply coerce the input column and use the required column type.
/// - If the required column type is incompatible with one of the input columnar, the merged will
///   fail with an InvalidData error.
///
/// `merge_row_order` makes it possible to remove or reorder row in the resulting
/// `Columnar` table.

@@ -35,8 +35,7 @@ impl<'a> Ord for HeapItem<'a> {
///
/// The item yield is actually a pair with
/// - the term
/// - a slice with the ordinal of the segments containing
/// the terms.
/// - a slice with the ordinal of the segments containing the terms.
pub struct TermMerger<'a> {
    heap: BinaryHeap<HeapItem<'a>>,
    current_streamers: Vec<HeapItem<'a>>,

@@ -87,7 +87,7 @@ impl<V: SymbolValue> ColumnOperation<V> {
        minibuf
    }

    /// Deserialize a colummn operation.
    /// Deserialize a column operation.
    /// Returns None if the buffer is empty.
    ///
    /// Panics if the payload is invalid:

@@ -122,7 +122,6 @@ impl<T> From<T> for ColumnOperation<T> {
// In order to limit memory usage, and in order
// to benefit from the stacker, we do this by serialization our data
// as "Symbols".
#[allow(clippy::from_over_into)]
pub(super) trait SymbolValue: Clone + Copy {
    // Serializes the symbol into the given buffer.
    // Returns the number of bytes written into the buffer.

@@ -392,7 +392,7 @@ impl ColumnarWriter {

    // Serialize [Dictionary, Column, dictionary num bytes U32::LE]
    // Column: [Column Index, Column Values, column index num bytes U32::LE]
    #[allow(clippy::too_many_arguments)]
    #[expect(clippy::too_many_arguments)]
    fn serialize_bytes_or_str_column(
        cardinality: Cardinality,
        num_docs: RowId,

@@ -17,6 +17,31 @@ impl NumericalValue {
            NumericalValue::F64(_) => NumericalType::F64,
        }
    }

    /// Tries to normalize the numerical value in the following priorities:
    /// i64, i64, f64
    pub fn normalize(self) -> Self {
        match self {
            NumericalValue::U64(val) => {
                if val <= i64::MAX as u64 {
                    NumericalValue::I64(val as i64)
                } else {
                    NumericalValue::F64(val as f64)
                }
            }
            NumericalValue::I64(val) => NumericalValue::I64(val),
            NumericalValue::F64(val) => {
                let fract = val.fract();
                if fract == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 {
                    NumericalValue::I64(val as i64)
                } else if fract == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 {
                    NumericalValue::U64(val as u64)
                } else {
                    NumericalValue::F64(val)
                }
            }
        }
    }
}

impl From<u64> for NumericalValue {
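The `normalize` method added in the hunk above follows a fixed priority: prefer `i64`, then `u64`, then `f64`. A self-contained sketch of that rule, using a stand-in enum rather than the crate's own `NumericalValue`:

```rust
// Stand-in type mirroring the normalization priority shown in the diff above.
#[derive(Debug, PartialEq)]
enum Num {
    U64(u64),
    I64(i64),
    F64(f64),
}

fn normalize(v: Num) -> Num {
    match v {
        Num::U64(val) if val <= i64::MAX as u64 => Num::I64(val as i64),
        Num::U64(val) => Num::F64(val as f64),
        Num::I64(val) => Num::I64(val),
        Num::F64(val) if val.fract() == 0.0 && val >= i64::MIN as f64 && val <= i64::MAX as f64 => {
            Num::I64(val as i64)
        }
        Num::F64(val) if val.fract() == 0.0 && val >= u64::MIN as f64 && val <= u64::MAX as f64 => {
            Num::U64(val as u64)
        }
        Num::F64(val) => Num::F64(val),
    }
}

fn main() {
    assert_eq!(normalize(Num::U64(7)), Num::I64(7));              // fits in i64
    assert_eq!(normalize(Num::F64(3.0)), Num::I64(3));            // integral f64
    assert!(matches!(normalize(Num::U64(u64::MAX)), Num::F64(_))); // too large for i64
}
```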
@@ -19,7 +19,7 @@ time = { version = "0.3.10", features = ["serde-well-known"] }
serde = { version = "1.0.136", features = ["derive"] }

[dev-dependencies]
binggan = "0.8.1"
binggan = "0.14.0"
proptest = "1.0.0"
rand = "0.8.4"

common/src/bounds.rs (130 changes, new file)

@@ -0,0 +1,130 @@
use std::io;
use std::ops::Bound;

#[derive(Clone, Debug)]
pub struct BoundsRange<T> {
    pub lower_bound: Bound<T>,
    pub upper_bound: Bound<T>,
}
impl<T> BoundsRange<T> {
    pub fn new(lower_bound: Bound<T>, upper_bound: Bound<T>) -> Self {
        BoundsRange {
            lower_bound,
            upper_bound,
        }
    }
    pub fn is_unbounded(&self) -> bool {
        matches!(self.lower_bound, Bound::Unbounded) && matches!(self.upper_bound, Bound::Unbounded)
    }
    pub fn map_bound<TTo>(&self, transform: impl Fn(&T) -> TTo) -> BoundsRange<TTo> {
        BoundsRange {
            lower_bound: map_bound(&self.lower_bound, &transform),
            upper_bound: map_bound(&self.upper_bound, &transform),
        }
    }

    pub fn map_bound_res<TTo, Err>(
        &self,
        transform: impl Fn(&T) -> Result<TTo, Err>,
    ) -> Result<BoundsRange<TTo>, Err> {
        Ok(BoundsRange {
            lower_bound: map_bound_res(&self.lower_bound, &transform)?,
            upper_bound: map_bound_res(&self.upper_bound, &transform)?,
        })
    }

    pub fn transform_inner<TTo>(
        &self,
        transform_lower: impl Fn(&T) -> TransformBound<TTo>,
        transform_upper: impl Fn(&T) -> TransformBound<TTo>,
    ) -> BoundsRange<TTo> {
        BoundsRange {
            lower_bound: transform_bound_inner(&self.lower_bound, &transform_lower),
            upper_bound: transform_bound_inner(&self.upper_bound, &transform_upper),
        }
    }

    /// Returns the first set inner value
    pub fn get_inner(&self) -> Option<&T> {
        inner_bound(&self.lower_bound).or(inner_bound(&self.upper_bound))
    }
}

pub enum TransformBound<T> {
    /// Overwrite the bounds
    NewBound(Bound<T>),
    /// Use Existing bounds with new value
    Existing(T),
}

/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner_res<TFrom, TTo>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> io::Result<TransformBound<TTo>>,
) -> io::Result<Bound<TTo>> {
    use self::Bound::*;
    Ok(match bound {
        Excluded(ref from_val) => match transform(from_val)? {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Excluded(new_val),
        },
        Included(ref from_val) => match transform(from_val)? {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Included(new_val),
        },
        Unbounded => Unbounded,
    })
}

/// Takes a bound and transforms the inner value into a new bound via a closure.
/// The bound variant may change by the value returned value from the closure.
pub fn transform_bound_inner<TFrom, TTo>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> TransformBound<TTo>,
) -> Bound<TTo> {
    use self::Bound::*;
    match bound {
        Excluded(ref from_val) => match transform(from_val) {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Excluded(new_val),
        },
        Included(ref from_val) => match transform(from_val) {
            TransformBound::NewBound(new_val) => new_val,
            TransformBound::Existing(new_val) => Included(new_val),
        },
        Unbounded => Unbounded,
    }
}

/// Returns the inner value of a `Bound`
pub fn inner_bound<T>(val: &Bound<T>) -> Option<&T> {
    match val {
        Bound::Included(term) | Bound::Excluded(term) => Some(term),
        Bound::Unbounded => None,
    }
}

pub fn map_bound<TFrom, TTo>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> TTo,
) -> Bound<TTo> {
    use self::Bound::*;
    match bound {
        Excluded(ref from_val) => Bound::Excluded(transform(from_val)),
        Included(ref from_val) => Bound::Included(transform(from_val)),
        Unbounded => Unbounded,
    }
}

pub fn map_bound_res<TFrom, TTo, Err>(
    bound: &Bound<TFrom>,
    transform: impl Fn(&TFrom) -> Result<TTo, Err>,
) -> Result<Bound<TTo>, Err> {
    use self::Bound::*;
    Ok(match bound {
        Excluded(ref from_val) => Excluded(transform(from_val)?),
        Included(ref from_val) => Included(transform(from_val)?),
        Unbounded => Unbounded,
    })
}
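A minimal usage sketch for the new `BoundsRange` helper in that file. It assumes the module above is in scope; the exact import path is not shown in the diff.

```rust
use std::ops::Bound;
// Assumes something like `use common::bounds::BoundsRange;` per the new common/src/bounds.rs above.

fn demo() {
    // Half-open range 10 <= x < 20 over u64 values.
    let range = BoundsRange::new(Bound::Included(10u64), Bound::Excluded(20u64));
    assert!(!range.is_unbounded());

    // Map the endpoint values to another type; the bound variants are preserved.
    let as_strings = range.map_bound(|v| v.to_string());
    assert!(matches!(as_strings.lower_bound, Bound::Included(ref s) if s == "10"));

    // First concrete endpoint, regardless of which side it sits on.
    assert_eq!(range.get_inner(), Some(&10u64));
}
```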
@@ -5,6 +5,7 @@ use std::ops::Deref;
pub use byteorder::LittleEndian as Endianness;

mod bitset;
pub mod bounds;
mod byte_count;
mod datetime;
pub mod file_slice;

@@ -129,11 +130,11 @@ pub fn replace_in_place(needle: u8, replacement: u8, bytes: &mut [u8]) {
}

#[cfg(test)]
pub mod test {
pub(crate) mod test {

    use proptest::prelude::*;

    use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64, BinarySerializable, FixedSize};
    use super::{f64_to_u64, i64_to_u64, u64_to_f64, u64_to_i64};

    fn test_i64_converter_helper(val: i64) {
        assert_eq!(u64_to_i64(i64_to_u64(val)), val);

@@ -143,12 +144,6 @@ pub mod test {
        assert_eq!(u64_to_f64(f64_to_u64(val)), val);
    }

    pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
        let mut buffer = Vec::new();
        O::default().serialize(&mut buffer).unwrap();
        assert_eq!(buffer.len(), O::SIZE_IN_BYTES);
    }

    proptest! {
        #[test]
        fn test_f64_converter_monotonicity_proptest((left, right) in (proptest::num::f64::NORMAL, proptest::num::f64::NORMAL)) {

@@ -74,14 +74,14 @@ impl FixedSize for () {

impl<T: BinarySerializable> BinarySerializable for Vec<T> {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        VInt(self.len() as u64).serialize(writer)?;
        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
        for it in self {
            it.serialize(writer)?;
        }
        Ok(())
    }
    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
        let num_items = VInt::deserialize(reader)?.val();
        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
        let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
        for _ in 0..num_items {
            let item = T::deserialize(reader)?;

@@ -236,12 +236,12 @@ impl FixedSize for bool {
impl BinarySerializable for String {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        let data: &[u8] = self.as_bytes();
        VInt(data.len() as u64).serialize(writer)?;
        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
        writer.write_all(data)
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
        let string_length = VInt::deserialize(reader)?.val() as usize;
        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
        let mut result = String::with_capacity(string_length);
        reader
            .take(string_length as u64)

@@ -253,12 +253,12 @@ impl BinarySerializable for String {
impl<'a> BinarySerializable for Cow<'a, str> {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        let data: &[u8] = self.as_bytes();
        VInt(data.len() as u64).serialize(writer)?;
        BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
        writer.write_all(data)
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
        let string_length = VInt::deserialize(reader)?.val() as usize;
        let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
        let mut result = String::with_capacity(string_length);
        reader
            .take(string_length as u64)

@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {

impl<'a> BinarySerializable for Cow<'a, [u8]> {
    fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
        VInt(self.len() as u64).serialize(writer)?;
        BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
        for it in self.iter() {
            it.serialize(writer)?;
            BinarySerializable::serialize(it, writer)?;
        }
        Ok(())
    }

    fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
        let num_items = VInt::deserialize(reader)?.val();
        let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
        let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
        for _ in 0..num_items {
            let item = u8::deserialize(reader)?;
            let item = <u8 as BinarySerializable>::deserialize(reader)?;
            items.push(item);
        }
        Ok(Cow::Owned(items))
@@ -2,7 +2,7 @@

> Tantivy is a **search** engine **library** for Rust.

If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for rust. tantivy is heavily inspired by Lucene's design and
If you are familiar with Lucene, it's an excellent approximation to consider tantivy as Lucene for Rust. Tantivy is heavily inspired by Lucene's design and
they both have the same scope and targeted use cases.

If you are not familiar with Lucene, let's break down our little tagline.

@@ -17,7 +17,7 @@ relevancy, collapsing, highlighting, spatial search.
experience. But keep in mind this is just a toolbox.
Which bring us to the second keyword...

- **Library** means that you will have to write code. tantivy is not an *all-in-one* server solution like elastic search for instance.
- **Library** means that you will have to write code. Tantivy is not an *all-in-one* server solution like Elasticsearch for instance.

Sometimes a functionality will not be available in tantivy because it is too
specific to your use case. By design, tantivy should make it possible to extend

@@ -31,4 +31,4 @@ relevancy, collapsing, highlighting, spatial search.
index from a different format.

Tantivy exposes a lot of low level API to do all of these things.

@@ -11,7 +11,7 @@ directory shipped with tantivy is the `MmapDirectory`.
While this design has some downsides, this greatly simplifies the source code of
tantivy. Caching is also entirely delegated to the OS.

`tantivy` works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.
Tantivy works entirely (or almost) by directly reading the datastructures as they are laid on disk. As a result, the act of opening an indexing does not involve loading different datastructures from the disk into random access memory : starting a process, opening an index, and performing your first query can typically be done in a matter of milliseconds.

This is an interesting property for a command line search engine, or for some multi-tenant log search engine : spawning a new process for each new query can be a perfectly sensible solution in some use case.

@@ -31,13 +31,13 @@ Compression ratio is mainly affected on the fast field of the sorted property, e
When data is presorted by a field and search queries request sorting by the same field, we can leverage the natural order of the documents.
E.g. if the data is sorted by timestamp and want the top n newest docs containing a term, we can simply leveraging the order of the docids.

Note: Tantivy 0.16 does not do this optimization yet.
Note: tantivy 0.16 does not do this optimization yet.

### Pruning

Let's say we want all documents and want to apply the filter `>= 2010-08-11`. When the data is sorted, we could make a lookup in the fast field to find the docid range and use this as the filter.

Note: Tantivy 0.16 does not do this optimization yet.
Note: tantivy 0.16 does not do this optimization yet.

### Other?

@@ -45,7 +45,7 @@ In principle there are many algorithms possible that exploit the monotonically i

## Usage

The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of Tantivy 0.16 only fast fields are allowed to be used.
The index sorting can be configured setting [`sort_by_field`](https://github.com/quickwit-oss/tantivy/blob/000d76b11a139a84b16b9b95060a1c93e8b9851c/src/core/index_meta.rs#L238) on `IndexSettings` and passing it to a `IndexBuilder`. As of tantivy 0.16 only fast fields are allowed to be used.

```rust
let settings = IndexSettings {

@@ -39,7 +39,7 @@ Its representation is done by separating segments by a unicode char `\x01`, and
- `value`: The value representation is just the regular Value representation.

This representation is designed to align the natural sort of Terms with the lexicographical sort
of their binary representation (Tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).
of their binary representation (tantivy's dictionary (whether fst or sstable) is sorted and does prefix encoding).

In the example above, the terms will be sorted as

@@ -28,7 +28,7 @@ fn main() -> tantivy::Result<()> {
    let mut index_writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
    index_writer.add_document(doc!(title => "The Old Man and the Sea"))?;
    index_writer.add_document(doc!(title => "Of Mice and Men"))?;
    index_writer.add_document(doc!(title => "The modern Promotheus"))?;
    index_writer.add_document(doc!(title => "The modern Prometheus"))?;
    index_writer.commit()?;

    let reader = index.reader()?;

@@ -151,7 +151,7 @@ impl fmt::Debug for OwnedBytes {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        // We truncate the bytes in order to make sure the debug string
        // is not too long.
        let bytes_truncated: &[u8] = if self.len() > 8 {
        let bytes_truncated: &[u8] = if self.len() > 10 {
            &self.as_slice()[..10]
        } else {
            self.as_slice()

@@ -252,6 +252,11 @@ mod tests {
            format!("{short_bytes:?}"),
            "OwnedBytes([97, 98, 99, 100], len=4)"
        );
        let medium_bytes = OwnedBytes::new(b"abcdefghi".as_ref());
        assert_eq!(
            format!("{medium_bytes:?}"),
            "OwnedBytes([97, 98, 99, 100, 101, 102, 103, 104, 105], len=9)"
        );
        let long_bytes = OwnedBytes::new(b"abcdefghijklmnopq".as_ref());
        assert_eq!(
            format!("{long_bytes:?}"),

@@ -109,6 +109,8 @@ where F: nom::Parser<I, (O, ErrorList), Infallible> {
    move |input: I| match f.parse(input) {
        Ok((input, (output, _err))) => Ok((input, output)),
        Err(Err::Incomplete(needed)) => Err(Err::Incomplete(needed)),
        // old versions don't understand this is uninhabited and need the empty match to help,
        // newer versions warn because this arm is unreachable (which it is indeed).
        Err(Err::Error(val)) | Err(Err::Failure(val)) => match val {},
    }
}

@@ -6,12 +6,12 @@ use std::fmt::Write;
#[derive(Debug, Clone, Hash, Copy, Eq, PartialEq)]
pub enum Occur {
    /// For a given document to be considered for scoring,
    /// at least one of the terms with the Should or the Must
    /// at least one of the queries with the Should or the Must
    /// Occur constraint must be within the document.
    Should,
    /// Document without the term are excluded from the search.
    /// Document without the queries are excluded from the search.
    Must,
    /// Document that contain the term are excluded from the
    /// Document that contain the query are excluded from the
    /// search.
    MustNot,
}
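For context on the `Occur` doc-comment fix above: the same three constraints drive boolean composition at the query level. A hedged sketch using tantivy's query types; the particular sub-query combination is illustrative only, not taken from the diff.

```rust
use tantivy::query::{AllQuery, BooleanQuery, EmptyQuery, Occur, Query};

fn main() {
    // Should: at least one Should/Must sub-query must match for scoring.
    // Must: documents not matching the sub-query are excluded.
    // MustNot: documents matching the sub-query are excluded.
    let subqueries: Vec<(Occur, Box<dyn Query>)> = vec![
        (Occur::Must, Box::new(AllQuery)),
        (Occur::MustNot, Box::new(EmptyQuery)),
    ];
    let _query = BooleanQuery::new(subqueries);
}
```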
@@ -767,7 +767,7 @@ fn occur_leaf(inp: &str) -> IResult<&str, (Option<Occur>, UserInputAst)> {
    tuple((fallible(occur_symbol), boosted_leaf))(inp)
}

#[allow(clippy::type_complexity)]
#[expect(clippy::type_complexity)]
fn operand_occur_leaf_infallible(
    inp: &str,
) -> JResult<&str, (Option<BinaryOperand>, Option<Occur>, Option<UserInputAst>)> {

@@ -833,7 +833,7 @@ fn aggregate_infallible_expressions(
    if early_operand {
        err.push(LenientErrorInternal {
            pos: 0,
            message: "Found unexpeted boolean operator before term".to_string(),
            message: "Found unexpected boolean operator before term".to_string(),
        });
    }

@@ -856,7 +856,7 @@ fn aggregate_infallible_expressions(
            _ => Some(Occur::Should),
        };
        if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
            // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
            // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
            clauses.push(vec![(
                Some(Occur::Should),
                ast.clone().unary(Occur::MustNot),

@@ -872,7 +872,7 @@ fn aggregate_infallible_expressions(
            None => None,
        };
        if occur == &Some(Occur::MustNot) && default_op == Some(Occur::Should) {
            // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
            // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
            clauses.push(vec![(
                Some(Occur::Should),
                ast.clone().unary(Occur::MustNot),

@@ -897,7 +897,7 @@ fn aggregate_infallible_expressions(
        }
        Some(BinaryOperand::Or) => {
            if last_occur == Some(Occur::MustNot) {
                // if occur is MustNot *and* operation is OR, we synthetize a ShouldNot
                // if occur is MustNot *and* operation is OR, we synthesize a ShouldNot
                clauses.push(vec![(Some(Occur::Should), last_ast.unary(Occur::MustNot))]);
            } else {
                clauses.push(vec![(last_occur.or(Some(Occur::Should)), last_ast)]);

@@ -1057,7 +1057,7 @@ mod test {
        valid_parse("1", 1.0, "");
        valid_parse("0.234234 aaa", 0.234234f64, " aaa");
        error_parse(".3332");
        // TODO trinity-1686a: I disagree that it should fail, I think it should succeeed,
        // TODO trinity-1686a: I disagree that it should fail, I think it should succeed,
        // consuming only "1", and leave "." for the next thing (which will likely fail then)
        // error_parse("1.");
        error_parse("-1.");

@@ -1467,7 +1467,7 @@ mod test {
    }

    #[test]
    fn test_parse_query_to_triming_spaces() {
    fn test_parse_query_to_trimming_spaces() {
        test_parse_query_to_ast_helper(" abc", "abc");
        test_parse_query_to_ast_helper("abc ", "abc");
        test_parse_query_to_ast_helper("( a OR abc)", "(?a ?abc)");

@@ -267,7 +267,7 @@ impl fmt::Debug for UserInputAst {
        match *self {
            UserInputAst::Clause(ref subqueries) => {
                if subqueries.is_empty() {
                    // TODO this will break ast reserialization, is writing "( )" enought?
                    // TODO this will break ast reserialization, is writing "( )" enough?
                    write!(formatter, "<emptyclause>")?;
                } else {
                    write!(formatter, "(")?;
@@ -21,7 +21,10 @@ impl<K, V, S> MemoryConsumption for HashMap<K, V, S> {
|
||||
|
||||
/// Aggregation memory limit after which the request fails. Defaults to DEFAULT_MEMORY_LIMIT
/// (500MB). The limit is shared by all SegmentCollectors
pub struct AggregationLimits {
///
/// The memory limit is also a guard, which tracks how much it allocated and releases it's memory
/// on the shared counter. Cloning will create a new guard.
pub struct AggregationLimitsGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
@@ -29,28 +32,41 @@ pub struct AggregationLimits {
/// The maximum number of buckets _returned_
/// This is not counting intermediate buckets.
bucket_limit: u32,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}
impl Clone for AggregationLimits {
impl Clone for AggregationLimitsGuard {
fn clone(&self) -> Self {
Self {
memory_consumption: Arc::clone(&self.memory_consumption),
memory_limit: self.memory_limit,
bucket_limit: self.bucket_limit,
allocated_with_the_guard: 0,
}
}
}

impl Default for AggregationLimits {
impl Drop for AggregationLimitsGuard {
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
/// This is used to clear the segment specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}

impl Default for AggregationLimitsGuard {
fn default() -> Self {
Self {
memory_consumption: Default::default(),
memory_limit: DEFAULT_MEMORY_LIMIT.into(),
bucket_limit: DEFAULT_BUCKET_LIMIT,
allocated_with_the_guard: 0,
}
}
}

impl AggregationLimits {
impl AggregationLimitsGuard {
/// *memory_limit*
/// memory_limit is defined in bytes.
/// Aggregation fails when the estimated memory consumption of the aggregation is higher than
@@ -67,24 +83,15 @@ impl AggregationLimits {
memory_consumption: Default::default(),
memory_limit: memory_limit.unwrap_or(DEFAULT_MEMORY_LIMIT).into(),
bucket_limit: bucket_limit.unwrap_or(DEFAULT_BUCKET_LIMIT),
}
}

/// Create a new ResourceLimitGuard, that will release the memory when dropped.
pub fn new_guard(&self) -> ResourceLimitGuard {
ResourceLimitGuard {
// The counter which is shared between the aggregations for one request.
memory_consumption: Arc::clone(&self.memory_consumption),
// The memory_limit in bytes
memory_limit: self.memory_limit,
allocated_with_the_guard: 0,
}
}

pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
pub(crate) fn add_memory_consumed(&mut self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
self.allocated_with_the_guard += add_num_bytes;
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
@@ -109,34 +116,6 @@ fn validate_memory_consumption(
Ok(())
}

pub struct ResourceLimitGuard {
/// The counter which is shared between the aggregations for one request.
memory_consumption: Arc<AtomicU64>,
/// The memory_limit in bytes
memory_limit: ByteCount,
/// Allocated memory with this guard.
allocated_with_the_guard: u64,
}

impl ResourceLimitGuard {
pub(crate) fn add_memory_consumed(&self, add_num_bytes: u64) -> crate::Result<()> {
let prev_value = self
.memory_consumption
.fetch_add(add_num_bytes, Ordering::Relaxed);
validate_memory_consumption(prev_value + add_num_bytes, self.memory_limit)?;
Ok(())
}
}

impl Drop for ResourceLimitGuard {
/// Removes the memory consumed tracked by this _instance_ of AggregationLimits.
/// This is used to clear the segment specific memory consumption all at once.
fn drop(&mut self) {
self.memory_consumption
.fetch_sub(self.allocated_with_the_guard, Ordering::Relaxed);
}
}

#[cfg(test)]
mod tests {
use crate::aggregation::tests::exec_request_with_query;

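The hunks above fold the old `AggregationLimits` / `ResourceLimitGuard` pair into a single `AggregationLimitsGuard`: every clone shares one atomic counter, each clone tracks only the bytes it added itself, and `Drop` releases exactly that amount. A minimal, self-contained sketch of the same accounting pattern (the names and the string error type are illustrative, not tantivy's actual API):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::Arc;

/// Toy guard: clones share the counter, each clone accounts only for its own allocations.
struct MemoryGuard {
    shared: Arc<AtomicU64>,
    limit: u64,
    allocated_by_me: u64,
}

impl MemoryGuard {
    fn new(limit: u64) -> Self {
        Self { shared: Arc::new(AtomicU64::new(0)), limit, allocated_by_me: 0 }
    }

    fn add(&mut self, bytes: u64) -> Result<(), String> {
        let prev = self.shared.fetch_add(bytes, Ordering::Relaxed);
        self.allocated_by_me += bytes;
        if prev + bytes > self.limit {
            return Err(format!("memory limit exceeded: {} > {}", prev + bytes, self.limit));
        }
        Ok(())
    }
}

impl Clone for MemoryGuard {
    fn clone(&self) -> Self {
        // A fresh clone has contributed nothing to the shared counter yet.
        Self { shared: Arc::clone(&self.shared), limit: self.limit, allocated_by_me: 0 }
    }
}

impl Drop for MemoryGuard {
    fn drop(&mut self) {
        // Release only what this clone accounted for.
        self.shared.fetch_sub(self.allocated_by_me, Ordering::Relaxed);
    }
}

fn main() {
    let mut request_guard = MemoryGuard::new(1_000);
    let mut segment_guard = request_guard.clone();
    segment_guard.add(600).unwrap();
    drop(segment_guard); // the 600 bytes are released on the shared counter
    assert!(request_guard.add(600).is_ok());
}
```

The upside of this design, as the diff suggests, is that a per-segment clone can be dropped to release all of its segment-local consumption in one step without touching what other segments recorded.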
@@ -5,7 +5,6 @@ use std::io;
|
||||
|
||||
use columnar::{Column, ColumnBlockAccessor, ColumnType, DynamicColumn, StrColumn};
|
||||
|
||||
use super::agg_limits::ResourceLimitGuard;
|
||||
use super::agg_req::{Aggregation, AggregationVariants, Aggregations};
|
||||
use super::bucket::{
|
||||
DateHistogramAggregationReq, HistogramAggregation, RangeAggregation, TermsAggregation,
|
||||
@@ -14,7 +13,7 @@ use super::metric::{
|
||||
AverageAggregation, CardinalityAggregationReq, CountAggregation, ExtendedStatsAggregation,
|
||||
MaxAggregation, MinAggregation, StatsAggregation, SumAggregation,
|
||||
};
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::segment_agg_result::AggregationLimitsGuard;
|
||||
use super::VecWithNames;
|
||||
use crate::aggregation::{f64_to_fastfield_u64, Key};
|
||||
use crate::index::SegmentReader;
|
||||
@@ -46,7 +45,7 @@ pub struct AggregationWithAccessor {
|
||||
pub(crate) str_dict_column: Option<StrColumn>,
|
||||
pub(crate) field_type: ColumnType,
|
||||
pub(crate) sub_aggregation: AggregationsWithAccessor,
|
||||
pub(crate) limits: ResourceLimitGuard,
|
||||
pub(crate) limits: AggregationLimitsGuard,
|
||||
pub(crate) column_block_accessor: ColumnBlockAccessor<u64>,
|
||||
/// Used for missing term aggregation, which checks all columns for existence.
|
||||
/// And also for `top_hits` aggregation, which may sort on multiple fields.
|
||||
@@ -69,7 +68,7 @@ impl AggregationWithAccessor {
|
||||
sub_aggregation: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
limits: AggregationLimits,
|
||||
limits: AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<AggregationWithAccessor>> {
|
||||
let mut agg = agg.clone();
|
||||
|
||||
@@ -91,7 +90,7 @@ impl AggregationWithAccessor {
|
||||
&limits,
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits: limits.clone(),
|
||||
missing_value_for_accessor: None,
|
||||
str_dict_column: None,
|
||||
column_block_accessor: Default::default(),
|
||||
@@ -106,6 +105,7 @@ impl AggregationWithAccessor {
|
||||
value_accessors: HashMap<String, Vec<DynamicColumn>>|
|
||||
-> crate::Result<()> {
|
||||
let (accessor, field_type) = accessors.first().expect("at least one accessor");
|
||||
let limits = limits.clone();
|
||||
let res = AggregationWithAccessor {
|
||||
segment_ordinal,
|
||||
// TODO: We should do away with the `accessor` field altogether
|
||||
@@ -120,7 +120,7 @@ impl AggregationWithAccessor {
|
||||
&limits,
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits,
|
||||
missing_value_for_accessor: None,
|
||||
str_dict_column: None,
|
||||
column_block_accessor: Default::default(),
|
||||
@@ -186,6 +186,8 @@ impl AggregationWithAccessor {
|
||||
.map(|missing| match missing {
|
||||
Key::Str(_) => ColumnType::Str,
|
||||
Key::F64(_) => ColumnType::F64,
|
||||
Key::I64(_) => ColumnType::I64,
|
||||
Key::U64(_) => ColumnType::U64,
|
||||
})
|
||||
.unwrap_or(ColumnType::U64);
|
||||
let column_and_types = get_all_ff_reader_or_empty(
|
||||
@@ -232,14 +234,18 @@ impl AggregationWithAccessor {
|
||||
missing.clone()
|
||||
};
|
||||
|
||||
let missing_value_for_accessor = if let Some(missing) =
|
||||
missing_value_term_agg.as_ref()
|
||||
{
|
||||
get_missing_val(column_type, missing, agg.agg.get_fast_field_names()[0])?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
let missing_value_for_accessor =
|
||||
if let Some(missing) = missing_value_term_agg.as_ref() {
|
||||
get_missing_val_as_u64_lenient(
|
||||
column_type,
|
||||
missing,
|
||||
agg.agg.get_fast_field_names()[0],
|
||||
)?
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let limits = limits.clone();
|
||||
let agg = AggregationWithAccessor {
|
||||
segment_ordinal,
|
||||
missing_value_for_accessor,
|
||||
@@ -255,7 +261,7 @@ impl AggregationWithAccessor {
|
||||
)?,
|
||||
agg: agg.clone(),
|
||||
str_dict_column: str_dict_column.clone(),
|
||||
limits: limits.new_guard(),
|
||||
limits,
|
||||
column_block_accessor: Default::default(),
|
||||
};
|
||||
res.push(agg);
|
||||
@@ -330,7 +336,14 @@ impl AggregationWithAccessor {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_missing_val(
|
||||
/// Get the missing value as internal u64 representation
|
||||
///
|
||||
/// For terms we use u64::MAX as sentinel value
|
||||
/// For numerical data we convert the value into the representation
|
||||
/// we would get from the fast field, when we open it as u64_lenient_for_type.
|
||||
///
|
||||
/// That way we can use it the same way as if it would come from the fastfield.
|
||||
fn get_missing_val_as_u64_lenient(
|
||||
column_type: ColumnType,
|
||||
missing: &Key,
|
||||
field_name: &str,
|
||||
@@ -339,9 +352,18 @@ fn get_missing_val(
|
||||
Key::Str(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
// Allow fallback to number on text fields
|
||||
Key::F64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::U64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::I64(_) if column_type == ColumnType::Str => Some(u64::MAX),
|
||||
Key::F64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val, &column_type)
|
||||
}
|
||||
// NOTE: We may loose precision of the passed missing value by casting i64 and u64 to f64.
|
||||
Key::I64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||
}
|
||||
Key::U64(val) if column_type.numerical_type().is_some() => {
|
||||
f64_to_fastfield_u64(*val as f64, &column_type)
|
||||
}
|
||||
_ => {
|
||||
return Err(crate::TantivyError::InvalidArgument(format!(
|
||||
"Missing value {missing:?} for field {field_name} is not supported for column \
|
||||
@@ -365,7 +387,7 @@ pub(crate) fn get_aggs_with_segment_accessor_and_validate(
|
||||
aggs: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
limits: &AggregationLimits,
|
||||
limits: &AggregationLimitsGuard,
|
||||
) -> crate::Result<AggregationsWithAccessor> {
|
||||
let mut aggss = Vec::new();
|
||||
for (key, agg) in aggs.iter() {
|
||||
|
||||
@@ -1,4 +1,5 @@
//! Contains the final aggregation tree.
//!
//! This tree can be converted via the `into()` method from `IntermediateAggregationResults`.
//! This conversion computes the final result. For example: The intermediate result contains
//! intermediate average results, which is the sum and the number of values. The actual average is
@@ -187,7 +188,7 @@ pub enum BucketEntries<T> {
}

impl<T> BucketEntries<T> {
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &T> + 'a> {
fn iter<'a>(&'a self) -> Box<dyn Iterator<Item = &'a T> + 'a> {
match self {
BucketEntries::Vec(vec) => Box::new(vec.iter()),
BucketEntries::HashMap(map) => Box::new(map.values()),

@@ -5,7 +5,7 @@ use crate::aggregation::agg_result::AggregationResults;
|
||||
use crate::aggregation::buf_collector::DOC_BLOCK_SIZE;
|
||||
use crate::aggregation::collector::AggregationCollector;
|
||||
use crate::aggregation::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use crate::aggregation::segment_agg_result::AggregationLimits;
|
||||
use crate::aggregation::segment_agg_result::AggregationLimitsGuard;
|
||||
use crate::aggregation::tests::{get_test_index_2_segments, get_test_index_from_values_and_terms};
|
||||
use crate::aggregation::DistributedAggregationCollector;
|
||||
use crate::query::{AllQuery, TermQuery};
|
||||
@@ -130,7 +130,7 @@ fn test_aggregation_flushing(
|
||||
let agg_res: AggregationResults = if use_distributed_collector {
|
||||
let collector = DistributedAggregationCollector::from_aggs(
|
||||
agg_req.clone(),
|
||||
AggregationLimits::default(),
|
||||
AggregationLimitsGuard::default(),
|
||||
);
|
||||
|
||||
let searcher = reader.searcher();
|
||||
@@ -146,7 +146,7 @@ fn test_aggregation_flushing(
|
||||
.expect("Post deserialization failed");
|
||||
|
||||
intermediate_agg_result
|
||||
.into_final_result(agg_req, &Default::default())
|
||||
.into_final_result(agg_req, Default::default())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = get_collector(agg_req);
|
||||
@@ -460,7 +460,7 @@ fn test_aggregation_level2(
|
||||
|
||||
let searcher = reader.searcher();
|
||||
let res = searcher.search(&term_query, &collector).unwrap();
|
||||
res.into_final_result(agg_req.clone(), &Default::default())
|
||||
res.into_final_result(agg_req.clone(), Default::default())
|
||||
.unwrap()
|
||||
} else {
|
||||
let collector = get_collector(agg_req.clone());
|
||||
@@ -870,7 +870,7 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
.add_document(doc!(json => json!({"mixed_type": "blue", "mixed_price": 5.0})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
// => Segment with all boolen
|
||||
// => Segment with all boolean
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_type": true, "mixed_price": "no_price"})))
|
||||
.unwrap();
|
||||
@@ -939,11 +939,11 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
},
|
||||
"termagg": {
|
||||
"buckets": [
|
||||
{ "doc_count": 1, "key": 10.0, "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 1, "key": 10, "min_price": { "value": 10.0 } },
|
||||
{ "doc_count": 3, "key": "blue", "min_price": { "value": 5.0 } },
|
||||
{ "doc_count": 2, "key": "red", "min_price": { "value": 1.0 } },
|
||||
{ "doc_count": 1, "key": -20.5, "min_price": { "value": -20.5 } },
|
||||
{ "doc_count": 2, "key": 1.0, "key_as_string": "true", "min_price": { "value": null } },
|
||||
{ "doc_count": 2, "key": 1, "key_as_string": "true", "min_price": { "value": null } },
|
||||
],
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
@@ -951,3 +951,60 @@ fn test_aggregation_on_json_object_mixed_types() {
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_aggregation_on_json_object_mixed_numerical_segments() {
|
||||
let mut schema_builder = Schema::builder();
|
||||
let json = schema_builder.add_json_field("json", FAST);
|
||||
let schema = schema_builder.build();
|
||||
let index = Index::create_in_ram(schema);
|
||||
let mut index_writer: IndexWriter = index.writer_for_tests().unwrap();
|
||||
// => Segment with all values f64 numeric
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10.5})))
|
||||
.unwrap();
|
||||
// Gets converted to f64!
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
// => Segment with all values i64 numeric
|
||||
index_writer
|
||||
.add_document(doc!(json => json!({"mixed_price": 10})))
|
||||
.unwrap();
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
index_writer.commit().unwrap();
|
||||
|
||||
// All bucket types
|
||||
let agg_req_str = r#"
|
||||
{
|
||||
"termagg": {
|
||||
"terms": {
|
||||
"field": "json.mixed_price"
|
||||
}
|
||||
}
|
||||
} "#;
|
||||
let agg: Aggregations = serde_json::from_str(agg_req_str).unwrap();
|
||||
let aggregation_collector = get_collector(agg);
|
||||
let reader = index.reader().unwrap();
|
||||
let searcher = reader.searcher();
|
||||
|
||||
let aggregation_results = searcher.search(&AllQuery, &aggregation_collector).unwrap();
|
||||
let aggregation_res_json = serde_json::to_value(aggregation_results).unwrap();
|
||||
use pretty_assertions::assert_eq;
|
||||
assert_eq!(
|
||||
&aggregation_res_json,
|
||||
&serde_json::json!({
|
||||
"termagg": {
|
||||
"buckets": [
|
||||
{ "doc_count": 2, "key": 10},
|
||||
{ "doc_count": 1, "key": 10.5},
|
||||
],
|
||||
"doc_count_error_upper_bound": 0,
|
||||
"sum_other_doc_count": 0
|
||||
}
|
||||
}
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
@@ -244,7 +244,7 @@ fn parse_into_milliseconds(input: &str) -> Result<i64, AggregationError> {
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests {
|
||||
pub(crate) mod tests {
|
||||
use pretty_assertions::assert_eq;
|
||||
|
||||
use super::*;
|
||||
|
||||
@@ -438,7 +438,7 @@ fn intermediate_buckets_to_final_buckets_fill_gaps(
|
||||
buckets: Vec<IntermediateHistogramBucketEntry>,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
// Generate the full list of buckets without gaps.
|
||||
//
|
||||
@@ -496,7 +496,7 @@ pub(crate) fn intermediate_histogram_buckets_to_final_buckets(
|
||||
is_date_agg: bool,
|
||||
histogram_req: &HistogramAggregation,
|
||||
sub_aggregation: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<Vec<BucketEntry>> {
|
||||
// Normalization is column type dependent.
|
||||
// The request used in the the call to final is not yet be normalized.
|
||||
@@ -750,7 +750,7 @@ mod tests {
|
||||
agg_req,
|
||||
&index,
|
||||
None,
|
||||
AggregationLimits::new(Some(5_000), None),
|
||||
AggregationLimitsGuard::new(Some(5_000), None),
|
||||
)
|
||||
.unwrap_err();
|
||||
assert!(res.to_string().starts_with(
|
||||
|
||||
@@ -112,18 +112,64 @@ impl Serialize for CustomOrder {
|
||||
impl<'de> Deserialize<'de> for CustomOrder {
|
||||
fn deserialize<D>(deserializer: D) -> Result<CustomOrder, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
HashMap::<String, Order>::deserialize(deserializer).and_then(|map| {
|
||||
if let Some((key, value)) = map.into_iter().next() {
|
||||
let value = serde_json::Value::deserialize(deserializer)?;
|
||||
let return_err = |message, val: serde_json::Value| {
|
||||
de::Error::custom(format!(
|
||||
"{}, but got {}",
|
||||
message,
|
||||
serde_json::to_string(&val).unwrap()
|
||||
))
|
||||
};
|
||||
|
||||
match value {
|
||||
serde_json::Value::Object(map) => {
|
||||
if map.len() != 1 {
|
||||
return Err(return_err(
|
||||
"expected exactly one key-value pair in the order map",
|
||||
map.into(),
|
||||
));
|
||||
}
|
||||
|
||||
let (key, value) = map.into_iter().next().unwrap();
|
||||
let order = serde_json::from_value(value).map_err(de::Error::custom)?;
|
||||
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order: value,
|
||||
order,
|
||||
})
|
||||
} else {
|
||||
Err(de::Error::custom(
|
||||
"unexpected empty map in order".to_string(),
|
||||
))
|
||||
}
|
||||
})
|
||||
serde_json::Value::Array(arr) => {
|
||||
if arr.is_empty() {
|
||||
return Err(return_err("unexpected empty array in order", arr.into()));
|
||||
}
|
||||
if arr.len() != 1 {
|
||||
return Err(return_err(
|
||||
"only one sort order supported currently",
|
||||
arr.into(),
|
||||
));
|
||||
}
|
||||
let entry = arr.into_iter().next().unwrap();
|
||||
let map = entry
|
||||
.as_object()
|
||||
.ok_or_else(|| return_err("expected object as sort order", entry.clone()))?;
|
||||
let (key, value) = map.into_iter().next().ok_or_else(|| {
|
||||
return_err(
|
||||
"expected exactly one key-value pair in the order map",
|
||||
entry.clone(),
|
||||
)
|
||||
})?;
|
||||
let order = serde_json::from_value(value.clone()).map_err(de::Error::custom)?;
|
||||
|
||||
Ok(CustomOrder {
|
||||
target: key.as_str().into(),
|
||||
order,
|
||||
})
|
||||
}
|
||||
_ => Err(return_err(
|
||||
"unexpected type, expected an object or array",
|
||||
value,
|
||||
)),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -138,11 +184,23 @@ fn custom_order_serde_test() {
|
||||
assert_eq!(order_str, "{\"_key\":\"desc\"}");
|
||||
let order_deser = serde_json::from_str(&order_str).unwrap();
|
||||
|
||||
assert_eq!(order, order_deser);
|
||||
let order_deser: CustomOrder = serde_json::from_str("[{\"_key\":\"desc\"}]").unwrap();
|
||||
assert_eq!(order, order_deser);
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("{}");
|
||||
assert!(order_deser.is_err());
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> = serde_json::from_str("[]");
|
||||
assert!(order_deser.is_err());
|
||||
assert!(order_deser
|
||||
.unwrap_err()
|
||||
.to_string()
|
||||
.contains("unexpected empty array in order"));
|
||||
|
||||
let order_deser: serde_json::Result<CustomOrder> =
|
||||
serde_json::from_str(r#"[{"_key":"desc"},{"_key":"desc"}]"#);
|
||||
assert_eq!(
|
||||
order_deser.unwrap_err().to_string(),
|
||||
r#"only one sort order supported currently, but got [{"_key":"desc"},{"_key":"desc"}]"#
|
||||
);
|
||||
}
|
||||
|
||||
@@ -4,7 +4,6 @@ use std::ops::Range;
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use crate::aggregation::agg_limits::ResourceLimitGuard;
|
||||
use crate::aggregation::agg_req_with_accessor::AggregationsWithAccessor;
|
||||
use crate::aggregation::intermediate_agg_result::{
|
||||
IntermediateAggregationResult, IntermediateAggregationResults, IntermediateBucketResult,
|
||||
@@ -17,6 +16,7 @@ use crate::aggregation::*;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Provide user-defined buckets to aggregate on.
|
||||
///
|
||||
/// Two special buckets will automatically be created to cover the whole range of values.
|
||||
/// The provided buckets have to be continuous.
|
||||
/// During the aggregation, the values extracted from the fast_field `field` will be checked
|
||||
@@ -270,7 +270,7 @@ impl SegmentRangeCollector {
|
||||
pub(crate) fn from_req_and_validate(
|
||||
req: &RangeAggregation,
|
||||
sub_aggregation: &mut AggregationsWithAccessor,
|
||||
limits: &ResourceLimitGuard,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
field_type: ColumnType,
|
||||
accessor_idx: usize,
|
||||
) -> crate::Result<Self> {
|
||||
@@ -471,7 +471,7 @@ mod tests {
|
||||
SegmentRangeCollector::from_req_and_validate(
|
||||
&req,
|
||||
&mut Default::default(),
|
||||
&AggregationLimits::default().new_guard(),
|
||||
&mut AggregationLimitsGuard::default(),
|
||||
field_type,
|
||||
0,
|
||||
)
|
||||
|
||||
@@ -3,7 +3,9 @@ use std::io;
|
||||
use std::net::Ipv6Addr;
|
||||
|
||||
use columnar::column_values::CompactSpaceU64Accessor;
|
||||
use columnar::{ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64};
|
||||
use columnar::{
|
||||
ColumnType, Dictionary, MonotonicallyMappableToU128, MonotonicallyMappableToU64, NumericalValue,
|
||||
};
|
||||
use rustc_hash::FxHashMap;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
@@ -19,11 +21,11 @@ use crate::aggregation::intermediate_agg_result::{
|
||||
use crate::aggregation::segment_agg_result::{
|
||||
build_segment_agg_collector, SegmentAggregationCollector,
|
||||
};
|
||||
use crate::aggregation::{f64_from_fastfield_u64, format_date, Key};
|
||||
use crate::aggregation::{format_date, Key};
|
||||
use crate::error::DataCorruption;
|
||||
use crate::TantivyError;
|
||||
|
||||
/// Creates a bucket for every unique term and counts the number of occurences.
|
||||
/// Creates a bucket for every unique term and counts the number of occurrences.
|
||||
/// Note that doc_count in the response buckets equals term count here.
|
||||
///
|
||||
/// If the text is untokenized and single value, that means one term per document and therefore it
|
||||
@@ -156,7 +158,7 @@ pub struct TermsAggregation {
|
||||
/// when loading the text.
|
||||
/// Special Case 1:
|
||||
/// If we have multiple columns on one field, we need to have a union on the indices on both
|
||||
/// columns, to find docids without a value. That requires a special missing aggreggation.
|
||||
/// columns, to find docids without a value. That requires a special missing aggregation.
|
||||
/// Special Case 2: if the key is of type text and the column is numerical, we also need to use
|
||||
/// the special missing aggregation, since there is no mechanism in the numerical column to
|
||||
/// add text.
|
||||
@@ -362,7 +364,7 @@ impl SegmentTermCollector {
|
||||
let term_buckets = TermBuckets::default();
|
||||
|
||||
if let Some(custom_order) = req.order.as_ref() {
|
||||
// Validate sub aggregtion exists
|
||||
// Validate sub aggregation exists
|
||||
if let OrderTarget::SubAggregation(sub_agg_name) = &custom_order.target {
|
||||
let (agg_name, _agg_property) = get_agg_name_and_property(sub_agg_name);
|
||||
|
||||
@@ -497,6 +499,12 @@ impl SegmentTermCollector {
|
||||
Key::F64(val) => {
|
||||
dict.insert(IntermediateKey::F64(*val), intermediate_entry);
|
||||
}
|
||||
Key::U64(val) => {
|
||||
dict.insert(IntermediateKey::U64(*val), intermediate_entry);
|
||||
}
|
||||
Key::I64(val) => {
|
||||
dict.insert(IntermediateKey::I64(*val), intermediate_entry);
|
||||
}
|
||||
}
|
||||
|
||||
entries.swap_remove(index);
|
||||
@@ -583,8 +591,26 @@ impl SegmentTermCollector {
|
||||
} else {
|
||||
for (val, doc_count) in entries {
|
||||
let intermediate_entry = into_intermediate_bucket_entry(val, doc_count)?;
|
||||
let val = f64_from_fastfield_u64(val, &self.column_type);
|
||||
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
||||
if self.column_type == ColumnType::U64 {
|
||||
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||
} else if self.column_type == ColumnType::I64 {
|
||||
dict.insert(IntermediateKey::I64(i64::from_u64(val)), intermediate_entry);
|
||||
} else {
|
||||
let val = f64::from_u64(val);
|
||||
let val: NumericalValue = val.into();
|
||||
|
||||
match val.normalize() {
|
||||
NumericalValue::U64(val) => {
|
||||
dict.insert(IntermediateKey::U64(val), intermediate_entry);
|
||||
}
|
||||
NumericalValue::I64(val) => {
|
||||
dict.insert(IntermediateKey::I64(val), intermediate_entry);
|
||||
}
|
||||
NumericalValue::F64(val) => {
|
||||
dict.insert(IntermediateKey::F64(val), intermediate_entry);
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
@@ -643,7 +669,7 @@ mod tests {
|
||||
exec_request, exec_request_with_query, exec_request_with_query_and_memory_limit,
|
||||
get_test_index_from_terms, get_test_index_from_values_and_terms,
|
||||
};
|
||||
use crate::aggregation::AggregationLimits;
|
||||
use crate::aggregation::AggregationLimitsGuard;
|
||||
use crate::indexer::NoMergePolicy;
|
||||
use crate::schema::{IntoIpv6Addr, Schema, FAST, STRING};
|
||||
use crate::{Index, IndexWriter};
|
||||
@@ -1206,8 +1232,8 @@ mod tests {
|
||||
#[test]
|
||||
fn terms_aggregation_min_doc_count_special_case() -> crate::Result<()> {
|
||||
let terms_per_segment = vec![
|
||||
vec!["terma", "terma", "termb", "termb", "termb", "termc"],
|
||||
vec!["terma", "terma", "termb", "termc", "termc"],
|
||||
vec!["terma", "terma", "termb", "termb", "termb"],
|
||||
vec!["terma", "terma", "termb"],
|
||||
];
|
||||
|
||||
let index = get_test_index_from_terms(false, &terms_per_segment)?;
|
||||
@@ -1229,8 +1255,6 @@ mod tests {
|
||||
assert_eq!(res["my_texts"]["buckets"][0]["doc_count"], 4);
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["key"], "termb");
|
||||
assert_eq!(res["my_texts"]["buckets"][1]["doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["key"], "termc");
|
||||
assert_eq!(res["my_texts"]["buckets"][2]["doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["sum_other_doc_count"], 0);
|
||||
assert_eq!(res["my_texts"]["doc_count_error_upper_bound"], 0);
|
||||
|
||||
@@ -1398,7 +1422,7 @@ mod tests {
|
||||
agg_req,
|
||||
&index,
|
||||
None,
|
||||
AggregationLimits::new(Some(50_000), None),
|
||||
AggregationLimitsGuard::new(Some(50_000), None),
|
||||
)
|
||||
.unwrap_err();
|
||||
assert!(res
|
||||
@@ -1659,7 +1683,7 @@ mod tests {
|
||||
res["my_texts"]["buckets"][2]["key"],
|
||||
serde_json::Value::Null
|
||||
);
|
||||
// text field with numner as missing fallback
|
||||
// text field with number as missing fallback
|
||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 5);
|
||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||
@@ -1719,6 +1743,54 @@ mod tests {
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_u64_value() -> crate::Result<()> {
|
||||
// Make sure that large u64 are not truncated
|
||||
let mut schema_builder = Schema::builder();
|
||||
let id_field = schema_builder.add_u64_field("id", FAST);
|
||||
let index = Index::create_in_ram(schema_builder.build());
|
||||
{
|
||||
let mut index_writer = index.writer_with_num_threads(1, 20_000_000)?;
|
||||
index_writer.set_merge_policy(Box::new(NoMergePolicy));
|
||||
index_writer.add_document(doc!(
|
||||
id_field => 9_223_372_036_854_775_807u64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
id_field => 1_769_070_189_829_214_202u64,
|
||||
))?;
|
||||
index_writer.add_document(doc!(
|
||||
id_field => 1_769_070_189_829_214_202u64,
|
||||
))?;
|
||||
index_writer.commit()?;
|
||||
}
|
||||
|
||||
let agg_req: Aggregations = serde_json::from_value(json!({
|
||||
"my_ids": {
|
||||
"terms": {
|
||||
"field": "id"
|
||||
},
|
||||
}
|
||||
}))
|
||||
.unwrap();
|
||||
|
||||
let res = exec_request_with_query(agg_req, &index, None)?;
|
||||
|
||||
// id field
|
||||
assert_eq!(
|
||||
res["my_ids"]["buckets"][0]["key"],
|
||||
1_769_070_189_829_214_202u64
|
||||
);
|
||||
assert_eq!(res["my_ids"]["buckets"][0]["doc_count"], 2);
|
||||
assert_eq!(
|
||||
res["my_ids"]["buckets"][1]["key"],
|
||||
9_223_372_036_854_775_807u64
|
||||
);
|
||||
assert_eq!(res["my_ids"]["buckets"][1]["doc_count"], 1);
|
||||
assert_eq!(res["my_ids"]["buckets"][2]["key"], serde_json::Value::Null);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn terms_aggregation_missing1() -> crate::Result<()> {
|
||||
let mut schema_builder = Schema::builder();
|
||||
@@ -1785,7 +1857,7 @@ mod tests {
|
||||
res["my_texts"]["buckets"][2]["key"],
|
||||
serde_json::Value::Null
|
||||
);
|
||||
// text field with numner as missing fallback
|
||||
// text field with number as missing fallback
|
||||
assert_eq!(res["my_texts2"]["buckets"][0]["key"], "Hello Hello");
|
||||
assert_eq!(res["my_texts2"]["buckets"][0]["doc_count"], 4);
|
||||
assert_eq!(res["my_texts2"]["buckets"][1]["key"], 1337.0);
|
||||
|
||||
@@ -70,7 +70,6 @@ impl SegmentAggregationCollector for TermMissingAgg {
|
||||
)?;
|
||||
missing_entry.sub_aggregation = res;
|
||||
}
|
||||
|
||||
entries.insert(missing.into(), missing_entry);
|
||||
|
||||
let bucket = IntermediateBucketResult::Terms {
|
||||
|
||||
@@ -4,7 +4,7 @@ use super::agg_result::AggregationResults;
|
||||
use super::buf_collector::BufAggregationCollector;
|
||||
use super::intermediate_agg_result::IntermediateAggregationResults;
|
||||
use super::segment_agg_result::{
|
||||
build_segment_agg_collector, AggregationLimits, SegmentAggregationCollector,
|
||||
build_segment_agg_collector, AggregationLimitsGuard, SegmentAggregationCollector,
|
||||
};
|
||||
use crate::aggregation::agg_req_with_accessor::get_aggs_with_segment_accessor_and_validate;
|
||||
use crate::collector::{Collector, SegmentCollector};
|
||||
@@ -22,7 +22,7 @@ pub const DEFAULT_MEMORY_LIMIT: u64 = 500_000_000;
|
||||
/// The collector collects all aggregations by the underlying aggregation request.
|
||||
pub struct AggregationCollector {
|
||||
agg: Aggregations,
|
||||
limits: AggregationLimits,
|
||||
limits: AggregationLimitsGuard,
|
||||
}
|
||||
|
||||
impl AggregationCollector {
|
||||
@@ -30,7 +30,7 @@ impl AggregationCollector {
|
||||
///
|
||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||
/// bucket limit)
|
||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||
Self { agg, limits }
|
||||
}
|
||||
}
|
||||
@@ -45,7 +45,7 @@ impl AggregationCollector {
|
||||
/// into the final `AggregationResults` via the `into_final_result()` method.
|
||||
pub struct DistributedAggregationCollector {
|
||||
agg: Aggregations,
|
||||
limits: AggregationLimits,
|
||||
limits: AggregationLimitsGuard,
|
||||
}
|
||||
|
||||
impl DistributedAggregationCollector {
|
||||
@@ -53,7 +53,7 @@ impl DistributedAggregationCollector {
|
||||
///
|
||||
/// Aggregation fails when the limits in `AggregationLimits` is exceeded. (memory limit and
|
||||
/// bucket limit)
|
||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimits) -> Self {
|
||||
pub fn from_aggs(agg: Aggregations, limits: AggregationLimitsGuard) -> Self {
|
||||
Self { agg, limits }
|
||||
}
|
||||
}
|
||||
@@ -115,7 +115,7 @@ impl Collector for AggregationCollector {
|
||||
segment_fruits: Vec<<Self::Child as SegmentCollector>::Fruit>,
|
||||
) -> crate::Result<Self::Fruit> {
|
||||
let res = merge_fruits(segment_fruits)?;
|
||||
res.into_final_result(self.agg.clone(), &self.limits)
|
||||
res.into_final_result(self.agg.clone(), self.limits.clone())
|
||||
}
|
||||
}
|
||||
|
||||
@@ -147,7 +147,7 @@ impl AggregationSegmentCollector {
|
||||
agg: &Aggregations,
|
||||
reader: &SegmentReader,
|
||||
segment_ordinal: SegmentOrdinal,
|
||||
limits: &AggregationLimits,
|
||||
limits: &AggregationLimitsGuard,
|
||||
) -> crate::Result<Self> {
|
||||
let mut aggs_with_accessor =
|
||||
get_aggs_with_segment_accessor_and_validate(agg, reader, segment_ordinal, limits)?;
|
||||
|
||||
@@ -22,7 +22,7 @@ use super::metric::{
|
||||
IntermediateAverage, IntermediateCount, IntermediateExtendedStats, IntermediateMax,
|
||||
IntermediateMin, IntermediateStats, IntermediateSum, PercentilesCollector, TopHitsTopNComputer,
|
||||
};
|
||||
use super::segment_agg_result::AggregationLimits;
|
||||
use super::segment_agg_result::AggregationLimitsGuard;
|
||||
use super::{format_date, AggregationError, Key, SerializedKey};
|
||||
use crate::aggregation::agg_result::{AggregationResults, BucketEntries, BucketEntry};
|
||||
use crate::aggregation::bucket::TermsAggregationInternal;
|
||||
@@ -51,12 +51,18 @@ pub enum IntermediateKey {
|
||||
Str(String),
|
||||
/// `f64` key
|
||||
F64(f64),
|
||||
/// `i64` key
|
||||
I64(i64),
|
||||
/// `u64` key
|
||||
U64(u64),
|
||||
}
|
||||
impl From<Key> for IntermediateKey {
|
||||
fn from(value: Key) -> Self {
|
||||
match value {
|
||||
Key::Str(s) => Self::Str(s),
|
||||
Key::F64(f) => Self::F64(f),
|
||||
Key::U64(f) => Self::U64(f),
|
||||
Key::I64(f) => Self::I64(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -73,7 +79,9 @@ impl From<IntermediateKey> for Key {
|
||||
}
|
||||
}
|
||||
IntermediateKey::F64(f) => Self::F64(f),
|
||||
IntermediateKey::Bool(f) => Self::F64(f as u64 as f64),
|
||||
IntermediateKey::Bool(f) => Self::U64(f as u64),
|
||||
IntermediateKey::U64(f) => Self::U64(f),
|
||||
IntermediateKey::I64(f) => Self::I64(f),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -86,6 +94,8 @@ impl std::hash::Hash for IntermediateKey {
|
||||
match self {
|
||||
IntermediateKey::Str(text) => text.hash(state),
|
||||
IntermediateKey::F64(val) => val.to_bits().hash(state),
|
||||
IntermediateKey::U64(val) => val.hash(state),
|
||||
IntermediateKey::I64(val) => val.hash(state),
|
||||
IntermediateKey::Bool(val) => val.hash(state),
|
||||
IntermediateKey::IpAddr(val) => val.hash(state),
|
||||
}
|
||||
@@ -112,9 +122,9 @@ impl IntermediateAggregationResults {
|
||||
pub fn into_final_result(
|
||||
self,
|
||||
req: Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
mut limits: AggregationLimitsGuard,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
let res = self.into_final_result_internal(&req, limits)?;
|
||||
let res = self.into_final_result_internal(&req, &mut limits)?;
|
||||
let bucket_count = res.get_bucket_count() as u32;
|
||||
if bucket_count > limits.get_bucket_limit() {
|
||||
return Err(TantivyError::AggregationError(
|
||||
@@ -131,7 +141,7 @@ impl IntermediateAggregationResults {
|
||||
pub(crate) fn into_final_result_internal(
|
||||
self,
|
||||
req: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<AggregationResults> {
|
||||
let mut results: FxHashMap<String, AggregationResult> = FxHashMap::default();
|
||||
for (key, agg_res) in self.aggs_res.into_iter() {
|
||||
@@ -247,7 +257,7 @@ impl IntermediateAggregationResult {
|
||||
pub(crate) fn into_final_result(
|
||||
self,
|
||||
req: &Aggregation,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<AggregationResult> {
|
||||
let res = match self {
|
||||
IntermediateAggregationResult::Bucket(bucket) => {
|
||||
@@ -422,7 +432,7 @@ impl IntermediateBucketResult {
|
||||
pub(crate) fn into_final_bucket_result(
|
||||
self,
|
||||
req: &Aggregation,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<BucketResult> {
|
||||
match self {
|
||||
IntermediateBucketResult::Range(range_res) => {
|
||||
@@ -586,7 +596,7 @@ impl IntermediateTermBucketResult {
|
||||
self,
|
||||
req: &TermsAggregation,
|
||||
sub_aggregation_req: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<BucketResult> {
|
||||
let req = TermsAggregationInternal::from_req(req);
|
||||
let mut buckets: Vec<BucketEntry> = self
|
||||
@@ -713,7 +723,7 @@ impl IntermediateHistogramBucketEntry {
|
||||
pub(crate) fn into_final_bucket_entry(
|
||||
self,
|
||||
req: &Aggregations,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<BucketEntry> {
|
||||
Ok(BucketEntry {
|
||||
key_as_string: None,
|
||||
@@ -748,7 +758,7 @@ impl IntermediateRangeBucketEntry {
|
||||
req: &Aggregations,
|
||||
_range_req: &RangeAggregation,
|
||||
column_type: Option<ColumnType>,
|
||||
limits: &AggregationLimits,
|
||||
limits: &mut AggregationLimitsGuard,
|
||||
) -> crate::Result<RangeBucketEntry> {
|
||||
let mut range_bucket_entry = RangeBucketEntry {
|
||||
key: self.key.into(),
|
||||
@@ -850,7 +860,7 @@ mod tests {
|
||||
}
|
||||
}
|
||||
|
||||
fn get_intermediat_tree_with_ranges(
|
||||
fn get_intermediate_tree_with_ranges(
|
||||
data: &[(String, u64, String, u64)],
|
||||
) -> IntermediateAggregationResults {
|
||||
let mut map = HashMap::new();
|
||||
@@ -886,18 +896,18 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_1() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("blue".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(tree_right).unwrap();
|
||||
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 55, "1900".to_string(), 80),
|
||||
]);
|
||||
@@ -907,18 +917,18 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_2() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
let tree_right = get_intermediat_tree_with_ranges(&[
|
||||
let tree_right = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 60, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
]);
|
||||
|
||||
tree_left.merge_fruits(tree_right).unwrap();
|
||||
|
||||
let tree_expected = get_intermediat_tree_with_ranges(&[
|
||||
let tree_expected = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 110, "1900".to_string(), 55),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
("green".to_string(), 25, "1900".to_string(), 50),
|
||||
@@ -929,7 +939,7 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn test_merge_fruits_tree_empty() {
|
||||
let mut tree_left = get_intermediat_tree_with_ranges(&[
|
||||
let mut tree_left = get_intermediate_tree_with_ranges(&[
|
||||
("red".to_string(), 50, "1900".to_string(), 25),
|
||||
("blue".to_string(), 30, "1900".to_string(), 30),
|
||||
]);
|
||||
|
||||
@@ -179,10 +179,11 @@ impl SegmentCardinalityCollector {
|
||||
Ok(())
|
||||
})?;
|
||||
if has_missing {
|
||||
// Replace missing with the actual value provided
|
||||
let missing_key = self
|
||||
.missing
|
||||
.as_ref()
|
||||
.expect("Found placeholder term_ord but `missing` is None");
|
||||
.expect("Found sentinel value u64::MAX for term_ord but `missing` is not set");
|
||||
match missing_key {
|
||||
Key::Str(missing) => {
|
||||
self.cardinality.sketch.insert_any(&missing);
|
||||
@@ -191,6 +192,12 @@ impl SegmentCardinalityCollector {
|
||||
let val = f64_to_u64(*val);
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
Key::U64(val) => {
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
Key::I64(val) => {
|
||||
self.cardinality.sketch.insert_any(&val);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -163,8 +163,8 @@ impl PartialEq for PercentilesCollector {
|
||||
}
|
||||
}
|
||||
|
||||
fn format_percentil(percentil: f64) -> String {
|
||||
let mut out = percentil.to_string();
|
||||
fn format_percentile(percentile: f64) -> String {
|
||||
let mut out = percentile.to_string();
|
||||
// Slightly silly way to format trailing decimals
|
||||
if !out.contains('.') {
|
||||
out.push_str(".0");
|
||||
@@ -197,7 +197,7 @@ impl PercentilesCollector {
|
||||
let values = if req.keyed {
|
||||
PercentileValues::HashMap(
|
||||
iter_quantile_and_values
|
||||
.map(|(val, quantil)| (format_percentil(val), quantil))
|
||||
.map(|(val, quantil)| (format_percentile(val), quantil))
|
||||
.collect(),
|
||||
)
|
||||
} else {
|
||||
|
||||
@@ -139,7 +139,7 @@ impl<'de> Deserialize<'de> for KeyOrder {
}
}

// Tranform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
// Transform a glob (`pattern*`, for example) into a regex::Regex (`^pattern.*$`)
fn globbed_string_to_regex(glob: &str) -> Result<Regex, crate::TantivyError> {
// Replace `*` glob with `.*` regex
let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));

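For illustration, a standalone sketch of the glob-to-regex translation touched by the hunk above, using the `regex` crate directly; the simplified error type is an assumption, not tantivy's:

```rust
use regex::Regex;

/// Turn a `*` glob like `pattern*` into an anchored regex `^pattern.*$`.
fn globbed_string_to_regex(glob: &str) -> Result<Regex, regex::Error> {
    // Escape everything, then re-introduce `*` as the regex wildcard `.*`.
    let sanitized = format!("^{}$", regex::escape(glob).replace(r"\*", ".*"));
    Regex::new(&sanitized)
}

fn main() {
    let re = globbed_string_to_regex("sub_agg*").unwrap();
    assert!(re.is_match("sub_agg_avg"));
    assert!(!re.is_match("other_agg"));
}
```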
@@ -148,7 +148,7 @@ mod agg_tests;
|
||||
|
||||
use core::fmt;
|
||||
|
||||
pub use agg_limits::AggregationLimits;
|
||||
pub use agg_limits::AggregationLimitsGuard;
|
||||
pub use collector::{
|
||||
AggregationCollector, AggregationSegmentCollector, DistributedAggregationCollector,
|
||||
DEFAULT_BUCKET_LIMIT,
|
||||
@@ -180,7 +180,7 @@ pub(crate) fn deserialize_option_f64<'de, D>(deserializer: D) -> Result<Option<f
|
||||
where D: Deserializer<'de> {
|
||||
struct StringOrFloatVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for StringOrFloatVisitor {
|
||||
impl Visitor<'_> for StringOrFloatVisitor {
|
||||
type Value = Option<f64>;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
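The hunks above only drop the now-unnecessary `'de` parameter from the visitor impls. A minimal sketch of the underlying pattern, a serde visitor that accepts either a JSON number or a numeric string as `f64`; the function name here is illustrative:

```rust
use std::fmt;

use serde::de::{self, Deserializer, Visitor};

struct StringOrFloatVisitor;

impl Visitor<'_> for StringOrFloatVisitor {
    type Value = f64;

    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
        formatter.write_str("a number or a string containing a number")
    }

    fn visit_f64<E: de::Error>(self, v: f64) -> Result<f64, E> {
        Ok(v)
    }

    fn visit_u64<E: de::Error>(self, v: u64) -> Result<f64, E> {
        Ok(v as f64)
    }

    fn visit_i64<E: de::Error>(self, v: i64) -> Result<f64, E> {
        Ok(v as f64)
    }

    fn visit_str<E: de::Error>(self, v: &str) -> Result<f64, E> {
        v.parse::<f64>().map_err(de::Error::custom)
    }
}

fn deserialize_f64_lenient<'de, D: Deserializer<'de>>(deserializer: D) -> Result<f64, D::Error> {
    deserializer.deserialize_any(StringOrFloatVisitor)
}
```

In practice such a helper is wired up with `#[serde(deserialize_with = "deserialize_f64_lenient")]` on the field that should accept both encodings.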
@@ -226,7 +226,7 @@ pub(crate) fn deserialize_f64<'de, D>(deserializer: D) -> Result<f64, D::Error>
|
||||
where D: Deserializer<'de> {
|
||||
struct StringOrFloatVisitor;
|
||||
|
||||
impl<'de> Visitor<'de> for StringOrFloatVisitor {
|
||||
impl Visitor<'_> for StringOrFloatVisitor {
|
||||
type Value = f64;
|
||||
|
||||
fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
|
||||
@@ -336,10 +336,16 @@ pub type SerializedKey = String;
|
||||
|
||||
#[derive(Clone, Debug, Serialize, Deserialize, PartialOrd)]
|
||||
/// The key to identify a bucket.
|
||||
///
|
||||
/// The order is important, with serde untagged, that we try to deserialize into i64 first.
|
||||
#[serde(untagged)]
|
||||
pub enum Key {
|
||||
/// String key
|
||||
Str(String),
|
||||
/// `i64` key
|
||||
I64(i64),
|
||||
/// `u64` key
|
||||
U64(u64),
|
||||
/// `f64` key
|
||||
F64(f64),
|
||||
}
|
||||
@@ -350,6 +356,8 @@ impl std::hash::Hash for Key {
|
||||
match self {
|
||||
Key::Str(text) => text.hash(state),
|
||||
Key::F64(val) => val.to_bits().hash(state),
|
||||
Key::U64(val) => val.hash(state),
|
||||
Key::I64(val) => val.hash(state),
|
||||
}
|
||||
}
|
||||
}
|
||||
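The new `U64`/`I64` arms hash the integers directly, while `F64` keys keep hashing their raw bit pattern. A small standalone sketch of why `to_bits` is the usual hash key for floats (standard-library behavior only, not tantivy code):

```rust
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

fn hash_f64_bits(val: f64) -> u64 {
    let mut hasher = DefaultHasher::new();
    // f64 does not implement Hash; hashing the raw bit pattern gives a stable key.
    val.to_bits().hash(&mut hasher);
    hasher.finish()
}

fn main() {
    assert_eq!(hash_f64_bits(1.5), hash_f64_bits(1.5));
    // Caveat of bit-pattern hashing: 0.0 and -0.0 compare equal as floats
    // but have different bit patterns, so they hash to different buckets.
    assert_ne!(hash_f64_bits(0.0), hash_f64_bits(-0.0));
}
```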
@@ -369,6 +377,8 @@ impl Display for Key {
|
||||
match self {
|
||||
Key::Str(val) => f.write_str(val),
|
||||
Key::F64(val) => f.write_str(&val.to_string()),
|
||||
Key::U64(val) => f.write_str(&val.to_string()),
|
||||
Key::I64(val) => f.write_str(&val.to_string()),
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -448,7 +458,7 @@ mod tests {
|
||||
agg_req: Aggregations,
|
||||
index: &Index,
|
||||
query: Option<(&str, &str)>,
|
||||
limits: AggregationLimits,
|
||||
limits: AggregationLimitsGuard,
|
||||
) -> crate::Result<Value> {
|
||||
let collector = AggregationCollector::from_aggs(agg_req, limits);
|
||||
|
||||
|
||||
@@ -5,7 +5,7 @@
|
||||
|
||||
use std::fmt::Debug;
|
||||
|
||||
pub(crate) use super::agg_limits::AggregationLimits;
|
||||
pub(crate) use super::agg_limits::AggregationLimitsGuard;
|
||||
use super::agg_req::AggregationVariants;
|
||||
use super::agg_req_with_accessor::{AggregationWithAccessor, AggregationsWithAccessor};
|
||||
use super::bucket::{SegmentHistogramCollector, SegmentRangeCollector, SegmentTermCollector};
|
||||
@@ -103,7 +103,7 @@ pub(crate) fn build_single_agg_segment_collector(
|
||||
Range(range_req) => Ok(Box::new(SegmentRangeCollector::from_req_and_validate(
|
||||
range_req,
|
||||
&mut req.sub_aggregation,
|
||||
&req.limits,
|
||||
&mut req.limits,
|
||||
req.field_type,
|
||||
accessor_idx,
|
||||
)?)),
|
||||
|
||||
@@ -13,7 +13,7 @@ struct Hit<'a> {
|
||||
facet: &'a Facet,
|
||||
}
|
||||
|
||||
impl<'a> Eq for Hit<'a> {}
|
||||
impl Eq for Hit<'_> {}
|
||||
|
||||
impl<'a> PartialEq<Hit<'a>> for Hit<'a> {
|
||||
fn eq(&self, other: &Hit<'_>) -> bool {
|
||||
@@ -27,7 +27,7 @@ impl<'a> PartialOrd<Hit<'a>> for Hit<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Ord for Hit<'a> {
|
||||
impl Ord for Hit<'_> {
|
||||
fn cmp(&self, other: &Self) -> Ordering {
|
||||
other
|
||||
.count
|
||||
|
||||
@@ -182,6 +182,7 @@ where
|
||||
}
|
||||
|
||||
/// A variant of the [`FilterCollector`] specialized for bytes fast fields, i.e.
|
||||
///
|
||||
/// it transparently wraps an inner [`Collector`] but filters documents
|
||||
/// based on the result of applying the predicate to the bytes fast field.
|
||||
///
|
||||
|
||||
@@ -495,4 +495,4 @@ where
|
||||
impl_downcast!(Fruit);
|
||||
|
||||
#[cfg(test)]
|
||||
pub mod tests;
|
||||
pub(crate) mod tests;
|
||||
|
||||
@@ -161,7 +161,7 @@ impl<TFruit: Fruit> FruitHandle<TFruit> {
|
||||
/// # Ok(())
|
||||
/// # }
|
||||
/// ```
|
||||
#[allow(clippy::type_complexity)]
|
||||
#[expect(clippy::type_complexity)]
|
||||
#[derive(Default)]
|
||||
pub struct MultiCollector<'a> {
|
||||
collector_wrappers: Vec<
|
||||
@@ -190,7 +190,7 @@ impl<'a> MultiCollector<'a> {
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Collector for MultiCollector<'a> {
|
||||
impl Collector for MultiCollector<'_> {
|
||||
type Fruit = MultiFruit;
|
||||
type Child = MultiCollectorChild;
|
||||
|
||||
|
||||
@@ -15,11 +15,6 @@ use crate::{DocAddress, DocId, SegmentOrdinal};
|
||||
/// The REVERSE_ORDER generic parameter controls whether the by-feature order
|
||||
/// should be reversed, which is useful for achieving for example largest-first
|
||||
/// semantics without having to wrap the feature in a `Reverse`.
|
||||
///
|
||||
/// WARNING: equality is not what you would expect here.
|
||||
/// Two elements are equal if their feature is equal, and regardless of whether `doc`
|
||||
/// is equal. This should be perfectly fine for this usage, but let's make sure this
|
||||
/// struct is never public.
|
||||
#[derive(Clone, Default, Serialize, Deserialize)]
|
||||
pub struct ComparableDoc<T, D, const REVERSE_ORDER: bool = false> {
|
||||
/// The feature of the document. In practice, this is
|
||||
|
||||
@@ -3,7 +3,6 @@ use std::marker::PhantomData;
|
||||
use std::sync::Arc;
|
||||
|
||||
use columnar::ColumnValues;
|
||||
use serde::de::DeserializeOwned;
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
use super::Collector;
|
||||
@@ -790,7 +789,7 @@ impl<Score, D, const R: bool> From<TopNComputerDeser<Score, D, R>> for TopNCompu
|
||||
impl<Score, D, const R: bool> TopNComputer<Score, D, R>
|
||||
where
|
||||
Score: PartialOrd + Clone,
|
||||
D: Serialize + DeserializeOwned + Ord + Clone,
|
||||
D: Ord,
|
||||
{
|
||||
/// Create a new `TopNComputer`.
|
||||
/// Internally it will allocate a buffer of size `2 * top_n`.
|
||||
|
||||
src/compat_tests.rs (new file, 91 lines)
@@ -0,0 +1,91 @@
use std::path::PathBuf;

use schema::*;

use crate::*;

fn create_index(path: &str) {
let mut schema_builder = Schema::builder();
let label = schema_builder.add_text_field("label", TEXT | STORED);
let date = schema_builder.add_date_field("date", INDEXED | STORED);
let schema = schema_builder.build();
std::fs::create_dir_all(path).unwrap();
let index = Index::create_in_dir(path, schema).unwrap();
let mut index_writer = index.writer_with_num_threads(1, 20_000_000).unwrap();
index_writer
.add_document(doc!(label => "dateformat", date => DateTime::from_timestamp_nanos(123456)))
.unwrap();
index_writer.commit().unwrap();
}

#[test]
/// Writes an Index for the current INDEX_FORMAT_VERSION to disk.
fn create_format() {
let version = INDEX_FORMAT_VERSION.to_string();
let file_path = path_for_version(&version);
if PathBuf::from(file_path.clone()).exists() {
return;
}
create_index(&file_path);
}

fn path_for_version(version: &str) -> String {
format!("./tests/compat_tests_data/index_v{}/", version)
}

/// feature flag quickwit uses a different dictionary type
#[test]
#[cfg(not(feature = "quickwit"))]
fn test_format_6() {
let path = path_for_version("6");

let index = Index::open_in_dir(path).expect("Failed to open index");
// dates are truncated to Microseconds in v6
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
}

/// feature flag quickwit uses a different dictionary type
#[test]
#[cfg(not(feature = "quickwit"))]
fn test_format_7() {
let path = path_for_version("7");

let index = Index::open_in_dir(path).expect("Failed to open index");
// dates are not truncated in v7 in the docstore
assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
}

#[cfg(not(feature = "quickwit"))]
fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
use collector::TopDocs;
let reader = index.reader().expect("Failed to create reader");
let searcher = reader.searcher();

let schema = index.schema();
let label_field = schema.get_field("label").expect("Field 'label' not found");
let query_parser = query::QueryParser::for_index(index, vec![label_field]);

let query = query_parser
.parse_query("dateformat")
.expect("Failed to parse query");
let top_docs = searcher
.search(&query, &TopDocs::with_limit(1))
.expect("Search failed");

assert_eq!(top_docs.len(), 1, "Expected 1 search result");

let doc_address = top_docs[0].1;
let retrieved_doc: TantivyDocument = searcher
.doc(doc_address)
.expect("Failed to retrieve document");

let date_field = schema.get_field("date").expect("Field 'date' not found");
let date_value = retrieved_doc
.get_first(date_field)
.expect("Date field not found in document")
.as_datetime()
.unwrap();

let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
assert_eq!(date_value, expected,);
}

@@ -100,7 +100,7 @@ impl Executor {
|
||||
|
||||
/// Spawn a task on the pool, returning a future completing on task success.
|
||||
///
|
||||
/// If the task panic, returns `Err(())`.
|
||||
/// If the task panics, returns `Err(())`.
|
||||
#[cfg(feature = "quickwit")]
|
||||
pub fn spawn_blocking<T: Send + 'static>(
|
||||
&self,
|
||||
|
||||
@@ -4,7 +4,7 @@ use rustc_hash::FxHashMap;
|
||||
|
||||
use crate::postings::{IndexingContext, IndexingPosition, PostingsWriter};
|
||||
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf, Value};
|
||||
use crate::schema::Type;
|
||||
use crate::schema::{Type, DATE_TIME_PRECISION_INDEXED};
|
||||
use crate::time::format_description::well_known::Rfc3339;
|
||||
use crate::time::{OffsetDateTime, UtcOffset};
|
||||
use crate::tokenizer::TextAnalyzer;
|
||||
@@ -71,7 +71,7 @@ pub fn json_path_sep_to_dot(path: &mut str) {
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[expect(clippy::too_many_arguments)]
|
||||
fn index_json_object<'a, V: Value<'a>>(
|
||||
doc: DocId,
|
||||
json_visitor: V::ObjectIter,
|
||||
@@ -101,7 +101,7 @@ fn index_json_object<'a, V: Value<'a>>(
|
||||
}
|
||||
}
|
||||
|
||||
#[allow(clippy::too_many_arguments)]
|
||||
#[expect(clippy::too_many_arguments)]
|
||||
pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
doc: DocId,
|
||||
json_value: V,
|
||||
@@ -189,6 +189,7 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
ctx.path_to_unordered_id
|
||||
.get_or_allocate_unordered_id(json_path_writer.as_str()),
|
||||
);
|
||||
let val = val.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||
term_buffer.append_type_and_fast_value(val);
|
||||
postings_writer.subscribe(doc, 0u32, term_buffer, ctx);
|
||||
}
|
||||
@@ -239,7 +240,11 @@ pub(crate) fn index_json_value<'a, V: Value<'a>>(
|
||||
/// Tries to infer a JSON type from a string and append it to the term.
|
||||
///
|
||||
/// The term must be json + JSON path.
|
||||
pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &str) -> Option<Term> {
|
||||
pub fn convert_to_fast_value_and_append_to_json_term(
|
||||
mut term: Term,
|
||||
phrase: &str,
|
||||
truncate_date_for_search: bool,
|
||||
) -> Option<Term> {
|
||||
assert_eq!(
|
||||
term.value()
|
||||
.as_json_value_bytes()
|
||||
@@ -250,8 +255,11 @@ pub fn convert_to_fast_value_and_append_to_json_term(mut term: Term, phrase: &st
|
||||
"JSON value bytes should be empty"
|
||||
);
|
||||
if let Ok(dt) = OffsetDateTime::parse(phrase, &Rfc3339) {
|
||||
let dt_utc = dt.to_offset(UtcOffset::UTC);
|
||||
term.append_type_and_fast_value(DateTime::from_utc(dt_utc));
|
||||
let mut dt = DateTime::from_utc(dt.to_offset(UtcOffset::UTC));
|
||||
if truncate_date_for_search {
|
||||
dt = dt.truncate(DATE_TIME_PRECISION_INDEXED);
|
||||
}
|
||||
term.append_type_and_fast_value(dt);
|
||||
return Some(term);
|
||||
}
|
||||
if let Ok(i64_val) = str::parse::<i64>(phrase) {
|
||||
|
||||
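The change above truncates RFC 3339 dates to the precision used for indexed date terms before the fast value is appended to the JSON term. A self-contained sketch of that truncation step using only the `time` crate; the millisecond precision here is an illustrative assumption, not necessarily the value of `DATE_TIME_PRECISION_INDEXED`:

```rust
use time::format_description::well_known::Rfc3339;
use time::OffsetDateTime;

/// Truncate a timestamp (in nanoseconds since epoch) to millisecond precision.
fn truncate_to_millis(timestamp_nanos: i128) -> i128 {
    (timestamp_nanos / 1_000_000) * 1_000_000
}

fn main() {
    let dt = OffsetDateTime::parse("2024-05-01T12:34:56.789123456Z", &Rfc3339).unwrap();
    let truncated = truncate_to_millis(dt.unix_timestamp_nanos());
    // The sub-millisecond digits (123_456 ns) are dropped before term construction.
    assert_eq!(truncated, dt.unix_timestamp_nanos() - 123_456);
}
```

Making the truncation optional (`truncate_date_for_search`) matters because a query-side term must be built with exactly the same precision as the indexed term, or an exact date match can silently fail.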
@@ -39,7 +39,7 @@ impl RetryPolicy {
|
||||
/// The `DirectoryLock` is an object that represents a file lock.
|
||||
///
|
||||
/// It is associated with a lock file, that gets deleted on `Drop.`
|
||||
#[allow(dead_code)]
|
||||
#[expect(dead_code)]
|
||||
pub struct DirectoryLock(Box<dyn Send + Sync + 'static>);
|
||||
|
||||
struct DirectoryLockGuard {
|
||||
@@ -102,10 +102,8 @@ fn retry_policy(is_blocking: bool) -> RetryPolicy {
|
||||
///
|
||||
/// There are currently two implementations of `Directory`
|
||||
///
|
||||
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this
|
||||
/// should be your default choice.
|
||||
/// - The [`RamDirectory`][crate::directory::RamDirectory], which
|
||||
/// should be used mostly for tests.
|
||||
/// - The [`MMapDirectory`][crate::directory::MmapDirectory], this should be your default choice.
|
||||
/// - The [`RamDirectory`][crate::directory::RamDirectory], which should be used mostly for tests.
|
||||
pub trait Directory: DirectoryClone + fmt::Debug + Send + Sync + 'static {
|
||||
/// Opens a file and returns a boxed `FileHandle`.
|
||||
///
|
||||
|
||||
@@ -48,6 +48,7 @@ pub static INDEX_WRITER_LOCK: Lazy<Lock> = Lazy::new(|| Lock {
});
/// The meta lock file is here to protect the segment files being opened by
/// `IndexReader::reload()` from being garbage collected.
///
/// It makes it possible for another process to safely consume
/// our index in-writing. Ideally, we may have preferred `RWLock` semantics
/// here, but it is difficult to achieve on Windows.

@@ -244,7 +244,7 @@ impl MmapDirectory {
directory_path,
)));
}
#[allow(clippy::bind_instead_of_map)]
#[expect(clippy::bind_instead_of_map)]
let canonical_path: PathBuf = directory_path.canonicalize().or_else(|io_err| {
let directory_path = directory_path.to_owned();

@@ -32,7 +32,7 @@ pub struct WatchCallbackList {
/// file change is detected.
#[must_use = "This `WatchHandle` controls the lifetime of the watch and should therefore be used."]
#[derive(Clone)]
#[allow(dead_code)]
#[expect(dead_code)]
pub struct WatchHandle(Arc<WatchCallback>);

impl WatchHandle {

@@ -117,7 +117,7 @@ pub trait DocSet: Send {
}
}

impl<'a> DocSet for &'a mut dyn DocSet {
impl DocSet for &mut dyn DocSet {
fn advance(&mut self) -> u32 {
(**self).advance()
}

@@ -25,10 +25,9 @@ impl FacetReader {
/// Creates a new `FacetReader`.
///
/// A facet reader just wraps :
/// - a `MultiValuedFastFieldReader` that makes it possible to
/// access the list of facet ords for a given document.
/// - a `TermDictionary` that helps associating a facet to
/// an ordinal and vice versa.
/// - a `MultiValuedFastFieldReader` that makes it possible to access the list of facet ords for
/// a given document.
/// - a `TermDictionary` that helps associating a facet to an ordinal and vice versa.
pub fn new(facet_column: StrColumn) -> FacetReader {
FacetReader { facet_column }
}

@@ -942,10 +942,10 @@ mod tests {

let numbers = [100, 200, 300];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
assert_eq!(vec.len(), expected_count);
};
test_range(50..=50);
test_range(150..=150);

@@ -1020,10 +1020,10 @@ mod tests {

let numbers = [1000, 1001, 1003];
let test_range = |range: RangeInclusive<u64>| {
let expexted_count = numbers.iter().filter(|num| range.contains(num)).count();
let expected_count = numbers.iter().filter(|num| range.contains(num)).count();
let mut vec = vec![];
field.get_row_ids_for_value_range(range, 0..u32::MAX, &mut vec);
assert_eq!(vec.len(), expexted_count);
assert_eq!(vec.len(), expected_count);
};
let test_range_variant = |start, stop| {
let start_range = start..=stop;

@@ -70,13 +70,13 @@ impl FastFieldReaders {
///
/// This function transforms `attributes.color` into a column key to be used in the `columnar`.
///
/// The logic works as follows, first we identify which field is targetted by calling
/// The logic works as follows, first we identify which field is targeted by calling
/// `schema.find_field(..)`. This method will attempt to split the user splied fast field
/// name by non-escaped dots, and find the longest matching schema field name.
/// In our case, it would return the (attribute_field, "color").
///
/// If no field is found, but a dynamic field is supplied, then we
/// will simply assuem the user is targetting the dynamic field. (This feature is used in
/// will simply assume the user is targeting the dynamic field. (This feature is used in
/// Quickwit.)
///
/// We then encode the `(field, path)` into the right `columnar_key`.

@@ -149,7 +149,7 @@ impl FieldNormReader {
}

#[cfg(test)]
pub fn for_test(field_norms: &[u32]) -> FieldNormReader {
pub(crate) fn for_test(field_norms: &[u32]) -> FieldNormReader {
let field_norms_id = field_norms
.iter()
.cloned()

@@ -1,12 +1,9 @@
#![allow(deprecated)] // Remove with index sorting

use std::collections::HashSet;

use rand::{thread_rng, Rng};

use crate::indexer::index_writer::MEMORY_BUDGET_NUM_BYTES_MIN;
use crate::schema::*;
#[allow(deprecated)]
use crate::{doc, schema, Index, IndexWriter, Searcher};

fn check_index_content(searcher: &Searcher, vals: &[u64]) -> crate::Result<()> {

@@ -11,8 +11,8 @@ use crate::TantivyError;
/// progress. Dropping the `FutureResult` does not cancel the task being executed
/// either.
///
/// - In a sync context, you can call `FutureResult::wait()`. The function
/// does not rely on `block_on`.
/// - In a sync context, you can call `FutureResult::wait()`. The function does not rely on
/// `block_on`.
/// - In an async context, you can call simply use `FutureResult` as a future.
pub struct FutureResult<T> {
inner: Inner<T>,
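For context, a minimal sketch of the synchronous path described in the `FutureResult` doc comment above, assuming `IndexWriter::garbage_collect_files()` still returns a `FutureResult` as in current releases:

```rust
use tantivy::{IndexWriter, TantivyDocument};

// In a sync context, FutureResult::wait() blocks until the background task completes,
// without requiring an async runtime. In an async context the same value can be awaited.
fn collect_garbage_blocking(writer: &IndexWriter<TantivyDocument>) -> tantivy::Result<()> {
    let _report = writer.garbage_collect_files().wait()?;
    Ok(())
}
```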
@@ -49,10 +49,8 @@ fn load_metas(
/// Save the index meta file.
/// This operation is atomic :
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written
/// and flushed.
/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
/// - it succeeds, and `meta.json` is written and flushed.
///
/// This method is not part of tantivy's public API
fn save_new_metas(

@@ -529,12 +527,12 @@ impl Index {
/// `IndexWriter` on the system is accessing the index directory,
/// it is safe to manually delete the lockfile.
///
/// - `num_threads` defines the number of indexing workers that
/// should work at the same time.
/// - `num_threads` defines the number of indexing workers that should work at the same time.
///
/// - `overall_memory_budget_in_bytes` sets the amount of memory
/// allocated for all indexing thread.
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
/// - `overall_memory_budget_in_bytes` sets the amount of memory allocated for all indexing
/// thread.
///
/// Each thread will receive a budget of `overall_memory_budget_in_bytes / num_threads`.
///
/// # Errors
/// If the lockfile already exists, returns `Error::DirectoryLockBusy` or an `Error::IoError`.
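A minimal sketch of the two parameters described in the doc comment above (the field name and budget are illustrative): four indexing threads sharing a 240 MB budget give each thread roughly 60 MB.

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    // 4 worker threads sharing a 240 MB budget: each thread gets ~60 MB.
    let mut writer: IndexWriter = index.writer_with_num_threads(4, 240_000_000)?;
    writer.add_document(doc!(title => "Frankenstein; or, The Modern Prometheus"))?;
    writer.commit()?;
    Ok(())
}
```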
@@ -176,10 +176,7 @@ impl SegmentMeta {
}

/// Updates the max_doc value from the `SegmentMeta`.
///
/// This method is only used when updating `max_doc` from 0
/// as we finalize a fresh new segment.
pub(crate) fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
pub fn with_max_doc(self, max_doc: u32) -> SegmentMeta {
assert_eq!(self.tracked.max_doc, 0);
assert!(self.tracked.deletes.is_none());
let tracked = self.tracked.map(move |inner_meta| InnerSegmentMeta {

@@ -31,7 +31,6 @@ pub struct InvertedIndexReader {
}

impl InvertedIndexReader {
#[allow(clippy::needless_pass_by_value)] // for symmetry
pub(crate) fn new(
termdict: TermDictionary,
postings_file_slice: FileSlice,

@@ -71,7 +70,7 @@ impl InvertedIndexReader {
&self.termdict
}

/// Return the fields and types encoded in the dictionary in lexicographic oder.
/// Return the fields and types encoded in the dictionary in lexicographic order.
/// Only valid on JSON fields.
///
/// Notice: This requires a full scan and therefore **very expensive**.

@@ -205,16 +204,6 @@ impl InvertedIndexReader {
.transpose()
}

pub(crate) fn read_postings_no_deletes(
&self,
term: &Term,
option: IndexRecordOption,
) -> io::Result<Option<SegmentPostings>> {
self.get_term_info(term)?
.map(|term_info| self.read_postings_from_terminfo(&term_info, option))
.transpose()
}

/// Returns the number of documents containing the term.
pub fn doc_freq(&self, term: &Term) -> io::Result<u32> {
Ok(self

@@ -1,6 +1,7 @@
use std::slice;

/// Enum describing each component of a tantivy segment.
///
/// Each component is stored in its own file,
/// using the pattern `segment_uuid`.`component_extension`,
/// except the delete component that takes an `segment_uuid`.`delete_opstamp`.`component_extension`

@@ -358,7 +358,7 @@ impl SegmentReader {
.map(|(mut field_name, handle)| {
json_path_sep_to_dot(&mut field_name);
// map to canonical path, to avoid similar but different entries.
// Eventually we should just accept '.' seperated for all cases.
// Eventually we should just accept '.' separated for all cases.
let field_name = map_to_canonical
.get(&field_name)
.unwrap_or(&field_name)

@@ -478,7 +478,7 @@ pub fn merge_field_meta_data(
.into_iter()
.kmerge_by(|left, right| left < right)
// TODO: Remove allocation
.group_by(|el| (el.field_name.to_string(), el.typ))
.chunk_by(|el| (el.field_name.to_string(), el.typ))
{
let mut merged: FieldMetadata = group.next().unwrap();
for el in group {

@@ -179,8 +179,7 @@ impl DeleteCursor {
/// Skips operations and position it so that
/// - either all of the delete operation currently in the queue are consume and the next get
/// will return `None`.
/// - the next get will return the first operation with an
/// `opstamp >= target_opstamp`.
/// - the next get will return the first operation with an `opstamp >= target_opstamp`.
pub fn skip_to(&mut self, target_opstamp: Opstamp) {
// TODO Can be optimize as we work with block.
while self.is_behind_opstamp(target_opstamp) {

@@ -188,7 +187,6 @@ impl DeleteCursor {
}
}

#[allow(clippy::wrong_self_convention)]
fn is_behind_opstamp(&mut self, target_opstamp: Opstamp) -> bool {
self.get()
.map(|operation| operation.opstamp < target_opstamp)

@@ -21,7 +21,7 @@ pub enum DocToOpstampMapping<'a> {
None,
}

impl<'a> DocToOpstampMapping<'a> {
impl DocToOpstampMapping<'_> {
/// Assess whether a document should be considered deleted given that it contains
/// a deleted term that was deleted at the opstamp: `delete_opstamp`.
///

@@ -482,7 +482,7 @@ impl<D: Document> IndexWriter<D> {
/// let index = Index::create_in_ram(schema.clone());
///
/// let mut index_writer = index.writer_with_num_threads(1, 50_000_000)?;
/// index_writer.add_document(doc!(title => "The modern Promotheus"))?;
/// index_writer.add_document(doc!(title => "The modern Prometheus"))?;
/// index_writer.commit()?;
///
/// let clear_res = index_writer.delete_all_documents().unwrap();

@@ -491,7 +491,7 @@ impl<D: Document> IndexWriter<D> {
///
/// let searcher = index.reader()?.searcher();
/// let query_parser = QueryParser::for_index(&index, vec![title]);
/// let query_promo = query_parser.parse_query("Promotheus")?;
/// let query_promo = query_parser.parse_query("Prometheus")?;
/// let top_docs_promo = searcher.search(&query_promo, &TopDocs::with_limit(1))?;
///
/// assert!(top_docs_promo.is_empty());

@@ -2093,7 +2093,7 @@ mod tests {
//
// Take half as sample
let mut sample: Vec<_> = expected_ids_and_num_occurrences.iter().collect();
sample.sort_by_key(|(k, _num_occurences)| *k);
sample.sort_by_key(|(k, _num_occurrences)| *k);
// sample.truncate(sample.len() / 2);
if !sample.is_empty() {
let (left_sample, right_sample) = sample.split_at(sample.len() / 2);

@@ -2102,7 +2102,7 @@ mod tests {
sample
.iter()
.filter(|(id, _)| id_is_full_doc(**id))
.map(|(_id, num_occurences)| **num_occurences)
.map(|(_id, num_occurrences)| **num_occurrences)
.sum::<u64>()
};
fn gen_query_inclusive<T1: ToString, T2: ToString>(

@@ -104,7 +104,7 @@ impl MergePolicy for LogMergePolicy {

let mut current_max_log_size = f64::MAX;
let mut levels = vec![];
for (_, merge_group) in &size_sorted_segments.into_iter().group_by(|segment| {
for (_, merge_group) in &size_sorted_segments.into_iter().chunk_by(|segment| {
let segment_log_size = f64::from(self.clip_min_size(segment.num_docs())).log2();
if segment_log_size < (current_max_log_size - self.level_log_size) {
// update current_max_log_size to create a new group
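The `group_by` to `chunk_by` renames in the hunks above track itertools' rename of this adaptor. A small, self-contained sketch of the adaptor itself (unrelated to tantivy's internal types), assuming itertools 0.13 or later:

```rust
use itertools::Itertools;

fn main() {
    let values = [1, 1, 2, 2, 2, 3];
    // `chunk_by` is the new name of itertools' `group_by`: it groups *consecutive*
    // elements sharing the same key, which is why tantivy sorts the input first.
    for (key, chunk) in &values.iter().chunk_by(|&&v| v) {
        println!("{key}: {} element(s)", chunk.count());
    }
}
```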
@@ -29,8 +29,8 @@ impl MergeOperationInventory {

/// A `MergeOperation` has two roles.
/// It carries all of the information required to describe a merge:
/// - `target_opstamp` is the opstamp up to which we want to consume the
/// delete queue and reflect their deletes.
/// - `target_opstamp` is the opstamp up to which we want to consume the delete queue and reflect
/// their deletes.
/// - `segment_ids` is the list of segment to be merged.
///
/// The second role is to ensure keep track of the fact that these

@@ -36,7 +36,7 @@ impl MergePolicy for NoMergePolicy {
}

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {

use super::*;

@@ -673,7 +673,7 @@ mod tests {
]
);
assert_eq!(
get_doc_ids(vec![Term::from_field_date(
get_doc_ids(vec![Term::from_field_date_for_search(
date_field,
DateTime::from_utc(curr_time)
)])?,

@@ -43,7 +43,7 @@ impl PathToUnorderedId {
next_id
}

/// Retuns ids which reflect the lexical order of the paths.
/// Returns ids which reflect the lexical order of the paths.
///
/// The returned vec can be indexed with the unordered id to get the ordered id.
pub(crate) fn unordered_id_to_ordered_id(&self) -> Vec<OrderedPathId> {

@@ -57,7 +57,7 @@ impl PathToUnorderedId {
result
}

/// Retuns the paths so they can be queried by the ordered id (which is the index).
/// Returns the paths so they can be queried by the ordered id (which is the index).
pub(crate) fn ordered_id_to_path(&self) -> Vec<&str> {
let mut paths = self.map.keys().map(String::as_str).collect::<Vec<_>>();
paths.sort_unstable();

@@ -10,12 +10,9 @@ use crate::indexer::delete_queue::DeleteCursor;
///
/// In addition to segment `meta`,
/// it contains a few transient states
/// - `alive_bitset` is a bitset describing
/// documents that were alive during the commit
/// itself.
/// - `delete_cursor` is the position in the delete queue.
/// Deletes happening before the cursor are reflected either
/// in the .del file or in the `alive_bitset`.
/// - `alive_bitset` is a bitset describing documents that were alive during the commit itself.
/// - `delete_cursor` is the position in the delete queue. Deletes happening before the cursor are
/// reflected either in the .del file or in the `alive_bitset`.
#[derive(Clone)]
pub struct SegmentEntry {
meta: SegmentMeta,

@@ -30,10 +30,8 @@ const NUM_MERGE_THREADS: usize = 4;
/// Save the index meta file.
/// This operation is atomic:
/// Either
/// - it fails, in which case an error is returned,
/// and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written
/// and flushed.
/// - it fails, in which case an error is returned, and the `meta.json` remains untouched,
/// - it success, and `meta.json` is written and flushed.
///
/// This method is not part of tantivy's public API
pub(crate) fn save_metas(metas: &IndexMeta, directory: &dyn Directory) -> crate::Result<()> {

@@ -379,7 +377,7 @@ impl SegmentUpdater {
if self.is_alive() {
let index = &self.index;
let directory = index.directory();
let mut commited_segment_metas = self.segment_manager.committed_segment_metas();
let mut committed_segment_metas = self.segment_manager.committed_segment_metas();

// We sort segment_readers by number of documents.
// This is an heuristic to make multithreading more efficient.

@@ -394,10 +392,10 @@ impl SegmentUpdater {
// from the different drives.
//
// Segment 1 from disk 1, Segment 1 from disk 2, etc.
commited_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
committed_segment_metas.sort_by_key(|segment_meta| -(segment_meta.max_doc() as i32));
let index_meta = IndexMeta {
index_settings: index.settings().clone(),
segments: commited_segment_metas,
segments: committed_segment_metas,
schema: index.schema(),
opstamp,
payload: commit_message,

@@ -542,7 +540,13 @@ impl SegmentUpdater {
}

fn consider_merge_options(&self) {
let (committed_segments, uncommitted_segments) = self.get_mergeable_segments();
let (mut committed_segments, mut uncommitted_segments) = self.get_mergeable_segments();
if committed_segments.len() == 1 && committed_segments[0].num_deleted_docs() == 0 {
committed_segments.clear();
}
if uncommitted_segments.len() == 1 && uncommitted_segments[0].num_deleted_docs() == 0 {
uncommitted_segments.clear();
}

// Committed segments cannot be merged with uncommitted_segments.
// We therefore consider merges using these two sets of segments independently.

@@ -64,9 +64,9 @@ impl SegmentWriter {
///
/// The arguments are defined as follows
///
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders)
/// is stored in a memory arena. This makes it possible for the user to define
/// the flushing behavior as a memory limit.
/// - memory_budget: most of the segment writer data (terms, and postings lists recorders) is
/// stored in a memory arena. This makes it possible for the user to define the flushing
/// behavior as a memory limit.
/// - segment: The segment being written
/// - schema
pub fn for_segment(memory_budget_in_bytes: usize, segment: Segment) -> crate::Result<Self> {

@@ -150,7 +150,7 @@ impl SegmentWriter {
let vals_grouped_by_field = doc
.iter_fields_and_values()
.sorted_by_key(|(field, _)| *field)
.group_by(|(field, _)| *field);
.chunk_by(|(field, _)| *field);

for (field, field_values) in &vals_grouped_by_field {
let values = field_values.map(|el| el.1);

@@ -431,7 +431,7 @@ mod tests {
use crate::query::{PhraseQuery, QueryParser};
use crate::schema::{
Document, IndexRecordOption, OwnedValue, Schema, TextFieldIndexing, TextOptions, Value,
STORED, STRING, TEXT,
DATE_TIME_PRECISION_INDEXED, STORED, STRING, TEXT,
};
use crate::store::{Compressor, StoreReader, StoreWriter};
use crate::time::format_description::well_known::Rfc3339;

@@ -651,7 +651,8 @@ mod tests {
set_fast_val(
DateTime::from_utc(
OffsetDateTime::parse("1985-04-12T23:20:50.52Z", &Rfc3339).unwrap(),
),
)
.truncate(DATE_TIME_PRECISION_INDEXED),
term
)
.serialized_value_bytes()

@@ -101,7 +101,7 @@ mod test {

use super::Stamper;

#[allow(clippy::redundant_clone)]
#[expect(clippy::redundant_clone)]
#[test]
fn test_stamper() {
let stamper = Stamper::new(7u64);

@@ -117,7 +117,7 @@ mod test {
assert_eq!(stamper.stamp(), 15u64);
}

#[allow(clippy::redundant_clone)]
#[expect(clippy::redundant_clone)]
#[test]
fn test_stamper_revert() {
let stamper = Stamper::new(7u64);

22
src/lib.rs

@@ -125,8 +125,8 @@
//!
//! - **Searching**: [Searcher] searches the segments with anything that implements
//! [Query](query::Query) and merges the results. The list of [supported
//! queries](query::Query#implementors). Custom Queries are supported by implementing the
//! [Query](query::Query) trait.
//! queries](query::Query#implementors). Custom Queries are supported by implementing the
//! [Query](query::Query) trait.
//!
//! - **[Directory](directory)**: Abstraction over the storage where the index data is stored.
//!

@@ -178,10 +178,8 @@ pub use crate::future_result::FutureResult;
pub type Result<T> = std::result::Result<T, TantivyError>;

mod core;
#[allow(deprecated)] // Remove with index sorting
pub mod indexer;

#[allow(unused_doc_comments)]
pub mod error;
pub mod tokenizer;

@@ -190,7 +188,6 @@ pub mod collector;
pub mod directory;
pub mod fastfield;
pub mod fieldnorm;
#[allow(deprecated)] // Remove with index sorting
pub mod index;
pub mod positions;
pub mod postings;

@@ -202,12 +199,15 @@ pub mod space_usage;
pub mod store;
pub mod termdict;

mod docset;
mod reader;

#[cfg(test)]
mod compat_tests;

pub use self::reader::{IndexReader, IndexReaderBuilder, ReloadPolicy, Warmer};
pub mod snippet;

mod docset;
use std::fmt;

pub use census::{Inventory, TrackedObject};

@@ -220,7 +220,6 @@ pub use self::docset::{DocSet, COLLECT_BLOCK_BUFFER_LEN, TERMINATED};
pub use crate::core::json_utils;
pub use crate::core::{Executor, Searcher, SearcherGeneration};
pub use crate::directory::Directory;
#[allow(deprecated)] // Remove with index sorting
pub use crate::index::{
Index, IndexBuilder, IndexMeta, IndexSettings, InvertedIndexReader, Order, Segment,
SegmentMeta, SegmentReader,

@@ -229,9 +228,9 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
pub use crate::schema::{Document, TantivyDocument, Term};

/// Index format version.
const INDEX_FORMAT_VERSION: u32 = 6;
pub const INDEX_FORMAT_VERSION: u32 = 7;
/// Oldest index format version this tantivy version can read.
const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;

/// Structure version for the index.
#[derive(Clone, PartialEq, Eq, Serialize, Deserialize)]

@@ -368,6 +367,7 @@ macro_rules! fail_point {
}};
}

/// Common test utilities.
#[cfg(test)]
pub mod tests {
use common::{BinarySerializable, FixedSize};

@@ -386,6 +386,7 @@ pub mod tests {
use crate::schema::*;
use crate::{DateTime, DocAddress, Index, IndexWriter, ReloadPolicy};

/// Asserts that the serialized value is the value in the trait.
pub fn fixed_size_test<O: BinarySerializable + FixedSize + Default>() {
let mut buffer = Vec::new();
O::default().serialize(&mut buffer).unwrap();

@@ -418,6 +419,7 @@ pub mod tests {
}};
}

/// Generates random numbers
pub fn generate_nonunique_unsorted(max_value: u32, n_elems: usize) -> Vec<u32> {
let seed: [u8; 32] = [1; 32];
StdRng::from_seed(seed)

@@ -426,6 +428,7 @@ pub mod tests {
.collect::<Vec<u32>>()
}

/// Sample `n` elements with Bernoulli distribution.
pub fn sample_with_seed(n: u32, ratio: f64, seed_val: u8) -> Vec<u32> {
StdRng::from_seed([seed_val; 32])
.sample_iter(&Bernoulli::new(ratio).unwrap())

@@ -435,6 +438,7 @@ pub mod tests {
.collect()
}

/// Sample `n` elements with Bernoulli distribution.
pub fn sample(n: u32, ratio: f64) -> Vec<u32> {
sample_with_seed(n, ratio, 4)
}

@@ -41,7 +41,6 @@
/// );
/// # }
/// ```

#[macro_export]
macro_rules! doc(
() => {
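For reference, a small sketch of using the `doc!` macro whose definition starts in the hunk above (the field name is illustrative):

```rust
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, TantivyDocument};

fn main() {
    let mut schema_builder = Schema::builder();
    let title = schema_builder.add_text_field("title", TEXT);
    let _schema = schema_builder.build();

    // Expands to a TantivyDocument holding the listed (field => value) pairs.
    let _document: TantivyDocument = doc!(title => "The Diary of Muadib");
}
```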
@@ -1,4 +1,5 @@
//! Tantivy can (if instructed to do so in the schema) store the term positions in a given field.
//!
//! This position is expressed as token ordinal. For instance,
//! In "The beauty and the beast", the term "the" appears in position 0 and position 3.
//! This information is useful to run phrase queries.

@@ -38,7 +39,7 @@ pub use self::serializer::PositionSerializer;
const COMPRESSION_BLOCK_SIZE: usize = BitPacker4x::BLOCK_LEN;

#[cfg(test)]
pub mod tests {
pub(crate) mod tests {

use std::iter;
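Since the positions module doc above mentions phrase queries, here is a minimal, hypothetical sketch of a phrase search that relies on stored positions (the field name is illustrative; the `TEXT` option records positions by default):

```rust
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::{Schema, TEXT};
use tantivy::{doc, Index, IndexWriter};

fn main() -> tantivy::Result<()> {
    let mut schema_builder = Schema::builder();
    let body = schema_builder.add_text_field("body", TEXT);
    let index = Index::create_in_ram(schema_builder.build());

    let mut writer: IndexWriter = index.writer_with_num_threads(1, 50_000_000)?;
    writer.add_document(doc!(body => "The beauty and the beast"))?;
    writer.commit()?;

    // A quoted query is parsed as a phrase query; it needs the positions recorded at indexing time.
    let searcher = index.reader()?.searcher();
    let query = QueryParser::for_index(&index, vec![body]).parse_query("\"the beast\"")?;
    let hits = searcher.search(&query, &TopDocs::with_limit(1))?;
    assert_eq!(hits.len(), 1);
    Ok(())
}
```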
@@ -18,7 +18,7 @@ use crate::postings::compression::COMPRESSION_BLOCK_SIZE;
/// # Assumption
///
/// - The block is sorted. Some elements may appear several times. This is the case at the
/// end of the last block for instance.
/// end of the last block for instance.
/// - The target is assumed smaller or equal to the last element of the block.
pub fn branchless_binary_search(arr: &[u32; COMPRESSION_BLOCK_SIZE], target: u32) -> usize {
let mut start = 0;
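To illustrate the idea behind this hunk, here is a generic, hypothetical sketch of a branchless binary search over a sorted, power-of-two-sized block (not the exact tantivy implementation): every round halves the window, and the single comparison is one the compiler can typically turn into a conditional move.

```rust
/// Returns the index of the first element >= `target`, assuming the block is sorted
/// and `target` is smaller or equal to the last element (as in the doc comment above).
fn branchless_search_sketch(block: &[u32; 128], target: u32) -> usize {
    let mut start = 0usize;
    let mut len = block.len();
    while len > 1 {
        len /= 2;
        // Data-independent control flow: the same number of iterations runs for every
        // target, and this comparison usually compiles down to a conditional move.
        if block[start + len - 1] < target {
            start += len;
        }
    }
    start
}

fn main() {
    let mut block = [0u32; 128];
    for (i, slot) in block.iter_mut().enumerate() {
        *slot = (i as u32) * 2;
    }
    // First element >= 10 is block[5] == 10.
    assert_eq!(branchless_search_sketch(&block, 10), 5);
}
```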
Some files were not shown because too many files have changed in this diff.